linux/net/rds/tcp.c
/*
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/module.h>
#include <net/tcp.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/addrconf.h>

#include "rds.h"
#include "tcp.h"

/* only for info exporting */
static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
static LIST_HEAD(rds_tcp_tc_list);

/* rds_tcp_tc_count counts only IPv4 connections.
 * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
 */
static unsigned int rds_tcp_tc_count;
#if IS_ENABLED(CONFIG_IPV6)
static unsigned int rds6_tcp_tc_count;
#endif

/* Track rds_tcp_connection structs so they can be cleaned up */
static DEFINE_SPINLOCK(rds_tcp_conn_lock);
static LIST_HEAD(rds_tcp_conn_list);
static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);

static struct kmem_cache *rds_tcp_conn_slab;

static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
                                 void *buffer, size_t *lenp, loff_t *fpos);

static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;

static struct ctl_table rds_tcp_sysctl_table[] = {
#define RDS_TCP_SNDBUF  0
        {
                .procname       = "rds_tcp_sndbuf",
                /* data is per-net pointer */
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = rds_tcp_skbuf_handler,
                .extra1         = &rds_tcp_min_sndbuf,
        },
#define RDS_TCP_RCVBUF  1
        {
                .procname       = "rds_tcp_rcvbuf",
                /* data is per-net pointer */
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = rds_tcp_skbuf_handler,
                .extra1         = &rds_tcp_min_rcvbuf,
        },
        { }
};
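
/* Illustrative note (not part of the original file): once rds_tcp_init_net()
 * registers this table under "net/rds/tcp", the two knobs appear per netns as
 * /proc/sys/net/rds/tcp/rds_tcp_sndbuf and /proc/sys/net/rds/tcp/rds_tcp_rcvbuf.
 * The extra1 fields clamp writes to at least SOCK_MIN_SNDBUF/SOCK_MIN_RCVBUF;
 * the default of 0 leaves buffer sizing to the TCP stack's autotuning.
 */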

u32 rds_tcp_write_seq(struct rds_tcp_connection *tc)
{
        /* seq# of the last byte of data in tcp send buffer */
        return tcp_sk(tc->t_sock->sk)->write_seq;
}

u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
{
        return tcp_sk(tc->t_sock->sk)->snd_una;
}
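
/* Illustrative note (not part of the original file): the send path records
 * rds_tcp_write_seq() per queued message and later compares rds_tcp_snd_una()
 * against it to retire acknowledged messages.  With 32-bit TCP sequence
 * arithmetic the bytes still unacknowledged at a given instant are
 *
 *	rds_tcp_write_seq(tc) - rds_tcp_snd_una(tc)
 *
 * which stays well defined across seq# wraparound.
 */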

void rds_tcp_restore_callbacks(struct socket *sock,
                               struct rds_tcp_connection *tc)
{
        rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
        write_lock_bh(&sock->sk->sk_callback_lock);

        /* done under the callback_lock to serialize with write_space */
        spin_lock(&rds_tcp_tc_list_lock);
        list_del_init(&tc->t_list_item);
#if IS_ENABLED(CONFIG_IPV6)
        rds6_tcp_tc_count--;
#endif
        if (!tc->t_cpath->cp_conn->c_isv6)
                rds_tcp_tc_count--;
        spin_unlock(&rds_tcp_tc_list_lock);

        tc->t_sock = NULL;

        sock->sk->sk_write_space = tc->t_orig_write_space;
        sock->sk->sk_data_ready = tc->t_orig_data_ready;
        sock->sk->sk_state_change = tc->t_orig_state_change;
        sock->sk->sk_user_data = NULL;

        write_unlock_bh(&sock->sk->sk_callback_lock);
}

/*
 * rds_tcp_reset_callbacks() switches the connection over to the new sock
 * and releases the existing tc->t_sock.
 *
 * The only functions that set tc->t_sock are rds_tcp_set_callbacks()
 * and rds_tcp_reset_callbacks().  Send and receive trust that it is set.
 * The absence of the RDS_CONN_UP bit protects those paths from being
 * called while it isn't set.
 */
void rds_tcp_reset_callbacks(struct socket *sock,
                             struct rds_conn_path *cp)
{
        struct rds_tcp_connection *tc = cp->cp_transport_data;
        struct socket *osock = tc->t_sock;

        if (!osock)
                goto newsock;

        /* Need to resolve a duelling SYN between peers.
         * We have an outstanding SYN to this peer, which may
         * potentially have transitioned to the RDS_CONN_UP state,
         * so we must quiesce any send threads before resetting
         * cp_transport_data. We quiesce these threads by setting
         * cp_state to something other than RDS_CONN_UP, and then
         * waiting for any existing threads in rds_send_xmit to
         * complete release_in_xmit(). (Subsequent threads entering
         * rds_send_xmit() will bail on !rds_conn_up().)
         *
         * However an incoming syn-ack at this point would end up
         * marking the conn as RDS_CONN_UP, and would again permit
         * rds_send_xmit() threads through, so ideally we would
         * synchronize on RDS_CONN_UP after lock_sock(), but cannot
         * do that: waiting on !RDS_IN_XMIT after lock_sock() may
         * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
         * would not get set. As a result, we set c_state to
         * RDS_CONN_RESETTING, to ensure that rds_tcp_state_change()
         * cannot mark rds_conn_path_up() in the window before lock_sock().
         */
        atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
        wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
        lock_sock(osock->sk);
        /* reset receive side state for rds_tcp_data_recv() for osock */
        cancel_delayed_work_sync(&cp->cp_send_w);
        cancel_delayed_work_sync(&cp->cp_recv_w);
        if (tc->t_tinc) {
                rds_inc_put(&tc->t_tinc->ti_inc);
                tc->t_tinc = NULL;
        }
        tc->t_tinc_hdr_rem = sizeof(struct rds_header);
        tc->t_tinc_data_rem = 0;
        rds_tcp_restore_callbacks(osock, tc);
        release_sock(osock->sk);
        sock_release(osock);
newsock:
        rds_send_path_reset(cp);
        lock_sock(sock->sk);
        rds_tcp_set_callbacks(sock, cp);
        release_sock(sock->sk);
}

/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
 * above rds_tcp_reset_callbacks() for notes about synchronization
 * with the data path.
 */
void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
{
        struct rds_tcp_connection *tc = cp->cp_transport_data;

        rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
        write_lock_bh(&sock->sk->sk_callback_lock);

        /* done under the callback_lock to serialize with write_space */
        spin_lock(&rds_tcp_tc_list_lock);
        list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
#if IS_ENABLED(CONFIG_IPV6)
        rds6_tcp_tc_count++;
#endif
        if (!tc->t_cpath->cp_conn->c_isv6)
                rds_tcp_tc_count++;
        spin_unlock(&rds_tcp_tc_list_lock);

        /* accepted sockets need our listen data ready undone */
        if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
                sock->sk->sk_data_ready = sock->sk->sk_user_data;

        tc->t_sock = sock;
        tc->t_cpath = cp;
        tc->t_orig_data_ready = sock->sk->sk_data_ready;
        tc->t_orig_write_space = sock->sk->sk_write_space;
        tc->t_orig_state_change = sock->sk->sk_state_change;

        sock->sk->sk_user_data = cp;
        sock->sk->sk_data_ready = rds_tcp_data_ready;
        sock->sk->sk_write_space = rds_tcp_write_space;
        sock->sk->sk_state_change = rds_tcp_state_change;

        write_unlock_bh(&sock->sk->sk_callback_lock);
}

/* Handle RDS_INFO_TCP_SOCKETS socket option.  It only returns IPv4
 * connections for backward compatibility.
 */
static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
                            struct rds_info_iterator *iter,
                            struct rds_info_lengths *lens)
{
        struct rds_info_tcp_socket tsinfo;
        struct rds_tcp_connection *tc;
        unsigned long flags;

        spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);

        if (len / sizeof(tsinfo) < rds_tcp_tc_count)
                goto out;

        list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
                struct inet_sock *inet = inet_sk(tc->t_sock->sk);

                if (tc->t_cpath->cp_conn->c_isv6)
                        continue;

                tsinfo.local_addr = inet->inet_saddr;
                tsinfo.local_port = inet->inet_sport;
                tsinfo.peer_addr = inet->inet_daddr;
                tsinfo.peer_port = inet->inet_dport;

                tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
                tsinfo.data_rem = tc->t_tinc_data_rem;
                tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
                tsinfo.last_expected_una = tc->t_last_expected_una;
                tsinfo.last_seen_una = tc->t_last_seen_una;
                tsinfo.tos = tc->t_cpath->cp_conn->c_tos;

                rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
        }

out:
        lens->nr = rds_tcp_tc_count;
        lens->each = sizeof(tsinfo);

        spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
 * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped
 * address.
 */
static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
                             struct rds_info_iterator *iter,
                             struct rds_info_lengths *lens)
{
        struct rds6_info_tcp_socket tsinfo6;
        struct rds_tcp_connection *tc;
        unsigned long flags;

        spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);

        if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
                goto out;

        list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
                struct sock *sk = tc->t_sock->sk;
                struct inet_sock *inet = inet_sk(sk);

                tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
                tsinfo6.local_port = inet->inet_sport;
                tsinfo6.peer_addr = sk->sk_v6_daddr;
                tsinfo6.peer_port = inet->inet_dport;

                tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
                tsinfo6.data_rem = tc->t_tinc_data_rem;
                tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
                tsinfo6.last_expected_una = tc->t_last_expected_una;
                tsinfo6.last_seen_una = tc->t_last_seen_una;

                rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
        }

out:
        lens->nr = rds6_tcp_tc_count;
        lens->each = sizeof(tsinfo6);

        spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
#endif

int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
                        __u32 scope_id)
{
        struct net_device *dev = NULL;
#if IS_ENABLED(CONFIG_IPV6)
        int ret;
#endif

        if (ipv6_addr_v4mapped(addr)) {
                if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
                        return 0;
                return -EADDRNOTAVAIL;
        }

        /* If the scope_id is specified, check only those addresses
         * hosted on the specified interface.
         */
        if (scope_id != 0) {
                rcu_read_lock();
                dev = dev_get_by_index_rcu(net, scope_id);
                /* scope_id is not valid... */
                if (!dev) {
                        rcu_read_unlock();
                        return -EADDRNOTAVAIL;
                }
                rcu_read_unlock();
        }
#if IS_ENABLED(CONFIG_IPV6)
        ret = ipv6_chk_addr(net, addr, dev, 0);
        if (ret)
                return 0;
#endif
        return -EADDRNOTAVAIL;
}
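
/* Illustrative sketch (not part of the original file): callers that start
 * from an IPv4 address hand in a v4-mapped in6_addr, which the check above
 * routes through inet_addr_type().  Assuming a hypothetical
 * struct sockaddr_in "sin":
 *
 *	struct in6_addr v4mapped;
 *
 *	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v4mapped);
 *	if (rds_tcp_laddr_check(net, &v4mapped, 0) < 0)
 *		return -EADDRNOTAVAIL;
 */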

static void rds_tcp_conn_free(void *arg)
{
        struct rds_tcp_connection *tc = arg;
        unsigned long flags;

        rdsdebug("freeing tc %p\n", tc);

        spin_lock_irqsave(&rds_tcp_conn_lock, flags);
        if (!tc->t_tcp_node_detached)
                list_del(&tc->t_tcp_node);
        spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);

        kmem_cache_free(rds_tcp_conn_slab, tc);
}

static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
        struct rds_tcp_connection *tc;
        int i, j;
        int ret = 0;

        for (i = 0; i < RDS_MPATH_WORKERS; i++) {
                tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
                if (!tc) {
                        ret = -ENOMEM;
                        goto fail;
                }
                mutex_init(&tc->t_conn_path_lock);
                tc->t_sock = NULL;
                tc->t_tinc = NULL;
                tc->t_tinc_hdr_rem = sizeof(struct rds_header);
                tc->t_tinc_data_rem = 0;

                conn->c_path[i].cp_transport_data = tc;
                tc->t_cpath = &conn->c_path[i];
                tc->t_tcp_node_detached = true;

                rdsdebug("rds_conn_path [%d] tc %p\n", i,
                         conn->c_path[i].cp_transport_data);
        }
        spin_lock_irq(&rds_tcp_conn_lock);
        for (i = 0; i < RDS_MPATH_WORKERS; i++) {
                tc = conn->c_path[i].cp_transport_data;
                tc->t_tcp_node_detached = false;
                list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
        }
        spin_unlock_irq(&rds_tcp_conn_lock);
fail:
        if (ret) {
                for (j = 0; j < i; j++)
                        rds_tcp_conn_free(conn->c_path[j].cp_transport_data);
        }
        return ret;
}

static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
{
        struct rds_tcp_connection *tc, *_tc;

        list_for_each_entry_safe(tc, _tc, list, t_tcp_node) {
                if (tc->t_cpath->cp_conn == conn)
                        return true;
        }
        return false;
}

static void rds_tcp_set_unloading(void)
{
        atomic_set(&rds_tcp_unloading, 1);
}

static bool rds_tcp_is_unloading(struct rds_connection *conn)
{
        return atomic_read(&rds_tcp_unloading) != 0;
}

static void rds_tcp_destroy_conns(void)
{
        struct rds_tcp_connection *tc, *_tc;
        LIST_HEAD(tmp_list);

        /* avoid calling conn_destroy with irqs off */
        spin_lock_irq(&rds_tcp_conn_lock);
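        /* Move only the first tc found for each connection: rds_conn_destroy()
         * below tears down all paths of a connection, and list_has_conn()
         * prevents queueing (and thus destroying) the same conn twice.
         */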
        list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
                if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
                        list_move_tail(&tc->t_tcp_node, &tmp_list);
        }
        spin_unlock_irq(&rds_tcp_conn_lock);

        list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
                rds_conn_destroy(tc->t_cpath->cp_conn);
}

static void rds_tcp_exit(void);

static u8 rds_tcp_get_tos_map(u8 tos)
{
        /* all user tos mapped to default 0 for TCP transport */
        return 0;
}

 451
 452struct rds_transport rds_tcp_transport = {
 453        .laddr_check            = rds_tcp_laddr_check,
 454        .xmit_path_prepare      = rds_tcp_xmit_path_prepare,
 455        .xmit_path_complete     = rds_tcp_xmit_path_complete,
 456        .xmit                   = rds_tcp_xmit,
 457        .recv_path              = rds_tcp_recv_path,
 458        .conn_alloc             = rds_tcp_conn_alloc,
 459        .conn_free              = rds_tcp_conn_free,
 460        .conn_path_connect      = rds_tcp_conn_path_connect,
 461        .conn_path_shutdown     = rds_tcp_conn_path_shutdown,
 462        .inc_copy_to_user       = rds_tcp_inc_copy_to_user,
 463        .inc_free               = rds_tcp_inc_free,
 464        .stats_info_copy        = rds_tcp_stats_info_copy,
 465        .exit                   = rds_tcp_exit,
 466        .get_tos_map            = rds_tcp_get_tos_map,
 467        .t_owner                = THIS_MODULE,
 468        .t_name                 = "tcp",
 469        .t_type                 = RDS_TRANS_TCP,
 470        .t_prefer_loopback      = 1,
 471        .t_mp_capable           = 1,
 472        .t_unloading            = rds_tcp_is_unloading,
 473};
 474
 475static unsigned int rds_tcp_netid;
 476
 477/* per-network namespace private data for this module */
 478struct rds_tcp_net {
 479        struct socket *rds_tcp_listen_sock;
 480        struct work_struct rds_tcp_accept_w;
 481        struct ctl_table_header *rds_tcp_sysctl;
 482        struct ctl_table *ctl_table;
 483        int sndbuf_size;
 484        int rcvbuf_size;
 485};
 486
/* All module specific customizations to the RDS-TCP socket should be done in
 * rds_tcp_tune() and applied after socket creation.
 */
void rds_tcp_tune(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);

        tcp_sock_set_nodelay(sock->sk);
        lock_sock(sk);
        if (rtn->sndbuf_size > 0) {
                sk->sk_sndbuf = rtn->sndbuf_size;
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
        }
        if (rtn->rcvbuf_size > 0) {
                sk->sk_rcvbuf = rtn->rcvbuf_size;
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
        }
        release_sock(sk);
}
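
/* Illustrative sketch (not part of the original file): a typical caller
 * creates the kernel socket first and then applies the per-netns tuning,
 * roughly:
 *
 *	struct socket *sock;
 *	int ret;
 *
 *	ret = sock_create_kern(rds_conn_net(conn), AF_INET, SOCK_STREAM,
 *			       IPPROTO_TCP, &sock);
 *	if (ret < 0)
 *		return ret;
 *	rds_tcp_tune(sock);
 *
 * The connect path in tcp_connect.c follows this pattern before calling
 * kernel_bind()/kernel_connect().
 */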

static void rds_tcp_accept_worker(struct work_struct *work)
{
        struct rds_tcp_net *rtn = container_of(work,
                                               struct rds_tcp_net,
                                               rds_tcp_accept_w);

        while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
                cond_resched();
}

void rds_tcp_accept_work(struct sock *sk)
{
        struct net *net = sock_net(sk);
        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);

        queue_work(rds_wq, &rtn->rds_tcp_accept_w);
}

static __net_init int rds_tcp_init_net(struct net *net)
{
        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
        struct ctl_table *tbl;
        int err = 0;

        memset(rtn, 0, sizeof(*rtn));

        /* {snd, rcv}buf_size default to 0, which implies we let the
         * stack pick the value, and permit auto-tuning of buffer size.
         */
        if (net == &init_net) {
                tbl = rds_tcp_sysctl_table;
        } else {
                tbl = kmemdup(rds_tcp_sysctl_table,
                              sizeof(rds_tcp_sysctl_table), GFP_KERNEL);
                if (!tbl) {
                        pr_warn("could not allocate sysctl table\n");
                        return -ENOMEM;
                }
                rtn->ctl_table = tbl;
        }
        tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size;
        tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size;
        rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl);
        if (!rtn->rds_tcp_sysctl) {
                pr_warn("could not register sysctl\n");
                err = -ENOMEM;
                goto fail;
        }

#if IS_ENABLED(CONFIG_IPV6)
        rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
#else
        rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
#endif
        if (!rtn->rds_tcp_listen_sock) {
                pr_warn("could not set up IPv6 listen sock\n");

#if IS_ENABLED(CONFIG_IPV6)
                /* Try IPv4 as some systems disable IPv6 */
                rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
                if (!rtn->rds_tcp_listen_sock) {
#endif
                        unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
                        rtn->rds_tcp_sysctl = NULL;
                        err = -EAFNOSUPPORT;
                        goto fail;
#if IS_ENABLED(CONFIG_IPV6)
                }
#endif
        }
        INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
        return 0;

fail:
        if (net != &init_net)
                kfree(tbl);
        return err;
}

static void rds_tcp_kill_sock(struct net *net)
{
        struct rds_tcp_connection *tc, *_tc;
        LIST_HEAD(tmp_list);
        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
        struct socket *lsock = rtn->rds_tcp_listen_sock;

        rtn->rds_tcp_listen_sock = NULL;
        rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
        spin_lock_irq(&rds_tcp_conn_lock);
        list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
                struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);

                if (net != c_net)
                        continue;
                if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) {
                        list_move_tail(&tc->t_tcp_node, &tmp_list);
                } else {
                        list_del(&tc->t_tcp_node);
                        tc->t_tcp_node_detached = true;
                }
        }
        spin_unlock_irq(&rds_tcp_conn_lock);
        list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
                rds_conn_destroy(tc->t_cpath->cp_conn);
}

static void __net_exit rds_tcp_exit_net(struct net *net)
{
        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);

        rds_tcp_kill_sock(net);

        if (rtn->rds_tcp_sysctl)
                unregister_net_sysctl_table(rtn->rds_tcp_sysctl);

        if (net != &init_net)
                kfree(rtn->ctl_table);
}

static struct pernet_operations rds_tcp_net_ops = {
        .init = rds_tcp_init_net,
        .exit = rds_tcp_exit_net,
        .id = &rds_tcp_netid,
        .size = sizeof(struct rds_tcp_net),
};

void *rds_tcp_listen_sock_def_readable(struct net *net)
{
        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
        struct socket *lsock = rtn->rds_tcp_listen_sock;

        if (!lsock)
                return NULL;

        return lsock->sk->sk_user_data;
}

/* When sysctl is used to modify some kernel socket parameters, this
 * function resets the RDS connections in that netns so that we can
 * restart with new parameters.  The assumption is that such reset
 * events are few and far between.
 */
static void rds_tcp_sysctl_reset(struct net *net)
{
        struct rds_tcp_connection *tc, *_tc;

        spin_lock_irq(&rds_tcp_conn_lock);
        list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
                struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);

                if (net != c_net || !tc->t_sock)
                        continue;

                /* reconnect with new parameters */
                rds_conn_path_drop(tc->t_cpath, false);
        }
        spin_unlock_irq(&rds_tcp_conn_lock);
}

static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
                                 void *buffer, size_t *lenp, loff_t *fpos)
{
        struct net *net = current->nsproxy->net_ns;
        int err;

        err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
        if (err < 0) {
                pr_warn("Invalid input. Must be >= %d\n",
                        *(int *)(ctl->extra1));
                return err;
        }
        if (write)
                rds_tcp_sysctl_reset(net);
        return 0;
}
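
/* Illustrative note (not part of the original file): a successful write
 * through either knob both stores the new per-net value and drops every
 * RDS-TCP connection in that netns so it reconnects with the new buffer
 * size, e.g. from a shell:
 *
 *	echo 1048576 > /proc/sys/net/rds/tcp/rds_tcp_sndbuf
 *
 * Reads never trigger a reset; rds_tcp_sysctl_reset() runs only when
 * "write" is true and proc_dointvec_minmax() succeeded.
 */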

static void rds_tcp_exit(void)
{
        rds_tcp_set_unloading();
        synchronize_rcu();
        rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
#if IS_ENABLED(CONFIG_IPV6)
        rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
#endif
        unregister_pernet_device(&rds_tcp_net_ops);
        rds_tcp_destroy_conns();
        rds_trans_unregister(&rds_tcp_transport);
        rds_tcp_recv_exit();
        kmem_cache_destroy(rds_tcp_conn_slab);
}
module_exit(rds_tcp_exit);

static int rds_tcp_init(void)
{
        int ret;

        rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
                                              sizeof(struct rds_tcp_connection),
                                              0, 0, NULL);
        if (!rds_tcp_conn_slab) {
                ret = -ENOMEM;
                goto out;
        }

        ret = rds_tcp_recv_init();
        if (ret)
                goto out_slab;

        ret = register_pernet_device(&rds_tcp_net_ops);
        if (ret)
                goto out_recv;

        rds_trans_register(&rds_tcp_transport);

        rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
#if IS_ENABLED(CONFIG_IPV6)
        rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
#endif

        goto out;
out_recv:
        rds_tcp_recv_exit();
out_slab:
        kmem_cache_destroy(rds_tcp_conn_slab);
out:
        return ret;
}
module_init(rds_tcp_init);

MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: TCP transport");
MODULE_LICENSE("Dual BSD/GPL");