linux/net/rds/tcp.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2006 Oracle.  All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 *
  32 */
  33#include <linux/kernel.h>
  34#include <linux/slab.h>
  35#include <linux/in.h>
  36#include <linux/module.h>
  37#include <net/tcp.h>
  38#include <net/net_namespace.h>
  39#include <net/netns/generic.h>
  40#include <net/tcp.h>
  41
  42#include "rds.h"
  43#include "tcp.h"
  44
  45/* only for info exporting */
  46static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
  47static LIST_HEAD(rds_tcp_tc_list);
  48static unsigned int rds_tcp_tc_count;
  49
  50/* Track rds_tcp_connection structs so they can be cleaned up */
  51static DEFINE_SPINLOCK(rds_tcp_conn_lock);
  52static LIST_HEAD(rds_tcp_conn_list);
  53
  54static struct kmem_cache *rds_tcp_conn_slab;
  55
  56#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
  57
  58/* doing it this way avoids calling tcp_sk() */
  59void rds_tcp_nonagle(struct socket *sock)
  60{
  61        mm_segment_t oldfs = get_fs();
  62        int val = 1;
  63
  64        set_fs(KERNEL_DS);
  65        sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
  66                              sizeof(val));
  67        set_fs(oldfs);
  68}
  69
  70/* All module specific customizations to the RDS-TCP socket should be done in
  71 * rds_tcp_tune() and applied after socket creation. In general these
  72 * customizations should be tunable via module_param()
  73 */
  74void rds_tcp_tune(struct socket *sock)
  75{
  76        rds_tcp_nonagle(sock);
  77}
  78
  79u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
  80{
  81        return tcp_sk(tc->t_sock->sk)->snd_nxt;
  82}
  83
  84u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
  85{
  86        return tcp_sk(tc->t_sock->sk)->snd_una;
  87}
  88
  89void rds_tcp_restore_callbacks(struct socket *sock,
  90                               struct rds_tcp_connection *tc)
  91{
  92        rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
  93        write_lock_bh(&sock->sk->sk_callback_lock);
  94
  95        /* done under the callback_lock to serialize with write_space */
  96        spin_lock(&rds_tcp_tc_list_lock);
  97        list_del_init(&tc->t_list_item);
  98        rds_tcp_tc_count--;
  99        spin_unlock(&rds_tcp_tc_list_lock);
 100
 101        tc->t_sock = NULL;
 102
 103        sock->sk->sk_write_space = tc->t_orig_write_space;
 104        sock->sk->sk_data_ready = tc->t_orig_data_ready;
 105        sock->sk->sk_state_change = tc->t_orig_state_change;
 106        sock->sk->sk_user_data = NULL;
 107
 108        write_unlock_bh(&sock->sk->sk_callback_lock);
 109}
 110
 111/*
 112 * This is the only path that sets tc->t_sock.  Send and receive trust that
 113 * it is set.  The RDS_CONN_CONNECTED bit protects those paths from being
 114 * called while it isn't set.
 115 */
 116void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
 117{
 118        struct rds_tcp_connection *tc = conn->c_transport_data;
 119
 120        rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
 121        write_lock_bh(&sock->sk->sk_callback_lock);
 122
 123        /* done under the callback_lock to serialize with write_space */
 124        spin_lock(&rds_tcp_tc_list_lock);
 125        list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
 126        rds_tcp_tc_count++;
 127        spin_unlock(&rds_tcp_tc_list_lock);
 128
 129        /* accepted sockets need our listen data ready undone */
 130        if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
 131                sock->sk->sk_data_ready = sock->sk->sk_user_data;
 132
 133        tc->t_sock = sock;
 134        tc->conn = conn;
 135        tc->t_orig_data_ready = sock->sk->sk_data_ready;
 136        tc->t_orig_write_space = sock->sk->sk_write_space;
 137        tc->t_orig_state_change = sock->sk->sk_state_change;
 138
 139        sock->sk->sk_user_data = conn;
 140        sock->sk->sk_data_ready = rds_tcp_data_ready;
 141        sock->sk->sk_write_space = rds_tcp_write_space;
 142        sock->sk->sk_state_change = rds_tcp_state_change;
 143
 144        write_unlock_bh(&sock->sk->sk_callback_lock);
 145}
 146
 147static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
 148                            struct rds_info_iterator *iter,
 149                            struct rds_info_lengths *lens)
 150{
 151        struct rds_info_tcp_socket tsinfo;
 152        struct rds_tcp_connection *tc;
 153        unsigned long flags;
 154        struct sockaddr_in sin;
 155        int sinlen;
 156
 157        spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
 158
 159        if (len / sizeof(tsinfo) < rds_tcp_tc_count)
 160                goto out;
 161
 162        list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
 163
 164                sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
 165                tsinfo.local_addr = sin.sin_addr.s_addr;
 166                tsinfo.local_port = sin.sin_port;
 167                sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
 168                tsinfo.peer_addr = sin.sin_addr.s_addr;
 169                tsinfo.peer_port = sin.sin_port;
 170
 171                tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
 172                tsinfo.data_rem = tc->t_tinc_data_rem;
 173                tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
 174                tsinfo.last_expected_una = tc->t_last_expected_una;
 175                tsinfo.last_seen_una = tc->t_last_seen_una;
 176
 177                rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
 178        }
 179
 180out:
 181        lens->nr = rds_tcp_tc_count;
 182        lens->each = sizeof(tsinfo);
 183
 184        spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
 185}
 186
 187static int rds_tcp_laddr_check(struct net *net, __be32 addr)
 188{
 189        if (inet_addr_type(net, addr) == RTN_LOCAL)
 190                return 0;
 191        return -EADDRNOTAVAIL;
 192}
 193
 194static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 195{
 196        struct rds_tcp_connection *tc;
 197
 198        tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
 199        if (!tc)
 200                return -ENOMEM;
 201
 202        tc->t_sock = NULL;
 203        tc->t_tinc = NULL;
 204        tc->t_tinc_hdr_rem = sizeof(struct rds_header);
 205        tc->t_tinc_data_rem = 0;
 206
 207        conn->c_transport_data = tc;
 208
 209        spin_lock_irq(&rds_tcp_conn_lock);
 210        list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
 211        spin_unlock_irq(&rds_tcp_conn_lock);
 212
 213        rdsdebug("alloced tc %p\n", conn->c_transport_data);
 214        return 0;
 215}
 216
 217static void rds_tcp_conn_free(void *arg)
 218{
 219        struct rds_tcp_connection *tc = arg;
 220        unsigned long flags;
 221        rdsdebug("freeing tc %p\n", tc);
 222
 223        spin_lock_irqsave(&rds_tcp_conn_lock, flags);
 224        list_del(&tc->t_tcp_node);
 225        spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
 226
 227        kmem_cache_free(rds_tcp_conn_slab, tc);
 228}
 229
 230static void rds_tcp_destroy_conns(void)
 231{
 232        struct rds_tcp_connection *tc, *_tc;
 233        LIST_HEAD(tmp_list);
 234
 235        /* avoid calling conn_destroy with irqs off */
 236        spin_lock_irq(&rds_tcp_conn_lock);
 237        list_splice(&rds_tcp_conn_list, &tmp_list);
 238        INIT_LIST_HEAD(&rds_tcp_conn_list);
 239        spin_unlock_irq(&rds_tcp_conn_lock);
 240
 241        list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
 242                if (tc->conn->c_passive)
 243                        rds_conn_destroy(tc->conn->c_passive);
 244                rds_conn_destroy(tc->conn);
 245        }
 246}
 247
 248static void rds_tcp_exit(void);
 249
 250struct rds_transport rds_tcp_transport = {
 251        .laddr_check            = rds_tcp_laddr_check,
 252        .xmit_prepare           = rds_tcp_xmit_prepare,
 253        .xmit_complete          = rds_tcp_xmit_complete,
 254        .xmit                   = rds_tcp_xmit,
 255        .recv                   = rds_tcp_recv,
 256        .conn_alloc             = rds_tcp_conn_alloc,
 257        .conn_free              = rds_tcp_conn_free,
 258        .conn_connect           = rds_tcp_conn_connect,
 259        .conn_shutdown          = rds_tcp_conn_shutdown,
 260        .inc_copy_to_user       = rds_tcp_inc_copy_to_user,
 261        .inc_free               = rds_tcp_inc_free,
 262        .stats_info_copy        = rds_tcp_stats_info_copy,
 263        .exit                   = rds_tcp_exit,
 264        .t_owner                = THIS_MODULE,
 265        .t_name                 = "tcp",
 266        .t_type                 = RDS_TRANS_TCP,
 267        .t_prefer_loopback      = 1,
 268};
 269
 270static int rds_tcp_netid;
 271
 272/* per-network namespace private data for this module */
 273struct rds_tcp_net {
 274        struct socket *rds_tcp_listen_sock;
 275        struct work_struct rds_tcp_accept_w;
 276};
 277
 278static void rds_tcp_accept_worker(struct work_struct *work)
 279{
 280        struct rds_tcp_net *rtn = container_of(work,
 281                                               struct rds_tcp_net,
 282                                               rds_tcp_accept_w);
 283
 284        while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
 285                cond_resched();
 286}
 287
 288void rds_tcp_accept_work(struct sock *sk)
 289{
 290        struct net *net = sock_net(sk);
 291        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 292
 293        queue_work(rds_wq, &rtn->rds_tcp_accept_w);
 294}
 295
 296static __net_init int rds_tcp_init_net(struct net *net)
 297{
 298        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 299
 300        rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
 301        if (!rtn->rds_tcp_listen_sock) {
 302                pr_warn("could not set up listen sock\n");
 303                return -EAFNOSUPPORT;
 304        }
 305        INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
 306        return 0;
 307}
 308
 309static void __net_exit rds_tcp_exit_net(struct net *net)
 310{
 311        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 312
 313        /* If rds_tcp_exit_net() is called as a result of netns deletion,
 314         * the rds_tcp_kill_sock() device notifier would already have cleaned
 315         * up the listen socket, thus there is no work to do in this function.
 316         *
 317         * If rds_tcp_exit_net() is called as a result of module unload,
 318         * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then
 319         * we do need to clean up the listen socket here.
 320         */
 321        if (rtn->rds_tcp_listen_sock) {
 322                rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
 323                rtn->rds_tcp_listen_sock = NULL;
 324                flush_work(&rtn->rds_tcp_accept_w);
 325        }
 326}
 327
 328static struct pernet_operations rds_tcp_net_ops = {
 329        .init = rds_tcp_init_net,
 330        .exit = rds_tcp_exit_net,
 331        .id = &rds_tcp_netid,
 332        .size = sizeof(struct rds_tcp_net),
 333};
 334
 335static void rds_tcp_kill_sock(struct net *net)
 336{
 337        struct rds_tcp_connection *tc, *_tc;
 338        struct sock *sk;
 339        LIST_HEAD(tmp_list);
 340        struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 341
 342        rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
 343        rtn->rds_tcp_listen_sock = NULL;
 344        flush_work(&rtn->rds_tcp_accept_w);
 345        spin_lock_irq(&rds_tcp_conn_lock);
 346        list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
 347                struct net *c_net = read_pnet(&tc->conn->c_net);
 348
 349                if (net != c_net || !tc->t_sock)
 350                        continue;
 351                list_move_tail(&tc->t_tcp_node, &tmp_list);
 352        }
 353        spin_unlock_irq(&rds_tcp_conn_lock);
 354        list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
 355                sk = tc->t_sock->sk;
 356                sk->sk_prot->disconnect(sk, 0);
 357                tcp_done(sk);
 358                if (tc->conn->c_passive)
 359                        rds_conn_destroy(tc->conn->c_passive);
 360                rds_conn_destroy(tc->conn);
 361        }
 362}
 363
 364static int rds_tcp_dev_event(struct notifier_block *this,
 365                             unsigned long event, void *ptr)
 366{
 367        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 368
 369        /* rds-tcp registers as a pernet subys, so the ->exit will only
 370         * get invoked after network acitivity has quiesced. We need to
 371         * clean up all sockets  to quiesce network activity, and use
 372         * the unregistration of the per-net loopback device as a trigger
 373         * to start that cleanup.
 374         */
 375        if (event == NETDEV_UNREGISTER_FINAL &&
 376            dev->ifindex == LOOPBACK_IFINDEX)
 377                rds_tcp_kill_sock(dev_net(dev));
 378
 379        return NOTIFY_DONE;
 380}
 381
 382static struct notifier_block rds_tcp_dev_notifier = {
 383        .notifier_call        = rds_tcp_dev_event,
 384        .priority = -10, /* must be called after other network notifiers */
 385};
 386
 387static void rds_tcp_exit(void)
 388{
 389        rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
 390        unregister_pernet_subsys(&rds_tcp_net_ops);
 391        if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
 392                pr_warn("could not unregister rds_tcp_dev_notifier\n");
 393        rds_tcp_destroy_conns();
 394        rds_trans_unregister(&rds_tcp_transport);
 395        rds_tcp_recv_exit();
 396        kmem_cache_destroy(rds_tcp_conn_slab);
 397}
 398module_exit(rds_tcp_exit);
 399
 400static int rds_tcp_init(void)
 401{
 402        int ret;
 403
 404        rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
 405                                              sizeof(struct rds_tcp_connection),
 406                                              0, 0, NULL);
 407        if (!rds_tcp_conn_slab) {
 408                ret = -ENOMEM;
 409                goto out;
 410        }
 411
 412        ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
 413        if (ret) {
 414                pr_warn("could not register rds_tcp_dev_notifier\n");
 415                goto out;
 416        }
 417
 418        ret = register_pernet_subsys(&rds_tcp_net_ops);
 419        if (ret)
 420                goto out_slab;
 421
 422        ret = rds_tcp_recv_init();
 423        if (ret)
 424                goto out_slab;
 425
 426        ret = rds_trans_register(&rds_tcp_transport);
 427        if (ret)
 428                goto out_recv;
 429
 430        rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
 431
 432        goto out;
 433
 434out_recv:
 435        rds_tcp_recv_exit();
 436out_slab:
 437        unregister_pernet_subsys(&rds_tcp_net_ops);
 438        kmem_cache_destroy(rds_tcp_conn_slab);
 439out:
 440        return ret;
 441}
 442module_init(rds_tcp_init);
 443
 444MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
 445MODULE_DESCRIPTION("RDS: TCP transport");
 446MODULE_LICENSE("Dual BSD/GPL");
 447
 448