linux/net/rds/af_rds.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2006 Oracle.  All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 *
  32 */
  33#include <linux/module.h>
  34#include <linux/errno.h>
  35#include <linux/kernel.h>
  36#include <linux/gfp.h>
  37#include <linux/in.h>
  38#include <linux/poll.h>
  39#include <net/sock.h>
  40
  41#include "rds.h"
  42
  43char *rds_str_array(char **array, size_t elements, size_t index)
  44{
  45        if ((index < elements) && array[index])
  46                return array[index];
  47        else
  48                return "unknown";
  49}
  50EXPORT_SYMBOL(rds_str_array);
  51
  52/* this is just used for stats gathering :/ */
  53static DEFINE_SPINLOCK(rds_sock_lock);
  54static unsigned long rds_sock_count;
  55static LIST_HEAD(rds_sock_list);
  56DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
  57
  58/*
  59 * This is called as the final descriptor referencing this socket is closed.
  60 * We have to unbind the socket so that another socket can be bound to the
  61 * address it was using.
  62 *
  63 * We have to be careful about racing with the incoming path.  sock_orphan()
  64 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
  65 * messages shouldn't be queued.
  66 */
  67static int rds_release(struct socket *sock)
  68{
  69        struct sock *sk = sock->sk;
  70        struct rds_sock *rs;
  71
  72        if (!sk)
  73                goto out;
  74
  75        rs = rds_sk_to_rs(sk);
  76
  77        sock_orphan(sk);
  78        /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
  79         * that ensures the recv path has completed messing
  80         * with the socket. */
  81        rds_clear_recv_queue(rs);
  82        rds_cong_remove_socket(rs);
  83
  84        /*
  85         * the binding lookup hash uses rcu, we need to
  86         * make sure we sychronize_rcu before we free our
  87         * entry
  88         */
  89        rds_remove_bound(rs);
  90        synchronize_rcu();
  91
  92        rds_send_drop_to(rs, NULL);
  93        rds_rdma_drop_keys(rs);
  94        rds_notify_queue_get(rs, NULL);
  95
  96        spin_lock_bh(&rds_sock_lock);
  97        list_del_init(&rs->rs_item);
  98        rds_sock_count--;
  99        spin_unlock_bh(&rds_sock_lock);
 100
 101        rds_trans_put(rs->rs_transport);
 102
 103        sock->sk = NULL;
 104        sock_put(sk);
 105out:
 106        return 0;
 107}
 108
 109/*
 110 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
 111 * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
 112 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 113 * this seems more conservative.
 114 * NB - normally, one would use sk_callback_lock for this, but we can
 115 * get here from interrupts, whereas the network code grabs sk_callback_lock
 116 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 117 */
 118void rds_wake_sk_sleep(struct rds_sock *rs)
 119{
 120        unsigned long flags;
 121
 122        read_lock_irqsave(&rs->rs_recv_lock, flags);
 123        __rds_wake_sk_sleep(rds_rs_to_sk(rs));
 124        read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 125}
 126
 127static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
 128                       int *uaddr_len, int peer)
 129{
 130        struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 131        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
 132
 133        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 134
 135        /* racey, don't care */
 136        if (peer) {
 137                if (!rs->rs_conn_addr)
 138                        return -ENOTCONN;
 139
 140                sin->sin_port = rs->rs_conn_port;
 141                sin->sin_addr.s_addr = rs->rs_conn_addr;
 142        } else {
 143                sin->sin_port = rs->rs_bound_port;
 144                sin->sin_addr.s_addr = rs->rs_bound_addr;
 145        }
 146
 147        sin->sin_family = AF_INET;
 148
 149        *uaddr_len = sizeof(*sin);
 150        return 0;
 151}
 152
 153/*
 154 * RDS' poll is without a doubt the least intuitive part of the interface,
 155 * as POLLIN and POLLOUT do not behave entirely as you would expect from
 156 * a network protocol.
 157 *
 158 * POLLIN is asserted if
 159 *  -   there is data on the receive queue.
 160 *  -   to signal that a previously congested destination may have become
 161 *      uncongested
 162 *  -   A notification has been queued to the socket (this can be a congestion
 163 *      update, or a RDMA completion).
 164 *
 165 * POLLOUT is asserted if there is room on the send queue. This does not mean
 166 * however, that the next sendmsg() call will succeed. If the application tries
 167 * to send to a congested destination, the system call may still fail (and
 168 * return ENOBUFS).
 169 */
 170static unsigned int rds_poll(struct file *file, struct socket *sock,
 171                             poll_table *wait)
 172{
 173        struct sock *sk = sock->sk;
 174        struct rds_sock *rs = rds_sk_to_rs(sk);
 175        unsigned int mask = 0;
 176        unsigned long flags;
 177
 178        poll_wait(file, sk_sleep(sk), wait);
 179
 180        if (rs->rs_seen_congestion)
 181                poll_wait(file, &rds_poll_waitq, wait);
 182
 183        read_lock_irqsave(&rs->rs_recv_lock, flags);
 184        if (!rs->rs_cong_monitor) {
 185                /* When a congestion map was updated, we signal POLLIN for
 186                 * "historical" reasons. Applications can also poll for
 187                 * WRBAND instead. */
 188                if (rds_cong_updated_since(&rs->rs_cong_track))
 189                        mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
 190        } else {
 191                spin_lock(&rs->rs_lock);
 192                if (rs->rs_cong_notify)
 193                        mask |= (POLLIN | POLLRDNORM);
 194                spin_unlock(&rs->rs_lock);
 195        }
 196        if (!list_empty(&rs->rs_recv_queue) ||
 197            !list_empty(&rs->rs_notify_queue))
 198                mask |= (POLLIN | POLLRDNORM);
 199        if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
 200                mask |= (POLLOUT | POLLWRNORM);
 201        read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 202
 203        /* clear state any time we wake a seen-congested socket */
 204        if (mask)
 205                rs->rs_seen_congestion = 0;
 206
 207        return mask;
 208}
 209
 210static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 211{
 212        return -ENOIOCTLCMD;
 213}
 214
 215static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
 216                              int len)
 217{
 218        struct sockaddr_in sin;
 219        int ret = 0;
 220
 221        /* racing with another thread binding seems ok here */
 222        if (rs->rs_bound_addr == 0) {
 223                ret = -ENOTCONN; /* XXX not a great errno */
 224                goto out;
 225        }
 226
 227        if (len < sizeof(struct sockaddr_in)) {
 228                ret = -EINVAL;
 229                goto out;
 230        }
 231
 232        if (copy_from_user(&sin, optval, sizeof(sin))) {
 233                ret = -EFAULT;
 234                goto out;
 235        }
 236
 237        rds_send_drop_to(rs, &sin);
 238out:
 239        return ret;
 240}
 241
 242static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
 243                               int optlen)
 244{
 245        int value;
 246
 247        if (optlen < sizeof(int))
 248                return -EINVAL;
 249        if (get_user(value, (int __user *) optval))
 250                return -EFAULT;
 251        *optvar = !!value;
 252        return 0;
 253}
 254
 255static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
 256                            int optlen)
 257{
 258        int ret;
 259
 260        ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
 261        if (ret == 0) {
 262                if (rs->rs_cong_monitor) {
 263                        rds_cong_add_socket(rs);
 264                } else {
 265                        rds_cong_remove_socket(rs);
 266                        rs->rs_cong_mask = 0;
 267                        rs->rs_cong_notify = 0;
 268                }
 269        }
 270        return ret;
 271}
 272
 273static int rds_setsockopt(struct socket *sock, int level, int optname,
 274                          char __user *optval, unsigned int optlen)
 275{
 276        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
 277        int ret;
 278
 279        if (level != SOL_RDS) {
 280                ret = -ENOPROTOOPT;
 281                goto out;
 282        }
 283
 284        switch (optname) {
 285        case RDS_CANCEL_SENT_TO:
 286                ret = rds_cancel_sent_to(rs, optval, optlen);
 287                break;
 288        case RDS_GET_MR:
 289                ret = rds_get_mr(rs, optval, optlen);
 290                break;
 291        case RDS_GET_MR_FOR_DEST:
 292                ret = rds_get_mr_for_dest(rs, optval, optlen);
 293                break;
 294        case RDS_FREE_MR:
 295                ret = rds_free_mr(rs, optval, optlen);
 296                break;
 297        case RDS_RECVERR:
 298                ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
 299                break;
 300        case RDS_CONG_MONITOR:
 301                ret = rds_cong_monitor(rs, optval, optlen);
 302                break;
 303        default:
 304                ret = -ENOPROTOOPT;
 305        }
 306out:
 307        return ret;
 308}
 309
 310static int rds_getsockopt(struct socket *sock, int level, int optname,
 311                          char __user *optval, int __user *optlen)
 312{
 313        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
 314        int ret = -ENOPROTOOPT, len;
 315
 316        if (level != SOL_RDS)
 317                goto out;
 318
 319        if (get_user(len, optlen)) {
 320                ret = -EFAULT;
 321                goto out;
 322        }
 323
 324        switch (optname) {
 325        case RDS_INFO_FIRST ... RDS_INFO_LAST:
 326                ret = rds_info_getsockopt(sock, optname, optval,
 327                                          optlen);
 328                break;
 329
 330        case RDS_RECVERR:
 331                if (len < sizeof(int))
 332                        ret = -EINVAL;
 333                else
 334                if (put_user(rs->rs_recverr, (int __user *) optval) ||
 335                    put_user(sizeof(int), optlen))
 336                        ret = -EFAULT;
 337                else
 338                        ret = 0;
 339                break;
 340        default:
 341                break;
 342        }
 343
 344out:
 345        return ret;
 346
 347}
 348
 349static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
 350                       int addr_len, int flags)
 351{
 352        struct sock *sk = sock->sk;
 353        struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 354        struct rds_sock *rs = rds_sk_to_rs(sk);
 355        int ret = 0;
 356
 357        lock_sock(sk);
 358
 359        if (addr_len != sizeof(struct sockaddr_in)) {
 360                ret = -EINVAL;
 361                goto out;
 362        }
 363
 364        if (sin->sin_family != AF_INET) {
 365                ret = -EAFNOSUPPORT;
 366                goto out;
 367        }
 368
 369        if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
 370                ret = -EDESTADDRREQ;
 371                goto out;
 372        }
 373
 374        rs->rs_conn_addr = sin->sin_addr.s_addr;
 375        rs->rs_conn_port = sin->sin_port;
 376
 377out:
 378        release_sock(sk);
 379        return ret;
 380}
 381
 382static struct proto rds_proto = {
 383        .name     = "RDS",
 384        .owner    = THIS_MODULE,
 385        .obj_size = sizeof(struct rds_sock),
 386};
 387
 388static const struct proto_ops rds_proto_ops = {
 389        .family =       AF_RDS,
 390        .owner =        THIS_MODULE,
 391        .release =      rds_release,
 392        .bind =         rds_bind,
 393        .connect =      rds_connect,
 394        .socketpair =   sock_no_socketpair,
 395        .accept =       sock_no_accept,
 396        .getname =      rds_getname,
 397        .poll =         rds_poll,
 398        .ioctl =        rds_ioctl,
 399        .listen =       sock_no_listen,
 400        .shutdown =     sock_no_shutdown,
 401        .setsockopt =   rds_setsockopt,
 402        .getsockopt =   rds_getsockopt,
 403        .sendmsg =      rds_sendmsg,
 404        .recvmsg =      rds_recvmsg,
 405        .mmap =         sock_no_mmap,
 406        .sendpage =     sock_no_sendpage,
 407};
 408
 409static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 410{
 411        struct rds_sock *rs;
 412
 413        sock_init_data(sock, sk);
 414        sock->ops               = &rds_proto_ops;
 415        sk->sk_protocol         = protocol;
 416
 417        rs = rds_sk_to_rs(sk);
 418        spin_lock_init(&rs->rs_lock);
 419        rwlock_init(&rs->rs_recv_lock);
 420        INIT_LIST_HEAD(&rs->rs_send_queue);
 421        INIT_LIST_HEAD(&rs->rs_recv_queue);
 422        INIT_LIST_HEAD(&rs->rs_notify_queue);
 423        INIT_LIST_HEAD(&rs->rs_cong_list);
 424        spin_lock_init(&rs->rs_rdma_lock);
 425        rs->rs_rdma_keys = RB_ROOT;
 426
 427        spin_lock_bh(&rds_sock_lock);
 428        list_add_tail(&rs->rs_item, &rds_sock_list);
 429        rds_sock_count++;
 430        spin_unlock_bh(&rds_sock_lock);
 431
 432        return 0;
 433}
 434
 435static int rds_create(struct net *net, struct socket *sock, int protocol,
 436                      int kern)
 437{
 438        struct sock *sk;
 439
 440        if (sock->type != SOCK_SEQPACKET || protocol)
 441                return -ESOCKTNOSUPPORT;
 442
 443        sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
 444        if (!sk)
 445                return -ENOMEM;
 446
 447        return __rds_create(sock, sk, protocol);
 448}
 449
 450void rds_sock_addref(struct rds_sock *rs)
 451{
 452        sock_hold(rds_rs_to_sk(rs));
 453}
 454
 455void rds_sock_put(struct rds_sock *rs)
 456{
 457        sock_put(rds_rs_to_sk(rs));
 458}
 459
 460static const struct net_proto_family rds_family_ops = {
 461        .family =       AF_RDS,
 462        .create =       rds_create,
 463        .owner  =       THIS_MODULE,
 464};
 465
 466static void rds_sock_inc_info(struct socket *sock, unsigned int len,
 467                              struct rds_info_iterator *iter,
 468                              struct rds_info_lengths *lens)
 469{
 470        struct rds_sock *rs;
 471        struct rds_incoming *inc;
 472        unsigned int total = 0;
 473
 474        len /= sizeof(struct rds_info_message);
 475
 476        spin_lock_bh(&rds_sock_lock);
 477
 478        list_for_each_entry(rs, &rds_sock_list, rs_item) {
 479                read_lock(&rs->rs_recv_lock);
 480
 481                /* XXX too lazy to maintain counts.. */
 482                list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
 483                        total++;
 484                        if (total <= len)
 485                                rds_inc_info_copy(inc, iter, inc->i_saddr,
 486                                                  rs->rs_bound_addr, 1);
 487                }
 488
 489                read_unlock(&rs->rs_recv_lock);
 490        }
 491
 492        spin_unlock_bh(&rds_sock_lock);
 493
 494        lens->nr = total;
 495        lens->each = sizeof(struct rds_info_message);
 496}
 497
 498static void rds_sock_info(struct socket *sock, unsigned int len,
 499                          struct rds_info_iterator *iter,
 500                          struct rds_info_lengths *lens)
 501{
 502        struct rds_info_socket sinfo;
 503        struct rds_sock *rs;
 504
 505        len /= sizeof(struct rds_info_socket);
 506
 507        spin_lock_bh(&rds_sock_lock);
 508
 509        if (len < rds_sock_count)
 510                goto out;
 511
 512        list_for_each_entry(rs, &rds_sock_list, rs_item) {
 513                sinfo.sndbuf = rds_sk_sndbuf(rs);
 514                sinfo.rcvbuf = rds_sk_rcvbuf(rs);
 515                sinfo.bound_addr = rs->rs_bound_addr;
 516                sinfo.connected_addr = rs->rs_conn_addr;
 517                sinfo.bound_port = rs->rs_bound_port;
 518                sinfo.connected_port = rs->rs_conn_port;
 519                sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
 520
 521                rds_info_copy(iter, &sinfo, sizeof(sinfo));
 522        }
 523
 524out:
 525        lens->nr = rds_sock_count;
 526        lens->each = sizeof(struct rds_info_socket);
 527
 528        spin_unlock_bh(&rds_sock_lock);
 529}
 530
 531static void rds_exit(void)
 532{
 533        sock_unregister(rds_family_ops.family);
 534        proto_unregister(&rds_proto);
 535        rds_conn_exit();
 536        rds_cong_exit();
 537        rds_sysctl_exit();
 538        rds_threads_exit();
 539        rds_stats_exit();
 540        rds_page_exit();
 541        rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
 542        rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
 543}
 544module_exit(rds_exit);
 545
 546static int rds_init(void)
 547{
 548        int ret;
 549
 550        ret = rds_conn_init();
 551        if (ret)
 552                goto out;
 553        ret = rds_threads_init();
 554        if (ret)
 555                goto out_conn;
 556        ret = rds_sysctl_init();
 557        if (ret)
 558                goto out_threads;
 559        ret = rds_stats_init();
 560        if (ret)
 561                goto out_sysctl;
 562        ret = proto_register(&rds_proto, 1);
 563        if (ret)
 564                goto out_stats;
 565        ret = sock_register(&rds_family_ops);
 566        if (ret)
 567                goto out_proto;
 568
 569        rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
 570        rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
 571
 572        goto out;
 573
 574out_proto:
 575        proto_unregister(&rds_proto);
 576out_stats:
 577        rds_stats_exit();
 578out_sysctl:
 579        rds_sysctl_exit();
 580out_threads:
 581        rds_threads_exit();
 582out_conn:
 583        rds_conn_exit();
 584        rds_cong_exit();
 585        rds_page_exit();
 586out:
 587        return ret;
 588}
 589module_init(rds_init);
 590
 591#define DRV_VERSION     "4.0"
 592#define DRV_RELDATE     "Feb 12, 2009"
 593
 594MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
 595MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
 596                   " v" DRV_VERSION " (" DRV_RELDATE ")");
 597MODULE_VERSION(DRV_VERSION);
 598MODULE_LICENSE("Dual BSD/GPL");
 599MODULE_ALIAS_NETPROTO(PF_RDS);
 600