linux/net/rds/af_rds.c
/*
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>

#include "rds.h"

/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
static LIST_HEAD(rds_sock_list);
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);

/*
 * This is called as the final descriptor referencing this socket is closed.
 * We have to unbind the socket so that another socket can be bound to the
 * address it was using.
 *
 * We have to be careful about racing with the incoming path.  sock_orphan()
 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
 * messages shouldn't be queued.
 */
static int rds_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct rds_sock *rs;

        if (!sk)
                goto out;

        rs = rds_sk_to_rs(sk);

        sock_orphan(sk);
        /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
         * that ensures the recv path has completed messing
         * with the socket. */
        rds_clear_recv_queue(rs);
        rds_cong_remove_socket(rs);

        rds_remove_bound(rs);

        rds_send_drop_to(rs, NULL);
        rds_rdma_drop_keys(rs);
        rds_notify_queue_get(rs, NULL);
        rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);

        spin_lock_bh(&rds_sock_lock);
        list_del_init(&rs->rs_item);
        rds_sock_count--;
        spin_unlock_bh(&rds_sock_lock);

        rds_trans_put(rs->rs_transport);

        sock->sk = NULL;
        sock_put(sk);
out:
        return 0;
}

/*
 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
 * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 * this seems more conservative.
 * NB - normally, one would use sk_callback_lock for this, but we can
 * get here from interrupts, whereas the network code grabs sk_callback_lock
 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 */
void rds_wake_sk_sleep(struct rds_sock *rs)
{
        unsigned long flags;

        read_lock_irqsave(&rs->rs_recv_lock, flags);
        __rds_wake_sk_sleep(rds_rs_to_sk(rs));
        read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}

static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
                       int peer)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        struct sockaddr_in6 *sin6;
        struct sockaddr_in *sin;
        int uaddr_len;

        /* racey, don't care */
        if (peer) {
                if (ipv6_addr_any(&rs->rs_conn_addr))
                        return -ENOTCONN;

                if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
                        sin = (struct sockaddr_in *)uaddr;
                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
                        sin->sin_family = AF_INET;
                        sin->sin_port = rs->rs_conn_port;
                        sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
                        uaddr_len = sizeof(*sin);
                } else {
                        sin6 = (struct sockaddr_in6 *)uaddr;
                        sin6->sin6_family = AF_INET6;
                        sin6->sin6_port = rs->rs_conn_port;
                        sin6->sin6_addr = rs->rs_conn_addr;
                        sin6->sin6_flowinfo = 0;
                        /* scope_id is the same as in the bound address. */
                        sin6->sin6_scope_id = rs->rs_bound_scope_id;
                        uaddr_len = sizeof(*sin6);
                }
        } else {
                /* If the socket is not yet bound but is connected, set
                 * the returned address family to that of the connected
                 * address, with a zero address value.  If it is neither
                 * bound nor connected, set the family to AF_UNSPEC
                 * (value 0) and the address size to that of an IPv4
                 * address.  (See the illustrative getsockname() sketch
                 * after this function.)
                 */
                if (ipv6_addr_any(&rs->rs_bound_addr)) {
                        if (ipv6_addr_any(&rs->rs_conn_addr)) {
                                sin = (struct sockaddr_in *)uaddr;
                                memset(sin, 0, sizeof(*sin));
                                sin->sin_family = AF_UNSPEC;
                                return sizeof(*sin);
                        }

#if IS_ENABLED(CONFIG_IPV6)
                        if (!(ipv6_addr_type(&rs->rs_conn_addr) &
                              IPV6_ADDR_MAPPED)) {
                                sin6 = (struct sockaddr_in6 *)uaddr;
                                memset(sin6, 0, sizeof(*sin6));
                                sin6->sin6_family = AF_INET6;
                                return sizeof(*sin6);
                        }
#endif

                        sin = (struct sockaddr_in *)uaddr;
                        memset(sin, 0, sizeof(*sin));
                        sin->sin_family = AF_INET;
                        return sizeof(*sin);
                }
                if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
                        sin = (struct sockaddr_in *)uaddr;
                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
                        sin->sin_family = AF_INET;
                        sin->sin_port = rs->rs_bound_port;
                        sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
                        uaddr_len = sizeof(*sin);
                } else {
                        sin6 = (struct sockaddr_in6 *)uaddr;
                        sin6->sin6_family = AF_INET6;
                        sin6->sin6_port = rs->rs_bound_port;
                        sin6->sin6_addr = rs->rs_bound_addr;
                        sin6->sin6_flowinfo = 0;
                        sin6->sin6_scope_id = rs->rs_bound_scope_id;
                        uaddr_len = sizeof(*sin6);
                }
        }

        return uaddr_len;
}
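
/*
 * Illustrative userspace sketch (not part of this file), showing how the
 * rds_getname() rules above appear through getsockname().  Address and port
 * values are placeholders; error handling is omitted.
 *
 *      int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
 *      struct sockaddr_storage ss;
 *      socklen_t len = sizeof(ss);
 *
 *      getsockname(fd, (struct sockaddr *)&ss, &len);
 *      // unbound and unconnected: ss.ss_family == AF_UNSPEC,
 *      // len == sizeof(struct sockaddr_in)
 *
 *      struct sockaddr_in laddr = {
 *              .sin_family = AF_INET,
 *              .sin_addr.s_addr = inet_addr("192.0.2.1"),  // local RDS-capable address
 *              .sin_port = htons(18634),                   // placeholder port
 *      };
 *      bind(fd, (struct sockaddr *)&laddr, sizeof(laddr));
 *
 *      len = sizeof(ss);
 *      getsockname(fd, (struct sockaddr *)&ss, &len);
 *      // now ss.ss_family == AF_INET with the bound address and port
 */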

/*
 * RDS' poll is without a doubt the least intuitive part of the interface,
 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
 * a network protocol.
 *
 * EPOLLIN is asserted if
 *  -   there is data on the receive queue.
 *  -   a previously congested destination may have become uncongested
 *      (signalled via a congestion map update).
 *  -   a notification has been queued to the socket (this can be a congestion
 *      update, or an RDMA completion, or a MSG_ZEROCOPY completion).
 *
 * EPOLLOUT is asserted if there is room on the send queue.  This does not,
 * however, mean that the next sendmsg() call will succeed.  If the
 * application tries to send to a congested destination, the system call may
 * still fail (and return ENOBUFS).
 */
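
/*
 * Illustrative userspace sketch (not part of this file) of the semantics
 * described above.  Since EPOLLOUT only reflects local send-queue space, the
 * send is retried when it fails with ENOBUFS for a congested destination.
 * The descriptor, buffer and destination are placeholders.
 *
 *      struct pollfd pfd = { .fd = rds_fd, .events = POLLIN | POLLOUT };
 *      char buf[1024];
 *
 *      for (;;) {
 *              poll(&pfd, 1, -1);
 *
 *              if (pfd.revents & POLLIN) {
 *                      // data, a congestion update or a queued notification
 *                      recv(rds_fd, buf, sizeof(buf), 0);
 *              }
 *              if (pfd.revents & POLLOUT &&
 *                  sendto(rds_fd, buf, sizeof(buf), 0,
 *                         (struct sockaddr *)&dest, sizeof(dest)) < 0 &&
 *                  errno == ENOBUFS)
 *                      continue;       // destination congested; poll again
 *      }
 */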
static __poll_t rds_poll(struct file *file, struct socket *sock,
                         poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct rds_sock *rs = rds_sk_to_rs(sk);
        __poll_t mask = 0;
        unsigned long flags;

        poll_wait(file, sk_sleep(sk), wait);

        if (rs->rs_seen_congestion)
                poll_wait(file, &rds_poll_waitq, wait);

        read_lock_irqsave(&rs->rs_recv_lock, flags);
        if (!rs->rs_cong_monitor) {
                /* When a congestion map was updated, we signal EPOLLIN for
                 * "historical" reasons. Applications can also poll for
                 * WRBAND instead. */
                if (rds_cong_updated_since(&rs->rs_cong_track))
                        mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
        } else {
                spin_lock(&rs->rs_lock);
                if (rs->rs_cong_notify)
                        mask |= (EPOLLIN | EPOLLRDNORM);
                spin_unlock(&rs->rs_lock);
        }
        if (!list_empty(&rs->rs_recv_queue) ||
            !list_empty(&rs->rs_notify_queue) ||
            !list_empty(&rs->rs_zcookie_queue.zcookie_head))
                mask |= (EPOLLIN | EPOLLRDNORM);
        if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
                mask |= (EPOLLOUT | EPOLLWRNORM);
        if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
                mask |= POLLERR;
        read_unlock_irqrestore(&rs->rs_recv_lock, flags);

        /* clear state any time we wake a seen-congested socket */
        if (mask)
                rs->rs_seen_congestion = 0;

        return mask;
}

static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        rds_tos_t utos, tos = 0;

        switch (cmd) {
        case SIOCRDSSETTOS:
                if (get_user(utos, (rds_tos_t __user *)arg))
                        return -EFAULT;

                if (rs->rs_transport &&
                    rs->rs_transport->get_tos_map)
                        tos = rs->rs_transport->get_tos_map(utos);
                else
                        return -ENOIOCTLCMD;

                spin_lock_bh(&rds_sock_lock);
                if (rs->rs_tos || rs->rs_conn) {
                        spin_unlock_bh(&rds_sock_lock);
                        return -EINVAL;
                }
                rs->rs_tos = tos;
                spin_unlock_bh(&rds_sock_lock);
                break;
        case SIOCRDSGETTOS:
                spin_lock_bh(&rds_sock_lock);
                tos = rs->rs_tos;
                spin_unlock_bh(&rds_sock_lock);
                if (put_user(tos, (rds_tos_t __user *)arg))
                        return -EFAULT;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return 0;
}
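
/*
 * Illustrative userspace sketch (not part of this file) for the TOS ioctls
 * handled above.  It assumes the SIOCRDSSETTOS/SIOCRDSGETTOS request codes
 * from the RDS UAPI header.  The TOS can only be set once a transport is
 * attached (via bind or SO_RDS_TRANSPORT) and before any connection exists.
 *
 *      unsigned char tos = 3;  // hypothetical service level
 *
 *      if (ioctl(rds_fd, SIOCRDSSETTOS, &tos) < 0)
 *              perror("SIOCRDSSETTOS");        // fails if the transport has
 *                                              // no TOS mapping, or if a TOS
 *                                              // or connection already exists
 *
 *      ioctl(rds_fd, SIOCRDSGETTOS, &tos);     // read the current value back
 */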

static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
                              int len)
{
        struct sockaddr_in6 sin6;
        struct sockaddr_in sin;
        int ret = 0;

        /* racing with another thread binding seems ok here */
        if (ipv6_addr_any(&rs->rs_bound_addr)) {
                ret = -ENOTCONN; /* XXX not a great errno */
                goto out;
        }

        if (len < sizeof(struct sockaddr_in)) {
                ret = -EINVAL;
                goto out;
        } else if (len < sizeof(struct sockaddr_in6)) {
                /* Assume IPv4 */
                if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
                        ret = -EFAULT;
                        goto out;
                }
                ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
                sin6.sin6_port = sin.sin_port;
        } else {
                if (copy_from_user(&sin6, optval,
                                   sizeof(struct sockaddr_in6))) {
                        ret = -EFAULT;
                        goto out;
                }
        }

        rds_send_drop_to(rs, &sin6);
out:
        return ret;
}
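
/*
 * Illustrative userspace sketch (not part of this file): dropping messages
 * queued to one destination via the RDS_CANCEL_SENT_TO setsockopt handled
 * above.  An IPv4 sockaddr is accepted and mapped to IPv6 internally; the
 * peer address and port below are placeholders.
 *
 *      struct sockaddr_in dest = {
 *              .sin_family = AF_INET,
 *              .sin_addr.s_addr = inet_addr("192.0.2.10"),
 *              .sin_port = htons(18634),
 *      };
 *
 *      setsockopt(rds_fd, SOL_RDS, RDS_CANCEL_SENT_TO, &dest, sizeof(dest));
 */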

static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
                               int optlen)
{
        int value;

        if (optlen < sizeof(int))
                return -EINVAL;
        if (get_user(value, (int __user *) optval))
                return -EFAULT;
        *optvar = !!value;
        return 0;
}

static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
                            int optlen)
{
        int ret;

        ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
        if (ret == 0) {
                if (rs->rs_cong_monitor) {
                        rds_cong_add_socket(rs);
                } else {
                        rds_cong_remove_socket(rs);
                        rs->rs_cong_mask = 0;
                        rs->rs_cong_notify = 0;
                }
        }
        return ret;
}

static int rds_set_transport(struct rds_sock *rs, char __user *optval,
                             int optlen)
{
        int t_type;

        if (rs->rs_transport)
                return -EOPNOTSUPP; /* previously attached to transport */

        if (optlen != sizeof(int))
                return -EINVAL;

        if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type)))
                return -EFAULT;

        if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
                return -EINVAL;

        rs->rs_transport = rds_trans_get(t_type);

        return rs->rs_transport ? 0 : -ENOPROTOOPT;
}
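
/*
 * Illustrative userspace sketch (not part of this file): pinning a socket to
 * a specific transport with SO_RDS_TRANSPORT before binding.  The transport
 * ids (RDS_TRANS_IB, RDS_TRANS_TCP, ...) come from the RDS UAPI header; the
 * option can be set only once per socket.
 *
 *      int trans = RDS_TRANS_TCP;
 *      socklen_t len = sizeof(trans);
 *
 *      if (setsockopt(rds_fd, SOL_RDS, SO_RDS_TRANSPORT,
 *                     &trans, sizeof(trans)) < 0)
 *              perror("SO_RDS_TRANSPORT");     // e.g. transport not available
 *
 *      // Read it back; RDS_TRANS_NONE means no transport is attached yet.
 *      getsockopt(rds_fd, SOL_RDS, SO_RDS_TRANSPORT, &trans, &len);
 */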

static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
                                 int optlen, int optname)
{
        int val, valbool;

        if (optlen != sizeof(int))
                return -EFAULT;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val ? 1 : 0;

        if (optname == SO_TIMESTAMP_NEW)
                sock_set_flag(sk, SOCK_TSTAMP_NEW);

        if (valbool)
                sock_set_flag(sk, SOCK_RCVTSTAMP);
        else
                sock_reset_flag(sk, SOCK_RCVTSTAMP);

        return 0;
}

static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
                                  int optlen)
{
        struct rds_rx_trace_so trace;
        int i;

        if (optlen != sizeof(struct rds_rx_trace_so))
                return -EFAULT;

        if (copy_from_user(&trace, optval, sizeof(trace)))
                return -EFAULT;

        if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
                return -EFAULT;

        rs->rs_rx_traces = trace.rx_traces;
        for (i = 0; i < rs->rs_rx_traces; i++) {
                if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
                        rs->rs_rx_traces = 0;
                        return -EFAULT;
                }
                rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
        }

        return 0;
}

static int rds_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        int ret;

        if (level != SOL_RDS) {
                ret = -ENOPROTOOPT;
                goto out;
        }

        switch (optname) {
        case RDS_CANCEL_SENT_TO:
                ret = rds_cancel_sent_to(rs, optval, optlen);
                break;
        case RDS_GET_MR:
                ret = rds_get_mr(rs, optval, optlen);
                break;
        case RDS_GET_MR_FOR_DEST:
                ret = rds_get_mr_for_dest(rs, optval, optlen);
                break;
        case RDS_FREE_MR:
                ret = rds_free_mr(rs, optval, optlen);
                break;
        case RDS_RECVERR:
                ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
                break;
        case RDS_CONG_MONITOR:
                ret = rds_cong_monitor(rs, optval, optlen);
                break;
        case SO_RDS_TRANSPORT:
                lock_sock(sock->sk);
                ret = rds_set_transport(rs, optval, optlen);
                release_sock(sock->sk);
                break;
        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
                lock_sock(sock->sk);
                ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
                release_sock(sock->sk);
                break;
        case SO_RDS_MSG_RXPATH_LATENCY:
                ret = rds_recv_track_latency(rs, optval, optlen);
                break;
        default:
                ret = -ENOPROTOOPT;
        }
out:
        return ret;
}

static int rds_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        int ret = -ENOPROTOOPT, len;
        int trans;

        if (level != SOL_RDS)
                goto out;

        if (get_user(len, optlen)) {
                ret = -EFAULT;
                goto out;
        }

        switch (optname) {
        case RDS_INFO_FIRST ... RDS_INFO_LAST:
                ret = rds_info_getsockopt(sock, optname, optval,
                                          optlen);
                break;

        case RDS_RECVERR:
                if (len < sizeof(int))
                        ret = -EINVAL;
                else
                if (put_user(rs->rs_recverr, (int __user *) optval) ||
                    put_user(sizeof(int), optlen))
                        ret = -EFAULT;
                else
                        ret = 0;
                break;
        case SO_RDS_TRANSPORT:
                if (len < sizeof(int)) {
                        ret = -EINVAL;
                        break;
                }
                trans = (rs->rs_transport ? rs->rs_transport->t_type :
                         RDS_TRANS_NONE); /* unbound */
                if (put_user(trans, (int __user *)optval) ||
                    put_user(sizeof(int), optlen))
                        ret = -EFAULT;
                else
                        ret = 0;
                break;
        default:
                break;
        }

out:
        return ret;

}

static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
                       int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        struct sockaddr_in *sin;
        struct rds_sock *rs = rds_sk_to_rs(sk);
        int ret = 0;

        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        lock_sock(sk);

        switch (uaddr->sa_family) {
        case AF_INET:
                sin = (struct sockaddr_in *)uaddr;
                if (addr_len < sizeof(struct sockaddr_in)) {
                        ret = -EINVAL;
                        break;
                }
                if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
                        ret = -EDESTADDRREQ;
                        break;
                }
                if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
                    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
                        ret = -EINVAL;
                        break;
                }
                ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
                rs->rs_conn_port = sin->sin_port;
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6: {
                struct sockaddr_in6 *sin6;
                int addr_type;

                sin6 = (struct sockaddr_in6 *)uaddr;
                if (addr_len < sizeof(struct sockaddr_in6)) {
                        ret = -EINVAL;
                        break;
                }
                addr_type = ipv6_addr_type(&sin6->sin6_addr);
                if (!(addr_type & IPV6_ADDR_UNICAST)) {
                        __be32 addr4;

                        if (!(addr_type & IPV6_ADDR_MAPPED)) {
                                ret = -EPROTOTYPE;
                                break;
                        }

                        /* It is a mapped address.  Need to do some sanity
                         * checks.
                         */
                        addr4 = sin6->sin6_addr.s6_addr32[3];
                        if (addr4 == htonl(INADDR_ANY) ||
                            addr4 == htonl(INADDR_BROADCAST) ||
                            IN_MULTICAST(ntohl(addr4))) {
                                ret = -EPROTOTYPE;
                                break;
                        }
                }

                if (addr_type & IPV6_ADDR_LINKLOCAL) {
                        /* If the socket is already bound to a link-local
                         * address, the peer address must be on the same link.
                         */
                        if (sin6->sin6_scope_id == 0 ||
                            (!ipv6_addr_any(&rs->rs_bound_addr) &&
                             rs->rs_bound_scope_id &&
                             sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
                                ret = -EINVAL;
                                break;
                        }
                        /* Remember the connected address scope ID.  It will
                         * be checked against the binding local address when
                         * the socket is bound.
                         */
                        rs->rs_bound_scope_id = sin6->sin6_scope_id;
                }
                rs->rs_conn_addr = sin6->sin6_addr;
                rs->rs_conn_port = sin6->sin6_port;
                break;
        }
#endif

        default:
                ret = -EAFNOSUPPORT;
                break;
        }

        release_sock(sk);
        return ret;
}
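
/*
 * Illustrative userspace sketch (not part of this file): creating, binding
 * and connecting an RDS socket as validated above.  connect() only records
 * the default destination; datagrams can still be sent elsewhere with
 * sendmsg().  Both addresses and the port are placeholders and must be
 * unicast (link-local IPv6 additionally needs a non-zero scope id).
 *
 *      int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
 *
 *      struct sockaddr_in laddr = {
 *              .sin_family = AF_INET,
 *              .sin_addr.s_addr = inet_addr("192.0.2.1"),      // local RDS-capable NIC
 *              .sin_port = htons(18634),
 *      };
 *      struct sockaddr_in faddr = {
 *              .sin_family = AF_INET,
 *              .sin_addr.s_addr = inet_addr("192.0.2.2"),      // remote peer
 *              .sin_port = htons(18634),
 *      };
 *
 *      bind(fd, (struct sockaddr *)&laddr, sizeof(laddr));
 *      connect(fd, (struct sockaddr *)&faddr, sizeof(faddr));
 *      // send()/recv() now use faddr as the implicit destination
 */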

static struct proto rds_proto = {
        .name     = "RDS",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct rds_sock),
};

static const struct proto_ops rds_proto_ops = {
        .family =       AF_RDS,
        .owner =        THIS_MODULE,
        .release =      rds_release,
        .bind =         rds_bind,
        .connect =      rds_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      rds_getname,
        .poll =         rds_poll,
        .ioctl =        rds_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   rds_setsockopt,
        .getsockopt =   rds_getsockopt,
        .sendmsg =      rds_sendmsg,
        .recvmsg =      rds_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};

static void rds_sock_destruct(struct sock *sk)
{
        struct rds_sock *rs = rds_sk_to_rs(sk);

        WARN_ON((&rs->rs_item != rs->rs_item.next ||
                 &rs->rs_item != rs->rs_item.prev));
}

static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
        struct rds_sock *rs;

        sock_init_data(sock, sk);
        sock->ops               = &rds_proto_ops;
        sk->sk_protocol         = protocol;
        sk->sk_destruct         = rds_sock_destruct;

        rs = rds_sk_to_rs(sk);
        spin_lock_init(&rs->rs_lock);
        rwlock_init(&rs->rs_recv_lock);
        INIT_LIST_HEAD(&rs->rs_send_queue);
        INIT_LIST_HEAD(&rs->rs_recv_queue);
        INIT_LIST_HEAD(&rs->rs_notify_queue);
        INIT_LIST_HEAD(&rs->rs_cong_list);
        rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
        spin_lock_init(&rs->rs_rdma_lock);
        rs->rs_rdma_keys = RB_ROOT;
        rs->rs_rx_traces = 0;
        rs->rs_tos = 0;
        rs->rs_conn = NULL;

        spin_lock_bh(&rds_sock_lock);
        list_add_tail(&rs->rs_item, &rds_sock_list);
        rds_sock_count++;
        spin_unlock_bh(&rds_sock_lock);

        return 0;
}

static int rds_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;

        if (sock->type != SOCK_SEQPACKET || protocol)
                return -ESOCKTNOSUPPORT;

        sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern);
        if (!sk)
                return -ENOMEM;

        return __rds_create(sock, sk, protocol);
}

void rds_sock_addref(struct rds_sock *rs)
{
        sock_hold(rds_rs_to_sk(rs));
}

void rds_sock_put(struct rds_sock *rs)
{
        sock_put(rds_rs_to_sk(rs));
}

static const struct net_proto_family rds_family_ops = {
        .family =       AF_RDS,
        .create =       rds_create,
        .owner  =       THIS_MODULE,
};

static void rds_sock_inc_info(struct socket *sock, unsigned int len,
                              struct rds_info_iterator *iter,
                              struct rds_info_lengths *lens)
{
        struct rds_sock *rs;
        struct rds_incoming *inc;
        unsigned int total = 0;

        len /= sizeof(struct rds_info_message);

        spin_lock_bh(&rds_sock_lock);

        list_for_each_entry(rs, &rds_sock_list, rs_item) {
                read_lock(&rs->rs_recv_lock);

                /* XXX too lazy to maintain counts.. */
                list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
                        total++;
                        if (total <= len)
                                rds_inc_info_copy(inc, iter,
                                                  inc->i_saddr.s6_addr32[3],
                                                  rs->rs_bound_addr_v4,
                                                  1);
                }

                read_unlock(&rs->rs_recv_lock);
        }

        spin_unlock_bh(&rds_sock_lock);

        lens->nr = total;
        lens->each = sizeof(struct rds_info_message);
}

static void rds_sock_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens)
{
        struct rds_info_socket sinfo;
        struct rds_sock *rs;

        len /= sizeof(struct rds_info_socket);

        spin_lock_bh(&rds_sock_lock);

        if (len < rds_sock_count)
                goto out;

        list_for_each_entry(rs, &rds_sock_list, rs_item) {
                sinfo.sndbuf = rds_sk_sndbuf(rs);
                sinfo.rcvbuf = rds_sk_rcvbuf(rs);
                sinfo.bound_addr = rs->rs_bound_addr_v4;
                sinfo.connected_addr = rs->rs_conn_addr_v4;
                sinfo.bound_port = rs->rs_bound_port;
                sinfo.connected_port = rs->rs_conn_port;
                sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));

                rds_info_copy(iter, &sinfo, sizeof(sinfo));
        }

out:
        lens->nr = rds_sock_count;
        lens->each = sizeof(struct rds_info_socket);

        spin_unlock_bh(&rds_sock_lock);
}

static void rds_exit(void)
{
        sock_unregister(rds_family_ops.family);
        proto_unregister(&rds_proto);
        rds_conn_exit();
        rds_cong_exit();
        rds_sysctl_exit();
        rds_threads_exit();
        rds_stats_exit();
        rds_page_exit();
        rds_bind_lock_destroy();
        rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
        rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
}
module_exit(rds_exit);

u32 rds_gen_num;

static int rds_init(void)
{
        int ret;

        net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));

        ret = rds_bind_lock_init();
        if (ret)
                goto out;

        ret = rds_conn_init();
        if (ret)
                goto out_bind;

        ret = rds_threads_init();
        if (ret)
                goto out_conn;
        ret = rds_sysctl_init();
        if (ret)
                goto out_threads;
        ret = rds_stats_init();
        if (ret)
                goto out_sysctl;
        ret = proto_register(&rds_proto, 1);
        if (ret)
                goto out_stats;
        ret = sock_register(&rds_family_ops);
        if (ret)
                goto out_proto;

        rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
        rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);

        goto out;

out_proto:
        proto_unregister(&rds_proto);
out_stats:
        rds_stats_exit();
out_sysctl:
        rds_sysctl_exit();
out_threads:
        rds_threads_exit();
out_conn:
        rds_conn_exit();
        rds_cong_exit();
        rds_page_exit();
out_bind:
        rds_bind_lock_destroy();
out:
        return ret;
}
module_init(rds_init);

#define DRV_VERSION     "4.0"
#define DRV_RELDATE     "Feb 12, 2009"

MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
                   " v" DRV_VERSION " (" DRV_RELDATE ")");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NETPROTO(PF_RDS);