linux/net/rds/af_rds.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 *
  32 */
  33#include <linux/module.h>
  34#include <linux/errno.h>
  35#include <linux/kernel.h>
  36#include <linux/gfp.h>
  37#include <linux/in.h>
  38#include <linux/ipv6.h>
  39#include <linux/poll.h>
  40#include <net/sock.h>
  41
  42#include "rds.h"
  43
  44/* this is just used for stats gathering :/ */
  45static DEFINE_SPINLOCK(rds_sock_lock);
  46static unsigned long rds_sock_count;
  47static LIST_HEAD(rds_sock_list);
  48DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
  49
  50/*
  51 * This is called as the final descriptor referencing this socket is closed.
  52 * We have to unbind the socket so that another socket can be bound to the
  53 * address it was using.
  54 *
  55 * We have to be careful about racing with the incoming path.  sock_orphan()
  56 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
  57 * messages shouldn't be queued.
  58 */
  59static int rds_release(struct socket *sock)
  60{
  61        struct sock *sk = sock->sk;
  62        struct rds_sock *rs;
  63
  64        if (!sk)
  65                goto out;
  66
  67        rs = rds_sk_to_rs(sk);
  68
  69        sock_orphan(sk);
  70        /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
  71         * that ensures the recv path has completed messing
  72         * with the socket. */
  73        rds_clear_recv_queue(rs);
  74        rds_cong_remove_socket(rs);
  75
  76        rds_remove_bound(rs);
  77
  78        rds_send_drop_to(rs, NULL);
  79        rds_rdma_drop_keys(rs);
  80        rds_notify_queue_get(rs, NULL);
  81        rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);
  82
  83        spin_lock_bh(&rds_sock_lock);
  84        list_del_init(&rs->rs_item);
  85        rds_sock_count--;
  86        spin_unlock_bh(&rds_sock_lock);
  87
  88        rds_trans_put(rs->rs_transport);
  89
  90        sock->sk = NULL;
  91        sock_put(sk);
  92out:
  93        return 0;
  94}
  95
  96/*
  97 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
  98 * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
  99 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 100 * this seems more conservative.
 101 * NB - normally, one would use sk_callback_lock for this, but we can
 102 * get here from interrupts, whereas the network code grabs sk_callback_lock
 103 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 104 */
 105void rds_wake_sk_sleep(struct rds_sock *rs)
 106{
 107        unsigned long flags;
 108
 109        read_lock_irqsave(&rs->rs_recv_lock, flags);
 110        __rds_wake_sk_sleep(rds_rs_to_sk(rs));
 111        read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 112}
 113
 114static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
 115                       int peer)
 116{
 117        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
 118        struct sockaddr_in6 *sin6;
 119        struct sockaddr_in *sin;
 120        int uaddr_len;
 121
 122        /* racey, don't care */
 123        if (peer) {
 124                if (ipv6_addr_any(&rs->rs_conn_addr))
 125                        return -ENOTCONN;
 126
 127                if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
 128                        sin = (struct sockaddr_in *)uaddr;
 129                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 130                        sin->sin_family = AF_INET;
 131                        sin->sin_port = rs->rs_conn_port;
 132                        sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
 133                        uaddr_len = sizeof(*sin);
 134                } else {
 135                        sin6 = (struct sockaddr_in6 *)uaddr;
 136                        sin6->sin6_family = AF_INET6;
 137                        sin6->sin6_port = rs->rs_conn_port;
 138                        sin6->sin6_addr = rs->rs_conn_addr;
 139                        sin6->sin6_flowinfo = 0;
 140                        /* scope_id is the same as in the bound address. */
 141                        sin6->sin6_scope_id = rs->rs_bound_scope_id;
 142                        uaddr_len = sizeof(*sin6);
 143                }
 144        } else {
 145                /* If socket is not yet bound and the socket is connected,
 146                 * set the return address family to be the same as the
 147                 * connected address, but with 0 address value.  If it is not
 148                 * connected, set the family to be AF_UNSPEC (value 0) and
 149                 * the address size to be that of an IPv4 address.
 150                 */
 151                if (ipv6_addr_any(&rs->rs_bound_addr)) {
 152                        if (ipv6_addr_any(&rs->rs_conn_addr)) {
 153                                sin = (struct sockaddr_in *)uaddr;
 154                                memset(sin, 0, sizeof(*sin));
 155                                sin->sin_family = AF_UNSPEC;
 156                                return sizeof(*sin);
 157                        }
 158
 159#if IS_ENABLED(CONFIG_IPV6)
 160                        if (!(ipv6_addr_type(&rs->rs_conn_addr) &
 161                              IPV6_ADDR_MAPPED)) {
 162                                sin6 = (struct sockaddr_in6 *)uaddr;
 163                                memset(sin6, 0, sizeof(*sin6));
 164                                sin6->sin6_family = AF_INET6;
 165                                return sizeof(*sin6);
 166                        }
 167#endif
 168
 169                        sin = (struct sockaddr_in *)uaddr;
 170                        memset(sin, 0, sizeof(*sin));
 171                        sin->sin_family = AF_INET;
 172                        return sizeof(*sin);
 173                }
 174                if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
 175                        sin = (struct sockaddr_in *)uaddr;
 176                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 177                        sin->sin_family = AF_INET;
 178                        sin->sin_port = rs->rs_bound_port;
 179                        sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
 180                        uaddr_len = sizeof(*sin);
 181                } else {
 182                        sin6 = (struct sockaddr_in6 *)uaddr;
 183                        sin6->sin6_family = AF_INET6;
 184                        sin6->sin6_port = rs->rs_bound_port;
 185                        sin6->sin6_addr = rs->rs_bound_addr;
 186                        sin6->sin6_flowinfo = 0;
 187                        sin6->sin6_scope_id = rs->rs_bound_scope_id;
 188                        uaddr_len = sizeof(*sin6);
 189                }
 190        }
 191
 192        return uaddr_len;
 193}
 194
 195/*
 196 * RDS' poll is without a doubt the least intuitive part of the interface,
 197 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
 198 * a network protocol.
 199 *
 200 * EPOLLIN is asserted if
 201 *  -   there is data on the receive queue.
 202 *  -   to signal that a previously congested destination may have become
 203 *      uncongested
 204 *  -   A notification has been queued to the socket (this can be a congestion
 205 *      update, or a RDMA completion, or a MSG_ZEROCOPY completion).
 206 *
 207 * EPOLLOUT is asserted if there is room on the send queue. This does not mean
 208 * however, that the next sendmsg() call will succeed. If the application tries
 209 * to send to a congested destination, the system call may still fail (and
 210 * return ENOBUFS).
 211 */
 212static __poll_t rds_poll(struct file *file, struct socket *sock,
 213                             poll_table *wait)
 214{
 215        struct sock *sk = sock->sk;
 216        struct rds_sock *rs = rds_sk_to_rs(sk);
 217        __poll_t mask = 0;
 218        unsigned long flags;
 219
 220        poll_wait(file, sk_sleep(sk), wait);
 221
 222        if (rs->rs_seen_congestion)
 223                poll_wait(file, &rds_poll_waitq, wait);
 224
 225        read_lock_irqsave(&rs->rs_recv_lock, flags);
 226        if (!rs->rs_cong_monitor) {
 227                /* When a congestion map was updated, we signal EPOLLIN for
 228                 * "historical" reasons. Applications can also poll for
 229                 * WRBAND instead. */
 230                if (rds_cong_updated_since(&rs->rs_cong_track))
 231                        mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
 232        } else {
 233                spin_lock(&rs->rs_lock);
 234                if (rs->rs_cong_notify)
 235                        mask |= (EPOLLIN | EPOLLRDNORM);
 236                spin_unlock(&rs->rs_lock);
 237        }
 238        if (!list_empty(&rs->rs_recv_queue) ||
 239            !list_empty(&rs->rs_notify_queue) ||
 240            !list_empty(&rs->rs_zcookie_queue.zcookie_head))
 241                mask |= (EPOLLIN | EPOLLRDNORM);
 242        if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
 243                mask |= (EPOLLOUT | EPOLLWRNORM);
 244        if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 245                mask |= POLLERR;
 246        read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 247
 248        /* clear state any time we wake a seen-congested socket */
 249        if (mask)
 250                rs->rs_seen_congestion = 0;
 251
 252        return mask;
 253}
 254
 255static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 256{
 257        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
 258        rds_tos_t utos, tos = 0;
 259
 260        switch (cmd) {
 261        case SIOCRDSSETTOS:
 262                if (get_user(utos, (rds_tos_t __user *)arg))
 263                        return -EFAULT;
 264
 265                if (rs->rs_transport &&
 266                    rs->rs_transport->get_tos_map)
 267                        tos = rs->rs_transport->get_tos_map(utos);
 268                else
 269                        return -ENOIOCTLCMD;
 270
 271                spin_lock_bh(&rds_sock_lock);
 272                if (rs->rs_tos || rs->rs_conn) {
 273                        spin_unlock_bh(&rds_sock_lock);
 274                        return -EINVAL;
 275                }
 276                rs->rs_tos = tos;
 277                spin_unlock_bh(&rds_sock_lock);
 278                break;
 279        case SIOCRDSGETTOS:
 280                spin_lock_bh(&rds_sock_lock);
 281                tos = rs->rs_tos;
 282                spin_unlock_bh(&rds_sock_lock);
 283                if (put_user(tos, (rds_tos_t __user *)arg))
 284                        return -EFAULT;
 285                break;
 286        default:
 287                return -ENOIOCTLCMD;
 288        }
 289
 290        return 0;
 291}
 292
 293static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
 294{
 295        struct sockaddr_in6 sin6;
 296        struct sockaddr_in sin;
 297        int ret = 0;
 298
 299        /* racing with another thread binding seems ok here */
 300        if (ipv6_addr_any(&rs->rs_bound_addr)) {
 301                ret = -ENOTCONN; /* XXX not a great errno */
 302                goto out;
 303        }
 304
 305        if (len < sizeof(struct sockaddr_in)) {
 306                ret = -EINVAL;
 307                goto out;
 308        } else if (len < sizeof(struct sockaddr_in6)) {
 309                /* Assume IPv4 */
 310                if (copy_from_sockptr(&sin, optval,
 311                                sizeof(struct sockaddr_in))) {
 312                        ret = -EFAULT;
 313                        goto out;
 314                }
 315                ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
 316                sin6.sin6_port = sin.sin_port;
 317        } else {
 318                if (copy_from_sockptr(&sin6, optval,
 319                                   sizeof(struct sockaddr_in6))) {
 320                        ret = -EFAULT;
 321                        goto out;
 322                }
 323        }
 324
 325        rds_send_drop_to(rs, &sin6);
 326out:
 327        return ret;
 328}
 329
 330static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
 331                               int optlen)
 332{
 333        int value;
 334
 335        if (optlen < sizeof(int))
 336                return -EINVAL;
 337        if (copy_from_sockptr(&value, optval, sizeof(int)))
 338                return -EFAULT;
 339        *optvar = !!value;
 340        return 0;
 341}
 342
 343static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
 344{
 345        int ret;
 346
 347        ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
 348        if (ret == 0) {
 349                if (rs->rs_cong_monitor) {
 350                        rds_cong_add_socket(rs);
 351                } else {
 352                        rds_cong_remove_socket(rs);
 353                        rs->rs_cong_mask = 0;
 354                        rs->rs_cong_notify = 0;
 355                }
 356        }
 357        return ret;
 358}
 359
 360static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen)
 361{
 362        int t_type;
 363
 364        if (rs->rs_transport)
 365                return -EOPNOTSUPP; /* previously attached to transport */
 366
 367        if (optlen != sizeof(int))
 368                return -EINVAL;
 369
 370        if (copy_from_sockptr(&t_type, optval, sizeof(t_type)))
 371                return -EFAULT;
 372
 373        if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
 374                return -EINVAL;
 375
 376        rs->rs_transport = rds_trans_get(t_type);
 377
 378        return rs->rs_transport ? 0 : -ENOPROTOOPT;
 379}
 380
 381static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
 382                                 int optlen, int optname)
 383{
 384        int val, valbool;
 385
 386        if (optlen != sizeof(int))
 387                return -EFAULT;
 388
 389        if (copy_from_sockptr(&val, optval, sizeof(int)))
 390                return -EFAULT;
 391
 392        valbool = val ? 1 : 0;
 393
 394        if (optname == SO_TIMESTAMP_NEW)
 395                sock_set_flag(sk, SOCK_TSTAMP_NEW);
 396
 397        if (valbool)
 398                sock_set_flag(sk, SOCK_RCVTSTAMP);
 399        else
 400                sock_reset_flag(sk, SOCK_RCVTSTAMP);
 401
 402        return 0;
 403}
 404
 405static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
 406                                  int optlen)
 407{
 408        struct rds_rx_trace_so trace;
 409        int i;
 410
 411        if (optlen != sizeof(struct rds_rx_trace_so))
 412                return -EFAULT;
 413
 414        if (copy_from_sockptr(&trace, optval, sizeof(trace)))
 415                return -EFAULT;
 416
 417        if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
 418                return -EFAULT;
 419
 420        rs->rs_rx_traces = trace.rx_traces;
 421        for (i = 0; i < rs->rs_rx_traces; i++) {
 422                if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
 423                        rs->rs_rx_traces = 0;
 424                        return -EFAULT;
 425                }
 426                rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
 427        }
 428
 429        return 0;
 430}
 431
 432static int rds_setsockopt(struct socket *sock, int level, int optname,
 433                          sockptr_t optval, unsigned int optlen)
 434{
 435        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
 436        int ret;
 437
 438        if (level != SOL_RDS) {
 439                ret = -ENOPROTOOPT;
 440                goto out;
 441        }
 442
 443        switch (optname) {
 444        case RDS_CANCEL_SENT_TO:
 445                ret = rds_cancel_sent_to(rs, optval, optlen);
 446                break;
 447        case RDS_GET_MR:
 448                ret = rds_get_mr(rs, optval, optlen);
 449                break;
 450        case RDS_GET_MR_FOR_DEST:
 451                ret = rds_get_mr_for_dest(rs, optval, optlen);
 452                break;
 453        case RDS_FREE_MR:
 454                ret = rds_free_mr(rs, optval, optlen);
 455                break;
 456        case RDS_RECVERR:
 457                ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
 458                break;
 459        case RDS_CONG_MONITOR:
 460                ret = rds_cong_monitor(rs, optval, optlen);
 461                break;
 462        case SO_RDS_TRANSPORT:
 463                lock_sock(sock->sk);
 464                ret = rds_set_transport(rs, optval, optlen);
 465                release_sock(sock->sk);
 466                break;
 467        case SO_TIMESTAMP_OLD:
 468        case SO_TIMESTAMP_NEW:
 469                lock_sock(sock->sk);
 470                ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
 471                release_sock(sock->sk);
 472                break;
 473        case SO_RDS_MSG_RXPATH_LATENCY:
 474                ret = rds_recv_track_latency(rs, optval, optlen);
 475                break;
 476        default:
 477                ret = -ENOPROTOOPT;
 478        }
 479out:
 480        return ret;
 481}
 482
 483static int rds_getsockopt(struct socket *sock, int level, int optname,
 484                          char __user *optval, int __user *optlen)
 485{
 486        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
 487        int ret = -ENOPROTOOPT, len;
 488        int trans;
 489
 490        if (level != SOL_RDS)
 491                goto out;
 492
 493        if (get_user(len, optlen)) {
 494                ret = -EFAULT;
 495                goto out;
 496        }
 497
 498        switch (optname) {
 499        case RDS_INFO_FIRST ... RDS_INFO_LAST:
 500                ret = rds_info_getsockopt(sock, optname, optval,
 501                                          optlen);
 502                break;
 503
 504        case RDS_RECVERR:
 505                if (len < sizeof(int))
 506                        ret = -EINVAL;
 507                else
 508                if (put_user(rs->rs_recverr, (int __user *) optval) ||
 509                    put_user(sizeof(int), optlen))
 510                        ret = -EFAULT;
 511                else
 512                        ret = 0;
 513                break;
 514        case SO_RDS_TRANSPORT:
 515                if (len < sizeof(int)) {
 516                        ret = -EINVAL;
 517                        break;
 518                }
 519                trans = (rs->rs_transport ? rs->rs_transport->t_type :
 520                         RDS_TRANS_NONE); /* unbound */
 521                if (put_user(trans, (int __user *)optval) ||
 522                    put_user(sizeof(int), optlen))
 523                        ret = -EFAULT;
 524                else
 525                        ret = 0;
 526                break;
 527        default:
 528                break;
 529        }
 530
 531out:
 532        return ret;
 533
 534}
 535
 536static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
 537                       int addr_len, int flags)
 538{
 539        struct sock *sk = sock->sk;
 540        struct sockaddr_in *sin;
 541        struct rds_sock *rs = rds_sk_to_rs(sk);
 542        int ret = 0;
 543
 544        if (addr_len < offsetofend(struct sockaddr, sa_family))
 545                return -EINVAL;
 546
 547        lock_sock(sk);
 548
 549        switch (uaddr->sa_family) {
 550        case AF_INET:
 551                sin = (struct sockaddr_in *)uaddr;
 552                if (addr_len < sizeof(struct sockaddr_in)) {
 553                        ret = -EINVAL;
 554                        break;
 555                }
 556                if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
 557                        ret = -EDESTADDRREQ;
 558                        break;
 559                }
 560                if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
 561                    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
 562                        ret = -EINVAL;
 563                        break;
 564                }
 565                ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
 566                rs->rs_conn_port = sin->sin_port;
 567                break;
 568
 569#if IS_ENABLED(CONFIG_IPV6)
 570        case AF_INET6: {
 571                struct sockaddr_in6 *sin6;
 572                int addr_type;
 573
 574                sin6 = (struct sockaddr_in6 *)uaddr;
 575                if (addr_len < sizeof(struct sockaddr_in6)) {
 576                        ret = -EINVAL;
 577                        break;
 578                }
 579                addr_type = ipv6_addr_type(&sin6->sin6_addr);
 580                if (!(addr_type & IPV6_ADDR_UNICAST)) {
 581                        __be32 addr4;
 582
 583                        if (!(addr_type & IPV6_ADDR_MAPPED)) {
 584                                ret = -EPROTOTYPE;
 585                                break;
 586                        }
 587
 588                        /* It is a mapped address.  Need to do some sanity
 589                         * checks.
 590                         */
 591                        addr4 = sin6->sin6_addr.s6_addr32[3];
 592                        if (addr4 == htonl(INADDR_ANY) ||
 593                            addr4 == htonl(INADDR_BROADCAST) ||
 594                            ipv4_is_multicast(addr4)) {
 595                                ret = -EPROTOTYPE;
 596                                break;
 597                        }
 598                }
 599
 600                if (addr_type & IPV6_ADDR_LINKLOCAL) {
 601                        /* If socket is arleady bound to a link local address,
 602                         * the peer address must be on the same link.
 603                         */
 604                        if (sin6->sin6_scope_id == 0 ||
 605                            (!ipv6_addr_any(&rs->rs_bound_addr) &&
 606                             rs->rs_bound_scope_id &&
 607                             sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
 608                                ret = -EINVAL;
 609                                break;
 610                        }
 611                        /* Remember the connected address scope ID.  It will
 612                         * be checked against the binding local address when
 613                         * the socket is bound.
 614                         */
 615                        rs->rs_bound_scope_id = sin6->sin6_scope_id;
 616                }
 617                rs->rs_conn_addr = sin6->sin6_addr;
 618                rs->rs_conn_port = sin6->sin6_port;
 619                break;
 620        }
 621#endif
 622
 623        default:
 624                ret = -EAFNOSUPPORT;
 625                break;
 626        }
 627
 628        release_sock(sk);
 629        return ret;
 630}
 631
 632static struct proto rds_proto = {
 633        .name     = "RDS",
 634        .owner    = THIS_MODULE,
 635        .obj_size = sizeof(struct rds_sock),
 636};
 637
 638static const struct proto_ops rds_proto_ops = {
 639        .family =       AF_RDS,
 640        .owner =        THIS_MODULE,
 641        .release =      rds_release,
 642        .bind =         rds_bind,
 643        .connect =      rds_connect,
 644        .socketpair =   sock_no_socketpair,
 645        .accept =       sock_no_accept,
 646        .getname =      rds_getname,
 647        .poll =         rds_poll,
 648        .ioctl =        rds_ioctl,
 649        .listen =       sock_no_listen,
 650        .shutdown =     sock_no_shutdown,
 651        .setsockopt =   rds_setsockopt,
 652        .getsockopt =   rds_getsockopt,
 653        .sendmsg =      rds_sendmsg,
 654        .recvmsg =      rds_recvmsg,
 655        .mmap =         sock_no_mmap,
 656        .sendpage =     sock_no_sendpage,
 657};
 658
 659static void rds_sock_destruct(struct sock *sk)
 660{
 661        struct rds_sock *rs = rds_sk_to_rs(sk);
 662
 663        WARN_ON((&rs->rs_item != rs->rs_item.next ||
 664                 &rs->rs_item != rs->rs_item.prev));
 665}
 666
 667static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 668{
 669        struct rds_sock *rs;
 670
 671        sock_init_data(sock, sk);
 672        sock->ops               = &rds_proto_ops;
 673        sk->sk_protocol         = protocol;
 674        sk->sk_destruct         = rds_sock_destruct;
 675
 676        rs = rds_sk_to_rs(sk);
 677        spin_lock_init(&rs->rs_lock);
 678        rwlock_init(&rs->rs_recv_lock);
 679        INIT_LIST_HEAD(&rs->rs_send_queue);
 680        INIT_LIST_HEAD(&rs->rs_recv_queue);
 681        INIT_LIST_HEAD(&rs->rs_notify_queue);
 682        INIT_LIST_HEAD(&rs->rs_cong_list);
 683        rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
 684        spin_lock_init(&rs->rs_rdma_lock);
 685        rs->rs_rdma_keys = RB_ROOT;
 686        rs->rs_rx_traces = 0;
 687        rs->rs_tos = 0;
 688        rs->rs_conn = NULL;
 689
 690        spin_lock_bh(&rds_sock_lock);
 691        list_add_tail(&rs->rs_item, &rds_sock_list);
 692        rds_sock_count++;
 693        spin_unlock_bh(&rds_sock_lock);
 694
 695        return 0;
 696}
 697
 698static int rds_create(struct net *net, struct socket *sock, int protocol,
 699                      int kern)
 700{
 701        struct sock *sk;
 702
 703        if (sock->type != SOCK_SEQPACKET || protocol)
 704                return -ESOCKTNOSUPPORT;
 705
 706        sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern);
 707        if (!sk)
 708                return -ENOMEM;
 709
 710        return __rds_create(sock, sk, protocol);
 711}
 712
 713void rds_sock_addref(struct rds_sock *rs)
 714{
 715        sock_hold(rds_rs_to_sk(rs));
 716}
 717
 718void rds_sock_put(struct rds_sock *rs)
 719{
 720        sock_put(rds_rs_to_sk(rs));
 721}
 722
 723static const struct net_proto_family rds_family_ops = {
 724        .family =       AF_RDS,
 725        .create =       rds_create,
 726        .owner  =       THIS_MODULE,
 727};
 728
 729static void rds_sock_inc_info(struct socket *sock, unsigned int len,
 730                              struct rds_info_iterator *iter,
 731                              struct rds_info_lengths *lens)
 732{
 733        struct rds_sock *rs;
 734        struct rds_incoming *inc;
 735        unsigned int total = 0;
 736
 737        len /= sizeof(struct rds_info_message);
 738
 739        spin_lock_bh(&rds_sock_lock);
 740
 741        list_for_each_entry(rs, &rds_sock_list, rs_item) {
 742                /* This option only supports IPv4 sockets. */
 743                if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
 744                        continue;
 745
 746                read_lock(&rs->rs_recv_lock);
 747
 748                /* XXX too lazy to maintain counts.. */
 749                list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
 750                        total++;
 751                        if (total <= len)
 752                                rds_inc_info_copy(inc, iter,
 753                                                  inc->i_saddr.s6_addr32[3],
 754                                                  rs->rs_bound_addr_v4,
 755                                                  1);
 756                }
 757
 758                read_unlock(&rs->rs_recv_lock);
 759        }
 760
 761        spin_unlock_bh(&rds_sock_lock);
 762
 763        lens->nr = total;
 764        lens->each = sizeof(struct rds_info_message);
 765}
 766
 767#if IS_ENABLED(CONFIG_IPV6)
 768static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
 769                               struct rds_info_iterator *iter,
 770                               struct rds_info_lengths *lens)
 771{
 772        struct rds_incoming *inc;
 773        unsigned int total = 0;
 774        struct rds_sock *rs;
 775
 776        len /= sizeof(struct rds6_info_message);
 777
 778        spin_lock_bh(&rds_sock_lock);
 779
 780        list_for_each_entry(rs, &rds_sock_list, rs_item) {
 781                read_lock(&rs->rs_recv_lock);
 782
 783                list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
 784                        total++;
 785                        if (total <= len)
 786                                rds6_inc_info_copy(inc, iter, &inc->i_saddr,
 787                                                   &rs->rs_bound_addr, 1);
 788                }
 789
 790                read_unlock(&rs->rs_recv_lock);
 791        }
 792
 793        spin_unlock_bh(&rds_sock_lock);
 794
 795        lens->nr = total;
 796        lens->each = sizeof(struct rds6_info_message);
 797}
 798#endif
 799
 800static void rds_sock_info(struct socket *sock, unsigned int len,
 801                          struct rds_info_iterator *iter,
 802                          struct rds_info_lengths *lens)
 803{
 804        struct rds_info_socket sinfo;
 805        unsigned int cnt = 0;
 806        struct rds_sock *rs;
 807
 808        len /= sizeof(struct rds_info_socket);
 809
 810        spin_lock_bh(&rds_sock_lock);
 811
 812        if (len < rds_sock_count) {
 813                cnt = rds_sock_count;
 814                goto out;
 815        }
 816
 817        list_for_each_entry(rs, &rds_sock_list, rs_item) {
 818                /* This option only supports IPv4 sockets. */
 819                if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
 820                        continue;
 821                sinfo.sndbuf = rds_sk_sndbuf(rs);
 822                sinfo.rcvbuf = rds_sk_rcvbuf(rs);
 823                sinfo.bound_addr = rs->rs_bound_addr_v4;
 824                sinfo.connected_addr = rs->rs_conn_addr_v4;
 825                sinfo.bound_port = rs->rs_bound_port;
 826                sinfo.connected_port = rs->rs_conn_port;
 827                sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
 828
 829                rds_info_copy(iter, &sinfo, sizeof(sinfo));
 830                cnt++;
 831        }
 832
 833out:
 834        lens->nr = cnt;
 835        lens->each = sizeof(struct rds_info_socket);
 836
 837        spin_unlock_bh(&rds_sock_lock);
 838}
 839
 840#if IS_ENABLED(CONFIG_IPV6)
 841static void rds6_sock_info(struct socket *sock, unsigned int len,
 842                           struct rds_info_iterator *iter,
 843                           struct rds_info_lengths *lens)
 844{
 845        struct rds6_info_socket sinfo6;
 846        struct rds_sock *rs;
 847
 848        len /= sizeof(struct rds6_info_socket);
 849
 850        spin_lock_bh(&rds_sock_lock);
 851
 852        if (len < rds_sock_count)
 853                goto out;
 854
 855        list_for_each_entry(rs, &rds_sock_list, rs_item) {
 856                sinfo6.sndbuf = rds_sk_sndbuf(rs);
 857                sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
 858                sinfo6.bound_addr = rs->rs_bound_addr;
 859                sinfo6.connected_addr = rs->rs_conn_addr;
 860                sinfo6.bound_port = rs->rs_bound_port;
 861                sinfo6.connected_port = rs->rs_conn_port;
 862                sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));
 863
 864                rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
 865        }
 866
 867 out:
 868        lens->nr = rds_sock_count;
 869        lens->each = sizeof(struct rds6_info_socket);
 870
 871        spin_unlock_bh(&rds_sock_lock);
 872}
 873#endif
 874
 875static void rds_exit(void)
 876{
 877        sock_unregister(rds_family_ops.family);
 878        proto_unregister(&rds_proto);
 879        rds_conn_exit();
 880        rds_cong_exit();
 881        rds_sysctl_exit();
 882        rds_threads_exit();
 883        rds_stats_exit();
 884        rds_page_exit();
 885        rds_bind_lock_destroy();
 886        rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
 887        rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
 888#if IS_ENABLED(CONFIG_IPV6)
 889        rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
 890        rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
 891#endif
 892}
 893module_exit(rds_exit);
 894
 895u32 rds_gen_num;
 896
 897static int rds_init(void)
 898{
 899        int ret;
 900
 901        net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
 902
 903        ret = rds_bind_lock_init();
 904        if (ret)
 905                goto out;
 906
 907        ret = rds_conn_init();
 908        if (ret)
 909                goto out_bind;
 910
 911        ret = rds_threads_init();
 912        if (ret)
 913                goto out_conn;
 914        ret = rds_sysctl_init();
 915        if (ret)
 916                goto out_threads;
 917        ret = rds_stats_init();
 918        if (ret)
 919                goto out_sysctl;
 920        ret = proto_register(&rds_proto, 1);
 921        if (ret)
 922                goto out_stats;
 923        ret = sock_register(&rds_family_ops);
 924        if (ret)
 925                goto out_proto;
 926
 927        rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
 928        rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
 929#if IS_ENABLED(CONFIG_IPV6)
 930        rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
 931        rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
 932#endif
 933
 934        goto out;
 935
 936out_proto:
 937        proto_unregister(&rds_proto);
 938out_stats:
 939        rds_stats_exit();
 940out_sysctl:
 941        rds_sysctl_exit();
 942out_threads:
 943        rds_threads_exit();
 944out_conn:
 945        rds_conn_exit();
 946        rds_cong_exit();
 947        rds_page_exit();
 948out_bind:
 949        rds_bind_lock_destroy();
 950out:
 951        return ret;
 952}
 953module_init(rds_init);
 954
 955#define DRV_VERSION     "4.0"
 956#define DRV_RELDATE     "Feb 12, 2009"
 957
 958MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
 959MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
 960                   " v" DRV_VERSION " (" DRV_RELDATE ")");
 961MODULE_VERSION(DRV_VERSION);
 962MODULE_LICENSE("Dual BSD/GPL");
 963MODULE_ALIAS_NETPROTO(PF_RDS);
 964