linux/net/core/sock.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly,
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *              Pauline Middelink :     identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

 133/*
 134 * Each address family might have different locking rules, so we have
 135 * one slock key per address family:
 136 */
 137static struct lock_class_key af_family_keys[AF_MAX];
 138static struct lock_class_key af_family_slock_keys[AF_MAX];
 139
 140/*
 141 * Make lock validator output more readable. (we pre-construct these
 142 * strings build-time, so that runtime initialization of socket
 143 * locks is fast):
 144 */
 145static const char *const af_family_key_strings[AF_MAX+1] = {
 146  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 147  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 148  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 149  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 150  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 151  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 152  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 153  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 154  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 155  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 156  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 157  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 158  "sk_lock-AF_IEEE802154",
 159  "sk_lock-AF_MAX"
 160};
 161static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 162  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 163  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 164  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 165  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 166  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 167  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 168  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 169  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 170  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 171  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 172  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 173  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 174  "slock-AF_IEEE802154",
 175  "slock-AF_MAX"
 176};
 177static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 178  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 179  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 180  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 181  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 182  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 183  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 184  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 185  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 186  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 187  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 188  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 189  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 190  "clock-AF_IEEE802154",
 191  "clock-AF_MAX"
 192};
 193
 194/*
 195 * sk_callback_lock locking rules are per-address-family,
 196 * so split the lock classes by using a per-AF key:
 197 */
 198static struct lock_class_key af_callback_keys[AF_MAX];
 199
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

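/*
 * Worked example (illustrative): with _SK_MEM_PACKETS = 256 and a
 * hypothetical sizeof(struct sk_buff) of 240 bytes, the per-packet
 * overhead is 240 + 256 = 496 bytes, so
 *
 *      SK_WMEM_MAX = SK_RMEM_MAX = 496 * 256 = 126976 bytes (~124 KiB)
 *
 * The exact figure varies with the platform's sk_buff layout, which is
 * why these defaults are computed rather than hard-coded.
 */
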
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;
        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                *timeo_p = 0;
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
                               "tries to set negative timeout\n",
                                current->comm, task_pid_nr(current));
                }
                return 0;
        }
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}

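/*
 * Usage sketch (userspace, illustrative; not part of this file): the
 * timeouts parsed above arrive via SO_RCVTIMEO/SO_SNDTIMEO. With HZ=100
 * the conversion above yields 2*100 + 500000/10000 = 250 jiffies; any
 * sub-tick remainder in tv_usec is rounded up to the next tick.
 *
 *      struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *      setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */
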
static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk, int flag)
{
        if (sock_flag(sk, flag)) {
                sock_reset_flag(sk, flag);
                if (!sock_flag(sk, SOCK_TIMESTAMP) &&
                    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
                        net_disable_timestamp();
                }
        }
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = 0;
        int skb_len;

        /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
           the number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf) {
                err = -ENOMEM;
                goto out;
        }

        err = sk_filter(sk, skb);
        if (err)
                goto out;

        if (!sk_rmem_schedule(sk, skb->truesize)) {
                err = -ENOBUFS;
                goto out;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue.  Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        skb_queue_tail(&sk->sk_receive_queue, skb);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
out:
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb))
                goto discard_and_relse;

        skb->dev = NULL;

        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk->sk_dst_cache;

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk->sk_dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index;

        /* Sorry... */
        ret = -EPERM;
        if (!capable(CAP_NET_RAW))
                goto out;

        ret = -EINVAL;
        if (optlen < 0)
                goto out;

        /* Bind this socket to a particular device like "eth0",
         * as specified in the passed interface name. If the
         * name is "" or the option length is zero, the socket
         * is not bound.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        memset(devname, 0, sizeof(devname));

        ret = -EFAULT;
        if (copy_from_user(devname, optval, optlen))
                goto out;

        if (devname[0] == '\0') {
                index = 0;
        } else {
                struct net_device *dev = dev_get_by_name(net, devname);

                ret = -ENODEV;
                if (!dev)
                        goto out;

                index = dev->ifindex;
                dev_put(dev);
        }

        lock_sock(sk);
        sk->sk_bound_dev_if = index;
        sk_dst_reset(sk);
        release_sock(sk);

        ret = 0;

out:
#endif

        return ret;
}

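/*
 * Usage sketch (userspace, illustrative): binding a socket to one
 * interface, then clearing the binding again with an empty name. The
 * caller needs CAP_NET_RAW, as enforced above; "eth0" is just an
 * example interface name.
 *
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 4);
 *      ...
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */
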
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
        if (valbool)
                sock_set_flag(sk, bit);
        else
                sock_reset_flag(sk, bit);
}

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_bindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val ? 1 : 0;

        lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = valbool;
                break;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                ret = -ENOPROTOOPT;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this; BSD doesn't and, if you think
                   about it, this is right. Otherwise apps have to
                   play 'guess the biggest size' games. RCVBUF/SNDBUF
                   are treated in BSD as hints */

                if (val > sysctl_wmem_max)
                        val = sysctl_wmem_max;
set_sndbuf:
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                if ((val * 2) < SOCK_MIN_SNDBUF)
                        sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                else
                        sk->sk_sndbuf = val * 2;

                /*
                 *      Wake up sending tasks if we
                 *      upped the value.
                 */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this; BSD doesn't and, if you think
                   about it, this is right. Otherwise apps have to
                   play 'guess the biggest size' games. RCVBUF/SNDBUF
                   are treated in BSD as hints */

                if (val > sysctl_rmem_max)
                        val = sysctl_rmem_max;
set_rcvbuf:
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                /*
                 * We double it on the way in to account for
                 * "struct sk_buff" etc. overhead.   Applications
                 * assume that the SO_RCVBUF setting they make will
                 * allow that much actual data to be received on that
                 * socket.
                 *
                 * Applications are unaware that "struct sk_buff" and
                 * other overheads allocate from the receive buffer
                 * during socket buffer allocation.
                 *
                 * And after considering the possible alternatives,
                 * returning the value we actually used in getsockopt
                 * is the most desirable behavior.
                 */
                if ((val * 2) < SOCK_MIN_RCVBUF)
                        sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                else
                        sk->sk_rcvbuf = val * 2;
                break;

        case SO_RCVBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_rcvbuf;

        case SO_KEEPALIVE:
#ifdef CONFIG_INET
                if (sk->sk_protocol == IPPROTO_TCP)
                        tcp_set_keepalive(sk, valbool);
#endif
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check = valbool;
                break;

        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                        sk->sk_priority = val;
                else
                        ret = -EPERM;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;  /* 1003.1g */
                        break;
                }
                if (copy_from_user(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff)
                        sock_reset_flag(sk, SOCK_LINGER);
                else {
#if (BITS_PER_LONG == 32)
                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                        else
#endif
                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("setsockopt");
                break;

        case SO_PASSCRED:
                if (valbool)
                        set_bit(SOCK_PASSCRED, &sock->flags);
                else
                        clear_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_TIMESTAMP:
        case SO_TIMESTAMPNS:
                if (valbool)  {
                        if (optname == SO_TIMESTAMP)
                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                        else
                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
                        sock_set_flag(sk, SOCK_RCVTSTAMP);
                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
                } else {
                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                }
                break;

        case SO_TIMESTAMPING:
                if (val & ~SOF_TIMESTAMPING_MASK) {
                        ret = -EINVAL;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                        sock_enable_timestamp(sk,
                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
                else
                        sock_disable_timestamp(sk,
                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
                                  val & SOF_TIMESTAMPING_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
                break;

        case SO_RCVLOWAT:
                if (val < 0)
                        val = INT_MAX;
                sk->sk_rcvlowat = val ? : 1;
                break;

        case SO_RCVTIMEO:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                break;

        case SO_SNDTIMEO:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                break;

        case SO_ATTACH_FILTER:
                ret = -EINVAL;
                if (optlen == sizeof(struct sock_fprog)) {
                        struct sock_fprog fprog;

                        ret = -EFAULT;
                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                break;

                        ret = sk_attach_filter(&fprog, sk);
                }
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_PASSSEC:
                if (valbool)
                        set_bit(SOCK_PASSSEC, &sock->flags);
                else
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
        case SO_MARK:
                if (!capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else
                        sk->sk_mark = val;
                break;

                /* We implement SO_SNDLOWAT etc. as not settable
                   (1003.1g 5.3) */
        default:
                ret = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);
        return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


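/*
 * Usage sketch (userspace, illustrative): the doubling done for
 * SO_SNDBUF/SO_RCVBUF above is visible to applications, since
 * getsockopt() reports the value actually used. Assuming 65536 does
 * not exceed sysctl_rmem_max:
 *
 *      int val = 65536, out;
 *      socklen_t len = sizeof(out);
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * out is now 131072, i.e. 2 * 65536.
 */
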
int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = !!sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_KEEPALIVE:
                v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = !!sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv              = sizeof(v.ling);
                v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("getsockopt");
                break;

        case SO_TIMESTAMP:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPING:
                v.val = 0;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
                break;

        case SO_RCVTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_SNDTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERCRED:
                if (len > sizeof(sk->sk_peercred))
                        len = sizeof(sk->sk_peercred);
                if (copy_to_user(optval, &sk->sk_peercred, len))
                        return -EFAULT;
                goto lenout;

        case SO_PEERNAME:
        {
                char address[128];

                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                        return -ENOTCONN;
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        default:
                return -ENOPROTOOPT;
        }

        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
        sock_lock_init_class_and_name(sk,
                        af_family_slock_key_strings[sk->sk_family],
                        af_family_slock_keys + sk->sk_family,
                        af_family_key_strings[sk->sk_family],
                        af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
        void *sptr = nsk->sk_security;
#endif
        BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
                     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
        memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
               osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
#ifdef CONFIG_SECURITY_NETWORK
        nsk->sk_security = sptr;
        security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct sock *sk;
        struct kmem_cache *slab;

        slab = prot->slab;
        if (slab != NULL) {
                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
                if (!sk)
                        return sk;
                if (priority & __GFP_ZERO) {
                        /*
                         * caches using SLAB_DESTROY_BY_RCU should leave
                         * sk_node.next unmodified. Special care is taken
                         * when initializing the object to zero.
                         */
                        if (offsetof(struct sock, sk_node.next) != 0)
                                memset(sk, 0, offsetof(struct sock, sk_node.next));
                        memset(&sk->sk_node.pprev, 0,
                               prot->obj_size - offsetof(struct sock,
                                                         sk_node.pprev));
                }
        } else
                sk = kmalloc(prot->obj_size, priority);

        if (sk != NULL) {
                kmemcheck_annotate_bitfield(sk, flags);

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free_sec;
        }

        return sk;

out_free_sec:
        security_sk_free(sk);
out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        struct kmem_cache *slab;
        struct module *owner;

        owner = prot->owner;
        slab = prot->slab;

        security_sk_free(sk);
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

/**
 *      sk_alloc - All socket objects are allocated here
 *      @net: the applicable net namespace
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
        struct sock *sk;

        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
        if (sk) {
                sk->sk_family = family;
                /*
                 * See comment in struct sock definition to understand
                 * why we need sk_prot_creator -acme
                 */
                sk->sk_prot = sk->sk_prot_creator = prot;
                sock_lock_init(sk);
                sock_net_set(sk, get_net(net));
                atomic_set(&sk->sk_wmem_alloc, 1);
        }

        return sk;
}
EXPORT_SYMBOL(sk_alloc);

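/*
 * Usage sketch (illustrative): a protocol family's create() hook
 * typically allocates its sock here and then fills in the generic
 * fields with sock_init_data(). "my_proto" is a placeholder for the
 * protocol's struct proto; real callers include inet_create() and
 * unix_create1().
 *
 *      sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);
 *      if (!sk)
 *              return -ENOBUFS;
 *      sock_init_data(sock, sk);
 */
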
static void __sk_free(struct sock *sk)
{
        struct sk_filter *filter;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = rcu_dereference(sk->sk_filter);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                rcu_assign_pointer(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SOCK_TIMESTAMP);
        sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __func__, atomic_read(&sk->sk_omem_alloc));

        put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
        /*
         * We subtract one from sk_wmem_alloc and can tell if
         * some packets are still in some tx queue.
         * If not zero, sock_wfree() will call __sk_free(sk) later
         */
        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * The last sock_put should drop the reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to the stopping namespace
 * is not an option.
 * Take a reference to the socket to remove it from the hash _alive_ and after
 * that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
        if (sk == NULL || sk->sk_socket == NULL)
                return;

        sock_hold(sk);
        sock_release(sk->sk_socket);
        release_net(sock_net(sk));
        sock_net_set(sk, get_net(&init_net));
        sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                sock_copy(newsk, sk);

                /* SANITY */
                get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                /*
                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
                 */
                atomic_set(&newsk->sk_wmem_alloc, 1);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so invalidate
                         * the destructor and do a plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                /*
                 * Before updating sk_refcnt, we must commit prior changes to memory
                 * (Documentation/RCU/rculist_nulls.txt for details)
                 */
                smp_wmb();
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                sk_set_socket(newsk, NULL);
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        percpu_counter_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
        __sk_dst_set(sk, dst);
        sk->sk_route_caps = dst->dev->features;
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        if (sk_can_gso(sk)) {
                if (dst->header_len) {
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = dst->dev->gso_max_size;
                }
        }
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
        if (totalram_pages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (totalram_pages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

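/*
 * Worked example (illustrative, assuming 4 KiB pages): the first branch
 * fires on machines with at most 4096 * 4 KiB = 16 MiB of RAM, the
 * second on machines with at least 131072 * 4 KiB = 512 MiB. Anything
 * in between keeps the compile-time SK_WMEM_MAX/SK_RMEM_MAX defaults.
 */
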
/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int len = skb->truesize;

        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
                /*
                 * Keep a reference on sk_wmem_alloc, this will be released
                 * after sk_write_space() call
                 */
                atomic_sub(len - 1, &sk->sk_wmem_alloc);
                sk->sk_write_space(sk);
                len = 1;
        }
        /*
         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
         * could not do because of in-flight packets
         */
        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_uncharge(skb->sk, skb->truesize);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate an skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate an skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);

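/*
 * Usage sketch (illustrative): option memory must be charged and
 * released in matching pairs so that sk_omem_alloc stays balanced and
 * __sk_free() sees no "optmem leakage". "struct my_opt" is a
 * placeholder; the IPv6 socket-option code is a real user.
 *
 *      struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *      if (!opt)
 *              return -ENOBUFS;
 *      ...
 *      sock_kfree_s(sk, opt, sizeof(*opt));
 */
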
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


1378/*
1379 *      Generic send/receive buffer handlers
1380 */
1381
1382struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1383                                     unsigned long data_len, int noblock,
1384                                     int *errcode)
1385{
1386        struct sk_buff *skb;
1387        gfp_t gfp_mask;
1388        long timeo;
1389        int err;
1390
1391        gfp_mask = sk->sk_allocation;
1392        if (gfp_mask & __GFP_WAIT)
1393                gfp_mask |= __GFP_REPEAT;
1394
1395        timeo = sock_sndtimeo(sk, noblock);
1396        while (1) {
1397                err = sock_error(sk);
1398                if (err != 0)
1399                        goto failure;
1400
1401                err = -EPIPE;
1402                if (sk->sk_shutdown & SEND_SHUTDOWN)
1403                        goto failure;
1404
1405                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1406                        skb = alloc_skb(header_len, gfp_mask);
1407                        if (skb) {
1408                                int npages;
1409                                int i;
1410
1411                                /* No pages, we're done... */
1412                                if (!data_len)
1413                                        break;
1414
1415                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1416                                skb->truesize += data_len;
1417                                skb_shinfo(skb)->nr_frags = npages;
1418                                for (i = 0; i < npages; i++) {
1419                                        struct page *page;
1420                                        skb_frag_t *frag;
1421
1422                                        page = alloc_pages(sk->sk_allocation, 0);
1423                                        if (!page) {
1424                                                err = -ENOBUFS;
1425                                                skb_shinfo(skb)->nr_frags = i;
1426                                                kfree_skb(skb);
1427                                                goto failure;
1428                                        }
1429
1430                                        frag = &skb_shinfo(skb)->frags[i];
1431                                        frag->page = page;
1432                                        frag->page_offset = 0;
1433                                        frag->size = (data_len >= PAGE_SIZE ?
1434                                                      PAGE_SIZE :
1435                                                      data_len);
1436                                        data_len -= PAGE_SIZE;
1437                                }
1438
1439                                /* Full success... */
1440                                break;
1441                        }
1442                        err = -ENOBUFS;
1443                        goto failure;
1444                }
1445                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1446                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1447                err = -EAGAIN;
1448                if (!timeo)
1449                        goto failure;
1450                if (signal_pending(current))
1451                        goto interrupted;
1452                timeo = sock_wait_for_wmem(sk, timeo);
1453        }
1454
1455        skb_set_owner_w(skb, sk);
1456        return skb;
1457
1458interrupted:
1459        err = sock_intr_errno(timeo);
1460failure:
1461        *errcode = err;
1462        return NULL;
1463}
1464EXPORT_SYMBOL(sock_alloc_send_pskb);
1465
1466struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1467                                    int noblock, int *errcode)
1468{
1469        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1470}
1471EXPORT_SYMBOL(sock_alloc_send_skb);
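
/*
 * Editor's sketch, not part of the original file: a typical datagram
 * send path built on sock_alloc_send_skb().  The function name and the
 * MAX_HEADER headroom are illustrative assumptions, not a real protocol.
 */
static struct sk_buff *example_alloc_dgram(struct sock *sk, size_t len,
                                           int noblock, int *errp)
{
        struct sk_buff *skb;

        /* Ask for payload plus worst-case header room; waits up to
         * sk->sk_sndtimeo for wmem space unless noblock is set. */
        skb = sock_alloc_send_skb(sk, len + MAX_HEADER, noblock, errp);
        if (!skb)
                return NULL;    /* *errp: -EAGAIN, -EPIPE, -ENOBUFS, ... */

        skb_reserve(skb, MAX_HEADER);   /* headers get pushed in front */
        skb_put(skb, len);              /* set aside the payload area */
        return skb;
}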
1472
1473static void __lock_sock(struct sock *sk)
1474{
1475        DEFINE_WAIT(wait);
1476
1477        for (;;) {
1478                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1479                                        TASK_UNINTERRUPTIBLE);
1480                spin_unlock_bh(&sk->sk_lock.slock);
1481                schedule();
1482                spin_lock_bh(&sk->sk_lock.slock);
1483                if (!sock_owned_by_user(sk))
1484                        break;
1485        }
1486        finish_wait(&sk->sk_lock.wq, &wait);
1487}
1488
1489static void __release_sock(struct sock *sk)
1490{
1491        struct sk_buff *skb = sk->sk_backlog.head;
1492
1493        do {
1494                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1495                bh_unlock_sock(sk);
1496
1497                do {
1498                        struct sk_buff *next = skb->next;
1499
1500                        skb->next = NULL;
1501                        sk_backlog_rcv(sk, skb);
1502
1503                        /*
1504                         * We are in process context here with softirqs
1505                         * disabled, use cond_resched_softirq() to preempt.
1506                         * This is safe to do because we've taken the backlog
1507                         * queue private:
1508                         */
1509                        cond_resched_softirq();
1510
1511                        skb = next;
1512                } while (skb != NULL);
1513
1514                bh_lock_sock(sk);
1515        } while ((skb = sk->sk_backlog.head) != NULL);
1516}
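
/*
 * Editor's sketch, not part of the original file: the softirq-side
 * pattern that pairs with __release_sock() above.  A hypothetical
 * protocol's receive handler queues to the backlog whenever a process
 * currently owns the socket.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
        int rc = 0;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk))
                rc = sk_backlog_rcv(sk, skb);   /* process right away */
        else
                sk_add_backlog(sk, skb);        /* __release_sock() runs
                                                 * it for the lock owner */
        bh_unlock_sock(sk);
        return rc;
}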
1517
1518/**
1519 * sk_wait_data - wait for data to arrive at sk_receive_queue
1520 * @sk:    sock to wait on
1521 * @timeo: for how long
1522 *
1523 * Socket state, including sk->sk_err, is now changed only under the lock,
1524 * hence we may omit re-checks after joining the wait queue.
1525 * We check the receive queue before schedule() only as an optimization;
1526 * it is very likely that release_sock() added new data.
1527 */
1528int sk_wait_data(struct sock *sk, long *timeo)
1529{
1530        int rc;
1531        DEFINE_WAIT(wait);
1532
1533        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1534        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1535        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1536        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1537        finish_wait(sk->sk_sleep, &wait);
1538        return rc;
1539}
1540EXPORT_SYMBOL(sk_wait_data);
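
/*
 * Editor's sketch, not part of the original file: how a protocol's
 * recvmsg() might drive sk_wait_data() while holding the socket lock.
 * The helper name is illustrative and error handling is abbreviated.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, long *timeo,
                                            int *err)
{
        struct sk_buff *skb;

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
                if (sk->sk_err) {
                        *err = sock_error(sk);
                        return NULL;
                }
                if (!*timeo) {
                        *err = -EAGAIN;
                        return NULL;
                }
                if (signal_pending(current)) {
                        *err = sock_intr_errno(*timeo);
                        return NULL;
                }
                /* Drops and retakes the socket lock around schedule(). */
                sk_wait_data(sk, timeo);
        }
        return skb;
}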
1541
1542/**
1543 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1544 *      @sk: socket
1545 *      @size: memory size to allocate
1546 *      @kind: allocation type
1547 *
1548 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1549 *      rmem allocation. This function assumes that protocols which have
1550 *      memory_pressure use sk_wmem_queued for write buffer accounting.
1551 */
1552int __sk_mem_schedule(struct sock *sk, int size, int kind)
1553{
1554        struct proto *prot = sk->sk_prot;
1555        int amt = sk_mem_pages(size);
1556        int allocated;
1557
1558        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1559        allocated = atomic_add_return(amt, prot->memory_allocated);
1560
1561        /* Under limit. */
1562        if (allocated <= prot->sysctl_mem[0]) {
1563                if (prot->memory_pressure && *prot->memory_pressure)
1564                        *prot->memory_pressure = 0;
1565                return 1;
1566        }
1567
1568        /* Under pressure. */
1569        if (allocated > prot->sysctl_mem[1])
1570                if (prot->enter_memory_pressure)
1571                        prot->enter_memory_pressure(sk);
1572
1573        /* Over hard limit. */
1574        if (allocated > prot->sysctl_mem[2])
1575                goto suppress_allocation;
1576
1577        /* guarantee minimum buffer size under pressure */
1578        if (kind == SK_MEM_RECV) {
1579                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1580                        return 1;
1581        } else { /* SK_MEM_SEND */
1582                if (sk->sk_type == SOCK_STREAM) {
1583                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1584                                return 1;
1585                } else if (atomic_read(&sk->sk_wmem_alloc) <
1586                           prot->sysctl_wmem[0])
1587                        return 1;
1588        }
1589
1590        if (prot->memory_pressure) {
1591                int alloc;
1592
1593                if (!*prot->memory_pressure)
1594                        return 1;
1595                alloc = percpu_counter_read_positive(prot->sockets_allocated);
1596                if (prot->sysctl_mem[2] > alloc *
1597                    sk_mem_pages(sk->sk_wmem_queued +
1598                                 atomic_read(&sk->sk_rmem_alloc) +
1599                                 sk->sk_forward_alloc))
1600                        return 1;
1601        }
1602
1603suppress_allocation:
1604
1605        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1606                sk_stream_moderate_sndbuf(sk);
1607
1608                /* Fail only if the socket is still _under_ its sndbuf;
1609                 * in that case the caller cannot block, so we have to fail.
1610                 */
1611                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1612                        return 1;
1613        }
1614
1615        /* Alas. Undo changes. */
1616        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1617        atomic_sub(amt, prot->memory_allocated);
1618        return 0;
1619}
1620EXPORT_SYMBOL(__sk_mem_schedule);
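
/*
 * Editor's sketch, not part of the original file: protocols normally
 * reach __sk_mem_schedule() via the sk_rmem_schedule()/sk_wmem_schedule()
 * wrappers.  Charging an incoming skb before queueing it might look
 * like this (compare sock_queue_rcv_skb()):
 */
static int example_charge_and_queue(struct sock *sk, struct sk_buff *skb)
{
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >
            (unsigned)sk->sk_rcvbuf)
                return -ENOMEM;         /* receive buffer is full */

        if (!sk_rmem_schedule(sk, skb->truesize))
                return -ENOBUFS;        /* protocol memory limit hit */

        skb_set_owner_r(skb, sk);       /* charges sk_rmem_alloc */
        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk, skb->len);
        return 0;
}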
1621
1622/**
1623 *      __sk_mem_reclaim - reclaim memory_allocated
1624 *      @sk: socket
1625 */
1626void __sk_mem_reclaim(struct sock *sk)
1627{
1628        struct proto *prot = sk->sk_prot;
1629
1630        atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1631                   prot->memory_allocated);
1632        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1633
1634        if (prot->memory_pressure && *prot->memory_pressure &&
1635            (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1636                *prot->memory_pressure = 0;
1637}
1638EXPORT_SYMBOL(__sk_mem_reclaim);
1639
1640
1641/*
1642 * Set of default routines for initialising struct proto_ops when
1643 * the protocol does not support a particular function. In certain
1644 * cases where it makes no sense for a protocol to have a "do nothing"
1645 * function, some default processing is provided.
1646 */
1647
1648int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1649{
1650        return -EOPNOTSUPP;
1651}
1652EXPORT_SYMBOL(sock_no_bind);
1653
1654int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1655                    int len, int flags)
1656{
1657        return -EOPNOTSUPP;
1658}
1659EXPORT_SYMBOL(sock_no_connect);
1660
1661int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1662{
1663        return -EOPNOTSUPP;
1664}
1665EXPORT_SYMBOL(sock_no_socketpair);
1666
1667int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1668{
1669        return -EOPNOTSUPP;
1670}
1671EXPORT_SYMBOL(sock_no_accept);
1672
1673int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1674                    int *len, int peer)
1675{
1676        return -EOPNOTSUPP;
1677}
1678EXPORT_SYMBOL(sock_no_getname);
1679
1680unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1681{
1682        return 0;
1683}
1684EXPORT_SYMBOL(sock_no_poll);
1685
1686int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1687{
1688        return -EOPNOTSUPP;
1689}
1690EXPORT_SYMBOL(sock_no_ioctl);
1691
1692int sock_no_listen(struct socket *sock, int backlog)
1693{
1694        return -EOPNOTSUPP;
1695}
1696EXPORT_SYMBOL(sock_no_listen);
1697
1698int sock_no_shutdown(struct socket *sock, int how)
1699{
1700        return -EOPNOTSUPP;
1701}
1702EXPORT_SYMBOL(sock_no_shutdown);
1703
1704int sock_no_setsockopt(struct socket *sock, int level, int optname,
1705                    char __user *optval, unsigned int optlen)
1706{
1707        return -EOPNOTSUPP;
1708}
1709EXPORT_SYMBOL(sock_no_setsockopt);
1710
1711int sock_no_getsockopt(struct socket *sock, int level, int optname,
1712                    char __user *optval, int __user *optlen)
1713{
1714        return -EOPNOTSUPP;
1715}
1716EXPORT_SYMBOL(sock_no_getsockopt);
1717
1718int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1719                    size_t len)
1720{
1721        return -EOPNOTSUPP;
1722}
1723EXPORT_SYMBOL(sock_no_sendmsg);
1724
1725int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1726                    size_t len, int flags)
1727{
1728        return -EOPNOTSUPP;
1729}
1730EXPORT_SYMBOL(sock_no_recvmsg);
1731
1732int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1733{
1734        /* Mirror missing mmap method error code */
1735        return -ENODEV;
1736}
1737EXPORT_SYMBOL(sock_no_mmap);
1738
1739ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1740{
1741        ssize_t res;
1742        struct msghdr msg = {.msg_flags = flags};
1743        struct kvec iov;
1744        char *kaddr = kmap(page);
1745        iov.iov_base = kaddr + offset;
1746        iov.iov_len = size;
1747        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1748        kunmap(page);
1749        return res;
1750}
1751EXPORT_SYMBOL(sock_no_sendpage);
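
/*
 * Editor's sketch, not part of the original file: a skeletal proto_ops
 * table for a hypothetical address family, falling back on the
 * sock_no_*() stubs above for everything it does not implement.  Only
 * example_release() is filled in, since there is no sock_no_release().
 */
static int example_release(struct socket *sock)
{
        struct sock *sk = sock->sk;

        if (sk) {
                sock->sk = NULL;
                sock_put(sk);
        }
        return 0;
}

static const struct proto_ops example_ops = {
        .family     = AF_UNSPEC,        /* placeholder family */
        .owner      = THIS_MODULE,
        .release    = example_release,
        .bind       = sock_no_bind,
        .connect    = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept     = sock_no_accept,
        .getname    = sock_no_getname,
        .poll       = sock_no_poll,
        .ioctl      = sock_no_ioctl,
        .listen     = sock_no_listen,
        .shutdown   = sock_no_shutdown,
        .setsockopt = sock_no_setsockopt,
        .getsockopt = sock_no_getsockopt,
        .sendmsg    = sock_no_sendmsg,
        .recvmsg    = sock_no_recvmsg,
        .mmap       = sock_no_mmap,
        .sendpage   = sock_no_sendpage,
};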
1752
1753/*
1754 *      Default Socket Callbacks
1755 */
1756
1757static void sock_def_wakeup(struct sock *sk)
1758{
1759        read_lock(&sk->sk_callback_lock);
1760        if (sk_has_sleeper(sk))
1761                wake_up_interruptible_all(sk->sk_sleep);
1762        read_unlock(&sk->sk_callback_lock);
1763}
1764
1765static void sock_def_error_report(struct sock *sk)
1766{
1767        read_lock(&sk->sk_callback_lock);
1768        if (sk_has_sleeper(sk))
1769                wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1770        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1771        read_unlock(&sk->sk_callback_lock);
1772}
1773
1774static void sock_def_readable(struct sock *sk, int len)
1775{
1776        read_lock(&sk->sk_callback_lock);
1777        if (sk_has_sleeper(sk))
1778                wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1779                                                POLLRDNORM | POLLRDBAND);
1780        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1781        read_unlock(&sk->sk_callback_lock);
1782}
1783
1784static void sock_def_write_space(struct sock *sk)
1785{
1786        read_lock(&sk->sk_callback_lock);
1787
1788        /* Do not wake up a writer until he can make "significant"
1789         * progress.  --DaveM
1790         */
1791        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1792                if (sk_has_sleeper(sk))
1793                        wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1794                                                POLLWRNORM | POLLWRBAND);
1795
1796                /* Should agree with poll, otherwise some programs break */
1797                if (sock_writeable(sk))
1798                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1799        }
1800
1801        read_unlock(&sk->sk_callback_lock);
1802}
1803
1804static void sock_def_destruct(struct sock *sk)
1805{
1806        kfree(sk->sk_protinfo);
1807}
1808
1809void sk_send_sigurg(struct sock *sk)
1810{
1811        if (sk->sk_socket && sk->sk_socket->file)
1812                if (send_sigurg(&sk->sk_socket->file->f_owner))
1813                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1814}
1815EXPORT_SYMBOL(sk_send_sigurg);
1816
1817void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1818                    unsigned long expires)
1819{
1820        if (!mod_timer(timer, expires))
1821                sock_hold(sk);
1822}
1823EXPORT_SYMBOL(sk_reset_timer);
1824
1825void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1826{
1827        if (timer_pending(timer) && del_timer(timer))
1828                __sock_put(sk);
1829}
1830EXPORT_SYMBOL(sk_stop_timer);
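
/*
 * Editor's sketch, not part of the original file: the reference-count
 * contract behind sk_reset_timer()/sk_stop_timer().  While armed, the
 * timer holds a reference on the sock, so a handler that lets the
 * timer die must drop it.  Names here are illustrative.
 */
static void example_timer_fires(unsigned long data)
{
        struct sock *sk = (struct sock *)data;

        bh_lock_sock(sk);
        /* ... protocol-specific timeout handling would go here ... */
        bh_unlock_sock(sk);
        sock_put(sk);           /* drop the ref sk_reset_timer() took */
}

static void example_arm_timer(struct sock *sk)
{
        setup_timer(&sk->sk_timer, example_timer_fires, (unsigned long)sk);
        sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}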
1831
1832void sock_init_data(struct socket *sock, struct sock *sk)
1833{
1834        skb_queue_head_init(&sk->sk_receive_queue);
1835        skb_queue_head_init(&sk->sk_write_queue);
1836        skb_queue_head_init(&sk->sk_error_queue);
1837#ifdef CONFIG_NET_DMA
1838        skb_queue_head_init(&sk->sk_async_wait_queue);
1839#endif
1840
1841        sk->sk_send_head        =       NULL;
1842
1843        init_timer(&sk->sk_timer);
1844
1845        sk->sk_allocation       =       GFP_KERNEL;
1846        sk->sk_rcvbuf           =       sysctl_rmem_default;
1847        sk->sk_sndbuf           =       sysctl_wmem_default;
1848        sk->sk_state            =       TCP_CLOSE;
1849        sk_set_socket(sk, sock);
1850
1851        sock_set_flag(sk, SOCK_ZAPPED);
1852
1853        if (sock) {
1854                sk->sk_type     =       sock->type;
1855                sk->sk_sleep    =       &sock->wait;
1856                sock->sk        =       sk;
1857        } else
1858                sk->sk_sleep    =       NULL;
1859
1860        rwlock_init(&sk->sk_dst_lock);
1861        rwlock_init(&sk->sk_callback_lock);
1862        lockdep_set_class_and_name(&sk->sk_callback_lock,
1863                        af_callback_keys + sk->sk_family,
1864                        af_family_clock_key_strings[sk->sk_family]);
1865
1866        sk->sk_state_change     =       sock_def_wakeup;
1867        sk->sk_data_ready       =       sock_def_readable;
1868        sk->sk_write_space      =       sock_def_write_space;
1869        sk->sk_error_report     =       sock_def_error_report;
1870        sk->sk_destruct         =       sock_def_destruct;
1871
1872        sk->sk_sndmsg_page      =       NULL;
1873        sk->sk_sndmsg_off       =       0;
1874
1875        sk->sk_peercred.pid     =       0;
1876        sk->sk_peercred.uid     =       -1;
1877        sk->sk_peercred.gid     =       -1;
1878        sk->sk_write_pending    =       0;
1879        sk->sk_rcvlowat         =       1;
1880        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1881        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1882
1883        sk->sk_stamp = ktime_set(-1L, 0);
1884
1885        /*
1886         * Before updating sk_refcnt, we must commit prior changes to memory
1887         * (Documentation/RCU/rculist_nulls.txt for details)
1888         */
1889        smp_wmb();
1890        atomic_set(&sk->sk_refcnt, 1);
1891        atomic_set(&sk->sk_drops, 0);
1892}
1893EXPORT_SYMBOL(sock_init_data);
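
/*
 * Editor's sketch, not part of the original file: a hypothetical
 * family's socket-creation path, taking the defaults from
 * sock_init_data() and then tailoring a couple of them.
 */
static void example_sk_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);         /* protocol-private state */
        sk->sk_protinfo = NULL;
}

static void example_init_sock(struct socket *sock, struct sock *sk)
{
        sock_init_data(sock, sk);       /* queues, timer, callbacks, ... */

        /* Override the sock_def_* defaults installed above. */
        sk->sk_destruct   = example_sk_destruct;
        sk->sk_allocation = GFP_ATOMIC; /* e.g. if this protocol allocates
                                         * on the socket's behalf from
                                         * softirq context */
}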
1894
1895void lock_sock_nested(struct sock *sk, int subclass)
1896{
1897        might_sleep();
1898        spin_lock_bh(&sk->sk_lock.slock);
1899        if (sk->sk_lock.owned)
1900                __lock_sock(sk);
1901        sk->sk_lock.owned = 1;
1902        spin_unlock(&sk->sk_lock.slock);
1903        /*
1904         * The sk_lock has mutex_lock() semantics here:
1905         */
1906        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1907        local_bh_enable();
1908}
1909EXPORT_SYMBOL(lock_sock_nested);
1910
1911void release_sock(struct sock *sk)
1912{
1913        /*
1914         * The sk_lock has mutex_unlock() semantics:
1915         */
1916        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1917
1918        spin_lock_bh(&sk->sk_lock.slock);
1919        if (sk->sk_backlog.tail)
1920                __release_sock(sk);
1921        sk->sk_lock.owned = 0;
1922        if (waitqueue_active(&sk->sk_lock.wq))
1923                wake_up(&sk->sk_lock.wq);
1924        spin_unlock_bh(&sk->sk_lock.slock);
1925}
1926EXPORT_SYMBOL(release_sock);
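
/*
 * Editor's sketch, not part of the original file: the canonical
 * process-context pattern served by lock_sock()/release_sock().  Note
 * that release_sock() also runs whatever the softirq side queued on
 * the backlog while we owned the socket.
 */
static void example_locked_update(struct sock *sk, int val)
{
        lock_sock(sk);          /* may sleep; marks sk owned by user */
        sk->sk_rcvlowat = val ? : 1;
        release_sock(sk);       /* drains backlog, wakes lock waiters */
}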
1927
1928int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1929{
1930        struct timeval tv;
1931        if (!sock_flag(sk, SOCK_TIMESTAMP))
1932                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1933        tv = ktime_to_timeval(sk->sk_stamp);
1934        if (tv.tv_sec == -1)
1935                return -ENOENT;
1936        if (tv.tv_sec == 0) {
1937                sk->sk_stamp = ktime_get_real();
1938                tv = ktime_to_timeval(sk->sk_stamp);
1939        }
1940        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1941}
1942EXPORT_SYMBOL(sock_get_timestamp);
1943
1944int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1945{
1946        struct timespec ts;
1947        if (!sock_flag(sk, SOCK_TIMESTAMP))
1948                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1949        ts = ktime_to_timespec(sk->sk_stamp);
1950        if (ts.tv_sec == -1)
1951                return -ENOENT;
1952        if (ts.tv_sec == 0) {
1953                sk->sk_stamp = ktime_get_real();
1954                ts = ktime_to_timespec(sk->sk_stamp);
1955        }
1956        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1957}
1958EXPORT_SYMBOL(sock_get_timestampns);
1959
1960void sock_enable_timestamp(struct sock *sk, int flag)
1961{
1962        if (!sock_flag(sk, flag)) {
1963                sock_set_flag(sk, flag);
1964                /*
1965                 * We just set one of the two flags that require net
1966                 * time stamping, but time stamping might already have
1967                 * been on because of the other one.
1968                 */
1969                if (!sock_flag(sk,
1970                                flag == SOCK_TIMESTAMP ?
1971                                SOCK_TIMESTAMPING_RX_SOFTWARE :
1972                                SOCK_TIMESTAMP))
1973                        net_enable_timestamp();
1974        }
1975}
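
/*
 * Editor's note, not part of the original file: the machinery above is
 * usually reached from user space via SO_TIMESTAMP (or the SIOCGSTAMP
 * ioctl served by sock_get_timestamp()).  A user-space fragment, for
 * context only:
 *
 *      int on = 1;
 *
 *      setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
 *      // sock_setsockopt() sets SOCK_RCVTSTAMP and calls
 *      // sock_enable_timestamp(sk, SOCK_TIMESTAMP); received datagrams
 *      // then carry an SCM_TIMESTAMP control message.
 */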
1976
1977/*
1978 *      Get a socket option on a socket.
1979 *
1980 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1981 *      asynchronous errors should be reported by getsockopt. We assume
1982 *      this means if you specify SO_ERROR (otherwise what's the point of it).
1983 */
1984int sock_common_getsockopt(struct socket *sock, int level, int optname,
1985                           char __user *optval, int __user *optlen)
1986{
1987        struct sock *sk = sock->sk;
1988
1989        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1990}
1991EXPORT_SYMBOL(sock_common_getsockopt);
1992
1993#ifdef CONFIG_COMPAT
1994int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1995                                  char __user *optval, int __user *optlen)
1996{
1997        struct sock *sk = sock->sk;
1998
1999        if (sk->sk_prot->compat_getsockopt != NULL)
2000                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2001                                                      optval, optlen);
2002        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2003}
2004EXPORT_SYMBOL(compat_sock_common_getsockopt);
2005#endif
2006
2007int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2008                        struct msghdr *msg, size_t size, int flags)
2009{
2010        struct sock *sk = sock->sk;
2011        int addr_len = 0;
2012        int err;
2013
2014        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2015                                   flags & ~MSG_DONTWAIT, &addr_len);
2016        if (err >= 0)
2017                msg->msg_namelen = addr_len;
2018        return err;
2019}
2020EXPORT_SYMBOL(sock_common_recvmsg);
2021
2022/*
2023 *      Set socket options on an inet socket.
2024 */
2025int sock_common_setsockopt(struct socket *sock, int level, int optname,
2026                           char __user *optval, unsigned int optlen)
2027{
2028        struct sock *sk = sock->sk;
2029
2030        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2031}
2032EXPORT_SYMBOL(sock_common_setsockopt);
2033
2034#ifdef CONFIG_COMPAT
2035int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2036                                  char __user *optval, unsigned int optlen)
2037{
2038        struct sock *sk = sock->sk;
2039
2040        if (sk->sk_prot->compat_setsockopt != NULL)
2041                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2042                                                      optval, optlen);
2043        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2044}
2045EXPORT_SYMBOL(compat_sock_common_setsockopt);
2046#endif
2047
2048void sk_common_release(struct sock *sk)
2049{
2050        if (sk->sk_prot->destroy)
2051                sk->sk_prot->destroy(sk);
2052
2053        /*
2054         * Observation: when sk_common_release() is called, processes have
2055         * no access to the socket any more, but the network still does.
2056         * Step one, detach it from networking:
2057         *
2058         * A. Remove from hash tables.
2059         */
2060
2061        sk->sk_prot->unhash(sk);
2062
2063        /*
2064         * At this point the socket cannot receive new packets, but some
2065         * packets may still be in flight, because another CPU's receiver
2066         * did its hash table lookup before we unhashed the socket. They
2067         * will reach the receive queue and be purged by the socket destructor.
2068         *
2069         * We also still have packets pending on the receive queue and,
2070         * probably, our own packets waiting in device queues. sock_destroy
2071         * will drain the receive queue, but transmitted packets will delay
2072         * socket destruction until the last reference is released.
2073         */
2074
2075        sock_orphan(sk);
2076
2077        xfrm_sk_free_policy(sk);
2078
2079        sk_refcnt_debug_release(sk);
2080        sock_put(sk);
2081}
2082EXPORT_SYMBOL(sk_common_release);
2083
2084static DEFINE_RWLOCK(proto_list_lock);
2085static LIST_HEAD(proto_list);
2086
2087#ifdef CONFIG_PROC_FS
2088#define PROTO_INUSE_NR  64      /* should be enough to start with */
2089struct prot_inuse {
2090        int val[PROTO_INUSE_NR];
2091};
2092
2093static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2094
2095#ifdef CONFIG_NET_NS
2096void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2097{
2098        int cpu = smp_processor_id();
2099        per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2100}
2101EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2102
2103int sock_prot_inuse_get(struct net *net, struct proto *prot)
2104{
2105        int cpu, idx = prot->inuse_idx;
2106        int res = 0;
2107
2108        for_each_possible_cpu(cpu)
2109                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2110
2111        return res >= 0 ? res : 0;
2112}
2113EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2114
2115static int sock_inuse_init_net(struct net *net)
2116{
2117        net->core.inuse = alloc_percpu(struct prot_inuse);
2118        return net->core.inuse ? 0 : -ENOMEM;
2119}
2120
2121static void sock_inuse_exit_net(struct net *net)
2122{
2123        free_percpu(net->core.inuse);
2124}
2125
2126static struct pernet_operations net_inuse_ops = {
2127        .init = sock_inuse_init_net,
2128        .exit = sock_inuse_exit_net,
2129};
2130
2131static __init int net_inuse_init(void)
2132{
2133        if (register_pernet_subsys(&net_inuse_ops))
2134                panic("Cannot initialize net inuse counters");
2135
2136        return 0;
2137}
2138
2139core_initcall(net_inuse_init);
2140#else
2141static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2142
2143void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2144{
2145        __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2146}
2147EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2148
2149int sock_prot_inuse_get(struct net *net, struct proto *prot)
2150{
2151        int cpu, idx = prot->inuse_idx;
2152        int res = 0;
2153
2154        for_each_possible_cpu(cpu)
2155                res += per_cpu(prot_inuse, cpu).val[idx];
2156
2157        return res >= 0 ? res : 0;
2158}
2159EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2160#endif
2161
2162static void assign_proto_idx(struct proto *prot)
2163{
2164        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2165
2166        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2167                printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2168                return;
2169        }
2170
2171        set_bit(prot->inuse_idx, proto_inuse_idx);
2172}
2173
2174static void release_proto_idx(struct proto *prot)
2175{
2176        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2177                clear_bit(prot->inuse_idx, proto_inuse_idx);
2178}
2179#else
2180static inline void assign_proto_idx(struct proto *prot)
2181{
2182}
2183
2184static inline void release_proto_idx(struct proto *prot)
2185{
2186}
2187#endif
2188
2189int proto_register(struct proto *prot, int alloc_slab)
2190{
2191        if (alloc_slab) {
2192                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2193                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2194                                        NULL);
2195
2196                if (prot->slab == NULL) {
2197                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2198                               prot->name);
2199                        goto out;
2200                }
2201
2202                if (prot->rsk_prot != NULL) {
2203                        static const char mask[] = "request_sock_%s";
2204
2205                        prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2206                        if (prot->rsk_prot->slab_name == NULL)
2207                                goto out_free_sock_slab;
2208
2209                        sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2210                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2211                                                                 prot->rsk_prot->obj_size, 0,
2212                                                                 SLAB_HWCACHE_ALIGN, NULL);
2213
2214                        if (prot->rsk_prot->slab == NULL) {
2215                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2216                                       prot->name);
2217                                goto out_free_request_sock_slab_name;
2218                        }
2219                }
2220
2221                if (prot->twsk_prot != NULL) {
2222                        static const char mask[] = "tw_sock_%s";
2223
2224                        prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2225
2226                        if (prot->twsk_prot->twsk_slab_name == NULL)
2227                                goto out_free_request_sock_slab;
2228
2229                        sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2230                        prot->twsk_prot->twsk_slab =
2231                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2232                                                  prot->twsk_prot->twsk_obj_size,
2233                                                  0,
2234                                                  SLAB_HWCACHE_ALIGN |
2235                                                        prot->slab_flags,
2236                                                  NULL);
2237                        if (prot->twsk_prot->twsk_slab == NULL)
2238                                goto out_free_timewait_sock_slab_name;
2239                }
2240        }
2241
2242        write_lock(&proto_list_lock);
2243        list_add(&prot->node, &proto_list);
2244        assign_proto_idx(prot);
2245        write_unlock(&proto_list_lock);
2246        return 0;
2247
2248out_free_timewait_sock_slab_name:
2249        kfree(prot->twsk_prot->twsk_slab_name);
2250out_free_request_sock_slab:
2251        if (prot->rsk_prot && prot->rsk_prot->slab) {
2252                kmem_cache_destroy(prot->rsk_prot->slab);
2253                prot->rsk_prot->slab = NULL;
2254        }
2255out_free_request_sock_slab_name:
2256        kfree(prot->rsk_prot->slab_name);
2257out_free_sock_slab:
2258        kmem_cache_destroy(prot->slab);
2259        prot->slab = NULL;
2260out:
2261        return -ENOBUFS;
2262}
2263EXPORT_SYMBOL(proto_register);
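
/*
 * Editor's sketch, not part of the original file: the minimal module
 * boilerplate around proto_register() above.  The name and object size
 * are illustrative; a real protocol embeds struct sock at the head of
 * a larger private structure and passes that size.
 */
static struct proto example_proto = {
        .name     = "EXAMPLE",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct sock),
};

static int __init example_proto_init(void)
{
        /* alloc_slab == 1: carve a dedicated kmem cache for these socks */
        return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
        proto_unregister(&example_proto);
}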
2264
2265void proto_unregister(struct proto *prot)
2266{
2267        write_lock(&proto_list_lock);
2268        release_proto_idx(prot);
2269        list_del(&prot->node);
2270        write_unlock(&proto_list_lock);
2271
2272        if (prot->slab != NULL) {
2273                kmem_cache_destroy(prot->slab);
2274                prot->slab = NULL;
2275        }
2276
2277        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2278                kmem_cache_destroy(prot->rsk_prot->slab);
2279                kfree(prot->rsk_prot->slab_name);
2280                prot->rsk_prot->slab = NULL;
2281        }
2282
2283        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2284                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2285                kfree(prot->twsk_prot->twsk_slab_name);
2286                prot->twsk_prot->twsk_slab = NULL;
2287        }
2288}
2289EXPORT_SYMBOL(proto_unregister);
2290
2291#ifdef CONFIG_PROC_FS
2292static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2293        __acquires(proto_list_lock)
2294{
2295        read_lock(&proto_list_lock);
2296        return seq_list_start_head(&proto_list, *pos);
2297}
2298
2299static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2300{
2301        return seq_list_next(v, &proto_list, pos);
2302}
2303
2304static void proto_seq_stop(struct seq_file *seq, void *v)
2305        __releases(proto_list_lock)
2306{
2307        read_unlock(&proto_list_lock);
2308}
2309
2310static char proto_method_implemented(const void *method)
2311{
2312        return method == NULL ? 'n' : 'y';
2313}
2314
2315static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2316{
2317        seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2318                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2319                   proto->name,
2320                   proto->obj_size,
2321                   sock_prot_inuse_get(seq_file_net(seq), proto),
2322                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2323                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2324                   proto->max_header,
2325                   proto->slab == NULL ? "no" : "yes",
2326                   module_name(proto->owner),
2327                   proto_method_implemented(proto->close),
2328                   proto_method_implemented(proto->connect),
2329                   proto_method_implemented(proto->disconnect),
2330                   proto_method_implemented(proto->accept),
2331                   proto_method_implemented(proto->ioctl),
2332                   proto_method_implemented(proto->init),
2333                   proto_method_implemented(proto->destroy),
2334                   proto_method_implemented(proto->shutdown),
2335                   proto_method_implemented(proto->setsockopt),
2336                   proto_method_implemented(proto->getsockopt),
2337                   proto_method_implemented(proto->sendmsg),
2338                   proto_method_implemented(proto->recvmsg),
2339                   proto_method_implemented(proto->sendpage),
2340                   proto_method_implemented(proto->bind),
2341                   proto_method_implemented(proto->backlog_rcv),
2342                   proto_method_implemented(proto->hash),
2343                   proto_method_implemented(proto->unhash),
2344                   proto_method_implemented(proto->get_port),
2345                   proto_method_implemented(proto->enter_memory_pressure));
2346}
2347
2348static int proto_seq_show(struct seq_file *seq, void *v)
2349{
2350        if (v == &proto_list)
2351                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2352                           "protocol",
2353                           "size",
2354                           "sockets",
2355                           "memory",
2356                           "press",
2357                           "maxhdr",
2358                           "slab",
2359                           "module",
2360                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2361        else
2362                proto_seq_printf(seq, list_entry(v, struct proto, node));
2363        return 0;
2364}
2365
2366static const struct seq_operations proto_seq_ops = {
2367        .start  = proto_seq_start,
2368        .next   = proto_seq_next,
2369        .stop   = proto_seq_stop,
2370        .show   = proto_seq_show,
2371};
2372
2373static int proto_seq_open(struct inode *inode, struct file *file)
2374{
2375        return seq_open_net(inode, file, &proto_seq_ops,
2376                            sizeof(struct seq_net_private));
2377}
2378
2379static const struct file_operations proto_seq_fops = {
2380        .owner          = THIS_MODULE,
2381        .open           = proto_seq_open,
2382        .read           = seq_read,
2383        .llseek         = seq_lseek,
2384        .release        = seq_release_net,
2385};
2386
2387static __net_init int proto_init_net(struct net *net)
2388{
2389        if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2390                return -ENOMEM;
2391
2392        return 0;
2393}
2394
2395static __net_exit void proto_exit_net(struct net *net)
2396{
2397        proc_net_remove(net, "protocols");
2398}
2399
2400
2401static __net_initdata struct pernet_operations proto_net_ops = {
2402        .init = proto_init_net,
2403        .exit = proto_exit_net,
2404};
2405
2406static int __init proto_init(void)
2407{
2408        return register_pernet_subsys(&proto_net_ops);
2409}
2410
2411subsys_initcall(proto_init);
2412
2413#endif /* CONFIG_PROC_FS */
2414