linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#include <linux/capability.h>
  93#include <linux/errno.h>
  94#include <linux/types.h>
  95#include <linux/socket.h>
  96#include <linux/in.h>
  97#include <linux/kernel.h>
  98#include <linux/module.h>
  99#include <linux/proc_fs.h>
 100#include <linux/seq_file.h>
 101#include <linux/sched.h>
 102#include <linux/timer.h>
 103#include <linux/string.h>
 104#include <linux/sockios.h>
 105#include <linux/net.h>
 106#include <linux/mm.h>
 107#include <linux/slab.h>
 108#include <linux/interrupt.h>
 109#include <linux/poll.h>
 110#include <linux/tcp.h>
 111#include <linux/init.h>
 112#include <linux/highmem.h>
 113#include <linux/user_namespace.h>
 114
 115#include <asm/uaccess.h>
 116#include <asm/system.h>
 117
 118#include <linux/netdevice.h>
 119#include <net/protocol.h>
 120#include <linux/skbuff.h>
 121#include <net/net_namespace.h>
 122#include <net/request_sock.h>
 123#include <net/sock.h>
 124#include <linux/net_tstamp.h>
 125#include <net/xfrm.h>
 126#include <linux/ipsec.h>
 127#include <net/cls_cgroup.h>
 128
 129#include <linux/filter.h>
 130
 131#ifdef CONFIG_INET
 132#include <net/tcp.h>
 133#endif
 134
 135/*
 136 * Each address family might have different locking rules, so we have
 137 * one slock key per address family:
 138 */
 139static struct lock_class_key af_family_keys[AF_MAX];
 140static struct lock_class_key af_family_slock_keys[AF_MAX];
 141
 142/*
 143 * Make lock validator output more readable. (we pre-construct these
 144 * strings build-time, so that runtime initialization of socket
 145 * locks is fast):
 146 */
 147static const char *const af_family_key_strings[AF_MAX+1] = {
 148  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 149  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 150  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 151  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 152  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 153  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 154  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 155  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 156  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 157  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  158  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
 159  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 160  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 161  "sk_lock-AF_MAX"
 162};
 163static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 164  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 165  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 166  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 167  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 168  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 169  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 170  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 171  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 172  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 173  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 174  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 175  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 176  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 177  "slock-AF_MAX"
 178};
 179static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 180  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 181  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 182  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 183  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 184  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 185  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 186  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 187  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 188  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 189  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 190  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 191  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 192  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 193  "clock-AF_MAX"
 194};
 195
 196/*
 197 * sk_callback_lock locking rules are per-address-family,
 198 * so split the lock classes by using a per-AF key:
 199 */
 200static struct lock_class_key af_callback_keys[AF_MAX];
 201
 202/* Take into consideration the size of the struct sk_buff overhead in the
 203 * determination of these values, since that is non-constant across
 204 * platforms.  This makes socket queueing behavior and performance
 205 * not depend upon such differences.
 206 */
 207#define _SK_MEM_PACKETS         256
 208#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
 209#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 210#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
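/*
 * Illustrative arithmetic (a sketch, not part of the build): assuming a
 * hypothetical sizeof(struct sk_buff) of 232 bytes on some platform,
 * the defaults above work out to
 *
 *      _SK_MEM_OVERHEAD = 232 + 256 = 488 bytes per packet
 *      SK_WMEM_MAX      = 488 * 256 = 124928 bytes (~122 KiB)
 *
 * i.e. enough room to queue roughly 256 small packets no matter how
 * large struct sk_buff happens to be on the platform.
 */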
 211
 212/* Run time adjustable parameters. */
 213__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 214__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 215__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 216__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 217
  218/* Maximal space eaten by iovec or ancillary data plus some space */
 219int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 220EXPORT_SYMBOL(sysctl_optmem_max);
 221
 222#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
 223int net_cls_subsys_id = -1;
 224EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 225#endif
 226
 227static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 228{
 229        struct timeval tv;
 230
 231        if (optlen < sizeof(tv))
 232                return -EINVAL;
 233        if (copy_from_user(&tv, optval, sizeof(tv)))
 234                return -EFAULT;
 235        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 236                return -EDOM;
 237
 238        if (tv.tv_sec < 0) {
 239                static int warned __read_mostly;
 240
 241                *timeo_p = 0;
 242                if (warned < 10 && net_ratelimit()) {
 243                        warned++;
 244                        printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 245                               "tries to set negative timeout\n",
 246                                current->comm, task_pid_nr(current));
 247                }
 248                return 0;
 249        }
 250        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 251        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 252                return 0;
 253        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 254                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 255        return 0;
 256}
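/*
 * Illustrative userspace sketch (not part of this file) of the option
 * handled above: the timeout arrives as a struct timeval and is
 * converted to jiffies, rounding the microsecond part up to a tick.
 *
 *      struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *      if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *              perror("SO_RCVTIMEO");
 *
 * With HZ == 100 this stores 2 * 100 + 50 = 250 jiffies in
 * sk->sk_rcvtimeo; a negative tv_sec is clamped to 0, and an all-zero
 * timeval selects MAX_SCHEDULE_TIMEOUT ("wait forever").
 */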
 257
 258static void sock_warn_obsolete_bsdism(const char *name)
 259{
 260        static int warned;
 261        static char warncomm[TASK_COMM_LEN];
 262        if (strcmp(warncomm, current->comm) && warned < 5) {
 263                strcpy(warncomm,  current->comm);
 264                printk(KERN_WARNING "process `%s' is using obsolete "
 265                       "%s SO_BSDCOMPAT\n", warncomm, name);
 266                warned++;
 267        }
 268}
 269
 270static void sock_disable_timestamp(struct sock *sk, int flag)
 271{
 272        if (sock_flag(sk, flag)) {
 273                sock_reset_flag(sk, flag);
 274                if (!sock_flag(sk, SOCK_TIMESTAMP) &&
 275                    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
 276                        net_disable_timestamp();
 277                }
 278        }
 279}
 280
 281
 282int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 283{
 284        int err;
 285        int skb_len;
 286        unsigned long flags;
 287        struct sk_buff_head *list = &sk->sk_receive_queue;
 288
 289        /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 290           number of warnings when compiling with -W --ANK
 291         */
 292        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 293            (unsigned)sk->sk_rcvbuf) {
 294                atomic_inc(&sk->sk_drops);
 295                return -ENOMEM;
 296        }
 297
 298        err = sk_filter(sk, skb);
 299        if (err)
 300                return err;
 301
 302        if (!sk_rmem_schedule(sk, skb->truesize)) {
 303                atomic_inc(&sk->sk_drops);
 304                return -ENOBUFS;
 305        }
 306
 307        skb->dev = NULL;
 308        skb_set_owner_r(skb, sk);
 309
 310        /* Cache the SKB length before we tack it onto the receive
 311         * queue.  Once it is added it no longer belongs to us and
 312         * may be freed by other threads of control pulling packets
 313         * from the queue.
 314         */
 315        skb_len = skb->len;
 316
 317        /* we escape from rcu protected region, make sure we dont leak
 318         * a norefcounted dst
 319         */
 320        skb_dst_force(skb);
 321
 322        spin_lock_irqsave(&list->lock, flags);
 323        skb->dropcount = atomic_read(&sk->sk_drops);
 324        __skb_queue_tail(list, skb);
 325        spin_unlock_irqrestore(&list->lock, flags);
 326
 327        if (!sock_flag(sk, SOCK_DEAD))
 328                sk->sk_data_ready(sk, skb_len);
 329        return 0;
 330}
 331EXPORT_SYMBOL(sock_queue_rcv_skb);
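/*
 * Illustrative sketch (hypothetical protocol, not taken from this tree)
 * of how a receive handler typically hands a packet to the generic
 * queueing helper above; on failure the skb still belongs to the caller:
 *
 *      static int myproto_rcv_one(struct sock *sk, struct sk_buff *skb)
 *      {
 *              int rc = sock_queue_rcv_skb(sk, skb);
 *
 *              if (rc < 0) {           // rcvbuf full, filtered, or no mem
 *                      kfree_skb(skb);
 *                      return rc;
 *              }
 *              return 0;               // now owned by sk_receive_queue
 *      }
 */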
 332
 333int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 334{
 335        int rc = NET_RX_SUCCESS;
 336
 337        if (sk_filter(sk, skb))
 338                goto discard_and_relse;
 339
 340        skb->dev = NULL;
 341
 342        if (sk_rcvqueues_full(sk, skb)) {
 343                atomic_inc(&sk->sk_drops);
 344                goto discard_and_relse;
 345        }
 346        if (nested)
 347                bh_lock_sock_nested(sk);
 348        else
 349                bh_lock_sock(sk);
 350        if (!sock_owned_by_user(sk)) {
 351                /*
 352                 * trylock + unlock semantics:
 353                 */
 354                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 355
 356                rc = sk_backlog_rcv(sk, skb);
 357
 358                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 359        } else if (sk_add_backlog(sk, skb)) {
 360                bh_unlock_sock(sk);
 361                atomic_inc(&sk->sk_drops);
 362                goto discard_and_relse;
 363        }
 364
 365        bh_unlock_sock(sk);
 366out:
 367        sock_put(sk);
 368        return rc;
 369discard_and_relse:
 370        kfree_skb(skb);
 371        goto out;
 372}
 373EXPORT_SYMBOL(sk_receive_skb);
 374
 375void sk_reset_txq(struct sock *sk)
 376{
 377        sk_tx_queue_clear(sk);
 378}
 379EXPORT_SYMBOL(sk_reset_txq);
 380
 381struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 382{
 383        struct dst_entry *dst = __sk_dst_get(sk);
 384
 385        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 386                sk_tx_queue_clear(sk);
 387                rcu_assign_pointer(sk->sk_dst_cache, NULL);
 388                dst_release(dst);
 389                return NULL;
 390        }
 391
 392        return dst;
 393}
 394EXPORT_SYMBOL(__sk_dst_check);
 395
 396struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 397{
 398        struct dst_entry *dst = sk_dst_get(sk);
 399
 400        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 401                sk_dst_reset(sk);
 402                dst_release(dst);
 403                return NULL;
 404        }
 405
 406        return dst;
 407}
 408EXPORT_SYMBOL(sk_dst_check);
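/*
 * Illustrative sketch (hypothetical, not from any protocol here) of the
 * usual pattern in an output path: revalidate the cached route and fall
 * back to a fresh lookup when it has become obsolete.
 *
 *      struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *      if (!dst) {
 *              // the cached dst was obsolete and has been released;
 *              // do a new route lookup and install it with sk_dst_set()
 *      }
 */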
 409
 410static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 411{
 412        int ret = -ENOPROTOOPT;
 413#ifdef CONFIG_NETDEVICES
 414        struct net *net = sock_net(sk);
 415        char devname[IFNAMSIZ];
 416        int index;
 417
 418        /* Sorry... */
 419        ret = -EPERM;
 420        if (!capable(CAP_NET_RAW))
 421                goto out;
 422
 423        ret = -EINVAL;
 424        if (optlen < 0)
 425                goto out;
 426
 427        /* Bind this socket to a particular device like "eth0",
 428         * as specified in the passed interface name. If the
 429         * name is "" or the option length is zero the socket
 430         * is not bound.
 431         */
 432        if (optlen > IFNAMSIZ - 1)
 433                optlen = IFNAMSIZ - 1;
 434        memset(devname, 0, sizeof(devname));
 435
 436        ret = -EFAULT;
 437        if (copy_from_user(devname, optval, optlen))
 438                goto out;
 439
 440        index = 0;
 441        if (devname[0] != '\0') {
 442                struct net_device *dev;
 443
 444                rcu_read_lock();
 445                dev = dev_get_by_name_rcu(net, devname);
 446                if (dev)
 447                        index = dev->ifindex;
 448                rcu_read_unlock();
 449                ret = -ENODEV;
 450                if (!dev)
 451                        goto out;
 452        }
 453
 454        lock_sock(sk);
 455        sk->sk_bound_dev_if = index;
 456        sk_dst_reset(sk);
 457        release_sock(sk);
 458
 459        ret = 0;
 460
 461out:
 462#endif
 463
 464        return ret;
 465}
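/*
 * Illustrative userspace sketch of the option handled above, assuming a
 * device named "eth0" exists. Binding requires CAP_NET_RAW; passing an
 * empty name (or zero length) removes the binding again.
 *
 *      if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *                     "eth0", strlen("eth0")) < 0)
 *              perror("SO_BINDTODEVICE");
 */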
 466
 467static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 468{
 469        if (valbool)
 470                sock_set_flag(sk, bit);
 471        else
 472                sock_reset_flag(sk, bit);
 473}
 474
 475/*
 476 *      This is meant for all protocols to use and covers goings on
 477 *      at the socket level. Everything here is generic.
 478 */
 479
 480int sock_setsockopt(struct socket *sock, int level, int optname,
 481                    char __user *optval, unsigned int optlen)
 482{
 483        struct sock *sk = sock->sk;
 484        int val;
 485        int valbool;
 486        struct linger ling;
 487        int ret = 0;
 488
 489        /*
 490         *      Options without arguments
 491         */
 492
 493        if (optname == SO_BINDTODEVICE)
 494                return sock_bindtodevice(sk, optval, optlen);
 495
 496        if (optlen < sizeof(int))
 497                return -EINVAL;
 498
 499        if (get_user(val, (int __user *)optval))
 500                return -EFAULT;
 501
 502        valbool = val ? 1 : 0;
 503
 504        lock_sock(sk);
 505
 506        switch (optname) {
 507        case SO_DEBUG:
 508                if (val && !capable(CAP_NET_ADMIN))
 509                        ret = -EACCES;
 510                else
 511                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 512                break;
 513        case SO_REUSEADDR:
 514                sk->sk_reuse = valbool;
 515                break;
 516        case SO_TYPE:
 517        case SO_PROTOCOL:
 518        case SO_DOMAIN:
 519        case SO_ERROR:
 520                ret = -ENOPROTOOPT;
 521                break;
 522        case SO_DONTROUTE:
 523                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 524                break;
 525        case SO_BROADCAST:
 526                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 527                break;
 528        case SO_SNDBUF:
  529                /* Don't error on this; BSD doesn't, and if you think
 530                   about it this is right. Otherwise apps have to
 531                   play 'guess the biggest size' games. RCVBUF/SNDBUF
 532                   are treated in BSD as hints */
 533
 534                if (val > sysctl_wmem_max)
 535                        val = sysctl_wmem_max;
 536set_sndbuf:
 537                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 538                if ((val * 2) < SOCK_MIN_SNDBUF)
 539                        sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 540                else
 541                        sk->sk_sndbuf = val * 2;
 542
 543                /*
 544                 *      Wake up sending tasks if we
 545                 *      upped the value.
 546                 */
 547                sk->sk_write_space(sk);
 548                break;
 549
 550        case SO_SNDBUFFORCE:
 551                if (!capable(CAP_NET_ADMIN)) {
 552                        ret = -EPERM;
 553                        break;
 554                }
 555                goto set_sndbuf;
 556
 557        case SO_RCVBUF:
  558                /* Don't error on this; BSD doesn't, and if you think
 559                   about it this is right. Otherwise apps have to
 560                   play 'guess the biggest size' games. RCVBUF/SNDBUF
 561                   are treated in BSD as hints */
 562
 563                if (val > sysctl_rmem_max)
 564                        val = sysctl_rmem_max;
 565set_rcvbuf:
 566                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 567                /*
 568                 * We double it on the way in to account for
 569                 * "struct sk_buff" etc. overhead.   Applications
 570                 * assume that the SO_RCVBUF setting they make will
 571                 * allow that much actual data to be received on that
 572                 * socket.
 573                 *
 574                 * Applications are unaware that "struct sk_buff" and
 575                 * other overheads allocate from the receive buffer
 576                 * during socket buffer allocation.
 577                 *
 578                 * And after considering the possible alternatives,
 579                 * returning the value we actually used in getsockopt
 580                 * is the most desirable behavior.
 581                 */
 582                if ((val * 2) < SOCK_MIN_RCVBUF)
 583                        sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 584                else
 585                        sk->sk_rcvbuf = val * 2;
 586                break;
 587
 588        case SO_RCVBUFFORCE:
 589                if (!capable(CAP_NET_ADMIN)) {
 590                        ret = -EPERM;
 591                        break;
 592                }
 593                goto set_rcvbuf;
 594
 595        case SO_KEEPALIVE:
 596#ifdef CONFIG_INET
 597                if (sk->sk_protocol == IPPROTO_TCP)
 598                        tcp_set_keepalive(sk, valbool);
 599#endif
 600                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 601                break;
 602
 603        case SO_OOBINLINE:
 604                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 605                break;
 606
 607        case SO_NO_CHECK:
 608                sk->sk_no_check = valbool;
 609                break;
 610
 611        case SO_PRIORITY:
 612                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 613                        sk->sk_priority = val;
 614                else
 615                        ret = -EPERM;
 616                break;
 617
 618        case SO_LINGER:
 619                if (optlen < sizeof(ling)) {
 620                        ret = -EINVAL;  /* 1003.1g */
 621                        break;
 622                }
 623                if (copy_from_user(&ling, optval, sizeof(ling))) {
 624                        ret = -EFAULT;
 625                        break;
 626                }
 627                if (!ling.l_onoff)
 628                        sock_reset_flag(sk, SOCK_LINGER);
 629                else {
 630#if (BITS_PER_LONG == 32)
 631                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 632                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 633                        else
 634#endif
 635                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 636                        sock_set_flag(sk, SOCK_LINGER);
 637                }
 638                break;
 639
 640        case SO_BSDCOMPAT:
 641                sock_warn_obsolete_bsdism("setsockopt");
 642                break;
 643
 644        case SO_PASSCRED:
 645                if (valbool)
 646                        set_bit(SOCK_PASSCRED, &sock->flags);
 647                else
 648                        clear_bit(SOCK_PASSCRED, &sock->flags);
 649                break;
 650
 651        case SO_TIMESTAMP:
 652        case SO_TIMESTAMPNS:
 653                if (valbool)  {
 654                        if (optname == SO_TIMESTAMP)
 655                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 656                        else
 657                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 658                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 659                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 660                } else {
 661                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 662                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 663                }
 664                break;
 665
 666        case SO_TIMESTAMPING:
 667                if (val & ~SOF_TIMESTAMPING_MASK) {
 668                        ret = -EINVAL;
 669                        break;
 670                }
 671                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 672                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
 673                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 674                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
 675                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 676                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
 677                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 678                        sock_enable_timestamp(sk,
 679                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 680                else
 681                        sock_disable_timestamp(sk,
 682                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 683                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 684                                  val & SOF_TIMESTAMPING_SOFTWARE);
 685                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 686                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
 687                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 688                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
 689                break;
 690
 691        case SO_RCVLOWAT:
 692                if (val < 0)
 693                        val = INT_MAX;
 694                sk->sk_rcvlowat = val ? : 1;
 695                break;
 696
 697        case SO_RCVTIMEO:
 698                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 699                break;
 700
 701        case SO_SNDTIMEO:
 702                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 703                break;
 704
 705        case SO_ATTACH_FILTER:
 706                ret = -EINVAL;
 707                if (optlen == sizeof(struct sock_fprog)) {
 708                        struct sock_fprog fprog;
 709
 710                        ret = -EFAULT;
 711                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 712                                break;
 713
 714                        ret = sk_attach_filter(&fprog, sk);
 715                }
 716                break;
 717
 718        case SO_DETACH_FILTER:
 719                ret = sk_detach_filter(sk);
 720                break;
 721
 722        case SO_PASSSEC:
 723                if (valbool)
 724                        set_bit(SOCK_PASSSEC, &sock->flags);
 725                else
 726                        clear_bit(SOCK_PASSSEC, &sock->flags);
 727                break;
 728        case SO_MARK:
 729                if (!capable(CAP_NET_ADMIN))
 730                        ret = -EPERM;
 731                else
 732                        sk->sk_mark = val;
 733                break;
 734
 735                /* We implement the SO_SNDLOWAT etc to
 736                   not be settable (1003.1g 5.3) */
 737        case SO_RXQ_OVFL:
 738                if (valbool)
 739                        sock_set_flag(sk, SOCK_RXQ_OVFL);
 740                else
 741                        sock_reset_flag(sk, SOCK_RXQ_OVFL);
 742                break;
 743        default:
 744                ret = -ENOPROTOOPT;
 745                break;
 746        }
 747        release_sock(sk);
 748        return ret;
 749}
 750EXPORT_SYMBOL(sock_setsockopt);
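/*
 * Illustrative userspace sketch of the SO_RCVBUF behaviour described
 * above: the requested value is doubled to cover struct sk_buff and
 * related overhead, and getsockopt() reports the value actually in use.
 *
 *      int req = 4096, got = 0;
 *      socklen_t len = sizeof(got);
 *
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *      // got is now 8192 (or SOCK_MIN_RCVBUF if that is larger),
 *      // the request having first been capped at sysctl_rmem_max
 */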
 751
 752
 753void cred_to_ucred(struct pid *pid, const struct cred *cred,
 754                   struct ucred *ucred)
 755{
 756        ucred->pid = pid_vnr(pid);
 757        ucred->uid = ucred->gid = -1;
 758        if (cred) {
 759                struct user_namespace *current_ns = current_user_ns();
 760
 761                ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
 762                ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
 763        }
 764}
 765EXPORT_SYMBOL_GPL(cred_to_ucred);
 766
 767int sock_getsockopt(struct socket *sock, int level, int optname,
 768                    char __user *optval, int __user *optlen)
 769{
 770        struct sock *sk = sock->sk;
 771
 772        union {
 773                int val;
 774                struct linger ling;
 775                struct timeval tm;
 776        } v;
 777
 778        int lv = sizeof(int);
 779        int len;
 780
 781        if (get_user(len, optlen))
 782                return -EFAULT;
 783        if (len < 0)
 784                return -EINVAL;
 785
 786        memset(&v, 0, sizeof(v));
 787
 788        switch (optname) {
 789        case SO_DEBUG:
 790                v.val = sock_flag(sk, SOCK_DBG);
 791                break;
 792
 793        case SO_DONTROUTE:
 794                v.val = sock_flag(sk, SOCK_LOCALROUTE);
 795                break;
 796
 797        case SO_BROADCAST:
 798                v.val = !!sock_flag(sk, SOCK_BROADCAST);
 799                break;
 800
 801        case SO_SNDBUF:
 802                v.val = sk->sk_sndbuf;
 803                break;
 804
 805        case SO_RCVBUF:
 806                v.val = sk->sk_rcvbuf;
 807                break;
 808
 809        case SO_REUSEADDR:
 810                v.val = sk->sk_reuse;
 811                break;
 812
 813        case SO_KEEPALIVE:
 814                v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 815                break;
 816
 817        case SO_TYPE:
 818                v.val = sk->sk_type;
 819                break;
 820
 821        case SO_PROTOCOL:
 822                v.val = sk->sk_protocol;
 823                break;
 824
 825        case SO_DOMAIN:
 826                v.val = sk->sk_family;
 827                break;
 828
 829        case SO_ERROR:
 830                v.val = -sock_error(sk);
 831                if (v.val == 0)
 832                        v.val = xchg(&sk->sk_err_soft, 0);
 833                break;
 834
 835        case SO_OOBINLINE:
 836                v.val = !!sock_flag(sk, SOCK_URGINLINE);
 837                break;
 838
 839        case SO_NO_CHECK:
 840                v.val = sk->sk_no_check;
 841                break;
 842
 843        case SO_PRIORITY:
 844                v.val = sk->sk_priority;
 845                break;
 846
 847        case SO_LINGER:
 848                lv              = sizeof(v.ling);
 849                v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 850                v.ling.l_linger = sk->sk_lingertime / HZ;
 851                break;
 852
 853        case SO_BSDCOMPAT:
 854                sock_warn_obsolete_bsdism("getsockopt");
 855                break;
 856
 857        case SO_TIMESTAMP:
 858                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 859                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
 860                break;
 861
 862        case SO_TIMESTAMPNS:
 863                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 864                break;
 865
 866        case SO_TIMESTAMPING:
 867                v.val = 0;
 868                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 869                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 870                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 871                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 872                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 873                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 874                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 875                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 876                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 877                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
 878                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 879                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 880                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 881                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 882                break;
 883
 884        case SO_RCVTIMEO:
 885                lv = sizeof(struct timeval);
 886                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 887                        v.tm.tv_sec = 0;
 888                        v.tm.tv_usec = 0;
 889                } else {
 890                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 891                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 892                }
 893                break;
 894
 895        case SO_SNDTIMEO:
 896                lv = sizeof(struct timeval);
 897                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 898                        v.tm.tv_sec = 0;
 899                        v.tm.tv_usec = 0;
 900                } else {
 901                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 902                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 903                }
 904                break;
 905
 906        case SO_RCVLOWAT:
 907                v.val = sk->sk_rcvlowat;
 908                break;
 909
 910        case SO_SNDLOWAT:
 911                v.val = 1;
 912                break;
 913
 914        case SO_PASSCRED:
 915                v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 916                break;
 917
 918        case SO_PEERCRED:
 919        {
 920                struct ucred peercred;
 921                if (len > sizeof(peercred))
 922                        len = sizeof(peercred);
 923                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
 924                if (copy_to_user(optval, &peercred, len))
 925                        return -EFAULT;
 926                goto lenout;
 927        }
 928
 929        case SO_PEERNAME:
 930        {
 931                char address[128];
 932
 933                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 934                        return -ENOTCONN;
 935                if (lv < len)
 936                        return -EINVAL;
 937                if (copy_to_user(optval, address, len))
 938                        return -EFAULT;
 939                goto lenout;
 940        }
 941
 942        /* Dubious BSD thing... Probably nobody even uses it, but
 943         * the UNIX standard wants it for whatever reason... -DaveM
 944         */
 945        case SO_ACCEPTCONN:
 946                v.val = sk->sk_state == TCP_LISTEN;
 947                break;
 948
 949        case SO_PASSSEC:
 950                v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 951                break;
 952
 953        case SO_PEERSEC:
 954                return security_socket_getpeersec_stream(sock, optval, optlen, len);
 955
 956        case SO_MARK:
 957                v.val = sk->sk_mark;
 958                break;
 959
 960        case SO_RXQ_OVFL:
 961                v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
 962                break;
 963
 964        default:
 965                return -ENOPROTOOPT;
 966        }
 967
 968        if (len > lv)
 969                len = lv;
 970        if (copy_to_user(optval, &v, len))
 971                return -EFAULT;
 972lenout:
 973        if (put_user(len, optlen))
 974                return -EFAULT;
 975        return 0;
 976}
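/*
 * Illustrative userspace sketch of one of the cases above: reading the
 * peer credentials of a connected AF_UNIX socket via SO_PEERCRED, which
 * is filled in by cred_to_ucred().
 *
 *      struct ucred peer;
 *      socklen_t len = sizeof(peer);
 *
 *      if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *              printf("peer pid=%d uid=%d gid=%d\n",
 *                     peer.pid, peer.uid, peer.gid);
 */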
 977
 978/*
 979 * Initialize an sk_lock.
 980 *
 981 * (We also register the sk_lock with the lock validator.)
 982 */
 983static inline void sock_lock_init(struct sock *sk)
 984{
 985        sock_lock_init_class_and_name(sk,
 986                        af_family_slock_key_strings[sk->sk_family],
 987                        af_family_slock_keys + sk->sk_family,
 988                        af_family_key_strings[sk->sk_family],
 989                        af_family_keys + sk->sk_family);
 990}
 991
 992/*
 993 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
  994 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 995 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 996 */
 997static void sock_copy(struct sock *nsk, const struct sock *osk)
 998{
 999#ifdef CONFIG_SECURITY_NETWORK
1000        void *sptr = nsk->sk_security;
1001#endif
1002        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1003
1004        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1005               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1006
1007#ifdef CONFIG_SECURITY_NETWORK
1008        nsk->sk_security = sptr;
1009        security_sk_clone(osk, nsk);
1010#endif
1011}
1012
1013/*
 1014 * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls nodes
 1015 * unmodified. Special care is taken when initializing the object to zero.
1016 */
1017static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1018{
1019        if (offsetof(struct sock, sk_node.next) != 0)
1020                memset(sk, 0, offsetof(struct sock, sk_node.next));
1021        memset(&sk->sk_node.pprev, 0,
1022               size - offsetof(struct sock, sk_node.pprev));
1023}
1024
1025void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1026{
1027        unsigned long nulls1, nulls2;
1028
1029        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1030        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1031        if (nulls1 > nulls2)
1032                swap(nulls1, nulls2);
1033
1034        if (nulls1 != 0)
1035                memset((char *)sk, 0, nulls1);
1036        memset((char *)sk + nulls1 + sizeof(void *), 0,
1037               nulls2 - nulls1 - sizeof(void *));
1038        memset((char *)sk + nulls2 + sizeof(void *), 0,
1039               size - nulls2 - sizeof(void *));
1040}
1041EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1042
1043static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1044                int family)
1045{
1046        struct sock *sk;
1047        struct kmem_cache *slab;
1048
1049        slab = prot->slab;
1050        if (slab != NULL) {
1051                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1052                if (!sk)
1053                        return sk;
1054                if (priority & __GFP_ZERO) {
1055                        if (prot->clear_sk)
1056                                prot->clear_sk(sk, prot->obj_size);
1057                        else
1058                                sk_prot_clear_nulls(sk, prot->obj_size);
1059                }
1060        } else
1061                sk = kmalloc(prot->obj_size, priority);
1062
1063        if (sk != NULL) {
1064                kmemcheck_annotate_bitfield(sk, flags);
1065
1066                if (security_sk_alloc(sk, family, priority))
1067                        goto out_free;
1068
1069                if (!try_module_get(prot->owner))
1070                        goto out_free_sec;
1071                sk_tx_queue_clear(sk);
1072        }
1073
1074        return sk;
1075
1076out_free_sec:
1077        security_sk_free(sk);
1078out_free:
1079        if (slab != NULL)
1080                kmem_cache_free(slab, sk);
1081        else
1082                kfree(sk);
1083        return NULL;
1084}
1085
1086static void sk_prot_free(struct proto *prot, struct sock *sk)
1087{
1088        struct kmem_cache *slab;
1089        struct module *owner;
1090
1091        owner = prot->owner;
1092        slab = prot->slab;
1093
1094        security_sk_free(sk);
1095        if (slab != NULL)
1096                kmem_cache_free(slab, sk);
1097        else
1098                kfree(sk);
1099        module_put(owner);
1100}
1101
1102#ifdef CONFIG_CGROUPS
1103void sock_update_classid(struct sock *sk)
1104{
1105        u32 classid;
1106
1107        rcu_read_lock();  /* doing current task, which cannot vanish. */
1108        classid = task_cls_classid(current);
1109        rcu_read_unlock();
1110        if (classid && classid != sk->sk_classid)
1111                sk->sk_classid = classid;
1112}
1113EXPORT_SYMBOL(sock_update_classid);
1114#endif
1115
1116/**
1117 *      sk_alloc - All socket objects are allocated here
1118 *      @net: the applicable net namespace
1119 *      @family: protocol family
1120 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1121 *      @prot: struct proto associated with this new sock instance
1122 */
1123struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1124                      struct proto *prot)
1125{
1126        struct sock *sk;
1127
1128        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1129        if (sk) {
1130                sk->sk_family = family;
1131                /*
1132                 * See comment in struct sock definition to understand
1133                 * why we need sk_prot_creator -acme
1134                 */
1135                sk->sk_prot = sk->sk_prot_creator = prot;
1136                sock_lock_init(sk);
1137                sock_net_set(sk, get_net(net));
1138                atomic_set(&sk->sk_wmem_alloc, 1);
1139
1140                sock_update_classid(sk);
1141        }
1142
1143        return sk;
1144}
1145EXPORT_SYMBOL(sk_alloc);
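/*
 * Illustrative sketch (hypothetical protocol family, not from this tree)
 * of how a family's create routine typically obtains a socket from the
 * allocator above and then fills in the generic fields:
 *
 *      static struct proto myproto_proto = {
 *              .name     = "MYPROTO",
 *              .owner    = THIS_MODULE,
 *              .obj_size = sizeof(struct myproto_sock),
 *      };
 *
 *      static int myproto_create(struct net *net, struct socket *sock,
 *                                int protocol, int kern)
 *      {
 *              struct sock *sk;
 *
 *              sk = sk_alloc(net, PF_MYPROTO, GFP_KERNEL, &myproto_proto);
 *              if (!sk)
 *                      return -ENOMEM;
 *              sock_init_data(sock, sk);
 *              return 0;
 *      }
 */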
1146
1147static void __sk_free(struct sock *sk)
1148{
1149        struct sk_filter *filter;
1150
1151        if (sk->sk_destruct)
1152                sk->sk_destruct(sk);
1153
1154        filter = rcu_dereference_check(sk->sk_filter,
1155                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1156        if (filter) {
1157                sk_filter_uncharge(sk, filter);
1158                rcu_assign_pointer(sk->sk_filter, NULL);
1159        }
1160
1161        sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1162        sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1163
1164        if (atomic_read(&sk->sk_omem_alloc))
1165                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1166                       __func__, atomic_read(&sk->sk_omem_alloc));
1167
1168        if (sk->sk_peer_cred)
1169                put_cred(sk->sk_peer_cred);
1170        put_pid(sk->sk_peer_pid);
1171        put_net(sock_net(sk));
1172        sk_prot_free(sk->sk_prot_creator, sk);
1173}
1174
1175void sk_free(struct sock *sk)
1176{
1177        /*
 1178         * We subtract one from sk_wmem_alloc so we can tell whether
 1179         * some packets are still in some tx queue.
 1180         * If it is not yet zero, sock_wfree() will call __sk_free(sk) later
1181         */
1182        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1183                __sk_free(sk);
1184}
1185EXPORT_SYMBOL(sk_free);
1186
1187/*
 1188 * The last sock_put should drop the reference to sk->sk_net. It has already
 1189 * been dropped in sk_change_net. Taking a reference to the stopping namespace
 1190 * is not an option.
 1191 * Take a reference to the socket so it can be removed from the hash while still
 1192 * _alive_, and after that destroy it in the context of init_net.
1193 */
1194void sk_release_kernel(struct sock *sk)
1195{
1196        if (sk == NULL || sk->sk_socket == NULL)
1197                return;
1198
1199        sock_hold(sk);
1200        sock_release(sk->sk_socket);
1201        release_net(sock_net(sk));
1202        sock_net_set(sk, get_net(&init_net));
1203        sock_put(sk);
1204}
1205EXPORT_SYMBOL(sk_release_kernel);
1206
1207struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1208{
1209        struct sock *newsk;
1210
1211        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1212        if (newsk != NULL) {
1213                struct sk_filter *filter;
1214
1215                sock_copy(newsk, sk);
1216
1217                /* SANITY */
1218                get_net(sock_net(newsk));
1219                sk_node_init(&newsk->sk_node);
1220                sock_lock_init(newsk);
1221                bh_lock_sock(newsk);
1222                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1223                newsk->sk_backlog.len = 0;
1224
1225                atomic_set(&newsk->sk_rmem_alloc, 0);
1226                /*
1227                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1228                 */
1229                atomic_set(&newsk->sk_wmem_alloc, 1);
1230                atomic_set(&newsk->sk_omem_alloc, 0);
1231                skb_queue_head_init(&newsk->sk_receive_queue);
1232                skb_queue_head_init(&newsk->sk_write_queue);
1233#ifdef CONFIG_NET_DMA
1234                skb_queue_head_init(&newsk->sk_async_wait_queue);
1235#endif
1236
1237                spin_lock_init(&newsk->sk_dst_lock);
1238                rwlock_init(&newsk->sk_callback_lock);
1239                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1240                                af_callback_keys + newsk->sk_family,
1241                                af_family_clock_key_strings[newsk->sk_family]);
1242
1243                newsk->sk_dst_cache     = NULL;
1244                newsk->sk_wmem_queued   = 0;
1245                newsk->sk_forward_alloc = 0;
1246                newsk->sk_send_head     = NULL;
1247                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1248
1249                sock_reset_flag(newsk, SOCK_DONE);
1250                skb_queue_head_init(&newsk->sk_error_queue);
1251
1252                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1253                if (filter != NULL)
1254                        sk_filter_charge(newsk, filter);
1255
1256                if (unlikely(xfrm_sk_clone_policy(newsk))) {
 1257                        /* It is still a raw copy of the parent, so invalidate
 1258                         * the destructor and do a plain sk_free() */
1259                        newsk->sk_destruct = NULL;
1260                        sk_free(newsk);
1261                        newsk = NULL;
1262                        goto out;
1263                }
1264
1265                newsk->sk_err      = 0;
1266                newsk->sk_priority = 0;
1267                /*
1268                 * Before updating sk_refcnt, we must commit prior changes to memory
1269                 * (Documentation/RCU/rculist_nulls.txt for details)
1270                 */
1271                smp_wmb();
1272                atomic_set(&newsk->sk_refcnt, 2);
1273
1274                /*
1275                 * Increment the counter in the same struct proto as the master
1276                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1277                 * is the same as sk->sk_prot->socks, as this field was copied
1278                 * with memcpy).
1279                 *
1280                 * This _changes_ the previous behaviour, where
1281                 * tcp_create_openreq_child always was incrementing the
 1282                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1283                 * to be taken into account in all callers. -acme
1284                 */
1285                sk_refcnt_debug_inc(newsk);
1286                sk_set_socket(newsk, NULL);
1287                newsk->sk_wq = NULL;
1288
1289                if (newsk->sk_prot->sockets_allocated)
1290                        percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1291
1292                if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1293                    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1294                        net_enable_timestamp();
1295        }
1296out:
1297        return newsk;
1298}
1299EXPORT_SYMBOL_GPL(sk_clone);
1300
1301void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1302{
1303        __sk_dst_set(sk, dst);
1304        sk->sk_route_caps = dst->dev->features;
1305        if (sk->sk_route_caps & NETIF_F_GSO)
1306                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1307        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1308        if (sk_can_gso(sk)) {
1309                if (dst->header_len) {
1310                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1311                } else {
1312                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1313                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1314                }
1315        }
1316}
1317EXPORT_SYMBOL_GPL(sk_setup_caps);
1318
1319void __init sk_init(void)
1320{
1321        if (totalram_pages <= 4096) {
1322                sysctl_wmem_max = 32767;
1323                sysctl_rmem_max = 32767;
1324                sysctl_wmem_default = 32767;
1325                sysctl_rmem_default = 32767;
1326        } else if (totalram_pages >= 131072) {
1327                sysctl_wmem_max = 131071;
1328                sysctl_rmem_max = 131071;
1329        }
1330}
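/*
 * Illustrative arithmetic for the thresholds above, assuming 4 KiB
 * pages: 4096 pages is 16 MiB of RAM, so very small machines get
 * 32767-byte default and maximum buffers; 131072 pages is 512 MiB,
 * above which the maxima are set to 131071 bytes while the defaults
 * keep their compile-time SK_{W,R}MEM_MAX values.
 */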
1331
1332/*
1333 *      Simple resource managers for sockets.
1334 */
1335
1336
1337/*
1338 * Write buffer destructor automatically called from kfree_skb.
1339 */
1340void sock_wfree(struct sk_buff *skb)
1341{
1342        struct sock *sk = skb->sk;
1343        unsigned int len = skb->truesize;
1344
1345        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1346                /*
1347                 * Keep a reference on sk_wmem_alloc, this will be released
1348                 * after sk_write_space() call
1349                 */
1350                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1351                sk->sk_write_space(sk);
1352                len = 1;
1353        }
1354        /*
1355         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1356         * could not do because of in-flight packets
1357         */
1358        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1359                __sk_free(sk);
1360}
1361EXPORT_SYMBOL(sock_wfree);
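/*
 * Illustrative timeline (example numbers only) of the sk_wmem_alloc
 * "+1" trick shared by sk_alloc(), sk_free() and sock_wfree():
 *
 *      sk_alloc()                            sk_wmem_alloc = 1
 *      two queued skbs of truesize 500       sk_wmem_alloc = 1001
 *      sk_free()                             sk_wmem_alloc = 1000  (no free yet)
 *      sock_wfree() on the first skb         sk_wmem_alloc = 500
 *      sock_wfree() on the second skb        sk_wmem_alloc = 0 -> __sk_free()
 */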
1362
1363/*
1364 * Read buffer destructor automatically called from kfree_skb.
1365 */
1366void sock_rfree(struct sk_buff *skb)
1367{
1368        struct sock *sk = skb->sk;
1369        unsigned int len = skb->truesize;
1370
1371        atomic_sub(len, &sk->sk_rmem_alloc);
1372        sk_mem_uncharge(sk, len);
1373}
1374EXPORT_SYMBOL(sock_rfree);
1375
1376
1377int sock_i_uid(struct sock *sk)
1378{
1379        int uid;
1380
1381        read_lock_bh(&sk->sk_callback_lock);
1382        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1383        read_unlock_bh(&sk->sk_callback_lock);
1384        return uid;
1385}
1386EXPORT_SYMBOL(sock_i_uid);
1387
1388unsigned long sock_i_ino(struct sock *sk)
1389{
1390        unsigned long ino;
1391
1392        read_lock_bh(&sk->sk_callback_lock);
1393        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1394        read_unlock_bh(&sk->sk_callback_lock);
1395        return ino;
1396}
1397EXPORT_SYMBOL(sock_i_ino);
1398
1399/*
1400 * Allocate a skb from the socket's send buffer.
1401 */
1402struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1403                             gfp_t priority)
1404{
1405        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1406                struct sk_buff *skb = alloc_skb(size, priority);
1407                if (skb) {
1408                        skb_set_owner_w(skb, sk);
1409                        return skb;
1410                }
1411        }
1412        return NULL;
1413}
1414EXPORT_SYMBOL(sock_wmalloc);
1415
1416/*
1417 * Allocate a skb from the socket's receive buffer.
1418 */
1419struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1420                             gfp_t priority)
1421{
1422        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1423                struct sk_buff *skb = alloc_skb(size, priority);
1424                if (skb) {
1425                        skb_set_owner_r(skb, sk);
1426                        return skb;
1427                }
1428        }
1429        return NULL;
1430}
1431
1432/*
1433 * Allocate a memory block from the socket's option memory buffer.
1434 */
1435void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1436{
1437        if ((unsigned)size <= sysctl_optmem_max &&
1438            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1439                void *mem;
1440                /* First do the add, to avoid the race if kmalloc
1441                 * might sleep.
1442                 */
1443                atomic_add(size, &sk->sk_omem_alloc);
1444                mem = kmalloc(size, priority);
1445                if (mem)
1446                        return mem;
1447                atomic_sub(size, &sk->sk_omem_alloc);
1448        }
1449        return NULL;
1450}
1451EXPORT_SYMBOL(sock_kmalloc);
1452
1453/*
1454 * Free an option memory block.
1455 */
1456void sock_kfree_s(struct sock *sk, void *mem, int size)
1457{
1458        kfree(mem);
1459        atomic_sub(size, &sk->sk_omem_alloc);
1460}
1461EXPORT_SYMBOL(sock_kfree_s);
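/*
 * Illustrative sketch (hypothetical caller) of the expected pairing:
 * option memory charged to sk_omem_alloc by sock_kmalloc() must be
 * released with sock_kfree_s() using the same size, so the charge is
 * undone exactly.
 *
 *      struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *      if (!opt)
 *              return -ENOBUFS;
 *      // ... use opt, then give the memory back ...
 *      sock_kfree_s(sk, opt, sizeof(*opt));
 */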
1462
1463/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 1464   I think these locks should be removed for datagram sockets.
1465 */
1466static long sock_wait_for_wmem(struct sock *sk, long timeo)
1467{
1468        DEFINE_WAIT(wait);
1469
1470        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1471        for (;;) {
1472                if (!timeo)
1473                        break;
1474                if (signal_pending(current))
1475                        break;
1476                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1477                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1478                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1479                        break;
1480                if (sk->sk_shutdown & SEND_SHUTDOWN)
1481                        break;
1482                if (sk->sk_err)
1483                        break;
1484                timeo = schedule_timeout(timeo);
1485        }
1486        finish_wait(sk_sleep(sk), &wait);
1487        return timeo;
1488}
1489
1490
1491/*
1492 *      Generic send/receive buffer handlers
1493 */
1494
1495struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1496                                     unsigned long data_len, int noblock,
1497                                     int *errcode)
1498{
1499        struct sk_buff *skb;
1500        gfp_t gfp_mask;
1501        long timeo;
1502        int err;
1503
1504        gfp_mask = sk->sk_allocation;
1505        if (gfp_mask & __GFP_WAIT)
1506                gfp_mask |= __GFP_REPEAT;
1507
1508        timeo = sock_sndtimeo(sk, noblock);
1509        while (1) {
1510                err = sock_error(sk);
1511                if (err != 0)
1512                        goto failure;
1513
1514                err = -EPIPE;
1515                if (sk->sk_shutdown & SEND_SHUTDOWN)
1516                        goto failure;
1517
1518                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1519                        skb = alloc_skb(header_len, gfp_mask);
1520                        if (skb) {
1521                                int npages;
1522                                int i;
1523
1524                                /* No pages, we're done... */
1525                                if (!data_len)
1526                                        break;
1527
1528                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1529                                skb->truesize += data_len;
1530                                skb_shinfo(skb)->nr_frags = npages;
1531                                for (i = 0; i < npages; i++) {
1532                                        struct page *page;
1533                                        skb_frag_t *frag;
1534
1535                                        page = alloc_pages(sk->sk_allocation, 0);
1536                                        if (!page) {
1537                                                err = -ENOBUFS;
1538                                                skb_shinfo(skb)->nr_frags = i;
1539                                                kfree_skb(skb);
1540                                                goto failure;
1541                                        }
1542
1543                                        frag = &skb_shinfo(skb)->frags[i];
1544                                        frag->page = page;
1545                                        frag->page_offset = 0;
1546                                        frag->size = (data_len >= PAGE_SIZE ?
1547                                                      PAGE_SIZE :
1548                                                      data_len);
1549                                        data_len -= PAGE_SIZE;
1550                                }
1551
1552                                /* Full success... */
1553                                break;
1554                        }
1555                        err = -ENOBUFS;
1556                        goto failure;
1557                }
1558                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1559                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1560                err = -EAGAIN;
1561                if (!timeo)
1562                        goto failure;
1563                if (signal_pending(current))
1564                        goto interrupted;
1565                timeo = sock_wait_for_wmem(sk, timeo);
1566        }
1567
1568        skb_set_owner_w(skb, sk);
1569        return skb;
1570
1571interrupted:
1572        err = sock_intr_errno(timeo);
1573failure:
1574        *errcode = err;
1575        return NULL;
1576}
1577EXPORT_SYMBOL(sock_alloc_send_pskb);
1578
1579struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1580                                    int noblock, int *errcode)
1581{
1582        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1583}
1584EXPORT_SYMBOL(sock_alloc_send_skb);
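
/*
 * [Editorial example -- not part of sock.c] The usual calling pattern for
 * sock_alloc_send_skb() in a datagram sendmsg handler: allocate against the
 * send buffer (blocking in sock_wait_for_wmem() unless MSG_DONTWAIT),
 * reserve header room and copy the payload in. The handler name, the
 * 128-byte headroom and the transmit step are hypothetical.
 */
static int example_dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        unsigned int reserve = 128;     /* assumed headroom for lower layers */
        struct sk_buff *skb;
        int err;

        skb = sock_alloc_send_skb(sk, len + reserve,
                                  msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb)
                return err;     /* -EAGAIN, -EPIPE, -ERESTARTSYS, ... */

        skb_reserve(skb, reserve);
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (err) {
                kfree_skb(skb);
                return err;
        }

        /* ... hand the skb to the protocol's transmit path ... */
        return len;
}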
1585
1586static void __lock_sock(struct sock *sk)
1587        __releases(&sk->sk_lock.slock)
1588        __acquires(&sk->sk_lock.slock)
1589{
1590        DEFINE_WAIT(wait);
1591
1592        for (;;) {
1593                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1594                                        TASK_UNINTERRUPTIBLE);
1595                spin_unlock_bh(&sk->sk_lock.slock);
1596                schedule();
1597                spin_lock_bh(&sk->sk_lock.slock);
1598                if (!sock_owned_by_user(sk))
1599                        break;
1600        }
1601        finish_wait(&sk->sk_lock.wq, &wait);
1602}
1603
1604static void __release_sock(struct sock *sk)
1605        __releases(&sk->sk_lock.slock)
1606        __acquires(&sk->sk_lock.slock)
1607{
1608        struct sk_buff *skb = sk->sk_backlog.head;
1609
1610        do {
1611                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1612                bh_unlock_sock(sk);
1613
1614                do {
1615                        struct sk_buff *next = skb->next;
1616
1617                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1618                        skb->next = NULL;
1619                        sk_backlog_rcv(sk, skb);
1620
1621                        /*
1622                         * We are in process context here with softirqs
1623                         * disabled, use cond_resched_softirq() to preempt.
1624                         * This is safe to do because we've taken the backlog
1625                         * queue private:
1626                         */
1627                        cond_resched_softirq();
1628
1629                        skb = next;
1630                } while (skb != NULL);
1631
1632                bh_lock_sock(sk);
1633        } while ((skb = sk->sk_backlog.head) != NULL);
1634
1635        /*
1636         * Doing the zeroing here guarantees we cannot loop forever
1637         * while a wild producer attempts to flood us.
1638         */
1639        sk->sk_backlog.len = 0;
1640}
1641
1642/**
1643 * sk_wait_data - wait for data to arrive at sk_receive_queue
1644 * @sk:    sock to wait on
1645 * @timeo: for how long
1646 *
1647 * Now socket state including sk->sk_err is changed only under lock,
1648 * hence we may omit checks after joining wait queue.
1649 * We check the receive queue before schedule() only as an optimization;
1650 * it is very likely that release_sock() added new data.
1651 */
1652int sk_wait_data(struct sock *sk, long *timeo)
1653{
1654        int rc;
1655        DEFINE_WAIT(wait);
1656
1657        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1658        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1659        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1660        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1661        finish_wait(sk_sleep(sk), &wait);
1662        return rc;
1663}
1664EXPORT_SYMBOL(sk_wait_data);
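
/*
 * [Editorial example -- not part of sock.c] A hedged sketch of the receive
 * loop most protocols build around sk_wait_data(): it is called with the
 * socket lock held, releases the lock while sleeping and re-takes it before
 * returning. The helper name is hypothetical.
 */
static int example_wait_for_data(struct sock *sk, int noblock)
{
        long timeo = sock_rcvtimeo(sk, noblock);
        int err = 0;

        lock_sock(sk);
        while (skb_queue_empty(&sk->sk_receive_queue)) {
                err = sock_error(sk);
                if (err)
                        break;
                err = -EAGAIN;
                if (!timeo)
                        break;
                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        break;
                err = 0;
                sk_wait_data(sk, &timeo);
        }
        release_sock(sk);
        return err;
}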
1665
1666/**
1667 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1668 *      @sk: socket
1669 *      @size: memory size to allocate
1670 *      @kind: allocation type
1671 *
1672 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1673 *      rmem allocation. This function assumes that protocols which have
1674 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1675 */
1676int __sk_mem_schedule(struct sock *sk, int size, int kind)
1677{
1678        struct proto *prot = sk->sk_prot;
1679        int amt = sk_mem_pages(size);
1680        long allocated;
1681
1682        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1683        allocated = atomic_long_add_return(amt, prot->memory_allocated);
1684
1685        /* Under limit. */
1686        if (allocated <= prot->sysctl_mem[0]) {
1687                if (prot->memory_pressure && *prot->memory_pressure)
1688                        *prot->memory_pressure = 0;
1689                return 1;
1690        }
1691
1692        /* Under pressure. */
1693        if (allocated > prot->sysctl_mem[1])
1694                if (prot->enter_memory_pressure)
1695                        prot->enter_memory_pressure(sk);
1696
1697        /* Over hard limit. */
1698        if (allocated > prot->sysctl_mem[2])
1699                goto suppress_allocation;
1700
1701        /* guarantee minimum buffer size under pressure */
1702        if (kind == SK_MEM_RECV) {
1703                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1704                        return 1;
1705        } else { /* SK_MEM_SEND */
1706                if (sk->sk_type == SOCK_STREAM) {
1707                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1708                                return 1;
1709                } else if (atomic_read(&sk->sk_wmem_alloc) <
1710                           prot->sysctl_wmem[0])
1711                                return 1;
1712        }
1713
1714        if (prot->memory_pressure) {
1715                int alloc;
1716
1717                if (!*prot->memory_pressure)
1718                        return 1;
1719                alloc = percpu_counter_read_positive(prot->sockets_allocated);
1720                if (prot->sysctl_mem[2] > alloc *
1721                    sk_mem_pages(sk->sk_wmem_queued +
1722                                 atomic_read(&sk->sk_rmem_alloc) +
1723                                 sk->sk_forward_alloc))
1724                        return 1;
1725        }
1726
1727suppress_allocation:
1728
1729        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1730                sk_stream_moderate_sndbuf(sk);
1731
1732                /* Fail only if socket is _under_ its sndbuf.
1733                 * In this case we cannot block, so we have to fail.
1734                 */
1735                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1736                        return 1;
1737        }
1738
1739        /* Alas. Undo changes. */
1740        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1741        atomic_long_sub(amt, prot->memory_allocated);
1742        return 0;
1743}
1744EXPORT_SYMBOL(__sk_mem_schedule);
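
/*
 * [Editorial example -- not part of sock.c] Protocols normally reach
 * __sk_mem_schedule() through the sk_rmem_schedule()/sk_wmem_schedule()
 * wrappers in net/sock.h. A hedged sketch of charging an incoming skb,
 * roughly mirroring sock_queue_rcv_skb(); the helper name is hypothetical.
 */
static int example_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                return -ENOMEM;

        if (!sk_rmem_schedule(sk, skb->truesize))
                return -ENOBUFS;        /* over the protocol's memory limits */

        skb_set_owner_r(skb, sk);       /* charges truesize, destructor = sock_rfree */
        skb_queue_tail(&sk->sk_receive_queue, skb);
        /* ... then wake readers, e.g. via sk->sk_data_ready() ... */
        return 0;
}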
1745
1746/**
1747 *      __sk_mem_reclaim - reclaim memory_allocated
1748 *      @sk: socket
1749 */
1750void __sk_mem_reclaim(struct sock *sk)
1751{
1752        struct proto *prot = sk->sk_prot;
1753
1754        atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1755                   prot->memory_allocated);
1756        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1757
1758        if (prot->memory_pressure && *prot->memory_pressure &&
1759            (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1760                *prot->memory_pressure = 0;
1761}
1762EXPORT_SYMBOL(__sk_mem_reclaim);
1763
1764
1765/*
1766 * Set of default routines for initialising struct proto_ops when
1767 * the protocol does not support a particular function. In certain
1768 * cases where it makes no sense for a protocol to have a "do nothing"
1769 * function, some default processing is provided.
1770 */
1771
1772int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1773{
1774        return -EOPNOTSUPP;
1775}
1776EXPORT_SYMBOL(sock_no_bind);
1777
1778int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1779                    int len, int flags)
1780{
1781        return -EOPNOTSUPP;
1782}
1783EXPORT_SYMBOL(sock_no_connect);
1784
1785int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1786{
1787        return -EOPNOTSUPP;
1788}
1789EXPORT_SYMBOL(sock_no_socketpair);
1790
1791int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1792{
1793        return -EOPNOTSUPP;
1794}
1795EXPORT_SYMBOL(sock_no_accept);
1796
1797int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1798                    int *len, int peer)
1799{
1800        return -EOPNOTSUPP;
1801}
1802EXPORT_SYMBOL(sock_no_getname);
1803
1804unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1805{
1806        return 0;
1807}
1808EXPORT_SYMBOL(sock_no_poll);
1809
1810int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1811{
1812        return -EOPNOTSUPP;
1813}
1814EXPORT_SYMBOL(sock_no_ioctl);
1815
1816int sock_no_listen(struct socket *sock, int backlog)
1817{
1818        return -EOPNOTSUPP;
1819}
1820EXPORT_SYMBOL(sock_no_listen);
1821
1822int sock_no_shutdown(struct socket *sock, int how)
1823{
1824        return -EOPNOTSUPP;
1825}
1826EXPORT_SYMBOL(sock_no_shutdown);
1827
1828int sock_no_setsockopt(struct socket *sock, int level, int optname,
1829                    char __user *optval, unsigned int optlen)
1830{
1831        return -EOPNOTSUPP;
1832}
1833EXPORT_SYMBOL(sock_no_setsockopt);
1834
1835int sock_no_getsockopt(struct socket *sock, int level, int optname,
1836                    char __user *optval, int __user *optlen)
1837{
1838        return -EOPNOTSUPP;
1839}
1840EXPORT_SYMBOL(sock_no_getsockopt);
1841
1842int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1843                    size_t len)
1844{
1845        return -EOPNOTSUPP;
1846}
1847EXPORT_SYMBOL(sock_no_sendmsg);
1848
1849int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1850                    size_t len, int flags)
1851{
1852        return -EOPNOTSUPP;
1853}
1854EXPORT_SYMBOL(sock_no_recvmsg);
1855
1856int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1857{
1858        /* Mirror missing mmap method error code */
1859        return -ENODEV;
1860}
1861EXPORT_SYMBOL(sock_no_mmap);
1862
1863ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1864{
1865        ssize_t res;
1866        struct msghdr msg = {.msg_flags = flags};
1867        struct kvec iov;
1868        char *kaddr = kmap(page);
1869        iov.iov_base = kaddr + offset;
1870        iov.iov_len = size;
1871        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1872        kunmap(page);
1873        return res;
1874}
1875EXPORT_SYMBOL(sock_no_sendpage);
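
/*
 * [Editorial example -- not part of sock.c] The sock_no_*() stubs above
 * exist to fill the proto_ops slots an address family does not implement.
 * A schematic datagram family might wire them up like this (PF_UNSPEC
 * stands in for the family's real PF_* value; the family's own release/
 * bind/getname/sendmsg/recvmsg handlers are omitted):
 */
static const struct proto_ops example_dgram_ops = {
        .family         = PF_UNSPEC,            /* stand-in value */
        .owner          = THIS_MODULE,
        /* .release, .bind, .getname, .sendmsg, .recvmsg: family-specific */
        .connect        = sock_no_connect,      /* connectionless */
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .poll           = datagram_poll,
        .ioctl          = sock_no_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};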
1876
1877/*
1878 *      Default Socket Callbacks
1879 */
1880
1881static void sock_def_wakeup(struct sock *sk)
1882{
1883        struct socket_wq *wq;
1884
1885        rcu_read_lock();
1886        wq = rcu_dereference(sk->sk_wq);
1887        if (wq_has_sleeper(wq))
1888                wake_up_interruptible_all(&wq->wait);
1889        rcu_read_unlock();
1890}
1891
1892static void sock_def_error_report(struct sock *sk)
1893{
1894        struct socket_wq *wq;
1895
1896        rcu_read_lock();
1897        wq = rcu_dereference(sk->sk_wq);
1898        if (wq_has_sleeper(wq))
1899                wake_up_interruptible_poll(&wq->wait, POLLERR);
1900        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1901        rcu_read_unlock();
1902}
1903
1904static void sock_def_readable(struct sock *sk, int len)
1905{
1906        struct socket_wq *wq;
1907
1908        rcu_read_lock();
1909        wq = rcu_dereference(sk->sk_wq);
1910        if (wq_has_sleeper(wq))
1911                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1912                                                POLLRDNORM | POLLRDBAND);
1913        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1914        rcu_read_unlock();
1915}
1916
1917static void sock_def_write_space(struct sock *sk)
1918{
1919        struct socket_wq *wq;
1920
1921        rcu_read_lock();
1922
1923        /* Do not wake up a writer until he can make "significant"
1924         * progress.  --DaveM
1925         */
1926        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1927                wq = rcu_dereference(sk->sk_wq);
1928                if (wq_has_sleeper(wq))
1929                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1930                                                POLLWRNORM | POLLWRBAND);
1931
1932                /* Should agree with poll, otherwise some programs break */
1933                if (sock_writeable(sk))
1934                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1935        }
1936
1937        rcu_read_unlock();
1938}
1939
1940static void sock_def_destruct(struct sock *sk)
1941{
1942        kfree(sk->sk_protinfo);
1943}
1944
1945void sk_send_sigurg(struct sock *sk)
1946{
1947        if (sk->sk_socket && sk->sk_socket->file)
1948                if (send_sigurg(&sk->sk_socket->file->f_owner))
1949                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1950}
1951EXPORT_SYMBOL(sk_send_sigurg);
1952
1953void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1954                    unsigned long expires)
1955{
1956        if (!mod_timer(timer, expires))
1957                sock_hold(sk);
1958}
1959EXPORT_SYMBOL(sk_reset_timer);
1960
1961void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1962{
1963        if (timer_pending(timer) && del_timer(timer))
1964                __sock_put(sk);
1965}
1966EXPORT_SYMBOL(sk_stop_timer);
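
/*
 * [Editorial example -- not part of sock.c] sk_reset_timer() takes a
 * reference with sock_hold() when it arms a timer that was not already
 * pending; the timer handler drops it with sock_put() when it fires, and
 * sk_stop_timer() drops it when it cancels a pending timer. A hedged sketch
 * of that pairing (handler name and one-second period are hypothetical;
 * the protocol is assumed to have set up sk->sk_timer with setup_timer()
 * so that 'data' carries the struct sock pointer):
 */
static void example_timer_fire(unsigned long data)
{
        struct sock *sk = (struct sock *)data;

        bh_lock_sock(sk);
        /* ... protocol work, possibly re-arming via sk_reset_timer() ... */
        bh_unlock_sock(sk);
        sock_put(sk);           /* pairs with the hold taken when armed */
}

static void example_arm_timer(struct sock *sk)
{
        sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}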
1967
1968void sock_init_data(struct socket *sock, struct sock *sk)
1969{
1970        skb_queue_head_init(&sk->sk_receive_queue);
1971        skb_queue_head_init(&sk->sk_write_queue);
1972        skb_queue_head_init(&sk->sk_error_queue);
1973#ifdef CONFIG_NET_DMA
1974        skb_queue_head_init(&sk->sk_async_wait_queue);
1975#endif
1976
1977        sk->sk_send_head        =       NULL;
1978
1979        init_timer(&sk->sk_timer);
1980
1981        sk->sk_allocation       =       GFP_KERNEL;
1982        sk->sk_rcvbuf           =       sysctl_rmem_default;
1983        sk->sk_sndbuf           =       sysctl_wmem_default;
1984        sk->sk_state            =       TCP_CLOSE;
1985        sk_set_socket(sk, sock);
1986
1987        sock_set_flag(sk, SOCK_ZAPPED);
1988
1989        if (sock) {
1990                sk->sk_type     =       sock->type;
1991                sk->sk_wq       =       sock->wq;
1992                sock->sk        =       sk;
1993        } else
1994                sk->sk_wq       =       NULL;
1995
1996        spin_lock_init(&sk->sk_dst_lock);
1997        rwlock_init(&sk->sk_callback_lock);
1998        lockdep_set_class_and_name(&sk->sk_callback_lock,
1999                        af_callback_keys + sk->sk_family,
2000                        af_family_clock_key_strings[sk->sk_family]);
2001
2002        sk->sk_state_change     =       sock_def_wakeup;
2003        sk->sk_data_ready       =       sock_def_readable;
2004        sk->sk_write_space      =       sock_def_write_space;
2005        sk->sk_error_report     =       sock_def_error_report;
2006        sk->sk_destruct         =       sock_def_destruct;
2007
2008        sk->sk_sndmsg_page      =       NULL;
2009        sk->sk_sndmsg_off       =       0;
2010
2011        sk->sk_peer_pid         =       NULL;
2012        sk->sk_peer_cred        =       NULL;
2013        sk->sk_write_pending    =       0;
2014        sk->sk_rcvlowat         =       1;
2015        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2016        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2017
2018        sk->sk_stamp = ktime_set(-1L, 0);
2019
2020        /*
2021         * Before updating sk_refcnt, we must commit prior changes to memory
2022         * (Documentation/RCU/rculist_nulls.txt for details)
2023         */
2024        smp_wmb();
2025        atomic_set(&sk->sk_refcnt, 1);
2026        atomic_set(&sk->sk_drops, 0);
2027}
2028EXPORT_SYMBOL(sock_init_data);
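
/*
 * [Editorial example -- not part of sock.c] sock_init_data() is normally
 * called from an address family's create handler, right after sk_alloc().
 * Hedged sketch: example_dgram_ops is the schematic ops table sketched
 * earlier beside the sock_no_*() stubs, and example_proto is a hypothetical
 * struct proto (a minimal one appears near the proto_register() notes
 * further down), so this would need both in scope to build.
 */
static int example_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        struct sock *sk;

        sock->ops = &example_dgram_ops;         /* hypothetical ops table */
        sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &example_proto);
        if (!sk)
                return -ENOBUFS;

        sock_init_data(sock, sk);       /* queues, timers, default callbacks */
        sk->sk_protocol = protocol;
        /* ... per-protocol initialisation ... */
        return 0;
}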
2029
2030void lock_sock_nested(struct sock *sk, int subclass)
2031{
2032        might_sleep();
2033        spin_lock_bh(&sk->sk_lock.slock);
2034        if (sk->sk_lock.owned)
2035                __lock_sock(sk);
2036        sk->sk_lock.owned = 1;
2037        spin_unlock(&sk->sk_lock.slock);
2038        /*
2039         * The sk_lock has mutex_lock() semantics here:
2040         */
2041        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2042        local_bh_enable();
2043}
2044EXPORT_SYMBOL(lock_sock_nested);
2045
2046void release_sock(struct sock *sk)
2047{
2048        /*
2049         * The sk_lock has mutex_unlock() semantics:
2050         */
2051        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2052
2053        spin_lock_bh(&sk->sk_lock.slock);
2054        if (sk->sk_backlog.tail)
2055                __release_sock(sk);
2056        sk->sk_lock.owned = 0;
2057        if (waitqueue_active(&sk->sk_lock.wq))
2058                wake_up(&sk->sk_lock.wq);
2059        spin_unlock_bh(&sk->sk_lock.slock);
2060}
2061EXPORT_SYMBOL(release_sock);
2062
2063/**
2064 * lock_sock_fast - fast version of lock_sock
2065 * @sk: socket
2066 *
2067 * This version should be used for very small sections, where the process won't block.
2068 * Returns false if the fast path is taken:
2069 *   sk_lock.slock locked, owned = 0, BH disabled
2070 * Returns true if the slow path is taken:
2071 *   sk_lock.slock unlocked, owned = 1, BH enabled
2072 */
2073bool lock_sock_fast(struct sock *sk)
2074{
2075        might_sleep();
2076        spin_lock_bh(&sk->sk_lock.slock);
2077
2078        if (!sk->sk_lock.owned)
2079                /*
2080                 * Note: the fast path returns with the spinlock held and BH disabled
2081                 */
2082                return false;
2083
2084        __lock_sock(sk);
2085        sk->sk_lock.owned = 1;
2086        spin_unlock(&sk->sk_lock.slock);
2087        /*
2088         * The sk_lock has mutex_lock() semantics here:
2089         */
2090        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2091        local_bh_enable();
2092        return true;
2093}
2094EXPORT_SYMBOL(lock_sock_fast);
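
/*
 * [Editorial example -- not part of sock.c] The bool returned by
 * lock_sock_fast() must be passed back to unlock_sock_fast() so the
 * matching unlock path runs. Typical use for a short, non-blocking
 * section; the helper name is hypothetical:
 */
static int example_first_skb_len(struct sock *sk)
{
        struct sk_buff *skb;
        int len = 0;
        bool slow;

        slow = lock_sock_fast(sk);
        skb = skb_peek(&sk->sk_receive_queue);
        if (skb)
                len = skb->len;
        unlock_sock_fast(sk, slow);
        return len;
}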
2095
2096int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2097{
2098        struct timeval tv;
2099        if (!sock_flag(sk, SOCK_TIMESTAMP))
2100                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2101        tv = ktime_to_timeval(sk->sk_stamp);
2102        if (tv.tv_sec == -1)
2103                return -ENOENT;
2104        if (tv.tv_sec == 0) {
2105                sk->sk_stamp = ktime_get_real();
2106                tv = ktime_to_timeval(sk->sk_stamp);
2107        }
2108        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2109}
2110EXPORT_SYMBOL(sock_get_timestamp);
2111
2112int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2113{
2114        struct timespec ts;
2115        if (!sock_flag(sk, SOCK_TIMESTAMP))
2116                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2117        ts = ktime_to_timespec(sk->sk_stamp);
2118        if (ts.tv_sec == -1)
2119                return -ENOENT;
2120        if (ts.tv_sec == 0) {
2121                sk->sk_stamp = ktime_get_real();
2122                ts = ktime_to_timespec(sk->sk_stamp);
2123        }
2124        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2125}
2126EXPORT_SYMBOL(sock_get_timestampns);
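
/*
 * [Editorial example -- not part of sock.c] These two helpers back the
 * SIOCGSTAMP and SIOCGSTAMPNS ioctls; a protocol ioctl handler typically
 * dispatches to them as below (sketch only, handler name hypothetical):
 */
static int example_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCGSTAMP:
                return sock_get_timestamp(sk, (struct timeval __user *)arg);
        case SIOCGSTAMPNS:
                return sock_get_timestampns(sk, (struct timespec __user *)arg);
        default:
                return -ENOIOCTLCMD;
        }
}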
2127
2128void sock_enable_timestamp(struct sock *sk, int flag)
2129{
2130        if (!sock_flag(sk, flag)) {
2131                sock_set_flag(sk, flag);
2132                /*
2133                 * we just set one of the two flags which require net
2134                 * time stamping, but time stamping might have been on
2135                 * already because of the other one
2136                 */
2137                if (!sock_flag(sk,
2138                                flag == SOCK_TIMESTAMP ?
2139                                SOCK_TIMESTAMPING_RX_SOFTWARE :
2140                                SOCK_TIMESTAMP))
2141                        net_enable_timestamp();
2142        }
2143}
2144
2145/*
2146 *      Get a socket option on a socket.
2147 *
2148 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2149 *      asynchronous errors should be reported by getsockopt. We assume
2150 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2151 */
2152int sock_common_getsockopt(struct socket *sock, int level, int optname,
2153                           char __user *optval, int __user *optlen)
2154{
2155        struct sock *sk = sock->sk;
2156
2157        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2158}
2159EXPORT_SYMBOL(sock_common_getsockopt);
2160
2161#ifdef CONFIG_COMPAT
2162int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2163                                  char __user *optval, int __user *optlen)
2164{
2165        struct sock *sk = sock->sk;
2166
2167        if (sk->sk_prot->compat_getsockopt != NULL)
2168                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2169                                                      optval, optlen);
2170        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2171}
2172EXPORT_SYMBOL(compat_sock_common_getsockopt);
2173#endif
2174
2175int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2176                        struct msghdr *msg, size_t size, int flags)
2177{
2178        struct sock *sk = sock->sk;
2179        int addr_len = 0;
2180        int err;
2181
2182        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2183                                   flags & ~MSG_DONTWAIT, &addr_len);
2184        if (err >= 0)
2185                msg->msg_namelen = addr_len;
2186        return err;
2187}
2188EXPORT_SYMBOL(sock_common_recvmsg);
2189
2190/*
2191 *      Set socket options on a socket.
2192 */
2193int sock_common_setsockopt(struct socket *sock, int level, int optname,
2194                           char __user *optval, unsigned int optlen)
2195{
2196        struct sock *sk = sock->sk;
2197
2198        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2199}
2200EXPORT_SYMBOL(sock_common_setsockopt);
2201
2202#ifdef CONFIG_COMPAT
2203int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2204                                  char __user *optval, unsigned int optlen)
2205{
2206        struct sock *sk = sock->sk;
2207
2208        if (sk->sk_prot->compat_setsockopt != NULL)
2209                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2210                                                      optval, optlen);
2211        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2212}
2213EXPORT_SYMBOL(compat_sock_common_setsockopt);
2214#endif
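
/*
 * [Editorial example -- not part of sock.c] Families whose struct proto
 * implements setsockopt/getsockopt/recvmsg can point their proto_ops at
 * the sock_common_*() wrappers above, roughly as the AF_INET ops tables
 * do. Schematic fragment (PF_UNSPEC is a stand-in; family-specific slots
 * are omitted):
 */
static const struct proto_ops example_common_ops = {
        .family         = PF_UNSPEC,
        .owner          = THIS_MODULE,
        /* ... the family's own release/bind/sendmsg/etc. handlers ... */
        .setsockopt     = sock_common_setsockopt,
        .getsockopt     = sock_common_getsockopt,
        .recvmsg        = sock_common_recvmsg,
#ifdef CONFIG_COMPAT
        .compat_setsockopt = compat_sock_common_setsockopt,
        .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};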
2215
2216void sk_common_release(struct sock *sk)
2217{
2218        if (sk->sk_prot->destroy)
2219                sk->sk_prot->destroy(sk);
2220
2221        /*
2222         * Observation: when sk_common_release() is called, processes have
2223         * no access to the socket, but the network stack still does.
2224         * Step one, detach it from networking:
2225         *
2226         * A. Remove from hash tables.
2227         */
2228
2229        sk->sk_prot->unhash(sk);
2230
2231        /*
2232         * At this point the socket cannot receive new packets, but it is possible
2233         * that some packets are still in flight because some CPU running the
2234         * receive path did its hash table lookup before we unhashed the socket.
2235         * They will reach the receive queue and be purged by the socket destructor.
2236         *
2237         * Also we may still have packets pending on the receive queue and, probably,
2238         * our own packets waiting in device queues. sock_destroy will drain the
2239         * receive queue, but transmitted packets will delay socket destruction
2240         * until the last reference is released.
2241         */
2242
2243        sock_orphan(sk);
2244
2245        xfrm_sk_free_policy(sk);
2246
2247        sk_refcnt_debug_release(sk);
2248        sock_put(sk);
2249}
2250EXPORT_SYMBOL(sk_common_release);
2251
2252static DEFINE_RWLOCK(proto_list_lock);
2253static LIST_HEAD(proto_list);
2254
2255#ifdef CONFIG_PROC_FS
2256#define PROTO_INUSE_NR  64      /* should be enough for now */
2257struct prot_inuse {
2258        int val[PROTO_INUSE_NR];
2259};
2260
2261static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2262
2263#ifdef CONFIG_NET_NS
2264void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2265{
2266        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2267}
2268EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2269
2270int sock_prot_inuse_get(struct net *net, struct proto *prot)
2271{
2272        int cpu, idx = prot->inuse_idx;
2273        int res = 0;
2274
2275        for_each_possible_cpu(cpu)
2276                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2277
2278        return res >= 0 ? res : 0;
2279}
2280EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2281
2282static int __net_init sock_inuse_init_net(struct net *net)
2283{
2284        net->core.inuse = alloc_percpu(struct prot_inuse);
2285        return net->core.inuse ? 0 : -ENOMEM;
2286}
2287
2288static void __net_exit sock_inuse_exit_net(struct net *net)
2289{
2290        free_percpu(net->core.inuse);
2291}
2292
2293static struct pernet_operations net_inuse_ops = {
2294        .init = sock_inuse_init_net,
2295        .exit = sock_inuse_exit_net,
2296};
2297
2298static __init int net_inuse_init(void)
2299{
2300        if (register_pernet_subsys(&net_inuse_ops))
2301                panic("Cannot initialize net inuse counters");
2302
2303        return 0;
2304}
2305
2306core_initcall(net_inuse_init);
2307#else
2308static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2309
2310void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2311{
2312        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2313}
2314EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2315
2316int sock_prot_inuse_get(struct net *net, struct proto *prot)
2317{
2318        int cpu, idx = prot->inuse_idx;
2319        int res = 0;
2320
2321        for_each_possible_cpu(cpu)
2322                res += per_cpu(prot_inuse, cpu).val[idx];
2323
2324        return res >= 0 ? res : 0;
2325}
2326EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2327#endif
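
/*
 * [Editorial example -- not part of sock.c] Protocols keep these per-cpu
 * counters in sync from their hash/unhash callbacks; /proc/net/protocols
 * then reports the totals via sock_prot_inuse_get(). Sketch (callback
 * names hypothetical, lookup-table handling omitted):
 */
static void example_hash(struct sock *sk)
{
        /* ... insert sk into the protocol's lookup table ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
        /* ... remove sk from the lookup table ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}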
2328
2329static void assign_proto_idx(struct proto *prot)
2330{
2331        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2332
2333        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2334                printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2335                return;
2336        }
2337
2338        set_bit(prot->inuse_idx, proto_inuse_idx);
2339}
2340
2341static void release_proto_idx(struct proto *prot)
2342{
2343        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2344                clear_bit(prot->inuse_idx, proto_inuse_idx);
2345}
2346#else
2347static inline void assign_proto_idx(struct proto *prot)
2348{
2349}
2350
2351static inline void release_proto_idx(struct proto *prot)
2352{
2353}
2354#endif
2355
2356int proto_register(struct proto *prot, int alloc_slab)
2357{
2358        if (alloc_slab) {
2359                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2360                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2361                                        NULL);
2362
2363                if (prot->slab == NULL) {
2364                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2365                               prot->name);
2366                        goto out;
2367                }
2368
2369                if (prot->rsk_prot != NULL) {
2370                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2371                        if (prot->rsk_prot->slab_name == NULL)
2372                                goto out_free_sock_slab;
2373
2374                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2375                                                                 prot->rsk_prot->obj_size, 0,
2376                                                                 SLAB_HWCACHE_ALIGN, NULL);
2377
2378                        if (prot->rsk_prot->slab == NULL) {
2379                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2380                                       prot->name);
2381                                goto out_free_request_sock_slab_name;
2382                        }
2383                }
2384
2385                if (prot->twsk_prot != NULL) {
2386                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2387
2388                        if (prot->twsk_prot->twsk_slab_name == NULL)
2389                                goto out_free_request_sock_slab;
2390
2391                        prot->twsk_prot->twsk_slab =
2392                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2393                                                  prot->twsk_prot->twsk_obj_size,
2394                                                  0,
2395                                                  SLAB_HWCACHE_ALIGN |
2396                                                        prot->slab_flags,
2397                                                  NULL);
2398                        if (prot->twsk_prot->twsk_slab == NULL)
2399                                goto out_free_timewait_sock_slab_name;
2400                }
2401        }
2402
2403        write_lock(&proto_list_lock);
2404        list_add(&prot->node, &proto_list);
2405        assign_proto_idx(prot);
2406        write_unlock(&proto_list_lock);
2407        return 0;
2408
2409out_free_timewait_sock_slab_name:
2410        kfree(prot->twsk_prot->twsk_slab_name);
2411out_free_request_sock_slab:
2412        if (prot->rsk_prot && prot->rsk_prot->slab) {
2413                kmem_cache_destroy(prot->rsk_prot->slab);
2414                prot->rsk_prot->slab = NULL;
2415        }
2416out_free_request_sock_slab_name:
2417        if (prot->rsk_prot)
2418                kfree(prot->rsk_prot->slab_name);
2419out_free_sock_slab:
2420        kmem_cache_destroy(prot->slab);
2421        prot->slab = NULL;
2422out:
2423        return -ENOBUFS;
2424}
2425EXPORT_SYMBOL(proto_register);
2426
2427void proto_unregister(struct proto *prot)
2428{
2429        write_lock(&proto_list_lock);
2430        release_proto_idx(prot);
2431        list_del(&prot->node);
2432        write_unlock(&proto_list_lock);
2433
2434        if (prot->slab != NULL) {
2435                kmem_cache_destroy(prot->slab);
2436                prot->slab = NULL;
2437        }
2438
2439        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2440                kmem_cache_destroy(prot->rsk_prot->slab);
2441                kfree(prot->rsk_prot->slab_name);
2442                prot->rsk_prot->slab = NULL;
2443        }
2444
2445        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2446                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2447                kfree(prot->twsk_prot->twsk_slab_name);
2448                prot->twsk_prot->twsk_slab = NULL;
2449        }
2450}
2451EXPORT_SYMBOL(proto_unregister);
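
/*
 * [Editorial example -- not part of sock.c] A protocol module registers
 * its struct proto before exposing the address family and unwinds in the
 * reverse order on exit. Hedged sketch: "EXAMPLE", example_proto and the
 * family ops are hypothetical, and a real module would use its own PF_*
 * value and supply a .create handler.
 */
static struct proto example_proto = {
        .name           = "EXAMPLE",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct sock),
};

static const struct net_proto_family example_family_ops = {
        .family         = PF_UNSPEC,            /* stand-in value */
        /* .create      = the family's socket-create handler */
        .owner          = THIS_MODULE,
};

static int __init example_module_init(void)
{
        int rc = proto_register(&example_proto, 1);     /* 1 => allocate a slab */

        if (rc)
                return rc;

        rc = sock_register(&example_family_ops);
        if (rc)
                proto_unregister(&example_proto);
        return rc;
}

static void __exit example_module_exit(void)
{
        sock_unregister(example_family_ops.family);
        proto_unregister(&example_proto);
}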
2452
2453#ifdef CONFIG_PROC_FS
2454static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2455        __acquires(proto_list_lock)
2456{
2457        read_lock(&proto_list_lock);
2458        return seq_list_start_head(&proto_list, *pos);
2459}
2460
2461static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2462{
2463        return seq_list_next(v, &proto_list, pos);
2464}
2465
2466static void proto_seq_stop(struct seq_file *seq, void *v)
2467        __releases(proto_list_lock)
2468{
2469        read_unlock(&proto_list_lock);
2470}
2471
2472static char proto_method_implemented(const void *method)
2473{
2474        return method == NULL ? 'n' : 'y';
2475}
2476
2477static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2478{
2479        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2480                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2481                   proto->name,
2482                   proto->obj_size,
2483                   sock_prot_inuse_get(seq_file_net(seq), proto),
2484                   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
2485                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2486                   proto->max_header,
2487                   proto->slab == NULL ? "no" : "yes",
2488                   module_name(proto->owner),
2489                   proto_method_implemented(proto->close),
2490                   proto_method_implemented(proto->connect),
2491                   proto_method_implemented(proto->disconnect),
2492                   proto_method_implemented(proto->accept),
2493                   proto_method_implemented(proto->ioctl),
2494                   proto_method_implemented(proto->init),
2495                   proto_method_implemented(proto->destroy),
2496                   proto_method_implemented(proto->shutdown),
2497                   proto_method_implemented(proto->setsockopt),
2498                   proto_method_implemented(proto->getsockopt),
2499                   proto_method_implemented(proto->sendmsg),
2500                   proto_method_implemented(proto->recvmsg),
2501                   proto_method_implemented(proto->sendpage),
2502                   proto_method_implemented(proto->bind),
2503                   proto_method_implemented(proto->backlog_rcv),
2504                   proto_method_implemented(proto->hash),
2505                   proto_method_implemented(proto->unhash),
2506                   proto_method_implemented(proto->get_port),
2507                   proto_method_implemented(proto->enter_memory_pressure));
2508}
2509
2510static int proto_seq_show(struct seq_file *seq, void *v)
2511{
2512        if (v == &proto_list)
2513                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2514                           "protocol",
2515                           "size",
2516                           "sockets",
2517                           "memory",
2518                           "press",
2519                           "maxhdr",
2520                           "slab",
2521                           "module",
2522                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2523        else
2524                proto_seq_printf(seq, list_entry(v, struct proto, node));
2525        return 0;
2526}
2527
2528static const struct seq_operations proto_seq_ops = {
2529        .start  = proto_seq_start,
2530        .next   = proto_seq_next,
2531        .stop   = proto_seq_stop,
2532        .show   = proto_seq_show,
2533};
2534
2535static int proto_seq_open(struct inode *inode, struct file *file)
2536{
2537        return seq_open_net(inode, file, &proto_seq_ops,
2538                            sizeof(struct seq_net_private));
2539}
2540
2541static const struct file_operations proto_seq_fops = {
2542        .owner          = THIS_MODULE,
2543        .open           = proto_seq_open,
2544        .read           = seq_read,
2545        .llseek         = seq_lseek,
2546        .release        = seq_release_net,
2547};
2548
2549static __net_init int proto_init_net(struct net *net)
2550{
2551        if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2552                return -ENOMEM;
2553
2554        return 0;
2555}
2556
2557static __net_exit void proto_exit_net(struct net *net)
2558{
2559        proc_net_remove(net, "protocols");
2560}
2561
2562
2563static __net_initdata struct pernet_operations proto_net_ops = {
2564        .init = proto_init_net,
2565        .exit = proto_exit_net,
2566};
2567
2568static int __init proto_init(void)
2569{
2570        return register_pernet_subsys(&proto_net_ops);
2571}
2572
2573subsys_initcall(proto_init);
2574
2575#endif /* PROC_FS */
2576