linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/types.h>
  97#include <linux/socket.h>
  98#include <linux/in.h>
  99#include <linux/kernel.h>
 100#include <linux/module.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/sched.h>
 104#include <linux/timer.h>
 105#include <linux/string.h>
 106#include <linux/sockios.h>
 107#include <linux/net.h>
 108#include <linux/mm.h>
 109#include <linux/slab.h>
 110#include <linux/interrupt.h>
 111#include <linux/poll.h>
 112#include <linux/tcp.h>
 113#include <linux/init.h>
 114#include <linux/highmem.h>
 115#include <linux/user_namespace.h>
 116#include <linux/static_key.h>
 117#include <linux/memcontrol.h>
 118#include <linux/prefetch.h>
 119
 120#include <asm/uaccess.h>
 121
 122#include <linux/netdevice.h>
 123#include <net/protocol.h>
 124#include <linux/skbuff.h>
 125#include <net/net_namespace.h>
 126#include <net/request_sock.h>
 127#include <net/sock.h>
 128#include <linux/net_tstamp.h>
 129#include <net/xfrm.h>
 130#include <linux/ipsec.h>
 131#include <net/cls_cgroup.h>
 132#include <net/netprio_cgroup.h>
 133
 134#include <linux/filter.h>
 135
 136#include <trace/events/sock.h>
 137
 138#ifdef CONFIG_INET
 139#include <net/tcp.h>
 140#endif
 141
 142static DEFINE_MUTEX(proto_list_mutex);
 143static LIST_HEAD(proto_list);
 144
 145#ifdef CONFIG_MEMCG_KMEM
 146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 147{
 148        struct proto *proto;
 149        int ret = 0;
 150
 151        mutex_lock(&proto_list_mutex);
 152        list_for_each_entry(proto, &proto_list, node) {
 153                if (proto->init_cgroup) {
 154                        ret = proto->init_cgroup(memcg, ss);
 155                        if (ret)
 156                                goto out;
 157                }
 158        }
 159
 160        mutex_unlock(&proto_list_mutex);
 161        return ret;
 162out:
 163        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 164                if (proto->destroy_cgroup)
 165                        proto->destroy_cgroup(memcg);
 166        mutex_unlock(&proto_list_mutex);
 167        return ret;
 168}
 169
 170void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 171{
 172        struct proto *proto;
 173
 174        mutex_lock(&proto_list_mutex);
 175        list_for_each_entry_reverse(proto, &proto_list, node)
 176                if (proto->destroy_cgroup)
 177                        proto->destroy_cgroup(memcg);
 178        mutex_unlock(&proto_list_mutex);
 179}
 180#endif
 181
 182/*
 183 * Each address family might have different locking rules, so we have
 184 * one slock key per address family:
 185 */
 186static struct lock_class_key af_family_keys[AF_MAX];
 187static struct lock_class_key af_family_slock_keys[AF_MAX];
 188
 189#if defined(CONFIG_MEMCG_KMEM)
 190struct static_key memcg_socket_limit_enabled;
 191EXPORT_SYMBOL(memcg_socket_limit_enabled);
 192#endif
 193
 194/*
 195 * Make lock validator output more readable. (we pre-construct these
  196 * strings at build time, so that runtime initialization of socket
 197 * locks is fast):
 198 */
 199static const char *const af_family_key_strings[AF_MAX+1] = {
 200  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 201  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 202  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 203  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 204  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 205  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 206  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 207  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 208  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 209  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 210  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 211  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 212  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 213  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 214};
 215static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 216  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 217  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 218  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 219  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 220  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 221  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 222  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 223  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 224  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 225  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 226  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 227  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 228  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 229  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
 230};
 231static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 232  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 233  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 234  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 235  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 236  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 237  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 238  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 239  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 240  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 241  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 242  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 243  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 244  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 245  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 246};
 247
 248/*
 249 * sk_callback_lock locking rules are per-address-family,
 250 * so split the lock classes by using a per-AF key:
 251 */
 252static struct lock_class_key af_callback_keys[AF_MAX];
 253
 254/* Take into consideration the size of the struct sk_buff overhead in the
 255 * determination of these values, since that is non-constant across
 256 * platforms.  This makes socket queueing behavior and performance
 257 * not depend upon such differences.
 258 */
 259#define _SK_MEM_PACKETS         256
 260#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 261#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 262#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 263
 264/* Run time adjustable parameters. */
 265__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 266EXPORT_SYMBOL(sysctl_wmem_max);
 267__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 268EXPORT_SYMBOL(sysctl_rmem_max);
 269__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 270__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 271
 272/* Maximal space eaten by iovec or ancillary data plus some space */
 273int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 274EXPORT_SYMBOL(sysctl_optmem_max);
 275
 276struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 277EXPORT_SYMBOL_GPL(memalloc_socks);
 278
 279/**
 280 * sk_set_memalloc - sets %SOCK_MEMALLOC
 281 * @sk: socket to set it on
 282 *
 283 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 284 * It's the responsibility of the admin to adjust min_free_kbytes
  285 * to meet the requirements.
 286 */
 287void sk_set_memalloc(struct sock *sk)
 288{
 289        sock_set_flag(sk, SOCK_MEMALLOC);
 290        sk->sk_allocation |= __GFP_MEMALLOC;
 291        static_key_slow_inc(&memalloc_socks);
 292}
 293EXPORT_SYMBOL_GPL(sk_set_memalloc);
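/* Minimal usage sketch (hypothetical caller, shown for illustration only):
 * a driver whose socket carries swap traffic, e.g. swap over NBD or NFS,
 * would mark the transport socket before it is used for reclaim-critical
 * I/O and clear the flag again on teardown:
 *
 *	sk_set_memalloc(sock->sk);     - before the socket serves swap I/O
 *	...
 *	sk_clear_memalloc(sock->sk);   - when the socket is being torn down
 *
 * The exact call sites are up to the individual driver.
 */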
 294
 295void sk_clear_memalloc(struct sock *sk)
 296{
 297        sock_reset_flag(sk, SOCK_MEMALLOC);
 298        sk->sk_allocation &= ~__GFP_MEMALLOC;
 299        static_key_slow_dec(&memalloc_socks);
 300
 301        /*
 302         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 303         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 304         * it has rmem allocations there is a risk that the user of the
 305         * socket cannot make forward progress due to exceeding the rmem
 306         * limits. By rights, sk_clear_memalloc() should only be called
 307         * on sockets being torn down but warn and reset the accounting if
 308         * that assumption breaks.
 309         */
 310        if (WARN_ON(sk->sk_forward_alloc))
 311                sk_mem_reclaim(sk);
 312}
 313EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 314
 315int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 316{
 317        int ret;
 318        unsigned long pflags = current->flags;
 319
 320        /* these should have been dropped before queueing */
 321        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 322
 323        current->flags |= PF_MEMALLOC;
 324        ret = sk->sk_backlog_rcv(sk, skb);
 325        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 326
 327        return ret;
 328}
 329EXPORT_SYMBOL(__sk_backlog_rcv);
 330
 331static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 332{
 333        struct timeval tv;
 334
 335        if (optlen < sizeof(tv))
 336                return -EINVAL;
 337        if (copy_from_user(&tv, optval, sizeof(tv)))
 338                return -EFAULT;
 339        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 340                return -EDOM;
 341
 342        if (tv.tv_sec < 0) {
 343                static int warned __read_mostly;
 344
 345                *timeo_p = 0;
 346                if (warned < 10 && net_ratelimit()) {
 347                        warned++;
 348                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 349                                __func__, current->comm, task_pid_nr(current));
 350                }
 351                return 0;
 352        }
 353        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 354        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 355                return 0;
 356        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 357                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 358        return 0;
 359}
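/* sock_set_timeout() backs SO_RCVTIMEO and SO_SNDTIMEO below.  From user
 * space the timeout is passed as a struct timeval; an all-zero value means
 * "wait forever" and a negative tv_sec is clamped to an immediate timeout
 * (with the rate-limited warning above).  Illustrative user-space usage:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */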
 360
 361static void sock_warn_obsolete_bsdism(const char *name)
 362{
 363        static int warned;
 364        static char warncomm[TASK_COMM_LEN];
 365        if (strcmp(warncomm, current->comm) && warned < 5) {
 366                strcpy(warncomm,  current->comm);
 367                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 368                        warncomm, name);
 369                warned++;
 370        }
 371}
 372
 373#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 374
 375static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 376{
 377        if (sk->sk_flags & flags) {
 378                sk->sk_flags &= ~flags;
 379                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 380                        net_disable_timestamp();
 381        }
 382}
 383
 384
 385int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 386{
 387        int err;
 388        int skb_len;
 389        unsigned long flags;
 390        struct sk_buff_head *list = &sk->sk_receive_queue;
 391
 392        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 393                atomic_inc(&sk->sk_drops);
 394                trace_sock_rcvqueue_full(sk, skb);
 395                return -ENOMEM;
 396        }
 397
 398        err = sk_filter(sk, skb);
 399        if (err)
 400                return err;
 401
 402        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 403                atomic_inc(&sk->sk_drops);
 404                return -ENOBUFS;
 405        }
 406
 407        skb->dev = NULL;
 408        skb_set_owner_r(skb, sk);
 409
 410        /* Cache the SKB length before we tack it onto the receive
 411         * queue.  Once it is added it no longer belongs to us and
 412         * may be freed by other threads of control pulling packets
 413         * from the queue.
 414         */
 415        skb_len = skb->len;
 416
  417        /* we escape from the RCU protected region, make sure we don't leak
  418         * a non-refcounted dst
 419         */
 420        skb_dst_force(skb);
 421
 422        spin_lock_irqsave(&list->lock, flags);
 423        skb->dropcount = atomic_read(&sk->sk_drops);
 424        __skb_queue_tail(list, skb);
 425        spin_unlock_irqrestore(&list->lock, flags);
 426
 427        if (!sock_flag(sk, SOCK_DEAD))
 428                sk->sk_data_ready(sk, skb_len);
 429        return 0;
 430}
 431EXPORT_SYMBOL(sock_queue_rcv_skb);
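/* Illustrative caller (hypothetical protocol, sketch only): the helper does
 * not free the skb on failure, so a receive path using it must drop the
 * packet itself when queueing fails:
 *
 *	static int my_proto_queue(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);    - -ENOMEM / -ENOBUFS: still ours to free
 *		return err;
 *	}
 */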
 432
 433int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 434{
 435        int rc = NET_RX_SUCCESS;
 436
 437        if (sk_filter(sk, skb))
 438                goto discard_and_relse;
 439
 440        skb->dev = NULL;
 441
 442        if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 443                atomic_inc(&sk->sk_drops);
 444                goto discard_and_relse;
 445        }
 446        if (nested)
 447                bh_lock_sock_nested(sk);
 448        else
 449                bh_lock_sock(sk);
 450        if (!sock_owned_by_user(sk)) {
 451                /*
 452                 * trylock + unlock semantics:
 453                 */
 454                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 455
 456                rc = sk_backlog_rcv(sk, skb);
 457
 458                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 459        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 460                bh_unlock_sock(sk);
 461                atomic_inc(&sk->sk_drops);
 462                goto discard_and_relse;
 463        }
 464
 465        bh_unlock_sock(sk);
 466out:
 467        sock_put(sk);
 468        return rc;
 469discard_and_relse:
 470        kfree_skb(skb);
 471        goto out;
 472}
 473EXPORT_SYMBOL(sk_receive_skb);
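/* Usage note (sketch only): sk_receive_skb() consumes both references handed
 * to it - it always drops the caller's hold on the socket via sock_put(),
 * and it frees the skb itself on the discard paths - so a typical caller
 * looks up and holds the socket, then hands everything over:
 *
 *	sk = my_proto_lookup(...);    - hypothetical lookup returning a held sk
 *	if (sk)
 *		return sk_receive_skb(sk, skb, 0);
 *	kfree_skb(skb);
 */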
 474
 475void sk_reset_txq(struct sock *sk)
 476{
 477        sk_tx_queue_clear(sk);
 478}
 479EXPORT_SYMBOL(sk_reset_txq);
 480
 481struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 482{
 483        struct dst_entry *dst = __sk_dst_get(sk);
 484
 485        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 486                sk_tx_queue_clear(sk);
 487                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 488                dst_release(dst);
 489                return NULL;
 490        }
 491
 492        return dst;
 493}
 494EXPORT_SYMBOL(__sk_dst_check);
 495
 496struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 497{
 498        struct dst_entry *dst = sk_dst_get(sk);
 499
 500        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 501                sk_dst_reset(sk);
 502                dst_release(dst);
 503                return NULL;
 504        }
 505
 506        return dst;
 507}
 508EXPORT_SYMBOL(sk_dst_check);
 509
 510static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 511                                int optlen)
 512{
 513        int ret = -ENOPROTOOPT;
 514#ifdef CONFIG_NETDEVICES
 515        struct net *net = sock_net(sk);
 516        char devname[IFNAMSIZ];
 517        int index;
 518
 519        /* Sorry... */
 520        ret = -EPERM;
 521        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 522                goto out;
 523
 524        ret = -EINVAL;
 525        if (optlen < 0)
 526                goto out;
 527
 528        /* Bind this socket to a particular device like "eth0",
 529         * as specified in the passed interface name. If the
 530         * name is "" or the option length is zero the socket
 531         * is not bound.
 532         */
 533        if (optlen > IFNAMSIZ - 1)
 534                optlen = IFNAMSIZ - 1;
 535        memset(devname, 0, sizeof(devname));
 536
 537        ret = -EFAULT;
 538        if (copy_from_user(devname, optval, optlen))
 539                goto out;
 540
 541        index = 0;
 542        if (devname[0] != '\0') {
 543                struct net_device *dev;
 544
 545                rcu_read_lock();
 546                dev = dev_get_by_name_rcu(net, devname);
 547                if (dev)
 548                        index = dev->ifindex;
 549                rcu_read_unlock();
 550                ret = -ENODEV;
 551                if (!dev)
 552                        goto out;
 553        }
 554
 555        lock_sock(sk);
 556        sk->sk_bound_dev_if = index;
 557        sk_dst_reset(sk);
 558        release_sock(sk);
 559
 560        ret = 0;
 561
 562out:
 563#endif
 564
 565        return ret;
 566}
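/* Illustrative user-space usage (the interface name is hypothetical):
 * binding requires CAP_NET_RAW (checked against the socket's network
 * namespace), and passing an empty name or a zero option length removes
 * the binding again:
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, strlen(ifname) + 1) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 */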
 567
 568static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 569                                int __user *optlen, int len)
 570{
 571        int ret = -ENOPROTOOPT;
 572#ifdef CONFIG_NETDEVICES
 573        struct net *net = sock_net(sk);
 574        char devname[IFNAMSIZ];
 575
 576        if (sk->sk_bound_dev_if == 0) {
 577                len = 0;
 578                goto zero;
 579        }
 580
 581        ret = -EINVAL;
 582        if (len < IFNAMSIZ)
 583                goto out;
 584
 585        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 586        if (ret)
 587                goto out;
 588
 589        len = strlen(devname) + 1;
 590
 591        ret = -EFAULT;
 592        if (copy_to_user(optval, devname, len))
 593                goto out;
 594
 595zero:
 596        ret = -EFAULT;
 597        if (put_user(len, optlen))
 598                goto out;
 599
 600        ret = 0;
 601
 602out:
 603#endif
 604
 605        return ret;
 606}
 607
 608static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 609{
 610        if (valbool)
 611                sock_set_flag(sk, bit);
 612        else
 613                sock_reset_flag(sk, bit);
 614}
 615
 616/*
 617 *      This is meant for all protocols to use and covers goings on
 618 *      at the socket level. Everything here is generic.
 619 */
 620
 621int sock_setsockopt(struct socket *sock, int level, int optname,
 622                    char __user *optval, unsigned int optlen)
 623{
 624        struct sock *sk = sock->sk;
 625        int val;
 626        int valbool;
 627        struct linger ling;
 628        int ret = 0;
 629
 630        /*
 631         *      Options without arguments
 632         */
 633
 634        if (optname == SO_BINDTODEVICE)
 635                return sock_setbindtodevice(sk, optval, optlen);
 636
 637        if (optlen < sizeof(int))
 638                return -EINVAL;
 639
 640        if (get_user(val, (int __user *)optval))
 641                return -EFAULT;
 642
 643        valbool = val ? 1 : 0;
 644
 645        lock_sock(sk);
 646
 647        switch (optname) {
 648        case SO_DEBUG:
 649                if (val && !capable(CAP_NET_ADMIN))
 650                        ret = -EACCES;
 651                else
 652                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 653                break;
 654        case SO_REUSEADDR:
 655                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 656                break;
 657        case SO_REUSEPORT:
 658                sk->sk_reuseport = valbool;
 659                break;
 660        case SO_TYPE:
 661        case SO_PROTOCOL:
 662        case SO_DOMAIN:
 663        case SO_ERROR:
 664                ret = -ENOPROTOOPT;
 665                break;
 666        case SO_DONTROUTE:
 667                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 668                break;
 669        case SO_BROADCAST:
 670                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 671                break;
 672        case SO_SNDBUF:
  673                /* Don't return an error on this; BSD doesn't, and if you think
  674                 * about it, this is right. Otherwise apps have to
  675                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  676                 * are treated in BSD as hints.
 677                 */
 678                val = min_t(u32, val, sysctl_wmem_max);
 679set_sndbuf:
 680                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 681                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 682                /* Wake up sending tasks if we upped the value. */
 683                sk->sk_write_space(sk);
 684                break;
 685
 686        case SO_SNDBUFFORCE:
 687                if (!capable(CAP_NET_ADMIN)) {
 688                        ret = -EPERM;
 689                        break;
 690                }
 691                goto set_sndbuf;
 692
 693        case SO_RCVBUF:
  694                /* Don't return an error on this; BSD doesn't, and if you think
  695                 * about it, this is right. Otherwise apps have to
  696                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  697                 * are treated in BSD as hints.
 698                 */
 699                val = min_t(u32, val, sysctl_rmem_max);
 700set_rcvbuf:
 701                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 702                /*
 703                 * We double it on the way in to account for
 704                 * "struct sk_buff" etc. overhead.   Applications
 705                 * assume that the SO_RCVBUF setting they make will
 706                 * allow that much actual data to be received on that
 707                 * socket.
 708                 *
 709                 * Applications are unaware that "struct sk_buff" and
 710                 * other overheads allocate from the receive buffer
 711                 * during socket buffer allocation.
 712                 *
 713                 * And after considering the possible alternatives,
 714                 * returning the value we actually used in getsockopt
 715                 * is the most desirable behavior.
 716                 */
 717                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 718                break;
 719
 720        case SO_RCVBUFFORCE:
 721                if (!capable(CAP_NET_ADMIN)) {
 722                        ret = -EPERM;
 723                        break;
 724                }
 725                goto set_rcvbuf;
 726
 727        case SO_KEEPALIVE:
 728#ifdef CONFIG_INET
 729                if (sk->sk_protocol == IPPROTO_TCP &&
 730                    sk->sk_type == SOCK_STREAM)
 731                        tcp_set_keepalive(sk, valbool);
 732#endif
 733                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 734                break;
 735
 736        case SO_OOBINLINE:
 737                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 738                break;
 739
 740        case SO_NO_CHECK:
 741                sk->sk_no_check = valbool;
 742                break;
 743
 744        case SO_PRIORITY:
 745                if ((val >= 0 && val <= 6) ||
 746                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 747                        sk->sk_priority = val;
 748                else
 749                        ret = -EPERM;
 750                break;
 751
 752        case SO_LINGER:
 753                if (optlen < sizeof(ling)) {
 754                        ret = -EINVAL;  /* 1003.1g */
 755                        break;
 756                }
 757                if (copy_from_user(&ling, optval, sizeof(ling))) {
 758                        ret = -EFAULT;
 759                        break;
 760                }
 761                if (!ling.l_onoff)
 762                        sock_reset_flag(sk, SOCK_LINGER);
 763                else {
 764#if (BITS_PER_LONG == 32)
 765                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 766                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 767                        else
 768#endif
 769                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 770                        sock_set_flag(sk, SOCK_LINGER);
 771                }
 772                break;
 773
 774        case SO_BSDCOMPAT:
 775                sock_warn_obsolete_bsdism("setsockopt");
 776                break;
 777
 778        case SO_PASSCRED:
 779                if (valbool)
 780                        set_bit(SOCK_PASSCRED, &sock->flags);
 781                else
 782                        clear_bit(SOCK_PASSCRED, &sock->flags);
 783                break;
 784
 785        case SO_TIMESTAMP:
 786        case SO_TIMESTAMPNS:
 787                if (valbool)  {
 788                        if (optname == SO_TIMESTAMP)
 789                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 790                        else
 791                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 792                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 793                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 794                } else {
 795                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 796                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 797                }
 798                break;
 799
 800        case SO_TIMESTAMPING:
 801                if (val & ~SOF_TIMESTAMPING_MASK) {
 802                        ret = -EINVAL;
 803                        break;
 804                }
 805                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 806                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
 807                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 808                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
 809                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 810                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
 811                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 812                        sock_enable_timestamp(sk,
 813                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 814                else
 815                        sock_disable_timestamp(sk,
 816                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 817                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 818                                  val & SOF_TIMESTAMPING_SOFTWARE);
 819                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 820                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
 821                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 822                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
 823                break;
 824
 825        case SO_RCVLOWAT:
 826                if (val < 0)
 827                        val = INT_MAX;
 828                sk->sk_rcvlowat = val ? : 1;
 829                break;
 830
 831        case SO_RCVTIMEO:
 832                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 833                break;
 834
 835        case SO_SNDTIMEO:
 836                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 837                break;
 838
 839        case SO_ATTACH_FILTER:
 840                ret = -EINVAL;
 841                if (optlen == sizeof(struct sock_fprog)) {
 842                        struct sock_fprog fprog;
 843
 844                        ret = -EFAULT;
 845                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 846                                break;
 847
 848                        ret = sk_attach_filter(&fprog, sk);
 849                }
 850                break;
 851
 852        case SO_DETACH_FILTER:
 853                ret = sk_detach_filter(sk);
 854                break;
 855
 856        case SO_LOCK_FILTER:
 857                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 858                        ret = -EPERM;
 859                else
 860                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 861                break;
 862
 863        case SO_PASSSEC:
 864                if (valbool)
 865                        set_bit(SOCK_PASSSEC, &sock->flags);
 866                else
 867                        clear_bit(SOCK_PASSSEC, &sock->flags);
 868                break;
 869        case SO_MARK:
 870                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 871                        ret = -EPERM;
 872                else
 873                        sk->sk_mark = val;
 874                break;
 875
  876                /* We implement SO_SNDLOWAT etc. to
  877                 * not be settable (1003.1g 5.3) */
 878        case SO_RXQ_OVFL:
 879                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 880                break;
 881
 882        case SO_WIFI_STATUS:
 883                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 884                break;
 885
 886        case SO_PEEK_OFF:
 887                if (sock->ops->set_peek_off)
 888                        sock->ops->set_peek_off(sk, val);
 889                else
 890                        ret = -EOPNOTSUPP;
 891                break;
 892
 893        case SO_NOFCS:
 894                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 895                break;
 896
 897        case SO_SELECT_ERR_QUEUE:
 898                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 899                break;
 900
 901        default:
 902                ret = -ENOPROTOOPT;
 903                break;
 904        }
 905        release_sock(sk);
 906        return ret;
 907}
 908EXPORT_SYMBOL(sock_setsockopt);
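/* Illustrative user-space example of the SO_RCVBUF/SO_SNDBUF behaviour
 * described above (sketch; 64 KiB is an arbitrary request): the kernel
 * doubles the requested value to cover struct sk_buff overhead, clamps it
 * against net.core.rmem_max, and getsockopt() reports the doubled figure:
 *
 *	int req = 64 * 1024, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *	- eff is now 131072 (2 * 64 KiB), assuming 64 KiB <= rmem_max
 */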
 909
 910
 911void cred_to_ucred(struct pid *pid, const struct cred *cred,
 912                   struct ucred *ucred)
 913{
 914        ucred->pid = pid_vnr(pid);
 915        ucred->uid = ucred->gid = -1;
 916        if (cred) {
 917                struct user_namespace *current_ns = current_user_ns();
 918
 919                ucred->uid = from_kuid_munged(current_ns, cred->euid);
 920                ucred->gid = from_kgid_munged(current_ns, cred->egid);
 921        }
 922}
 923EXPORT_SYMBOL_GPL(cred_to_ucred);
 924
 925int sock_getsockopt(struct socket *sock, int level, int optname,
 926                    char __user *optval, int __user *optlen)
 927{
 928        struct sock *sk = sock->sk;
 929
 930        union {
 931                int val;
 932                struct linger ling;
 933                struct timeval tm;
 934        } v;
 935
 936        int lv = sizeof(int);
 937        int len;
 938
 939        if (get_user(len, optlen))
 940                return -EFAULT;
 941        if (len < 0)
 942                return -EINVAL;
 943
 944        memset(&v, 0, sizeof(v));
 945
 946        switch (optname) {
 947        case SO_DEBUG:
 948                v.val = sock_flag(sk, SOCK_DBG);
 949                break;
 950
 951        case SO_DONTROUTE:
 952                v.val = sock_flag(sk, SOCK_LOCALROUTE);
 953                break;
 954
 955        case SO_BROADCAST:
 956                v.val = sock_flag(sk, SOCK_BROADCAST);
 957                break;
 958
 959        case SO_SNDBUF:
 960                v.val = sk->sk_sndbuf;
 961                break;
 962
 963        case SO_RCVBUF:
 964                v.val = sk->sk_rcvbuf;
 965                break;
 966
 967        case SO_REUSEADDR:
 968                v.val = sk->sk_reuse;
 969                break;
 970
 971        case SO_REUSEPORT:
 972                v.val = sk->sk_reuseport;
 973                break;
 974
 975        case SO_KEEPALIVE:
 976                v.val = sock_flag(sk, SOCK_KEEPOPEN);
 977                break;
 978
 979        case SO_TYPE:
 980                v.val = sk->sk_type;
 981                break;
 982
 983        case SO_PROTOCOL:
 984                v.val = sk->sk_protocol;
 985                break;
 986
 987        case SO_DOMAIN:
 988                v.val = sk->sk_family;
 989                break;
 990
 991        case SO_ERROR:
 992                v.val = -sock_error(sk);
 993                if (v.val == 0)
 994                        v.val = xchg(&sk->sk_err_soft, 0);
 995                break;
 996
 997        case SO_OOBINLINE:
 998                v.val = sock_flag(sk, SOCK_URGINLINE);
 999                break;
1000
1001        case SO_NO_CHECK:
1002                v.val = sk->sk_no_check;
1003                break;
1004
1005        case SO_PRIORITY:
1006                v.val = sk->sk_priority;
1007                break;
1008
1009        case SO_LINGER:
1010                lv              = sizeof(v.ling);
1011                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1012                v.ling.l_linger = sk->sk_lingertime / HZ;
1013                break;
1014
1015        case SO_BSDCOMPAT:
1016                sock_warn_obsolete_bsdism("getsockopt");
1017                break;
1018
1019        case SO_TIMESTAMP:
1020                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1021                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1022                break;
1023
1024        case SO_TIMESTAMPNS:
1025                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1026                break;
1027
1028        case SO_TIMESTAMPING:
1029                v.val = 0;
1030                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1031                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1032                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1033                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1034                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1035                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1036                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1037                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1038                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1039                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
1040                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1041                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1042                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1043                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1044                break;
1045
1046        case SO_RCVTIMEO:
1047                lv = sizeof(struct timeval);
1048                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1049                        v.tm.tv_sec = 0;
1050                        v.tm.tv_usec = 0;
1051                } else {
1052                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1053                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1054                }
1055                break;
1056
1057        case SO_SNDTIMEO:
1058                lv = sizeof(struct timeval);
1059                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1060                        v.tm.tv_sec = 0;
1061                        v.tm.tv_usec = 0;
1062                } else {
1063                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1064                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1065                }
1066                break;
1067
1068        case SO_RCVLOWAT:
1069                v.val = sk->sk_rcvlowat;
1070                break;
1071
1072        case SO_SNDLOWAT:
1073                v.val = 1;
1074                break;
1075
1076        case SO_PASSCRED:
1077                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1078                break;
1079
1080        case SO_PEERCRED:
1081        {
1082                struct ucred peercred;
1083                if (len > sizeof(peercred))
1084                        len = sizeof(peercred);
1085                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1086                if (copy_to_user(optval, &peercred, len))
1087                        return -EFAULT;
1088                goto lenout;
1089        }
1090
1091        case SO_PEERNAME:
1092        {
1093                char address[128];
1094
1095                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1096                        return -ENOTCONN;
1097                if (lv < len)
1098                        return -EINVAL;
1099                if (copy_to_user(optval, address, len))
1100                        return -EFAULT;
1101                goto lenout;
1102        }
1103
1104        /* Dubious BSD thing... Probably nobody even uses it, but
1105         * the UNIX standard wants it for whatever reason... -DaveM
1106         */
1107        case SO_ACCEPTCONN:
1108                v.val = sk->sk_state == TCP_LISTEN;
1109                break;
1110
1111        case SO_PASSSEC:
1112                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1113                break;
1114
1115        case SO_PEERSEC:
1116                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1117
1118        case SO_MARK:
1119                v.val = sk->sk_mark;
1120                break;
1121
1122        case SO_RXQ_OVFL:
1123                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1124                break;
1125
1126        case SO_WIFI_STATUS:
1127                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1128                break;
1129
1130        case SO_PEEK_OFF:
1131                if (!sock->ops->set_peek_off)
1132                        return -EOPNOTSUPP;
1133
1134                v.val = sk->sk_peek_off;
1135                break;
1136        case SO_NOFCS:
1137                v.val = sock_flag(sk, SOCK_NOFCS);
1138                break;
1139
1140        case SO_BINDTODEVICE:
1141                return sock_getbindtodevice(sk, optval, optlen, len);
1142
1143        case SO_GET_FILTER:
1144                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1145                if (len < 0)
1146                        return len;
1147
1148                goto lenout;
1149
1150        case SO_LOCK_FILTER:
1151                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1152                break;
1153
1154        case SO_SELECT_ERR_QUEUE:
1155                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1156                break;
1157
1158        default:
1159                return -ENOPROTOOPT;
1160        }
1161
1162        if (len > lv)
1163                len = lv;
1164        if (copy_to_user(optval, &v, len))
1165                return -EFAULT;
1166lenout:
1167        if (put_user(len, optlen))
1168                return -EFAULT;
1169        return 0;
1170}
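/* Illustrative user-space example for SO_PEERCRED (sketch): on a connected
 * AF_UNIX or socketpair socket the peer's credentials can be read back as a
 * struct ucred (glibc exposes the type when _GNU_SOURCE is defined):
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len) == 0)
 *		printf("peer pid=%d uid=%u gid=%u\n", cr.pid, cr.uid, cr.gid);
 */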
1171
1172/*
1173 * Initialize an sk_lock.
1174 *
1175 * (We also register the sk_lock with the lock validator.)
1176 */
1177static inline void sock_lock_init(struct sock *sk)
1178{
1179        sock_lock_init_class_and_name(sk,
1180                        af_family_slock_key_strings[sk->sk_family],
1181                        af_family_slock_keys + sk->sk_family,
1182                        af_family_key_strings[sk->sk_family],
1183                        af_family_keys + sk->sk_family);
1184}
1185
1186/*
1187 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1188 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 1189 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1190 */
1191static void sock_copy(struct sock *nsk, const struct sock *osk)
1192{
1193#ifdef CONFIG_SECURITY_NETWORK
1194        void *sptr = nsk->sk_security;
1195#endif
1196        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1197
1198        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1199               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1200
1201#ifdef CONFIG_SECURITY_NETWORK
1202        nsk->sk_security = sptr;
1203        security_sk_clone(osk, nsk);
1204#endif
1205}
1206
1207void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1208{
1209        unsigned long nulls1, nulls2;
1210
1211        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1212        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1213        if (nulls1 > nulls2)
1214                swap(nulls1, nulls2);
1215
1216        if (nulls1 != 0)
1217                memset((char *)sk, 0, nulls1);
1218        memset((char *)sk + nulls1 + sizeof(void *), 0,
1219               nulls2 - nulls1 - sizeof(void *));
1220        memset((char *)sk + nulls2 + sizeof(void *), 0,
1221               size - nulls2 - sizeof(void *));
1222}
1223EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1224
1225static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1226                int family)
1227{
1228        struct sock *sk;
1229        struct kmem_cache *slab;
1230
1231        slab = prot->slab;
1232        if (slab != NULL) {
1233                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1234                if (!sk)
1235                        return sk;
1236                if (priority & __GFP_ZERO) {
1237                        if (prot->clear_sk)
1238                                prot->clear_sk(sk, prot->obj_size);
1239                        else
1240                                sk_prot_clear_nulls(sk, prot->obj_size);
1241                }
1242        } else
1243                sk = kmalloc(prot->obj_size, priority);
1244
1245        if (sk != NULL) {
1246                kmemcheck_annotate_bitfield(sk, flags);
1247
1248                if (security_sk_alloc(sk, family, priority))
1249                        goto out_free;
1250
1251                if (!try_module_get(prot->owner))
1252                        goto out_free_sec;
1253                sk_tx_queue_clear(sk);
1254        }
1255
1256        return sk;
1257
1258out_free_sec:
1259        security_sk_free(sk);
1260out_free:
1261        if (slab != NULL)
1262                kmem_cache_free(slab, sk);
1263        else
1264                kfree(sk);
1265        return NULL;
1266}
1267
1268static void sk_prot_free(struct proto *prot, struct sock *sk)
1269{
1270        struct kmem_cache *slab;
1271        struct module *owner;
1272
1273        owner = prot->owner;
1274        slab = prot->slab;
1275
1276        security_sk_free(sk);
1277        if (slab != NULL)
1278                kmem_cache_free(slab, sk);
1279        else
1280                kfree(sk);
1281        module_put(owner);
1282}
1283
1284#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1285void sock_update_classid(struct sock *sk)
1286{
1287        u32 classid;
1288
1289        classid = task_cls_classid(current);
1290        if (classid != sk->sk_classid)
1291                sk->sk_classid = classid;
1292}
1293EXPORT_SYMBOL(sock_update_classid);
1294#endif
1295
1296#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1297void sock_update_netprioidx(struct sock *sk)
1298{
1299        if (in_interrupt())
1300                return;
1301
1302        sk->sk_cgrp_prioidx = task_netprioidx(current);
1303}
1304EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1305#endif
1306
1307/**
1308 *      sk_alloc - All socket objects are allocated here
1309 *      @net: the applicable net namespace
1310 *      @family: protocol family
1311 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1312 *      @prot: struct proto associated with this new sock instance
1313 */
1314struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1315                      struct proto *prot)
1316{
1317        struct sock *sk;
1318
1319        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1320        if (sk) {
1321                sk->sk_family = family;
1322                /*
1323                 * See comment in struct sock definition to understand
1324                 * why we need sk_prot_creator -acme
1325                 */
1326                sk->sk_prot = sk->sk_prot_creator = prot;
1327                sock_lock_init(sk);
1328                sock_net_set(sk, get_net(net));
1329                atomic_set(&sk->sk_wmem_alloc, 1);
1330
1331                sock_update_classid(sk);
1332                sock_update_netprioidx(sk);
1333        }
1334
1335        return sk;
1336}
1337EXPORT_SYMBOL(sk_alloc);
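/* Minimal usage sketch (hypothetical protocol, illustration only): a
 * protocol's ->create() handler typically allocates the sock with its own
 * struct proto and then initialises the generic fields:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	- protocol-specific initialisation follows
 */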
1338
1339static void __sk_free(struct sock *sk)
1340{
1341        struct sk_filter *filter;
1342
1343        if (sk->sk_destruct)
1344                sk->sk_destruct(sk);
1345
1346        filter = rcu_dereference_check(sk->sk_filter,
1347                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1348        if (filter) {
1349                sk_filter_uncharge(sk, filter);
1350                RCU_INIT_POINTER(sk->sk_filter, NULL);
1351        }
1352
1353        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1354
1355        if (atomic_read(&sk->sk_omem_alloc))
1356                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1357                         __func__, atomic_read(&sk->sk_omem_alloc));
1358
1359        if (sk->sk_peer_cred)
1360                put_cred(sk->sk_peer_cred);
1361        put_pid(sk->sk_peer_pid);
1362        put_net(sock_net(sk));
1363        sk_prot_free(sk->sk_prot_creator, sk);
1364}
1365
1366void sk_free(struct sock *sk)
1367{
1368        /*
 1369         * We subtract one from sk_wmem_alloc so we can tell whether
 1370         * some packets are still in some tx queue.
 1371         * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1372         */
1373        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1374                __sk_free(sk);
1375}
1376EXPORT_SYMBOL(sk_free);
1377
1378/*
 1379 * The last sock_put should drop the reference to sk->sk_net. It has already
 1380 * been dropped in sk_change_net. Taking a reference to the stopping namespace
 1381 * is not an option.
 1382 * Instead, take a reference to the socket so it is removed from the hashes while
 1383 * still _alive_, and after that destroy it in the context of init_net.
1384 */
1385void sk_release_kernel(struct sock *sk)
1386{
1387        if (sk == NULL || sk->sk_socket == NULL)
1388                return;
1389
1390        sock_hold(sk);
1391        sock_release(sk->sk_socket);
1392        release_net(sock_net(sk));
1393        sock_net_set(sk, get_net(&init_net));
1394        sock_put(sk);
1395}
1396EXPORT_SYMBOL(sk_release_kernel);
1397
1398static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1399{
1400        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1401                sock_update_memcg(newsk);
1402}
1403
1404/**
1405 *      sk_clone_lock - clone a socket, and lock its clone
1406 *      @sk: the socket to clone
1407 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1408 *
1409 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1410 */
1411struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1412{
1413        struct sock *newsk;
1414
1415        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1416        if (newsk != NULL) {
1417                struct sk_filter *filter;
1418
1419                sock_copy(newsk, sk);
1420
1421                /* SANITY */
1422                get_net(sock_net(newsk));
1423                sk_node_init(&newsk->sk_node);
1424                sock_lock_init(newsk);
1425                bh_lock_sock(newsk);
1426                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1427                newsk->sk_backlog.len = 0;
1428
1429                atomic_set(&newsk->sk_rmem_alloc, 0);
1430                /*
1431                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1432                 */
1433                atomic_set(&newsk->sk_wmem_alloc, 1);
1434                atomic_set(&newsk->sk_omem_alloc, 0);
1435                skb_queue_head_init(&newsk->sk_receive_queue);
1436                skb_queue_head_init(&newsk->sk_write_queue);
1437#ifdef CONFIG_NET_DMA
1438                skb_queue_head_init(&newsk->sk_async_wait_queue);
1439#endif
1440
1441                spin_lock_init(&newsk->sk_dst_lock);
1442                rwlock_init(&newsk->sk_callback_lock);
1443                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1444                                af_callback_keys + newsk->sk_family,
1445                                af_family_clock_key_strings[newsk->sk_family]);
1446
1447                newsk->sk_dst_cache     = NULL;
1448                newsk->sk_wmem_queued   = 0;
1449                newsk->sk_forward_alloc = 0;
1450                newsk->sk_send_head     = NULL;
1451                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1452
1453                sock_reset_flag(newsk, SOCK_DONE);
1454                skb_queue_head_init(&newsk->sk_error_queue);
1455
1456                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1457                if (filter != NULL)
1458                        sk_filter_charge(newsk, filter);
1459
1460                if (unlikely(xfrm_sk_clone_policy(newsk))) {
 1461                        /* It is still a raw copy of the parent, so invalidate
 1462                         * the destructor and do a plain sk_free() */
1463                        newsk->sk_destruct = NULL;
1464                        bh_unlock_sock(newsk);
1465                        sk_free(newsk);
1466                        newsk = NULL;
1467                        goto out;
1468                }
1469
1470                newsk->sk_err      = 0;
1471                newsk->sk_priority = 0;
1472                /*
1473                 * Before updating sk_refcnt, we must commit prior changes to memory
1474                 * (Documentation/RCU/rculist_nulls.txt for details)
1475                 */
1476                smp_wmb();
1477                atomic_set(&newsk->sk_refcnt, 2);
1478
1479                /*
1480                 * Increment the counter in the same struct proto as the master
1481                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1482                 * is the same as sk->sk_prot->socks, as this field was copied
1483                 * with memcpy).
1484                 *
1485                 * This _changes_ the previous behaviour, where
1486                 * tcp_create_openreq_child always was incrementing the
1487                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1488                 * to be taken into account in all callers. -acme
1489                 */
1490                sk_refcnt_debug_inc(newsk);
1491                sk_set_socket(newsk, NULL);
1492                newsk->sk_wq = NULL;
1493
1494                sk_update_clone(sk, newsk);
1495
1496                if (newsk->sk_prot->sockets_allocated)
1497                        sk_sockets_allocated_inc(newsk);
1498
1499                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1500                        net_enable_timestamp();
1501        }
1502out:
1503        return newsk;
1504}
1505EXPORT_SYMBOL_GPL(sk_clone_lock);
1506
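/*
 * Install @dst as the socket's cached route and derive sk_route_caps from
 * the output device's features: any device GSO support also enables
 * software GSO, per-socket nocaps are masked out, and when the route
 * needs extra header space (typically an IPsec transform) GSO is disabled
 * altogether; otherwise SG/HW_CSUM are allowed and the device GSO limits
 * are copied into the socket.
 */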
1507void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1508{
1509        __sk_dst_set(sk, dst);
1510        sk->sk_route_caps = dst->dev->features;
1511        if (sk->sk_route_caps & NETIF_F_GSO)
1512                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1513        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1514        if (sk_can_gso(sk)) {
1515                if (dst->header_len) {
1516                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1517                } else {
1518                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1519                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1520                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1521                }
1522        }
1523}
1524EXPORT_SYMBOL_GPL(sk_setup_caps);
1525
1526/*
1527 *      Simple resource managers for sockets.
1528 */
1529
1530
1531/*
1532 * Write buffer destructor automatically called from kfree_skb.
1533 */
1534void sock_wfree(struct sk_buff *skb)
1535{
1536        struct sock *sk = skb->sk;
1537        unsigned int len = skb->truesize;
1538
1539        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1540                /*
1541                 * Keep a reference on sk_wmem_alloc; it will be released
1542                 * after the sk_write_space() call
1543                 */
1544                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1545                sk->sk_write_space(sk);
1546                len = 1;
1547        }
1548        /*
1549         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1550         * could not do because of in-flight packets
1551         */
1552        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1553                __sk_free(sk);
1554}
1555EXPORT_SYMBOL(sock_wfree);
1556
1557/*
1558 * Read buffer destructor automatically called from kfree_skb.
1559 */
1560void sock_rfree(struct sk_buff *skb)
1561{
1562        struct sock *sk = skb->sk;
1563        unsigned int len = skb->truesize;
1564
1565        atomic_sub(len, &sk->sk_rmem_alloc);
1566        sk_mem_uncharge(sk, len);
1567}
1568EXPORT_SYMBOL(sock_rfree);
1569
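/*
 * skb destructor used when early demux has attached a socket to the skb:
 * drop the reference taken on that socket, going through inet_twsk_put()
 * for TIME_WAIT mini-sockets and sock_put() for everything else.
 */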
1570void sock_edemux(struct sk_buff *skb)
1571{
1572        struct sock *sk = skb->sk;
1573
1574#ifdef CONFIG_INET
1575        if (sk->sk_state == TCP_TIME_WAIT)
1576                inet_twsk_put(inet_twsk(sk));
1577        else
1578#endif
1579                sock_put(sk);
1580}
1581EXPORT_SYMBOL(sock_edemux);
1582
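/*
 * sock_i_uid()/sock_i_ino() report the uid/inode number of the inode
 * backing @sk's struct socket.  sk_callback_lock is taken so sk_socket
 * cannot be detached underneath us; kernel sockets without a struct
 * socket report GLOBAL_ROOT_UID and inode 0 respectively.
 */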
1583kuid_t sock_i_uid(struct sock *sk)
1584{
1585        kuid_t uid;
1586
1587        read_lock_bh(&sk->sk_callback_lock);
1588        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1589        read_unlock_bh(&sk->sk_callback_lock);
1590        return uid;
1591}
1592EXPORT_SYMBOL(sock_i_uid);
1593
1594unsigned long sock_i_ino(struct sock *sk)
1595{
1596        unsigned long ino;
1597
1598        read_lock_bh(&sk->sk_callback_lock);
1599        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1600        read_unlock_bh(&sk->sk_callback_lock);
1601        return ino;
1602}
1603EXPORT_SYMBOL(sock_i_ino);
1604
1605/*
1606 * Allocate a skb from the socket's send buffer.
1607 */
1608struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1609                             gfp_t priority)
1610{
1611        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1612                struct sk_buff *skb = alloc_skb(size, priority);
1613                if (skb) {
1614                        skb_set_owner_w(skb, sk);
1615                        return skb;
1616                }
1617        }
1618        return NULL;
1619}
1620EXPORT_SYMBOL(sock_wmalloc);
1621
1622/*
1623 * Allocate a skb from the socket's receive buffer.
1624 */
1625struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1626                             gfp_t priority)
1627{
1628        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1629                struct sk_buff *skb = alloc_skb(size, priority);
1630                if (skb) {
1631                        skb_set_owner_r(skb, sk);
1632                        return skb;
1633                }
1634        }
1635        return NULL;
1636}
1637
1638/*
1639 * Allocate a memory block from the socket's option memory buffer.
1640 */
1641void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1642{
1643        if ((unsigned int)size <= sysctl_optmem_max &&
1644            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1645                void *mem;
1646                /* Do the add first, to avoid a race in case kmalloc
1647                 * sleeps.
1648                 */
1649                atomic_add(size, &sk->sk_omem_alloc);
1650                mem = kmalloc(size, priority);
1651                if (mem)
1652                        return mem;
1653                atomic_sub(size, &sk->sk_omem_alloc);
1654        }
1655        return NULL;
1656}
1657EXPORT_SYMBOL(sock_kmalloc);
1658
1659/*
1660 * Free an option memory block.
1661 */
1662void sock_kfree_s(struct sock *sk, void *mem, int size)
1663{
1664        kfree(mem);
1665        atomic_sub(size, &sk->sk_omem_alloc);
1666}
1667EXPORT_SYMBOL(sock_kfree_s);
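
/*
 * Note that sock_kmalloc() and sock_kfree_s() must be called with the
 * same size so the sk_omem_alloc accounting above balances, roughly:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */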
1668
1669/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1670   I think these locks should be removed for datagram sockets.
1671 */
1672static long sock_wait_for_wmem(struct sock *sk, long timeo)
1673{
1674        DEFINE_WAIT(wait);
1675
1676        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1677        for (;;) {
1678                if (!timeo)
1679                        break;
1680                if (signal_pending(current))
1681                        break;
1682                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1683                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1684                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1685                        break;
1686                if (sk->sk_shutdown & SEND_SHUTDOWN)
1687                        break;
1688                if (sk->sk_err)
1689                        break;
1690                timeo = schedule_timeout(timeo);
1691        }
1692        finish_wait(sk_sleep(sk), &wait);
1693        return timeo;
1694}
1695
1696
1697/*
1698 *      Generic send/receive buffer handlers
1699 */
1700
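/*
 * Allocate an skb with @header_len bytes of linear space and @data_len
 * bytes spread over page fragments, charged to @sk's send buffer.  Unless
 * @noblock is set, waits for sndbuf space according to the socket's send
 * timeout; on failure NULL is returned and *errcode holds the reason.
 */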
1701struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1702                                     unsigned long data_len, int noblock,
1703                                     int *errcode)
1704{
1705        struct sk_buff *skb;
1706        gfp_t gfp_mask;
1707        long timeo;
1708        int err;
1709        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1710
1711        err = -EMSGSIZE;
1712        if (npages > MAX_SKB_FRAGS)
1713                goto failure;
1714
1715        gfp_mask = sk->sk_allocation;
1716        if (gfp_mask & __GFP_WAIT)
1717                gfp_mask |= __GFP_REPEAT;
1718
1719        timeo = sock_sndtimeo(sk, noblock);
1720        while (1) {
1721                err = sock_error(sk);
1722                if (err != 0)
1723                        goto failure;
1724
1725                err = -EPIPE;
1726                if (sk->sk_shutdown & SEND_SHUTDOWN)
1727                        goto failure;
1728
1729                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1730                        skb = alloc_skb(header_len, gfp_mask);
1731                        if (skb) {
1732                                int i;
1733
1734                                /* No pages, we're done... */
1735                                if (!data_len)
1736                                        break;
1737
1738                                skb->truesize += data_len;
1739                                skb_shinfo(skb)->nr_frags = npages;
1740                                for (i = 0; i < npages; i++) {
1741                                        struct page *page;
1742
1743                                        page = alloc_pages(sk->sk_allocation, 0);
1744                                        if (!page) {
1745                                                err = -ENOBUFS;
1746                                                skb_shinfo(skb)->nr_frags = i;
1747                                                kfree_skb(skb);
1748                                                goto failure;
1749                                        }
1750
1751                                        __skb_fill_page_desc(skb, i,
1752                                                        page, 0,
1753                                                        (data_len >= PAGE_SIZE ?
1754                                                         PAGE_SIZE :
1755                                                         data_len));
1756                                        data_len -= PAGE_SIZE;
1757                                }
1758
1759                                /* Full success... */
1760                                break;
1761                        }
1762                        err = -ENOBUFS;
1763                        goto failure;
1764                }
1765                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1766                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1767                err = -EAGAIN;
1768                if (!timeo)
1769                        goto failure;
1770                if (signal_pending(current))
1771                        goto interrupted;
1772                timeo = sock_wait_for_wmem(sk, timeo);
1773        }
1774
1775        skb_set_owner_w(skb, sk);
1776        return skb;
1777
1778interrupted:
1779        err = sock_intr_errno(timeo);
1780failure:
1781        *errcode = err;
1782        return NULL;
1783}
1784EXPORT_SYMBOL(sock_alloc_send_pskb);
1785
1786struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1787                                    int noblock, int *errcode)
1788{
1789        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1790}
1791EXPORT_SYMBOL(sock_alloc_send_skb);
1792
1793/* On 32bit arches, an skb frag is limited to 2^15 */
1794#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1795
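/*
 * Make sure @pfrag points at a page with room for another fragment.  The
 * current page is reused when we are its only user or it still has space;
 * otherwise it is dropped and a new page is allocated, high order only
 * for allocations that may sleep and falling back to order-0.  On failure
 * the socket enters memory pressure and its sndbuf is moderated.
 */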
1796bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1797{
1798        int order;
1799
1800        if (pfrag->page) {
1801                if (atomic_read(&pfrag->page->_count) == 1) {
1802                        pfrag->offset = 0;
1803                        return true;
1804                }
1805                if (pfrag->offset < pfrag->size)
1806                        return true;
1807                put_page(pfrag->page);
1808        }
1809
1810        /* We restrict high order allocations to users that can afford to wait */
1811        order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1812
1813        do {
1814                gfp_t gfp = sk->sk_allocation;
1815
1816                if (order)
1817                        gfp |= __GFP_COMP | __GFP_NOWARN;
1818                pfrag->page = alloc_pages(gfp, order);
1819                if (likely(pfrag->page)) {
1820                        pfrag->offset = 0;
1821                        pfrag->size = PAGE_SIZE << order;
1822                        return true;
1823                }
1824        } while (--order >= 0);
1825
1826        sk_enter_memory_pressure(sk);
1827        sk_stream_moderate_sndbuf(sk);
1828        return false;
1829}
1830EXPORT_SYMBOL(sk_page_frag_refill);
1831
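/*
 * Slow path of lock_sock(): sleep uninterruptibly until the current owner
 * releases the socket, dropping sk_lock.slock around each schedule() and
 * re-checking ownership with the spinlock held.
 */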
1832static void __lock_sock(struct sock *sk)
1833        __releases(&sk->sk_lock.slock)
1834        __acquires(&sk->sk_lock.slock)
1835{
1836        DEFINE_WAIT(wait);
1837
1838        for (;;) {
1839                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1840                                        TASK_UNINTERRUPTIBLE);
1841                spin_unlock_bh(&sk->sk_lock.slock);
1842                schedule();
1843                spin_lock_bh(&sk->sk_lock.slock);
1844                if (!sock_owned_by_user(sk))
1845                        break;
1846        }
1847        finish_wait(&sk->sk_lock.wq, &wait);
1848}
1849
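/*
 * Flush the backlog that built up while the socket was owned: detach the
 * queue, drop the socket spinlock, feed each skb to sk_backlog_rcv(), and
 * repeat until no new backlog has appeared.
 */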
1850static void __release_sock(struct sock *sk)
1851        __releases(&sk->sk_lock.slock)
1852        __acquires(&sk->sk_lock.slock)
1853{
1854        struct sk_buff *skb = sk->sk_backlog.head;
1855
1856        do {
1857                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1858                bh_unlock_sock(sk);
1859
1860                do {
1861                        struct sk_buff *next = skb->next;
1862
1863                        prefetch(next);
1864                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1865                        skb->next = NULL;
1866                        sk_backlog_rcv(sk, skb);
1867
1868                        /*
1869                         * We are in process context here with softirqs
1870                         * disabled, use cond_resched_softirq() to preempt.
1871                         * This is safe to do because we've taken the backlog
1872                         * queue private:
1873                         */
1874                        cond_resched_softirq();
1875
1876                        skb = next;
1877                } while (skb != NULL);
1878
1879                bh_lock_sock(sk);
1880        } while ((skb = sk->sk_backlog.head) != NULL);
1881
1882        /*
1883         * Doing the zeroing here guarantees we cannot loop forever
1884         * while a wild producer attempts to flood us.
1885         */
1886        sk->sk_backlog.len = 0;
1887}
1888
1889/**
1890 * sk_wait_data - wait for data to arrive at sk_receive_queue
1891 * @sk:    sock to wait on
1892 * @timeo: for how long
1893 *
1894 * Socket state, including sk->sk_err, is changed only under the socket
1895 * lock, hence we may omit checks after joining the wait queue.
1896 * We check the receive queue before schedule() only as an optimization;
1897 * it is very likely that release_sock() added new data.
1898 */
1899int sk_wait_data(struct sock *sk, long *timeo)
1900{
1901        int rc;
1902        DEFINE_WAIT(wait);
1903
1904        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1905        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1906        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1907        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1908        finish_wait(sk_sleep(sk), &wait);
1909        return rc;
1910}
1911EXPORT_SYMBOL(sk_wait_data);
1912
1913/**
1914 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1915 *      @sk: socket
1916 *      @size: memory size to allocate
1917 *      @kind: allocation type
1918 *
1919 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1920 *      rmem allocation. This function assumes that protocols which have
1921 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1922 */
1923int __sk_mem_schedule(struct sock *sk, int size, int kind)
1924{
1925        struct proto *prot = sk->sk_prot;
1926        int amt = sk_mem_pages(size);
1927        long allocated;
1928        int parent_status = UNDER_LIMIT;
1929
1930        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1931
1932        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1933
1934        /* Under limit. */
1935        if (parent_status == UNDER_LIMIT &&
1936                        allocated <= sk_prot_mem_limits(sk, 0)) {
1937                sk_leave_memory_pressure(sk);
1938                return 1;
1939        }
1940
1941        /* Under pressure. (we or our parents) */
1942        if ((parent_status > SOFT_LIMIT) ||
1943                        allocated > sk_prot_mem_limits(sk, 1))
1944                sk_enter_memory_pressure(sk);
1945
1946        /* Over hard limit (we or our parents) */
1947        if ((parent_status == OVER_LIMIT) ||
1948                        (allocated > sk_prot_mem_limits(sk, 2)))
1949                goto suppress_allocation;
1950
1951        /* guarantee minimum buffer size under pressure */
1952        if (kind == SK_MEM_RECV) {
1953                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1954                        return 1;
1955
1956        } else { /* SK_MEM_SEND */
1957                if (sk->sk_type == SOCK_STREAM) {
1958                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1959                                return 1;
1960                } else if (atomic_read(&sk->sk_wmem_alloc) <
1961                           prot->sysctl_wmem[0])
1962                                return 1;
1963        }
1964
1965        if (sk_has_memory_pressure(sk)) {
1966                int alloc;
1967
1968                if (!sk_under_memory_pressure(sk))
1969                        return 1;
1970                alloc = sk_sockets_allocated_read_positive(sk);
1971                if (sk_prot_mem_limits(sk, 2) > alloc *
1972                    sk_mem_pages(sk->sk_wmem_queued +
1973                                 atomic_read(&sk->sk_rmem_alloc) +
1974                                 sk->sk_forward_alloc))
1975                        return 1;
1976        }
1977
1978suppress_allocation:
1979
1980        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1981                sk_stream_moderate_sndbuf(sk);
1982
1983                /* Fail only if the socket is _under_ its sndbuf.
1984                 * In this case we cannot block, so we have to fail.
1985                 */
1986                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1987                        return 1;
1988        }
1989
1990        trace_sock_exceed_buf_limit(sk, prot, allocated);
1991
1992        /* Alas. Undo changes. */
1993        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1994
1995        sk_memory_allocated_sub(sk, amt);
1996
1997        return 0;
1998}
1999EXPORT_SYMBOL(__sk_mem_schedule);
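
/*
 * Protocols normally reach this through the sk_wmem_schedule() and
 * sk_rmem_schedule() helpers, which only fall back to __sk_mem_schedule()
 * once sk_forward_alloc can no longer cover the request.
 */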
2000
2001/**
2002 *      __sk_mem_reclaim - reclaim memory_allocated
2003 *      @sk: socket
2004 */
2005void __sk_mem_reclaim(struct sock *sk)
2006{
2007        sk_memory_allocated_sub(sk,
2008                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2009        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2010
2011        if (sk_under_memory_pressure(sk) &&
2012            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2013                sk_leave_memory_pressure(sk);
2014}
2015EXPORT_SYMBOL(__sk_mem_reclaim);
2016
2017
2018/*
2019 * Set of default routines for initialising struct proto_ops when
2020 * the protocol does not support a particular function. In certain
2021 * cases where it makes no sense for a protocol to have a "do nothing"
2022 * function, some default processing is provided.
2023 */
2024
2025int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2026{
2027        return -EOPNOTSUPP;
2028}
2029EXPORT_SYMBOL(sock_no_bind);
2030
2031int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2032                    int len, int flags)
2033{
2034        return -EOPNOTSUPP;
2035}
2036EXPORT_SYMBOL(sock_no_connect);
2037
2038int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2039{
2040        return -EOPNOTSUPP;
2041}
2042EXPORT_SYMBOL(sock_no_socketpair);
2043
2044int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2045{
2046        return -EOPNOTSUPP;
2047}
2048EXPORT_SYMBOL(sock_no_accept);
2049
2050int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2051                    int *len, int peer)
2052{
2053        return -EOPNOTSUPP;
2054}
2055EXPORT_SYMBOL(sock_no_getname);
2056
2057unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2058{
2059        return 0;
2060}
2061EXPORT_SYMBOL(sock_no_poll);
2062
2063int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2064{
2065        return -EOPNOTSUPP;
2066}
2067EXPORT_SYMBOL(sock_no_ioctl);
2068
2069int sock_no_listen(struct socket *sock, int backlog)
2070{
2071        return -EOPNOTSUPP;
2072}
2073EXPORT_SYMBOL(sock_no_listen);
2074
2075int sock_no_shutdown(struct socket *sock, int how)
2076{
2077        return -EOPNOTSUPP;
2078}
2079EXPORT_SYMBOL(sock_no_shutdown);
2080
2081int sock_no_setsockopt(struct socket *sock, int level, int optname,
2082                    char __user *optval, unsigned int optlen)
2083{
2084        return -EOPNOTSUPP;
2085}
2086EXPORT_SYMBOL(sock_no_setsockopt);
2087
2088int sock_no_getsockopt(struct socket *sock, int level, int optname,
2089                    char __user *optval, int __user *optlen)
2090{
2091        return -EOPNOTSUPP;
2092}
2093EXPORT_SYMBOL(sock_no_getsockopt);
2094
2095int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2096                    size_t len)
2097{
2098        return -EOPNOTSUPP;
2099}
2100EXPORT_SYMBOL(sock_no_sendmsg);
2101
2102int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2103                    size_t len, int flags)
2104{
2105        return -EOPNOTSUPP;
2106}
2107EXPORT_SYMBOL(sock_no_recvmsg);
2108
2109int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2110{
2111        /* Mirror missing mmap method error code */
2112        return -ENODEV;
2113}
2114EXPORT_SYMBOL(sock_no_mmap);
2115
2116ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2117{
2118        ssize_t res;
2119        struct msghdr msg = {.msg_flags = flags};
2120        struct kvec iov;
2121        char *kaddr = kmap(page);
2122        iov.iov_base = kaddr + offset;
2123        iov.iov_len = size;
2124        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2125        kunmap(page);
2126        return res;
2127}
2128EXPORT_SYMBOL(sock_no_sendpage);
2129
2130/*
2131 *      Default Socket Callbacks
2132 */
2133
2134static void sock_def_wakeup(struct sock *sk)
2135{
2136        struct socket_wq *wq;
2137
2138        rcu_read_lock();
2139        wq = rcu_dereference(sk->sk_wq);
2140        if (wq_has_sleeper(wq))
2141                wake_up_interruptible_all(&wq->wait);
2142        rcu_read_unlock();
2143}
2144
2145static void sock_def_error_report(struct sock *sk)
2146{
2147        struct socket_wq *wq;
2148
2149        rcu_read_lock();
2150        wq = rcu_dereference(sk->sk_wq);
2151        if (wq_has_sleeper(wq))
2152                wake_up_interruptible_poll(&wq->wait, POLLERR);
2153        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2154        rcu_read_unlock();
2155}
2156
2157static void sock_def_readable(struct sock *sk, int len)
2158{
2159        struct socket_wq *wq;
2160
2161        rcu_read_lock();
2162        wq = rcu_dereference(sk->sk_wq);
2163        if (wq_has_sleeper(wq))
2164                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2165                                                POLLRDNORM | POLLRDBAND);
2166        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2167        rcu_read_unlock();
2168}
2169
2170static void sock_def_write_space(struct sock *sk)
2171{
2172        struct socket_wq *wq;
2173
2174        rcu_read_lock();
2175
2176        /* Do not wake up a writer until he can make "significant"
2177         * progress.  --DaveM
2178         */
2179        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2180                wq = rcu_dereference(sk->sk_wq);
2181                if (wq_has_sleeper(wq))
2182                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2183                                                POLLWRNORM | POLLWRBAND);
2184
2185                /* Should agree with poll, otherwise some programs break */
2186                if (sock_writeable(sk))
2187                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2188        }
2189
2190        rcu_read_unlock();
2191}
2192
2193static void sock_def_destruct(struct sock *sk)
2194{
2195        kfree(sk->sk_protinfo);
2196}
2197
2198void sk_send_sigurg(struct sock *sk)
2199{
2200        if (sk->sk_socket && sk->sk_socket->file)
2201                if (send_sigurg(&sk->sk_socket->file->f_owner))
2202                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2203}
2204EXPORT_SYMBOL(sk_send_sigurg);
2205
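/*
 * sk_reset_timer()/sk_stop_timer() keep a socket reference paired with a
 * pending timer: arming a previously inactive timer takes a reference,
 * cancelling a still-pending one drops it again.
 */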
2206void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2207                    unsigned long expires)
2208{
2209        if (!mod_timer(timer, expires))
2210                sock_hold(sk);
2211}
2212EXPORT_SYMBOL(sk_reset_timer);
2213
2214void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2215{
2216        if (del_timer(timer))
2217                __sock_put(sk);
2218}
2219EXPORT_SYMBOL(sk_stop_timer);
2220
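/*
 * Initialise a freshly allocated struct sock to generic defaults and, when
 * @sock is given, tie the two together.  Address families call this right
 * after sk_alloc().
 */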
2221void sock_init_data(struct socket *sock, struct sock *sk)
2222{
2223        skb_queue_head_init(&sk->sk_receive_queue);
2224        skb_queue_head_init(&sk->sk_write_queue);
2225        skb_queue_head_init(&sk->sk_error_queue);
2226#ifdef CONFIG_NET_DMA
2227        skb_queue_head_init(&sk->sk_async_wait_queue);
2228#endif
2229
2230        sk->sk_send_head        =       NULL;
2231
2232        init_timer(&sk->sk_timer);
2233
2234        sk->sk_allocation       =       GFP_KERNEL;
2235        sk->sk_rcvbuf           =       sysctl_rmem_default;
2236        sk->sk_sndbuf           =       sysctl_wmem_default;
2237        sk->sk_state            =       TCP_CLOSE;
2238        sk_set_socket(sk, sock);
2239
2240        sock_set_flag(sk, SOCK_ZAPPED);
2241
2242        if (sock) {
2243                sk->sk_type     =       sock->type;
2244                sk->sk_wq       =       sock->wq;
2245                sock->sk        =       sk;
2246        } else
2247                sk->sk_wq       =       NULL;
2248
2249        spin_lock_init(&sk->sk_dst_lock);
2250        rwlock_init(&sk->sk_callback_lock);
2251        lockdep_set_class_and_name(&sk->sk_callback_lock,
2252                        af_callback_keys + sk->sk_family,
2253                        af_family_clock_key_strings[sk->sk_family]);
2254
2255        sk->sk_state_change     =       sock_def_wakeup;
2256        sk->sk_data_ready       =       sock_def_readable;
2257        sk->sk_write_space      =       sock_def_write_space;
2258        sk->sk_error_report     =       sock_def_error_report;
2259        sk->sk_destruct         =       sock_def_destruct;
2260
2261        sk->sk_frag.page        =       NULL;
2262        sk->sk_frag.offset      =       0;
2263        sk->sk_peek_off         =       -1;
2264
2265        sk->sk_peer_pid         =       NULL;
2266        sk->sk_peer_cred        =       NULL;
2267        sk->sk_write_pending    =       0;
2268        sk->sk_rcvlowat         =       1;
2269        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2270        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2271
2272        sk->sk_stamp = ktime_set(-1L, 0);
2273
2274        /*
2275         * Before updating sk_refcnt, we must commit prior changes to memory
2276         * (Documentation/RCU/rculist_nulls.txt for details)
2277         */
2278        smp_wmb();
2279        atomic_set(&sk->sk_refcnt, 1);
2280        atomic_set(&sk->sk_drops, 0);
2281}
2282EXPORT_SYMBOL(sock_init_data);
2283
2284void lock_sock_nested(struct sock *sk, int subclass)
2285{
2286        might_sleep();
2287        spin_lock_bh(&sk->sk_lock.slock);
2288        if (sk->sk_lock.owned)
2289                __lock_sock(sk);
2290        sk->sk_lock.owned = 1;
2291        spin_unlock(&sk->sk_lock.slock);
2292        /*
2293         * The sk_lock has mutex_lock() semantics here:
2294         */
2295        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2296        local_bh_enable();
2297}
2298EXPORT_SYMBOL(lock_sock_nested);
2299
2300void release_sock(struct sock *sk)
2301{
2302        /*
2303         * The sk_lock has mutex_unlock() semantics:
2304         */
2305        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2306
2307        spin_lock_bh(&sk->sk_lock.slock);
2308        if (sk->sk_backlog.tail)
2309                __release_sock(sk);
2310
2311        if (sk->sk_prot->release_cb)
2312                sk->sk_prot->release_cb(sk);
2313
2314        sk->sk_lock.owned = 0;
2315        if (waitqueue_active(&sk->sk_lock.wq))
2316                wake_up(&sk->sk_lock.wq);
2317        spin_unlock_bh(&sk->sk_lock.slock);
2318}
2319EXPORT_SYMBOL(release_sock);
2320
2321/**
2322 * lock_sock_fast - fast version of lock_sock
2323 * @sk: socket
2324 *
2325 * This version should be used for very small sections, where the process won't block
2326 * return false if fast path is taken
2327 *   sk_lock.slock locked, owned = 0, BH disabled
2328 * return true if slow path is taken
2329 *   sk_lock.slock unlocked, owned = 1, BH enabled
2330 */
2331bool lock_sock_fast(struct sock *sk)
2332{
2333        might_sleep();
2334        spin_lock_bh(&sk->sk_lock.slock);
2335
2336        if (!sk->sk_lock.owned)
2337                /*
2338                 * Note : fast path returns with BH still disabled
2339                 */
2340                return false;
2341
2342        __lock_sock(sk);
2343        sk->sk_lock.owned = 1;
2344        spin_unlock(&sk->sk_lock.slock);
2345        /*
2346         * The sk_lock has mutex_lock() semantics here:
2347         */
2348        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2349        local_bh_enable();
2350        return true;
2351}
2352EXPORT_SYMBOL(lock_sock_fast);
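
/*
 * Callers pair this with unlock_sock_fast(), passing back the return value
 * so the matching unlock path is taken, e.g.:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */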
2353
2354int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2355{
2356        struct timeval tv;
2357        if (!sock_flag(sk, SOCK_TIMESTAMP))
2358                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2359        tv = ktime_to_timeval(sk->sk_stamp);
2360        if (tv.tv_sec == -1)
2361                return -ENOENT;
2362        if (tv.tv_sec == 0) {
2363                sk->sk_stamp = ktime_get_real();
2364                tv = ktime_to_timeval(sk->sk_stamp);
2365        }
2366        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2367}
2368EXPORT_SYMBOL(sock_get_timestamp);
2369
2370int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2371{
2372        struct timespec ts;
2373        if (!sock_flag(sk, SOCK_TIMESTAMP))
2374                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2375        ts = ktime_to_timespec(sk->sk_stamp);
2376        if (ts.tv_sec == -1)
2377                return -ENOENT;
2378        if (ts.tv_sec == 0) {
2379                sk->sk_stamp = ktime_get_real();
2380                ts = ktime_to_timespec(sk->sk_stamp);
2381        }
2382        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2383}
2384EXPORT_SYMBOL(sock_get_timestampns);
2385
2386void sock_enable_timestamp(struct sock *sk, int flag)
2387{
2388        if (!sock_flag(sk, flag)) {
2389                unsigned long previous_flags = sk->sk_flags;
2390
2391                sock_set_flag(sk, flag);
2392                /*
2393                 * we just set one of the two flags which require net
2394                 * time stamping, but time stamping might have been on
2395                 * already because of the other one
2396                 */
2397                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2398                        net_enable_timestamp();
2399        }
2400}
2401
2402/*
2403 *      Get a socket option on a socket.
2404 *
2405 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2406 *      asynchronous errors should be reported by getsockopt. We assume
2407 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2408 */
2409int sock_common_getsockopt(struct socket *sock, int level, int optname,
2410                           char __user *optval, int __user *optlen)
2411{
2412        struct sock *sk = sock->sk;
2413
2414        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2415}
2416EXPORT_SYMBOL(sock_common_getsockopt);
2417
2418#ifdef CONFIG_COMPAT
2419int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2420                                  char __user *optval, int __user *optlen)
2421{
2422        struct sock *sk = sock->sk;
2423
2424        if (sk->sk_prot->compat_getsockopt != NULL)
2425                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2426                                                      optval, optlen);
2427        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2428}
2429EXPORT_SYMBOL(compat_sock_common_getsockopt);
2430#endif
2431
2432int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2433                        struct msghdr *msg, size_t size, int flags)
2434{
2435        struct sock *sk = sock->sk;
2436        int addr_len = 0;
2437        int err;
2438
2439        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2440                                   flags & ~MSG_DONTWAIT, &addr_len);
2441        if (err >= 0)
2442                msg->msg_namelen = addr_len;
2443        return err;
2444}
2445EXPORT_SYMBOL(sock_common_recvmsg);
2446
2447/*
2448 *      Set socket options on an inet socket.
2449 */
2450int sock_common_setsockopt(struct socket *sock, int level, int optname,
2451                           char __user *optval, unsigned int optlen)
2452{
2453        struct sock *sk = sock->sk;
2454
2455        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2456}
2457EXPORT_SYMBOL(sock_common_setsockopt);
2458
2459#ifdef CONFIG_COMPAT
2460int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2461                                  char __user *optval, unsigned int optlen)
2462{
2463        struct sock *sk = sock->sk;
2464
2465        if (sk->sk_prot->compat_setsockopt != NULL)
2466                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2467                                                      optval, optlen);
2468        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2469}
2470EXPORT_SYMBOL(compat_sock_common_setsockopt);
2471#endif
2472
2473void sk_common_release(struct sock *sk)
2474{
2475        if (sk->sk_prot->destroy)
2476                sk->sk_prot->destroy(sk);
2477
2478        /*
2479         * Observation: when sk_common_release is called, processes have
2480         * no access to the socket, but the network stack still does.
2481         * Step one, detach it from networking:
2482         *
2483         * A. Remove from hash tables.
2484         */
2485
2486        sk->sk_prot->unhash(sk);
2487
2488        /*
2489         * At this point the socket cannot receive new packets, but it is possible
2490         * that some packets are still in flight because some CPU runs the receiver
2491         * and did the hash table lookup before we unhashed the socket. They will
2492         * reach the receive queue and be purged by the socket destructor.
2493         *
2494         * Also we still have packets pending on the receive queue, and probably
2495         * our own packets waiting in device queues. sock_destroy will drain the
2496         * receive queue, but transmitted packets will delay socket destruction
2497         * until the last reference is released.
2498         */
2499
2500        sock_orphan(sk);
2501
2502        xfrm_sk_free_policy(sk);
2503
2504        sk_refcnt_debug_release(sk);
2505
2506        if (sk->sk_frag.page) {
2507                put_page(sk->sk_frag.page);
2508                sk->sk_frag.page = NULL;
2509        }
2510
2511        sock_put(sk);
2512}
2513EXPORT_SYMBOL(sk_common_release);
2514
2515#ifdef CONFIG_PROC_FS
2516#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2517struct prot_inuse {
2518        int val[PROTO_INUSE_NR];
2519};
2520
2521static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2522
2523#ifdef CONFIG_NET_NS
2524void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2525{
2526        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2527}
2528EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2529
2530int sock_prot_inuse_get(struct net *net, struct proto *prot)
2531{
2532        int cpu, idx = prot->inuse_idx;
2533        int res = 0;
2534
2535        for_each_possible_cpu(cpu)
2536                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2537
2538        return res >= 0 ? res : 0;
2539}
2540EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2541
2542static int __net_init sock_inuse_init_net(struct net *net)
2543{
2544        net->core.inuse = alloc_percpu(struct prot_inuse);
2545        return net->core.inuse ? 0 : -ENOMEM;
2546}
2547
2548static void __net_exit sock_inuse_exit_net(struct net *net)
2549{
2550        free_percpu(net->core.inuse);
2551}
2552
2553static struct pernet_operations net_inuse_ops = {
2554        .init = sock_inuse_init_net,
2555        .exit = sock_inuse_exit_net,
2556};
2557
2558static __init int net_inuse_init(void)
2559{
2560        if (register_pernet_subsys(&net_inuse_ops))
2561                panic("Cannot initialize net inuse counters");
2562
2563        return 0;
2564}
2565
2566core_initcall(net_inuse_init);
2567#else
2568static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2569
2570void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2571{
2572        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2573}
2574EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2575
2576int sock_prot_inuse_get(struct net *net, struct proto *prot)
2577{
2578        int cpu, idx = prot->inuse_idx;
2579        int res = 0;
2580
2581        for_each_possible_cpu(cpu)
2582                res += per_cpu(prot_inuse, cpu).val[idx];
2583
2584        return res >= 0 ? res : 0;
2585}
2586EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2587#endif
2588
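/*
 * Hand out a slot in the per-protocol inuse counters.  The last slot
 * (PROTO_INUSE_NR - 1) is never handed out for real: a protocol that
 * would land there is reported as overflow and keeps that index without
 * setting the bitmap bit, so release_proto_idx() will not clear it either.
 */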
2589static void assign_proto_idx(struct proto *prot)
2590{
2591        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2592
2593        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2594                pr_err("PROTO_INUSE_NR exhausted\n");
2595                return;
2596        }
2597
2598        set_bit(prot->inuse_idx, proto_inuse_idx);
2599}
2600
2601static void release_proto_idx(struct proto *prot)
2602{
2603        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2604                clear_bit(prot->inuse_idx, proto_inuse_idx);
2605}
2606#else
2607static inline void assign_proto_idx(struct proto *prot)
2608{
2609}
2610
2611static inline void release_proto_idx(struct proto *prot)
2612{
2613}
2614#endif
2615
2616int proto_register(struct proto *prot, int alloc_slab)
2617{
2618        if (alloc_slab) {
2619                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2620                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2621                                        NULL);
2622
2623                if (prot->slab == NULL) {
2624                        pr_crit("%s: Can't create sock SLAB cache!\n",
2625                                prot->name);
2626                        goto out;
2627                }
2628
2629                if (prot->rsk_prot != NULL) {
2630                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2631                        if (prot->rsk_prot->slab_name == NULL)
2632                                goto out_free_sock_slab;
2633
2634                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2635                                                                 prot->rsk_prot->obj_size, 0,
2636                                                                 SLAB_HWCACHE_ALIGN, NULL);
2637
2638                        if (prot->rsk_prot->slab == NULL) {
2639                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2640                                        prot->name);
2641                                goto out_free_request_sock_slab_name;
2642                        }
2643                }
2644
2645                if (prot->twsk_prot != NULL) {
2646                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2647
2648                        if (prot->twsk_prot->twsk_slab_name == NULL)
2649                                goto out_free_request_sock_slab;
2650
2651                        prot->twsk_prot->twsk_slab =
2652                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2653                                                  prot->twsk_prot->twsk_obj_size,
2654                                                  0,
2655                                                  SLAB_HWCACHE_ALIGN |
2656                                                        prot->slab_flags,
2657                                                  NULL);
2658                        if (prot->twsk_prot->twsk_slab == NULL)
2659                                goto out_free_timewait_sock_slab_name;
2660                }
2661        }
2662
2663        mutex_lock(&proto_list_mutex);
2664        list_add(&prot->node, &proto_list);
2665        assign_proto_idx(prot);
2666        mutex_unlock(&proto_list_mutex);
2667        return 0;
2668
2669out_free_timewait_sock_slab_name:
2670        kfree(prot->twsk_prot->twsk_slab_name);
2671out_free_request_sock_slab:
2672        if (prot->rsk_prot && prot->rsk_prot->slab) {
2673                kmem_cache_destroy(prot->rsk_prot->slab);
2674                prot->rsk_prot->slab = NULL;
2675        }
2676out_free_request_sock_slab_name:
2677        if (prot->rsk_prot)
2678                kfree(prot->rsk_prot->slab_name);
2679out_free_sock_slab:
2680        kmem_cache_destroy(prot->slab);
2681        prot->slab = NULL;
2682out:
2683        return -ENOBUFS;
2684}
2685EXPORT_SYMBOL(proto_register);
2686
2687void proto_unregister(struct proto *prot)
2688{
2689        mutex_lock(&proto_list_mutex);
2690        release_proto_idx(prot);
2691        list_del(&prot->node);
2692        mutex_unlock(&proto_list_mutex);
2693
2694        if (prot->slab != NULL) {
2695                kmem_cache_destroy(prot->slab);
2696                prot->slab = NULL;
2697        }
2698
2699        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2700                kmem_cache_destroy(prot->rsk_prot->slab);
2701                kfree(prot->rsk_prot->slab_name);
2702                prot->rsk_prot->slab = NULL;
2703        }
2704
2705        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2706                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2707                kfree(prot->twsk_prot->twsk_slab_name);
2708                prot->twsk_prot->twsk_slab = NULL;
2709        }
2710}
2711EXPORT_SYMBOL(proto_unregister);
2712
2713#ifdef CONFIG_PROC_FS
2714static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2715        __acquires(proto_list_mutex)
2716{
2717        mutex_lock(&proto_list_mutex);
2718        return seq_list_start_head(&proto_list, *pos);
2719}
2720
2721static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2722{
2723        return seq_list_next(v, &proto_list, pos);
2724}
2725
2726static void proto_seq_stop(struct seq_file *seq, void *v)
2727        __releases(proto_list_mutex)
2728{
2729        mutex_unlock(&proto_list_mutex);
2730}
2731
2732static char proto_method_implemented(const void *method)
2733{
2734        return method == NULL ? 'n' : 'y';
2735}
2736static long sock_prot_memory_allocated(struct proto *proto)
2737{
2738        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2739}
2740
2741static char *sock_prot_memory_pressure(struct proto *proto)
2742{
2743        return proto->memory_pressure != NULL ?
2744        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2745}
2746
2747static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2748{
2749
2750        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2751                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2752                   proto->name,
2753                   proto->obj_size,
2754                   sock_prot_inuse_get(seq_file_net(seq), proto),
2755                   sock_prot_memory_allocated(proto),
2756                   sock_prot_memory_pressure(proto),
2757                   proto->max_header,
2758                   proto->slab == NULL ? "no" : "yes",
2759                   module_name(proto->owner),
2760                   proto_method_implemented(proto->close),
2761                   proto_method_implemented(proto->connect),
2762                   proto_method_implemented(proto->disconnect),
2763                   proto_method_implemented(proto->accept),
2764                   proto_method_implemented(proto->ioctl),
2765                   proto_method_implemented(proto->init),
2766                   proto_method_implemented(proto->destroy),
2767                   proto_method_implemented(proto->shutdown),
2768                   proto_method_implemented(proto->setsockopt),
2769                   proto_method_implemented(proto->getsockopt),
2770                   proto_method_implemented(proto->sendmsg),
2771                   proto_method_implemented(proto->recvmsg),
2772                   proto_method_implemented(proto->sendpage),
2773                   proto_method_implemented(proto->bind),
2774                   proto_method_implemented(proto->backlog_rcv),
2775                   proto_method_implemented(proto->hash),
2776                   proto_method_implemented(proto->unhash),
2777                   proto_method_implemented(proto->get_port),
2778                   proto_method_implemented(proto->enter_memory_pressure));
2779}
2780
2781static int proto_seq_show(struct seq_file *seq, void *v)
2782{
2783        if (v == &proto_list)
2784                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2785                           "protocol",
2786                           "size",
2787                           "sockets",
2788                           "memory",
2789                           "press",
2790                           "maxhdr",
2791                           "slab",
2792                           "module",
2793                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2794        else
2795                proto_seq_printf(seq, list_entry(v, struct proto, node));
2796        return 0;
2797}
2798
2799static const struct seq_operations proto_seq_ops = {
2800        .start  = proto_seq_start,
2801        .next   = proto_seq_next,
2802        .stop   = proto_seq_stop,
2803        .show   = proto_seq_show,
2804};
2805
2806static int proto_seq_open(struct inode *inode, struct file *file)
2807{
2808        return seq_open_net(inode, file, &proto_seq_ops,
2809                            sizeof(struct seq_net_private));
2810}
2811
2812static const struct file_operations proto_seq_fops = {
2813        .owner          = THIS_MODULE,
2814        .open           = proto_seq_open,
2815        .read           = seq_read,
2816        .llseek         = seq_lseek,
2817        .release        = seq_release_net,
2818};
2819
2820static __net_init int proto_init_net(struct net *net)
2821{
2822        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2823                return -ENOMEM;
2824
2825        return 0;
2826}
2827
2828static __net_exit void proto_exit_net(struct net *net)
2829{
2830        remove_proc_entry("protocols", net->proc_net);
2831}
2832
2833
2834static __net_initdata struct pernet_operations proto_net_ops = {
2835        .init = proto_init_net,
2836        .exit = proto_exit_net,
2837};
2838
2839static int __init proto_init(void)
2840{
2841        return register_pernet_subsys(&proto_net_ops);
2842}
2843
2844subsys_initcall(proto_init);
2845
2846#endif /* PROC_FS */
2847