linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/types.h>
  97#include <linux/socket.h>
  98#include <linux/in.h>
  99#include <linux/kernel.h>
 100#include <linux/module.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/sched.h>
 104#include <linux/timer.h>
 105#include <linux/string.h>
 106#include <linux/sockios.h>
 107#include <linux/net.h>
 108#include <linux/mm.h>
 109#include <linux/slab.h>
 110#include <linux/interrupt.h>
 111#include <linux/poll.h>
 112#include <linux/tcp.h>
 113#include <linux/init.h>
 114#include <linux/highmem.h>
 115#include <linux/user_namespace.h>
 116#include <linux/static_key.h>
 117#include <linux/memcontrol.h>
 118#include <linux/prefetch.h>
 119
 120#include <asm/uaccess.h>
 121
 122#include <linux/netdevice.h>
 123#include <net/protocol.h>
 124#include <linux/skbuff.h>
 125#include <net/net_namespace.h>
 126#include <net/request_sock.h>
 127#include <net/sock.h>
 128#include <linux/net_tstamp.h>
 129#include <net/xfrm.h>
 130#include <linux/ipsec.h>
 131#include <net/cls_cgroup.h>
 132#include <net/netprio_cgroup.h>
 133
 134#include <linux/filter.h>
 135
 136#include <trace/events/sock.h>
 137
 138#ifdef CONFIG_INET
 139#include <net/tcp.h>
 140#endif
 141
 142#include <net/busy_poll.h>
 143
 144static DEFINE_MUTEX(proto_list_mutex);
 145static LIST_HEAD(proto_list);
 146
 147#ifdef CONFIG_MEMCG_KMEM
 148int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 149{
 150        struct proto *proto;
 151        int ret = 0;
 152
 153        mutex_lock(&proto_list_mutex);
 154        list_for_each_entry(proto, &proto_list, node) {
 155                if (proto->init_cgroup) {
 156                        ret = proto->init_cgroup(memcg, ss);
 157                        if (ret)
 158                                goto out;
 159                }
 160        }
 161
 162        mutex_unlock(&proto_list_mutex);
 163        return ret;
 164out:
 165        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 166                if (proto->destroy_cgroup)
 167                        proto->destroy_cgroup(memcg);
 168        mutex_unlock(&proto_list_mutex);
 169        return ret;
 170}
 171
 172void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 173{
 174        struct proto *proto;
 175
 176        mutex_lock(&proto_list_mutex);
 177        list_for_each_entry_reverse(proto, &proto_list, node)
 178                if (proto->destroy_cgroup)
 179                        proto->destroy_cgroup(memcg);
 180        mutex_unlock(&proto_list_mutex);
 181}
 182#endif
 183
 184/*
 185 * Each address family might have different locking rules, so we have
 186 * one slock key per address family:
 187 */
 188static struct lock_class_key af_family_keys[AF_MAX];
 189static struct lock_class_key af_family_slock_keys[AF_MAX];
 190
 191#if defined(CONFIG_MEMCG_KMEM)
 192struct static_key memcg_socket_limit_enabled;
 193EXPORT_SYMBOL(memcg_socket_limit_enabled);
 194#endif
 195
 196/*
 197 * Make lock validator output more readable. (we pre-construct these
  198 * strings at build time, so that runtime initialization of socket
 199 * locks is fast):
 200 */
 201static const char *const af_family_key_strings[AF_MAX+1] = {
 202  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 203  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 204  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 205  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 206  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 207  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 208  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 209  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 210  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 211  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  212  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
 213  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 214  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 215  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 216};
 217static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 218  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 219  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 220  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 221  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 222  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 223  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 224  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 225  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 226  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 227  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 228  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 229  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 230  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  231  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
 232};
 233static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 234  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 235  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 236  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 237  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 238  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 239  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 240  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 241  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 242  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 243  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 244  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 245  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 246  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 247  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 248};
 249
 250/*
 251 * sk_callback_lock locking rules are per-address-family,
 252 * so split the lock classes by using a per-AF key:
 253 */
 254static struct lock_class_key af_callback_keys[AF_MAX];
 255
 256/* Take into consideration the size of the struct sk_buff overhead in the
 257 * determination of these values, since that is non-constant across
 258 * platforms.  This makes socket queueing behavior and performance
 259 * not depend upon such differences.
 260 */
 261#define _SK_MEM_PACKETS         256
 262#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 263#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 264#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
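
/*
 * A rough worked example of the resulting defaults, assuming a typical
 * 64-bit configuration (illustrative only; the exact numbers depend on
 * the struct layouts, which is exactly why the limits are expressed in
 * terms of SKB_TRUESIZE() rather than hard-coded byte counts):
 *
 *      SKB_TRUESIZE(256) ~= 256
 *                           + SKB_DATA_ALIGN(sizeof(struct sk_buff))
 *                           + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
 *                        ~= 800 bytes or so,
 *
 * so SK_WMEM_MAX and SK_RMEM_MAX come out at roughly 256 * ~800 bytes,
 * i.e. on the order of 200 KB per socket.
 */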
 265
 266/* Run time adjustable parameters. */
 267__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 268EXPORT_SYMBOL(sysctl_wmem_max);
 269__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 270EXPORT_SYMBOL(sysctl_rmem_max);
 271__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 272__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 273
 274/* Maximal space eaten by iovec or ancillary data plus some space */
 275int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 276EXPORT_SYMBOL(sysctl_optmem_max);
 277
 278struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 279EXPORT_SYMBOL_GPL(memalloc_socks);
 280
 281/**
 282 * sk_set_memalloc - sets %SOCK_MEMALLOC
 283 * @sk: socket to set it on
 284 *
 285 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 286 * It's the responsibility of the admin to adjust min_free_kbytes
 287 * to meet the requirements
 288 */
 289void sk_set_memalloc(struct sock *sk)
 290{
 291        sock_set_flag(sk, SOCK_MEMALLOC);
 292        sk->sk_allocation |= __GFP_MEMALLOC;
 293        static_key_slow_inc(&memalloc_socks);
 294}
 295EXPORT_SYMBOL_GPL(sk_set_memalloc);
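
/*
 * An illustrative sketch of how a subsystem doing swap-over-network I/O
 * might use the helpers above: mark the socket at start so it may dip
 * into the emergency reserves while dirty pages are written out, and
 * clear the flag only at teardown (see the warning in
 * sk_clear_memalloc() about pending rmem accounting).  Every name
 * except sk_set_memalloc()/sk_clear_memalloc() is assumed for the
 * example:
 *
 *      static int example_swap_backend_start(struct socket *sock)
 *      {
 *              sk_set_memalloc(sock->sk);
 *              return 0;
 *      }
 *
 *      static void example_swap_backend_stop(struct socket *sock)
 *      {
 *              sk_clear_memalloc(sock->sk);
 *      }
 */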
 296
 297void sk_clear_memalloc(struct sock *sk)
 298{
 299        sock_reset_flag(sk, SOCK_MEMALLOC);
 300        sk->sk_allocation &= ~__GFP_MEMALLOC;
 301        static_key_slow_dec(&memalloc_socks);
 302
 303        /*
 304         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 305         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 306         * it has rmem allocations there is a risk that the user of the
 307         * socket cannot make forward progress due to exceeding the rmem
 308         * limits. By rights, sk_clear_memalloc() should only be called
 309         * on sockets being torn down but warn and reset the accounting if
 310         * that assumption breaks.
 311         */
 312        if (WARN_ON(sk->sk_forward_alloc))
 313                sk_mem_reclaim(sk);
 314}
 315EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 316
 317int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 318{
 319        int ret;
 320        unsigned long pflags = current->flags;
 321
 322        /* these should have been dropped before queueing */
 323        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 324
 325        current->flags |= PF_MEMALLOC;
 326        ret = sk->sk_backlog_rcv(sk, skb);
 327        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 328
 329        return ret;
 330}
 331EXPORT_SYMBOL(__sk_backlog_rcv);
 332
 333static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 334{
 335        struct timeval tv;
 336
 337        if (optlen < sizeof(tv))
 338                return -EINVAL;
 339        if (copy_from_user(&tv, optval, sizeof(tv)))
 340                return -EFAULT;
 341        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 342                return -EDOM;
 343
 344        if (tv.tv_sec < 0) {
 345                static int warned __read_mostly;
 346
 347                *timeo_p = 0;
 348                if (warned < 10 && net_ratelimit()) {
 349                        warned++;
 350                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 351                                __func__, current->comm, task_pid_nr(current));
 352                }
 353                return 0;
 354        }
 355        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 356        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 357                return 0;
 358        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 359                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 360        return 0;
 361}
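
/*
 * The user-space counterpart, for illustration: the struct timeval
 * handed to SO_RCVTIMEO/SO_SNDTIMEO is what sock_set_timeout() above
 * converts to jiffies.  A zero timeval means "block forever"
 * (MAX_SCHEDULE_TIMEOUT), and tv_usec must stay below USEC_PER_SEC or
 * the call fails with EDOM:
 *
 *      struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *      if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *              perror("setsockopt(SO_RCVTIMEO)");
 *      // recv() on fd now returns -1 with errno EAGAIN/EWOULDBLOCK if
 *      // no data arrives within roughly five seconds.
 */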
 362
 363static void sock_warn_obsolete_bsdism(const char *name)
 364{
 365        static int warned;
 366        static char warncomm[TASK_COMM_LEN];
 367        if (strcmp(warncomm, current->comm) && warned < 5) {
 368                strcpy(warncomm,  current->comm);
 369                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 370                        warncomm, name);
 371                warned++;
 372        }
 373}
 374
 375static bool sock_needs_netstamp(const struct sock *sk)
 376{
 377        switch (sk->sk_family) {
 378        case AF_UNSPEC:
 379        case AF_UNIX:
 380                return false;
 381        default:
 382                return true;
 383        }
 384}
 385
 386static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 387{
 388        if (sk->sk_flags & flags) {
 389                sk->sk_flags &= ~flags;
 390                if (sock_needs_netstamp(sk) &&
 391                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 392                        net_disable_timestamp();
 393        }
 394}
 395
 396
 397int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 398{
 399        int skb_len;
 400        unsigned long flags;
 401        struct sk_buff_head *list = &sk->sk_receive_queue;
 402
 403        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 404                atomic_inc(&sk->sk_drops);
 405                trace_sock_rcvqueue_full(sk, skb);
 406                return -ENOMEM;
 407        }
 408
 409        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 410                atomic_inc(&sk->sk_drops);
 411                return -ENOBUFS;
 412        }
 413
 414        skb->dev = NULL;
 415        skb_set_owner_r(skb, sk);
 416
 417        /* Cache the SKB length before we tack it onto the receive
 418         * queue.  Once it is added it no longer belongs to us and
 419         * may be freed by other threads of control pulling packets
 420         * from the queue.
 421         */
 422        skb_len = skb->len;
 423
  424        /* we escape from the rcu-protected region, make sure we don't leak
 425         * a norefcounted dst
 426         */
 427        skb_dst_force(skb);
 428
 429        spin_lock_irqsave(&list->lock, flags);
 430        sock_skb_set_dropcount(sk, skb);
 431        __skb_queue_tail(list, skb);
 432        spin_unlock_irqrestore(&list->lock, flags);
 433
 434        if (!sock_flag(sk, SOCK_DEAD))
 435                sk->sk_data_ready(sk, skb_len);
 436        return 0;
 437}
 438EXPORT_SYMBOL(__sock_queue_rcv_skb);
 439
 440int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 441{
 442        int err;
 443
 444        err = sk_filter(sk, skb);
 445        if (err)
 446                return err;
 447
 448        return __sock_queue_rcv_skb(sk, skb);
 449}
 450EXPORT_SYMBOL(sock_queue_rcv_skb);
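
/*
 * An illustrative sketch of the usual calling convention for
 * sock_queue_rcv_skb() in a protocol's receive path;
 * example_proto_rcv() is an assumed name.  On failure the caller still
 * owns the skb and must free it, and the -ENOMEM / -ENOBUFS errors map
 * onto the rcvbuf and memory-accounting checks in
 * __sock_queue_rcv_skb() above:
 *
 *      static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *      {
 *              if (sock_queue_rcv_skb(sk, skb) < 0) {
 *                      // queue full, accounting failed, or the socket
 *                      // filter rejected the packet
 *                      kfree_skb(skb);
 *                      return NET_RX_DROP;
 *              }
 *              return NET_RX_SUCCESS;
 *      }
 */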
 451
 452int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 453{
 454        int rc = NET_RX_SUCCESS;
 455
 456        if (sk_filter(sk, skb))
 457                goto discard_and_relse;
 458
 459        skb->dev = NULL;
 460
 461        if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 462                atomic_inc(&sk->sk_drops);
 463                goto discard_and_relse;
 464        }
 465        if (nested)
 466                bh_lock_sock_nested(sk);
 467        else
 468                bh_lock_sock(sk);
 469        if (!sock_owned_by_user(sk)) {
 470                /*
 471                 * trylock + unlock semantics:
 472                 */
 473                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 474
 475                rc = sk_backlog_rcv(sk, skb);
 476
 477                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 478        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 479                bh_unlock_sock(sk);
 480                atomic_inc(&sk->sk_drops);
 481                goto discard_and_relse;
 482        }
 483
 484        bh_unlock_sock(sk);
 485out:
 486        sock_put(sk);
 487        return rc;
 488discard_and_relse:
 489        kfree_skb(skb);
 490        goto out;
 491}
 492EXPORT_SYMBOL(sk_receive_skb);
 493
 494void sk_reset_txq(struct sock *sk)
 495{
 496        sk_tx_queue_clear(sk);
 497}
 498EXPORT_SYMBOL(sk_reset_txq);
 499
 500struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 501{
 502        struct dst_entry *dst = __sk_dst_get(sk);
 503
 504        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 505                sk_tx_queue_clear(sk);
 506                sk->sk_dst_pending_confirm = 0;
 507                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 508                dst_release(dst);
 509                return NULL;
 510        }
 511
 512        return dst;
 513}
 514EXPORT_SYMBOL(__sk_dst_check);
 515
 516struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 517{
 518        struct dst_entry *dst = sk_dst_get(sk);
 519
 520        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 521                sk_dst_reset(sk);
 522                dst_release(dst);
 523                return NULL;
 524        }
 525
 526        return dst;
 527}
 528EXPORT_SYMBOL(sk_dst_check);
 529
 530static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 531                                int optlen)
 532{
 533        int ret = -ENOPROTOOPT;
 534#ifdef CONFIG_NETDEVICES
 535        struct net *net = sock_net(sk);
 536        char devname[IFNAMSIZ];
 537        int index;
 538
 539        /* Sorry... */
 540        ret = -EPERM;
 541        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 542                goto out;
 543
 544        ret = -EINVAL;
 545        if (optlen < 0)
 546                goto out;
 547
 548        /* Bind this socket to a particular device like "eth0",
 549         * as specified in the passed interface name. If the
 550         * name is "" or the option length is zero the socket
 551         * is not bound.
 552         */
 553        if (optlen > IFNAMSIZ - 1)
 554                optlen = IFNAMSIZ - 1;
 555        memset(devname, 0, sizeof(devname));
 556
 557        ret = -EFAULT;
 558        if (copy_from_user(devname, optval, optlen))
 559                goto out;
 560
 561        index = 0;
 562        if (devname[0] != '\0') {
 563                struct net_device *dev;
 564
 565                rcu_read_lock();
 566                dev = dev_get_by_name_rcu(net, devname);
 567                if (dev)
 568                        index = dev->ifindex;
 569                rcu_read_unlock();
 570                ret = -ENODEV;
 571                if (!dev)
 572                        goto out;
 573        }
 574
 575        lock_sock(sk);
 576        sk->sk_bound_dev_if = index;
 577        sk_dst_reset(sk);
 578        release_sock(sk);
 579
 580        ret = 0;
 581
 582out:
 583#endif
 584
 585        return ret;
 586}
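
/*
 * An illustrative user-space use of the SO_BINDTODEVICE handling
 * above.  CAP_NET_RAW is required, and an empty name (or zero optlen)
 * removes the binding again:
 *
 *      if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *                     "eth0", strlen("eth0") + 1) < 0)
 *              perror("setsockopt(SO_BINDTODEVICE)");
 *
 *      // unbind again: an empty string / zero length clears sk_bound_dev_if
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */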
 587
 588static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 589                                int __user *optlen, int len)
 590{
 591        int ret = -ENOPROTOOPT;
 592#ifdef CONFIG_NETDEVICES
 593        struct net *net = sock_net(sk);
 594        char devname[IFNAMSIZ];
 595
 596        if (sk->sk_bound_dev_if == 0) {
 597                len = 0;
 598                goto zero;
 599        }
 600
 601        ret = -EINVAL;
 602        if (len < IFNAMSIZ)
 603                goto out;
 604
 605        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 606        if (ret)
 607                goto out;
 608
 609        len = strlen(devname) + 1;
 610
 611        ret = -EFAULT;
 612        if (copy_to_user(optval, devname, len))
 613                goto out;
 614
 615zero:
 616        ret = -EFAULT;
 617        if (put_user(len, optlen))
 618                goto out;
 619
 620        ret = 0;
 621
 622out:
 623#endif
 624
 625        return ret;
 626}
 627
 628static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 629{
 630        if (valbool)
 631                sock_set_flag(sk, bit);
 632        else
 633                sock_reset_flag(sk, bit);
 634}
 635
 636bool sk_mc_loop(struct sock *sk)
 637{
 638        if (dev_recursion_level())
 639                return false;
 640        if (!sk)
 641                return true;
 642        switch (sk->sk_family) {
 643        case AF_INET:
 644                return inet_sk(sk)->mc_loop;
 645#if IS_ENABLED(CONFIG_IPV6)
 646        case AF_INET6:
 647                return inet6_sk(sk)->mc_loop;
 648#endif
 649        }
 650        WARN_ON(1);
 651        return true;
 652}
 653EXPORT_SYMBOL(sk_mc_loop);
 654
 655/*
 656 *      This is meant for all protocols to use and covers goings on
 657 *      at the socket level. Everything here is generic.
 658 */
 659
 660int sock_setsockopt(struct socket *sock, int level, int optname,
 661                    char __user *optval, unsigned int optlen)
 662{
 663        struct sock *sk = sock->sk;
 664        int val;
 665        int valbool;
 666        struct linger ling;
 667        int ret = 0;
 668
 669        /*
 670         *      Options without arguments
 671         */
 672
 673        if (optname == SO_BINDTODEVICE)
 674                return sock_setbindtodevice(sk, optval, optlen);
 675
 676        if (optlen < sizeof(int))
 677                return -EINVAL;
 678
 679        if (get_user(val, (int __user *)optval))
 680                return -EFAULT;
 681
 682        valbool = val ? 1 : 0;
 683
 684        lock_sock(sk);
 685
 686        switch (optname) {
 687        case SO_DEBUG:
 688                if (val && !capable(CAP_NET_ADMIN))
 689                        ret = -EACCES;
 690                else
 691                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 692                break;
 693        case SO_REUSEADDR:
 694                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 695                break;
 696        case SO_REUSEPORT:
 697                sk->sk_reuseport = valbool;
 698                break;
 699        case SO_TYPE:
 700        case SO_PROTOCOL:
 701        case SO_DOMAIN:
 702        case SO_ERROR:
 703                ret = -ENOPROTOOPT;
 704                break;
 705        case SO_DONTROUTE:
 706                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 707                break;
 708        case SO_BROADCAST:
 709                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 710                break;
 711        case SO_SNDBUF:
  712                /* Don't error on this; BSD doesn't, and if you think
  713                 * about it, this is right. Otherwise apps have to
  714                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  715                 * are treated in BSD as hints.
 716                 */
 717                val = min_t(u32, val, sysctl_wmem_max);
 718set_sndbuf:
 719                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 720                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 721                /* Wake up sending tasks if we upped the value. */
 722                sk->sk_write_space(sk);
 723                break;
 724
 725        case SO_SNDBUFFORCE:
 726                if (!capable(CAP_NET_ADMIN)) {
 727                        ret = -EPERM;
 728                        break;
 729                }
 730                goto set_sndbuf;
 731
 732        case SO_RCVBUF:
  733                /* Don't error on this; BSD doesn't, and if you think
  734                 * about it, this is right. Otherwise apps have to
  735                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  736                 * are treated in BSD as hints.
 737                 */
 738                val = min_t(u32, val, sysctl_rmem_max);
 739set_rcvbuf:
 740                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 741                /*
 742                 * We double it on the way in to account for
 743                 * "struct sk_buff" etc. overhead.   Applications
 744                 * assume that the SO_RCVBUF setting they make will
 745                 * allow that much actual data to be received on that
 746                 * socket.
 747                 *
 748                 * Applications are unaware that "struct sk_buff" and
 749                 * other overheads allocate from the receive buffer
 750                 * during socket buffer allocation.
 751                 *
 752                 * And after considering the possible alternatives,
 753                 * returning the value we actually used in getsockopt
 754                 * is the most desirable behavior.
 755                 */
 756                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 757                break;
 758
 759        case SO_RCVBUFFORCE:
 760                if (!capable(CAP_NET_ADMIN)) {
 761                        ret = -EPERM;
 762                        break;
 763                }
 764                goto set_rcvbuf;
 765
 766        case SO_KEEPALIVE:
 767#ifdef CONFIG_INET
 768                if (sk->sk_protocol == IPPROTO_TCP &&
 769                    sk->sk_type == SOCK_STREAM)
 770                        tcp_set_keepalive(sk, valbool);
 771#endif
 772                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 773                break;
 774
 775        case SO_OOBINLINE:
 776                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 777                break;
 778
 779        case SO_NO_CHECK:
 780                sk->sk_no_check_tx = valbool;
 781                break;
 782
 783        case SO_PRIORITY:
 784                if ((val >= 0 && val <= 6) ||
 785                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 786                        sk->sk_priority = val;
 787                else
 788                        ret = -EPERM;
 789                break;
 790
 791        case SO_LINGER:
 792                if (optlen < sizeof(ling)) {
 793                        ret = -EINVAL;  /* 1003.1g */
 794                        break;
 795                }
 796                if (copy_from_user(&ling, optval, sizeof(ling))) {
 797                        ret = -EFAULT;
 798                        break;
 799                }
 800                if (!ling.l_onoff)
 801                        sock_reset_flag(sk, SOCK_LINGER);
 802                else {
 803#if (BITS_PER_LONG == 32)
 804                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 805                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 806                        else
 807#endif
 808                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 809                        sock_set_flag(sk, SOCK_LINGER);
 810                }
 811                break;
 812
 813        case SO_BSDCOMPAT:
 814                sock_warn_obsolete_bsdism("setsockopt");
 815                break;
 816
 817        case SO_PASSCRED:
 818                if (valbool)
 819                        set_bit(SOCK_PASSCRED, &sock->flags);
 820                else
 821                        clear_bit(SOCK_PASSCRED, &sock->flags);
 822                break;
 823
 824        case SO_TIMESTAMP:
 825        case SO_TIMESTAMPNS:
 826                if (valbool)  {
 827                        if (optname == SO_TIMESTAMP)
 828                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 829                        else
 830                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 831                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 832                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 833                } else {
 834                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 835                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 836                }
 837                break;
 838
 839        case SO_TIMESTAMPING:
 840                if (val & ~SOF_TIMESTAMPING_MASK ||
 841                    val & __RH_RESERVED_SOF_TIMESTAMPING_OPT_ID ||
 842                    val & __RH_RESERVED_SOF_TIMESTAMPING_TX_SCHED ||
 843                    val & __RH_RESERVED_SOF_TIMESTAMPING_TX_ACK) {
 844                        ret = -EINVAL;
 845                        break;
 846                }
 847                sk->sk_tsflags = val;
 848                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 849                        sock_enable_timestamp(sk,
 850                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 851                else
 852                        sock_disable_timestamp(sk,
 853                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 854                break;
 855
 856        case SO_RCVLOWAT:
 857                if (val < 0)
 858                        val = INT_MAX;
 859                sk->sk_rcvlowat = val ? : 1;
 860                break;
 861
 862        case SO_RCVTIMEO:
 863                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 864                break;
 865
 866        case SO_SNDTIMEO:
 867                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 868                break;
 869
 870        case SO_ATTACH_FILTER:
 871                ret = -EINVAL;
 872                if (optlen == sizeof(struct sock_fprog)) {
 873                        struct sock_fprog fprog;
 874
 875                        ret = -EFAULT;
 876                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 877                                break;
 878
 879                        ret = sk_attach_filter(&fprog, sk);
 880                }
 881                break;
 882
 883        case SO_DETACH_FILTER:
 884                ret = sk_detach_filter(sk);
 885                break;
 886
 887        case SO_LOCK_FILTER:
 888                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 889                        ret = -EPERM;
 890                else
 891                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 892                break;
 893
 894        case SO_PASSSEC:
 895                if (valbool)
 896                        set_bit(SOCK_PASSSEC, &sock->flags);
 897                else
 898                        clear_bit(SOCK_PASSSEC, &sock->flags);
 899                break;
 900        case SO_MARK:
 901                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 902                        ret = -EPERM;
 903                else
 904                        sk->sk_mark = val;
 905                break;
 906
  907                /* We implement SO_SNDLOWAT etc. to
  908                   not be settable (1003.1g 5.3) */
 909        case SO_RXQ_OVFL:
 910                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 911                break;
 912
 913        case SO_WIFI_STATUS:
 914                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 915                break;
 916
 917        case SO_PEEK_OFF:
 918                if (sock->ops->set_peek_off)
 919                        ret = sock->ops->set_peek_off(sk, val);
 920                else
 921                        ret = -EOPNOTSUPP;
 922                break;
 923
 924        case SO_NOFCS:
 925                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 926                break;
 927
 928        case SO_SELECT_ERR_QUEUE:
 929                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 930                break;
 931
 932#ifdef CONFIG_NET_RX_BUSY_POLL
 933        case SO_BUSY_POLL:
 934                /* allow unprivileged users to decrease the value */
 935                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 936                        ret = -EPERM;
 937                else {
 938                        if (val < 0)
 939                                ret = -EINVAL;
 940                        else
 941                                sk->sk_ll_usec = val;
 942                }
 943                break;
 944#endif
 945
 946        case SO_MAX_PACING_RATE:
 947                sk->sk_max_pacing_rate = val;
 948                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 949                                         sk->sk_max_pacing_rate);
 950                break;
 951
 952        default:
 953                ret = -ENOPROTOOPT;
 954                break;
 955        }
 956        release_sock(sk);
 957        return ret;
 958}
 959EXPORT_SYMBOL(sock_setsockopt);
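
/*
 * A user-space illustration of the SO_RCVBUF doubling explained in the
 * comment above: the value passed in is clamped to net.core.rmem_max,
 * doubled to cover struct sk_buff overhead, and getsockopt() then
 * reports the doubled value actually in use:
 *
 *      int req = 65536, eff = 0;
 *      socklen_t len = sizeof(eff);
 *
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *      // eff is typically 131072 (2 * req), unless req exceeded
 *      // net.core.rmem_max, in which case the clamped value was doubled
 */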
 960
 961
 962void cred_to_ucred(struct pid *pid, const struct cred *cred,
 963                   struct ucred *ucred)
 964{
 965        ucred->pid = pid_vnr(pid);
 966        ucred->uid = ucred->gid = -1;
 967        if (cred) {
 968                struct user_namespace *current_ns = current_user_ns();
 969
 970                ucred->uid = from_kuid_munged(current_ns, cred->euid);
 971                ucred->gid = from_kgid_munged(current_ns, cred->egid);
 972        }
 973}
 974EXPORT_SYMBOL_GPL(cred_to_ucred);
 975
 976int sock_getsockopt(struct socket *sock, int level, int optname,
 977                    char __user *optval, int __user *optlen)
 978{
 979        struct sock *sk = sock->sk;
 980
 981        union {
 982                int val;
 983                struct linger ling;
 984                struct timeval tm;
 985        } v;
 986
 987        int lv = sizeof(int);
 988        int len;
 989
 990        if (get_user(len, optlen))
 991                return -EFAULT;
 992        if (len < 0)
 993                return -EINVAL;
 994
 995        memset(&v, 0, sizeof(v));
 996
 997        switch (optname) {
 998        case SO_DEBUG:
 999                v.val = sock_flag(sk, SOCK_DBG);
1000                break;
1001
1002        case SO_DONTROUTE:
1003                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1004                break;
1005
1006        case SO_BROADCAST:
1007                v.val = sock_flag(sk, SOCK_BROADCAST);
1008                break;
1009
1010        case SO_SNDBUF:
1011                v.val = sk->sk_sndbuf;
1012                break;
1013
1014        case SO_RCVBUF:
1015                v.val = sk->sk_rcvbuf;
1016                break;
1017
1018        case SO_REUSEADDR:
1019                v.val = sk->sk_reuse;
1020                break;
1021
1022        case SO_REUSEPORT:
1023                v.val = sk->sk_reuseport;
1024                break;
1025
1026        case SO_KEEPALIVE:
1027                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1028                break;
1029
1030        case SO_TYPE:
1031                v.val = sk->sk_type;
1032                break;
1033
1034        case SO_PROTOCOL:
1035                v.val = sk->sk_protocol;
1036                break;
1037
1038        case SO_DOMAIN:
1039                v.val = sk->sk_family;
1040                break;
1041
1042        case SO_ERROR:
1043                v.val = -sock_error(sk);
1044                if (v.val == 0)
1045                        v.val = xchg(&sk->sk_err_soft, 0);
1046                break;
1047
1048        case SO_OOBINLINE:
1049                v.val = sock_flag(sk, SOCK_URGINLINE);
1050                break;
1051
1052        case SO_NO_CHECK:
1053                v.val = sk->sk_no_check_tx;
1054                break;
1055
1056        case SO_PRIORITY:
1057                v.val = sk->sk_priority;
1058                break;
1059
1060        case SO_LINGER:
1061                lv              = sizeof(v.ling);
1062                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1063                v.ling.l_linger = sk->sk_lingertime / HZ;
1064                break;
1065
1066        case SO_BSDCOMPAT:
1067                sock_warn_obsolete_bsdism("getsockopt");
1068                break;
1069
1070        case SO_TIMESTAMP:
1071                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1072                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1073                break;
1074
1075        case SO_TIMESTAMPNS:
1076                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1077                break;
1078
1079        case SO_TIMESTAMPING:
1080                v.val = sk->sk_tsflags;
1081                break;
1082
1083        case SO_RCVTIMEO:
1084                lv = sizeof(struct timeval);
1085                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1086                        v.tm.tv_sec = 0;
1087                        v.tm.tv_usec = 0;
1088                } else {
1089                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1090                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1091                }
1092                break;
1093
1094        case SO_SNDTIMEO:
1095                lv = sizeof(struct timeval);
1096                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1097                        v.tm.tv_sec = 0;
1098                        v.tm.tv_usec = 0;
1099                } else {
1100                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1101                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1102                }
1103                break;
1104
1105        case SO_RCVLOWAT:
1106                v.val = sk->sk_rcvlowat;
1107                break;
1108
1109        case SO_SNDLOWAT:
1110                v.val = 1;
1111                break;
1112
1113        case SO_PASSCRED:
1114                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1115                break;
1116
1117        case SO_PEERCRED:
1118        {
1119                struct ucred peercred;
1120                if (len > sizeof(peercred))
1121                        len = sizeof(peercred);
1122                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1123                if (copy_to_user(optval, &peercred, len))
1124                        return -EFAULT;
1125                goto lenout;
1126        }
1127
1128        case SO_PEERNAME:
1129        {
1130                char address[128];
1131
1132                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1133                        return -ENOTCONN;
1134                if (lv < len)
1135                        return -EINVAL;
1136                if (copy_to_user(optval, address, len))
1137                        return -EFAULT;
1138                goto lenout;
1139        }
1140
1141        /* Dubious BSD thing... Probably nobody even uses it, but
1142         * the UNIX standard wants it for whatever reason... -DaveM
1143         */
1144        case SO_ACCEPTCONN:
1145                v.val = sk->sk_state == TCP_LISTEN;
1146                break;
1147
1148        case SO_PASSSEC:
1149                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1150                break;
1151
1152        case SO_PEERSEC:
1153                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1154
1155        case SO_MARK:
1156                v.val = sk->sk_mark;
1157                break;
1158
1159        case SO_RXQ_OVFL:
1160                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1161                break;
1162
1163        case SO_WIFI_STATUS:
1164                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1165                break;
1166
1167        case SO_PEEK_OFF:
1168                if (!sock->ops->set_peek_off)
1169                        return -EOPNOTSUPP;
1170
1171                v.val = sk->sk_peek_off;
1172                break;
1173        case SO_NOFCS:
1174                v.val = sock_flag(sk, SOCK_NOFCS);
1175                break;
1176
1177        case SO_BINDTODEVICE:
1178                return sock_getbindtodevice(sk, optval, optlen, len);
1179
1180        case SO_GET_FILTER:
1181                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1182                if (len < 0)
1183                        return len;
1184
1185                goto lenout;
1186
1187        case SO_LOCK_FILTER:
1188                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1189                break;
1190
1191        case SO_BPF_EXTENSIONS:
1192                v.val = bpf_tell_extensions();
1193                break;
1194
1195        case SO_SELECT_ERR_QUEUE:
1196                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1197                break;
1198
1199#ifdef CONFIG_NET_RX_BUSY_POLL
1200        case SO_BUSY_POLL:
1201                v.val = sk->sk_ll_usec;
1202                break;
1203#endif
1204
1205        case SO_MAX_PACING_RATE:
1206                v.val = sk->sk_max_pacing_rate;
1207                break;
1208
1209        default:
1210                return -ENOPROTOOPT;
1211        }
1212
1213        if (len > lv)
1214                len = lv;
1215        if (copy_to_user(optval, &v, len))
1216                return -EFAULT;
1217lenout:
1218        if (put_user(len, optlen))
1219                return -EFAULT;
1220        return 0;
1221}
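
/*
 * A user-space illustration of reading the peer credentials that the
 * SO_PEERCRED branch above fills in via cred_to_ucred(), typically on
 * a connected AF_UNIX socket:
 *
 *      struct ucred peer;
 *      socklen_t len = sizeof(peer);
 *
 *      if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *              printf("peer pid=%d uid=%d gid=%d\n",
 *                     (int)peer.pid, (int)peer.uid, (int)peer.gid);
 */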
1222
1223/*
1224 * Initialize an sk_lock.
1225 *
1226 * (We also register the sk_lock with the lock validator.)
1227 */
1228static inline void sock_lock_init(struct sock *sk)
1229{
1230        sock_lock_init_class_and_name(sk,
1231                        af_family_slock_key_strings[sk->sk_family],
1232                        af_family_slock_keys + sk->sk_family,
1233                        af_family_key_strings[sk->sk_family],
1234                        af_family_keys + sk->sk_family);
1235}
1236
1237/*
1238 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1239 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1240 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1241 */
1242static void sock_copy(struct sock *nsk, const struct sock *osk)
1243{
1244#ifdef CONFIG_SECURITY_NETWORK
1245        void *sptr = nsk->sk_security;
1246#endif
1247        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1248
1249        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1250               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1251
1252#ifdef CONFIG_SECURITY_NETWORK
1253        nsk->sk_security = sptr;
1254        security_sk_clone(osk, nsk);
1255#endif
1256}
1257
1258void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1259{
1260        unsigned long nulls1, nulls2;
1261
1262        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1263        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1264        if (nulls1 > nulls2)
1265                swap(nulls1, nulls2);
1266
1267        if (nulls1 != 0)
1268                memset((char *)sk, 0, nulls1);
1269        memset((char *)sk + nulls1 + sizeof(void *), 0,
1270               nulls2 - nulls1 - sizeof(void *));
1271        memset((char *)sk + nulls2 + sizeof(void *), 0,
1272               size - nulls2 - sizeof(void *));
1273}
1274EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1275
1276static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1277                int family)
1278{
1279        struct sock *sk;
1280        struct kmem_cache *slab;
1281
1282        slab = prot->slab;
1283        if (slab != NULL) {
1284                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1285                if (!sk)
1286                        return sk;
1287                if (priority & __GFP_ZERO) {
1288                        if (prot->clear_sk)
1289                                prot->clear_sk(sk, prot->obj_size);
1290                        else
1291                                sk_prot_clear_nulls(sk, prot->obj_size);
1292                }
1293        } else
1294                sk = kmalloc(prot->obj_size, priority);
1295
1296        if (sk != NULL) {
1297                kmemcheck_annotate_bitfield(sk, flags);
1298
1299                if (security_sk_alloc(sk, family, priority))
1300                        goto out_free;
1301
1302                if (!try_module_get(prot->owner))
1303                        goto out_free_sec;
1304                sk_tx_queue_clear(sk);
1305        }
1306
1307        return sk;
1308
1309out_free_sec:
1310        security_sk_free(sk);
1311out_free:
1312        if (slab != NULL)
1313                kmem_cache_free(slab, sk);
1314        else
1315                kfree(sk);
1316        return NULL;
1317}
1318
1319static void sk_prot_free(struct proto *prot, struct sock *sk)
1320{
1321        struct kmem_cache *slab;
1322        struct module *owner;
1323
1324        owner = prot->owner;
1325        slab = prot->slab;
1326
1327        security_sk_free(sk);
1328        if (slab != NULL)
1329                kmem_cache_free(slab, sk);
1330        else
1331                kfree(sk);
1332        module_put(owner);
1333}
1334
1335#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1336void sock_update_classid(struct sock *sk)
1337{
1338        u32 classid;
1339
1340        classid = task_cls_classid(current);
1341        if (classid != sk->sk_classid)
1342                sk->sk_classid = classid;
1343}
1344EXPORT_SYMBOL(sock_update_classid);
1345#endif
1346
1347#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1348void sock_update_netprioidx(struct sock *sk)
1349{
1350        if (in_interrupt())
1351                return;
1352
1353        sk->sk_cgrp_prioidx = task_netprioidx(current);
1354}
1355EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1356#endif
1357
1358/**
1359 *      sk_alloc - All socket objects are allocated here
1360 *      @net: the applicable net namespace
1361 *      @family: protocol family
1362 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1363 *      @prot: struct proto associated with this new sock instance
1364 */
1365struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1366                      struct proto *prot)
1367{
1368        struct sock *sk;
1369
1370        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1371        if (sk) {
1372                sk->sk_family = family;
1373                /*
1374                 * See comment in struct sock definition to understand
1375                 * why we need sk_prot_creator -acme
1376                 */
1377                sk->sk_prot = sk->sk_prot_creator = prot;
1378                sock_lock_init(sk);
1379                sock_net_set(sk, get_net(net));
1380                atomic_set(&sk->sk_wmem_alloc, 1);
1381
1382                sock_update_classid(sk);
1383                sock_update_netprioidx(sk);
1384        }
1385
1386        return sk;
1387}
1388EXPORT_SYMBOL(sk_alloc);
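
/*
 * An illustrative sketch of the usual sk_alloc() call site: a protocol
 * family's ->create() handler allocating the sock and wiring it to the
 * struct socket.  The names example_create, EXAMPLE_FAMILY,
 * example_proto and example_destruct are assumed for the example:
 *
 *      static int example_create(struct net *net, struct socket *sock,
 *                                int protocol, int kern)
 *      {
 *              struct sock *sk;
 *
 *              sk = sk_alloc(net, EXAMPLE_FAMILY, GFP_KERNEL, &example_proto);
 *              if (!sk)
 *                      return -ENOMEM;
 *
 *              sock_init_data(sock, sk);       // attach sk to sock, set defaults
 *              sk->sk_destruct = example_destruct;
 *              sk->sk_protocol = protocol;
 *              return 0;
 *      }
 */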
1389
1390static void __sk_free(struct sock *sk)
1391{
1392        struct sk_filter *filter;
1393
1394        if (sk->sk_destruct)
1395                sk->sk_destruct(sk);
1396
1397        filter = rcu_dereference_check(sk->sk_filter,
1398                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1399        if (filter) {
1400                sk_filter_uncharge(sk, filter);
1401                RCU_INIT_POINTER(sk->sk_filter, NULL);
1402        }
1403
1404        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1405
1406        if (atomic_read(&sk->sk_omem_alloc))
1407                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1408                         __func__, atomic_read(&sk->sk_omem_alloc));
1409
1410        if (sk->sk_peer_cred)
1411                put_cred(sk->sk_peer_cred);
1412        put_pid(sk->sk_peer_pid);
1413        put_net(sock_net(sk));
1414        sk_prot_free(sk->sk_prot_creator, sk);
1415}
1416
1417void sk_free(struct sock *sk)
1418{
1419        /*
 1420         * We subtract one from sk_wmem_alloc so we can tell whether
 1421         * some packets are still in some tx queue.
 1422         * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1423         */
1424        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1425                __sk_free(sk);
1426}
1427EXPORT_SYMBOL(sk_free);
1428
1429/*
 1430 * The last sock_put should drop the reference to sk->sk_net. It has already
 1431 * been dropped in sk_change_net. Taking a reference to the stopping namespace
 1432 * is not an option.
 1433 * Take a reference to the socket to remove it from the hash while it is still
 1434 * _alive_, and after that destroy it in the context of init_net.
1435 */
1436void sk_release_kernel(struct sock *sk)
1437{
1438        if (sk == NULL || sk->sk_socket == NULL)
1439                return;
1440
1441        sock_hold(sk);
1442        sock_release(sk->sk_socket);
1443        sock_net_set(sk, get_net(&init_net));
1444        sock_put(sk);
1445}
1446EXPORT_SYMBOL(sk_release_kernel);
1447
1448static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1449{
1450        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1451                sock_update_memcg(newsk);
1452}
1453
1454/**
1455 *      sk_clone_lock - clone a socket, and lock its clone
1456 *      @sk: the socket to clone
1457 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1458 *
1459 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1460 */
1461struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1462{
1463        struct sock *newsk;
1464
1465        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1466        if (newsk != NULL) {
1467                struct sk_filter *filter;
1468
1469                sock_copy(newsk, sk);
1470
1471                /* SANITY */
1472                get_net(sock_net(newsk));
1473                sk_node_init(&newsk->sk_node);
1474                sock_lock_init(newsk);
1475                bh_lock_sock(newsk);
1476                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1477                newsk->sk_backlog.len = 0;
1478
1479                atomic_set(&newsk->sk_rmem_alloc, 0);
1480                /*
1481                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1482                 */
1483                atomic_set(&newsk->sk_wmem_alloc, 1);
1484                atomic_set(&newsk->sk_omem_alloc, 0);
1485                skb_queue_head_init(&newsk->sk_receive_queue);
1486                skb_queue_head_init(&newsk->sk_write_queue);
1487
1488                rwlock_init(&newsk->sk_callback_lock);
1489                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1490                                af_callback_keys + newsk->sk_family,
1491                                af_family_clock_key_strings[newsk->sk_family]);
1492
1493                newsk->sk_dst_cache     = NULL;
1494                newsk->sk_dst_pending_confirm = 0;
1495                newsk->sk_wmem_queued   = 0;
1496                newsk->sk_forward_alloc = 0;
1497                newsk->sk_send_head     = NULL;
1498                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1499
1500                sock_reset_flag(newsk, SOCK_DONE);
1501                skb_queue_head_init(&newsk->sk_error_queue);
1502
1503                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1504                if (filter != NULL)
1505                        sk_filter_charge(newsk, filter);
1506
1507                if (unlikely(xfrm_sk_clone_policy(newsk))) {
 1508                        /* It is still a raw copy of the parent, so invalidate
 1509                         * the destructor and do a plain sk_free() */
1510                        newsk->sk_destruct = NULL;
1511                        bh_unlock_sock(newsk);
1512                        sk_free(newsk);
1513                        newsk = NULL;
1514                        goto out;
1515                }
1516
1517                newsk->sk_err      = 0;
1518                newsk->sk_priority = 0;
1519                /*
1520                 * Before updating sk_refcnt, we must commit prior changes to memory
1521                 * (Documentation/RCU/rculist_nulls.txt for details)
1522                 */
1523                smp_wmb();
1524                atomic_set(&newsk->sk_refcnt, 2);
1525
1526                /*
1527                 * Increment the counter in the same struct proto as the master
1528                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1529                 * is the same as sk->sk_prot->socks, as this field was copied
1530                 * with memcpy).
1531                 *
1532                 * This _changes_ the previous behaviour, where
1533                 * tcp_create_openreq_child was always incrementing the
1534                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1535                 * to be taken into account in all callers. -acme
1536                 */
1537                sk_refcnt_debug_inc(newsk);
1538                sk_set_socket(newsk, NULL);
1539                newsk->sk_wq = NULL;
1540
1541                sk_update_clone(sk, newsk);
1542
1543                if (newsk->sk_prot->sockets_allocated)
1544                        sk_sockets_allocated_inc(newsk);
1545
1546                if (sock_needs_netstamp(sk) &&
1547                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1548                        net_enable_timestamp();
1549        }
1550out:
1551        return newsk;
1552}
1553EXPORT_SYMBOL_GPL(sk_clone_lock);
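
/*
 * Editor's sketch (not part of sock.c): a minimal caller of sk_clone_lock(),
 * loosely modelled on how connection-oriented protocols clone a listener for
 * a new child socket. The function name is made up; protocol-specific child
 * setup is elided.
 */
static struct sock *example_clone_child(const struct sock *listener)
{
        struct sock *newsk = sk_clone_lock(listener, GFP_ATOMIC);

        if (newsk) {
                /* protocol-specific child initialisation would go here */
                bh_unlock_sock(newsk);  /* the clone is returned locked */
        }
        return newsk;
}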
1554
1555void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1556{
1557        sk_dst_set(sk, dst);
1558        sk->sk_route_caps = dst->dev->features;
1559        if (sk->sk_route_caps & NETIF_F_GSO)
1560                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1561        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1562        if (sk_can_gso(sk)) {
1563                if (dst->header_len) {
1564                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1565                } else {
1566                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1567                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1568                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1569                }
1570        }
1571}
1572EXPORT_SYMBOL_GPL(sk_setup_caps);
1573
1574/*
1575 *      Simple resource managers for sockets.
1576 */
1577
1578
1579/*
1580 * Write buffer destructor automatically called from kfree_skb.
1581 */
1582void sock_wfree(struct sk_buff *skb)
1583{
1584        struct sock *sk = skb->sk;
1585        unsigned int len = skb->truesize;
1586
1587        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1588                /*
1589                 * Keep a reference on sk_wmem_alloc; it will be released
1590                 * after the sk_write_space() call
1591                 */
1592                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1593                sk->sk_write_space(sk);
1594                len = 1;
1595        }
1596        /*
1597         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1598         * could not do because of in-flight packets
1599         */
1600        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1601                __sk_free(sk);
1602}
1603EXPORT_SYMBOL(sock_wfree);
1604
1605/*
1606 * Read buffer destructor automatically called from kfree_skb.
1607 */
1608void sock_rfree(struct sk_buff *skb)
1609{
1610        struct sock *sk = skb->sk;
1611        unsigned int len = skb->truesize;
1612
1613        atomic_sub(len, &sk->sk_rmem_alloc);
1614        sk_mem_uncharge(sk, len);
1615}
1616EXPORT_SYMBOL(sock_rfree);
1617
1618void sock_efree(struct sk_buff *skb)
1619{
1620        sock_put(skb->sk);
1621}
1622EXPORT_SYMBOL(sock_efree);
1623
1624#ifdef CONFIG_INET
1625void sock_edemux(struct sk_buff *skb)
1626{
1627        struct sock *sk = skb->sk;
1628
1629        if (sk->sk_state == TCP_TIME_WAIT)
1630                inet_twsk_put(inet_twsk(sk));
1631        else
1632                sock_put(sk);
1633}
1634EXPORT_SYMBOL(sock_edemux);
1635#endif
1636
1637kuid_t sock_i_uid(struct sock *sk)
1638{
1639        kuid_t uid;
1640
1641        read_lock_bh(&sk->sk_callback_lock);
1642        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1643        read_unlock_bh(&sk->sk_callback_lock);
1644        return uid;
1645}
1646EXPORT_SYMBOL(sock_i_uid);
1647
1648unsigned long sock_i_ino(struct sock *sk)
1649{
1650        unsigned long ino;
1651
1652        read_lock_bh(&sk->sk_callback_lock);
1653        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1654        read_unlock_bh(&sk->sk_callback_lock);
1655        return ino;
1656}
1657EXPORT_SYMBOL(sock_i_ino);
1658
1659/*
1660 * Allocate a skb from the socket's send buffer.
1661 */
1662struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1663                             gfp_t priority)
1664{
1665        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1666                struct sk_buff *skb = alloc_skb(size, priority);
1667                if (skb) {
1668                        skb_set_owner_w(skb, sk);
1669                        return skb;
1670                }
1671        }
1672        return NULL;
1673}
1674EXPORT_SYMBOL(sock_wmalloc);
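
/*
 * Editor's sketch: allocating a small skb charged to the socket's send
 * buffer. skb_set_owner_w() (done inside sock_wmalloc()) sets the destructor
 * to sock_wfree(), so a later kfree_skb() uncharges sk_wmem_alloc
 * automatically. Example only; the size is arbitrary.
 */
static struct sk_buff *example_alloc_probe(struct sock *sk)
{
        /* force == 0: fail once sk_wmem_alloc has reached sk_sndbuf */
        return sock_wmalloc(sk, 128, 0, GFP_ATOMIC);
}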
1675
1676/*
1677 * Allocate a skb from the socket's receive buffer.
1678 */
1679struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1680                             gfp_t priority)
1681{
1682        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1683                struct sk_buff *skb = alloc_skb(size, priority);
1684                if (skb) {
1685                        skb_set_owner_r(skb, sk);
1686                        return skb;
1687                }
1688        }
1689        return NULL;
1690}
1691
1692/*
1693 * Allocate a memory block from the socket's option memory buffer.
1694 */
1695void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1696{
1697        if ((unsigned int)size <= sysctl_optmem_max &&
1698            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1699                void *mem;
1700                /* First do the add, to avoid the race if kmalloc
1701                 * might sleep.
1702                 */
1703                atomic_add(size, &sk->sk_omem_alloc);
1704                mem = kmalloc(size, priority);
1705                if (mem)
1706                        return mem;
1707                atomic_sub(size, &sk->sk_omem_alloc);
1708        }
1709        return NULL;
1710}
1711EXPORT_SYMBOL(sock_kmalloc);
1712
1713/*
1714 * Free an option memory block.
1715 */
1716void sock_kfree_s(struct sock *sk, void *mem, int size)
1717{
1718        kfree(mem);
1719        atomic_sub(size, &sk->sk_omem_alloc);
1720}
1721EXPORT_SYMBOL(sock_kfree_s);
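
/*
 * Editor's sketch: a typical sock_kmalloc()/sock_kfree_s() pairing for
 * per-socket option state. The same size must be passed to both calls so
 * that sk_omem_alloc stays balanced. Not taken from any in-tree user.
 */
static int example_with_opt_blob(struct sock *sk, int len)
{
        void *blob = sock_kmalloc(sk, len, GFP_KERNEL);

        if (!blob)
                return -ENOBUFS;
        memset(blob, 0, len);
        /* ... use blob ... */
        sock_kfree_s(sk, blob, len);    /* uncharges exactly 'len' bytes */
        return 0;
}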
1722
1723/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1724   I think these locks should be removed for datagram sockets.
1725 */
1726static long sock_wait_for_wmem(struct sock *sk, long timeo)
1727{
1728        DEFINE_WAIT(wait);
1729
1730        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1731        for (;;) {
1732                if (!timeo)
1733                        break;
1734                if (signal_pending(current))
1735                        break;
1736                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1737                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1738                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1739                        break;
1740                if (sk->sk_shutdown & SEND_SHUTDOWN)
1741                        break;
1742                if (sk->sk_err)
1743                        break;
1744                timeo = schedule_timeout(timeo);
1745        }
1746        finish_wait(sk_sleep(sk), &wait);
1747        return timeo;
1748}
1749
1750
1751/*
1752 *      Generic send/receive buffer handlers
1753 */
1754
1755struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1756                                     unsigned long data_len, int noblock,
1757                                     int *errcode, int max_page_order)
1758{
1759        struct sk_buff *skb;
1760        long timeo;
1761        int err;
1762
1763        timeo = sock_sndtimeo(sk, noblock);
1764        for (;;) {
1765                err = sock_error(sk);
1766                if (err != 0)
1767                        goto failure;
1768
1769                err = -EPIPE;
1770                if (sk->sk_shutdown & SEND_SHUTDOWN)
1771                        goto failure;
1772
1773                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1774                        break;
1775
1776                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1777                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1778                err = -EAGAIN;
1779                if (!timeo)
1780                        goto failure;
1781                if (signal_pending(current))
1782                        goto interrupted;
1783                timeo = sock_wait_for_wmem(sk, timeo);
1784        }
1785        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1786                                   errcode, sk->sk_allocation);
1787        if (skb)
1788                skb_set_owner_w(skb, sk);
1789        return skb;
1790
1791interrupted:
1792        err = sock_intr_errno(timeo);
1793failure:
1794        *errcode = err;
1795        return NULL;
1796}
1797EXPORT_SYMBOL(sock_alloc_send_pskb);
1798
1799struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1800                                    int noblock, int *errcode)
1801{
1802        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1803}
1804EXPORT_SYMBOL(sock_alloc_send_skb);
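
/*
 * Editor's sketch: how a datagram sendmsg() path might use
 * sock_alloc_send_skb(), blocking according to the socket's send timeout
 * unless MSG_DONTWAIT was passed. The reserved headroom is arbitrary.
 */
static struct sk_buff *example_sendmsg_alloc(struct sock *sk, size_t len,
                                             int flags, int *err)
{
        struct sk_buff *skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
                                                  flags & MSG_DONTWAIT, err);

        if (skb)
                skb_reserve(skb, MAX_HEADER);   /* leave room for headers */
        return skb;
}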
1805
1806/* On 32bit arches, an skb frag is limited to 2^15 bytes */
1807#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1808
1809/**
1810 * skb_page_frag_refill - check that a page_frag contains enough room
1811 * @sz: minimum size of the fragment we want to get
1812 * @pfrag: pointer to page_frag
1813 * @gfp: priority for memory allocation
1814 *
1815 * Note: While this allocator tries to use high order pages, there is
1816 * no guarantee that allocations succeed. Therefore, @sz MUST be
1817 * less than or equal to PAGE_SIZE.
1818 */
1819bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1820{
1821        if (pfrag->page) {
1822                if (page_ref_count(pfrag->page) == 1) {
1823                        pfrag->offset = 0;
1824                        return true;
1825                }
1826                if (pfrag->offset + sz <= pfrag->size)
1827                        return true;
1828                put_page(pfrag->page);
1829        }
1830
1831        pfrag->offset = 0;
1832        if (SKB_FRAG_PAGE_ORDER) {
1833                pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
1834                                          __GFP_NOWARN | __GFP_NORETRY,
1835                                          SKB_FRAG_PAGE_ORDER);
1836                if (likely(pfrag->page)) {
1837                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1838                        return true;
1839                }
1840        }
1841        pfrag->page = alloc_page(gfp);
1842        if (likely(pfrag->page)) {
1843                pfrag->size = PAGE_SIZE;
1844                return true;
1845        }
1846        return false;
1847}
1848EXPORT_SYMBOL(skb_page_frag_refill);
1849
1850bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1851{
1852        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1853                return true;
1854
1855        sk_enter_memory_pressure(sk);
1856        sk_stream_moderate_sndbuf(sk);
1857        return false;
1858}
1859EXPORT_SYMBOL(sk_page_frag_refill);
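
/*
 * Editor's sketch: copying a small amount of data into the socket's page
 * fragment cache. sk_page_frag() returns the page_frag this helper refills;
 * the example refuses to split data across pages for simplicity.
 */
static int example_append_to_frag(struct sock *sk, const void *data, int len)
{
        struct page_frag *pfrag = sk_page_frag(sk);

        if (!sk_page_frag_refill(sk, pfrag))
                return -ENOMEM;
        if (len > pfrag->size - pfrag->offset)
                return -EMSGSIZE;
        memcpy(page_address(pfrag->page) + pfrag->offset, data, len);
        pfrag->offset += len;
        return 0;
}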
1860
1861static void __lock_sock(struct sock *sk)
1862        __releases(&sk->sk_lock.slock)
1863        __acquires(&sk->sk_lock.slock)
1864{
1865        DEFINE_WAIT(wait);
1866
1867        for (;;) {
1868                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1869                                        TASK_UNINTERRUPTIBLE);
1870                spin_unlock_bh(&sk->sk_lock.slock);
1871                schedule();
1872                spin_lock_bh(&sk->sk_lock.slock);
1873                if (!sock_owned_by_user(sk))
1874                        break;
1875        }
1876        finish_wait(&sk->sk_lock.wq, &wait);
1877}
1878
1879static void __release_sock(struct sock *sk)
1880        __releases(&sk->sk_lock.slock)
1881        __acquires(&sk->sk_lock.slock)
1882{
1883        struct sk_buff *skb = sk->sk_backlog.head;
1884
1885        do {
1886                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1887                bh_unlock_sock(sk);
1888
1889                do {
1890                        struct sk_buff *next = skb->next;
1891
1892                        prefetch(next);
1893                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1894                        skb->next = NULL;
1895                        sk_backlog_rcv(sk, skb);
1896
1897                        /*
1898                         * We are in process context here with softirqs
1899                         * disabled, so use cond_resched_softirq() to preempt.
1900                         * This is safe to do because we've taken the backlog
1901                         * queue private:
1902                         */
1903                        cond_resched_softirq();
1904
1905                        skb = next;
1906                } while (skb != NULL);
1907
1908                bh_lock_sock(sk);
1909        } while ((skb = sk->sk_backlog.head) != NULL);
1910
1911        /*
1912         * Doing the zeroing here guarantees we cannot loop forever
1913         * while a wild producer attempts to flood us.
1914         */
1915        sk->sk_backlog.len = 0;
1916}
1917
1918/**
1919 * sk_wait_data - wait for data to arrive at sk_receive_queue
1920 * @sk:    sock to wait on
1921 * @timeo: for how long
1922 * @skb:   last skb seen on sk_receive_queue
1923 *
1924 * Now socket state, including sk->sk_err, is changed only under the lock,
1925 * hence we may omit checks after joining the wait queue.
1926 * We check the receive queue before schedule() only as an optimization;
1927 * it is very likely that release_sock() added new data.
1928 */
1929int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1930{
1931        int rc;
1932        DEFINE_WAIT(wait);
1933
1934        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1935        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1936        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
1937        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1938        finish_wait(sk_sleep(sk), &wait);
1939        return rc;
1940}
1941EXPORT_SYMBOL(sk_wait_data);
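
/*
 * Editor's sketch: a much simplified blocking receive loop built on
 * sk_wait_data(), called with the socket lock held as real recvmsg()
 * implementations do. Error, shutdown and peek-offset handling are elided.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock)
{
        long timeo = sock_rcvtimeo(sk, noblock);
        struct sk_buff *skb;

        while (!(skb = skb_peek(&sk->sk_receive_queue))) {
                if (!timeo || signal_pending(current))
                        break;
                sk_wait_data(sk, &timeo, NULL);
        }
        return skb;
}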
1942
1943/**
1944 *      __sk_mem_raise_allocated - increase memory_allocated
1945 *      @sk: socket
1946 *      @size: memory size to allocate
1947 *      @amt: pages to allocate
1948 *      @kind: allocation type
1949 *
1950 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
1951 */
1952int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
1953{
1954        struct proto *prot = sk->sk_prot;
1955        int parent_status = UNDER_LIMIT;
1956        long allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1957
1958        /* Under limit. */
1959        if (parent_status == UNDER_LIMIT &&
1960                        allocated <= sk_prot_mem_limits(sk, 0)) {
1961                sk_leave_memory_pressure(sk);
1962                return 1;
1963        }
1964
1965        /* Under pressure. (we or our parents) */
1966        if ((parent_status > SOFT_LIMIT) ||
1967                        allocated > sk_prot_mem_limits(sk, 1))
1968                sk_enter_memory_pressure(sk);
1969
1970        /* Over hard limit (we or our parents) */
1971        if ((parent_status == OVER_LIMIT) ||
1972                        (allocated > sk_prot_mem_limits(sk, 2)))
1973                goto suppress_allocation;
1974
1975        /* guarantee minimum buffer size under pressure */
1976        if (kind == SK_MEM_RECV) {
1977                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1978                        return 1;
1979
1980        } else { /* SK_MEM_SEND */
1981                if (sk->sk_type == SOCK_STREAM) {
1982                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1983                                return 1;
1984                } else if (atomic_read(&sk->sk_wmem_alloc) <
1985                           prot->sysctl_wmem[0])
1986                                return 1;
1987        }
1988
1989        if (sk_has_memory_pressure(sk)) {
1990                int alloc;
1991
1992                if (!sk_under_memory_pressure(sk))
1993                        return 1;
1994                alloc = sk_sockets_allocated_read_positive(sk);
1995                if (sk_prot_mem_limits(sk, 2) > alloc *
1996                    sk_mem_pages(sk->sk_wmem_queued +
1997                                 atomic_read(&sk->sk_rmem_alloc) +
1998                                 sk->sk_forward_alloc))
1999                        return 1;
2000        }
2001
2002suppress_allocation:
2003
2004        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2005                sk_stream_moderate_sndbuf(sk);
2006
2007                /* Fail only if socket is _under_ its sndbuf.
2008                 * In this case we cannot block, so we have to fail.
2009                 */
2010                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2011                        return 1;
2012        }
2013
2014        trace_sock_exceed_buf_limit(sk, prot, allocated);
2015
2016        sk_memory_allocated_sub(sk, amt);
2017
2018        return 0;
2019}
2020EXPORT_SYMBOL(__sk_mem_raise_allocated);
2021
2022/**
2023 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2024 *      @sk: socket
2025 *      @size: memory size to allocate
2026 *      @kind: allocation type
2027 *
2028 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2029 *      rmem allocation. This function assumes that protocols which have
2030 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2031 */
2032int __sk_mem_schedule(struct sock *sk, int size, int kind)
2033{
2034        int ret, amt = sk_mem_pages(size);
2035
2036        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2037        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2038        if (!ret)
2039                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2040        return ret;
2041}
2042EXPORT_SYMBOL(__sk_mem_schedule);
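
/*
 * Editor's sketch: charging an skb against protocol memory accounting before
 * queueing it for reception. skb_set_owner_r() consumes the forward
 * allocation via sk_mem_charge() and sets sock_rfree() as the destructor,
 * which undoes the charge on free. Real protocols normally use helpers such
 * as sock_queue_rcv_skb(); this is only an illustration, and it assumes the
 * two-argument sk_data_ready() callback used in this tree.
 */
static int example_charge_and_queue(struct sock *sk, struct sk_buff *skb)
{
        if (!__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))
                return -ENOBUFS;
        skb_set_owner_r(skb, sk);
        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk, skb->len);
        return 0;
}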
2043
2044/**
2045 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2046 *      @sk: socket
2047 *      @amount: number of quanta
2048 *
2049 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2050 */
2051void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2052{
2053        sk_memory_allocated_sub(sk, amount);
2054
2055        if (sk_under_memory_pressure(sk) &&
2056            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2057                sk_leave_memory_pressure(sk);
2058}
2059EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2060
2061/**
2062 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2063 *      @sk: socket
2064 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2065 */
2066void __sk_mem_reclaim(struct sock *sk, int amount)
2067{
2068        amount >>= SK_MEM_QUANTUM_SHIFT;
2069        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2070        __sk_mem_reduce_allocated(sk, amount);
2071}
2072EXPORT_SYMBOL(__sk_mem_reclaim);
2073
2074
2075/*
2076 * Set of default routines for initialising struct proto_ops when
2077 * the protocol does not support a particular function. In certain
2078 * cases where it makes no sense for a protocol to have a "do nothing"
2079 * function, some default processing is provided.
2080 */
2081
2082int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2083{
2084        return -EOPNOTSUPP;
2085}
2086EXPORT_SYMBOL(sock_no_bind);
2087
2088int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2089                    int len, int flags)
2090{
2091        return -EOPNOTSUPP;
2092}
2093EXPORT_SYMBOL(sock_no_connect);
2094
2095int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2096{
2097        return -EOPNOTSUPP;
2098}
2099EXPORT_SYMBOL(sock_no_socketpair);
2100
2101int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2102{
2103        return -EOPNOTSUPP;
2104}
2105EXPORT_SYMBOL(sock_no_accept);
2106
2107int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2108                    int *len, int peer)
2109{
2110        return -EOPNOTSUPP;
2111}
2112EXPORT_SYMBOL(sock_no_getname);
2113
2114unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2115{
2116        return 0;
2117}
2118EXPORT_SYMBOL(sock_no_poll);
2119
2120int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2121{
2122        return -EOPNOTSUPP;
2123}
2124EXPORT_SYMBOL(sock_no_ioctl);
2125
2126int sock_no_listen(struct socket *sock, int backlog)
2127{
2128        return -EOPNOTSUPP;
2129}
2130EXPORT_SYMBOL(sock_no_listen);
2131
2132int sock_no_shutdown(struct socket *sock, int how)
2133{
2134        return -EOPNOTSUPP;
2135}
2136EXPORT_SYMBOL(sock_no_shutdown);
2137
2138int sock_no_setsockopt(struct socket *sock, int level, int optname,
2139                    char __user *optval, unsigned int optlen)
2140{
2141        return -EOPNOTSUPP;
2142}
2143EXPORT_SYMBOL(sock_no_setsockopt);
2144
2145int sock_no_getsockopt(struct socket *sock, int level, int optname,
2146                    char __user *optval, int __user *optlen)
2147{
2148        return -EOPNOTSUPP;
2149}
2150EXPORT_SYMBOL(sock_no_getsockopt);
2151
2152int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2153                    size_t len)
2154{
2155        return -EOPNOTSUPP;
2156}
2157EXPORT_SYMBOL(sock_no_sendmsg);
2158
2159int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2160                    size_t len, int flags)
2161{
2162        return -EOPNOTSUPP;
2163}
2164EXPORT_SYMBOL(sock_no_recvmsg);
2165
2166int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2167{
2168        /* Mirror missing mmap method error code */
2169        return -ENODEV;
2170}
2171EXPORT_SYMBOL(sock_no_mmap);
2172
2173ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2174{
2175        ssize_t res;
2176        struct msghdr msg = {.msg_flags = flags};
2177        struct kvec iov;
2178        char *kaddr = kmap(page);
2179        iov.iov_base = kaddr + offset;
2180        iov.iov_len = size;
2181        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2182        kunmap(page);
2183        return res;
2184}
2185EXPORT_SYMBOL(sock_no_sendpage);
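
/*
 * Editor's sketch: a hypothetical datagram-only protocol wiring the
 * sock_no_*() stubs into its proto_ops for operations it does not support.
 * The field list is trimmed to the stubs defined above; all names beginning
 * with "example" are made up.
 */
static const struct proto_ops example_dgram_ops = {
        .family         = PF_UNSPEC,            /* placeholder */
        .owner          = THIS_MODULE,
        .bind           = sock_no_bind,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = sock_no_getname,
        .ioctl          = sock_no_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};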
2186
2187/*
2188 *      Default Socket Callbacks
2189 */
2190
2191static void sock_def_wakeup(struct sock *sk)
2192{
2193        struct socket_wq *wq;
2194
2195        rcu_read_lock();
2196        wq = rcu_dereference(sk->sk_wq);
2197        if (wq_has_sleeper(wq))
2198                wake_up_interruptible_all(&wq->wait);
2199        rcu_read_unlock();
2200}
2201
2202static void sock_def_error_report(struct sock *sk)
2203{
2204        struct socket_wq *wq;
2205
2206        rcu_read_lock();
2207        wq = rcu_dereference(sk->sk_wq);
2208        if (wq_has_sleeper(wq))
2209                wake_up_interruptible_poll(&wq->wait, POLLERR);
2210        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2211        rcu_read_unlock();
2212}
2213
2214static void sock_def_readable(struct sock *sk, int len)
2215{
2216        struct socket_wq *wq;
2217
2218        rcu_read_lock();
2219        wq = rcu_dereference(sk->sk_wq);
2220        if (wq_has_sleeper(wq))
2221                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2222                                                POLLRDNORM | POLLRDBAND);
2223        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2224        rcu_read_unlock();
2225}
2226
2227static void sock_def_write_space(struct sock *sk)
2228{
2229        struct socket_wq *wq;
2230
2231        rcu_read_lock();
2232
2233        /* Do not wake up a writer until he can make "significant"
2234         * progress.  --DaveM
2235         */
2236        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2237                wq = rcu_dereference(sk->sk_wq);
2238                if (wq_has_sleeper(wq))
2239                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2240                                                POLLWRNORM | POLLWRBAND);
2241
2242                /* Should agree with poll, otherwise some programs break */
2243                if (sock_writeable(sk))
2244                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2245        }
2246
2247        rcu_read_unlock();
2248}
2249
2250static void sock_def_destruct(struct sock *sk)
2251{
2252        kfree(sk->sk_protinfo);
2253}
2254
2255void sk_send_sigurg(struct sock *sk)
2256{
2257        if (sk->sk_socket && sk->sk_socket->file)
2258                if (send_sigurg(&sk->sk_socket->file->f_owner))
2259                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2260}
2261EXPORT_SYMBOL(sk_send_sigurg);
2262
2263void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2264                    unsigned long expires)
2265{
2266        if (!mod_timer(timer, expires))
2267                sock_hold(sk);
2268}
2269EXPORT_SYMBOL(sk_reset_timer);
2270
2271void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2272{
2273        if (del_timer(timer))
2274                __sock_put(sk);
2275}
2276EXPORT_SYMBOL(sk_stop_timer);
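
/*
 * Editor's sketch: sk_reset_timer() takes a socket reference when it arms a
 * previously idle timer, so the handler (or a later sk_stop_timer()) has to
 * drop it again. Both functions below are made-up illustrations of that
 * reference discipline, not an existing protocol timer.
 */
static void example_timer_handler(unsigned long data)
{
        struct sock *sk = (struct sock *)data;

        /* ... protocol work ... */
        sock_put(sk);           /* balances the hold taken when arming */
}

static void example_arm_timer(struct sock *sk)
{
        sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}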
2277
2278void sock_init_data(struct socket *sock, struct sock *sk)
2279{
2280        skb_queue_head_init(&sk->sk_receive_queue);
2281        skb_queue_head_init(&sk->sk_write_queue);
2282        skb_queue_head_init(&sk->sk_error_queue);
2283
2284        sk->sk_send_head        =       NULL;
2285
2286        init_timer(&sk->sk_timer);
2287
2288        sk->sk_allocation       =       GFP_KERNEL;
2289        sk->sk_rcvbuf           =       sysctl_rmem_default;
2290        sk->sk_sndbuf           =       sysctl_wmem_default;
2291        sk->sk_state            =       TCP_CLOSE;
2292        sk_set_socket(sk, sock);
2293
2294        sock_set_flag(sk, SOCK_ZAPPED);
2295
2296        if (sock) {
2297                sk->sk_type     =       sock->type;
2298                sk->sk_wq       =       sock->wq;
2299                sock->sk        =       sk;
2300        } else
2301                sk->sk_wq       =       NULL;
2302
2303        rwlock_init(&sk->sk_callback_lock);
2304        lockdep_set_class_and_name(&sk->sk_callback_lock,
2305                        af_callback_keys + sk->sk_family,
2306                        af_family_clock_key_strings[sk->sk_family]);
2307
2308        sk->sk_state_change     =       sock_def_wakeup;
2309        sk->sk_data_ready       =       sock_def_readable;
2310        sk->sk_write_space      =       sock_def_write_space;
2311        sk->sk_error_report     =       sock_def_error_report;
2312        sk->sk_destruct         =       sock_def_destruct;
2313
2314        sk->sk_frag.page        =       NULL;
2315        sk->sk_frag.offset      =       0;
2316        sk->sk_peek_off         =       -1;
2317
2318        sk->sk_peer_pid         =       NULL;
2319        sk->sk_peer_cred        =       NULL;
2320        sk->sk_write_pending    =       0;
2321        sk->sk_rcvlowat         =       1;
2322        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2323        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2324
2325        sk->sk_stamp = ktime_set(-1L, 0);
2326
2327#ifdef CONFIG_NET_RX_BUSY_POLL
2328        sk->sk_napi_id          =       0;
2329        sk->sk_ll_usec          =       sysctl_net_busy_read;
2330#endif
2331
2332        sk->sk_max_pacing_rate = ~0U;
2333        sk->sk_pacing_rate = ~0U;
2334        /*
2335         * Before updating sk_refcnt, we must commit prior changes to memory
2336         * (Documentation/RCU/rculist_nulls.txt for details)
2337         */
2338        smp_wmb();
2339        atomic_set(&sk->sk_refcnt, 1);
2340        atomic_set(&sk->sk_drops, 0);
2341}
2342EXPORT_SYMBOL(sock_init_data);
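
/*
 * Editor's sketch: the usual shape of a protocol family's ->create() hook,
 * allocating a sock and initialising it with sock_init_data(). Assumptions:
 * the 4-argument sk_alloc() form used by this tree, and an example_proto
 * registered elsewhere with proto_register(); the "example" names and
 * PF_UNSPEC are placeholders only.
 */
static int example_create(struct net *net, struct socket *sock,
                          int protocol, int kern)
{
        struct sock *sk;

        sock->ops = &example_dgram_ops;         /* see sketch above */
        sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &example_proto);
        if (!sk)
                return -ENOBUFS;

        sock_init_data(sock, sk);               /* queues, callbacks, refcnt = 1 */
        sk->sk_protocol = protocol;
        return 0;
}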
2343
2344void lock_sock_nested(struct sock *sk, int subclass)
2345{
2346        might_sleep();
2347        spin_lock_bh(&sk->sk_lock.slock);
2348        if (sk->sk_lock.owned)
2349                __lock_sock(sk);
2350        sk->sk_lock.owned = 1;
2351        spin_unlock(&sk->sk_lock.slock);
2352        /*
2353         * The sk_lock has mutex_lock() semantics here:
2354         */
2355        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2356        local_bh_enable();
2357}
2358EXPORT_SYMBOL(lock_sock_nested);
2359
2360void release_sock(struct sock *sk)
2361{
2362        /*
2363         * The sk_lock has mutex_unlock() semantics:
2364         */
2365        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2366
2367        spin_lock_bh(&sk->sk_lock.slock);
2368        if (sk->sk_backlog.tail)
2369                __release_sock(sk);
2370
2371        /* Warning: release_cb() might need to release sk ownership,
2372         * i.e. call sock_release_ownership(sk) before us.
2373         */
2374        if (sk->sk_prot->release_cb)
2375                sk->sk_prot->release_cb(sk);
2376
2377        sock_release_ownership(sk);
2378        if (waitqueue_active(&sk->sk_lock.wq))
2379                wake_up(&sk->sk_lock.wq);
2380        spin_unlock_bh(&sk->sk_lock.slock);
2381}
2382EXPORT_SYMBOL(release_sock);
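
/*
 * Editor's sketch: the ordinary process-context pattern these helpers
 * implement. While the lock is owned, packets arriving from softirq context
 * are parked on the backlog and replayed by __release_sock() when
 * release_sock() runs.
 */
static void example_set_sndbuf(struct sock *sk, int val)
{
        lock_sock(sk);
        sk->sk_sndbuf = max_t(int, val, SOCK_MIN_SNDBUF);
        release_sock(sk);
}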
2383
2384/**
2385 * lock_sock_fast - fast version of lock_sock
2386 * @sk: socket
2387 *
2388 * This version should be used for very small sections, where the process won't block.
2389 * Returns false if the fast path is taken:
2390 *   sk_lock.slock locked, owned = 0, BH disabled
2391 * Returns true if the slow path is taken:
2392 *   sk_lock.slock unlocked, owned = 1, BH enabled
2393 */
2394bool lock_sock_fast(struct sock *sk)
2395{
2396        might_sleep();
2397        spin_lock_bh(&sk->sk_lock.slock);
2398
2399        if (!sk->sk_lock.owned)
2400                /*
2401                 * Note: the fast path returns with BH disabled and slock held
2402                 */
2403                return false;
2404
2405        __lock_sock(sk);
2406        sk->sk_lock.owned = 1;
2407        spin_unlock(&sk->sk_lock.slock);
2408        /*
2409         * The sk_lock has mutex_lock() semantics here:
2410         */
2411        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2412        local_bh_enable();
2413        return true;
2414}
2415EXPORT_SYMBOL(lock_sock_fast);
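
/*
 * Editor's sketch: the intended lock_sock_fast() usage. The return value is
 * handed back to unlock_sock_fast() so that either the spinlock (fast path)
 * or the full socket lock (slow path) is released correctly.
 */
static int example_read_drops(struct sock *sk)
{
        bool slow = lock_sock_fast(sk);
        int drops = atomic_read(&sk->sk_drops);

        unlock_sock_fast(sk, slow);
        return drops;
}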
2416
2417int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2418{
2419        struct timeval tv;
2420        if (!sock_flag(sk, SOCK_TIMESTAMP))
2421                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2422        tv = ktime_to_timeval(sk->sk_stamp);
2423        if (tv.tv_sec == -1)
2424                return -ENOENT;
2425        if (tv.tv_sec == 0) {
2426                sk->sk_stamp = ktime_get_real();
2427                tv = ktime_to_timeval(sk->sk_stamp);
2428        }
2429        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2430}
2431EXPORT_SYMBOL(sock_get_timestamp);
2432
2433int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2434{
2435        struct timespec ts;
2436        if (!sock_flag(sk, SOCK_TIMESTAMP))
2437                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2438        ts = ktime_to_timespec(sk->sk_stamp);
2439        if (ts.tv_sec == -1)
2440                return -ENOENT;
2441        if (ts.tv_sec == 0) {
2442                sk->sk_stamp = ktime_get_real();
2443                ts = ktime_to_timespec(sk->sk_stamp);
2444        }
2445        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2446}
2447EXPORT_SYMBOL(sock_get_timestampns);
2448
2449void sock_enable_timestamp(struct sock *sk, int flag)
2450{
2451        if (!sock_flag(sk, flag)) {
2452                unsigned long previous_flags = sk->sk_flags;
2453
2454                sock_set_flag(sk, flag);
2455                /*
2456                 * We just set one of the two flags that require net
2457                 * timestamping, but timestamping might have been on
2458                 * already because of the other one.
2459                 */
2460                if (sock_needs_netstamp(sk) &&
2461                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2462                        net_enable_timestamp();
2463        }
2464}
2465
2466/*
2467 *      Get a socket option on a socket.
2468 *
2469 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2470 *      asynchronous errors should be reported by getsockopt. We assume
2471 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2472 */
2473int sock_common_getsockopt(struct socket *sock, int level, int optname,
2474                           char __user *optval, int __user *optlen)
2475{
2476        struct sock *sk = sock->sk;
2477
2478        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2479}
2480EXPORT_SYMBOL(sock_common_getsockopt);
2481
2482#ifdef CONFIG_COMPAT
2483int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2484                                  char __user *optval, int __user *optlen)
2485{
2486        struct sock *sk = sock->sk;
2487
2488        if (sk->sk_prot->compat_getsockopt != NULL)
2489                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2490                                                      optval, optlen);
2491        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2492}
2493EXPORT_SYMBOL(compat_sock_common_getsockopt);
2494#endif
2495
2496int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2497                        struct msghdr *msg, size_t size, int flags)
2498{
2499        struct sock *sk = sock->sk;
2500        int addr_len = 0;
2501        int err;
2502
2503        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2504                                   flags & ~MSG_DONTWAIT, &addr_len);
2505        if (err >= 0)
2506                msg->msg_namelen = addr_len;
2507        return err;
2508}
2509EXPORT_SYMBOL(sock_common_recvmsg);
2510
2511/*
2512 *      Set socket options on an inet socket.
2513 */
2514int sock_common_setsockopt(struct socket *sock, int level, int optname,
2515                           char __user *optval, unsigned int optlen)
2516{
2517        struct sock *sk = sock->sk;
2518
2519        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2520}
2521EXPORT_SYMBOL(sock_common_setsockopt);
2522
2523#ifdef CONFIG_COMPAT
2524int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2525                                  char __user *optval, unsigned int optlen)
2526{
2527        struct sock *sk = sock->sk;
2528
2529        if (sk->sk_prot->compat_setsockopt != NULL)
2530                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2531                                                      optval, optlen);
2532        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2533}
2534EXPORT_SYMBOL(compat_sock_common_setsockopt);
2535#endif
2536
2537void sk_common_release(struct sock *sk)
2538{
2539        if (sk->sk_prot->destroy)
2540                sk->sk_prot->destroy(sk);
2541
2542        /*
2543         * Observation: when sk_common_release is called, processes have
2544         * no access to the socket, but the network stack still does.
2545         * Step one, detach it from networking:
2546         *
2547         * A. Remove from hash tables.
2548         */
2549
2550        sk->sk_prot->unhash(sk);
2551
2552        /*
2553         * At this point the socket cannot receive new packets, but it is possible
2554         * that some packets are in flight because some CPU is running the receiver
2555         * and did the hash table lookup before we unhashed the socket. They will
2556         * reach the receive queue and will be purged by the socket destructor.
2557         *
2558         * We also still have packets pending on the receive queue and, probably,
2559         * our own packets waiting in device queues. The socket destructor will
2560         * drain the receive queue, but transmitted packets will delay socket
2561         * destruction until the last reference is released.
2562         */
2563
2564        sock_orphan(sk);
2565
2566        xfrm_sk_free_policy(sk);
2567
2568        sk_refcnt_debug_release(sk);
2569
2570        if (sk->sk_frag.page) {
2571                put_page(sk->sk_frag.page);
2572                sk->sk_frag.page = NULL;
2573        }
2574
2575        sock_put(sk);
2576}
2577EXPORT_SYMBOL(sk_common_release);
2578
2579#ifdef CONFIG_PROC_FS
2580#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2581struct prot_inuse {
2582        int val[PROTO_INUSE_NR];
2583};
2584
2585static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2586
2587#ifdef CONFIG_NET_NS
2588void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2589{
2590        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2591}
2592EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2593
2594int sock_prot_inuse_get(struct net *net, struct proto *prot)
2595{
2596        int cpu, idx = prot->inuse_idx;
2597        int res = 0;
2598
2599        for_each_possible_cpu(cpu)
2600                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2601
2602        return res >= 0 ? res : 0;
2603}
2604EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2605
2606static int __net_init sock_inuse_init_net(struct net *net)
2607{
2608        net->core.inuse = alloc_percpu(struct prot_inuse);
2609        return net->core.inuse ? 0 : -ENOMEM;
2610}
2611
2612static void __net_exit sock_inuse_exit_net(struct net *net)
2613{
2614        free_percpu(net->core.inuse);
2615}
2616
2617static struct pernet_operations net_inuse_ops = {
2618        .init = sock_inuse_init_net,
2619        .exit = sock_inuse_exit_net,
2620};
2621
2622static __init int net_inuse_init(void)
2623{
2624        if (register_pernet_subsys(&net_inuse_ops))
2625                panic("Cannot initialize net inuse counters");
2626
2627        return 0;
2628}
2629
2630core_initcall(net_inuse_init);
2631#else
2632static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2633
2634void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2635{
2636        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2637}
2638EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2639
2640int sock_prot_inuse_get(struct net *net, struct proto *prot)
2641{
2642        int cpu, idx = prot->inuse_idx;
2643        int res = 0;
2644
2645        for_each_possible_cpu(cpu)
2646                res += per_cpu(prot_inuse, cpu).val[idx];
2647
2648        return res >= 0 ? res : 0;
2649}
2650EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2651#endif
2652
2653static void assign_proto_idx(struct proto *prot)
2654{
2655        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2656
2657        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2658                pr_err("PROTO_INUSE_NR exhausted\n");
2659                return;
2660        }
2661
2662        set_bit(prot->inuse_idx, proto_inuse_idx);
2663}
2664
2665static void release_proto_idx(struct proto *prot)
2666{
2667        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2668                clear_bit(prot->inuse_idx, proto_inuse_idx);
2669}
2670#else
2671static inline void assign_proto_idx(struct proto *prot)
2672{
2673}
2674
2675static inline void release_proto_idx(struct proto *prot)
2676{
2677}
2678#endif
2679
2680int proto_register(struct proto *prot, int alloc_slab)
2681{
2682        if (alloc_slab) {
2683                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2684                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2685                                        NULL);
2686
2687                if (prot->slab == NULL) {
2688                        pr_crit("%s: Can't create sock SLAB cache!\n",
2689                                prot->name);
2690                        goto out;
2691                }
2692
2693                if (prot->rsk_prot != NULL) {
2694                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2695                        if (prot->rsk_prot->slab_name == NULL)
2696                                goto out_free_sock_slab;
2697
2698                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2699                                                                 prot->rsk_prot->obj_size, 0,
2700                                                                 SLAB_HWCACHE_ALIGN, NULL);
2701
2702                        if (prot->rsk_prot->slab == NULL) {
2703                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2704                                        prot->name);
2705                                goto out_free_request_sock_slab_name;
2706                        }
2707                }
2708
2709                if (prot->twsk_prot != NULL) {
2710                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2711
2712                        if (prot->twsk_prot->twsk_slab_name == NULL)
2713                                goto out_free_request_sock_slab;
2714
2715                        prot->twsk_prot->twsk_slab =
2716                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2717                                                  prot->twsk_prot->twsk_obj_size,
2718                                                  0,
2719                                                  SLAB_HWCACHE_ALIGN |
2720                                                        prot->slab_flags,
2721                                                  NULL);
2722                        if (prot->twsk_prot->twsk_slab == NULL)
2723                                goto out_free_timewait_sock_slab_name;
2724                }
2725        }
2726
2727        mutex_lock(&proto_list_mutex);
2728        list_add(&prot->node, &proto_list);
2729        assign_proto_idx(prot);
2730        mutex_unlock(&proto_list_mutex);
2731        return 0;
2732
2733out_free_timewait_sock_slab_name:
2734        kfree(prot->twsk_prot->twsk_slab_name);
2735out_free_request_sock_slab:
2736        if (prot->rsk_prot && prot->rsk_prot->slab) {
2737                kmem_cache_destroy(prot->rsk_prot->slab);
2738                prot->rsk_prot->slab = NULL;
2739        }
2740out_free_request_sock_slab_name:
2741        if (prot->rsk_prot)
2742                kfree(prot->rsk_prot->slab_name);
2743out_free_sock_slab:
2744        kmem_cache_destroy(prot->slab);
2745        prot->slab = NULL;
2746out:
2747        return -ENOBUFS;
2748}
2749EXPORT_SYMBOL(proto_register);
2750
2751void proto_unregister(struct proto *prot)
2752{
2753        mutex_lock(&proto_list_mutex);
2754        release_proto_idx(prot);
2755        list_del(&prot->node);
2756        mutex_unlock(&proto_list_mutex);
2757
2758        if (prot->slab != NULL) {
2759                kmem_cache_destroy(prot->slab);
2760                prot->slab = NULL;
2761        }
2762
2763        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2764                kmem_cache_destroy(prot->rsk_prot->slab);
2765                kfree(prot->rsk_prot->slab_name);
2766                prot->rsk_prot->slab = NULL;
2767        }
2768
2769        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2770                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2771                kfree(prot->twsk_prot->twsk_slab_name);
2772                prot->twsk_prot->twsk_slab = NULL;
2773        }
2774}
2775EXPORT_SYMBOL(proto_unregister);
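
/*
 * Editor's sketch: how a protocol module typically pairs proto_register()
 * and proto_unregister() in its init/exit paths. example_proto is the
 * hypothetical protocol used in the sketches above; obj_size would normally
 * cover the protocol's private sock structure.
 */
static struct proto example_proto = {
        .name           = "EXAMPLE",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct sock),
};

static int __init example_proto_init(void)
{
        return proto_register(&example_proto, 1);       /* 1 => create a slab */
}

static void __exit example_proto_exit(void)
{
        proto_unregister(&example_proto);
}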
2776
2777#ifdef CONFIG_PROC_FS
2778static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2779        __acquires(proto_list_mutex)
2780{
2781        mutex_lock(&proto_list_mutex);
2782        return seq_list_start_head(&proto_list, *pos);
2783}
2784
2785static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2786{
2787        return seq_list_next(v, &proto_list, pos);
2788}
2789
2790static void proto_seq_stop(struct seq_file *seq, void *v)
2791        __releases(proto_list_mutex)
2792{
2793        mutex_unlock(&proto_list_mutex);
2794}
2795
2796static char proto_method_implemented(const void *method)
2797{
2798        return method == NULL ? 'n' : 'y';
2799}
2800static long sock_prot_memory_allocated(struct proto *proto)
2801{
2802        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2803}
2804
2805static char *sock_prot_memory_pressure(struct proto *proto)
2806{
2807        return proto->memory_pressure != NULL ?
2808        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2809}
2810
2811static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2812{
2813
2814        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2815                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2816                   proto->name,
2817                   proto->obj_size,
2818                   sock_prot_inuse_get(seq_file_net(seq), proto),
2819                   sock_prot_memory_allocated(proto),
2820                   sock_prot_memory_pressure(proto),
2821                   proto->max_header,
2822                   proto->slab == NULL ? "no" : "yes",
2823                   module_name(proto->owner),
2824                   proto_method_implemented(proto->close),
2825                   proto_method_implemented(proto->connect),
2826                   proto_method_implemented(proto->disconnect),
2827                   proto_method_implemented(proto->accept),
2828                   proto_method_implemented(proto->ioctl),
2829                   proto_method_implemented(proto->init),
2830                   proto_method_implemented(proto->destroy),
2831                   proto_method_implemented(proto->shutdown),
2832                   proto_method_implemented(proto->setsockopt),
2833                   proto_method_implemented(proto->getsockopt),
2834                   proto_method_implemented(proto->sendmsg),
2835                   proto_method_implemented(proto->recvmsg),
2836                   proto_method_implemented(proto->sendpage),
2837                   proto_method_implemented(proto->bind),
2838                   proto_method_implemented(proto->backlog_rcv),
2839                   proto_method_implemented(proto->hash),
2840                   proto_method_implemented(proto->unhash),
2841                   proto_method_implemented(proto->get_port),
2842                   proto_method_implemented(proto->enter_memory_pressure));
2843}
2844
2845static int proto_seq_show(struct seq_file *seq, void *v)
2846{
2847        if (v == &proto_list)
2848                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2849                           "protocol",
2850                           "size",
2851                           "sockets",
2852                           "memory",
2853                           "press",
2854                           "maxhdr",
2855                           "slab",
2856                           "module",
2857                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2858        else
2859                proto_seq_printf(seq, list_entry(v, struct proto, node));
2860        return 0;
2861}
2862
2863static const struct seq_operations proto_seq_ops = {
2864        .start  = proto_seq_start,
2865        .next   = proto_seq_next,
2866        .stop   = proto_seq_stop,
2867        .show   = proto_seq_show,
2868};
2869
2870static int proto_seq_open(struct inode *inode, struct file *file)
2871{
2872        return seq_open_net(inode, file, &proto_seq_ops,
2873                            sizeof(struct seq_net_private));
2874}
2875
2876static const struct file_operations proto_seq_fops = {
2877        .owner          = THIS_MODULE,
2878        .open           = proto_seq_open,
2879        .read           = seq_read,
2880        .llseek         = seq_lseek,
2881        .release        = seq_release_net,
2882};
2883
2884static __net_init int proto_init_net(struct net *net)
2885{
2886        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2887                return -ENOMEM;
2888
2889        return 0;
2890}
2891
2892static __net_exit void proto_exit_net(struct net *net)
2893{
2894        remove_proc_entry("protocols", net->proc_net);
2895}
2896
2897
2898static __net_initdata struct pernet_operations proto_net_ops = {
2899        .init = proto_init_net,
2900        .exit = proto_exit_net,
2901};
2902
2903static int __init proto_init(void)
2904{
2905        return register_pernet_subsys(&proto_net_ops);
2906}
2907
2908subsys_initcall(proto_init);
2909
2910#endif /* PROC_FS */
2911