linux/net/core/sock.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/types.h>
  97#include <linux/socket.h>
  98#include <linux/in.h>
  99#include <linux/kernel.h>
 100#include <linux/module.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/sched.h>
 104#include <linux/timer.h>
 105#include <linux/string.h>
 106#include <linux/sockios.h>
 107#include <linux/net.h>
 108#include <linux/mm.h>
 109#include <linux/slab.h>
 110#include <linux/interrupt.h>
 111#include <linux/poll.h>
 112#include <linux/tcp.h>
 113#include <linux/init.h>
 114#include <linux/highmem.h>
 115#include <linux/user_namespace.h>
 116#include <linux/static_key.h>
 117#include <linux/memcontrol.h>
 118#include <linux/prefetch.h>
 119
 120#include <asm/uaccess.h>
 121
 122#include <linux/netdevice.h>
 123#include <net/protocol.h>
 124#include <linux/skbuff.h>
 125#include <net/net_namespace.h>
 126#include <net/request_sock.h>
 127#include <net/sock.h>
 128#include <linux/net_tstamp.h>
 129#include <net/xfrm.h>
 130#include <linux/ipsec.h>
 131#include <net/cls_cgroup.h>
 132#include <net/netprio_cgroup.h>
 133
 134#include <linux/filter.h>
 135
 136#include <trace/events/sock.h>
 137
 138#include <net/tcp.h>
 139#include <net/busy_poll.h>
 140
 141static DEFINE_MUTEX(proto_list_mutex);
 142static LIST_HEAD(proto_list);
 143
 144/**
 145 * sk_ns_capable - General socket capability test
 146 * @sk: Socket to use a capability on or through
 147 * @user_ns: The user namespace of the capability to use
 148 * @cap: The capability to use
 149 *
 150 * Test to see if the opener of the socket had when the socket was
 151 * created and the current process has the capability @cap in the user
 152 * namespace @user_ns.
 153 */
 154bool sk_ns_capable(const struct sock *sk,
 155                   struct user_namespace *user_ns, int cap)
 156{
 157        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 158                ns_capable(user_ns, cap);
 159}
 160EXPORT_SYMBOL(sk_ns_capable);
 161
 162/**
 163 * sk_capable - Socket global capability test
 164 * @sk: Socket to use a capability on or through
 165 * @cap: The global capbility to use
 166 *
 167 * Test to see if the opener of the socket had when the socket was
 168 * created and the current process has the capability @cap in all user
 169 * namespaces.
 170 */
 171bool sk_capable(const struct sock *sk, int cap)
 172{
 173        return sk_ns_capable(sk, &init_user_ns, cap);
 174}
 175EXPORT_SYMBOL(sk_capable);
 176
 177/**
 178 * sk_net_capable - Network namespace socket capability test
 179 * @sk: Socket to use a capability on or through
 180 * @cap: The capability to use
 181 *
 182 * Test to see if the opener of the socket had when the socke was created
 183 * and the current process has the capability @cap over the network namespace
 184 * the socket is a member of.
 185 */
 186bool sk_net_capable(const struct sock *sk, int cap)
 187{
 188        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 189}
 190EXPORT_SYMBOL(sk_net_capable);
 191
 192
 193#ifdef CONFIG_MEMCG_KMEM
 194int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 195{
 196        struct proto *proto;
 197        int ret = 0;
 198
 199        mutex_lock(&proto_list_mutex);
 200        list_for_each_entry(proto, &proto_list, node) {
 201                if (proto->init_cgroup) {
 202                        ret = proto->init_cgroup(memcg, ss);
 203                        if (ret)
 204                                goto out;
 205                }
 206        }
 207
 208        mutex_unlock(&proto_list_mutex);
 209        return ret;
 210out:
 211        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 212                if (proto->destroy_cgroup)
 213                        proto->destroy_cgroup(memcg);
 214        mutex_unlock(&proto_list_mutex);
 215        return ret;
 216}
 217
 218void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 219{
 220        struct proto *proto;
 221
 222        mutex_lock(&proto_list_mutex);
 223        list_for_each_entry_reverse(proto, &proto_list, node)
 224                if (proto->destroy_cgroup)
 225                        proto->destroy_cgroup(memcg);
 226        mutex_unlock(&proto_list_mutex);
 227}
 228#endif
 229
 230/*
 231 * Each address family might have different locking rules, so we have
 232 * one slock key per address family:
 233 */
 234static struct lock_class_key af_family_keys[AF_MAX];
 235static struct lock_class_key af_family_slock_keys[AF_MAX];
 236
 237#if defined(CONFIG_MEMCG_KMEM)
 238struct static_key memcg_socket_limit_enabled;
 239EXPORT_SYMBOL(memcg_socket_limit_enabled);
 240#endif
 241
 242/*
 243 * Make lock validator output more readable. (we pre-construct these
 244 * strings build-time, so that runtime initialization of socket
 245 * locks is fast):
 246 */
 247static const char *const af_family_key_strings[AF_MAX+1] = {
 248  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 249  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 250  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 251  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 252  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 253  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 254  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 255  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 256  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 257  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 258  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 259  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 260  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 261  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 262};
 263static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 264  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 265  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 266  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 267  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 268  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 269  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 270  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 271  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 272  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 273  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 274  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 275  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 276  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 277  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
 278};
 279static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 280  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 281  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 282  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 283  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 284  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 285  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 286  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 287  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 288  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 289  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 290  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 291  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 292  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 293  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 294};
 295
 296/*
 297 * sk_callback_lock locking rules are per-address-family,
 298 * so split the lock classes by using a per-AF key:
 299 */
 300static struct lock_class_key af_callback_keys[AF_MAX];
 301
 302/* Take into consideration the size of the struct sk_buff overhead in the
 303 * determination of these values, since that is non-constant across
 304 * platforms.  This makes socket queueing behavior and performance
 305 * not depend upon such differences.
 306 */
 307#define _SK_MEM_PACKETS         256
 308#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 309#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 310#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 311
 312/* Run time adjustable parameters. */
 313__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 314EXPORT_SYMBOL(sysctl_wmem_max);
 315__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 316EXPORT_SYMBOL(sysctl_rmem_max);
 317__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 318__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 319
 320/* Maximal space eaten by iovec or ancillary data plus some space */
 321int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 322EXPORT_SYMBOL(sysctl_optmem_max);
 323
 324struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 325EXPORT_SYMBOL_GPL(memalloc_socks);
 326
 327/**
 328 * sk_set_memalloc - sets %SOCK_MEMALLOC
 329 * @sk: socket to set it on
 330 *
 331 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 332 * It's the responsibility of the admin to adjust min_free_kbytes
 333 * to meet the requirements
 334 */
 335void sk_set_memalloc(struct sock *sk)
 336{
 337        sock_set_flag(sk, SOCK_MEMALLOC);
 338        sk->sk_allocation |= __GFP_MEMALLOC;
 339        static_key_slow_inc(&memalloc_socks);
 340}
 341EXPORT_SYMBOL_GPL(sk_set_memalloc);
 342
 343void sk_clear_memalloc(struct sock *sk)
 344{
 345        sock_reset_flag(sk, SOCK_MEMALLOC);
 346        sk->sk_allocation &= ~__GFP_MEMALLOC;
 347        static_key_slow_dec(&memalloc_socks);
 348
 349        /*
 350         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 351         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 352         * it has rmem allocations there is a risk that the user of the
 353         * socket cannot make forward progress due to exceeding the rmem
 354         * limits. By rights, sk_clear_memalloc() should only be called
 355         * on sockets being torn down but warn and reset the accounting if
 356         * that assumption breaks.
 357         */
 358        if (WARN_ON(sk->sk_forward_alloc))
 359                sk_mem_reclaim(sk);
 360}
 361EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 362
 363int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 364{
 365        int ret;
 366        unsigned long pflags = current->flags;
 367
 368        /* these should have been dropped before queueing */
 369        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 370
 371        current->flags |= PF_MEMALLOC;
 372        ret = sk->sk_backlog_rcv(sk, skb);
 373        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 374
 375        return ret;
 376}
 377EXPORT_SYMBOL(__sk_backlog_rcv);
 378
 379static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 380{
 381        struct timeval tv;
 382
 383        if (optlen < sizeof(tv))
 384                return -EINVAL;
 385        if (copy_from_user(&tv, optval, sizeof(tv)))
 386                return -EFAULT;
 387        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 388                return -EDOM;
 389
 390        if (tv.tv_sec < 0) {
 391                static int warned __read_mostly;
 392
 393                *timeo_p = 0;
 394                if (warned < 10 && net_ratelimit()) {
 395                        warned++;
 396                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 397                                __func__, current->comm, task_pid_nr(current));
 398                }
 399                return 0;
 400        }
 401        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 402        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 403                return 0;
 404        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 405                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 406        return 0;
 407}
 408
 409static void sock_warn_obsolete_bsdism(const char *name)
 410{
 411        static int warned;
 412        static char warncomm[TASK_COMM_LEN];
 413        if (strcmp(warncomm, current->comm) && warned < 5) {
 414                strcpy(warncomm,  current->comm);
 415                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 416                        warncomm, name);
 417                warned++;
 418        }
 419}
 420
 421static bool sock_needs_netstamp(const struct sock *sk)
 422{
 423        switch (sk->sk_family) {
 424        case AF_UNSPEC:
 425        case AF_UNIX:
 426                return false;
 427        default:
 428                return true;
 429        }
 430}
 431
 432static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 433{
 434        if (sk->sk_flags & flags) {
 435                sk->sk_flags &= ~flags;
 436                if (sock_needs_netstamp(sk) &&
 437                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 438                        net_disable_timestamp();
 439        }
 440}
 441
 442
 443int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 444{
 445        int skb_len;
 446        unsigned long flags;
 447        struct sk_buff_head *list = &sk->sk_receive_queue;
 448
 449        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 450                atomic_inc(&sk->sk_drops);
 451                trace_sock_rcvqueue_full(sk, skb);
 452                return -ENOMEM;
 453        }
 454
 455        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 456                atomic_inc(&sk->sk_drops);
 457                return -ENOBUFS;
 458        }
 459
 460        skb->dev = NULL;
 461        skb_set_owner_r(skb, sk);
 462
 463        /* Cache the SKB length before we tack it onto the receive
 464         * queue.  Once it is added it no longer belongs to us and
 465         * may be freed by other threads of control pulling packets
 466         * from the queue.
 467         */
 468        skb_len = skb->len;
 469
 470        /* we escape from rcu protected region, make sure we dont leak
 471         * a norefcounted dst
 472         */
 473        skb_dst_force(skb);
 474
 475        spin_lock_irqsave(&list->lock, flags);
 476        sock_skb_set_dropcount(sk, skb);
 477        __skb_queue_tail(list, skb);
 478        spin_unlock_irqrestore(&list->lock, flags);
 479
 480        if (!sock_flag(sk, SOCK_DEAD))
 481                sk->sk_data_ready(sk, skb_len);
 482        return 0;
 483}
 484EXPORT_SYMBOL(__sock_queue_rcv_skb);
 485
 486int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 487{
 488        int err;
 489
 490        err = sk_filter(sk, skb);
 491        if (err)
 492                return err;
 493
 494        return __sock_queue_rcv_skb(sk, skb);
 495}
 496EXPORT_SYMBOL(sock_queue_rcv_skb);
 497
 498int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 499{
 500        int rc = NET_RX_SUCCESS;
 501
 502        if (sk_filter(sk, skb))
 503                goto discard_and_relse;
 504
 505        skb->dev = NULL;
 506
 507        if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 508                atomic_inc(&sk->sk_drops);
 509                goto discard_and_relse;
 510        }
 511        if (nested)
 512                bh_lock_sock_nested(sk);
 513        else
 514                bh_lock_sock(sk);
 515        if (!sock_owned_by_user(sk)) {
 516                /*
 517                 * trylock + unlock semantics:
 518                 */
 519                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 520
 521                rc = sk_backlog_rcv(sk, skb);
 522
 523                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 524        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 525                bh_unlock_sock(sk);
 526                atomic_inc(&sk->sk_drops);
 527                goto discard_and_relse;
 528        }
 529
 530        bh_unlock_sock(sk);
 531out:
 532        sock_put(sk);
 533        return rc;
 534discard_and_relse:
 535        kfree_skb(skb);
 536        goto out;
 537}
 538EXPORT_SYMBOL(sk_receive_skb);
 539
 540void sk_reset_txq(struct sock *sk)
 541{
 542        sk_tx_queue_clear(sk);
 543}
 544EXPORT_SYMBOL(sk_reset_txq);
 545
 546struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 547{
 548        struct dst_entry *dst = __sk_dst_get(sk);
 549
 550        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 551                sk_tx_queue_clear(sk);
 552                sk->sk_dst_pending_confirm = 0;
 553                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 554                dst_release(dst);
 555                return NULL;
 556        }
 557
 558        return dst;
 559}
 560EXPORT_SYMBOL(__sk_dst_check);
 561
 562struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 563{
 564        struct dst_entry *dst = sk_dst_get(sk);
 565
 566        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 567                sk_dst_reset(sk);
 568                dst_release(dst);
 569                return NULL;
 570        }
 571
 572        return dst;
 573}
 574EXPORT_SYMBOL(sk_dst_check);
 575
 576static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 577                                int optlen)
 578{
 579        int ret = -ENOPROTOOPT;
 580#ifdef CONFIG_NETDEVICES
 581        struct net *net = sock_net(sk);
 582        char devname[IFNAMSIZ];
 583        int index;
 584
 585        /* Sorry... */
 586        ret = -EPERM;
 587        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 588                goto out;
 589
 590        ret = -EINVAL;
 591        if (optlen < 0)
 592                goto out;
 593
 594        /* Bind this socket to a particular device like "eth0",
 595         * as specified in the passed interface name. If the
 596         * name is "" or the option length is zero the socket
 597         * is not bound.
 598         */
 599        if (optlen > IFNAMSIZ - 1)
 600                optlen = IFNAMSIZ - 1;
 601        memset(devname, 0, sizeof(devname));
 602
 603        ret = -EFAULT;
 604        if (copy_from_user(devname, optval, optlen))
 605                goto out;
 606
 607        index = 0;
 608        if (devname[0] != '\0') {
 609                struct net_device *dev;
 610
 611                rcu_read_lock();
 612                dev = dev_get_by_name_rcu(net, devname);
 613                if (dev)
 614                        index = dev->ifindex;
 615                rcu_read_unlock();
 616                ret = -ENODEV;
 617                if (!dev)
 618                        goto out;
 619        }
 620
 621        lock_sock(sk);
 622        sk->sk_bound_dev_if = index;
 623        sk_dst_reset(sk);
 624        release_sock(sk);
 625
 626        ret = 0;
 627
 628out:
 629#endif
 630
 631        return ret;
 632}
 633
 634static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 635                                int __user *optlen, int len)
 636{
 637        int ret = -ENOPROTOOPT;
 638#ifdef CONFIG_NETDEVICES
 639        struct net *net = sock_net(sk);
 640        char devname[IFNAMSIZ];
 641
 642        if (sk->sk_bound_dev_if == 0) {
 643                len = 0;
 644                goto zero;
 645        }
 646
 647        ret = -EINVAL;
 648        if (len < IFNAMSIZ)
 649                goto out;
 650
 651        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 652        if (ret)
 653                goto out;
 654
 655        len = strlen(devname) + 1;
 656
 657        ret = -EFAULT;
 658        if (copy_to_user(optval, devname, len))
 659                goto out;
 660
 661zero:
 662        ret = -EFAULT;
 663        if (put_user(len, optlen))
 664                goto out;
 665
 666        ret = 0;
 667
 668out:
 669#endif
 670
 671        return ret;
 672}
 673
 674static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 675{
 676        if (valbool)
 677                sock_set_flag(sk, bit);
 678        else
 679                sock_reset_flag(sk, bit);
 680}
 681
 682bool sk_mc_loop(struct sock *sk)
 683{
 684        if (dev_recursion_level())
 685                return false;
 686        if (!sk)
 687                return true;
 688        switch (sk->sk_family) {
 689        case AF_INET:
 690                return inet_sk(sk)->mc_loop;
 691#if IS_ENABLED(CONFIG_IPV6)
 692        case AF_INET6:
 693                return inet6_sk(sk)->mc_loop;
 694#endif
 695        }
 696        WARN_ON(1);
 697        return true;
 698}
 699EXPORT_SYMBOL(sk_mc_loop);
 700
 701/*
 702 *      This is meant for all protocols to use and covers goings on
 703 *      at the socket level. Everything here is generic.
 704 */
 705
 706int sock_setsockopt(struct socket *sock, int level, int optname,
 707                    char __user *optval, unsigned int optlen)
 708{
 709        struct sock *sk = sock->sk;
 710        int val;
 711        int valbool;
 712        struct linger ling;
 713        int ret = 0;
 714
 715        /*
 716         *      Options without arguments
 717         */
 718
 719        if (optname == SO_BINDTODEVICE)
 720                return sock_setbindtodevice(sk, optval, optlen);
 721
 722        if (optlen < sizeof(int))
 723                return -EINVAL;
 724
 725        if (get_user(val, (int __user *)optval))
 726                return -EFAULT;
 727
 728        valbool = val ? 1 : 0;
 729
 730        lock_sock(sk);
 731
 732        switch (optname) {
 733        case SO_DEBUG:
 734                if (val && !capable(CAP_NET_ADMIN))
 735                        ret = -EACCES;
 736                else
 737                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 738                break;
 739        case SO_REUSEADDR:
 740                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 741                break;
 742        case SO_REUSEPORT:
 743                sk->sk_reuseport = valbool;
 744                break;
 745        case SO_TYPE:
 746        case SO_PROTOCOL:
 747        case SO_DOMAIN:
 748        case SO_ERROR:
 749                ret = -ENOPROTOOPT;
 750                break;
 751        case SO_DONTROUTE:
 752                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 753                break;
 754        case SO_BROADCAST:
 755                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 756                break;
 757        case SO_SNDBUF:
 758                /* Don't error on this BSD doesn't and if you think
 759                 * about it this is right. Otherwise apps have to
 760                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 761                 * are treated in BSD as hints
 762                 */
 763                val = min_t(u32, val, sysctl_wmem_max);
 764set_sndbuf:
 765                /* Ensure val * 2 fits into an int, to prevent max_t()
 766                 * from treating it as a negative value.
 767                 */
 768                val = min_t(int, val, INT_MAX / 2);
 769                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 770                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 771                /* Wake up sending tasks if we upped the value. */
 772                sk->sk_write_space(sk);
 773                break;
 774
 775        case SO_SNDBUFFORCE:
 776                if (!capable(CAP_NET_ADMIN)) {
 777                        ret = -EPERM;
 778                        break;
 779                }
 780
 781                /* No negative values (to prevent underflow, as val will be
 782                 * multiplied by 2).
 783                 */
 784                if (val < 0)
 785                        val = 0;
 786                goto set_sndbuf;
 787
 788        case SO_RCVBUF:
 789                /* Don't error on this BSD doesn't and if you think
 790                 * about it this is right. Otherwise apps have to
 791                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 792                 * are treated in BSD as hints
 793                 */
 794                val = min_t(u32, val, sysctl_rmem_max);
 795set_rcvbuf:
 796                /* Ensure val * 2 fits into an int, to prevent max_t()
 797                 * from treating it as a negative value.
 798                 */
 799                val = min_t(int, val, INT_MAX / 2);
 800                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 801                /*
 802                 * We double it on the way in to account for
 803                 * "struct sk_buff" etc. overhead.   Applications
 804                 * assume that the SO_RCVBUF setting they make will
 805                 * allow that much actual data to be received on that
 806                 * socket.
 807                 *
 808                 * Applications are unaware that "struct sk_buff" and
 809                 * other overheads allocate from the receive buffer
 810                 * during socket buffer allocation.
 811                 *
 812                 * And after considering the possible alternatives,
 813                 * returning the value we actually used in getsockopt
 814                 * is the most desirable behavior.
 815                 */
 816                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 817                break;
 818
 819        case SO_RCVBUFFORCE:
 820                if (!capable(CAP_NET_ADMIN)) {
 821                        ret = -EPERM;
 822                        break;
 823                }
 824
 825                /* No negative values (to prevent underflow, as val will be
 826                 * multiplied by 2).
 827                 */
 828                if (val < 0)
 829                        val = 0;
 830                goto set_rcvbuf;
 831
 832        case SO_KEEPALIVE:
 833#ifdef CONFIG_INET
 834                if (sk->sk_protocol == IPPROTO_TCP &&
 835                    sk->sk_type == SOCK_STREAM)
 836                        tcp_set_keepalive(sk, valbool);
 837#endif
 838                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 839                break;
 840
 841        case SO_OOBINLINE:
 842                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 843                break;
 844
 845        case SO_NO_CHECK:
 846                sk->sk_no_check_tx = valbool;
 847                break;
 848
 849        case SO_PRIORITY:
 850                if ((val >= 0 && val <= 6) ||
 851                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 852                        sk->sk_priority = val;
 853                else
 854                        ret = -EPERM;
 855                break;
 856
 857        case SO_LINGER:
 858                if (optlen < sizeof(ling)) {
 859                        ret = -EINVAL;  /* 1003.1g */
 860                        break;
 861                }
 862                if (copy_from_user(&ling, optval, sizeof(ling))) {
 863                        ret = -EFAULT;
 864                        break;
 865                }
 866                if (!ling.l_onoff)
 867                        sock_reset_flag(sk, SOCK_LINGER);
 868                else {
 869#if (BITS_PER_LONG == 32)
 870                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 871                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 872                        else
 873#endif
 874                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 875                        sock_set_flag(sk, SOCK_LINGER);
 876                }
 877                break;
 878
 879        case SO_BSDCOMPAT:
 880                sock_warn_obsolete_bsdism("setsockopt");
 881                break;
 882
 883        case SO_PASSCRED:
 884                if (valbool)
 885                        set_bit(SOCK_PASSCRED, &sock->flags);
 886                else
 887                        clear_bit(SOCK_PASSCRED, &sock->flags);
 888                break;
 889
 890        case SO_TIMESTAMP:
 891        case SO_TIMESTAMPNS:
 892                if (valbool)  {
 893                        if (optname == SO_TIMESTAMP)
 894                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 895                        else
 896                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 897                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 898                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 899                } else {
 900                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 901                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 902                }
 903                break;
 904
 905        case SO_TIMESTAMPING:
 906                if (val & ~SOF_TIMESTAMPING_MASK ||
 907                    val & __RH_RESERVED_SOF_TIMESTAMPING_OPT_ID ||
 908                    val & __RH_RESERVED_SOF_TIMESTAMPING_TX_SCHED ||
 909                    val & __RH_RESERVED_SOF_TIMESTAMPING_TX_ACK ||
 910                    val & __RH_RESERVED_SOF_TIMESTAMPING_OPT_TSONLY ||
 911                    val & __RH_RESERVED_SOF_TIMESTAMPING_OPT_STATS) {
 912                        ret = -EINVAL;
 913                        break;
 914                }
 915                sk->sk_tsflags = val;
 916                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 917                        sock_enable_timestamp(sk,
 918                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 919                else
 920                        sock_disable_timestamp(sk,
 921                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 922                break;
 923
 924        case SO_RCVLOWAT:
 925                if (val < 0)
 926                        val = INT_MAX;
 927                sk->sk_rcvlowat = val ? : 1;
 928                break;
 929
 930        case SO_RCVTIMEO:
 931                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 932                break;
 933
 934        case SO_SNDTIMEO:
 935                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 936                break;
 937
 938        case SO_ATTACH_FILTER:
 939                ret = -EINVAL;
 940                if (optlen == sizeof(struct sock_fprog)) {
 941                        struct sock_fprog fprog;
 942
 943                        ret = -EFAULT;
 944                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 945                                break;
 946
 947                        ret = sk_attach_filter(&fprog, sk);
 948                }
 949                break;
 950
 951        case SO_DETACH_FILTER:
 952                ret = sk_detach_filter(sk);
 953                break;
 954
 955        case SO_LOCK_FILTER:
 956                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 957                        ret = -EPERM;
 958                else
 959                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 960                break;
 961
 962        case SO_PASSSEC:
 963                if (valbool)
 964                        set_bit(SOCK_PASSSEC, &sock->flags);
 965                else
 966                        clear_bit(SOCK_PASSSEC, &sock->flags);
 967                break;
 968        case SO_MARK:
 969                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 970                        ret = -EPERM;
 971                else
 972                        sk->sk_mark = val;
 973                break;
 974
 975                /* We implement the SO_SNDLOWAT etc to
 976                   not be settable (1003.1g 5.3) */
 977        case SO_RXQ_OVFL:
 978                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 979                break;
 980
 981        case SO_WIFI_STATUS:
 982                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 983                break;
 984
 985        case SO_PEEK_OFF:
 986                if (sock->ops->set_peek_off)
 987                        ret = sock->ops->set_peek_off(sk, val);
 988                else
 989                        ret = -EOPNOTSUPP;
 990                break;
 991
 992        case SO_NOFCS:
 993                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 994                break;
 995
 996        case SO_SELECT_ERR_QUEUE:
 997                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 998                break;
 999
1000#ifdef CONFIG_NET_RX_BUSY_POLL
1001        case SO_BUSY_POLL:
1002                /* allow unprivileged users to decrease the value */
1003                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1004                        ret = -EPERM;
1005                else {
1006                        if (val < 0)
1007                                ret = -EINVAL;
1008                        else
1009                                sk->sk_ll_usec = val;
1010                }
1011                break;
1012#endif
1013
1014        case SO_MAX_PACING_RATE:
1015                sk->sk_max_pacing_rate = val;
1016                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1017                                         sk->sk_max_pacing_rate);
1018                break;
1019
1020        default:
1021                ret = -ENOPROTOOPT;
1022                break;
1023        }
1024        release_sock(sk);
1025        return ret;
1026}
1027EXPORT_SYMBOL(sock_setsockopt);
1028
1029
1030void cred_to_ucred(struct pid *pid, const struct cred *cred,
1031                   struct ucred *ucred)
1032{
1033        ucred->pid = pid_vnr(pid);
1034        ucred->uid = ucred->gid = -1;
1035        if (cred) {
1036                struct user_namespace *current_ns = current_user_ns();
1037
1038                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1039                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1040        }
1041}
1042EXPORT_SYMBOL_GPL(cred_to_ucred);
1043
1044int sock_getsockopt(struct socket *sock, int level, int optname,
1045                    char __user *optval, int __user *optlen)
1046{
1047        struct sock *sk = sock->sk;
1048
1049        union {
1050                int val;
1051                struct linger ling;
1052                struct timeval tm;
1053        } v;
1054
1055        int lv = sizeof(int);
1056        int len;
1057
1058        if (get_user(len, optlen))
1059                return -EFAULT;
1060        if (len < 0)
1061                return -EINVAL;
1062
1063        memset(&v, 0, sizeof(v));
1064
1065        switch (optname) {
1066        case SO_DEBUG:
1067                v.val = sock_flag(sk, SOCK_DBG);
1068                break;
1069
1070        case SO_DONTROUTE:
1071                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1072                break;
1073
1074        case SO_BROADCAST:
1075                v.val = sock_flag(sk, SOCK_BROADCAST);
1076                break;
1077
1078        case SO_SNDBUF:
1079                v.val = sk->sk_sndbuf;
1080                break;
1081
1082        case SO_RCVBUF:
1083                v.val = sk->sk_rcvbuf;
1084                break;
1085
1086        case SO_REUSEADDR:
1087                v.val = sk->sk_reuse;
1088                break;
1089
1090        case SO_REUSEPORT:
1091                v.val = sk->sk_reuseport;
1092                break;
1093
1094        case SO_KEEPALIVE:
1095                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1096                break;
1097
1098        case SO_TYPE:
1099                v.val = sk->sk_type;
1100                break;
1101
1102        case SO_PROTOCOL:
1103                v.val = sk->sk_protocol;
1104                break;
1105
1106        case SO_DOMAIN:
1107                v.val = sk->sk_family;
1108                break;
1109
1110        case SO_ERROR:
1111                v.val = -sock_error(sk);
1112                if (v.val == 0)
1113                        v.val = xchg(&sk->sk_err_soft, 0);
1114                break;
1115
1116        case SO_OOBINLINE:
1117                v.val = sock_flag(sk, SOCK_URGINLINE);
1118                break;
1119
1120        case SO_NO_CHECK:
1121                v.val = sk->sk_no_check_tx;
1122                break;
1123
1124        case SO_PRIORITY:
1125                v.val = sk->sk_priority;
1126                break;
1127
1128        case SO_LINGER:
1129                lv              = sizeof(v.ling);
1130                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1131                v.ling.l_linger = sk->sk_lingertime / HZ;
1132                break;
1133
1134        case SO_BSDCOMPAT:
1135                sock_warn_obsolete_bsdism("getsockopt");
1136                break;
1137
1138        case SO_TIMESTAMP:
1139                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1140                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1141                break;
1142
1143        case SO_TIMESTAMPNS:
1144                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1145                break;
1146
1147        case SO_TIMESTAMPING:
1148                v.val = sk->sk_tsflags;
1149                break;
1150
1151        case SO_RCVTIMEO:
1152                lv = sizeof(struct timeval);
1153                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1154                        v.tm.tv_sec = 0;
1155                        v.tm.tv_usec = 0;
1156                } else {
1157                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1158                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1159                }
1160                break;
1161
1162        case SO_SNDTIMEO:
1163                lv = sizeof(struct timeval);
1164                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1165                        v.tm.tv_sec = 0;
1166                        v.tm.tv_usec = 0;
1167                } else {
1168                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1169                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1170                }
1171                break;
1172
1173        case SO_RCVLOWAT:
1174                v.val = sk->sk_rcvlowat;
1175                break;
1176
1177        case SO_SNDLOWAT:
1178                v.val = 1;
1179                break;
1180
1181        case SO_PASSCRED:
1182                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1183                break;
1184
1185        case SO_PEERCRED:
1186        {
1187                struct ucred peercred;
1188                if (len > sizeof(peercred))
1189                        len = sizeof(peercred);
1190                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1191                if (copy_to_user(optval, &peercred, len))
1192                        return -EFAULT;
1193                goto lenout;
1194        }
1195
1196        case SO_PEERNAME:
1197        {
1198                char address[128];
1199
1200                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1201                        return -ENOTCONN;
1202                if (lv < len)
1203                        return -EINVAL;
1204                if (copy_to_user(optval, address, len))
1205                        return -EFAULT;
1206                goto lenout;
1207        }
1208
1209        /* Dubious BSD thing... Probably nobody even uses it, but
1210         * the UNIX standard wants it for whatever reason... -DaveM
1211         */
1212        case SO_ACCEPTCONN:
1213                v.val = sk->sk_state == TCP_LISTEN;
1214                break;
1215
1216        case SO_PASSSEC:
1217                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1218                break;
1219
1220        case SO_PEERSEC:
1221                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1222
1223        case SO_MARK:
1224                v.val = sk->sk_mark;
1225                break;
1226
1227        case SO_RXQ_OVFL:
1228                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1229                break;
1230
1231        case SO_WIFI_STATUS:
1232                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1233                break;
1234
1235        case SO_PEEK_OFF:
1236                if (!sock->ops->set_peek_off)
1237                        return -EOPNOTSUPP;
1238
1239                v.val = sk->sk_peek_off;
1240                break;
1241        case SO_NOFCS:
1242                v.val = sock_flag(sk, SOCK_NOFCS);
1243                break;
1244
1245        case SO_BINDTODEVICE:
1246                return sock_getbindtodevice(sk, optval, optlen, len);
1247
1248        case SO_GET_FILTER:
1249                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1250                if (len < 0)
1251                        return len;
1252
1253                goto lenout;
1254
1255        case SO_LOCK_FILTER:
1256                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1257                break;
1258
1259        case SO_BPF_EXTENSIONS:
1260                v.val = bpf_tell_extensions();
1261                break;
1262
1263        case SO_SELECT_ERR_QUEUE:
1264                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1265                break;
1266
1267#ifdef CONFIG_NET_RX_BUSY_POLL
1268        case SO_BUSY_POLL:
1269                v.val = sk->sk_ll_usec;
1270                break;
1271#endif
1272
1273        case SO_MAX_PACING_RATE:
1274                v.val = sk->sk_max_pacing_rate;
1275                break;
1276
1277        default:
1278                return -ENOPROTOOPT;
1279        }
1280
1281        if (len > lv)
1282                len = lv;
1283        if (copy_to_user(optval, &v, len))
1284                return -EFAULT;
1285lenout:
1286        if (put_user(len, optlen))
1287                return -EFAULT;
1288        return 0;
1289}
1290
1291/*
1292 * Initialize an sk_lock.
1293 *
1294 * (We also register the sk_lock with the lock validator.)
1295 */
1296static inline void sock_lock_init(struct sock *sk)
1297{
1298        sock_lock_init_class_and_name(sk,
1299                        af_family_slock_key_strings[sk->sk_family],
1300                        af_family_slock_keys + sk->sk_family,
1301                        af_family_key_strings[sk->sk_family],
1302                        af_family_keys + sk->sk_family);
1303}
1304
1305/*
1306 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1307 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1308 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1309 */
1310static void sock_copy(struct sock *nsk, const struct sock *osk)
1311{
1312#ifdef CONFIG_SECURITY_NETWORK
1313        void *sptr = nsk->sk_security;
1314#endif
1315        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1316
1317        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1318               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1319
1320#ifdef CONFIG_SECURITY_NETWORK
1321        nsk->sk_security = sptr;
1322        security_sk_clone(osk, nsk);
1323#endif
1324}
1325
1326void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1327{
1328        unsigned long nulls1, nulls2;
1329
1330        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1331        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1332        if (nulls1 > nulls2)
1333                swap(nulls1, nulls2);
1334
1335        if (nulls1 != 0)
1336                memset((char *)sk, 0, nulls1);
1337        memset((char *)sk + nulls1 + sizeof(void *), 0,
1338               nulls2 - nulls1 - sizeof(void *));
1339        memset((char *)sk + nulls2 + sizeof(void *), 0,
1340               size - nulls2 - sizeof(void *));
1341}
1342EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1343
1344static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1345                int family)
1346{
1347        struct sock *sk;
1348        struct kmem_cache *slab;
1349
1350        slab = prot->slab;
1351        if (slab != NULL) {
1352                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1353                if (!sk)
1354                        return sk;
1355                if (priority & __GFP_ZERO) {
1356                        if (prot->clear_sk)
1357                                prot->clear_sk(sk, prot->obj_size);
1358                        else
1359                                sk_prot_clear_nulls(sk, prot->obj_size);
1360                }
1361        } else
1362                sk = kmalloc(prot->obj_size, priority);
1363
1364        if (sk != NULL) {
1365                kmemcheck_annotate_bitfield(sk, flags);
1366
1367                if (security_sk_alloc(sk, family, priority))
1368                        goto out_free;
1369
1370                if (!try_module_get(prot->owner))
1371                        goto out_free_sec;
1372                sk_tx_queue_clear(sk);
1373        }
1374
1375        return sk;
1376
1377out_free_sec:
1378        security_sk_free(sk);
1379out_free:
1380        if (slab != NULL)
1381                kmem_cache_free(slab, sk);
1382        else
1383                kfree(sk);
1384        return NULL;
1385}
1386
1387static void sk_prot_free(struct proto *prot, struct sock *sk)
1388{
1389        struct kmem_cache *slab;
1390        struct module *owner;
1391
1392        owner = prot->owner;
1393        slab = prot->slab;
1394
1395        security_sk_free(sk);
1396        if (slab != NULL)
1397                kmem_cache_free(slab, sk);
1398        else
1399                kfree(sk);
1400        module_put(owner);
1401}
1402
1403#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1404void sock_update_classid(struct sock *sk)
1405{
1406        u32 classid;
1407
1408        classid = task_cls_classid(current);
1409        if (classid != sk->sk_classid)
1410                sk->sk_classid = classid;
1411}
1412EXPORT_SYMBOL(sock_update_classid);
1413#endif
1414
1415#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1416void sock_update_netprioidx(struct sock *sk)
1417{
1418        if (in_interrupt())
1419                return;
1420
1421        sk->sk_cgrp_prioidx = task_netprioidx(current);
1422}
1423EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1424#endif
1425
1426/**
1427 *      sk_alloc - All socket objects are allocated here
1428 *      @net: the applicable net namespace
1429 *      @family: protocol family
1430 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1431 *      @prot: struct proto associated with this new sock instance
1432 */
1433struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1434                      struct proto *prot)
1435{
1436        struct sock *sk;
1437
1438        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1439        if (sk) {
1440                sk->sk_family = family;
1441                /*
1442                 * See comment in struct sock definition to understand
1443                 * why we need sk_prot_creator -acme
1444                 */
1445                sk->sk_prot = sk->sk_prot_creator = prot;
1446                sock_lock_init(sk);
1447                sock_net_set(sk, get_net(net));
1448                atomic_set(&sk->sk_wmem_alloc, 1);
1449
1450                sock_update_classid(sk);
1451                sock_update_netprioidx(sk);
1452        }
1453
1454        return sk;
1455}
1456EXPORT_SYMBOL(sk_alloc);
1457
1458/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1459 * grace period. This is the case for UDP sockets and TCP listeners.
1460 */
1461static void __sk_destruct(struct rcu_head *head)
1462{
1463        struct sock *sk = container_of(head, struct sock, sk_rcu);
1464        struct sk_filter *filter;
1465
1466        if (sk->sk_destruct)
1467                sk->sk_destruct(sk);
1468
1469        filter = rcu_dereference_check(sk->sk_filter,
1470                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1471        if (filter) {
1472                sk_filter_uncharge(sk, filter);
1473                RCU_INIT_POINTER(sk->sk_filter, NULL);
1474        }
1475
1476        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1477
1478        if (atomic_read(&sk->sk_omem_alloc))
1479                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1480                         __func__, atomic_read(&sk->sk_omem_alloc));
1481
1482        if (sk->sk_frag.page) {
1483                put_page(sk->sk_frag.page);
1484                sk->sk_frag.page = NULL;
1485        }
1486
1487        if (sk->sk_peer_cred)
1488                put_cred(sk->sk_peer_cred);
1489        put_pid(sk->sk_peer_pid);
1490        put_net(sock_net(sk));
1491        sk_prot_free(sk->sk_prot_creator, sk);
1492}
1493
1494void sk_destruct(struct sock *sk)
1495{
1496        if (sock_flag(sk, SOCK_RCU_FREE))
1497                call_rcu(&sk->sk_rcu, __sk_destruct);
1498        else
1499                __sk_destruct(&sk->sk_rcu);
1500}
1501
1502static void __sk_free(struct sock *sk)
1503{
1504        sk_destruct(sk);
1505}
1506
1507void sk_free(struct sock *sk)
1508{
1509        /*
1510         * We subtract one from sk_wmem_alloc and can know if
1511         * some packets are still in some tx queue.
1512         * If not null, sock_wfree() will call __sk_free(sk) later
1513         */
1514        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1515                __sk_free(sk);
1516}
1517EXPORT_SYMBOL(sk_free);
1518
1519/*
1520 * Last sock_put should drop reference to sk->sk_net. It has already
1521 * been dropped in sk_change_net. Taking reference to stopping namespace
1522 * is not an option.
1523 * Take reference to a socket to remove it from hash _alive_ and after that
1524 * destroy it in the context of init_net.
1525 */
1526void sk_release_kernel(struct sock *sk)
1527{
1528        if (sk == NULL || sk->sk_socket == NULL)
1529                return;
1530
1531        sock_hold(sk);
1532        sock_release(sk->sk_socket);
1533        sock_net_set(sk, get_net(&init_net));
1534        sock_put(sk);
1535}
1536EXPORT_SYMBOL(sk_release_kernel);
1537
1538static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1539{
1540        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1541                sock_update_memcg(newsk);
1542}
1543
1544/**
1545 *      sk_clone_lock - clone a socket, and lock its clone
1546 *      @sk: the socket to clone
1547 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1548 *
1549 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1550 */
1551struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1552{
1553        struct sock *newsk;
1554
1555        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1556        if (newsk != NULL) {
1557                struct sk_filter *filter;
1558
1559                sock_copy(newsk, sk);
1560
1561                newsk->sk_prot_creator = sk->sk_prot;
1562
1563                /* SANITY */
1564                get_net(sock_net(newsk));
1565                sk_node_init(&newsk->sk_node);
1566                sock_lock_init(newsk);
1567                bh_lock_sock(newsk);
1568                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1569                newsk->sk_backlog.len = 0;
1570
1571                atomic_set(&newsk->sk_rmem_alloc, 0);
1572                /*
1573                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1574                 */
1575                atomic_set(&newsk->sk_wmem_alloc, 1);
1576                atomic_set(&newsk->sk_omem_alloc, 0);
1577                skb_queue_head_init(&newsk->sk_receive_queue);
1578                skb_queue_head_init(&newsk->sk_write_queue);
1579
1580                rwlock_init(&newsk->sk_callback_lock);
1581                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1582                                af_callback_keys + newsk->sk_family,
1583                                af_family_clock_key_strings[newsk->sk_family]);
1584
1585                newsk->sk_dst_cache     = NULL;
1586                newsk->sk_dst_pending_confirm = 0;
1587                newsk->sk_wmem_queued   = 0;
1588                newsk->sk_forward_alloc = 0;
1589                newsk->sk_send_head     = NULL;
1590                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1591
1592                sock_reset_flag(newsk, SOCK_DONE);
1593                skb_queue_head_init(&newsk->sk_error_queue);
1594
1595                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1596                if (filter != NULL)
1597                        sk_filter_charge(newsk, filter);
1598
1599                if (unlikely(xfrm_sk_clone_policy(newsk))) {
1600                        /* It is still raw copy of parent, so invalidate
1601                         * destructor and make plain sk_free() */
1602                        newsk->sk_destruct = NULL;
1603                        bh_unlock_sock(newsk);
1604                        sk_free(newsk);
1605                        newsk = NULL;
1606                        goto out;
1607                }
1608
1609                newsk->sk_err      = 0;
1610                newsk->sk_priority = 0;
1611                /*
1612                 * Before updating sk_refcnt, we must commit prior changes to memory
1613                 * (Documentation/RCU/rculist_nulls.txt for details)
1614                 */
1615                smp_wmb();
1616                atomic_set(&newsk->sk_refcnt, 2);
1617
1618                /*
1619                 * Increment the counter in the same struct proto as the master
1620                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1621                 * is the same as sk->sk_prot->socks, as this field was copied
1622                 * with memcpy).
1623                 *
1624                 * This _changes_ the previous behaviour, where
1625                 * tcp_create_openreq_child always was incrementing the
1626                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1627                 * to be taken into account in all callers. -acme
1628                 */
1629                sk_refcnt_debug_inc(newsk);
1630                sk_set_socket(newsk, NULL);
1631                newsk->sk_wq = NULL;
1632
1633                sk_update_clone(sk, newsk);
1634
1635                if (newsk->sk_prot->sockets_allocated)
1636                        sk_sockets_allocated_inc(newsk);
1637
1638                if (sock_needs_netstamp(sk) &&
1639                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1640                        net_enable_timestamp();
1641        }
1642out:
1643        return newsk;
1644}
1645EXPORT_SYMBOL_GPL(sk_clone_lock);
1646
1647void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1648{
1649        sk_dst_set(sk, dst);
1650        sk->sk_route_caps = dst->dev->features;
1651        if (sk->sk_route_caps & NETIF_F_GSO)
1652                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1653        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1654        if (sk_can_gso(sk)) {
1655                if (dst->header_len) {
1656                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1657                } else {
1658                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1659                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1660                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1661                }
1662        }
1663}
1664EXPORT_SYMBOL_GPL(sk_setup_caps);
1665
1666/*
1667 *      Simple resource managers for sockets.
1668 */
1669
1670
1671/*
1672 * Write buffer destructor automatically called from kfree_skb.
1673 */
1674void sock_wfree(struct sk_buff *skb)
1675{
1676        struct sock *sk = skb->sk;
1677        unsigned int len = skb->truesize;
1678
1679        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1680                /*
1681                 * Keep a reference on sk_wmem_alloc, this will be released
1682                 * after sk_write_space() call
1683                 */
1684                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1685                sk->sk_write_space(sk);
1686                len = 1;
1687        }
1688        /*
1689         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1690         * could not do because of in-flight packets
1691         */
1692        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1693                __sk_free(sk);
1694}
1695EXPORT_SYMBOL(sock_wfree);
1696
1697/* This variant of sock_wfree() is used by TCP,
1698 * since it sets SOCK_USE_WRITE_QUEUE.
1699 */
1700void __sock_wfree(struct sk_buff *skb)
1701{
1702        struct sock *sk = skb->sk;
1703
1704        if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1705                __sk_free(sk);
1706}
1707
1708/* This helper is used by netem, as it can hold packets in its
1709 * delay queue. We want to allow the owner socket to send more
1710 * packets, as if they were already TX completed by a typical driver.
1711 * But we also want to keep skb->sk set because some packet schedulers
1712 * rely on it (sch_fq for example).
1713 */
1714void skb_orphan_partial(struct sk_buff *skb)
1715{
1716        if (skb_is_tcp_pure_ack(skb))
1717                return;
1718
1719        if (skb->destructor == sock_wfree
1720#ifdef CONFIG_INET
1721            || skb->destructor == tcp_wfree
1722#endif
1723                ) {
1724                struct sock *sk = skb->sk;
1725
1726                if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1727                        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1728                        skb->destructor = sock_efree;
1729                }
1730        } else {
1731                skb_orphan(skb);
1732        }
1733}
1734EXPORT_SYMBOL(skb_orphan_partial);
1735
1736/*
1737 * Read buffer destructor automatically called from kfree_skb.
1738 */
1739void sock_rfree(struct sk_buff *skb)
1740{
1741        struct sock *sk = skb->sk;
1742        unsigned int len = skb->truesize;
1743
1744        atomic_sub(len, &sk->sk_rmem_alloc);
1745        sk_mem_uncharge(sk, len);
1746}
1747EXPORT_SYMBOL(sock_rfree);
1748
1749void sock_efree(struct sk_buff *skb)
1750{
1751        sock_put(skb->sk);
1752}
1753EXPORT_SYMBOL(sock_efree);
1754
1755#ifdef CONFIG_INET
1756void sock_edemux(struct sk_buff *skb)
1757{
1758        struct sock *sk = skb->sk;
1759
1760        if (sk->sk_state == TCP_TIME_WAIT)
1761                inet_twsk_put(inet_twsk(sk));
1762        else
1763                sock_put(sk);
1764}
1765EXPORT_SYMBOL(sock_edemux);
1766#endif
1767
1768kuid_t sock_i_uid(struct sock *sk)
1769{
1770        kuid_t uid;
1771
1772        read_lock_bh(&sk->sk_callback_lock);
1773        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1774        read_unlock_bh(&sk->sk_callback_lock);
1775        return uid;
1776}
1777EXPORT_SYMBOL(sock_i_uid);
1778
1779unsigned long sock_i_ino(struct sock *sk)
1780{
1781        unsigned long ino;
1782
1783        read_lock_bh(&sk->sk_callback_lock);
1784        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1785        read_unlock_bh(&sk->sk_callback_lock);
1786        return ino;
1787}
1788EXPORT_SYMBOL(sock_i_ino);
1789
1790/*
1791 * Allocate a skb from the socket's send buffer.
1792 */
1793struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1794                             gfp_t priority)
1795{
1796        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1797                struct sk_buff *skb = alloc_skb(size, priority);
1798                if (skb) {
1799                        skb_set_owner_w(skb, sk);
1800                        return skb;
1801                }
1802        }
1803        return NULL;
1804}
1805EXPORT_SYMBOL(sock_wmalloc);
1806
1807/*
1808 * Allocate a skb from the socket's receive buffer.
1809 */
1810struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1811                             gfp_t priority)
1812{
1813        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1814                struct sk_buff *skb = alloc_skb(size, priority);
1815                if (skb) {
1816                        skb_set_owner_r(skb, sk);
1817                        return skb;
1818                }
1819        }
1820        return NULL;
1821}
1822
1823/*
1824 * Allocate a memory block from the socket's option memory buffer.
1825 */
1826void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1827{
1828        if ((unsigned int)size <= sysctl_optmem_max &&
1829            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1830                void *mem;
1831                /* First do the add, to avoid the race if kmalloc
1832                 * might sleep.
1833                 */
1834                atomic_add(size, &sk->sk_omem_alloc);
1835                mem = kmalloc(size, priority);
1836                if (mem)
1837                        return mem;
1838                atomic_sub(size, &sk->sk_omem_alloc);
1839        }
1840        return NULL;
1841}
1842EXPORT_SYMBOL(sock_kmalloc);
1843
1844/* Free an option memory block. Note, we actually want the inline
1845 * here as this allows gcc to detect the nullify and fold away the
1846 * condition entirely.
1847 */
1848static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1849                                  const bool nullify)
1850{
1851        if (WARN_ON_ONCE(!mem))
1852                return;
1853        if (nullify)
1854                kzfree(mem);
1855        else
1856                kfree(mem);
1857        atomic_sub(size, &sk->sk_omem_alloc);
1858}
1859
1860void sock_kfree_s(struct sock *sk, void *mem, int size)
1861{
1862        __sock_kfree_s(sk, mem, size, false);
1863}
1864EXPORT_SYMBOL(sock_kfree_s);
1865
1866void sock_kzfree_s(struct sock *sk, void *mem, int size)
1867{
1868        __sock_kfree_s(sk, mem, size, true);
1869}
1870EXPORT_SYMBOL(sock_kzfree_s);
1871
1872/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1873   I think, these locks should be removed for datagram sockets.
1874 */
1875static long sock_wait_for_wmem(struct sock *sk, long timeo)
1876{
1877        DEFINE_WAIT(wait);
1878
1879        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1880        for (;;) {
1881                if (!timeo)
1882                        break;
1883                if (signal_pending(current))
1884                        break;
1885                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1886                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1887                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1888                        break;
1889                if (sk->sk_shutdown & SEND_SHUTDOWN)
1890                        break;
1891                if (sk->sk_err)
1892                        break;
1893                timeo = schedule_timeout(timeo);
1894        }
1895        finish_wait(sk_sleep(sk), &wait);
1896        return timeo;
1897}
1898
1899
1900/*
1901 *      Generic send/receive buffer handlers
1902 */
1903
1904struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1905                                     unsigned long data_len, int noblock,
1906                                     int *errcode, int max_page_order)
1907{
1908        struct sk_buff *skb;
1909        long timeo;
1910        int err;
1911
1912        timeo = sock_sndtimeo(sk, noblock);
1913        for (;;) {
1914                err = sock_error(sk);
1915                if (err != 0)
1916                        goto failure;
1917
1918                err = -EPIPE;
1919                if (sk->sk_shutdown & SEND_SHUTDOWN)
1920                        goto failure;
1921
1922                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1923                        break;
1924
1925                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1926                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1927                err = -EAGAIN;
1928                if (!timeo)
1929                        goto failure;
1930                if (signal_pending(current))
1931                        goto interrupted;
1932                timeo = sock_wait_for_wmem(sk, timeo);
1933        }
1934        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1935                                   errcode, sk->sk_allocation);
1936        if (skb)
1937                skb_set_owner_w(skb, sk);
1938        return skb;
1939
1940interrupted:
1941        err = sock_intr_errno(timeo);
1942failure:
1943        *errcode = err;
1944        return NULL;
1945}
1946EXPORT_SYMBOL(sock_alloc_send_pskb);
1947
1948struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1949                                    int noblock, int *errcode)
1950{
1951        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1952}
1953EXPORT_SYMBOL(sock_alloc_send_skb);
1954
1955/* On 32bit arches, an skb frag is limited to 2^15 */
1956#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1957
1958/**
1959 * skb_page_frag_refill - check that a page_frag contains enough room
1960 * @sz: minimum size of the fragment we want to get
1961 * @pfrag: pointer to page_frag
1962 * @gfp: priority for memory allocation
1963 *
1964 * Note: While this allocator tries to use high order pages, there is
1965 * no guarantee that allocations succeed. Therefore, @sz MUST be
1966 * less or equal than PAGE_SIZE.
1967 */
1968bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1969{
1970        if (pfrag->page) {
1971                if (page_ref_count(pfrag->page) == 1) {
1972                        pfrag->offset = 0;
1973                        return true;
1974                }
1975                if (pfrag->offset + sz <= pfrag->size)
1976                        return true;
1977                put_page(pfrag->page);
1978        }
1979
1980        pfrag->offset = 0;
1981        if (SKB_FRAG_PAGE_ORDER) {
1982                pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
1983                                          __GFP_NOWARN | __GFP_NORETRY,
1984                                          SKB_FRAG_PAGE_ORDER);
1985                if (likely(pfrag->page)) {
1986                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1987                        return true;
1988                }
1989        }
1990        pfrag->page = alloc_page(gfp);
1991        if (likely(pfrag->page)) {
1992                pfrag->size = PAGE_SIZE;
1993                return true;
1994        }
1995        return false;
1996}
1997EXPORT_SYMBOL(skb_page_frag_refill);
1998
1999bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2000{
2001        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2002                return true;
2003
2004        sk_enter_memory_pressure(sk);
2005        sk_stream_moderate_sndbuf(sk);
2006        return false;
2007}
2008EXPORT_SYMBOL(sk_page_frag_refill);
2009
2010static void __lock_sock(struct sock *sk)
2011        __releases(&sk->sk_lock.slock)
2012        __acquires(&sk->sk_lock.slock)
2013{
2014        DEFINE_WAIT(wait);
2015
2016        for (;;) {
2017                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2018                                        TASK_UNINTERRUPTIBLE);
2019                spin_unlock_bh(&sk->sk_lock.slock);
2020                schedule();
2021                spin_lock_bh(&sk->sk_lock.slock);
2022                if (!sock_owned_by_user(sk))
2023                        break;
2024        }
2025        finish_wait(&sk->sk_lock.wq, &wait);
2026}
2027
2028static void __release_sock(struct sock *sk)
2029        __releases(&sk->sk_lock.slock)
2030        __acquires(&sk->sk_lock.slock)
2031{
2032        struct sk_buff *skb = sk->sk_backlog.head;
2033
2034        do {
2035                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2036                bh_unlock_sock(sk);
2037
2038                do {
2039                        struct sk_buff *next = skb->next;
2040
2041                        prefetch(next);
2042                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2043                        skb->next = NULL;
2044                        sk_backlog_rcv(sk, skb);
2045
2046                        /*
2047                         * We are in process context here with softirqs
2048                         * disabled, use cond_resched_softirq() to preempt.
2049                         * This is safe to do because we've taken the backlog
2050                         * queue private:
2051                         */
2052                        cond_resched_softirq();
2053
2054                        skb = next;
2055                } while (skb != NULL);
2056
2057                bh_lock_sock(sk);
2058        } while ((skb = sk->sk_backlog.head) != NULL);
2059
2060        /*
2061         * Doing the zeroing here guarantee we can not loop forever
2062         * while a wild producer attempts to flood us.
2063         */
2064        sk->sk_backlog.len = 0;
2065}
2066
2067/**
2068 * sk_wait_data - wait for data to arrive at sk_receive_queue
2069 * @sk:    sock to wait on
2070 * @timeo: for how long
2071 * @skb:   last skb seen on sk_receive_queue
2072 *
2073 * Now socket state including sk->sk_err is changed only under lock,
2074 * hence we may omit checks after joining wait queue.
2075 * We check receive queue before schedule() only as optimization;
2076 * it is very likely that release_sock() added new data.
2077 */
2078int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2079{
2080        int rc;
2081        DEFINE_WAIT(wait);
2082
2083        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2084        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2085        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2086        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2087        finish_wait(sk_sleep(sk), &wait);
2088        return rc;
2089}
2090EXPORT_SYMBOL(sk_wait_data);
2091
2092/**
2093 *      __sk_mem_raise_allocated - increase memory_allocated
2094 *      @sk: socket
2095 *      @size: memory size to allocate
2096 *      @amt: pages to allocate
2097 *      @kind: allocation type
2098 *
2099 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2100 */
2101int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2102{
2103        struct proto *prot = sk->sk_prot;
2104        int parent_status = UNDER_LIMIT;
2105        long allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2106
2107        /* Under limit. */
2108        if (parent_status == UNDER_LIMIT &&
2109                        allocated <= sk_prot_mem_limits(sk, 0)) {
2110                sk_leave_memory_pressure(sk);
2111                return 1;
2112        }
2113
2114        /* Under pressure. (we or our parents) */
2115        if ((parent_status > SOFT_LIMIT) ||
2116                        allocated > sk_prot_mem_limits(sk, 1))
2117                sk_enter_memory_pressure(sk);
2118
2119        /* Over hard limit (we or our parents) */
2120        if ((parent_status == OVER_LIMIT) ||
2121                        (allocated > sk_prot_mem_limits(sk, 2)))
2122                goto suppress_allocation;
2123
2124        /* guarantee minimum buffer size under pressure */
2125        if (kind == SK_MEM_RECV) {
2126                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2127                        return 1;
2128
2129        } else { /* SK_MEM_SEND */
2130                if (sk->sk_type == SOCK_STREAM) {
2131                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2132                                return 1;
2133                } else if (atomic_read(&sk->sk_wmem_alloc) <
2134                           prot->sysctl_wmem[0])
2135                                return 1;
2136        }
2137
2138        if (sk_has_memory_pressure(sk)) {
2139                int alloc;
2140
2141                if (!sk_under_memory_pressure(sk))
2142                        return 1;
2143                alloc = sk_sockets_allocated_read_positive(sk);
2144                if (sk_prot_mem_limits(sk, 2) > alloc *
2145                    sk_mem_pages(sk->sk_wmem_queued +
2146                                 atomic_read(&sk->sk_rmem_alloc) +
2147                                 sk->sk_forward_alloc))
2148                        return 1;
2149        }
2150
2151suppress_allocation:
2152
2153        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2154                sk_stream_moderate_sndbuf(sk);
2155
2156                /* Fail only if socket is _under_ its sndbuf.
2157                 * In this case we cannot block, so that we have to fail.
2158                 */
2159                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2160                        return 1;
2161        }
2162
2163        trace_sock_exceed_buf_limit(sk, prot, allocated);
2164
2165        sk_memory_allocated_sub(sk, amt);
2166
2167        return 0;
2168}
2169EXPORT_SYMBOL(__sk_mem_raise_allocated);
2170
2171/**
2172 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2173 *      @sk: socket
2174 *      @size: memory size to allocate
2175 *      @kind: allocation type
2176 *
2177 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2178 *      rmem allocation. This function assumes that protocols which have
2179 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2180 */
2181int __sk_mem_schedule(struct sock *sk, int size, int kind)
2182{
2183        int ret, amt = sk_mem_pages(size);
2184
2185        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2186        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2187        if (!ret)
2188                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2189        return ret;
2190}
2191EXPORT_SYMBOL(__sk_mem_schedule);
2192
2193/**
2194 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2195 *      @sk: socket
2196 *      @amount: number of quanta
2197 *
2198 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2199 */
2200void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2201{
2202        sk_memory_allocated_sub(sk, amount);
2203
2204        if (sk_under_memory_pressure(sk) &&
2205            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2206                sk_leave_memory_pressure(sk);
2207}
2208EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2209
2210/**
2211 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2212 *      @sk: socket
2213 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2214 */
2215void __sk_mem_reclaim(struct sock *sk, int amount)
2216{
2217        amount >>= SK_MEM_QUANTUM_SHIFT;
2218        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2219        __sk_mem_reduce_allocated(sk, amount);
2220}
2221EXPORT_SYMBOL(__sk_mem_reclaim);
2222
2223
2224/*
2225 * Set of default routines for initialising struct proto_ops when
2226 * the protocol does not support a particular function. In certain
2227 * cases where it makes no sense for a protocol to have a "do nothing"
2228 * function, some default processing is provided.
2229 */
2230
2231int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2232{
2233        return -EOPNOTSUPP;
2234}
2235EXPORT_SYMBOL(sock_no_bind);
2236
2237int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2238                    int len, int flags)
2239{
2240        return -EOPNOTSUPP;
2241}
2242EXPORT_SYMBOL(sock_no_connect);
2243
2244int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2245{
2246        return -EOPNOTSUPP;
2247}
2248EXPORT_SYMBOL(sock_no_socketpair);
2249
2250int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2251{
2252        return -EOPNOTSUPP;
2253}
2254EXPORT_SYMBOL(sock_no_accept);
2255
2256int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2257                    int *len, int peer)
2258{
2259        return -EOPNOTSUPP;
2260}
2261EXPORT_SYMBOL(sock_no_getname);
2262
2263unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2264{
2265        return 0;
2266}
2267EXPORT_SYMBOL(sock_no_poll);
2268
2269int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2270{
2271        return -EOPNOTSUPP;
2272}
2273EXPORT_SYMBOL(sock_no_ioctl);
2274
2275int sock_no_listen(struct socket *sock, int backlog)
2276{
2277        return -EOPNOTSUPP;
2278}
2279EXPORT_SYMBOL(sock_no_listen);
2280
2281int sock_no_shutdown(struct socket *sock, int how)
2282{
2283        return -EOPNOTSUPP;
2284}
2285EXPORT_SYMBOL(sock_no_shutdown);
2286
2287int sock_no_setsockopt(struct socket *sock, int level, int optname,
2288                    char __user *optval, unsigned int optlen)
2289{
2290        return -EOPNOTSUPP;
2291}
2292EXPORT_SYMBOL(sock_no_setsockopt);
2293
2294int sock_no_getsockopt(struct socket *sock, int level, int optname,
2295                    char __user *optval, int __user *optlen)
2296{
2297        return -EOPNOTSUPP;
2298}
2299EXPORT_SYMBOL(sock_no_getsockopt);
2300
2301int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2302                    size_t len)
2303{
2304        return -EOPNOTSUPP;
2305}
2306EXPORT_SYMBOL(sock_no_sendmsg);
2307
2308int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2309                    size_t len, int flags)
2310{
2311        return -EOPNOTSUPP;
2312}
2313EXPORT_SYMBOL(sock_no_recvmsg);
2314
2315int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2316{
2317        /* Mirror missing mmap method error code */
2318        return -ENODEV;
2319}
2320EXPORT_SYMBOL(sock_no_mmap);
2321
2322ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2323{
2324        ssize_t res;
2325        struct msghdr msg = {.msg_flags = flags};
2326        struct kvec iov;
2327        char *kaddr = kmap(page);
2328        iov.iov_base = kaddr + offset;
2329        iov.iov_len = size;
2330        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2331        kunmap(page);
2332        return res;
2333}
2334EXPORT_SYMBOL(sock_no_sendpage);
2335
2336/*
2337 *      Default Socket Callbacks
2338 */
2339
2340static void sock_def_wakeup(struct sock *sk)
2341{
2342        struct socket_wq *wq;
2343
2344        rcu_read_lock();
2345        wq = rcu_dereference(sk->sk_wq);
2346        if (wq_has_sleeper(wq))
2347                wake_up_interruptible_all(&wq->wait);
2348        rcu_read_unlock();
2349}
2350
2351static void sock_def_error_report(struct sock *sk)
2352{
2353        struct socket_wq *wq;
2354
2355        rcu_read_lock();
2356        wq = rcu_dereference(sk->sk_wq);
2357        if (wq_has_sleeper(wq))
2358                wake_up_interruptible_poll(&wq->wait, POLLERR);
2359        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2360        rcu_read_unlock();
2361}
2362
2363static void sock_def_readable(struct sock *sk, int len)
2364{
2365        struct socket_wq *wq;
2366
2367        rcu_read_lock();
2368        wq = rcu_dereference(sk->sk_wq);
2369        if (wq_has_sleeper(wq))
2370                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2371                                                POLLRDNORM | POLLRDBAND);
2372        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2373        rcu_read_unlock();
2374}
2375
2376static void sock_def_write_space(struct sock *sk)
2377{
2378        struct socket_wq *wq;
2379
2380        rcu_read_lock();
2381
2382        /* Do not wake up a writer until he can make "significant"
2383         * progress.  --DaveM
2384         */
2385        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2386                wq = rcu_dereference(sk->sk_wq);
2387                if (wq_has_sleeper(wq))
2388                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2389                                                POLLWRNORM | POLLWRBAND);
2390
2391                /* Should agree with poll, otherwise some programs break */
2392                if (sock_writeable(sk))
2393                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2394        }
2395
2396        rcu_read_unlock();
2397}
2398
2399static void sock_def_destruct(struct sock *sk)
2400{
2401        kfree(sk->sk_protinfo);
2402}
2403
2404void sk_send_sigurg(struct sock *sk)
2405{
2406        if (sk->sk_socket && sk->sk_socket->file)
2407                if (send_sigurg(&sk->sk_socket->file->f_owner))
2408                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2409}
2410EXPORT_SYMBOL(sk_send_sigurg);
2411
2412void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2413                    unsigned long expires)
2414{
2415        if (!mod_timer(timer, expires))
2416                sock_hold(sk);
2417}
2418EXPORT_SYMBOL(sk_reset_timer);
2419
2420void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2421{
2422        if (del_timer(timer))
2423                __sock_put(sk);
2424}
2425EXPORT_SYMBOL(sk_stop_timer);
2426
2427void sock_init_data(struct socket *sock, struct sock *sk)
2428{
2429        skb_queue_head_init(&sk->sk_receive_queue);
2430        skb_queue_head_init(&sk->sk_write_queue);
2431        skb_queue_head_init(&sk->sk_error_queue);
2432
2433        sk->sk_send_head        =       NULL;
2434
2435        init_timer(&sk->sk_timer);
2436
2437        sk->sk_allocation       =       GFP_KERNEL;
2438        sk->sk_rcvbuf           =       sysctl_rmem_default;
2439        sk->sk_sndbuf           =       sysctl_wmem_default;
2440        sk->sk_state            =       TCP_CLOSE;
2441        sk_set_socket(sk, sock);
2442
2443        sock_set_flag(sk, SOCK_ZAPPED);
2444
2445        if (sock) {
2446                sk->sk_type     =       sock->type;
2447                sk->sk_wq       =       sock->wq;
2448                sock->sk        =       sk;
2449        } else
2450                sk->sk_wq       =       NULL;
2451
2452        rwlock_init(&sk->sk_callback_lock);
2453        lockdep_set_class_and_name(&sk->sk_callback_lock,
2454                        af_callback_keys + sk->sk_family,
2455                        af_family_clock_key_strings[sk->sk_family]);
2456
2457        sk->sk_state_change     =       sock_def_wakeup;
2458        sk->sk_data_ready       =       sock_def_readable;
2459        sk->sk_write_space      =       sock_def_write_space;
2460        sk->sk_error_report     =       sock_def_error_report;
2461        sk->sk_destruct         =       sock_def_destruct;
2462
2463        sk->sk_frag.page        =       NULL;
2464        sk->sk_frag.offset      =       0;
2465        sk->sk_peek_off         =       -1;
2466
2467        sk->sk_peer_pid         =       NULL;
2468        sk->sk_peer_cred        =       NULL;
2469        sk->sk_write_pending    =       0;
2470        sk->sk_rcvlowat         =       1;
2471        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2472        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2473
2474        sk->sk_stamp = ktime_set(-1L, 0);
2475
2476#ifdef CONFIG_NET_RX_BUSY_POLL
2477        sk->sk_napi_id          =       0;
2478        sk->sk_ll_usec          =       sysctl_net_busy_read;
2479#endif
2480
2481        sk->sk_max_pacing_rate = ~0U;
2482        sk->sk_pacing_rate = ~0U;
2483        /*
2484         * Before updating sk_refcnt, we must commit prior changes to memory
2485         * (Documentation/RCU/rculist_nulls.txt for details)
2486         */
2487        smp_wmb();
2488        atomic_set(&sk->sk_refcnt, 1);
2489        atomic_set(&sk->sk_drops, 0);
2490}
2491EXPORT_SYMBOL(sock_init_data);
2492
2493void lock_sock_nested(struct sock *sk, int subclass)
2494{
2495        might_sleep();
2496        spin_lock_bh(&sk->sk_lock.slock);
2497        if (sk->sk_lock.owned)
2498                __lock_sock(sk);
2499        sk->sk_lock.owned = 1;
2500        spin_unlock(&sk->sk_lock.slock);
2501        /*
2502         * The sk_lock has mutex_lock() semantics here:
2503         */
2504        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2505        local_bh_enable();
2506}
2507EXPORT_SYMBOL(lock_sock_nested);
2508
2509void release_sock(struct sock *sk)
2510{
2511        spin_lock_bh(&sk->sk_lock.slock);
2512        if (sk->sk_backlog.tail)
2513                __release_sock(sk);
2514
2515        /* Warning : release_cb() might need to release sk ownership,
2516         * ie call sock_release_ownership(sk) before us.
2517         */
2518        if (sk->sk_prot->release_cb)
2519                sk->sk_prot->release_cb(sk);
2520
2521        sock_release_ownership(sk);
2522        if (waitqueue_active(&sk->sk_lock.wq))
2523                wake_up(&sk->sk_lock.wq);
2524        spin_unlock_bh(&sk->sk_lock.slock);
2525}
2526EXPORT_SYMBOL(release_sock);
2527
2528/**
2529 * lock_sock_fast - fast version of lock_sock
2530 * @sk: socket
2531 *
2532 * This version should be used for very small section, where process wont block
2533 * return false if fast path is taken
2534 *   sk_lock.slock locked, owned = 0, BH disabled
2535 * return true if slow path is taken
2536 *   sk_lock.slock unlocked, owned = 1, BH enabled
2537 */
2538bool lock_sock_fast(struct sock *sk)
2539{
2540        might_sleep();
2541        spin_lock_bh(&sk->sk_lock.slock);
2542
2543        if (!sk->sk_lock.owned)
2544                /*
2545                 * Note : We must disable BH
2546                 */
2547                return false;
2548
2549        __lock_sock(sk);
2550        sk->sk_lock.owned = 1;
2551        spin_unlock(&sk->sk_lock.slock);
2552        /*
2553         * The sk_lock has mutex_lock() semantics here:
2554         */
2555        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2556        local_bh_enable();
2557        return true;
2558}
2559EXPORT_SYMBOL(lock_sock_fast);
2560
2561int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2562{
2563        struct timeval tv;
2564        if (!sock_flag(sk, SOCK_TIMESTAMP))
2565                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2566        tv = ktime_to_timeval(sk->sk_stamp);
2567        if (tv.tv_sec == -1)
2568                return -ENOENT;
2569        if (tv.tv_sec == 0) {
2570                sk->sk_stamp = ktime_get_real();
2571                tv = ktime_to_timeval(sk->sk_stamp);
2572        }
2573        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2574}
2575EXPORT_SYMBOL(sock_get_timestamp);
2576
2577int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2578{
2579        struct timespec ts;
2580        if (!sock_flag(sk, SOCK_TIMESTAMP))
2581                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2582        ts = ktime_to_timespec(sk->sk_stamp);
2583        if (ts.tv_sec == -1)
2584                return -ENOENT;
2585        if (ts.tv_sec == 0) {
2586                sk->sk_stamp = ktime_get_real();
2587                ts = ktime_to_timespec(sk->sk_stamp);
2588        }
2589        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2590}
2591EXPORT_SYMBOL(sock_get_timestampns);
2592
2593void sock_enable_timestamp(struct sock *sk, int flag)
2594{
2595        if (!sock_flag(sk, flag)) {
2596                unsigned long previous_flags = sk->sk_flags;
2597
2598                sock_set_flag(sk, flag);
2599                /*
2600                 * we just set one of the two flags which require net
2601                 * time stamping, but time stamping might have been on
2602                 * already because of the other one
2603                 */
2604                if (sock_needs_netstamp(sk) &&
2605                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2606                        net_enable_timestamp();
2607        }
2608}
2609
2610/*
2611 *      Get a socket option on an socket.
2612 *
2613 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2614 *      asynchronous errors should be reported by getsockopt. We assume
2615 *      this means if you specify SO_ERROR (otherwise whats the point of it).
2616 */
2617int sock_common_getsockopt(struct socket *sock, int level, int optname,
2618                           char __user *optval, int __user *optlen)
2619{
2620        struct sock *sk = sock->sk;
2621
2622        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2623}
2624EXPORT_SYMBOL(sock_common_getsockopt);
2625
2626#ifdef CONFIG_COMPAT
2627int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2628                                  char __user *optval, int __user *optlen)
2629{
2630        struct sock *sk = sock->sk;
2631
2632        if (sk->sk_prot->compat_getsockopt != NULL)
2633                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2634                                                      optval, optlen);
2635        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2636}
2637EXPORT_SYMBOL(compat_sock_common_getsockopt);
2638#endif
2639
2640int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2641                        struct msghdr *msg, size_t size, int flags)
2642{
2643        struct sock *sk = sock->sk;
2644        int addr_len = 0;
2645        int err;
2646
2647        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2648                                   flags & ~MSG_DONTWAIT, &addr_len);
2649        if (err >= 0)
2650                msg->msg_namelen = addr_len;
2651        return err;
2652}
2653EXPORT_SYMBOL(sock_common_recvmsg);
2654
2655/*
2656 *      Set socket options on an inet socket.
2657 */
2658int sock_common_setsockopt(struct socket *sock, int level, int optname,
2659                           char __user *optval, unsigned int optlen)
2660{
2661        struct sock *sk = sock->sk;
2662
2663        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2664}
2665EXPORT_SYMBOL(sock_common_setsockopt);
2666
2667#ifdef CONFIG_COMPAT
2668int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2669                                  char __user *optval, unsigned int optlen)
2670{
2671        struct sock *sk = sock->sk;
2672
2673        if (sk->sk_prot->compat_setsockopt != NULL)
2674                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2675                                                      optval, optlen);
2676        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2677}
2678EXPORT_SYMBOL(compat_sock_common_setsockopt);
2679#endif
2680
2681void sk_common_release(struct sock *sk)
2682{
2683        if (sk->sk_prot->destroy)
2684                sk->sk_prot->destroy(sk);
2685
2686        /*
2687         * Observation: when sock_common_release is called, processes have
2688         * no access to socket. But net still has.
2689         * Step one, detach it from networking:
2690         *
2691         * A. Remove from hash tables.
2692         */
2693
2694        sk->sk_prot->unhash(sk);
2695
2696        /*
2697         * In this point socket cannot receive new packets, but it is possible
2698         * that some packets are in flight because some CPU runs receiver and
2699         * did hash table lookup before we unhashed socket. They will achieve
2700         * receive queue and will be purged by socket destructor.
2701         *
2702         * Also we still have packets pending on receive queue and probably,
2703         * our own packets waiting in device queues. sock_destroy will drain
2704         * receive queue, but transmitted packets will delay socket destruction
2705         * until the last reference will be released.
2706         */
2707
2708        sock_orphan(sk);
2709
2710        xfrm_sk_free_policy(sk);
2711
2712        sk_refcnt_debug_release(sk);
2713
2714        sock_put(sk);
2715}
2716EXPORT_SYMBOL(sk_common_release);
2717
2718#ifdef CONFIG_PROC_FS
2719#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2720struct prot_inuse {
2721        int val[PROTO_INUSE_NR];
2722};
2723
2724static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2725
2726#ifdef CONFIG_NET_NS
2727void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2728{
2729        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2730}
2731EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2732
2733int sock_prot_inuse_get(struct net *net, struct proto *prot)
2734{
2735        int cpu, idx = prot->inuse_idx;
2736        int res = 0;
2737
2738        for_each_possible_cpu(cpu)
2739                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2740
2741        return res >= 0 ? res : 0;
2742}
2743EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2744
2745static int __net_init sock_inuse_init_net(struct net *net)
2746{
2747        net->core.inuse = alloc_percpu(struct prot_inuse);
2748        return net->core.inuse ? 0 : -ENOMEM;
2749}
2750
2751static void __net_exit sock_inuse_exit_net(struct net *net)
2752{
2753        free_percpu(net->core.inuse);
2754}
2755
2756static struct pernet_operations net_inuse_ops = {
2757        .init = sock_inuse_init_net,
2758        .exit = sock_inuse_exit_net,
2759};
2760
2761static __init int net_inuse_init(void)
2762{
2763        if (register_pernet_subsys(&net_inuse_ops))
2764                panic("Cannot initialize net inuse counters");
2765
2766        return 0;
2767}
2768
2769core_initcall(net_inuse_init);
2770#else
2771static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2772
2773void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2774{
2775        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2776}
2777EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2778
2779int sock_prot_inuse_get(struct net *net, struct proto *prot)
2780{
2781        int cpu, idx = prot->inuse_idx;
2782        int res = 0;
2783
2784        for_each_possible_cpu(cpu)
2785                res += per_cpu(prot_inuse, cpu).val[idx];
2786
2787        return res >= 0 ? res : 0;
2788}
2789EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2790#endif
2791
2792static void assign_proto_idx(struct proto *prot)
2793{
2794        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2795
2796        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2797                pr_err("PROTO_INUSE_NR exhausted\n");
2798                return;
2799        }
2800
2801        set_bit(prot->inuse_idx, proto_inuse_idx);
2802}
2803
2804static void release_proto_idx(struct proto *prot)
2805{
2806        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2807                clear_bit(prot->inuse_idx, proto_inuse_idx);
2808}
2809#else
2810static inline void assign_proto_idx(struct proto *prot)
2811{
2812}
2813
2814static inline void release_proto_idx(struct proto *prot)
2815{
2816}
2817#endif
2818
2819int proto_register(struct proto *prot, int alloc_slab)
2820{
2821        if (alloc_slab) {
2822                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2823                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2824                                        NULL);
2825
2826                if (prot->slab == NULL) {
2827                        pr_crit("%s: Can't create sock SLAB cache!\n",
2828                                prot->name);
2829                        goto out;
2830                }
2831
2832                if (prot->rsk_prot != NULL) {
2833                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2834                        if (prot->rsk_prot->slab_name == NULL)
2835                                goto out_free_sock_slab;
2836
2837                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2838                                                                 prot->rsk_prot->obj_size, 0,
2839                                                                 SLAB_HWCACHE_ALIGN, NULL);
2840
2841                        if (prot->rsk_prot->slab == NULL) {
2842                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2843                                        prot->name);
2844                                goto out_free_request_sock_slab_name;
2845                        }
2846                }
2847
2848                if (prot->twsk_prot != NULL) {
2849                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2850
2851                        if (prot->twsk_prot->twsk_slab_name == NULL)
2852                                goto out_free_request_sock_slab;
2853
2854                        prot->twsk_prot->twsk_slab =
2855                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2856                                                  prot->twsk_prot->twsk_obj_size,
2857                                                  0,
2858                                                  SLAB_HWCACHE_ALIGN |
2859                                                        prot->slab_flags,
2860                                                  NULL);
2861                        if (prot->twsk_prot->twsk_slab == NULL)
2862                                goto out_free_timewait_sock_slab_name;
2863                }
2864        }
2865
2866        mutex_lock(&proto_list_mutex);
2867        list_add(&prot->node, &proto_list);
2868        assign_proto_idx(prot);
2869        mutex_unlock(&proto_list_mutex);
2870        return 0;
2871
2872out_free_timewait_sock_slab_name:
2873        kfree(prot->twsk_prot->twsk_slab_name);
2874out_free_request_sock_slab:
2875        if (prot->rsk_prot && prot->rsk_prot->slab) {
2876                kmem_cache_destroy(prot->rsk_prot->slab);
2877                prot->rsk_prot->slab = NULL;
2878        }
2879out_free_request_sock_slab_name:
2880        if (prot->rsk_prot)
2881                kfree(prot->rsk_prot->slab_name);
2882out_free_sock_slab:
2883        kmem_cache_destroy(prot->slab);
2884        prot->slab = NULL;
2885out:
2886        return -ENOBUFS;
2887}
2888EXPORT_SYMBOL(proto_register);
2889
2890void proto_unregister(struct proto *prot)
2891{
2892        mutex_lock(&proto_list_mutex);
2893        release_proto_idx(prot);
2894        list_del(&prot->node);
2895        mutex_unlock(&proto_list_mutex);
2896
2897        if (prot->slab != NULL) {
2898                kmem_cache_destroy(prot->slab);
2899                prot->slab = NULL;
2900        }
2901
2902        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2903                kmem_cache_destroy(prot->rsk_prot->slab);
2904                kfree(prot->rsk_prot->slab_name);
2905                prot->rsk_prot->slab = NULL;
2906        }
2907
2908        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2909                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2910                kfree(prot->twsk_prot->twsk_slab_name);
2911                prot->twsk_prot->twsk_slab = NULL;
2912        }
2913}
2914EXPORT_SYMBOL(proto_unregister);
2915
2916int sock_load_diag_module(int family, int protocol)
2917{
2918        if (!protocol) {
2919                if (!sock_is_registered(family))
2920                        return -ENOENT;
2921
2922                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
2923                                      NETLINK_SOCK_DIAG, family);
2924        }
2925
2926#ifdef CONFIG_INET
2927        if (family == AF_INET &&
2928            !rcu_access_pointer(inet_protos[protocol]))
2929                return -ENOENT;
2930#endif
2931
2932        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
2933                              NETLINK_SOCK_DIAG, family, protocol);
2934}
2935EXPORT_SYMBOL(sock_load_diag_module);
2936
2937#ifdef CONFIG_PROC_FS
2938static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2939        __acquires(proto_list_mutex)
2940{
2941        mutex_lock(&proto_list_mutex);
2942        return seq_list_start_head(&proto_list, *pos);
2943}
2944
2945static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2946{
2947        return seq_list_next(v, &proto_list, pos);
2948}
2949
2950static void proto_seq_stop(struct seq_file *seq, void *v)
2951        __releases(proto_list_mutex)
2952{
2953        mutex_unlock(&proto_list_mutex);
2954}
2955
2956static char proto_method_implemented(const void *method)
2957{
2958        return method == NULL ? 'n' : 'y';
2959}
2960static long sock_prot_memory_allocated(struct proto *proto)
2961{
2962        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2963}
2964
2965static char *sock_prot_memory_pressure(struct proto *proto)
2966{
2967        return proto->memory_pressure != NULL ?
2968        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2969}
2970
2971static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2972{
2973
2974        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2975                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2976                   proto->name,
2977                   proto->obj_size,
2978                   sock_prot_inuse_get(seq_file_net(seq), proto),
2979                   sock_prot_memory_allocated(proto),
2980                   sock_prot_memory_pressure(proto),
2981                   proto->max_header,
2982                   proto->slab == NULL ? "no" : "yes",
2983                   module_name(proto->owner),
2984                   proto_method_implemented(proto->close),
2985                   proto_method_implemented(proto->connect),
2986                   proto_method_implemented(proto->disconnect),
2987                   proto_method_implemented(proto->accept),
2988                   proto_method_implemented(proto->ioctl),
2989                   proto_method_implemented(proto->init),
2990                   proto_method_implemented(proto->destroy),
2991                   proto_method_implemented(proto->shutdown),
2992                   proto_method_implemented(proto->setsockopt),
2993                   proto_method_implemented(proto->getsockopt),
2994                   proto_method_implemented(proto->sendmsg),
2995                   proto_method_implemented(proto->recvmsg),
2996                   proto_method_implemented(proto->sendpage),
2997                   proto_method_implemented(proto->bind),
2998                   proto_method_implemented(proto->backlog_rcv),
2999                   proto_method_implemented(proto->hash),
3000                   proto_method_implemented(proto->unhash),
3001                   proto_method_implemented(proto->get_port),
3002                   proto_method_implemented(proto->enter_memory_pressure));
3003}
3004
3005static int proto_seq_show(struct seq_file *seq, void *v)
3006{
3007        if (v == &proto_list)
3008                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3009                           "protocol",
3010                           "size",
3011                           "sockets",
3012                           "memory",
3013                           "press",
3014                           "maxhdr",
3015                           "slab",
3016                           "module",
3017                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3018        else
3019                proto_seq_printf(seq, list_entry(v, struct proto, node));
3020        return 0;
3021}
3022
3023static const struct seq_operations proto_seq_ops = {
3024        .start  = proto_seq_start,
3025        .next   = proto_seq_next,
3026        .stop   = proto_seq_stop,
3027        .show   = proto_seq_show,
3028};
3029
3030static int proto_seq_open(struct inode *inode, struct file *file)
3031{
3032        return seq_open_net(inode, file, &proto_seq_ops,
3033                            sizeof(struct seq_net_private));
3034}
3035
3036static const struct file_operations proto_seq_fops = {
3037        .owner          = THIS_MODULE,
3038        .open           = proto_seq_open,
3039        .read           = seq_read,
3040        .llseek         = seq_lseek,
3041        .release        = seq_release_net,
3042};
3043
3044static __net_init int proto_init_net(struct net *net)
3045{
3046        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3047                return -ENOMEM;
3048
3049        return 0;
3050}
3051
3052static __net_exit void proto_exit_net(struct net *net)
3053{
3054        remove_proc_entry("protocols", net->proc_net);
3055}
3056
3057
3058static __net_initdata struct pernet_operations proto_net_ops = {
3059        .init = proto_init_net,
3060        .exit = proto_exit_net,
3061};
3062
3063static int __init proto_init(void)
3064{
3065        return register_pernet_subsys(&proto_net_ops);
3066}
3067
3068subsys_initcall(proto_init);
3069
3070#endif /* PROC_FS */
3071