linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/sched/mm.h>
 106#include <linux/timer.h>
 107#include <linux/string.h>
 108#include <linux/sockios.h>
 109#include <linux/net.h>
 110#include <linux/mm.h>
 111#include <linux/slab.h>
 112#include <linux/interrupt.h>
 113#include <linux/poll.h>
 114#include <linux/tcp.h>
 115#include <linux/init.h>
 116#include <linux/highmem.h>
 117#include <linux/user_namespace.h>
 118#include <linux/static_key.h>
 119#include <linux/memcontrol.h>
 120#include <linux/prefetch.h>
 121
 122#include <linux/uaccess.h>
 123
 124#include <linux/netdevice.h>
 125#include <net/protocol.h>
 126#include <linux/skbuff.h>
 127#include <net/net_namespace.h>
 128#include <net/request_sock.h>
 129#include <net/sock.h>
 130#include <linux/net_tstamp.h>
 131#include <net/xfrm.h>
 132#include <linux/ipsec.h>
 133#include <net/cls_cgroup.h>
 134#include <net/netprio_cgroup.h>
 135#include <linux/sock_diag.h>
 136
 137#include <linux/filter.h>
 138#include <net/sock_reuseport.h>
 139
 140#include <trace/events/sock.h>
 141
 142#include <net/tcp.h>
 143#include <net/busy_poll.h>
 144
 145static DEFINE_MUTEX(proto_list_mutex);
 146static LIST_HEAD(proto_list);
 147
 148/**
 149 * sk_ns_capable - General socket capability test
 150 * @sk: Socket to use a capability on or through
 151 * @user_ns: The user namespace of the capability to use
 152 * @cap: The capability to use
 153 *
  154 * Test to see if the opener of the socket had the capability @cap when
  155 * the socket was created and if the current process has the capability
  156 * @cap in the user namespace @user_ns.
 157 */
 158bool sk_ns_capable(const struct sock *sk,
 159                   struct user_namespace *user_ns, int cap)
 160{
 161        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                ns_capable(user_ns, cap);
 163}
 164EXPORT_SYMBOL(sk_ns_capable);
 165
 166/**
 167 * sk_capable - Socket global capability test
 168 * @sk: Socket to use a capability on or through
 169 * @cap: The global capability to use
 170 *
  171 * Test to see if the opener of the socket had the capability @cap when
  172 * the socket was created and if the current process has the capability
  173 * @cap in all user namespaces.
 174 */
 175bool sk_capable(const struct sock *sk, int cap)
 176{
 177        return sk_ns_capable(sk, &init_user_ns, cap);
 178}
 179EXPORT_SYMBOL(sk_capable);
 180
 181/**
 182 * sk_net_capable - Network namespace socket capability test
 183 * @sk: Socket to use a capability on or through
 184 * @cap: The capability to use
 185 *
  186 * Test to see if the opener of the socket had the capability @cap when
  187 * the socket was created and if the current process has the capability
  188 * @cap over the network namespace the socket is a member of.
 189 */
 190bool sk_net_capable(const struct sock *sk, int cap)
 191{
 192        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193}
 194EXPORT_SYMBOL(sk_net_capable);
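
/*
 * Illustrative sketch (not part of this file): a protocol-specific
 * setsockopt handler could gate a privileged option on the opener's
 * capabilities roughly like this; the surrounding handler is hypothetical.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *	... apply the privileged setting ...
 */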
 195
 196/*
 197 * Each address family might have different locking rules, so we have
 198 * one slock key per address family and separate keys for internal and
 199 * userspace sockets.
 200 */
 201static struct lock_class_key af_family_keys[AF_MAX];
 202static struct lock_class_key af_family_kern_keys[AF_MAX];
 203static struct lock_class_key af_family_slock_keys[AF_MAX];
 204static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206/*
  207 * Make lock validator output more readable. (We pre-construct these
  208 * strings at build time, so that runtime initialization of socket
  209 * locks is fast):
 210 */
 211
 212#define _sock_locks(x)                                            \
 213  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 214  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 215  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 216  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 217  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 218  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 219  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 220  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 221  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 222  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
  223  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "AF_IUCV"     , \
 224  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 225  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 226  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 227  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 228
 229static const char *const af_family_key_strings[AF_MAX+1] = {
 230        _sock_locks("sk_lock-")
 231};
 232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233        _sock_locks("slock-")
 234};
 235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236        _sock_locks("clock-")
 237};
 238
 239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240        _sock_locks("k-sk_lock-")
 241};
 242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243        _sock_locks("k-slock-")
 244};
 245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246        _sock_locks("k-clock-")
 247};
 248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249  "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250  "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251  "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252  "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253  "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254  "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255  "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256  "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257  "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258  "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259  "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260  "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261  "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262  "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264};
 265static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266  "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267  "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268  "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269  "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270  "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271  "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272  "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273  "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274  "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275  "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276  "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277  "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278  "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279  "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281};
 282static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283  "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284  "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285  "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286  "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287  "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288  "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289  "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290  "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291  "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292  "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293  "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294  "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295  "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296  "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298};
 299
 300/*
 301 * sk_callback_lock and sk queues locking rules are per-address-family,
 302 * so split the lock classes by using a per-AF key:
 303 */
 304static struct lock_class_key af_callback_keys[AF_MAX];
 305static struct lock_class_key af_rlock_keys[AF_MAX];
 306static struct lock_class_key af_wlock_keys[AF_MAX];
 307static struct lock_class_key af_elock_keys[AF_MAX];
 308static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
  310/* Take the size of the struct sk_buff overhead into consideration when
  311 * determining these values, since that overhead is not constant across
  312 * platforms.  This keeps socket queueing behavior and performance from
  313 * depending on such differences.
 314 */
 315#define _SK_MEM_PACKETS         256
 316#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 317#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 318#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
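
/*
 * Rough worked example (illustrative only, since SKB_TRUESIZE() depends on
 * the platform's struct sk_buff and skb_shared_info sizes): each "packet"
 * is charged 256 bytes of payload plus the sk_buff overhead, so the default
 * limits come out to
 *
 *	SK_WMEM_MAX = SK_RMEM_MAX = SKB_TRUESIZE(256) * 256
 *
 * which is roughly 200 KB on a typical 64-bit build.
 */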
 319
 320/* Run time adjustable parameters. */
 321__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 322EXPORT_SYMBOL(sysctl_wmem_max);
 323__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 324EXPORT_SYMBOL(sysctl_rmem_max);
 325__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 326__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 327
 328/* Maximal space eaten by iovec or ancillary data plus some space */
 329int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 330EXPORT_SYMBOL(sysctl_optmem_max);
 331
 332int sysctl_tstamp_allow_data __read_mostly = 1;
 333
 334struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 335EXPORT_SYMBOL_GPL(memalloc_socks);
 336
 337/**
 338 * sk_set_memalloc - sets %SOCK_MEMALLOC
 339 * @sk: socket to set it on
 340 *
 341 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
  342 * It's the responsibility of the admin to adjust min_free_kbytes
  343 * to meet the requirements.
 344 */
 345void sk_set_memalloc(struct sock *sk)
 346{
 347        sock_set_flag(sk, SOCK_MEMALLOC);
 348        sk->sk_allocation |= __GFP_MEMALLOC;
 349        static_key_slow_inc(&memalloc_socks);
 350}
 351EXPORT_SYMBOL_GPL(sk_set_memalloc);
 352
 353void sk_clear_memalloc(struct sock *sk)
 354{
 355        sock_reset_flag(sk, SOCK_MEMALLOC);
 356        sk->sk_allocation &= ~__GFP_MEMALLOC;
 357        static_key_slow_dec(&memalloc_socks);
 358
 359        /*
 360         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 361         * progress of swapping. SOCK_MEMALLOC may be cleared while
 362         * it has rmem allocations due to the last swapfile being deactivated
 363         * but there is a risk that the socket is unusable due to exceeding
 364         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 365         */
 366        sk_mem_reclaim(sk);
 367}
 368EXPORT_SYMBOL_GPL(sk_clear_memalloc);
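
/*
 * Illustrative in-kernel usage (the caller is hypothetical): a driver whose
 * socket carries swap traffic, e.g. a network block device, would bracket
 * that use roughly as
 *
 *	sk_set_memalloc(sock->sk);
 *	... perform swap I/O over the socket ...
 *	sk_clear_memalloc(sock->sk);
 *
 * so that allocations made on the socket's behalf may dip into the
 * emergency memory reserves while swapping depends on it.
 */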
 369
 370int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 371{
 372        int ret;
 373        unsigned int noreclaim_flag;
 374
 375        /* these should have been dropped before queueing */
 376        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 377
 378        noreclaim_flag = memalloc_noreclaim_save();
 379        ret = sk->sk_backlog_rcv(sk, skb);
 380        memalloc_noreclaim_restore(noreclaim_flag);
 381
 382        return ret;
 383}
 384EXPORT_SYMBOL(__sk_backlog_rcv);
 385
 386static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 387{
 388        struct timeval tv;
 389
 390        if (optlen < sizeof(tv))
 391                return -EINVAL;
 392        if (copy_from_user(&tv, optval, sizeof(tv)))
 393                return -EFAULT;
 394        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395                return -EDOM;
 396
 397        if (tv.tv_sec < 0) {
 398                static int warned __read_mostly;
 399
 400                *timeo_p = 0;
 401                if (warned < 10 && net_ratelimit()) {
 402                        warned++;
 403                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404                                __func__, current->comm, task_pid_nr(current));
 405                }
 406                return 0;
 407        }
 408        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 409        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410                return 0;
 411        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 412                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 413        return 0;
 414}
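
/*
 * User-space sketch (illustrative): sock_set_timeout() above is what
 * services SO_RCVTIMEO/SO_SNDTIMEO further down, which take a plain
 * struct timeval, e.g.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * An all-zero timeval means "wait forever"; a tv_usec outside [0, 1s)
 * is rejected with EDOM, as the code above shows.
 */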
 415
 416static void sock_warn_obsolete_bsdism(const char *name)
 417{
 418        static int warned;
 419        static char warncomm[TASK_COMM_LEN];
 420        if (strcmp(warncomm, current->comm) && warned < 5) {
 421                strcpy(warncomm,  current->comm);
 422                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 423                        warncomm, name);
 424                warned++;
 425        }
 426}
 427
 428static bool sock_needs_netstamp(const struct sock *sk)
 429{
 430        switch (sk->sk_family) {
 431        case AF_UNSPEC:
 432        case AF_UNIX:
 433                return false;
 434        default:
 435                return true;
 436        }
 437}
 438
 439static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 440{
 441        if (sk->sk_flags & flags) {
 442                sk->sk_flags &= ~flags;
 443                if (sock_needs_netstamp(sk) &&
 444                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 445                        net_disable_timestamp();
 446        }
 447}
 448
 449
 450int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 451{
 452        unsigned long flags;
 453        struct sk_buff_head *list = &sk->sk_receive_queue;
 454
 455        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 456                atomic_inc(&sk->sk_drops);
 457                trace_sock_rcvqueue_full(sk, skb);
 458                return -ENOMEM;
 459        }
 460
 461        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 462                atomic_inc(&sk->sk_drops);
 463                return -ENOBUFS;
 464        }
 465
 466        skb->dev = NULL;
 467        skb_set_owner_r(skb, sk);
 468
  469        /* We escape from the RCU-protected region; make sure we don't
  470         * leak a non-refcounted dst.
 471         */
 472        skb_dst_force(skb);
 473
 474        spin_lock_irqsave(&list->lock, flags);
 475        sock_skb_set_dropcount(sk, skb);
 476        __skb_queue_tail(list, skb);
 477        spin_unlock_irqrestore(&list->lock, flags);
 478
 479        if (!sock_flag(sk, SOCK_DEAD))
 480                sk->sk_data_ready(sk);
 481        return 0;
 482}
 483EXPORT_SYMBOL(__sock_queue_rcv_skb);
 484
 485int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 486{
 487        int err;
 488
 489        err = sk_filter(sk, skb);
 490        if (err)
 491                return err;
 492
 493        return __sock_queue_rcv_skb(sk, skb);
 494}
 495EXPORT_SYMBOL(sock_queue_rcv_skb);
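
/*
 * Illustrative caller (a sketch, not taken from a specific protocol): a
 * datagram protocol's receive path typically hands a fully built skb to
 * sock_queue_rcv_skb() and frees it itself on failure, e.g.
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */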
 496
 497int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 498                     const int nested, unsigned int trim_cap, bool refcounted)
 499{
 500        int rc = NET_RX_SUCCESS;
 501
 502        if (sk_filter_trim_cap(sk, skb, trim_cap))
 503                goto discard_and_relse;
 504
 505        skb->dev = NULL;
 506
 507        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 508                atomic_inc(&sk->sk_drops);
 509                goto discard_and_relse;
 510        }
 511        if (nested)
 512                bh_lock_sock_nested(sk);
 513        else
 514                bh_lock_sock(sk);
 515        if (!sock_owned_by_user(sk)) {
 516                /*
 517                 * trylock + unlock semantics:
 518                 */
 519                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 520
 521                rc = sk_backlog_rcv(sk, skb);
 522
 523                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 524        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 525                bh_unlock_sock(sk);
 526                atomic_inc(&sk->sk_drops);
 527                goto discard_and_relse;
 528        }
 529
 530        bh_unlock_sock(sk);
 531out:
 532        if (refcounted)
 533                sock_put(sk);
 534        return rc;
 535discard_and_relse:
 536        kfree_skb(skb);
 537        goto out;
 538}
 539EXPORT_SYMBOL(__sk_receive_skb);
 540
 541struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 542{
 543        struct dst_entry *dst = __sk_dst_get(sk);
 544
 545        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 546                sk_tx_queue_clear(sk);
 547                sk->sk_dst_pending_confirm = 0;
 548                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 549                dst_release(dst);
 550                return NULL;
 551        }
 552
 553        return dst;
 554}
 555EXPORT_SYMBOL(__sk_dst_check);
 556
 557struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 558{
 559        struct dst_entry *dst = sk_dst_get(sk);
 560
 561        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 562                sk_dst_reset(sk);
 563                dst_release(dst);
 564                return NULL;
 565        }
 566
 567        return dst;
 568}
 569EXPORT_SYMBOL(sk_dst_check);
 570
 571static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 572                                int optlen)
 573{
 574        int ret = -ENOPROTOOPT;
 575#ifdef CONFIG_NETDEVICES
 576        struct net *net = sock_net(sk);
 577        char devname[IFNAMSIZ];
 578        int index;
 579
 580        /* Sorry... */
 581        ret = -EPERM;
 582        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 583                goto out;
 584
 585        ret = -EINVAL;
 586        if (optlen < 0)
 587                goto out;
 588
 589        /* Bind this socket to a particular device like "eth0",
 590         * as specified in the passed interface name. If the
 591         * name is "" or the option length is zero the socket
 592         * is not bound.
 593         */
 594        if (optlen > IFNAMSIZ - 1)
 595                optlen = IFNAMSIZ - 1;
 596        memset(devname, 0, sizeof(devname));
 597
 598        ret = -EFAULT;
 599        if (copy_from_user(devname, optval, optlen))
 600                goto out;
 601
 602        index = 0;
 603        if (devname[0] != '\0') {
 604                struct net_device *dev;
 605
 606                rcu_read_lock();
 607                dev = dev_get_by_name_rcu(net, devname);
 608                if (dev)
 609                        index = dev->ifindex;
 610                rcu_read_unlock();
 611                ret = -ENODEV;
 612                if (!dev)
 613                        goto out;
 614        }
 615
 616        lock_sock(sk);
 617        sk->sk_bound_dev_if = index;
 618        sk_dst_reset(sk);
 619        release_sock(sk);
 620
 621        ret = 0;
 622
 623out:
 624#endif
 625
 626        return ret;
 627}
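
/*
 * User-space sketch (illustrative): the handler above implements
 * SO_BINDTODEVICE, which takes an interface name rather than an index
 * and normally requires CAP_NET_RAW:
 *
 *	const char ifname[] = "eth0";
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, sizeof(ifname));
 *
 * Passing an empty name (or a zero option length) removes the binding.
 */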
 628
 629static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 630                                int __user *optlen, int len)
 631{
 632        int ret = -ENOPROTOOPT;
 633#ifdef CONFIG_NETDEVICES
 634        struct net *net = sock_net(sk);
 635        char devname[IFNAMSIZ];
 636
 637        if (sk->sk_bound_dev_if == 0) {
 638                len = 0;
 639                goto zero;
 640        }
 641
 642        ret = -EINVAL;
 643        if (len < IFNAMSIZ)
 644                goto out;
 645
 646        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 647        if (ret)
 648                goto out;
 649
 650        len = strlen(devname) + 1;
 651
 652        ret = -EFAULT;
 653        if (copy_to_user(optval, devname, len))
 654                goto out;
 655
 656zero:
 657        ret = -EFAULT;
 658        if (put_user(len, optlen))
 659                goto out;
 660
 661        ret = 0;
 662
 663out:
 664#endif
 665
 666        return ret;
 667}
 668
 669static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 670{
 671        if (valbool)
 672                sock_set_flag(sk, bit);
 673        else
 674                sock_reset_flag(sk, bit);
 675}
 676
 677bool sk_mc_loop(struct sock *sk)
 678{
 679        if (dev_recursion_level())
 680                return false;
 681        if (!sk)
 682                return true;
 683        switch (sk->sk_family) {
 684        case AF_INET:
 685                return inet_sk(sk)->mc_loop;
 686#if IS_ENABLED(CONFIG_IPV6)
 687        case AF_INET6:
 688                return inet6_sk(sk)->mc_loop;
 689#endif
 690        }
 691        WARN_ON(1);
 692        return true;
 693}
 694EXPORT_SYMBOL(sk_mc_loop);
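
/*
 * User-space sketch (illustrative): the mc_loop flag consulted above is the
 * per-socket setting controlled by IP_MULTICAST_LOOP (IPV6_MULTICAST_LOOP
 * for AF_INET6), e.g.
 *
 *	int loop = 0;
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop));
 *
 * requests that multicast sent on this socket not be looped back locally.
 */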
 695
 696/*
 697 *      This is meant for all protocols to use and covers goings on
 698 *      at the socket level. Everything here is generic.
 699 */
 700
 701int sock_setsockopt(struct socket *sock, int level, int optname,
 702                    char __user *optval, unsigned int optlen)
 703{
 704        struct sock *sk = sock->sk;
 705        int val;
 706        int valbool;
 707        struct linger ling;
 708        int ret = 0;
 709
 710        /*
 711         *      Options without arguments
 712         */
 713
 714        if (optname == SO_BINDTODEVICE)
 715                return sock_setbindtodevice(sk, optval, optlen);
 716
 717        if (optlen < sizeof(int))
 718                return -EINVAL;
 719
 720        if (get_user(val, (int __user *)optval))
 721                return -EFAULT;
 722
 723        valbool = val ? 1 : 0;
 724
 725        lock_sock(sk);
 726
 727        switch (optname) {
 728        case SO_DEBUG:
 729                if (val && !capable(CAP_NET_ADMIN))
 730                        ret = -EACCES;
 731                else
 732                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 733                break;
 734        case SO_REUSEADDR:
 735                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 736                break;
 737        case SO_REUSEPORT:
 738                sk->sk_reuseport = valbool;
 739                break;
 740        case SO_TYPE:
 741        case SO_PROTOCOL:
 742        case SO_DOMAIN:
 743        case SO_ERROR:
 744                ret = -ENOPROTOOPT;
 745                break;
 746        case SO_DONTROUTE:
 747                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 748                break;
 749        case SO_BROADCAST:
 750                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 751                break;
 752        case SO_SNDBUF:
  753                /* Don't error on this; BSD doesn't, and if you think
  754                 * about it, this is right. Otherwise apps would have to
  755                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  756                 * are treated as hints in BSD.
  757                 */
 758                val = min_t(u32, val, sysctl_wmem_max);
 759set_sndbuf:
 760                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 761                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 762                /* Wake up sending tasks if we upped the value. */
 763                sk->sk_write_space(sk);
 764                break;
 765
 766        case SO_SNDBUFFORCE:
 767                if (!capable(CAP_NET_ADMIN)) {
 768                        ret = -EPERM;
 769                        break;
 770                }
 771                goto set_sndbuf;
 772
 773        case SO_RCVBUF:
  774                /* Don't error on this; BSD doesn't, and if you think
  775                 * about it, this is right. Otherwise apps would have to
  776                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  777                 * are treated as hints in BSD.
  778                 */
 779                val = min_t(u32, val, sysctl_rmem_max);
 780set_rcvbuf:
 781                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 782                /*
 783                 * We double it on the way in to account for
 784                 * "struct sk_buff" etc. overhead.   Applications
 785                 * assume that the SO_RCVBUF setting they make will
 786                 * allow that much actual data to be received on that
 787                 * socket.
 788                 *
 789                 * Applications are unaware that "struct sk_buff" and
 790                 * other overheads allocate from the receive buffer
 791                 * during socket buffer allocation.
 792                 *
 793                 * And after considering the possible alternatives,
 794                 * returning the value we actually used in getsockopt
 795                 * is the most desirable behavior.
 796                 */
 797                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 798                break;
 799
 800        case SO_RCVBUFFORCE:
 801                if (!capable(CAP_NET_ADMIN)) {
 802                        ret = -EPERM;
 803                        break;
 804                }
 805                goto set_rcvbuf;
 806
 807        case SO_KEEPALIVE:
 808                if (sk->sk_prot->keepalive)
 809                        sk->sk_prot->keepalive(sk, valbool);
 810                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 811                break;
 812
 813        case SO_OOBINLINE:
 814                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 815                break;
 816
 817        case SO_NO_CHECK:
 818                sk->sk_no_check_tx = valbool;
 819                break;
 820
 821        case SO_PRIORITY:
 822                if ((val >= 0 && val <= 6) ||
 823                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 824                        sk->sk_priority = val;
 825                else
 826                        ret = -EPERM;
 827                break;
 828
 829        case SO_LINGER:
 830                if (optlen < sizeof(ling)) {
 831                        ret = -EINVAL;  /* 1003.1g */
 832                        break;
 833                }
 834                if (copy_from_user(&ling, optval, sizeof(ling))) {
 835                        ret = -EFAULT;
 836                        break;
 837                }
 838                if (!ling.l_onoff)
 839                        sock_reset_flag(sk, SOCK_LINGER);
 840                else {
 841#if (BITS_PER_LONG == 32)
 842                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 843                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 844                        else
 845#endif
 846                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 847                        sock_set_flag(sk, SOCK_LINGER);
 848                }
 849                break;
 850
 851        case SO_BSDCOMPAT:
 852                sock_warn_obsolete_bsdism("setsockopt");
 853                break;
 854
 855        case SO_PASSCRED:
 856                if (valbool)
 857                        set_bit(SOCK_PASSCRED, &sock->flags);
 858                else
 859                        clear_bit(SOCK_PASSCRED, &sock->flags);
 860                break;
 861
 862        case SO_TIMESTAMP:
 863        case SO_TIMESTAMPNS:
 864                if (valbool)  {
 865                        if (optname == SO_TIMESTAMP)
 866                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 867                        else
 868                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 869                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 870                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 871                } else {
 872                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 873                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 874                }
 875                break;
 876
 877        case SO_TIMESTAMPING:
 878                if (val & ~SOF_TIMESTAMPING_MASK) {
 879                        ret = -EINVAL;
 880                        break;
 881                }
 882
 883                if (val & SOF_TIMESTAMPING_OPT_ID &&
 884                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 885                        if (sk->sk_protocol == IPPROTO_TCP &&
 886                            sk->sk_type == SOCK_STREAM) {
 887                                if ((1 << sk->sk_state) &
 888                                    (TCPF_CLOSE | TCPF_LISTEN)) {
 889                                        ret = -EINVAL;
 890                                        break;
 891                                }
 892                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 893                        } else {
 894                                sk->sk_tskey = 0;
 895                        }
 896                }
 897
 898                if (val & SOF_TIMESTAMPING_OPT_STATS &&
 899                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 900                        ret = -EINVAL;
 901                        break;
 902                }
 903
 904                sk->sk_tsflags = val;
 905                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 906                        sock_enable_timestamp(sk,
 907                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 908                else
 909                        sock_disable_timestamp(sk,
 910                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 911                break;
 912
 913        case SO_RCVLOWAT:
 914                if (val < 0)
 915                        val = INT_MAX;
 916                sk->sk_rcvlowat = val ? : 1;
 917                break;
 918
 919        case SO_RCVTIMEO:
 920                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 921                break;
 922
 923        case SO_SNDTIMEO:
 924                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 925                break;
 926
 927        case SO_ATTACH_FILTER:
 928                ret = -EINVAL;
 929                if (optlen == sizeof(struct sock_fprog)) {
 930                        struct sock_fprog fprog;
 931
 932                        ret = -EFAULT;
 933                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 934                                break;
 935
 936                        ret = sk_attach_filter(&fprog, sk);
 937                }
 938                break;
 939
 940        case SO_ATTACH_BPF:
 941                ret = -EINVAL;
 942                if (optlen == sizeof(u32)) {
 943                        u32 ufd;
 944
 945                        ret = -EFAULT;
 946                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 947                                break;
 948
 949                        ret = sk_attach_bpf(ufd, sk);
 950                }
 951                break;
 952
 953        case SO_ATTACH_REUSEPORT_CBPF:
 954                ret = -EINVAL;
 955                if (optlen == sizeof(struct sock_fprog)) {
 956                        struct sock_fprog fprog;
 957
 958                        ret = -EFAULT;
 959                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 960                                break;
 961
 962                        ret = sk_reuseport_attach_filter(&fprog, sk);
 963                }
 964                break;
 965
 966        case SO_ATTACH_REUSEPORT_EBPF:
 967                ret = -EINVAL;
 968                if (optlen == sizeof(u32)) {
 969                        u32 ufd;
 970
 971                        ret = -EFAULT;
 972                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 973                                break;
 974
 975                        ret = sk_reuseport_attach_bpf(ufd, sk);
 976                }
 977                break;
 978
 979        case SO_DETACH_FILTER:
 980                ret = sk_detach_filter(sk);
 981                break;
 982
 983        case SO_LOCK_FILTER:
 984                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 985                        ret = -EPERM;
 986                else
 987                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 988                break;
 989
 990        case SO_PASSSEC:
 991                if (valbool)
 992                        set_bit(SOCK_PASSSEC, &sock->flags);
 993                else
 994                        clear_bit(SOCK_PASSSEC, &sock->flags);
 995                break;
 996        case SO_MARK:
 997                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 998                        ret = -EPERM;
 999                else
1000                        sk->sk_mark = val;
1001                break;
1002
1003        case SO_RXQ_OVFL:
1004                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1005                break;
1006
1007        case SO_WIFI_STATUS:
1008                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1009                break;
1010
1011        case SO_PEEK_OFF:
1012                if (sock->ops->set_peek_off)
1013                        ret = sock->ops->set_peek_off(sk, val);
1014                else
1015                        ret = -EOPNOTSUPP;
1016                break;
1017
1018        case SO_NOFCS:
1019                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1020                break;
1021
1022        case SO_SELECT_ERR_QUEUE:
1023                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1024                break;
1025
1026#ifdef CONFIG_NET_RX_BUSY_POLL
1027        case SO_BUSY_POLL:
1028                /* allow unprivileged users to decrease the value */
1029                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1030                        ret = -EPERM;
1031                else {
1032                        if (val < 0)
1033                                ret = -EINVAL;
1034                        else
1035                                sk->sk_ll_usec = val;
1036                }
1037                break;
1038#endif
1039
1040        case SO_MAX_PACING_RATE:
1041                sk->sk_max_pacing_rate = val;
1042                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1043                                         sk->sk_max_pacing_rate);
1044                break;
1045
1046        case SO_INCOMING_CPU:
1047                sk->sk_incoming_cpu = val;
1048                break;
1049
1050        case SO_CNX_ADVICE:
1051                if (val == 1)
1052                        dst_negative_advice(sk);
1053                break;
1054        default:
1055                ret = -ENOPROTOOPT;
1056                break;
1057        }
1058        release_sock(sk);
1059        return ret;
1060}
1061EXPORT_SYMBOL(sock_setsockopt);
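
/*
 * User-space sketch (illustrative) of the doubling described in the
 * SO_RCVBUF case above: the value passed in is doubled to account for
 * sk_buff overhead, and getsockopt() reports the doubled value back:
 *
 *	int val = 65536, out = 0;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	... out is now roughly 2 * 65536, subject to sysctl_rmem_max ...
 */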
1062
1063
1064static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1065                          struct ucred *ucred)
1066{
1067        ucred->pid = pid_vnr(pid);
1068        ucred->uid = ucred->gid = -1;
1069        if (cred) {
1070                struct user_namespace *current_ns = current_user_ns();
1071
1072                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1073                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1074        }
1075}
1076
1077int sock_getsockopt(struct socket *sock, int level, int optname,
1078                    char __user *optval, int __user *optlen)
1079{
1080        struct sock *sk = sock->sk;
1081
1082        union {
1083                int val;
1084                u64 val64;
1085                struct linger ling;
1086                struct timeval tm;
1087        } v;
1088
1089        int lv = sizeof(int);
1090        int len;
1091
1092        if (get_user(len, optlen))
1093                return -EFAULT;
1094        if (len < 0)
1095                return -EINVAL;
1096
1097        memset(&v, 0, sizeof(v));
1098
1099        switch (optname) {
1100        case SO_DEBUG:
1101                v.val = sock_flag(sk, SOCK_DBG);
1102                break;
1103
1104        case SO_DONTROUTE:
1105                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1106                break;
1107
1108        case SO_BROADCAST:
1109                v.val = sock_flag(sk, SOCK_BROADCAST);
1110                break;
1111
1112        case SO_SNDBUF:
1113                v.val = sk->sk_sndbuf;
1114                break;
1115
1116        case SO_RCVBUF:
1117                v.val = sk->sk_rcvbuf;
1118                break;
1119
1120        case SO_REUSEADDR:
1121                v.val = sk->sk_reuse;
1122                break;
1123
1124        case SO_REUSEPORT:
1125                v.val = sk->sk_reuseport;
1126                break;
1127
1128        case SO_KEEPALIVE:
1129                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1130                break;
1131
1132        case SO_TYPE:
1133                v.val = sk->sk_type;
1134                break;
1135
1136        case SO_PROTOCOL:
1137                v.val = sk->sk_protocol;
1138                break;
1139
1140        case SO_DOMAIN:
1141                v.val = sk->sk_family;
1142                break;
1143
1144        case SO_ERROR:
1145                v.val = -sock_error(sk);
1146                if (v.val == 0)
1147                        v.val = xchg(&sk->sk_err_soft, 0);
1148                break;
1149
1150        case SO_OOBINLINE:
1151                v.val = sock_flag(sk, SOCK_URGINLINE);
1152                break;
1153
1154        case SO_NO_CHECK:
1155                v.val = sk->sk_no_check_tx;
1156                break;
1157
1158        case SO_PRIORITY:
1159                v.val = sk->sk_priority;
1160                break;
1161
1162        case SO_LINGER:
1163                lv              = sizeof(v.ling);
1164                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1165                v.ling.l_linger = sk->sk_lingertime / HZ;
1166                break;
1167
1168        case SO_BSDCOMPAT:
1169                sock_warn_obsolete_bsdism("getsockopt");
1170                break;
1171
1172        case SO_TIMESTAMP:
1173                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1174                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1175                break;
1176
1177        case SO_TIMESTAMPNS:
1178                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1179                break;
1180
1181        case SO_TIMESTAMPING:
1182                v.val = sk->sk_tsflags;
1183                break;
1184
1185        case SO_RCVTIMEO:
1186                lv = sizeof(struct timeval);
1187                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1188                        v.tm.tv_sec = 0;
1189                        v.tm.tv_usec = 0;
1190                } else {
1191                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1192                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1193                }
1194                break;
1195
1196        case SO_SNDTIMEO:
1197                lv = sizeof(struct timeval);
1198                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1199                        v.tm.tv_sec = 0;
1200                        v.tm.tv_usec = 0;
1201                } else {
1202                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1203                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1204                }
1205                break;
1206
1207        case SO_RCVLOWAT:
1208                v.val = sk->sk_rcvlowat;
1209                break;
1210
1211        case SO_SNDLOWAT:
1212                v.val = 1;
1213                break;
1214
1215        case SO_PASSCRED:
1216                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1217                break;
1218
1219        case SO_PEERCRED:
1220        {
1221                struct ucred peercred;
1222                if (len > sizeof(peercred))
1223                        len = sizeof(peercred);
1224                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1225                if (copy_to_user(optval, &peercred, len))
1226                        return -EFAULT;
1227                goto lenout;
1228        }
1229
1230        case SO_PEERNAME:
1231        {
1232                char address[128];
1233
1234                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1235                        return -ENOTCONN;
1236                if (lv < len)
1237                        return -EINVAL;
1238                if (copy_to_user(optval, address, len))
1239                        return -EFAULT;
1240                goto lenout;
1241        }
1242
1243        /* Dubious BSD thing... Probably nobody even uses it, but
1244         * the UNIX standard wants it for whatever reason... -DaveM
1245         */
1246        case SO_ACCEPTCONN:
1247                v.val = sk->sk_state == TCP_LISTEN;
1248                break;
1249
1250        case SO_PASSSEC:
1251                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1252                break;
1253
1254        case SO_PEERSEC:
1255                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1256
1257        case SO_MARK:
1258                v.val = sk->sk_mark;
1259                break;
1260
1261        case SO_RXQ_OVFL:
1262                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1263                break;
1264
1265        case SO_WIFI_STATUS:
1266                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1267                break;
1268
1269        case SO_PEEK_OFF:
1270                if (!sock->ops->set_peek_off)
1271                        return -EOPNOTSUPP;
1272
1273                v.val = sk->sk_peek_off;
1274                break;
1275        case SO_NOFCS:
1276                v.val = sock_flag(sk, SOCK_NOFCS);
1277                break;
1278
1279        case SO_BINDTODEVICE:
1280                return sock_getbindtodevice(sk, optval, optlen, len);
1281
1282        case SO_GET_FILTER:
1283                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1284                if (len < 0)
1285                        return len;
1286
1287                goto lenout;
1288
1289        case SO_LOCK_FILTER:
1290                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1291                break;
1292
1293        case SO_BPF_EXTENSIONS:
1294                v.val = bpf_tell_extensions();
1295                break;
1296
1297        case SO_SELECT_ERR_QUEUE:
1298                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1299                break;
1300
1301#ifdef CONFIG_NET_RX_BUSY_POLL
1302        case SO_BUSY_POLL:
1303                v.val = sk->sk_ll_usec;
1304                break;
1305#endif
1306
1307        case SO_MAX_PACING_RATE:
1308                v.val = sk->sk_max_pacing_rate;
1309                break;
1310
1311        case SO_INCOMING_CPU:
1312                v.val = sk->sk_incoming_cpu;
1313                break;
1314
1315        case SO_MEMINFO:
1316        {
1317                u32 meminfo[SK_MEMINFO_VARS];
1318
1319                if (get_user(len, optlen))
1320                        return -EFAULT;
1321
1322                sk_get_meminfo(sk, meminfo);
1323
1324                len = min_t(unsigned int, len, sizeof(meminfo));
1325                if (copy_to_user(optval, &meminfo, len))
1326                        return -EFAULT;
1327
1328                goto lenout;
1329        }
1330
1331#ifdef CONFIG_NET_RX_BUSY_POLL
1332        case SO_INCOMING_NAPI_ID:
1333                v.val = READ_ONCE(sk->sk_napi_id);
1334
1335                /* aggregate non-NAPI IDs down to 0 */
1336                if (v.val < MIN_NAPI_ID)
1337                        v.val = 0;
1338
1339                break;
1340#endif
1341
1342        case SO_COOKIE:
1343                lv = sizeof(u64);
1344                if (len < lv)
1345                        return -EINVAL;
1346                v.val64 = sock_gen_cookie(sk);
1347                break;
1348
1349        default:
1350                /* We implement the SO_SNDLOWAT etc to not be settable
1351                 * (1003.1g 7).
1352                 */
1353                return -ENOPROTOOPT;
1354        }
1355
1356        if (len > lv)
1357                len = lv;
1358        if (copy_to_user(optval, &v, len))
1359                return -EFAULT;
1360lenout:
1361        if (put_user(len, optlen))
1362                return -EFAULT;
1363        return 0;
1364}
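
/*
 * User-space sketch (illustrative): SO_PEERCRED, handled above via
 * cred_to_ucred(), returns the credentials of the peer that connected a
 * (typically AF_UNIX) socket:
 *
 *	struct ucred uc;
 *	socklen_t len = sizeof(uc);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &uc, &len) == 0)
 *		printf("pid=%d uid=%u gid=%u\n", uc.pid, uc.uid, uc.gid);
 */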
1365
1366/*
1367 * Initialize an sk_lock.
1368 *
1369 * (We also register the sk_lock with the lock validator.)
1370 */
1371static inline void sock_lock_init(struct sock *sk)
1372{
1373        if (sk->sk_kern_sock)
1374                sock_lock_init_class_and_name(
1375                        sk,
1376                        af_family_kern_slock_key_strings[sk->sk_family],
1377                        af_family_kern_slock_keys + sk->sk_family,
1378                        af_family_kern_key_strings[sk->sk_family],
1379                        af_family_kern_keys + sk->sk_family);
1380        else
1381                sock_lock_init_class_and_name(
1382                        sk,
1383                        af_family_slock_key_strings[sk->sk_family],
1384                        af_family_slock_keys + sk->sk_family,
1385                        af_family_key_strings[sk->sk_family],
1386                        af_family_keys + sk->sk_family);
1387}
1388
1389/*
1390 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1391 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1392 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1393 */
1394static void sock_copy(struct sock *nsk, const struct sock *osk)
1395{
1396#ifdef CONFIG_SECURITY_NETWORK
1397        void *sptr = nsk->sk_security;
1398#endif
1399        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1400
1401        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1402               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1403
1404#ifdef CONFIG_SECURITY_NETWORK
1405        nsk->sk_security = sptr;
1406        security_sk_clone(osk, nsk);
1407#endif
1408}
1409
1410static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1411                int family)
1412{
1413        struct sock *sk;
1414        struct kmem_cache *slab;
1415
1416        slab = prot->slab;
1417        if (slab != NULL) {
1418                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1419                if (!sk)
1420                        return sk;
1421                if (priority & __GFP_ZERO)
1422                        sk_prot_clear_nulls(sk, prot->obj_size);
1423        } else
1424                sk = kmalloc(prot->obj_size, priority);
1425
1426        if (sk != NULL) {
1427                kmemcheck_annotate_bitfield(sk, flags);
1428
1429                if (security_sk_alloc(sk, family, priority))
1430                        goto out_free;
1431
1432                if (!try_module_get(prot->owner))
1433                        goto out_free_sec;
1434                sk_tx_queue_clear(sk);
1435        }
1436
1437        return sk;
1438
1439out_free_sec:
1440        security_sk_free(sk);
1441out_free:
1442        if (slab != NULL)
1443                kmem_cache_free(slab, sk);
1444        else
1445                kfree(sk);
1446        return NULL;
1447}
1448
1449static void sk_prot_free(struct proto *prot, struct sock *sk)
1450{
1451        struct kmem_cache *slab;
1452        struct module *owner;
1453
1454        owner = prot->owner;
1455        slab = prot->slab;
1456
1457        cgroup_sk_free(&sk->sk_cgrp_data);
1458        mem_cgroup_sk_free(sk);
1459        security_sk_free(sk);
1460        if (slab != NULL)
1461                kmem_cache_free(slab, sk);
1462        else
1463                kfree(sk);
1464        module_put(owner);
1465}
1466
1467/**
1468 *      sk_alloc - All socket objects are allocated here
1469 *      @net: the applicable net namespace
1470 *      @family: protocol family
1471 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1472 *      @prot: struct proto associated with this new sock instance
1473 *      @kern: is this to be a kernel socket?
1474 */
1475struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1476                      struct proto *prot, int kern)
1477{
1478        struct sock *sk;
1479
1480        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1481        if (sk) {
1482                sk->sk_family = family;
1483                /*
1484                 * See comment in struct sock definition to understand
1485                 * why we need sk_prot_creator -acme
1486                 */
1487                sk->sk_prot = sk->sk_prot_creator = prot;
1488                sk->sk_kern_sock = kern;
1489                sock_lock_init(sk);
1490                sk->sk_net_refcnt = kern ? 0 : 1;
1491                if (likely(sk->sk_net_refcnt))
1492                        get_net(net);
1493                sock_net_set(sk, net);
1494                atomic_set(&sk->sk_wmem_alloc, 1);
1495
1496                mem_cgroup_sk_alloc(sk);
1497                cgroup_sk_alloc(&sk->sk_cgrp_data);
1498                sock_update_classid(&sk->sk_cgrp_data);
1499                sock_update_netprioidx(&sk->sk_cgrp_data);
1500        }
1501
1502        return sk;
1503}
1504EXPORT_SYMBOL(sk_alloc);
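/*
 * Editorial sketch (not part of the original source): a minimal example of
 * how a protocol's create hook might pair sk_alloc() with sock_init_data()
 * and eventual sk_free(). The function name and the PF_INET/IPPROTO choices
 * are hypothetical; real callers (e.g. inet_create()) do considerably more.
 */
#if 0
static struct sock *my_create_sock(struct net *net, struct socket *sock,
				   struct proto *my_proto, int kern)
{
	struct sock *sk;

	/* __GFP_ZERO is added by sk_alloc() itself; the new sock starts zeroed */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, my_proto, kern);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk);	/* queues, callbacks, default timeouts */
	sk->sk_protocol = IPPROTO_RAW;	/* purely illustrative */
	return sk;			/* released later via sock_put()/sk_free() */
}
#endif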
1505
1506/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1507 * grace period. This is the case for UDP sockets and TCP listeners.
1508 */
1509static void __sk_destruct(struct rcu_head *head)
1510{
1511        struct sock *sk = container_of(head, struct sock, sk_rcu);
1512        struct sk_filter *filter;
1513
1514        if (sk->sk_destruct)
1515                sk->sk_destruct(sk);
1516
1517        filter = rcu_dereference_check(sk->sk_filter,
1518                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1519        if (filter) {
1520                sk_filter_uncharge(sk, filter);
1521                RCU_INIT_POINTER(sk->sk_filter, NULL);
1522        }
1523        if (rcu_access_pointer(sk->sk_reuseport_cb))
1524                reuseport_detach_sock(sk);
1525
1526        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1527
1528        if (atomic_read(&sk->sk_omem_alloc))
1529                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1530                         __func__, atomic_read(&sk->sk_omem_alloc));
1531
1532        if (sk->sk_frag.page) {
1533                put_page(sk->sk_frag.page);
1534                sk->sk_frag.page = NULL;
1535        }
1536
1537        if (sk->sk_peer_cred)
1538                put_cred(sk->sk_peer_cred);
1539        put_pid(sk->sk_peer_pid);
1540        if (likely(sk->sk_net_refcnt))
1541                put_net(sock_net(sk));
1542        sk_prot_free(sk->sk_prot_creator, sk);
1543}
1544
1545void sk_destruct(struct sock *sk)
1546{
1547        if (sock_flag(sk, SOCK_RCU_FREE))
1548                call_rcu(&sk->sk_rcu, __sk_destruct);
1549        else
1550                __sk_destruct(&sk->sk_rcu);
1551}
1552
1553static void __sk_free(struct sock *sk)
1554{
1555        if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1556                sock_diag_broadcast_destroy(sk);
1557        else
1558                sk_destruct(sk);
1559}
1560
1561void sk_free(struct sock *sk)
1562{
1563        /*
1564         * We subtract one from sk_wmem_alloc and can tell whether
1565         * some packets are still in a tx queue.
1566         * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1567         */
1568        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1569                __sk_free(sk);
1570}
1571EXPORT_SYMBOL(sk_free);
1572
1573static void sk_init_common(struct sock *sk)
1574{
1575        skb_queue_head_init(&sk->sk_receive_queue);
1576        skb_queue_head_init(&sk->sk_write_queue);
1577        skb_queue_head_init(&sk->sk_error_queue);
1578
1579        rwlock_init(&sk->sk_callback_lock);
1580        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1581                        af_rlock_keys + sk->sk_family,
1582                        af_family_rlock_key_strings[sk->sk_family]);
1583        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1584                        af_wlock_keys + sk->sk_family,
1585                        af_family_wlock_key_strings[sk->sk_family]);
1586        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1587                        af_elock_keys + sk->sk_family,
1588                        af_family_elock_key_strings[sk->sk_family]);
1589        lockdep_set_class_and_name(&sk->sk_callback_lock,
1590                        af_callback_keys + sk->sk_family,
1591                        af_family_clock_key_strings[sk->sk_family]);
1592}
1593
1594/**
1595 *      sk_clone_lock - clone a socket, and lock its clone
1596 *      @sk: the socket to clone
1597 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1598 *
1599 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1600 */
1601struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1602{
1603        struct sock *newsk;
1604        bool is_charged = true;
1605
1606        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1607        if (newsk != NULL) {
1608                struct sk_filter *filter;
1609
1610                sock_copy(newsk, sk);
1611
1612                /* SANITY */
1613                if (likely(newsk->sk_net_refcnt))
1614                        get_net(sock_net(newsk));
1615                sk_node_init(&newsk->sk_node);
1616                sock_lock_init(newsk);
1617                bh_lock_sock(newsk);
1618                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1619                newsk->sk_backlog.len = 0;
1620
1621                atomic_set(&newsk->sk_rmem_alloc, 0);
1622                /*
1623                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1624                 */
1625                atomic_set(&newsk->sk_wmem_alloc, 1);
1626                atomic_set(&newsk->sk_omem_alloc, 0);
1627                sk_init_common(newsk);
1628
1629                newsk->sk_dst_cache     = NULL;
1630                newsk->sk_dst_pending_confirm = 0;
1631                newsk->sk_wmem_queued   = 0;
1632                newsk->sk_forward_alloc = 0;
1633                atomic_set(&newsk->sk_drops, 0);
1634                newsk->sk_send_head     = NULL;
1635                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1636
1637                sock_reset_flag(newsk, SOCK_DONE);
1638
1639                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1640                if (filter != NULL)
1641                        /* Though it's an empty new sock, the charging may fail
1642                         * if sysctl_optmem_max was changed between the creation
1643                         * of the original socket and its cloning.
1644                         */
1645                        is_charged = sk_filter_charge(newsk, filter);
1646
1647                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1648                        /* We need to make sure that we don't uncharge the new
1649                         * socket if we couldn't charge it in the first place
1650                         * as otherwise we uncharge the parent's filter.
1651                         */
1652                        if (!is_charged)
1653                                RCU_INIT_POINTER(newsk->sk_filter, NULL);
1654                        sk_free_unlock_clone(newsk);
1655                        newsk = NULL;
1656                        goto out;
1657                }
1658                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1659
1660                newsk->sk_err      = 0;
1661                newsk->sk_err_soft = 0;
1662                newsk->sk_priority = 0;
1663                newsk->sk_incoming_cpu = raw_smp_processor_id();
1664                atomic64_set(&newsk->sk_cookie, 0);
1665
1666                mem_cgroup_sk_alloc(newsk);
1667                cgroup_sk_alloc(&newsk->sk_cgrp_data);
1668
1669                /*
1670                 * Before updating sk_refcnt, we must commit prior changes to memory
1671                 * (Documentation/RCU/rculist_nulls.txt for details)
1672                 */
1673                smp_wmb();
1674                atomic_set(&newsk->sk_refcnt, 2);
1675
1676                /*
1677                 * Increment the counter in the same struct proto as the master
1678                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1679                 * is the same as sk->sk_prot->socks, as this field was copied
1680                 * with memcpy).
1681                 *
1682                 * This _changes_ the previous behaviour, where
1683                 * tcp_create_openreq_child was always incrementing the
1684                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1685                 * to be taken into account in all callers. -acme
1686                 */
1687                sk_refcnt_debug_inc(newsk);
1688                sk_set_socket(newsk, NULL);
1689                newsk->sk_wq = NULL;
1690
1691                if (newsk->sk_prot->sockets_allocated)
1692                        sk_sockets_allocated_inc(newsk);
1693
1694                if (sock_needs_netstamp(sk) &&
1695                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1696                        net_enable_timestamp();
1697        }
1698out:
1699        return newsk;
1700}
1701EXPORT_SYMBOL_GPL(sk_clone_lock);
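/*
 * Editorial sketch (not part of the original source): sk_clone_lock() returns
 * the child locked, so the caller must unlock it on every path; if its own
 * initialisation fails it can use sk_free_unlock_clone() below. The function
 * name and my_init_child() are hypothetical.
 */
#if 0
static struct sock *my_clone_child(const struct sock *parent)
{
	struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);

	if (!child)
		return NULL;

	if (my_init_child(child)) {
		/* unlocks with bh_unlock_sock() and frees the raw copy */
		sk_free_unlock_clone(child);
		return NULL;
	}

	bh_unlock_sock(child);
	return child;
}
#endif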
1702
1703void sk_free_unlock_clone(struct sock *sk)
1704{
1705        /* It is still a raw copy of the parent, so invalidate
1706         * the destructor and do a plain sk_free() */
1707        sk->sk_destruct = NULL;
1708        bh_unlock_sock(sk);
1709        sk_free(sk);
1710}
1711EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1712
1713void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1714{
1715        u32 max_segs = 1;
1716
1717        sk_dst_set(sk, dst);
1718        sk->sk_route_caps = dst->dev->features;
1719        if (sk->sk_route_caps & NETIF_F_GSO)
1720                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1721        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1722        if (sk_can_gso(sk)) {
1723                if (dst->header_len) {
1724                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1725                } else {
1726                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1727                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1728                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1729                }
1730        }
1731        sk->sk_gso_max_segs = max_segs;
1732}
1733EXPORT_SYMBOL_GPL(sk_setup_caps);
1734
1735/*
1736 *      Simple resource managers for sockets.
1737 */
1738
1739
1740/*
1741 * Write buffer destructor automatically called from kfree_skb.
1742 */
1743void sock_wfree(struct sk_buff *skb)
1744{
1745        struct sock *sk = skb->sk;
1746        unsigned int len = skb->truesize;
1747
1748        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1749                /*
1750                 * Keep a reference on sk_wmem_alloc; it will be released
1751                 * after the sk_write_space() call
1752                 */
1753                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1754                sk->sk_write_space(sk);
1755                len = 1;
1756        }
1757        /*
1758         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1759         * could not do because of in-flight packets
1760         */
1761        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1762                __sk_free(sk);
1763}
1764EXPORT_SYMBOL(sock_wfree);
1765
1766/* This variant of sock_wfree() is used by TCP,
1767 * since it sets SOCK_USE_WRITE_QUEUE.
1768 */
1769void __sock_wfree(struct sk_buff *skb)
1770{
1771        struct sock *sk = skb->sk;
1772
1773        if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1774                __sk_free(sk);
1775}
1776
1777void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1778{
1779        skb_orphan(skb);
1780        skb->sk = sk;
1781#ifdef CONFIG_INET
1782        if (unlikely(!sk_fullsock(sk))) {
1783                skb->destructor = sock_edemux;
1784                sock_hold(sk);
1785                return;
1786        }
1787#endif
1788        skb->destructor = sock_wfree;
1789        skb_set_hash_from_sk(skb, sk);
1790        /*
1791         * We used to take a refcount on sk, but the following operation
1792         * is enough to guarantee sk_free() won't free this sock until
1793         * all in-flight packets are completed
1794         */
1795        atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1796}
1797EXPORT_SYMBOL(skb_set_owner_w);
1798
1799/* This helper is used by netem, as it can hold packets in its
1800 * delay queue. We want to allow the owner socket to send more
1801 * packets, as if they were already TX completed by a typical driver.
1802 * But we also want to keep skb->sk set because some packet schedulers
1803 * rely on it (sch_fq for example).
1804 */
1805void skb_orphan_partial(struct sk_buff *skb)
1806{
1807        if (skb_is_tcp_pure_ack(skb))
1808                return;
1809
1810        if (skb->destructor == sock_wfree
1811#ifdef CONFIG_INET
1812            || skb->destructor == tcp_wfree
1813#endif
1814                ) {
1815                struct sock *sk = skb->sk;
1816
1817                if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1818                        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1819                        skb->destructor = sock_efree;
1820                }
1821        } else {
1822                skb_orphan(skb);
1823        }
1824}
1825EXPORT_SYMBOL(skb_orphan_partial);
1826
1827/*
1828 * Read buffer destructor automatically called from kfree_skb.
1829 */
1830void sock_rfree(struct sk_buff *skb)
1831{
1832        struct sock *sk = skb->sk;
1833        unsigned int len = skb->truesize;
1834
1835        atomic_sub(len, &sk->sk_rmem_alloc);
1836        sk_mem_uncharge(sk, len);
1837}
1838EXPORT_SYMBOL(sock_rfree);
1839
1840/*
1841 * Buffer destructor for skbs that are not used directly in read or write
1842 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1843 */
1844void sock_efree(struct sk_buff *skb)
1845{
1846        sock_put(skb->sk);
1847}
1848EXPORT_SYMBOL(sock_efree);
1849
1850kuid_t sock_i_uid(struct sock *sk)
1851{
1852        kuid_t uid;
1853
1854        read_lock_bh(&sk->sk_callback_lock);
1855        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1856        read_unlock_bh(&sk->sk_callback_lock);
1857        return uid;
1858}
1859EXPORT_SYMBOL(sock_i_uid);
1860
1861unsigned long sock_i_ino(struct sock *sk)
1862{
1863        unsigned long ino;
1864
1865        read_lock_bh(&sk->sk_callback_lock);
1866        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1867        read_unlock_bh(&sk->sk_callback_lock);
1868        return ino;
1869}
1870EXPORT_SYMBOL(sock_i_ino);
1871
1872/*
1873 * Allocate an skb from the socket's send buffer.
1874 */
1875struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1876                             gfp_t priority)
1877{
1878        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1879                struct sk_buff *skb = alloc_skb(size, priority);
1880                if (skb) {
1881                        skb_set_owner_w(skb, sk);
1882                        return skb;
1883                }
1884        }
1885        return NULL;
1886}
1887EXPORT_SYMBOL(sock_wmalloc);
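/*
 * Editorial sketch (not part of the original source): a non-forced
 * sock_wmalloc() call fails once sk_wmem_alloc reaches sk_sndbuf, so a
 * hypothetical reply builder can rely on it for back-pressure. The skb is
 * charged to the socket and uncharged by sock_wfree() when it is freed.
 */
#if 0
static struct sk_buff *my_build_reply(struct sock *sk, unsigned int len)
{
	struct sk_buff *skb = sock_wmalloc(sk, len, 0, GFP_ATOMIC);

	if (!skb)
		return NULL;	/* send buffer full: caller should back off */

	skb_put(skb, len);	/* reserve the payload area */
	return skb;
}
#endif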
1888
1889/*
1890 * Allocate a memory block from the socket's option memory buffer.
1891 */
1892void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1893{
1894        if ((unsigned int)size <= sysctl_optmem_max &&
1895            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1896                void *mem;
1897                /* First do the add, to avoid the race if kmalloc
1898                 * might sleep.
1899                 */
1900                atomic_add(size, &sk->sk_omem_alloc);
1901                mem = kmalloc(size, priority);
1902                if (mem)
1903                        return mem;
1904                atomic_sub(size, &sk->sk_omem_alloc);
1905        }
1906        return NULL;
1907}
1908EXPORT_SYMBOL(sock_kmalloc);
1909
1910/* Free an option memory block. Note, we actually want the inline
1911 * here as this allows gcc to detect the nullify and fold away the
1912 * condition entirely.
1913 */
1914static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1915                                  const bool nullify)
1916{
1917        if (WARN_ON_ONCE(!mem))
1918                return;
1919        if (nullify)
1920                kzfree(mem);
1921        else
1922                kfree(mem);
1923        atomic_sub(size, &sk->sk_omem_alloc);
1924}
1925
1926void sock_kfree_s(struct sock *sk, void *mem, int size)
1927{
1928        __sock_kfree_s(sk, mem, size, false);
1929}
1930EXPORT_SYMBOL(sock_kfree_s);
1931
1932void sock_kzfree_s(struct sock *sk, void *mem, int size)
1933{
1934        __sock_kfree_s(sk, mem, size, true);
1935}
1936EXPORT_SYMBOL(sock_kzfree_s);
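/*
 * Editorial sketch (not part of the original source): sock_kmalloc() charges
 * the allocation to sk_omem_alloc, so the same size must be handed back to
 * sock_kfree_s()/sock_kzfree_s(). The setsockopt-style helper is hypothetical.
 */
#if 0
static int my_copy_secret_option(struct sock *sk, const char __user *optval,
				 int optlen)
{
	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
	int err = 0;

	if (!buf)
		return -ENOBUFS;

	if (copy_from_user(buf, optval, optlen))
		err = -EFAULT;

	/* the kzfree variant wipes the buffer before uncharging and freeing it */
	sock_kzfree_s(sk, buf, optlen);
	return err;
}
#endif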
1937
1938/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1939   I think these locks should be removed for datagram sockets.
1940 */
1941static long sock_wait_for_wmem(struct sock *sk, long timeo)
1942{
1943        DEFINE_WAIT(wait);
1944
1945        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1946        for (;;) {
1947                if (!timeo)
1948                        break;
1949                if (signal_pending(current))
1950                        break;
1951                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1952                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1953                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1954                        break;
1955                if (sk->sk_shutdown & SEND_SHUTDOWN)
1956                        break;
1957                if (sk->sk_err)
1958                        break;
1959                timeo = schedule_timeout(timeo);
1960        }
1961        finish_wait(sk_sleep(sk), &wait);
1962        return timeo;
1963}
1964
1965
1966/*
1967 *      Generic send/receive buffer handlers
1968 */
1969
1970struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1971                                     unsigned long data_len, int noblock,
1972                                     int *errcode, int max_page_order)
1973{
1974        struct sk_buff *skb;
1975        long timeo;
1976        int err;
1977
1978        timeo = sock_sndtimeo(sk, noblock);
1979        for (;;) {
1980                err = sock_error(sk);
1981                if (err != 0)
1982                        goto failure;
1983
1984                err = -EPIPE;
1985                if (sk->sk_shutdown & SEND_SHUTDOWN)
1986                        goto failure;
1987
1988                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1989                        break;
1990
1991                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1992                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1993                err = -EAGAIN;
1994                if (!timeo)
1995                        goto failure;
1996                if (signal_pending(current))
1997                        goto interrupted;
1998                timeo = sock_wait_for_wmem(sk, timeo);
1999        }
2000        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2001                                   errcode, sk->sk_allocation);
2002        if (skb)
2003                skb_set_owner_w(skb, sk);
2004        return skb;
2005
2006interrupted:
2007        err = sock_intr_errno(timeo);
2008failure:
2009        *errcode = err;
2010        return NULL;
2011}
2012EXPORT_SYMBOL(sock_alloc_send_pskb);
2013
2014struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2015                                    int noblock, int *errcode)
2016{
2017        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2018}
2019EXPORT_SYMBOL(sock_alloc_send_skb);
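/*
 * Editorial sketch (not part of the original source): a datagram-style
 * sendmsg path using sock_alloc_send_skb(), which blocks for up to the send
 * timeout unless MSG_DONTWAIT is set. MAX_HEADER headroom and the function
 * name are illustrative assumptions.
 */
#if 0
static int my_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;	/* -EAGAIN, -EPIPE, or a signal-derived error */

	skb_reserve(skb, MAX_HEADER);
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	/* ... hand the skb to the protocol's transmit path ... */
	return len;
}
#endif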
2020
2021int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2022                     struct sockcm_cookie *sockc)
2023{
2024        u32 tsflags;
2025
2026        switch (cmsg->cmsg_type) {
2027        case SO_MARK:
2028                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2029                        return -EPERM;
2030                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2031                        return -EINVAL;
2032                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2033                break;
2034        case SO_TIMESTAMPING:
2035                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2036                        return -EINVAL;
2037
2038                tsflags = *(u32 *)CMSG_DATA(cmsg);
2039                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2040                        return -EINVAL;
2041
2042                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2043                sockc->tsflags |= tsflags;
2044                break;
2045        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2046        case SCM_RIGHTS:
2047        case SCM_CREDENTIALS:
2048                break;
2049        default:
2050                return -EINVAL;
2051        }
2052        return 0;
2053}
2054EXPORT_SYMBOL(__sock_cmsg_send);
2055
2056int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2057                   struct sockcm_cookie *sockc)
2058{
2059        struct cmsghdr *cmsg;
2060        int ret;
2061
2062        for_each_cmsghdr(cmsg, msg) {
2063                if (!CMSG_OK(msg, cmsg))
2064                        return -EINVAL;
2065                if (cmsg->cmsg_level != SOL_SOCKET)
2066                        continue;
2067                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2068                if (ret)
2069                        return ret;
2070        }
2071        return 0;
2072}
2073EXPORT_SYMBOL(sock_cmsg_send);
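/*
 * Editorial sketch (not part of the original source): senders typically seed
 * a sockcm_cookie with per-socket defaults and then let SOL_SOCKET control
 * messages override them via sock_cmsg_send(). The helper name is
 * hypothetical; the sk_mark/sk_tsflags defaults mirror common datagram paths.
 */
#if 0
static int my_parse_cmsgs(struct sock *sk, struct msghdr *msg,
			  struct sockcm_cookie *sockc)
{
	sockc->mark = sk->sk_mark;		/* per-socket defaults */
	sockc->tsflags = sk->sk_tsflags;

	if (!msg->msg_controllen)
		return 0;

	/* May update sockc->mark (SO_MARK) and sockc->tsflags
	 * (SO_TIMESTAMPING) for this call only.
	 */
	return sock_cmsg_send(sk, msg, sockc);
}
#endif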
2074
2075/* On 32bit arches, an skb frag is limited to 2^15 */
2076#define SKB_FRAG_PAGE_ORDER     get_order(32768)
2077
2078/**
2079 * skb_page_frag_refill - check that a page_frag contains enough room
2080 * @sz: minimum size of the fragment we want to get
2081 * @pfrag: pointer to page_frag
2082 * @gfp: priority for memory allocation
2083 *
2084 * Note: While this allocator tries to use high order pages, there is
2085 * no guarantee that allocations succeed. Therefore, @sz MUST be
2086 * less than or equal to PAGE_SIZE.
2087 */
2088bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2089{
2090        if (pfrag->page) {
2091                if (page_ref_count(pfrag->page) == 1) {
2092                        pfrag->offset = 0;
2093                        return true;
2094                }
2095                if (pfrag->offset + sz <= pfrag->size)
2096                        return true;
2097                put_page(pfrag->page);
2098        }
2099
2100        pfrag->offset = 0;
2101        if (SKB_FRAG_PAGE_ORDER) {
2102                /* Avoid direct reclaim but allow kswapd to wake */
2103                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2104                                          __GFP_COMP | __GFP_NOWARN |
2105                                          __GFP_NORETRY,
2106                                          SKB_FRAG_PAGE_ORDER);
2107                if (likely(pfrag->page)) {
2108                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2109                        return true;
2110                }
2111        }
2112        pfrag->page = alloc_page(gfp);
2113        if (likely(pfrag->page)) {
2114                pfrag->size = PAGE_SIZE;
2115                return true;
2116        }
2117        return false;
2118}
2119EXPORT_SYMBOL(skb_page_frag_refill);
2120
2121bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2122{
2123        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2124                return true;
2125
2126        sk_enter_memory_pressure(sk);
2127        sk_stream_moderate_sndbuf(sk);
2128        return false;
2129}
2130EXPORT_SYMBOL(sk_page_frag_refill);
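/*
 * Editorial sketch (not part of the original source): the usual caller
 * pattern refills the per-socket page_frag, takes its own page reference and
 * advances the producer offset before copying payload in. The helper below
 * is hypothetical and omits the copy itself.
 */
#if 0
static int my_reserve_frag(struct sock *sk, unsigned int copy,
			   struct page **page, unsigned int *off)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;	/* memory pressure already signalled */

	*page = pfrag->page;
	*off = pfrag->offset;
	get_page(pfrag->page);		/* the new user keeps its own reference */
	pfrag->offset += copy;
	return 0;
}
#endif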
2131
2132static void __lock_sock(struct sock *sk)
2133        __releases(&sk->sk_lock.slock)
2134        __acquires(&sk->sk_lock.slock)
2135{
2136        DEFINE_WAIT(wait);
2137
2138        for (;;) {
2139                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2140                                        TASK_UNINTERRUPTIBLE);
2141                spin_unlock_bh(&sk->sk_lock.slock);
2142                schedule();
2143                spin_lock_bh(&sk->sk_lock.slock);
2144                if (!sock_owned_by_user(sk))
2145                        break;
2146        }
2147        finish_wait(&sk->sk_lock.wq, &wait);
2148}
2149
2150static void __release_sock(struct sock *sk)
2151        __releases(&sk->sk_lock.slock)
2152        __acquires(&sk->sk_lock.slock)
2153{
2154        struct sk_buff *skb, *next;
2155
2156        while ((skb = sk->sk_backlog.head) != NULL) {
2157                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2158
2159                spin_unlock_bh(&sk->sk_lock.slock);
2160
2161                do {
2162                        next = skb->next;
2163                        prefetch(next);
2164                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2165                        skb->next = NULL;
2166                        sk_backlog_rcv(sk, skb);
2167
2168                        cond_resched();
2169
2170                        skb = next;
2171                } while (skb != NULL);
2172
2173                spin_lock_bh(&sk->sk_lock.slock);
2174        }
2175
2176        /*
2177         * Doing the zeroing here guarantees we cannot loop forever
2178         * while a wild producer attempts to flood us.
2179         */
2180        sk->sk_backlog.len = 0;
2181}
2182
2183void __sk_flush_backlog(struct sock *sk)
2184{
2185        spin_lock_bh(&sk->sk_lock.slock);
2186        __release_sock(sk);
2187        spin_unlock_bh(&sk->sk_lock.slock);
2188}
2189
2190/**
2191 * sk_wait_data - wait for data to arrive at sk_receive_queue
2192 * @sk:    sock to wait on
2193 * @timeo: for how long
2194 * @skb:   last skb seen on sk_receive_queue
2195 *
2196 * Socket state, including sk->sk_err, is changed only under the lock,
2197 * hence we may omit checks after joining the wait queue.
2198 * We check the receive queue before schedule() only as an optimization;
2199 * it is very likely that release_sock() added new data.
2200 */
2201int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2202{
2203        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2204        int rc;
2205
2206        add_wait_queue(sk_sleep(sk), &wait);
2207        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2208        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2209        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2210        remove_wait_queue(sk_sleep(sk), &wait);
2211        return rc;
2212}
2213EXPORT_SYMBOL(sk_wait_data);
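/*
 * Editorial sketch (not part of the original source): a stripped-down receive
 * loop built around sk_wait_data(). The socket lock must be held; real paths
 * also check sk_err and sk_shutdown. The function name is hypothetical.
 */
#if 0
static struct sk_buff *my_wait_for_skb(struct sock *sk, int noblock, int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		/* sleeps until the queue tail changes or the timeout expires */
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}
#endif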
2214
2215/**
2216 *      __sk_mem_raise_allocated - increase memory_allocated
2217 *      @sk: socket
2218 *      @size: memory size to allocate
2219 *      @amt: pages to allocate
2220 *      @kind: allocation type
2221 *
2222 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2223 */
2224int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2225{
2226        struct proto *prot = sk->sk_prot;
2227        long allocated = sk_memory_allocated_add(sk, amt);
2228
2229        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2230            !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2231                goto suppress_allocation;
2232
2233        /* Under limit. */
2234        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2235                sk_leave_memory_pressure(sk);
2236                return 1;
2237        }
2238
2239        /* Under pressure. */
2240        if (allocated > sk_prot_mem_limits(sk, 1))
2241                sk_enter_memory_pressure(sk);
2242
2243        /* Over hard limit. */
2244        if (allocated > sk_prot_mem_limits(sk, 2))
2245                goto suppress_allocation;
2246
2247        /* guarantee minimum buffer size under pressure */
2248        if (kind == SK_MEM_RECV) {
2249                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2250                        return 1;
2251
2252        } else { /* SK_MEM_SEND */
2253                if (sk->sk_type == SOCK_STREAM) {
2254                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2255                                return 1;
2256                } else if (atomic_read(&sk->sk_wmem_alloc) <
2257                           prot->sysctl_wmem[0])
2258                                return 1;
2259        }
2260
2261        if (sk_has_memory_pressure(sk)) {
2262                int alloc;
2263
2264                if (!sk_under_memory_pressure(sk))
2265                        return 1;
2266                alloc = sk_sockets_allocated_read_positive(sk);
2267                if (sk_prot_mem_limits(sk, 2) > alloc *
2268                    sk_mem_pages(sk->sk_wmem_queued +
2269                                 atomic_read(&sk->sk_rmem_alloc) +
2270                                 sk->sk_forward_alloc))
2271                        return 1;
2272        }
2273
2274suppress_allocation:
2275
2276        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2277                sk_stream_moderate_sndbuf(sk);
2278
2279                /* Fail only if socket is _under_ its sndbuf.
2280                 * In this case we cannot block, so we have to fail.
2281                 */
2282                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2283                        return 1;
2284        }
2285
2286        trace_sock_exceed_buf_limit(sk, prot, allocated);
2287
2288        sk_memory_allocated_sub(sk, amt);
2289
2290        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2291                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2292
2293        return 0;
2294}
2295EXPORT_SYMBOL(__sk_mem_raise_allocated);
2296
2297/**
2298 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2299 *      @sk: socket
2300 *      @size: memory size to allocate
2301 *      @kind: allocation type
2302 *
2303 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2304 *      rmem allocation. This function assumes that protocols which have
2305 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2306 */
2307int __sk_mem_schedule(struct sock *sk, int size, int kind)
2308{
2309        int ret, amt = sk_mem_pages(size);
2310
2311        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2312        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2313        if (!ret)
2314                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2315        return ret;
2316}
2317EXPORT_SYMBOL(__sk_mem_schedule);
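/*
 * Editorial sketch (not part of the original source): how a write path might
 * use the schedule/charge pair, with worked numbers assuming 4 KiB pages
 * (SK_MEM_QUANTUM == PAGE_SIZE). The helper name is hypothetical.
 */
#if 0
static int my_charge_skb(struct sock *sk, struct sk_buff *skb)
{
	/* For truesize == 3000: sk_mem_pages(3000) == 1, so memory_allocated
	 * and sk_forward_alloc grow by one 4096-byte quantum, of which
	 * sk_mem_charge() below consumes 3000, leaving 1096 pre-charged.
	 */
	if (!sk_wmem_schedule(sk, skb->truesize))
		return -ENOBUFS;

	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	return 0;
}
#endif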
2318
2319/**
2320 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2321 *      @sk: socket
2322 *      @amount: number of quanta
2323 *
2324 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2325 */
2326void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2327{
2328        sk_memory_allocated_sub(sk, amount);
2329
2330        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2331                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2332
2333        if (sk_under_memory_pressure(sk) &&
2334            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2335                sk_leave_memory_pressure(sk);
2336}
2337EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2338
2339/**
2340 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2341 *      @sk: socket
2342 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2343 */
2344void __sk_mem_reclaim(struct sock *sk, int amount)
2345{
2346        amount >>= SK_MEM_QUANTUM_SHIFT;
2347        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2348        __sk_mem_reduce_allocated(sk, amount);
2349}
2350EXPORT_SYMBOL(__sk_mem_reclaim);
2351
2352int sk_set_peek_off(struct sock *sk, int val)
2353{
2354        if (val < 0)
2355                return -EINVAL;
2356
2357        sk->sk_peek_off = val;
2358        return 0;
2359}
2360EXPORT_SYMBOL_GPL(sk_set_peek_off);
2361
2362/*
2363 * Set of default routines for initialising struct proto_ops when
2364 * the protocol does not support a particular function. In certain
2365 * cases where it makes no sense for a protocol to have a "do nothing"
2366 * function, some default processing is provided.
2367 */
2368
2369int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2370{
2371        return -EOPNOTSUPP;
2372}
2373EXPORT_SYMBOL(sock_no_bind);
2374
2375int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2376                    int len, int flags)
2377{
2378        return -EOPNOTSUPP;
2379}
2380EXPORT_SYMBOL(sock_no_connect);
2381
2382int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2383{
2384        return -EOPNOTSUPP;
2385}
2386EXPORT_SYMBOL(sock_no_socketpair);
2387
2388int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2389                   bool kern)
2390{
2391        return -EOPNOTSUPP;
2392}
2393EXPORT_SYMBOL(sock_no_accept);
2394
2395int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2396                    int *len, int peer)
2397{
2398        return -EOPNOTSUPP;
2399}
2400EXPORT_SYMBOL(sock_no_getname);
2401
2402unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2403{
2404        return 0;
2405}
2406EXPORT_SYMBOL(sock_no_poll);
2407
2408int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2409{
2410        return -EOPNOTSUPP;
2411}
2412EXPORT_SYMBOL(sock_no_ioctl);
2413
2414int sock_no_listen(struct socket *sock, int backlog)
2415{
2416        return -EOPNOTSUPP;
2417}
2418EXPORT_SYMBOL(sock_no_listen);
2419
2420int sock_no_shutdown(struct socket *sock, int how)
2421{
2422        return -EOPNOTSUPP;
2423}
2424EXPORT_SYMBOL(sock_no_shutdown);
2425
2426int sock_no_setsockopt(struct socket *sock, int level, int optname,
2427                    char __user *optval, unsigned int optlen)
2428{
2429        return -EOPNOTSUPP;
2430}
2431EXPORT_SYMBOL(sock_no_setsockopt);
2432
2433int sock_no_getsockopt(struct socket *sock, int level, int optname,
2434                    char __user *optval, int __user *optlen)
2435{
2436        return -EOPNOTSUPP;
2437}
2438EXPORT_SYMBOL(sock_no_getsockopt);
2439
2440int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2441{
2442        return -EOPNOTSUPP;
2443}
2444EXPORT_SYMBOL(sock_no_sendmsg);
2445
2446int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2447                    int flags)
2448{
2449        return -EOPNOTSUPP;
2450}
2451EXPORT_SYMBOL(sock_no_recvmsg);
2452
2453int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2454{
2455        /* Mirror missing mmap method error code */
2456        return -ENODEV;
2457}
2458EXPORT_SYMBOL(sock_no_mmap);
2459
2460ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2461{
2462        ssize_t res;
2463        struct msghdr msg = {.msg_flags = flags};
2464        struct kvec iov;
2465        char *kaddr = kmap(page);
2466        iov.iov_base = kaddr + offset;
2467        iov.iov_len = size;
2468        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2469        kunmap(page);
2470        return res;
2471}
2472EXPORT_SYMBOL(sock_no_sendpage);
2473
2474/*
2475 *      Default Socket Callbacks
2476 */
2477
2478static void sock_def_wakeup(struct sock *sk)
2479{
2480        struct socket_wq *wq;
2481
2482        rcu_read_lock();
2483        wq = rcu_dereference(sk->sk_wq);
2484        if (skwq_has_sleeper(wq))
2485                wake_up_interruptible_all(&wq->wait);
2486        rcu_read_unlock();
2487}
2488
2489static void sock_def_error_report(struct sock *sk)
2490{
2491        struct socket_wq *wq;
2492
2493        rcu_read_lock();
2494        wq = rcu_dereference(sk->sk_wq);
2495        if (skwq_has_sleeper(wq))
2496                wake_up_interruptible_poll(&wq->wait, POLLERR);
2497        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2498        rcu_read_unlock();
2499}
2500
2501static void sock_def_readable(struct sock *sk)
2502{
2503        struct socket_wq *wq;
2504
2505        rcu_read_lock();
2506        wq = rcu_dereference(sk->sk_wq);
2507        if (skwq_has_sleeper(wq))
2508                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2509                                                POLLRDNORM | POLLRDBAND);
2510        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2511        rcu_read_unlock();
2512}
2513
2514static void sock_def_write_space(struct sock *sk)
2515{
2516        struct socket_wq *wq;
2517
2518        rcu_read_lock();
2519
2520        /* Do not wake up a writer until he can make "significant"
2521         * progress.  --DaveM
2522         */
2523        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2524                wq = rcu_dereference(sk->sk_wq);
2525                if (skwq_has_sleeper(wq))
2526                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2527                                                POLLWRNORM | POLLWRBAND);
2528
2529                /* Should agree with poll, otherwise some programs break */
2530                if (sock_writeable(sk))
2531                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2532        }
2533
2534        rcu_read_unlock();
2535}
2536
2537static void sock_def_destruct(struct sock *sk)
2538{
2539}
2540
2541void sk_send_sigurg(struct sock *sk)
2542{
2543        if (sk->sk_socket && sk->sk_socket->file)
2544                if (send_sigurg(&sk->sk_socket->file->f_owner))
2545                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2546}
2547EXPORT_SYMBOL(sk_send_sigurg);
2548
2549void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2550                    unsigned long expires)
2551{
2552        if (!mod_timer(timer, expires))
2553                sock_hold(sk);
2554}
2555EXPORT_SYMBOL(sk_reset_timer);
2556
2557void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2558{
2559        if (del_timer(timer))
2560                __sock_put(sk);
2561}
2562EXPORT_SYMBOL(sk_stop_timer);
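/*
 * Editorial sketch (not part of the original source): sk_reset_timer() takes
 * a reference on the sock when it arms a previously idle timer, so the
 * handler drops it with sock_put(), and sk_stop_timer() drops it if the
 * timer was still pending. Both functions are hypothetical; the unsigned
 * long argument matches the init_timer()-era callback signature used here.
 */
#if 0
static void my_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* ... protocol work, possibly rearming via sk_reset_timer() ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* pairs with the sock_hold() in sk_reset_timer() */
}

static void my_arm_timer(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}
#endif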
2563
2564void sock_init_data(struct socket *sock, struct sock *sk)
2565{
2566        sk_init_common(sk);
2567        sk->sk_send_head        =       NULL;
2568
2569        init_timer(&sk->sk_timer);
2570
2571        sk->sk_allocation       =       GFP_KERNEL;
2572        sk->sk_rcvbuf           =       sysctl_rmem_default;
2573        sk->sk_sndbuf           =       sysctl_wmem_default;
2574        sk->sk_state            =       TCP_CLOSE;
2575        sk_set_socket(sk, sock);
2576
2577        sock_set_flag(sk, SOCK_ZAPPED);
2578
2579        if (sock) {
2580                sk->sk_type     =       sock->type;
2581                sk->sk_wq       =       sock->wq;
2582                sock->sk        =       sk;
2583                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2584        } else {
2585                sk->sk_wq       =       NULL;
2586                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2587        }
2588
2589        rwlock_init(&sk->sk_callback_lock);
2590        if (sk->sk_kern_sock)
2591                lockdep_set_class_and_name(
2592                        &sk->sk_callback_lock,
2593                        af_kern_callback_keys + sk->sk_family,
2594                        af_family_kern_clock_key_strings[sk->sk_family]);
2595        else
2596                lockdep_set_class_and_name(
2597                        &sk->sk_callback_lock,
2598                        af_callback_keys + sk->sk_family,
2599                        af_family_clock_key_strings[sk->sk_family]);
2600
2601        sk->sk_state_change     =       sock_def_wakeup;
2602        sk->sk_data_ready       =       sock_def_readable;
2603        sk->sk_write_space      =       sock_def_write_space;
2604        sk->sk_error_report     =       sock_def_error_report;
2605        sk->sk_destruct         =       sock_def_destruct;
2606
2607        sk->sk_frag.page        =       NULL;
2608        sk->sk_frag.offset      =       0;
2609        sk->sk_peek_off         =       -1;
2610
2611        sk->sk_peer_pid         =       NULL;
2612        sk->sk_peer_cred        =       NULL;
2613        sk->sk_write_pending    =       0;
2614        sk->sk_rcvlowat         =       1;
2615        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2616        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2617
2618        sk->sk_stamp = SK_DEFAULT_STAMP;
2619
2620#ifdef CONFIG_NET_RX_BUSY_POLL
2621        sk->sk_napi_id          =       0;
2622        sk->sk_ll_usec          =       sysctl_net_busy_read;
2623#endif
2624
2625        sk->sk_max_pacing_rate = ~0U;
2626        sk->sk_pacing_rate = ~0U;
2627        sk->sk_incoming_cpu = -1;
2628        /*
2629         * Before updating sk_refcnt, we must commit prior changes to memory
2630         * (Documentation/RCU/rculist_nulls.txt for details)
2631         */
2632        smp_wmb();
2633        atomic_set(&sk->sk_refcnt, 1);
2634        atomic_set(&sk->sk_drops, 0);
2635}
2636EXPORT_SYMBOL(sock_init_data);
2637
2638void lock_sock_nested(struct sock *sk, int subclass)
2639{
2640        might_sleep();
2641        spin_lock_bh(&sk->sk_lock.slock);
2642        if (sk->sk_lock.owned)
2643                __lock_sock(sk);
2644        sk->sk_lock.owned = 1;
2645        spin_unlock(&sk->sk_lock.slock);
2646        /*
2647         * The sk_lock has mutex_lock() semantics here:
2648         */
2649        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2650        local_bh_enable();
2651}
2652EXPORT_SYMBOL(lock_sock_nested);
2653
2654void release_sock(struct sock *sk)
2655{
2656        spin_lock_bh(&sk->sk_lock.slock);
2657        if (sk->sk_backlog.tail)
2658                __release_sock(sk);
2659
2660        /* Warning : release_cb() might need to release sk ownership,
2661         * i.e. call sock_release_ownership(sk) before us.
2662         */
2663        if (sk->sk_prot->release_cb)
2664                sk->sk_prot->release_cb(sk);
2665
2666        sock_release_ownership(sk);
2667        if (waitqueue_active(&sk->sk_lock.wq))
2668                wake_up(&sk->sk_lock.wq);
2669        spin_unlock_bh(&sk->sk_lock.slock);
2670}
2671EXPORT_SYMBOL(release_sock);
2672
2673/**
2674 * lock_sock_fast - fast version of lock_sock
2675 * @sk: socket
2676 *
2677 * This version should be used for very small sections, where the process won't block.
2678 * Returns false if the fast path is taken:
2679 *   sk_lock.slock locked, owned = 0, BH disabled
2680 * Returns true if the slow path is taken:
2681 *   sk_lock.slock unlocked, owned = 1, BH enabled
2682 */
2683bool lock_sock_fast(struct sock *sk)
2684{
2685        might_sleep();
2686        spin_lock_bh(&sk->sk_lock.slock);
2687
2688        if (!sk->sk_lock.owned)
2689                /*
2690                 * Note : We must disable BH
2691                 */
2692                return false;
2693
2694        __lock_sock(sk);
2695        sk->sk_lock.owned = 1;
2696        spin_unlock(&sk->sk_lock.slock);
2697        /*
2698         * The sk_lock has mutex_lock() semantics here:
2699         */
2700        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2701        local_bh_enable();
2702        return true;
2703}
2704EXPORT_SYMBOL(lock_sock_fast);
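/*
 * Editorial sketch (not part of the original source): the intended pairing
 * with unlock_sock_fast(), which undoes whichever path lock_sock_fast()
 * took. The reclaim call is just an example of a short critical section.
 */
#if 0
static void my_short_section(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	sk_mem_reclaim_partial(sk);	/* brief, non-blocking work only */
	unlock_sock_fast(sk, slow);
}
#endif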
2705
2706int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2707{
2708        struct timeval tv;
2709        if (!sock_flag(sk, SOCK_TIMESTAMP))
2710                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2711        tv = ktime_to_timeval(sk->sk_stamp);
2712        if (tv.tv_sec == -1)
2713                return -ENOENT;
2714        if (tv.tv_sec == 0) {
2715                sk->sk_stamp = ktime_get_real();
2716                tv = ktime_to_timeval(sk->sk_stamp);
2717        }
2718        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2719}
2720EXPORT_SYMBOL(sock_get_timestamp);
2721
2722int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2723{
2724        struct timespec ts;
2725        if (!sock_flag(sk, SOCK_TIMESTAMP))
2726                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2727        ts = ktime_to_timespec(sk->sk_stamp);
2728        if (ts.tv_sec == -1)
2729                return -ENOENT;
2730        if (ts.tv_sec == 0) {
2731                sk->sk_stamp = ktime_get_real();
2732                ts = ktime_to_timespec(sk->sk_stamp);
2733        }
2734        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2735}
2736EXPORT_SYMBOL(sock_get_timestampns);
2737
2738void sock_enable_timestamp(struct sock *sk, int flag)
2739{
2740        if (!sock_flag(sk, flag)) {
2741                unsigned long previous_flags = sk->sk_flags;
2742
2743                sock_set_flag(sk, flag);
2744                /*
2745                 * we just set one of the two flags which require net
2746                 * time stamping, but time stamping might have been on
2747                 * already because of the other one
2748                 */
2749                if (sock_needs_netstamp(sk) &&
2750                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2751                        net_enable_timestamp();
2752        }
2753}
2754
2755int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2756                       int level, int type)
2757{
2758        struct sock_exterr_skb *serr;
2759        struct sk_buff *skb;
2760        int copied, err;
2761
2762        err = -EAGAIN;
2763        skb = sock_dequeue_err_skb(sk);
2764        if (skb == NULL)
2765                goto out;
2766
2767        copied = skb->len;
2768        if (copied > len) {
2769                msg->msg_flags |= MSG_TRUNC;
2770                copied = len;
2771        }
2772        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2773        if (err)
2774                goto out_free_skb;
2775
2776        sock_recv_timestamp(msg, sk, skb);
2777
2778        serr = SKB_EXT_ERR(skb);
2779        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2780
2781        msg->msg_flags |= MSG_ERRQUEUE;
2782        err = copied;
2783
2784out_free_skb:
2785        kfree_skb(skb);
2786out:
2787        return err;
2788}
2789EXPORT_SYMBOL(sock_recv_errqueue);
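/*
 * Editorial sketch (not part of the original source): a fragment from a
 * hypothetical recvmsg handler that branches to sock_recv_errqueue() when
 * userspace asks for MSG_ERRQUEUE. The SOL_IP/IP_RECVERR level/type pair is
 * an illustrative choice; each protocol passes its own values.
 */
#if 0
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
#endif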
2790
2791/*
2792 *      Get a socket option on a socket.
2793 *
2794 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2795 *      asynchronous errors should be reported by getsockopt. We assume
2796 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2797 */
2798int sock_common_getsockopt(struct socket *sock, int level, int optname,
2799                           char __user *optval, int __user *optlen)
2800{
2801        struct sock *sk = sock->sk;
2802
2803        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2804}
2805EXPORT_SYMBOL(sock_common_getsockopt);
2806
2807#ifdef CONFIG_COMPAT
2808int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2809                                  char __user *optval, int __user *optlen)
2810{
2811        struct sock *sk = sock->sk;
2812
2813        if (sk->sk_prot->compat_getsockopt != NULL)
2814                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2815                                                      optval, optlen);
2816        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2817}
2818EXPORT_SYMBOL(compat_sock_common_getsockopt);
2819#endif
2820
2821int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2822                        int flags)
2823{
2824        struct sock *sk = sock->sk;
2825        int addr_len = 0;
2826        int err;
2827
2828        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2829                                   flags & ~MSG_DONTWAIT, &addr_len);
2830        if (err >= 0)
2831                msg->msg_namelen = addr_len;
2832        return err;
2833}
2834EXPORT_SYMBOL(sock_common_recvmsg);
2835
2836/*
2837 *      Set socket options on an inet socket.
2838 */
2839int sock_common_setsockopt(struct socket *sock, int level, int optname,
2840                           char __user *optval, unsigned int optlen)
2841{
2842        struct sock *sk = sock->sk;
2843
2844        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2845}
2846EXPORT_SYMBOL(sock_common_setsockopt);
2847
2848#ifdef CONFIG_COMPAT
2849int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2850                                  char __user *optval, unsigned int optlen)
2851{
2852        struct sock *sk = sock->sk;
2853
2854        if (sk->sk_prot->compat_setsockopt != NULL)
2855                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2856                                                      optval, optlen);
2857        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2858}
2859EXPORT_SYMBOL(compat_sock_common_setsockopt);
2860#endif
2861
2862void sk_common_release(struct sock *sk)
2863{
2864        if (sk->sk_prot->destroy)
2865                sk->sk_prot->destroy(sk);
2866
2867        /*
2868         * Observation: when sock_common_release is called, processes have
2869         * no access to the socket, but the networking stack still does.
2870         * Step one, detach it from networking:
2871         *
2872         * A. Remove from hash tables.
2873         */
2874
2875        sk->sk_prot->unhash(sk);
2876
2877        /*
2878         * At this point the socket cannot receive new packets, but it is possible
2879         * that some packets are in flight because some CPU runs the receiver and
2880         * did a hash table lookup before we unhashed the socket. They will reach
2881         * the receive queue and will be purged by the socket destructor.
2882         *
2883         * Also, we still have packets pending on the receive queue and probably
2884         * our own packets waiting in device queues. sock_destroy will drain the
2885         * receive queue, but transmitted packets will delay socket destruction
2886         * until the last reference is released.
2887         */
2888
2889        sock_orphan(sk);
2890
2891        xfrm_sk_free_policy(sk);
2892
2893        sk_refcnt_debug_release(sk);
2894
2895        sock_put(sk);
2896}
2897EXPORT_SYMBOL(sk_common_release);
2898
2899void sk_get_meminfo(const struct sock *sk, u32 *mem)
2900{
2901        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
2902
2903        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
2904        mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
2905        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
2906        mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
2907        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
2908        mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
2909        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
2910        mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
2911        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
2912}
2913
2914#ifdef CONFIG_PROC_FS
2915#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2916struct prot_inuse {
2917        int val[PROTO_INUSE_NR];
2918};
2919
2920static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2921
2922#ifdef CONFIG_NET_NS
2923void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2924{
2925        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2926}
2927EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2928
2929int sock_prot_inuse_get(struct net *net, struct proto *prot)
2930{
2931        int cpu, idx = prot->inuse_idx;
2932        int res = 0;
2933
2934        for_each_possible_cpu(cpu)
2935                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2936
2937        return res >= 0 ? res : 0;
2938}
2939EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2940
2941static int __net_init sock_inuse_init_net(struct net *net)
2942{
2943        net->core.inuse = alloc_percpu(struct prot_inuse);
2944        return net->core.inuse ? 0 : -ENOMEM;
2945}
2946
2947static void __net_exit sock_inuse_exit_net(struct net *net)
2948{
2949        free_percpu(net->core.inuse);
2950}
2951
2952static struct pernet_operations net_inuse_ops = {
2953        .init = sock_inuse_init_net,
2954        .exit = sock_inuse_exit_net,
2955};
2956
2957static __init int net_inuse_init(void)
2958{
2959        if (register_pernet_subsys(&net_inuse_ops))
2960                panic("Cannot initialize net inuse counters");
2961
2962        return 0;
2963}
2964
2965core_initcall(net_inuse_init);
2966#else
2967static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2968
2969void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2970{
2971        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2972}
2973EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2974
2975int sock_prot_inuse_get(struct net *net, struct proto *prot)
2976{
2977        int cpu, idx = prot->inuse_idx;
2978        int res = 0;
2979
2980        for_each_possible_cpu(cpu)
2981                res += per_cpu(prot_inuse, cpu).val[idx];
2982
2983        return res >= 0 ? res : 0;
2984}
2985EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2986#endif
2987
2988static void assign_proto_idx(struct proto *prot)
2989{
2990        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2991
2992        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2993                pr_err("PROTO_INUSE_NR exhausted\n");
2994                return;
2995        }
2996
2997        set_bit(prot->inuse_idx, proto_inuse_idx);
2998}
2999
3000static void release_proto_idx(struct proto *prot)
3001{
3002        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3003                clear_bit(prot->inuse_idx, proto_inuse_idx);
3004}
3005#else
3006static inline void assign_proto_idx(struct proto *prot)
3007{
3008}
3009
3010static inline void release_proto_idx(struct proto *prot)
3011{
3012}
3013#endif
3014
3015static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3016{
3017        if (!rsk_prot)
3018                return;
3019        kfree(rsk_prot->slab_name);
3020        rsk_prot->slab_name = NULL;
3021        kmem_cache_destroy(rsk_prot->slab);
3022        rsk_prot->slab = NULL;
3023}
3024
3025static int req_prot_init(const struct proto *prot)
3026{
3027        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3028
3029        if (!rsk_prot)
3030                return 0;
3031
3032        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3033                                        prot->name);
3034        if (!rsk_prot->slab_name)
3035                return -ENOMEM;
3036
3037        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3038                                           rsk_prot->obj_size, 0,
3039                                           prot->slab_flags, NULL);
3040
3041        if (!rsk_prot->slab) {
3042                pr_crit("%s: Can't create request sock SLAB cache!\n",
3043                        prot->name);
3044                return -ENOMEM;
3045        }
3046        return 0;
3047}
3048
3049int proto_register(struct proto *prot, int alloc_slab)
3050{
3051        if (alloc_slab) {
3052                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3053                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
3054                                        NULL);
3055
3056                if (prot->slab == NULL) {
3057                        pr_crit("%s: Can't create sock SLAB cache!\n",
3058                                prot->name);
3059                        goto out;
3060                }
3061
3062                if (req_prot_init(prot))
3063                        goto out_free_request_sock_slab;
3064
3065                if (prot->twsk_prot != NULL) {
3066                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3067
3068                        if (prot->twsk_prot->twsk_slab_name == NULL)
3069                                goto out_free_request_sock_slab;
3070
3071                        prot->twsk_prot->twsk_slab =
3072                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3073                                                  prot->twsk_prot->twsk_obj_size,
3074                                                  0,
3075                                                  prot->slab_flags,
3076                                                  NULL);
3077                        if (prot->twsk_prot->twsk_slab == NULL)
3078                                goto out_free_timewait_sock_slab_name;
3079                }
3080        }
3081
3082        mutex_lock(&proto_list_mutex);
3083        list_add(&prot->node, &proto_list);
3084        assign_proto_idx(prot);
3085        mutex_unlock(&proto_list_mutex);
3086        return 0;
3087
3088out_free_timewait_sock_slab_name:
3089        kfree(prot->twsk_prot->twsk_slab_name);
3090out_free_request_sock_slab:
3091        req_prot_cleanup(prot->rsk_prot);
3092
3093        kmem_cache_destroy(prot->slab);
3094        prot->slab = NULL;
3095out:
3096        return -ENOBUFS;
3097}
3098EXPORT_SYMBOL(proto_register);
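/*
 * Editorial sketch (not part of the original source): the minimum a module
 * needs to register and unregister a protocol. Passing alloc_slab == 1 also
 * creates the per-protocol kmem cache used by sk_prot_alloc(). Names are
 * hypothetical; real protocols use a larger, protocol-specific obj_size.
 */
#if 0
static struct proto my_proto = {
	.name	  = "MYPROTO",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int __init my_proto_init(void)
{
	return proto_register(&my_proto, 1);	/* -ENOBUFS on cache failure */
}

static void __exit my_proto_exit(void)
{
	proto_unregister(&my_proto);
}
#endif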
3099
3100void proto_unregister(struct proto *prot)
3101{
3102        mutex_lock(&proto_list_mutex);
3103        release_proto_idx(prot);
3104        list_del(&prot->node);
3105        mutex_unlock(&proto_list_mutex);
3106
3107        kmem_cache_destroy(prot->slab);
3108        prot->slab = NULL;
3109
3110        req_prot_cleanup(prot->rsk_prot);
3111
3112        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3113                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3114                kfree(prot->twsk_prot->twsk_slab_name);
3115                prot->twsk_prot->twsk_slab = NULL;
3116        }
3117}
3118EXPORT_SYMBOL(proto_unregister);
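/*
 * proto_unregister() assumes every socket, request sock and timewait
 * sock of the protocol has already been freed: kmem_cache_destroy()
 * will warn and refuse to destroy a cache that still holds live
 * objects.  Callers are expected to unregister only after the protocol
 * has been fully torn down.
 */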
3119
3120#ifdef CONFIG_PROC_FS
3121static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3122        __acquires(proto_list_mutex)
3123{
3124        mutex_lock(&proto_list_mutex);
3125        return seq_list_start_head(&proto_list, *pos);
3126}
3127
3128static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3129{
3130        return seq_list_next(v, &proto_list, pos);
3131}
3132
3133static void proto_seq_stop(struct seq_file *seq, void *v)
3134        __releases(proto_list_mutex)
3135{
3136        mutex_unlock(&proto_list_mutex);
3137}
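/*
 * The iterator above holds proto_list_mutex across each ->start/->stop
 * cycle of a /proc/net/protocols read, so protocols cannot register or
 * unregister while the list is being walked.  The __acquires/__releases
 * annotations keep sparse aware that the lock is taken and released in
 * different functions.
 */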
3138
3139static char proto_method_implemented(const void *method)
3140{
3141        return method == NULL ? 'n' : 'y';
3142}

3143static long sock_prot_memory_allocated(struct proto *proto)
3144{
3145        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3146}
3147
3148static char *sock_prot_memory_pressure(struct proto *proto)
3149{
3150        return proto->memory_pressure != NULL ?
3151               proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3152}
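/*
 * The three helpers above normalise optional struct proto fields for
 * display: a method pointer becomes 'y' or 'n', a missing
 * ->memory_allocated counter is reported as -1, and ->memory_pressure
 * is shown as "yes"/"no", or "NI" (not implemented) when the protocol
 * does not track memory pressure at all.
 */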
3153
3154static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3155{
3156
3157        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3158                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3159                   proto->name,
3160                   proto->obj_size,
3161                   sock_prot_inuse_get(seq_file_net(seq), proto),
3162                   sock_prot_memory_allocated(proto),
3163                   sock_prot_memory_pressure(proto),
3164                   proto->max_header,
3165                   proto->slab == NULL ? "no" : "yes",
3166                   module_name(proto->owner),
3167                   proto_method_implemented(proto->close),
3168                   proto_method_implemented(proto->connect),
3169                   proto_method_implemented(proto->disconnect),
3170                   proto_method_implemented(proto->accept),
3171                   proto_method_implemented(proto->ioctl),
3172                   proto_method_implemented(proto->init),
3173                   proto_method_implemented(proto->destroy),
3174                   proto_method_implemented(proto->shutdown),
3175                   proto_method_implemented(proto->setsockopt),
3176                   proto_method_implemented(proto->getsockopt),
3177                   proto_method_implemented(proto->sendmsg),
3178                   proto_method_implemented(proto->recvmsg),
3179                   proto_method_implemented(proto->sendpage),
3180                   proto_method_implemented(proto->bind),
3181                   proto_method_implemented(proto->backlog_rcv),
3182                   proto_method_implemented(proto->hash),
3183                   proto_method_implemented(proto->unhash),
3184                   proto_method_implemented(proto->get_port),
3185                   proto_method_implemented(proto->enter_memory_pressure));
3186}
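/*
 * A /proc/net/protocols row produced above looks roughly like the
 * sketch below (all values hypothetical, shown only to map the columns
 * onto the fields printed):
 *
 *	TCP       1984   12      0   no      320   yes  kernel  y y y ... y
 *
 * i.e. protocol name, object size in bytes, sockets in use in this
 * network namespace, pages allocated (-1 if not tracked), memory
 * pressure state, maximum header size, whether a slab cache is used,
 * the owning module, and one y/n flag per struct proto method.
 */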
3187
3188static int proto_seq_show(struct seq_file *seq, void *v)
3189{
3190        if (v == &proto_list)
3191                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3192                           "protocol",
3193                           "size",
3194                           "sockets",
3195                           "memory",
3196                           "press",
3197                           "maxhdr",
3198                           "slab",
3199                           "module",
3200                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3201        else
3202                proto_seq_printf(seq, list_entry(v, struct proto, node));
3203        return 0;
3204}
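/*
 * The abbreviated method columns follow the order used by
 * proto_seq_printf(): cl=close, co=connect, di=disconnect, ac=accept,
 * io=ioctl, in=init, de=destroy, sh=shutdown, ss=setsockopt,
 * gs=getsockopt, se=sendmsg, re=recvmsg, sp=sendpage, bi=bind,
 * br=backlog_rcv, ha=hash, uh=unhash, gp=get_port,
 * em=enter_memory_pressure.
 */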
3205
3206static const struct seq_operations proto_seq_ops = {
3207        .start  = proto_seq_start,
3208        .next   = proto_seq_next,
3209        .stop   = proto_seq_stop,
3210        .show   = proto_seq_show,
3211};
3212
3213static int proto_seq_open(struct inode *inode, struct file *file)
3214{
3215        return seq_open_net(inode, file, &proto_seq_ops,
3216                            sizeof(struct seq_net_private));
3217}
3218
3219static const struct file_operations proto_seq_fops = {
3220        .owner          = THIS_MODULE,
3221        .open           = proto_seq_open,
3222        .read           = seq_read,
3223        .llseek         = seq_lseek,
3224        .release        = seq_release_net,
3225};
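/*
 * seq_open_net() allocates a struct seq_net_private along with the
 * seq_file state; that is how sock_prot_inuse_get(seq_file_net(seq), ...)
 * above resolves the network namespace that this /proc/net/protocols
 * instance belongs to.  seq_release_net() undoes it on close.
 */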
3226
3227static __net_init int proto_init_net(struct net *net)
3228{
3229        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3230                return -ENOMEM;
3231
3232        return 0;
3233}
3234
3235static __net_exit void proto_exit_net(struct net *net)
3236{
3237        remove_proc_entry("protocols", net->proc_net);
3238}
3239
3240
3241static __net_initdata struct pernet_operations proto_net_ops = {
3242        .init = proto_init_net,
3243        .exit = proto_exit_net,
3244};
3245
3246static int __init proto_init(void)
3247{
3248        return register_pernet_subsys(&proto_net_ops);
3249}
3250
3251subsys_initcall(proto_init);
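/*
 * proto_net_ops is registered as a pernet subsystem, so every network
 * namespace gets its own /proc/net/protocols entry: .init creates the
 * file when a namespace is set up and .exit removes it when the
 * namespace is torn down.  subsys_initcall() hooks this up early in
 * boot, normally before the protocols themselves register.
 */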
3252
3253#endif /* PROC_FS */
3254
3255#ifdef CONFIG_NET_RX_BUSY_POLL
3256bool sk_busy_loop_end(void *p, unsigned long start_time)
3257{
3258        struct sock *sk = p;
3259
3260        return !skb_queue_empty(&sk->sk_receive_queue) ||
3261               sk_busy_loop_timeout(sk, start_time);
3262}
3263EXPORT_SYMBOL(sk_busy_loop_end);
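/*
 * sk_busy_loop_end() is the loop_end callback used when busy polling on
 * behalf of a socket: polling stops as soon as packets are queued on
 * sk_receive_queue or the socket's busy-poll time budget runs out.  A
 * caller passes it to the NAPI busy-poll helper roughly as in the sketch
 * below (assuming the napi_busy_loop() loop_end-callback signature):
 *
 *	napi_busy_loop(READ_ONCE(sk->sk_napi_id), sk_busy_loop_end, sk);
 */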
3264#endif /* CONFIG_NET_RX_BUSY_POLL */
3265