linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/sched/mm.h>
 106#include <linux/timer.h>
 107#include <linux/string.h>
 108#include <linux/sockios.h>
 109#include <linux/net.h>
 110#include <linux/mm.h>
 111#include <linux/slab.h>
 112#include <linux/interrupt.h>
 113#include <linux/poll.h>
 114#include <linux/tcp.h>
 115#include <linux/init.h>
 116#include <linux/highmem.h>
 117#include <linux/user_namespace.h>
 118#include <linux/static_key.h>
 119#include <linux/memcontrol.h>
 120#include <linux/prefetch.h>
 121
 122#include <linux/uaccess.h>
 123
 124#include <linux/netdevice.h>
 125#include <net/protocol.h>
 126#include <linux/skbuff.h>
 127#include <net/net_namespace.h>
 128#include <net/request_sock.h>
 129#include <net/sock.h>
 130#include <linux/net_tstamp.h>
 131#include <net/xfrm.h>
 132#include <linux/ipsec.h>
 133#include <net/cls_cgroup.h>
 134#include <net/netprio_cgroup.h>
 135#include <linux/sock_diag.h>
 136
 137#include <linux/filter.h>
 138#include <net/sock_reuseport.h>
 139
 140#include <trace/events/sock.h>
 141
 142#include <net/tcp.h>
 143#include <net/busy_poll.h>
 144
 145static DEFINE_MUTEX(proto_list_mutex);
 146static LIST_HEAD(proto_list);
 147
 148static void sock_inuse_add(struct net *net, int val);
 149
 150/**
 151 * sk_ns_capable - General socket capability test
 152 * @sk: Socket to use a capability on or through
 153 * @user_ns: The user namespace of the capability to use
 154 * @cap: The capability to use
 155 *
  156 * Test to see if the opener of the socket had the capability @cap when
  157 * the socket was created and if the current process has the capability
  158 * @cap in the user namespace @user_ns.
 159 */
 160bool sk_ns_capable(const struct sock *sk,
 161                   struct user_namespace *user_ns, int cap)
 162{
 163        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                ns_capable(user_ns, cap);
 165}
 166EXPORT_SYMBOL(sk_ns_capable);
 167
 168/**
 169 * sk_capable - Socket global capability test
 170 * @sk: Socket to use a capability on or through
 171 * @cap: The global capability to use
 172 *
  173 * Test to see if the opener of the socket had the capability @cap when
  174 * the socket was created and if the current process has the capability
  175 * @cap in all user namespaces.
 176 */
 177bool sk_capable(const struct sock *sk, int cap)
 178{
 179        return sk_ns_capable(sk, &init_user_ns, cap);
 180}
 181EXPORT_SYMBOL(sk_capable);
 182
 183/**
 184 * sk_net_capable - Network namespace socket capability test
 185 * @sk: Socket to use a capability on or through
 186 * @cap: The capability to use
 187 *
  188 * Test to see if the opener of the socket had the capability @cap when the
  189 * socket was created and if the current process has the capability @cap over
  190 * the network namespace the socket is a member of.
 191 */
 192bool sk_net_capable(const struct sock *sk, int cap)
 193{
 194        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195}
 196EXPORT_SYMBOL(sk_net_capable);
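
/*
 * Illustrative sketch (editor's note, not part of the original source): a
 * protocol handler gating a privileged, per-netns operation on the socket's
 * opener might use the helper above like this; do_privileged_op() is a
 * hypothetical placeholder.
 *
 *	static int example_privileged_op(struct sock *sk)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return do_privileged_op(sk);
 *	}
 */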
 197
 198/*
 199 * Each address family might have different locking rules, so we have
 200 * one slock key per address family and separate keys for internal and
 201 * userspace sockets.
 202 */
 203static struct lock_class_key af_family_keys[AF_MAX];
 204static struct lock_class_key af_family_kern_keys[AF_MAX];
 205static struct lock_class_key af_family_slock_keys[AF_MAX];
 206static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 207
 208/*
 209 * Make lock validator output more readable. (we pre-construct these
  210 * strings at build time, so that runtime initialization of socket
 211 * locks is fast):
 212 */
 213
 214#define _sock_locks(x)                                            \
 215  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 216  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 217  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 218  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 219  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 220  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 221  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 222  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 223  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 224  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
  225  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "AF_IUCV"     , \
 226  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 227  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 228  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 229  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 230  x "AF_MAX"
 231
 232static const char *const af_family_key_strings[AF_MAX+1] = {
 233        _sock_locks("sk_lock-")
 234};
 235static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 236        _sock_locks("slock-")
 237};
 238static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 239        _sock_locks("clock-")
 240};
 241
 242static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 243        _sock_locks("k-sk_lock-")
 244};
 245static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 246        _sock_locks("k-slock-")
 247};
 248static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 249        _sock_locks("k-clock-")
 250};
 251static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 252  "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 253  "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 254  "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 255  "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 256  "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 257  "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 258  "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 259  "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 260  "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 261  "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 262  "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 263  "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 264  "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 265  "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 266  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_XDP"      ,
 267  "rlock-AF_MAX"
 268};
 269static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 270  "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 271  "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 272  "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 273  "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 274  "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 275  "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 276  "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 277  "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 278  "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 279  "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 280  "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 281  "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 282  "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 283  "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 284  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_XDP"      ,
 285  "wlock-AF_MAX"
 286};
 287static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 288  "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 289  "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 290  "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 291  "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 292  "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 293  "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 294  "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 295  "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 296  "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 297  "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 298  "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 299  "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 300  "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 301  "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 302  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_XDP"      ,
 303  "elock-AF_MAX"
 304};
 305
 306/*
 307 * sk_callback_lock and sk queues locking rules are per-address-family,
 308 * so split the lock classes by using a per-AF key:
 309 */
 310static struct lock_class_key af_callback_keys[AF_MAX];
 311static struct lock_class_key af_rlock_keys[AF_MAX];
 312static struct lock_class_key af_wlock_keys[AF_MAX];
 313static struct lock_class_key af_elock_keys[AF_MAX];
 314static struct lock_class_key af_kern_callback_keys[AF_MAX];
 315
 316/* Run time adjustable parameters. */
 317__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 318EXPORT_SYMBOL(sysctl_wmem_max);
 319__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 320EXPORT_SYMBOL(sysctl_rmem_max);
 321__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 322__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 323
 324/* Maximal space eaten by iovec or ancillary data plus some space */
 325int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 326EXPORT_SYMBOL(sysctl_optmem_max);
 327
 328int sysctl_tstamp_allow_data __read_mostly = 1;
 329
 330DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 331EXPORT_SYMBOL_GPL(memalloc_socks_key);
 332
 333/**
 334 * sk_set_memalloc - sets %SOCK_MEMALLOC
 335 * @sk: socket to set it on
 336 *
 337 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
  338 * It is the responsibility of the admin to adjust min_free_kbytes
  339 * to meet the requirements.
 340 */
 341void sk_set_memalloc(struct sock *sk)
 342{
 343        sock_set_flag(sk, SOCK_MEMALLOC);
 344        sk->sk_allocation |= __GFP_MEMALLOC;
 345        static_branch_inc(&memalloc_socks_key);
 346}
 347EXPORT_SYMBOL_GPL(sk_set_memalloc);
 348
 349void sk_clear_memalloc(struct sock *sk)
 350{
 351        sock_reset_flag(sk, SOCK_MEMALLOC);
 352        sk->sk_allocation &= ~__GFP_MEMALLOC;
 353        static_branch_dec(&memalloc_socks_key);
 354
 355        /*
 356         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 357         * progress of swapping. SOCK_MEMALLOC may be cleared while
 358         * it has rmem allocations due to the last swapfile being deactivated
 359         * but there is a risk that the socket is unusable due to exceeding
 360         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 361         */
 362        sk_mem_reclaim(sk);
 363}
 364EXPORT_SYMBOL_GPL(sk_clear_memalloc);
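
/*
 * Illustrative sketch (editor's note, not part of the original source): a
 * driver that backs swap with a network transport (the typical user of
 * SOCK_MEMALLOC) might flip the flag on its transport socket like this;
 * the struct socket pointer is assumed to belong to such a driver.
 *
 *	static void example_mark_swap_transport(struct socket *sock, bool on)
 *	{
 *		if (on)
 *			sk_set_memalloc(sock->sk);
 *		else
 *			sk_clear_memalloc(sock->sk);
 *	}
 */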
 365
 366int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 367{
 368        int ret;
 369        unsigned int noreclaim_flag;
 370
 371        /* these should have been dropped before queueing */
 372        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 373
 374        noreclaim_flag = memalloc_noreclaim_save();
 375        ret = sk->sk_backlog_rcv(sk, skb);
 376        memalloc_noreclaim_restore(noreclaim_flag);
 377
 378        return ret;
 379}
 380EXPORT_SYMBOL(__sk_backlog_rcv);
 381
 382static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 383{
 384        struct timeval tv;
 385
 386        if (optlen < sizeof(tv))
 387                return -EINVAL;
 388        if (copy_from_user(&tv, optval, sizeof(tv)))
 389                return -EFAULT;
 390        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 391                return -EDOM;
 392
 393        if (tv.tv_sec < 0) {
 394                static int warned __read_mostly;
 395
 396                *timeo_p = 0;
 397                if (warned < 10 && net_ratelimit()) {
 398                        warned++;
 399                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 400                                __func__, current->comm, task_pid_nr(current));
 401                }
 402                return 0;
 403        }
 404        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 405        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 406                return 0;
 407        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 408                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 409        return 0;
 410}
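
/*
 * Illustrative sketch (editor's note, not part of the original source): the
 * struct timeval parsed above is what userspace passes for SO_RCVTIMEO and
 * SO_SNDTIMEO, e.g. a 2.5 second receive timeout:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A negative tv_sec is treated as "do not wait" and a tv_usec outside
 * [0, USEC_PER_SEC) is rejected with -EDOM, as implemented above.
 */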
 411
 412static void sock_warn_obsolete_bsdism(const char *name)
 413{
 414        static int warned;
 415        static char warncomm[TASK_COMM_LEN];
 416        if (strcmp(warncomm, current->comm) && warned < 5) {
 417                strcpy(warncomm,  current->comm);
 418                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 419                        warncomm, name);
 420                warned++;
 421        }
 422}
 423
 424static bool sock_needs_netstamp(const struct sock *sk)
 425{
 426        switch (sk->sk_family) {
 427        case AF_UNSPEC:
 428        case AF_UNIX:
 429                return false;
 430        default:
 431                return true;
 432        }
 433}
 434
 435static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 436{
 437        if (sk->sk_flags & flags) {
 438                sk->sk_flags &= ~flags;
 439                if (sock_needs_netstamp(sk) &&
 440                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 441                        net_disable_timestamp();
 442        }
 443}
 444
 445
 446int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 447{
 448        unsigned long flags;
 449        struct sk_buff_head *list = &sk->sk_receive_queue;
 450
 451        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 452                atomic_inc(&sk->sk_drops);
 453                trace_sock_rcvqueue_full(sk, skb);
 454                return -ENOMEM;
 455        }
 456
 457        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 458                atomic_inc(&sk->sk_drops);
 459                return -ENOBUFS;
 460        }
 461
 462        skb->dev = NULL;
 463        skb_set_owner_r(skb, sk);
 464
  465        /* We escape from the RCU-protected region, so make sure we don't
  466         * leak a non-refcounted dst.
  467         */
 468        skb_dst_force(skb);
 469
 470        spin_lock_irqsave(&list->lock, flags);
 471        sock_skb_set_dropcount(sk, skb);
 472        __skb_queue_tail(list, skb);
 473        spin_unlock_irqrestore(&list->lock, flags);
 474
 475        if (!sock_flag(sk, SOCK_DEAD))
 476                sk->sk_data_ready(sk);
 477        return 0;
 478}
 479EXPORT_SYMBOL(__sock_queue_rcv_skb);
 480
 481int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 482{
 483        int err;
 484
 485        err = sk_filter(sk, skb);
 486        if (err)
 487                return err;
 488
 489        return __sock_queue_rcv_skb(sk, skb);
 490}
 491EXPORT_SYMBOL(sock_queue_rcv_skb);
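
/*
 * Illustrative sketch (editor's note, not part of the original source): a
 * datagram protocol's receive path typically hands a matched skb to the
 * owning socket like this, dropping it on error; example_lookup_sock() is a
 * hypothetical per-protocol lookup helper.
 *
 *	static int example_proto_rcv(struct sk_buff *skb)
 *	{
 *		struct sock *sk = example_lookup_sock(skb);
 *
 *		if (!sk || sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */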
 492
 493int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 494                     const int nested, unsigned int trim_cap, bool refcounted)
 495{
 496        int rc = NET_RX_SUCCESS;
 497
 498        if (sk_filter_trim_cap(sk, skb, trim_cap))
 499                goto discard_and_relse;
 500
 501        skb->dev = NULL;
 502
 503        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 504                atomic_inc(&sk->sk_drops);
 505                goto discard_and_relse;
 506        }
 507        if (nested)
 508                bh_lock_sock_nested(sk);
 509        else
 510                bh_lock_sock(sk);
 511        if (!sock_owned_by_user(sk)) {
 512                /*
 513                 * trylock + unlock semantics:
 514                 */
 515                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 516
 517                rc = sk_backlog_rcv(sk, skb);
 518
 519                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 520        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 521                bh_unlock_sock(sk);
 522                atomic_inc(&sk->sk_drops);
 523                goto discard_and_relse;
 524        }
 525
 526        bh_unlock_sock(sk);
 527out:
 528        if (refcounted)
 529                sock_put(sk);
 530        return rc;
 531discard_and_relse:
 532        kfree_skb(skb);
 533        goto out;
 534}
 535EXPORT_SYMBOL(__sk_receive_skb);
 536
 537struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 538{
 539        struct dst_entry *dst = __sk_dst_get(sk);
 540
 541        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 542                sk_tx_queue_clear(sk);
 543                sk->sk_dst_pending_confirm = 0;
 544                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 545                dst_release(dst);
 546                return NULL;
 547        }
 548
 549        return dst;
 550}
 551EXPORT_SYMBOL(__sk_dst_check);
 552
 553struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 554{
 555        struct dst_entry *dst = sk_dst_get(sk);
 556
 557        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 558                sk_dst_reset(sk);
 559                dst_release(dst);
 560                return NULL;
 561        }
 562
 563        return dst;
 564}
 565EXPORT_SYMBOL(sk_dst_check);
 566
 567static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 568                                int optlen)
 569{
 570        int ret = -ENOPROTOOPT;
 571#ifdef CONFIG_NETDEVICES
 572        struct net *net = sock_net(sk);
 573        char devname[IFNAMSIZ];
 574        int index;
 575
 576        /* Sorry... */
 577        ret = -EPERM;
 578        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 579                goto out;
 580
 581        ret = -EINVAL;
 582        if (optlen < 0)
 583                goto out;
 584
 585        /* Bind this socket to a particular device like "eth0",
 586         * as specified in the passed interface name. If the
 587         * name is "" or the option length is zero the socket
 588         * is not bound.
 589         */
 590        if (optlen > IFNAMSIZ - 1)
 591                optlen = IFNAMSIZ - 1;
 592        memset(devname, 0, sizeof(devname));
 593
 594        ret = -EFAULT;
 595        if (copy_from_user(devname, optval, optlen))
 596                goto out;
 597
 598        index = 0;
 599        if (devname[0] != '\0') {
 600                struct net_device *dev;
 601
 602                rcu_read_lock();
 603                dev = dev_get_by_name_rcu(net, devname);
 604                if (dev)
 605                        index = dev->ifindex;
 606                rcu_read_unlock();
 607                ret = -ENODEV;
 608                if (!dev)
 609                        goto out;
 610        }
 611
 612        lock_sock(sk);
 613        sk->sk_bound_dev_if = index;
 614        sk_dst_reset(sk);
 615        release_sock(sk);
 616
 617        ret = 0;
 618
 619out:
 620#endif
 621
 622        return ret;
 623}
 624
 625static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 626                                int __user *optlen, int len)
 627{
 628        int ret = -ENOPROTOOPT;
 629#ifdef CONFIG_NETDEVICES
 630        struct net *net = sock_net(sk);
 631        char devname[IFNAMSIZ];
 632
 633        if (sk->sk_bound_dev_if == 0) {
 634                len = 0;
 635                goto zero;
 636        }
 637
 638        ret = -EINVAL;
 639        if (len < IFNAMSIZ)
 640                goto out;
 641
 642        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 643        if (ret)
 644                goto out;
 645
 646        len = strlen(devname) + 1;
 647
 648        ret = -EFAULT;
 649        if (copy_to_user(optval, devname, len))
 650                goto out;
 651
 652zero:
 653        ret = -EFAULT;
 654        if (put_user(len, optlen))
 655                goto out;
 656
 657        ret = 0;
 658
 659out:
 660#endif
 661
 662        return ret;
 663}
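
/*
 * Illustrative sketch (editor's note, not part of the original source): from
 * userspace, SO_BINDTODEVICE takes the interface name handled above, and an
 * empty name (or zero length) unbinds the socket again:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 *
 * CAP_NET_RAW in the socket's network namespace is required, as checked in
 * sock_setbindtodevice() above.
 */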
 664
 665static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 666{
 667        if (valbool)
 668                sock_set_flag(sk, bit);
 669        else
 670                sock_reset_flag(sk, bit);
 671}
 672
 673bool sk_mc_loop(struct sock *sk)
 674{
 675        if (dev_recursion_level())
 676                return false;
 677        if (!sk)
 678                return true;
 679        switch (sk->sk_family) {
 680        case AF_INET:
 681                return inet_sk(sk)->mc_loop;
 682#if IS_ENABLED(CONFIG_IPV6)
 683        case AF_INET6:
 684                return inet6_sk(sk)->mc_loop;
 685#endif
 686        }
 687        WARN_ON(1);
 688        return true;
 689}
 690EXPORT_SYMBOL(sk_mc_loop);
 691
 692/*
 693 *      This is meant for all protocols to use and covers goings on
 694 *      at the socket level. Everything here is generic.
 695 */
 696
 697int sock_setsockopt(struct socket *sock, int level, int optname,
 698                    char __user *optval, unsigned int optlen)
 699{
 700        struct sock *sk = sock->sk;
 701        int val;
 702        int valbool;
 703        struct linger ling;
 704        int ret = 0;
 705
 706        /*
 707         *      Options without arguments
 708         */
 709
 710        if (optname == SO_BINDTODEVICE)
 711                return sock_setbindtodevice(sk, optval, optlen);
 712
 713        if (optlen < sizeof(int))
 714                return -EINVAL;
 715
 716        if (get_user(val, (int __user *)optval))
 717                return -EFAULT;
 718
 719        valbool = val ? 1 : 0;
 720
 721        lock_sock(sk);
 722
 723        switch (optname) {
 724        case SO_DEBUG:
 725                if (val && !capable(CAP_NET_ADMIN))
 726                        ret = -EACCES;
 727                else
 728                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 729                break;
 730        case SO_REUSEADDR:
 731                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 732                break;
 733        case SO_REUSEPORT:
 734                sk->sk_reuseport = valbool;
 735                break;
 736        case SO_TYPE:
 737        case SO_PROTOCOL:
 738        case SO_DOMAIN:
 739        case SO_ERROR:
 740                ret = -ENOPROTOOPT;
 741                break;
 742        case SO_DONTROUTE:
 743                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 744                break;
 745        case SO_BROADCAST:
 746                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 747                break;
 748        case SO_SNDBUF:
  749                /* Don't error on this; BSD doesn't, and if you think
  750                 * about it, this is right. Otherwise apps have to
  751                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  752                 * are treated in BSD as hints.
  753                 */
 754                val = min_t(u32, val, sysctl_wmem_max);
 755set_sndbuf:
 756                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 757                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 758                /* Wake up sending tasks if we upped the value. */
 759                sk->sk_write_space(sk);
 760                break;
 761
 762        case SO_SNDBUFFORCE:
 763                if (!capable(CAP_NET_ADMIN)) {
 764                        ret = -EPERM;
 765                        break;
 766                }
 767                goto set_sndbuf;
 768
 769        case SO_RCVBUF:
  770                /* Don't error on this; BSD doesn't, and if you think
  771                 * about it, this is right. Otherwise apps have to
  772                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  773                 * are treated in BSD as hints.
  774                 */
 775                val = min_t(u32, val, sysctl_rmem_max);
 776set_rcvbuf:
 777                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 778                /*
 779                 * We double it on the way in to account for
 780                 * "struct sk_buff" etc. overhead.   Applications
 781                 * assume that the SO_RCVBUF setting they make will
 782                 * allow that much actual data to be received on that
 783                 * socket.
 784                 *
 785                 * Applications are unaware that "struct sk_buff" and
 786                 * other overheads allocate from the receive buffer
 787                 * during socket buffer allocation.
 788                 *
 789                 * And after considering the possible alternatives,
 790                 * returning the value we actually used in getsockopt
 791                 * is the most desirable behavior.
 792                 */
 793                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 794                break;
 795
 796        case SO_RCVBUFFORCE:
 797                if (!capable(CAP_NET_ADMIN)) {
 798                        ret = -EPERM;
 799                        break;
 800                }
 801                goto set_rcvbuf;
 802
 803        case SO_KEEPALIVE:
 804                if (sk->sk_prot->keepalive)
 805                        sk->sk_prot->keepalive(sk, valbool);
 806                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 807                break;
 808
 809        case SO_OOBINLINE:
 810                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 811                break;
 812
 813        case SO_NO_CHECK:
 814                sk->sk_no_check_tx = valbool;
 815                break;
 816
 817        case SO_PRIORITY:
 818                if ((val >= 0 && val <= 6) ||
 819                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 820                        sk->sk_priority = val;
 821                else
 822                        ret = -EPERM;
 823                break;
 824
 825        case SO_LINGER:
 826                if (optlen < sizeof(ling)) {
 827                        ret = -EINVAL;  /* 1003.1g */
 828                        break;
 829                }
 830                if (copy_from_user(&ling, optval, sizeof(ling))) {
 831                        ret = -EFAULT;
 832                        break;
 833                }
 834                if (!ling.l_onoff)
 835                        sock_reset_flag(sk, SOCK_LINGER);
 836                else {
 837#if (BITS_PER_LONG == 32)
 838                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 839                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 840                        else
 841#endif
 842                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 843                        sock_set_flag(sk, SOCK_LINGER);
 844                }
 845                break;
 846
 847        case SO_BSDCOMPAT:
 848                sock_warn_obsolete_bsdism("setsockopt");
 849                break;
 850
 851        case SO_PASSCRED:
 852                if (valbool)
 853                        set_bit(SOCK_PASSCRED, &sock->flags);
 854                else
 855                        clear_bit(SOCK_PASSCRED, &sock->flags);
 856                break;
 857
 858        case SO_TIMESTAMP:
 859        case SO_TIMESTAMPNS:
 860                if (valbool)  {
 861                        if (optname == SO_TIMESTAMP)
 862                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 863                        else
 864                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 865                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 866                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 867                } else {
 868                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 869                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 870                }
 871                break;
 872
 873        case SO_TIMESTAMPING:
 874                if (val & ~SOF_TIMESTAMPING_MASK) {
 875                        ret = -EINVAL;
 876                        break;
 877                }
 878
 879                if (val & SOF_TIMESTAMPING_OPT_ID &&
 880                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 881                        if (sk->sk_protocol == IPPROTO_TCP &&
 882                            sk->sk_type == SOCK_STREAM) {
 883                                if ((1 << sk->sk_state) &
 884                                    (TCPF_CLOSE | TCPF_LISTEN)) {
 885                                        ret = -EINVAL;
 886                                        break;
 887                                }
 888                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 889                        } else {
 890                                sk->sk_tskey = 0;
 891                        }
 892                }
 893
 894                if (val & SOF_TIMESTAMPING_OPT_STATS &&
 895                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 896                        ret = -EINVAL;
 897                        break;
 898                }
 899
 900                sk->sk_tsflags = val;
 901                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 902                        sock_enable_timestamp(sk,
 903                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 904                else
 905                        sock_disable_timestamp(sk,
 906                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 907                break;
 908
 909        case SO_RCVLOWAT:
 910                if (val < 0)
 911                        val = INT_MAX;
 912                if (sock->ops->set_rcvlowat)
 913                        ret = sock->ops->set_rcvlowat(sk, val);
 914                else
 915                        sk->sk_rcvlowat = val ? : 1;
 916                break;
 917
 918        case SO_RCVTIMEO:
 919                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 920                break;
 921
 922        case SO_SNDTIMEO:
 923                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 924                break;
 925
 926        case SO_ATTACH_FILTER:
 927                ret = -EINVAL;
 928                if (optlen == sizeof(struct sock_fprog)) {
 929                        struct sock_fprog fprog;
 930
 931                        ret = -EFAULT;
 932                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 933                                break;
 934
 935                        ret = sk_attach_filter(&fprog, sk);
 936                }
 937                break;
 938
 939        case SO_ATTACH_BPF:
 940                ret = -EINVAL;
 941                if (optlen == sizeof(u32)) {
 942                        u32 ufd;
 943
 944                        ret = -EFAULT;
 945                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 946                                break;
 947
 948                        ret = sk_attach_bpf(ufd, sk);
 949                }
 950                break;
 951
 952        case SO_ATTACH_REUSEPORT_CBPF:
 953                ret = -EINVAL;
 954                if (optlen == sizeof(struct sock_fprog)) {
 955                        struct sock_fprog fprog;
 956
 957                        ret = -EFAULT;
 958                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 959                                break;
 960
 961                        ret = sk_reuseport_attach_filter(&fprog, sk);
 962                }
 963                break;
 964
 965        case SO_ATTACH_REUSEPORT_EBPF:
 966                ret = -EINVAL;
 967                if (optlen == sizeof(u32)) {
 968                        u32 ufd;
 969
 970                        ret = -EFAULT;
 971                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 972                                break;
 973
 974                        ret = sk_reuseport_attach_bpf(ufd, sk);
 975                }
 976                break;
 977
 978        case SO_DETACH_FILTER:
 979                ret = sk_detach_filter(sk);
 980                break;
 981
 982        case SO_LOCK_FILTER:
 983                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 984                        ret = -EPERM;
 985                else
 986                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 987                break;
 988
 989        case SO_PASSSEC:
 990                if (valbool)
 991                        set_bit(SOCK_PASSSEC, &sock->flags);
 992                else
 993                        clear_bit(SOCK_PASSSEC, &sock->flags);
 994                break;
 995        case SO_MARK:
 996                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 997                        ret = -EPERM;
 998                else
 999                        sk->sk_mark = val;
1000                break;
1001
1002        case SO_RXQ_OVFL:
1003                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1004                break;
1005
1006        case SO_WIFI_STATUS:
1007                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1008                break;
1009
1010        case SO_PEEK_OFF:
1011                if (sock->ops->set_peek_off)
1012                        ret = sock->ops->set_peek_off(sk, val);
1013                else
1014                        ret = -EOPNOTSUPP;
1015                break;
1016
1017        case SO_NOFCS:
1018                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1019                break;
1020
1021        case SO_SELECT_ERR_QUEUE:
1022                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1023                break;
1024
1025#ifdef CONFIG_NET_RX_BUSY_POLL
1026        case SO_BUSY_POLL:
1027                /* allow unprivileged users to decrease the value */
1028                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1029                        ret = -EPERM;
1030                else {
1031                        if (val < 0)
1032                                ret = -EINVAL;
1033                        else
1034                                sk->sk_ll_usec = val;
1035                }
1036                break;
1037#endif
1038
1039        case SO_MAX_PACING_RATE:
1040                if (val != ~0U)
1041                        cmpxchg(&sk->sk_pacing_status,
1042                                SK_PACING_NONE,
1043                                SK_PACING_NEEDED);
1044                sk->sk_max_pacing_rate = val;
1045                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1046                                         sk->sk_max_pacing_rate);
1047                break;
1048
1049        case SO_INCOMING_CPU:
1050                sk->sk_incoming_cpu = val;
1051                break;
1052
1053        case SO_CNX_ADVICE:
1054                if (val == 1)
1055                        dst_negative_advice(sk);
1056                break;
1057
1058        case SO_ZEROCOPY:
1059                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1060                        if (sk->sk_protocol != IPPROTO_TCP)
1061                                ret = -ENOTSUPP;
1062                } else if (sk->sk_family != PF_RDS) {
1063                        ret = -ENOTSUPP;
1064                }
1065                if (!ret) {
1066                        if (val < 0 || val > 1)
1067                                ret = -EINVAL;
1068                        else
1069                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1070                }
1071                break;
1072
1073        default:
1074                ret = -ENOPROTOOPT;
1075                break;
1076        }
1077        release_sock(sk);
1078        return ret;
1079}
1080EXPORT_SYMBOL(sock_setsockopt);
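
/*
 * Illustrative sketch (editor's note, not part of the original source): the
 * doubling described in the SO_SNDBUF/SO_RCVBUF comments above is visible
 * from userspace; reading the option back returns the value the kernel
 * actually uses (roughly twice the requested size, clamped by
 * sysctl_rmem_max and SOCK_MIN_RCVBUF):
 *
 *	int val = 64 * 1024;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * after which val reads back as roughly 128 * 1024.
 */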
1081
1082
1083static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1084                          struct ucred *ucred)
1085{
1086        ucred->pid = pid_vnr(pid);
1087        ucred->uid = ucred->gid = -1;
1088        if (cred) {
1089                struct user_namespace *current_ns = current_user_ns();
1090
1091                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1092                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1093        }
1094}
1095
1096static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1097{
1098        struct user_namespace *user_ns = current_user_ns();
1099        int i;
1100
1101        for (i = 0; i < src->ngroups; i++)
1102                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1103                        return -EFAULT;
1104
1105        return 0;
1106}
1107
1108int sock_getsockopt(struct socket *sock, int level, int optname,
1109                    char __user *optval, int __user *optlen)
1110{
1111        struct sock *sk = sock->sk;
1112
1113        union {
1114                int val;
1115                u64 val64;
1116                struct linger ling;
1117                struct timeval tm;
1118        } v;
1119
1120        int lv = sizeof(int);
1121        int len;
1122
1123        if (get_user(len, optlen))
1124                return -EFAULT;
1125        if (len < 0)
1126                return -EINVAL;
1127
1128        memset(&v, 0, sizeof(v));
1129
1130        switch (optname) {
1131        case SO_DEBUG:
1132                v.val = sock_flag(sk, SOCK_DBG);
1133                break;
1134
1135        case SO_DONTROUTE:
1136                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1137                break;
1138
1139        case SO_BROADCAST:
1140                v.val = sock_flag(sk, SOCK_BROADCAST);
1141                break;
1142
1143        case SO_SNDBUF:
1144                v.val = sk->sk_sndbuf;
1145                break;
1146
1147        case SO_RCVBUF:
1148                v.val = sk->sk_rcvbuf;
1149                break;
1150
1151        case SO_REUSEADDR:
1152                v.val = sk->sk_reuse;
1153                break;
1154
1155        case SO_REUSEPORT:
1156                v.val = sk->sk_reuseport;
1157                break;
1158
1159        case SO_KEEPALIVE:
1160                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1161                break;
1162
1163        case SO_TYPE:
1164                v.val = sk->sk_type;
1165                break;
1166
1167        case SO_PROTOCOL:
1168                v.val = sk->sk_protocol;
1169                break;
1170
1171        case SO_DOMAIN:
1172                v.val = sk->sk_family;
1173                break;
1174
1175        case SO_ERROR:
1176                v.val = -sock_error(sk);
1177                if (v.val == 0)
1178                        v.val = xchg(&sk->sk_err_soft, 0);
1179                break;
1180
1181        case SO_OOBINLINE:
1182                v.val = sock_flag(sk, SOCK_URGINLINE);
1183                break;
1184
1185        case SO_NO_CHECK:
1186                v.val = sk->sk_no_check_tx;
1187                break;
1188
1189        case SO_PRIORITY:
1190                v.val = sk->sk_priority;
1191                break;
1192
1193        case SO_LINGER:
1194                lv              = sizeof(v.ling);
1195                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1196                v.ling.l_linger = sk->sk_lingertime / HZ;
1197                break;
1198
1199        case SO_BSDCOMPAT:
1200                sock_warn_obsolete_bsdism("getsockopt");
1201                break;
1202
1203        case SO_TIMESTAMP:
1204                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1205                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1206                break;
1207
1208        case SO_TIMESTAMPNS:
1209                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1210                break;
1211
1212        case SO_TIMESTAMPING:
1213                v.val = sk->sk_tsflags;
1214                break;
1215
1216        case SO_RCVTIMEO:
1217                lv = sizeof(struct timeval);
1218                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1219                        v.tm.tv_sec = 0;
1220                        v.tm.tv_usec = 0;
1221                } else {
1222                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1223                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1224                }
1225                break;
1226
1227        case SO_SNDTIMEO:
1228                lv = sizeof(struct timeval);
1229                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1230                        v.tm.tv_sec = 0;
1231                        v.tm.tv_usec = 0;
1232                } else {
1233                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1234                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1235                }
1236                break;
1237
1238        case SO_RCVLOWAT:
1239                v.val = sk->sk_rcvlowat;
1240                break;
1241
1242        case SO_SNDLOWAT:
1243                v.val = 1;
1244                break;
1245
1246        case SO_PASSCRED:
1247                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1248                break;
1249
1250        case SO_PEERCRED:
1251        {
1252                struct ucred peercred;
1253                if (len > sizeof(peercred))
1254                        len = sizeof(peercred);
1255                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1256                if (copy_to_user(optval, &peercred, len))
1257                        return -EFAULT;
1258                goto lenout;
1259        }
1260
1261        case SO_PEERGROUPS:
1262        {
1263                int ret, n;
1264
1265                if (!sk->sk_peer_cred)
1266                        return -ENODATA;
1267
1268                n = sk->sk_peer_cred->group_info->ngroups;
1269                if (len < n * sizeof(gid_t)) {
1270                        len = n * sizeof(gid_t);
1271                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1272                }
1273                len = n * sizeof(gid_t);
1274
1275                ret = groups_to_user((gid_t __user *)optval,
1276                                     sk->sk_peer_cred->group_info);
1277                if (ret)
1278                        return ret;
1279                goto lenout;
1280        }
1281
1282        case SO_PEERNAME:
1283        {
1284                char address[128];
1285
1286                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1287                if (lv < 0)
1288                        return -ENOTCONN;
1289                if (lv < len)
1290                        return -EINVAL;
1291                if (copy_to_user(optval, address, len))
1292                        return -EFAULT;
1293                goto lenout;
1294        }
1295
1296        /* Dubious BSD thing... Probably nobody even uses it, but
1297         * the UNIX standard wants it for whatever reason... -DaveM
1298         */
1299        case SO_ACCEPTCONN:
1300                v.val = sk->sk_state == TCP_LISTEN;
1301                break;
1302
1303        case SO_PASSSEC:
1304                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1305                break;
1306
1307        case SO_PEERSEC:
1308                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1309
1310        case SO_MARK:
1311                v.val = sk->sk_mark;
1312                break;
1313
1314        case SO_RXQ_OVFL:
1315                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1316                break;
1317
1318        case SO_WIFI_STATUS:
1319                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1320                break;
1321
1322        case SO_PEEK_OFF:
1323                if (!sock->ops->set_peek_off)
1324                        return -EOPNOTSUPP;
1325
1326                v.val = sk->sk_peek_off;
1327                break;
1328        case SO_NOFCS:
1329                v.val = sock_flag(sk, SOCK_NOFCS);
1330                break;
1331
1332        case SO_BINDTODEVICE:
1333                return sock_getbindtodevice(sk, optval, optlen, len);
1334
1335        case SO_GET_FILTER:
1336                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1337                if (len < 0)
1338                        return len;
1339
1340                goto lenout;
1341
1342        case SO_LOCK_FILTER:
1343                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1344                break;
1345
1346        case SO_BPF_EXTENSIONS:
1347                v.val = bpf_tell_extensions();
1348                break;
1349
1350        case SO_SELECT_ERR_QUEUE:
1351                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1352                break;
1353
1354#ifdef CONFIG_NET_RX_BUSY_POLL
1355        case SO_BUSY_POLL:
1356                v.val = sk->sk_ll_usec;
1357                break;
1358#endif
1359
1360        case SO_MAX_PACING_RATE:
1361                v.val = sk->sk_max_pacing_rate;
1362                break;
1363
1364        case SO_INCOMING_CPU:
1365                v.val = sk->sk_incoming_cpu;
1366                break;
1367
1368        case SO_MEMINFO:
1369        {
1370                u32 meminfo[SK_MEMINFO_VARS];
1371
1372                if (get_user(len, optlen))
1373                        return -EFAULT;
1374
1375                sk_get_meminfo(sk, meminfo);
1376
1377                len = min_t(unsigned int, len, sizeof(meminfo));
1378                if (copy_to_user(optval, &meminfo, len))
1379                        return -EFAULT;
1380
1381                goto lenout;
1382        }
1383
1384#ifdef CONFIG_NET_RX_BUSY_POLL
1385        case SO_INCOMING_NAPI_ID:
1386                v.val = READ_ONCE(sk->sk_napi_id);
1387
1388                /* aggregate non-NAPI IDs down to 0 */
1389                if (v.val < MIN_NAPI_ID)
1390                        v.val = 0;
1391
1392                break;
1393#endif
1394
1395        case SO_COOKIE:
1396                lv = sizeof(u64);
1397                if (len < lv)
1398                        return -EINVAL;
1399                v.val64 = sock_gen_cookie(sk);
1400                break;
1401
1402        case SO_ZEROCOPY:
1403                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1404                break;
1405
1406        default:
1407                /* We implement the SO_SNDLOWAT etc to not be settable
1408                 * (1003.1g 7).
1409                 */
1410                return -ENOPROTOOPT;
1411        }
1412
1413        if (len > lv)
1414                len = lv;
1415        if (copy_to_user(optval, &v, len))
1416                return -EFAULT;
1417lenout:
1418        if (put_user(len, optlen))
1419                return -EFAULT;
1420        return 0;
1421}
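
/*
 * Illustrative sketch (editor's note, not part of the original source): the
 * SO_PEERCRED branch above fills in a struct ucred; a typical caller on a
 * connected AF_UNIX socket looks like this:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("peer pid %d uid %u\n", peer.pid, peer.uid);
 */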
1422
1423/*
1424 * Initialize an sk_lock.
1425 *
1426 * (We also register the sk_lock with the lock validator.)
1427 */
1428static inline void sock_lock_init(struct sock *sk)
1429{
1430        if (sk->sk_kern_sock)
1431                sock_lock_init_class_and_name(
1432                        sk,
1433                        af_family_kern_slock_key_strings[sk->sk_family],
1434                        af_family_kern_slock_keys + sk->sk_family,
1435                        af_family_kern_key_strings[sk->sk_family],
1436                        af_family_kern_keys + sk->sk_family);
1437        else
1438                sock_lock_init_class_and_name(
1439                        sk,
1440                        af_family_slock_key_strings[sk->sk_family],
1441                        af_family_slock_keys + sk->sk_family,
1442                        af_family_key_strings[sk->sk_family],
1443                        af_family_keys + sk->sk_family);
1444}
1445
1446/*
1447 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1448 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 1449 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1450 */
1451static void sock_copy(struct sock *nsk, const struct sock *osk)
1452{
1453#ifdef CONFIG_SECURITY_NETWORK
1454        void *sptr = nsk->sk_security;
1455#endif
1456        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1457
1458        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1459               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1460
1461#ifdef CONFIG_SECURITY_NETWORK
1462        nsk->sk_security = sptr;
1463        security_sk_clone(osk, nsk);
1464#endif
1465}
1466
1467static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1468                int family)
1469{
1470        struct sock *sk;
1471        struct kmem_cache *slab;
1472
1473        slab = prot->slab;
1474        if (slab != NULL) {
1475                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1476                if (!sk)
1477                        return sk;
1478                if (priority & __GFP_ZERO)
1479                        sk_prot_clear_nulls(sk, prot->obj_size);
1480        } else
1481                sk = kmalloc(prot->obj_size, priority);
1482
1483        if (sk != NULL) {
1484                if (security_sk_alloc(sk, family, priority))
1485                        goto out_free;
1486
1487                if (!try_module_get(prot->owner))
1488                        goto out_free_sec;
1489                sk_tx_queue_clear(sk);
1490        }
1491
1492        return sk;
1493
1494out_free_sec:
1495        security_sk_free(sk);
1496out_free:
1497        if (slab != NULL)
1498                kmem_cache_free(slab, sk);
1499        else
1500                kfree(sk);
1501        return NULL;
1502}
1503
1504static void sk_prot_free(struct proto *prot, struct sock *sk)
1505{
1506        struct kmem_cache *slab;
1507        struct module *owner;
1508
1509        owner = prot->owner;
1510        slab = prot->slab;
1511
1512        cgroup_sk_free(&sk->sk_cgrp_data);
1513        mem_cgroup_sk_free(sk);
1514        security_sk_free(sk);
1515        if (slab != NULL)
1516                kmem_cache_free(slab, sk);
1517        else
1518                kfree(sk);
1519        module_put(owner);
1520}
1521
1522/**
1523 *      sk_alloc - All socket objects are allocated here
1524 *      @net: the applicable net namespace
1525 *      @family: protocol family
1526 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1527 *      @prot: struct proto associated with this new sock instance
1528 *      @kern: is this to be a kernel socket?
1529 */
1530struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1531                      struct proto *prot, int kern)
1532{
1533        struct sock *sk;
1534
1535        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1536        if (sk) {
1537                sk->sk_family = family;
1538                /*
1539                 * See comment in struct sock definition to understand
1540                 * why we need sk_prot_creator -acme
1541                 */
1542                sk->sk_prot = sk->sk_prot_creator = prot;
1543                sk->sk_kern_sock = kern;
1544                sock_lock_init(sk);
1545                sk->sk_net_refcnt = kern ? 0 : 1;
1546                if (likely(sk->sk_net_refcnt)) {
1547                        get_net(net);
1548                        sock_inuse_add(net, 1);
1549                }
1550
1551                sock_net_set(sk, net);
1552                refcount_set(&sk->sk_wmem_alloc, 1);
1553
1554                mem_cgroup_sk_alloc(sk);
1555                cgroup_sk_alloc(&sk->sk_cgrp_data);
1556                sock_update_classid(&sk->sk_cgrp_data);
1557                sock_update_netprioidx(&sk->sk_cgrp_data);
1558        }
1559
1560        return sk;
1561}
1562EXPORT_SYMBOL(sk_alloc);
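
/*
 * Illustrative sketch (not part of this file): a protocol family's
 * ->create handler typically pairs sk_alloc() with sock_init_data().
 * example_create and example_proto are hypothetical names, and PF_INET
 * is used purely as a placeholder family.
 *
 *      static int example_create(struct net *net, struct socket *sock,
 *                                int protocol, int kern)
 *      {
 *              struct sock *sk;
 *
 *              sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto, kern);
 *              if (!sk)
 *                      return -ENOMEM;
 *
 *              sock_init_data(sock, sk);
 *              sk->sk_protocol = protocol;
 *              return 0;
 *      }
 */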
1563
1564/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1565 * grace period. This is the case for UDP sockets and TCP listeners.
1566 */
1567static void __sk_destruct(struct rcu_head *head)
1568{
1569        struct sock *sk = container_of(head, struct sock, sk_rcu);
1570        struct sk_filter *filter;
1571
1572        if (sk->sk_destruct)
1573                sk->sk_destruct(sk);
1574
1575        filter = rcu_dereference_check(sk->sk_filter,
1576                                       refcount_read(&sk->sk_wmem_alloc) == 0);
1577        if (filter) {
1578                sk_filter_uncharge(sk, filter);
1579                RCU_INIT_POINTER(sk->sk_filter, NULL);
1580        }
1581        if (rcu_access_pointer(sk->sk_reuseport_cb))
1582                reuseport_detach_sock(sk);
1583
1584        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1585
1586        if (atomic_read(&sk->sk_omem_alloc))
1587                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1588                         __func__, atomic_read(&sk->sk_omem_alloc));
1589
1590        if (sk->sk_frag.page) {
1591                put_page(sk->sk_frag.page);
1592                sk->sk_frag.page = NULL;
1593        }
1594
1595        if (sk->sk_peer_cred)
1596                put_cred(sk->sk_peer_cred);
1597        put_pid(sk->sk_peer_pid);
1598        if (likely(sk->sk_net_refcnt))
1599                put_net(sock_net(sk));
1600        sk_prot_free(sk->sk_prot_creator, sk);
1601}
1602
1603void sk_destruct(struct sock *sk)
1604{
1605        if (sock_flag(sk, SOCK_RCU_FREE))
1606                call_rcu(&sk->sk_rcu, __sk_destruct);
1607        else
1608                __sk_destruct(&sk->sk_rcu);
1609}
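
/*
 * Illustrative sketch (not part of this file): protocols that look up
 * sockets under RCU opt in to the deferred destruction above by setting
 * the flag before the socket becomes reachable from a lookup table:
 *
 *      sock_set_flag(sk, SOCK_RCU_FREE);
 */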
1610
1611static void __sk_free(struct sock *sk)
1612{
1613        if (likely(sk->sk_net_refcnt))
1614                sock_inuse_add(sock_net(sk), -1);
1615
1616        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1617                sock_diag_broadcast_destroy(sk);
1618        else
1619                sk_destruct(sk);
1620}
1621
1622void sk_free(struct sock *sk)
1623{
1624        /*
1625         * We subtract one from sk_wmem_alloc to learn whether some packets
1626         * are still sitting in a tx queue.
1627         * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1628         */
1629        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1630                __sk_free(sk);
1631}
1632EXPORT_SYMBOL(sk_free);
1633
1634static void sk_init_common(struct sock *sk)
1635{
1636        skb_queue_head_init(&sk->sk_receive_queue);
1637        skb_queue_head_init(&sk->sk_write_queue);
1638        skb_queue_head_init(&sk->sk_error_queue);
1639
1640        rwlock_init(&sk->sk_callback_lock);
1641        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1642                        af_rlock_keys + sk->sk_family,
1643                        af_family_rlock_key_strings[sk->sk_family]);
1644        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1645                        af_wlock_keys + sk->sk_family,
1646                        af_family_wlock_key_strings[sk->sk_family]);
1647        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1648                        af_elock_keys + sk->sk_family,
1649                        af_family_elock_key_strings[sk->sk_family]);
1650        lockdep_set_class_and_name(&sk->sk_callback_lock,
1651                        af_callback_keys + sk->sk_family,
1652                        af_family_clock_key_strings[sk->sk_family]);
1653}
1654
1655/**
1656 *      sk_clone_lock - clone a socket, and lock its clone
1657 *      @sk: the socket to clone
1658 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1659 *
1660 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1661 */
1662struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1663{
1664        struct sock *newsk;
1665        bool is_charged = true;
1666
1667        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1668        if (newsk != NULL) {
1669                struct sk_filter *filter;
1670
1671                sock_copy(newsk, sk);
1672
1673                newsk->sk_prot_creator = sk->sk_prot;
1674
1675                /* SANITY */
1676                if (likely(newsk->sk_net_refcnt))
1677                        get_net(sock_net(newsk));
1678                sk_node_init(&newsk->sk_node);
1679                sock_lock_init(newsk);
1680                bh_lock_sock(newsk);
1681                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1682                newsk->sk_backlog.len = 0;
1683
1684                atomic_set(&newsk->sk_rmem_alloc, 0);
1685                /*
1686                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1687                 */
1688                refcount_set(&newsk->sk_wmem_alloc, 1);
1689                atomic_set(&newsk->sk_omem_alloc, 0);
1690                sk_init_common(newsk);
1691
1692                newsk->sk_dst_cache     = NULL;
1693                newsk->sk_dst_pending_confirm = 0;
1694                newsk->sk_wmem_queued   = 0;
1695                newsk->sk_forward_alloc = 0;
1696                atomic_set(&newsk->sk_drops, 0);
1697                newsk->sk_send_head     = NULL;
1698                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1699                atomic_set(&newsk->sk_zckey, 0);
1700
1701                sock_reset_flag(newsk, SOCK_DONE);
1702                mem_cgroup_sk_alloc(newsk);
1703                cgroup_sk_alloc(&newsk->sk_cgrp_data);
1704
1705                rcu_read_lock();
1706                filter = rcu_dereference(sk->sk_filter);
1707                if (filter != NULL)
1708                        /* though it's an empty new sock, the charging may fail
1709                         * if sysctl_optmem_max was changed between the creation
1710                         * of the original socket and the cloning
1711                         */
1712                        is_charged = sk_filter_charge(newsk, filter);
1713                RCU_INIT_POINTER(newsk->sk_filter, filter);
1714                rcu_read_unlock();
1715
1716                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1717                        /* We need to make sure that we don't uncharge the new
1718                         * socket if we couldn't charge it in the first place
1719                         * as otherwise we uncharge the parent's filter.
1720                         */
1721                        if (!is_charged)
1722                                RCU_INIT_POINTER(newsk->sk_filter, NULL);
1723                        sk_free_unlock_clone(newsk);
1724                        newsk = NULL;
1725                        goto out;
1726                }
1727                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1728
1729                newsk->sk_err      = 0;
1730                newsk->sk_err_soft = 0;
1731                newsk->sk_priority = 0;
1732                newsk->sk_incoming_cpu = raw_smp_processor_id();
1733                atomic64_set(&newsk->sk_cookie, 0);
1734                if (likely(newsk->sk_net_refcnt))
1735                        sock_inuse_add(sock_net(newsk), 1);
1736
1737                /*
1738                 * Before updating sk_refcnt, we must commit prior changes to memory
1739                 * (Documentation/RCU/rculist_nulls.txt for details)
1740                 */
1741                smp_wmb();
1742                refcount_set(&newsk->sk_refcnt, 2);
1743
1744                /*
1745                 * Increment the counter in the same struct proto as the master
1746                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1747                 * is the same as sk->sk_prot->socks, as this field was copied
1748                 * with memcpy).
1749                 *
1750                 * This _changes_ the previous behaviour, where
1751                 * tcp_create_openreq_child always was incrementing the
1752                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1753                 * to be taken into account in all callers. -acme
1754                 */
1755                sk_refcnt_debug_inc(newsk);
1756                sk_set_socket(newsk, NULL);
1757                newsk->sk_wq = NULL;
1758
1759                if (newsk->sk_prot->sockets_allocated)
1760                        sk_sockets_allocated_inc(newsk);
1761
1762                if (sock_needs_netstamp(sk) &&
1763                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1764                        net_enable_timestamp();
1765        }
1766out:
1767        return newsk;
1768}
1769EXPORT_SYMBOL_GPL(sk_clone_lock);
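
/*
 * Illustrative sketch (not part of this file): a caller of sk_clone_lock()
 * must drop the bh lock on every path once it is done initializing the
 * clone; example_clone is a hypothetical helper.
 *
 *      static struct sock *example_clone(const struct sock *parent)
 *      {
 *              struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);
 *
 *              if (!child)
 *                      return NULL;
 *              ... protocol-specific initialization of child ...
 *              bh_unlock_sock(child);
 *              return child;
 *      }
 */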
1770
1771void sk_free_unlock_clone(struct sock *sk)
1772{
1773        /* It is still a raw copy of the parent, so invalidate
1774         * the destructor and do a plain sk_free() */
1775        sk->sk_destruct = NULL;
1776        bh_unlock_sock(sk);
1777        sk_free(sk);
1778}
1779EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1780
1781void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1782{
1783        u32 max_segs = 1;
1784
1785        sk_dst_set(sk, dst);
1786        sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1787        if (sk->sk_route_caps & NETIF_F_GSO)
1788                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1789        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1790        if (sk_can_gso(sk)) {
1791                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1792                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1793                } else {
1794                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1795                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1796                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1797                }
1798        }
1799        sk->sk_gso_max_segs = max_segs;
1800}
1801EXPORT_SYMBOL_GPL(sk_setup_caps);
1802
1803/*
1804 *      Simple resource managers for sockets.
1805 */
1806
1807
1808/*
1809 * Write buffer destructor automatically called from kfree_skb.
1810 */
1811void sock_wfree(struct sk_buff *skb)
1812{
1813        struct sock *sk = skb->sk;
1814        unsigned int len = skb->truesize;
1815
1816        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1817                /*
1818                 * Keep a reference on sk_wmem_alloc, this will be released
1819                 * after sk_write_space() call
1820                 */
1821                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1822                sk->sk_write_space(sk);
1823                len = 1;
1824        }
1825        /*
1826         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1827         * could not do because of in-flight packets
1828         */
1829        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1830                __sk_free(sk);
1831}
1832EXPORT_SYMBOL(sock_wfree);
1833
1834/* This variant of sock_wfree() is used by TCP,
1835 * since it sets SOCK_USE_WRITE_QUEUE.
1836 */
1837void __sock_wfree(struct sk_buff *skb)
1838{
1839        struct sock *sk = skb->sk;
1840
1841        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1842                __sk_free(sk);
1843}
1844
1845void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1846{
1847        skb_orphan(skb);
1848        skb->sk = sk;
1849#ifdef CONFIG_INET
1850        if (unlikely(!sk_fullsock(sk))) {
1851                skb->destructor = sock_edemux;
1852                sock_hold(sk);
1853                return;
1854        }
1855#endif
1856        skb->destructor = sock_wfree;
1857        skb_set_hash_from_sk(skb, sk);
1858        /*
1859         * We used to take a refcount on sk, but the following operation
1860         * is enough to guarantee sk_free() won't free this sock until
1861         * all in-flight packets are completed.
1862         */
1863        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1864}
1865EXPORT_SYMBOL(skb_set_owner_w);
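
/*
 * Illustrative sketch (not part of this file): a transmit path that builds
 * its own skb charges it to the socket before handing it off, so that the
 * write-space accounting above applies:
 *
 *      skb = alloc_skb(len, sk->sk_allocation);
 *      if (!skb)
 *              return -ENOBUFS;
 *      skb_set_owner_w(skb, sk);
 *      ... fill and transmit skb; sock_wfree() then runs from kfree_skb() ...
 */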
1866
1867/* This helper is used by netem, as it can hold packets in its
1868 * delay queue. We want to allow the owner socket to send more
1869 * packets, as if they were already TX completed by a typical driver.
1870 * But we also want to keep skb->sk set because some packet schedulers
1871 * rely on it (sch_fq for example).
1872 */
1873void skb_orphan_partial(struct sk_buff *skb)
1874{
1875        if (skb_is_tcp_pure_ack(skb))
1876                return;
1877
1878        if (skb->destructor == sock_wfree
1879#ifdef CONFIG_INET
1880            || skb->destructor == tcp_wfree
1881#endif
1882                ) {
1883                struct sock *sk = skb->sk;
1884
1885                if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1886                        WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1887                        skb->destructor = sock_efree;
1888                }
1889        } else {
1890                skb_orphan(skb);
1891        }
1892}
1893EXPORT_SYMBOL(skb_orphan_partial);
1894
1895/*
1896 * Read buffer destructor automatically called from kfree_skb.
1897 */
1898void sock_rfree(struct sk_buff *skb)
1899{
1900        struct sock *sk = skb->sk;
1901        unsigned int len = skb->truesize;
1902
1903        atomic_sub(len, &sk->sk_rmem_alloc);
1904        sk_mem_uncharge(sk, len);
1905}
1906EXPORT_SYMBOL(sock_rfree);
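
/*
 * Illustrative sketch (not part of this file): receive paths usually charge
 * an skb through skb_set_owner_r(), which installs sock_rfree() as the
 * destructor before the skb is queued:
 *
 *      skb_set_owner_r(skb, sk);
 *      skb_queue_tail(&sk->sk_receive_queue, skb);
 */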
1907
1908/*
1909 * Buffer destructor for skbs that are not used directly in read or write
1910 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1911 */
1912void sock_efree(struct sk_buff *skb)
1913{
1914        sock_put(skb->sk);
1915}
1916EXPORT_SYMBOL(sock_efree);
1917
1918kuid_t sock_i_uid(struct sock *sk)
1919{
1920        kuid_t uid;
1921
1922        read_lock_bh(&sk->sk_callback_lock);
1923        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1924        read_unlock_bh(&sk->sk_callback_lock);
1925        return uid;
1926}
1927EXPORT_SYMBOL(sock_i_uid);
1928
1929unsigned long sock_i_ino(struct sock *sk)
1930{
1931        unsigned long ino;
1932
1933        read_lock_bh(&sk->sk_callback_lock);
1934        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1935        read_unlock_bh(&sk->sk_callback_lock);
1936        return ino;
1937}
1938EXPORT_SYMBOL(sock_i_ino);
1939
1940/*
1941 * Allocate an skb from the socket's send buffer.
1942 */
1943struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1944                             gfp_t priority)
1945{
1946        if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1947                struct sk_buff *skb = alloc_skb(size, priority);
1948                if (skb) {
1949                        skb_set_owner_w(skb, sk);
1950                        return skb;
1951                }
1952        }
1953        return NULL;
1954}
1955EXPORT_SYMBOL(sock_wmalloc);
1956
1957static void sock_ofree(struct sk_buff *skb)
1958{
1959        struct sock *sk = skb->sk;
1960
1961        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1962}
1963
1964struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1965                             gfp_t priority)
1966{
1967        struct sk_buff *skb;
1968
1969        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1970        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1971            sysctl_optmem_max)
1972                return NULL;
1973
1974        skb = alloc_skb(size, priority);
1975        if (!skb)
1976                return NULL;
1977
1978        atomic_add(skb->truesize, &sk->sk_omem_alloc);
1979        skb->sk = sk;
1980        skb->destructor = sock_ofree;
1981        return skb;
1982}
1983
1984/*
1985 * Allocate a memory block from the socket's option memory buffer.
1986 */
1987void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1988{
1989        if ((unsigned int)size <= sysctl_optmem_max &&
1990            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1991                void *mem;
1992                /* First do the add, to avoid the race if kmalloc
1993                 * might sleep.
1994                 */
1995                atomic_add(size, &sk->sk_omem_alloc);
1996                mem = kmalloc(size, priority);
1997                if (mem)
1998                        return mem;
1999                atomic_sub(size, &sk->sk_omem_alloc);
2000        }
2001        return NULL;
2002}
2003EXPORT_SYMBOL(sock_kmalloc);
2004
2005/* Free an option memory block. Note, we actually want the inline
2006 * here as this allows gcc to detect the nullify and fold away the
2007 * condition entirely.
2008 */
2009static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2010                                  const bool nullify)
2011{
2012        if (WARN_ON_ONCE(!mem))
2013                return;
2014        if (nullify)
2015                kzfree(mem);
2016        else
2017                kfree(mem);
2018        atomic_sub(size, &sk->sk_omem_alloc);
2019}
2020
2021void sock_kfree_s(struct sock *sk, void *mem, int size)
2022{
2023        __sock_kfree_s(sk, mem, size, false);
2024}
2025EXPORT_SYMBOL(sock_kfree_s);
2026
2027void sock_kzfree_s(struct sock *sk, void *mem, int size)
2028{
2029        __sock_kfree_s(sk, mem, size, true);
2030}
2031EXPORT_SYMBOL(sock_kzfree_s);
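
/*
 * Illustrative sketch (not part of this file): option memory must be freed
 * with the same size it was charged with, e.g. in a hypothetical setsockopt
 * handler:
 *
 *      buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *      if (!buf)
 *              return -ENOMEM;
 *      if (copy_from_user(buf, optval, optlen)) {
 *              sock_kfree_s(sk, buf, optlen);
 *              return -EFAULT;
 *      }
 *      ... use buf, then release it with sock_kfree_s(sk, buf, optlen) ...
 */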
2032
2033/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2034   I think these locks should be removed for datagram sockets.
2035 */
2036static long sock_wait_for_wmem(struct sock *sk, long timeo)
2037{
2038        DEFINE_WAIT(wait);
2039
2040        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2041        for (;;) {
2042                if (!timeo)
2043                        break;
2044                if (signal_pending(current))
2045                        break;
2046                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2047                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2048                if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2049                        break;
2050                if (sk->sk_shutdown & SEND_SHUTDOWN)
2051                        break;
2052                if (sk->sk_err)
2053                        break;
2054                timeo = schedule_timeout(timeo);
2055        }
2056        finish_wait(sk_sleep(sk), &wait);
2057        return timeo;
2058}
2059
2060
2061/*
2062 *      Generic send/receive buffer handlers
2063 */
2064
2065struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2066                                     unsigned long data_len, int noblock,
2067                                     int *errcode, int max_page_order)
2068{
2069        struct sk_buff *skb;
2070        long timeo;
2071        int err;
2072
2073        timeo = sock_sndtimeo(sk, noblock);
2074        for (;;) {
2075                err = sock_error(sk);
2076                if (err != 0)
2077                        goto failure;
2078
2079                err = -EPIPE;
2080                if (sk->sk_shutdown & SEND_SHUTDOWN)
2081                        goto failure;
2082
2083                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2084                        break;
2085
2086                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2087                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2088                err = -EAGAIN;
2089                if (!timeo)
2090                        goto failure;
2091                if (signal_pending(current))
2092                        goto interrupted;
2093                timeo = sock_wait_for_wmem(sk, timeo);
2094        }
2095        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2096                                   errcode, sk->sk_allocation);
2097        if (skb)
2098                skb_set_owner_w(skb, sk);
2099        return skb;
2100
2101interrupted:
2102        err = sock_intr_errno(timeo);
2103failure:
2104        *errcode = err;
2105        return NULL;
2106}
2107EXPORT_SYMBOL(sock_alloc_send_pskb);
2108
2109struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2110                                    int noblock, int *errcode)
2111{
2112        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2113}
2114EXPORT_SYMBOL(sock_alloc_send_skb);
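
/*
 * Illustrative sketch (not part of this file): a datagram sendmsg
 * implementation typically blocks for send-buffer space here; hlen and len
 * are hypothetical header and payload sizes.
 *
 *      skb = sock_alloc_send_skb(sk, hlen + len,
 *                                msg->msg_flags & MSG_DONTWAIT, &err);
 *      if (!skb)
 *              return err;
 *      skb_reserve(skb, hlen);
 *      err = memcpy_from_msg(skb_put(skb, len), msg, len);
 */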
2115
2116int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2117                     struct sockcm_cookie *sockc)
2118{
2119        u32 tsflags;
2120
2121        switch (cmsg->cmsg_type) {
2122        case SO_MARK:
2123                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2124                        return -EPERM;
2125                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126                        return -EINVAL;
2127                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2128                break;
2129        case SO_TIMESTAMPING:
2130                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2131                        return -EINVAL;
2132
2133                tsflags = *(u32 *)CMSG_DATA(cmsg);
2134                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2135                        return -EINVAL;
2136
2137                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2138                sockc->tsflags |= tsflags;
2139                break;
2140        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2141        case SCM_RIGHTS:
2142        case SCM_CREDENTIALS:
2143                break;
2144        default:
2145                return -EINVAL;
2146        }
2147        return 0;
2148}
2149EXPORT_SYMBOL(__sock_cmsg_send);
2150
2151int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2152                   struct sockcm_cookie *sockc)
2153{
2154        struct cmsghdr *cmsg;
2155        int ret;
2156
2157        for_each_cmsghdr(cmsg, msg) {
2158                if (!CMSG_OK(msg, cmsg))
2159                        return -EINVAL;
2160                if (cmsg->cmsg_level != SOL_SOCKET)
2161                        continue;
2162                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2163                if (ret)
2164                        return ret;
2165        }
2166        return 0;
2167}
2168EXPORT_SYMBOL(sock_cmsg_send);
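
/*
 * Illustrative sketch (not part of this file): sendmsg implementations seed
 * the cookie from the socket defaults and then let control messages
 * override it:
 *
 *      struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *      if (msg->msg_controllen) {
 *              err = sock_cmsg_send(sk, msg, &sockc);
 *              if (unlikely(err))
 *                      return err;
 *      }
 */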
2169
2170static void sk_enter_memory_pressure(struct sock *sk)
2171{
2172        if (!sk->sk_prot->enter_memory_pressure)
2173                return;
2174
2175        sk->sk_prot->enter_memory_pressure(sk);
2176}
2177
2178static void sk_leave_memory_pressure(struct sock *sk)
2179{
2180        if (sk->sk_prot->leave_memory_pressure) {
2181                sk->sk_prot->leave_memory_pressure(sk);
2182        } else {
2183                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2184
2185                if (memory_pressure && *memory_pressure)
2186                        *memory_pressure = 0;
2187        }
2188}
2189
2190/* On 32bit arches, an skb frag is limited to 2^15 */
2191#define SKB_FRAG_PAGE_ORDER     get_order(32768)
2192
2193/**
2194 * skb_page_frag_refill - check that a page_frag contains enough room
2195 * @sz: minimum size of the fragment we want to get
2196 * @pfrag: pointer to page_frag
2197 * @gfp: priority for memory allocation
2198 *
2199 * Note: While this allocator tries to use high order pages, there is
2200 * no guarantee that allocations succeed. Therefore, @sz MUST be
2201 * less than or equal to PAGE_SIZE.
2202 */
2203bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2204{
2205        if (pfrag->page) {
2206                if (page_ref_count(pfrag->page) == 1) {
2207                        pfrag->offset = 0;
2208                        return true;
2209                }
2210                if (pfrag->offset + sz <= pfrag->size)
2211                        return true;
2212                put_page(pfrag->page);
2213        }
2214
2215        pfrag->offset = 0;
2216        if (SKB_FRAG_PAGE_ORDER) {
2217                /* Avoid direct reclaim but allow kswapd to wake */
2218                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2219                                          __GFP_COMP | __GFP_NOWARN |
2220                                          __GFP_NORETRY,
2221                                          SKB_FRAG_PAGE_ORDER);
2222                if (likely(pfrag->page)) {
2223                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2224                        return true;
2225                }
2226        }
2227        pfrag->page = alloc_page(gfp);
2228        if (likely(pfrag->page)) {
2229                pfrag->size = PAGE_SIZE;
2230                return true;
2231        }
2232        return false;
2233}
2234EXPORT_SYMBOL(skb_page_frag_refill);
2235
2236bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2237{
2238        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2239                return true;
2240
2241        sk_enter_memory_pressure(sk);
2242        sk_stream_moderate_sndbuf(sk);
2243        return false;
2244}
2245EXPORT_SYMBOL(sk_page_frag_refill);
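
/*
 * Illustrative sketch (not part of this file): a typical copy loop pairs
 * sk_page_frag() with sk_page_frag_refill() and advances pfrag->offset by
 * however many bytes it consumed:
 *
 *      struct page_frag *pfrag = sk_page_frag(sk);
 *
 *      if (!sk_page_frag_refill(sk, pfrag))
 *              goto wait_for_memory;
 *      copy = min_t(int, len, pfrag->size - pfrag->offset);
 *      ... copy 'copy' bytes into pfrag->page at pfrag->offset,
 *          attach the page to the skb, then pfrag->offset += copy ...
 */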
2246
2247int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2248                int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2249                int first_coalesce)
2250{
2251        int sg_curr = *sg_curr_index, use = 0, rc = 0;
2252        unsigned int size = *sg_curr_size;
2253        struct page_frag *pfrag;
2254        struct scatterlist *sge;
2255
2256        len -= size;
2257        pfrag = sk_page_frag(sk);
2258
2259        while (len > 0) {
2260                unsigned int orig_offset;
2261
2262                if (!sk_page_frag_refill(sk, pfrag)) {
2263                        rc = -ENOMEM;
2264                        goto out;
2265                }
2266
2267                use = min_t(int, len, pfrag->size - pfrag->offset);
2268
2269                if (!sk_wmem_schedule(sk, use)) {
2270                        rc = -ENOMEM;
2271                        goto out;
2272                }
2273
2274                sk_mem_charge(sk, use);
2275                size += use;
2276                orig_offset = pfrag->offset;
2277                pfrag->offset += use;
2278
2279                sge = sg + sg_curr - 1;
2280                if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2281                    sge->offset + sge->length == orig_offset) {
2282                        sge->length += use;
2283                } else {
2284                        sge = sg + sg_curr;
2285                        sg_unmark_end(sge);
2286                        sg_set_page(sge, pfrag->page, use, orig_offset);
2287                        get_page(pfrag->page);
2288                        sg_curr++;
2289
2290                        if (sg_curr == MAX_SKB_FRAGS)
2291                                sg_curr = 0;
2292
2293                        if (sg_curr == sg_start) {
2294                                rc = -ENOSPC;
2295                                break;
2296                        }
2297                }
2298
2299                len -= use;
2300        }
2301out:
2302        *sg_curr_size = size;
2303        *sg_curr_index = sg_curr;
2304        return rc;
2305}
2306EXPORT_SYMBOL(sk_alloc_sg);
2307
2308static void __lock_sock(struct sock *sk)
2309        __releases(&sk->sk_lock.slock)
2310        __acquires(&sk->sk_lock.slock)
2311{
2312        DEFINE_WAIT(wait);
2313
2314        for (;;) {
2315                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2316                                        TASK_UNINTERRUPTIBLE);
2317                spin_unlock_bh(&sk->sk_lock.slock);
2318                schedule();
2319                spin_lock_bh(&sk->sk_lock.slock);
2320                if (!sock_owned_by_user(sk))
2321                        break;
2322        }
2323        finish_wait(&sk->sk_lock.wq, &wait);
2324}
2325
2326static void __release_sock(struct sock *sk)
2327        __releases(&sk->sk_lock.slock)
2328        __acquires(&sk->sk_lock.slock)
2329{
2330        struct sk_buff *skb, *next;
2331
2332        while ((skb = sk->sk_backlog.head) != NULL) {
2333                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2334
2335                spin_unlock_bh(&sk->sk_lock.slock);
2336
2337                do {
2338                        next = skb->next;
2339                        prefetch(next);
2340                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2341                        skb->next = NULL;
2342                        sk_backlog_rcv(sk, skb);
2343
2344                        cond_resched();
2345
2346                        skb = next;
2347                } while (skb != NULL);
2348
2349                spin_lock_bh(&sk->sk_lock.slock);
2350        }
2351
2352        /*
2353         * Doing the zeroing here guarantees we cannot loop forever
2354         * while a wild producer attempts to flood us.
2355         */
2356        sk->sk_backlog.len = 0;
2357}
2358
2359void __sk_flush_backlog(struct sock *sk)
2360{
2361        spin_lock_bh(&sk->sk_lock.slock);
2362        __release_sock(sk);
2363        spin_unlock_bh(&sk->sk_lock.slock);
2364}
2365
2366/**
2367 * sk_wait_data - wait for data to arrive at sk_receive_queue
2368 * @sk:    sock to wait on
2369 * @timeo: for how long
2370 * @skb:   last skb seen on sk_receive_queue
2371 *
2372 * Socket state, including sk->sk_err, is changed only under the socket lock,
2373 * hence we may omit checks after joining the wait queue.
2374 * We check the receive queue before schedule() only as an optimization;
2375 * it is very likely that release_sock() added new data.
2376 */
2377int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2378{
2379        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2380        int rc;
2381
2382        add_wait_queue(sk_sleep(sk), &wait);
2383        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2384        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2385        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2386        remove_wait_queue(sk_sleep(sk), &wait);
2387        return rc;
2388}
2389EXPORT_SYMBOL(sk_wait_data);
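
/*
 * Illustrative sketch (not part of this file): a blocking receive path loops
 * over its queue with the socket lock held and waits when the queue is empty:
 *
 *      lock_sock(sk);
 *      timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *      while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *              if (!timeo || signal_pending(current))
 *                      break;
 *              sk_wait_data(sk, &timeo, NULL);
 *      }
 *      release_sock(sk);
 */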
2390
2391/**
2392 *      __sk_mem_raise_allocated - increase memory_allocated
2393 *      @sk: socket
2394 *      @size: memory size to allocate
2395 *      @amt: pages to allocate
2396 *      @kind: allocation type
2397 *
2398 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2399 */
2400int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2401{
2402        struct proto *prot = sk->sk_prot;
2403        long allocated = sk_memory_allocated_add(sk, amt);
2404
2405        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2406            !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2407                goto suppress_allocation;
2408
2409        /* Under limit. */
2410        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2411                sk_leave_memory_pressure(sk);
2412                return 1;
2413        }
2414
2415        /* Under pressure. */
2416        if (allocated > sk_prot_mem_limits(sk, 1))
2417                sk_enter_memory_pressure(sk);
2418
2419        /* Over hard limit. */
2420        if (allocated > sk_prot_mem_limits(sk, 2))
2421                goto suppress_allocation;
2422
2423        /* guarantee minimum buffer size under pressure */
2424        if (kind == SK_MEM_RECV) {
2425                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2426                        return 1;
2427
2428        } else { /* SK_MEM_SEND */
2429                int wmem0 = sk_get_wmem0(sk, prot);
2430
2431                if (sk->sk_type == SOCK_STREAM) {
2432                        if (sk->sk_wmem_queued < wmem0)
2433                                return 1;
2434                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2435                        return 1;
2436                }
2437        }
2438
2439        if (sk_has_memory_pressure(sk)) {
2440                int alloc;
2441
2442                if (!sk_under_memory_pressure(sk))
2443                        return 1;
2444                alloc = sk_sockets_allocated_read_positive(sk);
2445                if (sk_prot_mem_limits(sk, 2) > alloc *
2446                    sk_mem_pages(sk->sk_wmem_queued +
2447                                 atomic_read(&sk->sk_rmem_alloc) +
2448                                 sk->sk_forward_alloc))
2449                        return 1;
2450        }
2451
2452suppress_allocation:
2453
2454        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2455                sk_stream_moderate_sndbuf(sk);
2456
2457                /* Fail only if socket is _under_ its sndbuf.
2458                 * In this case we cannot block, so we have to fail.
2459                 */
2460                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2461                        return 1;
2462        }
2463
2464        trace_sock_exceed_buf_limit(sk, prot, allocated);
2465
2466        sk_memory_allocated_sub(sk, amt);
2467
2468        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2469                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2470
2471        return 0;
2472}
2473EXPORT_SYMBOL(__sk_mem_raise_allocated);
2474
2475/**
2476 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2477 *      @sk: socket
2478 *      @size: memory size to allocate
2479 *      @kind: allocation type
2480 *
2481 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2482 *      rmem allocation. This function assumes that protocols which have
2483 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2484 */
2485int __sk_mem_schedule(struct sock *sk, int size, int kind)
2486{
2487        int ret, amt = sk_mem_pages(size);
2488
2489        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2490        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2491        if (!ret)
2492                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2493        return ret;
2494}
2495EXPORT_SYMBOL(__sk_mem_schedule);
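
/*
 * Illustrative sketch (not part of this file): protocols normally use the
 * inline wrappers, which only fall back to this slow path when
 * sk_forward_alloc is too small for the request:
 *
 *      if (!sk_wmem_schedule(sk, skb->truesize))
 *              goto drop;
 *      sk_mem_charge(sk, skb->truesize);
 */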
2496
2497/**
2498 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2499 *      @sk: socket
2500 *      @amount: number of quanta
2501 *
2502 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2503 */
2504void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2505{
2506        sk_memory_allocated_sub(sk, amount);
2507
2508        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2509                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2510
2511        if (sk_under_memory_pressure(sk) &&
2512            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2513                sk_leave_memory_pressure(sk);
2514}
2515EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2516
2517/**
2518 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2519 *      @sk: socket
2520 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2521 */
2522void __sk_mem_reclaim(struct sock *sk, int amount)
2523{
2524        amount >>= SK_MEM_QUANTUM_SHIFT;
2525        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2526        __sk_mem_reduce_allocated(sk, amount);
2527}
2528EXPORT_SYMBOL(__sk_mem_reclaim);
2529
2530int sk_set_peek_off(struct sock *sk, int val)
2531{
2532        sk->sk_peek_off = val;
2533        return 0;
2534}
2535EXPORT_SYMBOL_GPL(sk_set_peek_off);
2536
2537/*
2538 * Set of default routines for initialising struct proto_ops when
2539 * the protocol does not support a particular function. In certain
2540 * cases where it makes no sense for a protocol to have a "do nothing"
2541 * function, some default processing is provided.
2542 */
2543
2544int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2545{
2546        return -EOPNOTSUPP;
2547}
2548EXPORT_SYMBOL(sock_no_bind);
2549
2550int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2551                    int len, int flags)
2552{
2553        return -EOPNOTSUPP;
2554}
2555EXPORT_SYMBOL(sock_no_connect);
2556
2557int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2558{
2559        return -EOPNOTSUPP;
2560}
2561EXPORT_SYMBOL(sock_no_socketpair);
2562
2563int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2564                   bool kern)
2565{
2566        return -EOPNOTSUPP;
2567}
2568EXPORT_SYMBOL(sock_no_accept);
2569
2570int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2571                    int peer)
2572{
2573        return -EOPNOTSUPP;
2574}
2575EXPORT_SYMBOL(sock_no_getname);
2576
2577int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2578{
2579        return -EOPNOTSUPP;
2580}
2581EXPORT_SYMBOL(sock_no_ioctl);
2582
2583int sock_no_listen(struct socket *sock, int backlog)
2584{
2585        return -EOPNOTSUPP;
2586}
2587EXPORT_SYMBOL(sock_no_listen);
2588
2589int sock_no_shutdown(struct socket *sock, int how)
2590{
2591        return -EOPNOTSUPP;
2592}
2593EXPORT_SYMBOL(sock_no_shutdown);
2594
2595int sock_no_setsockopt(struct socket *sock, int level, int optname,
2596                    char __user *optval, unsigned int optlen)
2597{
2598        return -EOPNOTSUPP;
2599}
2600EXPORT_SYMBOL(sock_no_setsockopt);
2601
2602int sock_no_getsockopt(struct socket *sock, int level, int optname,
2603                    char __user *optval, int __user *optlen)
2604{
2605        return -EOPNOTSUPP;
2606}
2607EXPORT_SYMBOL(sock_no_getsockopt);
2608
2609int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2610{
2611        return -EOPNOTSUPP;
2612}
2613EXPORT_SYMBOL(sock_no_sendmsg);
2614
2615int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2616{
2617        return -EOPNOTSUPP;
2618}
2619EXPORT_SYMBOL(sock_no_sendmsg_locked);
2620
2621int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2622                    int flags)
2623{
2624        return -EOPNOTSUPP;
2625}
2626EXPORT_SYMBOL(sock_no_recvmsg);
2627
2628int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2629{
2630        /* Mirror missing mmap method error code */
2631        return -ENODEV;
2632}
2633EXPORT_SYMBOL(sock_no_mmap);
2634
2635ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2636{
2637        ssize_t res;
2638        struct msghdr msg = {.msg_flags = flags};
2639        struct kvec iov;
2640        char *kaddr = kmap(page);
2641        iov.iov_base = kaddr + offset;
2642        iov.iov_len = size;
2643        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2644        kunmap(page);
2645        return res;
2646}
2647EXPORT_SYMBOL(sock_no_sendpage);
2648
2649ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2650                                int offset, size_t size, int flags)
2651{
2652        ssize_t res;
2653        struct msghdr msg = {.msg_flags = flags};
2654        struct kvec iov;
2655        char *kaddr = kmap(page);
2656
2657        iov.iov_base = kaddr + offset;
2658        iov.iov_len = size;
2659        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2660        kunmap(page);
2661        return res;
2662}
2663EXPORT_SYMBOL(sock_no_sendpage_locked);
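
/*
 * Illustrative sketch (not part of this file): a family wires the operations
 * it does not support to the stubs above; example_ops is a hypothetical
 * table and PF_PACKET merely a placeholder family.
 *
 *      static const struct proto_ops example_ops = {
 *              .family         = PF_PACKET,
 *              .owner          = THIS_MODULE,
 *              .connect        = sock_no_connect,
 *              .socketpair     = sock_no_socketpair,
 *              .accept         = sock_no_accept,
 *              .listen         = sock_no_listen,
 *              .mmap           = sock_no_mmap,
 *              .sendpage       = sock_no_sendpage,
 *              ...
 *      };
 */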
2664
2665/*
2666 *      Default Socket Callbacks
2667 */
2668
2669static void sock_def_wakeup(struct sock *sk)
2670{
2671        struct socket_wq *wq;
2672
2673        rcu_read_lock();
2674        wq = rcu_dereference(sk->sk_wq);
2675        if (skwq_has_sleeper(wq))
2676                wake_up_interruptible_all(&wq->wait);
2677        rcu_read_unlock();
2678}
2679
2680static void sock_def_error_report(struct sock *sk)
2681{
2682        struct socket_wq *wq;
2683
2684        rcu_read_lock();
2685        wq = rcu_dereference(sk->sk_wq);
2686        if (skwq_has_sleeper(wq))
2687                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2688        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2689        rcu_read_unlock();
2690}
2691
2692static void sock_def_readable(struct sock *sk)
2693{
2694        struct socket_wq *wq;
2695
2696        rcu_read_lock();
2697        wq = rcu_dereference(sk->sk_wq);
2698        if (skwq_has_sleeper(wq))
2699                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2700                                                EPOLLRDNORM | EPOLLRDBAND);
2701        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2702        rcu_read_unlock();
2703}
2704
2705static void sock_def_write_space(struct sock *sk)
2706{
2707        struct socket_wq *wq;
2708
2709        rcu_read_lock();
2710
2711        /* Do not wake up a writer until he can make "significant"
2712         * progress.  --DaveM
2713         */
2714        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2715                wq = rcu_dereference(sk->sk_wq);
2716                if (skwq_has_sleeper(wq))
2717                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2718                                                EPOLLWRNORM | EPOLLWRBAND);
2719
2720                /* Should agree with poll, otherwise some programs break */
2721                if (sock_writeable(sk))
2722                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2723        }
2724
2725        rcu_read_unlock();
2726}
2727
2728static void sock_def_destruct(struct sock *sk)
2729{
2730}
2731
2732void sk_send_sigurg(struct sock *sk)
2733{
2734        if (sk->sk_socket && sk->sk_socket->file)
2735                if (send_sigurg(&sk->sk_socket->file->f_owner))
2736                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2737}
2738EXPORT_SYMBOL(sk_send_sigurg);
2739
2740void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2741                    unsigned long expires)
2742{
2743        if (!mod_timer(timer, expires))
2744                sock_hold(sk);
2745}
2746EXPORT_SYMBOL(sk_reset_timer);
2747
2748void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2749{
2750        if (del_timer(timer))
2751                __sock_put(sk);
2752}
2753EXPORT_SYMBOL(sk_stop_timer);
2754
2755void sock_init_data(struct socket *sock, struct sock *sk)
2756{
2757        sk_init_common(sk);
2758        sk->sk_send_head        =       NULL;
2759
2760        timer_setup(&sk->sk_timer, NULL, 0);
2761
2762        sk->sk_allocation       =       GFP_KERNEL;
2763        sk->sk_rcvbuf           =       sysctl_rmem_default;
2764        sk->sk_sndbuf           =       sysctl_wmem_default;
2765        sk->sk_state            =       TCP_CLOSE;
2766        sk_set_socket(sk, sock);
2767
2768        sock_set_flag(sk, SOCK_ZAPPED);
2769
2770        if (sock) {
2771                sk->sk_type     =       sock->type;
2772                sk->sk_wq       =       sock->wq;
2773                sock->sk        =       sk;
2774                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2775        } else {
2776                sk->sk_wq       =       NULL;
2777                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2778        }
2779
2780        rwlock_init(&sk->sk_callback_lock);
2781        if (sk->sk_kern_sock)
2782                lockdep_set_class_and_name(
2783                        &sk->sk_callback_lock,
2784                        af_kern_callback_keys + sk->sk_family,
2785                        af_family_kern_clock_key_strings[sk->sk_family]);
2786        else
2787                lockdep_set_class_and_name(
2788                        &sk->sk_callback_lock,
2789                        af_callback_keys + sk->sk_family,
2790                        af_family_clock_key_strings[sk->sk_family]);
2791
2792        sk->sk_state_change     =       sock_def_wakeup;
2793        sk->sk_data_ready       =       sock_def_readable;
2794        sk->sk_write_space      =       sock_def_write_space;
2795        sk->sk_error_report     =       sock_def_error_report;
2796        sk->sk_destruct         =       sock_def_destruct;
2797
2798        sk->sk_frag.page        =       NULL;
2799        sk->sk_frag.offset      =       0;
2800        sk->sk_peek_off         =       -1;
2801
2802        sk->sk_peer_pid         =       NULL;
2803        sk->sk_peer_cred        =       NULL;
2804        sk->sk_write_pending    =       0;
2805        sk->sk_rcvlowat         =       1;
2806        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2807        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2808
2809        sk->sk_stamp = SK_DEFAULT_STAMP;
2810        atomic_set(&sk->sk_zckey, 0);
2811
2812#ifdef CONFIG_NET_RX_BUSY_POLL
2813        sk->sk_napi_id          =       0;
2814        sk->sk_ll_usec          =       sysctl_net_busy_read;
2815#endif
2816
2817        sk->sk_max_pacing_rate = ~0U;
2818        sk->sk_pacing_rate = ~0U;
2819        sk->sk_pacing_shift = 10;
2820        sk->sk_incoming_cpu = -1;
2821        /*
2822         * Before updating sk_refcnt, we must commit prior changes to memory
2823         * (Documentation/RCU/rculist_nulls.txt for details)
2824         */
2825        smp_wmb();
2826        refcount_set(&sk->sk_refcnt, 1);
2827        atomic_set(&sk->sk_drops, 0);
2828}
2829EXPORT_SYMBOL(sock_init_data);
2830
2831void lock_sock_nested(struct sock *sk, int subclass)
2832{
2833        might_sleep();
2834        spin_lock_bh(&sk->sk_lock.slock);
2835        if (sk->sk_lock.owned)
2836                __lock_sock(sk);
2837        sk->sk_lock.owned = 1;
2838        spin_unlock(&sk->sk_lock.slock);
2839        /*
2840         * The sk_lock has mutex_lock() semantics here:
2841         */
2842        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2843        local_bh_enable();
2844}
2845EXPORT_SYMBOL(lock_sock_nested);
2846
2847void release_sock(struct sock *sk)
2848{
2849        spin_lock_bh(&sk->sk_lock.slock);
2850        if (sk->sk_backlog.tail)
2851                __release_sock(sk);
2852
2853        /* Warning : release_cb() might need to release sk ownership,
2854         * i.e. call sock_release_ownership(sk) before us.
2855         */
2856        if (sk->sk_prot->release_cb)
2857                sk->sk_prot->release_cb(sk);
2858
2859        sock_release_ownership(sk);
2860        if (waitqueue_active(&sk->sk_lock.wq))
2861                wake_up(&sk->sk_lock.wq);
2862        spin_unlock_bh(&sk->sk_lock.slock);
2863}
2864EXPORT_SYMBOL(release_sock);
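
/*
 * Illustrative sketch (not part of this file): process-context code brackets
 * socket state changes with this pair so that softirq input is diverted to
 * the backlog in the meantime:
 *
 *      lock_sock(sk);
 *      ... examine or modify sk state, possibly sleeping ...
 *      release_sock(sk);
 */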
2865
2866/**
2867 * lock_sock_fast - fast version of lock_sock
2868 * @sk: socket
2869 *
2870 * This version should be used for very small sections, where the process won't block.
2871 * Returns false if the fast path is taken:
2872 *
2873 *   sk_lock.slock locked, owned = 0, BH disabled
2874 *
2875 * Returns true if the slow path is taken:
2876 *
2877 *   sk_lock.slock unlocked, owned = 1, BH enabled
2878 */
2879bool lock_sock_fast(struct sock *sk)
2880{
2881        might_sleep();
2882        spin_lock_bh(&sk->sk_lock.slock);
2883
2884        if (!sk->sk_lock.owned)
2885                /*
2886                 * Note: the fast path returns with BH still disabled
2887                 */
2888                return false;
2889
2890        __lock_sock(sk);
2891        sk->sk_lock.owned = 1;
2892        spin_unlock(&sk->sk_lock.slock);
2893        /*
2894         * The sk_lock has mutex_lock() semantics here:
2895         */
2896        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2897        local_bh_enable();
2898        return true;
2899}
2900EXPORT_SYMBOL(lock_sock_fast);
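
/*
 * Illustrative sketch (not part of this file): the return value must be
 * handed back to unlock_sock_fast() so the matching unlock path is taken:
 *
 *      bool slow = lock_sock_fast(sk);
 *
 *      ... short, non-blocking work on sk ...
 *      unlock_sock_fast(sk, slow);
 */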
2901
2902int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2903{
2904        struct timeval tv;
2905        if (!sock_flag(sk, SOCK_TIMESTAMP))
2906                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2907        tv = ktime_to_timeval(sk->sk_stamp);
2908        if (tv.tv_sec == -1)
2909                return -ENOENT;
2910        if (tv.tv_sec == 0) {
2911                sk->sk_stamp = ktime_get_real();
2912                tv = ktime_to_timeval(sk->sk_stamp);
2913        }
2914        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2915}
2916EXPORT_SYMBOL(sock_get_timestamp);
2917
2918int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2919{
2920        struct timespec ts;
2921        if (!sock_flag(sk, SOCK_TIMESTAMP))
2922                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2923        ts = ktime_to_timespec(sk->sk_stamp);
2924        if (ts.tv_sec == -1)
2925                return -ENOENT;
2926        if (ts.tv_sec == 0) {
2927                sk->sk_stamp = ktime_get_real();
2928                ts = ktime_to_timespec(sk->sk_stamp);
2929        }
2930        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2931}
2932EXPORT_SYMBOL(sock_get_timestampns);
2933
2934void sock_enable_timestamp(struct sock *sk, int flag)
2935{
2936        if (!sock_flag(sk, flag)) {
2937                unsigned long previous_flags = sk->sk_flags;
2938
2939                sock_set_flag(sk, flag);
2940                /*
2941                 * we just set one of the two flags which require net
2942                 * time stamping, but time stamping might have been on
2943                 * already because of the other one
2944                 */
2945                if (sock_needs_netstamp(sk) &&
2946                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2947                        net_enable_timestamp();
2948        }
2949}
2950
2951int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2952                       int level, int type)
2953{
2954        struct sock_exterr_skb *serr;
2955        struct sk_buff *skb;
2956        int copied, err;
2957
2958        err = -EAGAIN;
2959        skb = sock_dequeue_err_skb(sk);
2960        if (skb == NULL)
2961                goto out;
2962
2963        copied = skb->len;
2964        if (copied > len) {
2965                msg->msg_flags |= MSG_TRUNC;
2966                copied = len;
2967        }
2968        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2969        if (err)
2970                goto out_free_skb;
2971
2972        sock_recv_timestamp(msg, sk, skb);
2973
2974        serr = SKB_EXT_ERR(skb);
2975        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2976
2977        msg->msg_flags |= MSG_ERRQUEUE;
2978        err = copied;
2979
2980out_free_skb:
2981        kfree_skb(skb);
2982out:
2983        return err;
2984}
2985EXPORT_SYMBOL(sock_recv_errqueue);
2986
2987/*
2988 *      Get a socket option on an socket.
2989 *      Get a socket option on a socket.
2990 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2991 *      asynchronous errors should be reported by getsockopt. We assume
2992 *      this means if you specify SO_ERROR (otherwise whats the point of it).
2993 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2994int sock_common_getsockopt(struct socket *sock, int level, int optname,
2995                           char __user *optval, int __user *optlen)
2996{
2997        struct sock *sk = sock->sk;
2998
2999        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3000}
3001EXPORT_SYMBOL(sock_common_getsockopt);
3002
3003#ifdef CONFIG_COMPAT
3004int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3005                                  char __user *optval, int __user *optlen)
3006{
3007        struct sock *sk = sock->sk;
3008
3009        if (sk->sk_prot->compat_getsockopt != NULL)
3010                return sk->sk_prot->compat_getsockopt(sk, level, optname,
3011                                                      optval, optlen);
3012        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3013}
3014EXPORT_SYMBOL(compat_sock_common_getsockopt);
3015#endif
3016
3017int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3018                        int flags)
3019{
3020        struct sock *sk = sock->sk;
3021        int addr_len = 0;
3022        int err;
3023
3024        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3025                                   flags & ~MSG_DONTWAIT, &addr_len);
3026        if (err >= 0)
3027                msg->msg_namelen = addr_len;
3028        return err;
3029}
3030EXPORT_SYMBOL(sock_common_recvmsg);
3031
3032/*
3033 *      Set socket options on an inet socket.
3034 */
3035int sock_common_setsockopt(struct socket *sock, int level, int optname,
3036                           char __user *optval, unsigned int optlen)
3037{
3038        struct sock *sk = sock->sk;
3039
3040        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3041}
3042EXPORT_SYMBOL(sock_common_setsockopt);
3043
3044#ifdef CONFIG_COMPAT
3045int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3046                                  char __user *optval, unsigned int optlen)
3047{
3048        struct sock *sk = sock->sk;
3049
3050        if (sk->sk_prot->compat_setsockopt != NULL)
3051                return sk->sk_prot->compat_setsockopt(sk, level, optname,
3052                                                      optval, optlen);
3053        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3054}
3055EXPORT_SYMBOL(compat_sock_common_setsockopt);
3056#endif
3057
3058void sk_common_release(struct sock *sk)
3059{
3060        if (sk->sk_prot->destroy)
3061                sk->sk_prot->destroy(sk);
3062
3063        /*
3064         * Observation: when sk_common_release() is called, processes have
3065         * no access to the socket, but the network stack still does.
3066         * Step one, detach it from networking:
3067         *
3068         * A. Remove from hash tables.
3069         */
3070
3071        sk->sk_prot->unhash(sk);
3072
3073        /*
3074         * At this point the socket cannot receive new packets, but it is
3075         * possible that some packets are in flight, because some CPU is running
3076         * the receiver and did a hash table lookup before we unhashed the socket.
3077         * They will reach the receive queue and be purged by the socket destructor.
3078         *
3079         * Also, we still have packets pending on the receive queue and, probably,
3080         * our own packets waiting in device queues. sock_destroy will drain the
3081         * receive queue, but transmitted packets will delay socket destruction
3082         * until the last reference is released.
3083         */
3084
3085        sock_orphan(sk);
3086
3087        xfrm_sk_free_policy(sk);
3088
3089        sk_refcnt_debug_release(sk);
3090
3091        sock_put(sk);
3092}
3093EXPORT_SYMBOL(sk_common_release);
3094
3095void sk_get_meminfo(const struct sock *sk, u32 *mem)
3096{
3097        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3098
3099        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3100        mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3101        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3102        mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3103        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3104        mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3105        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3106        mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3107        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3108}
3109
3110#ifdef CONFIG_PROC_FS
3111#define PROTO_INUSE_NR  64      /* should be enough for now */
3112struct prot_inuse {
3113        int val[PROTO_INUSE_NR];
3114};
3115
3116static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3117
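    /*
     * Per-cpu, per-netns count of sockets in use for each registered
     * protocol. Protocols are expected to bump it when a socket is hashed
     * and drop it when it is unhashed, typically along the lines of:
     *
     *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	(on hash)
     *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	(on unhash)
     *
     * sock_prot_inuse_get() sums the per-cpu values over all possible CPUs;
     * the result feeds the "sockets" column of /proc/net/protocols below
     * and, for some protocols, /proc/net/sockstat.
     */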
3118void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3119{
3120        __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3121}
3122EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3123
3124int sock_prot_inuse_get(struct net *net, struct proto *prot)
3125{
3126        int cpu, idx = prot->inuse_idx;
3127        int res = 0;
3128
3129        for_each_possible_cpu(cpu)
3130                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3131
3132        return res >= 0 ? res : 0;
3133}
3134EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3135
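    /*
     * In addition to the per-protocol counters above, each netns keeps a
     * plain per-cpu count of all sockets in use. sock_inuse_add() is called
     * from the socket allocation and release paths elsewhere in this file,
     * and sock_inuse_get() sums it for readers such as /proc/net/sockstat's
     * "sockets: used" line.
     */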
3136static void sock_inuse_add(struct net *net, int val)
3137{
3138        this_cpu_add(*net->core.sock_inuse, val);
3139}
3140
3141int sock_inuse_get(struct net *net)
3142{
3143        int cpu, res = 0;
3144
3145        for_each_possible_cpu(cpu)
3146                res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3147
3148        return res;
3149}
3150
3151EXPORT_SYMBOL_GPL(sock_inuse_get);
3152
3153static int __net_init sock_inuse_init_net(struct net *net)
3154{
3155        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3156        if (net->core.prot_inuse == NULL)
3157                return -ENOMEM;
3158
3159        net->core.sock_inuse = alloc_percpu(int);
3160        if (net->core.sock_inuse == NULL)
3161                goto out;
3162
3163        return 0;
3164
3165out:
3166        free_percpu(net->core.prot_inuse);
3167        return -ENOMEM;
3168}
3169
3170static void __net_exit sock_inuse_exit_net(struct net *net)
3171{
3172        free_percpu(net->core.prot_inuse);
3173        free_percpu(net->core.sock_inuse);
3174}
3175
3176static struct pernet_operations net_inuse_ops = {
3177        .init = sock_inuse_init_net,
3178        .exit = sock_inuse_exit_net,
3179};
3180
3181static __init int net_inuse_init(void)
3182{
3183        if (register_pernet_subsys(&net_inuse_ops))
3184                panic("Cannot initialize net inuse counters");
3185
3186        return 0;
3187}
3188
3189core_initcall(net_inuse_init);
3190
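    /*
     * Each registered proto is assigned a slot in the proto_inuse_idx bitmap;
     * that slot indexes the per-cpu prot_inuse->val[] counters above. The
     * last slot (PROTO_INUSE_NR - 1) is never handed out: if all other slots
     * are taken, find_first_zero_bit() returns that index, an error is
     * logged and the protocol simply goes unaccounted, which is also why
     * release_proto_idx() skips that index.
     */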
3191static void assign_proto_idx(struct proto *prot)
3192{
3193        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3194
3195        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3196                pr_err("PROTO_INUSE_NR exhausted\n");
3197                return;
3198        }
3199
3200        set_bit(prot->inuse_idx, proto_inuse_idx);
3201}
3202
3203static void release_proto_idx(struct proto *prot)
3204{
3205        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3206                clear_bit(prot->inuse_idx, proto_inuse_idx);
3207}
3208#else
3209static inline void assign_proto_idx(struct proto *prot)
3210{
3211}
3212
3213static inline void release_proto_idx(struct proto *prot)
3214{
3215}
3216
3217static void sock_inuse_add(struct net *net, int val)
3218{
3219}
3220#endif
3221
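    /*
     * Helpers for the optional request_sock ("embryonic" connection request)
     * slab cache tied to a struct proto. req_prot_init() creates a
     * "request_sock_<name>" kmem_cache sized for rsk_prot->obj_size;
     * req_prot_cleanup() tears it down. Both are used by proto_register()
     * and proto_unregister() below.
     */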
3222static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3223{
3224        if (!rsk_prot)
3225                return;
3226        kfree(rsk_prot->slab_name);
3227        rsk_prot->slab_name = NULL;
3228        kmem_cache_destroy(rsk_prot->slab);
3229        rsk_prot->slab = NULL;
3230}
3231
3232static int req_prot_init(const struct proto *prot)
3233{
3234        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3235
3236        if (!rsk_prot)
3237                return 0;
3238
3239        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3240                                        prot->name);
3241        if (!rsk_prot->slab_name)
3242                return -ENOMEM;
3243
3244        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3245                                           rsk_prot->obj_size, 0,
3246                                           SLAB_ACCOUNT | prot->slab_flags,
3247                                           NULL);
3248
3249        if (!rsk_prot->slab) {
3250                pr_crit("%s: Can't create request sock SLAB cache!\n",
3251                        prot->name);
3252                return -ENOMEM;
3253        }
3254        return 0;
3255}
3256
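    /*
     * proto_register()/proto_unregister() are the entry points protocol
     * implementations use to hook into the socket layer. With @alloc_slab
     * non-zero the sock slab is created (plus request_sock and timewait
     * caches when rsk_prot/twsk_prot are provided); on failure -ENOBUFS is
     * returned. A minimal, purely illustrative registration might look like
     * this (struct example_sock and "EXAMPLE" are hypothetical):
     *
     *	static struct proto example_proto = {
     *		.name	  = "EXAMPLE",
     *		.owner	  = THIS_MODULE,
     *		.obj_size = sizeof(struct example_sock),
     *	};
     *
     *	err = proto_register(&example_proto, 1);
     *	if (err)
     *		return err;
     *	...
     *	proto_unregister(&example_proto);
     */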
3257int proto_register(struct proto *prot, int alloc_slab)
3258{
3259        if (alloc_slab) {
3260                prot->slab = kmem_cache_create_usercopy(prot->name,
3261                                        prot->obj_size, 0,
3262                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3263                                        prot->slab_flags,
3264                                        prot->useroffset, prot->usersize,
3265                                        NULL);
3266
3267                if (prot->slab == NULL) {
3268                        pr_crit("%s: Can't create sock SLAB cache!\n",
3269                                prot->name);
3270                        goto out;
3271                }
3272
3273                if (req_prot_init(prot))
3274                        goto out_free_request_sock_slab;
3275
3276                if (prot->twsk_prot != NULL) {
3277                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3278
3279                        if (prot->twsk_prot->twsk_slab_name == NULL)
3280                                goto out_free_request_sock_slab;
3281
3282                        prot->twsk_prot->twsk_slab =
3283                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3284                                                  prot->twsk_prot->twsk_obj_size,
3285                                                  0,
3286                                                  SLAB_ACCOUNT |
3287                                                  prot->slab_flags,
3288                                                  NULL);
3289                        if (prot->twsk_prot->twsk_slab == NULL)
3290                                goto out_free_timewait_sock_slab_name;
3291                }
3292        }
3293
3294        mutex_lock(&proto_list_mutex);
3295        list_add(&prot->node, &proto_list);
3296        assign_proto_idx(prot);
3297        mutex_unlock(&proto_list_mutex);
3298        return 0;
3299
3300out_free_timewait_sock_slab_name:
3301        kfree(prot->twsk_prot->twsk_slab_name);
3302out_free_request_sock_slab:
3303        req_prot_cleanup(prot->rsk_prot);
3304
3305        kmem_cache_destroy(prot->slab);
3306        prot->slab = NULL;
3307out:
3308        return -ENOBUFS;
3309}
3310EXPORT_SYMBOL(proto_register);
3311
3312void proto_unregister(struct proto *prot)
3313{
3314        mutex_lock(&proto_list_mutex);
3315        release_proto_idx(prot);
3316        list_del(&prot->node);
3317        mutex_unlock(&proto_list_mutex);
3318
3319        kmem_cache_destroy(prot->slab);
3320        prot->slab = NULL;
3321
3322        req_prot_cleanup(prot->rsk_prot);
3323
3324        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3325                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3326                kfree(prot->twsk_prot->twsk_slab_name);
3327                prot->twsk_prot->twsk_slab = NULL;
3328        }
3329}
3330EXPORT_SYMBOL(proto_unregister);
3331
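    /*
     * sock_load_diag_module() asks for the sock_diag handler of a family (and
     * optionally a protocol) by module alias. With PF_NETLINK == 16 and
     * NETLINK_SOCK_DIAG == 4 the requested alias is
     * "net-pf-16-proto-4-type-<family>" when protocol is 0, or
     * "net-pf-16-proto-4-type-<family>-<protocol>" otherwise; e.g. AF_INET(2)
     * with IPPROTO_TCP(6) maps to "net-pf-16-proto-4-type-2-6", the alias the
     * TCP diag module is expected to advertise.
     */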
3332int sock_load_diag_module(int family, int protocol)
3333{
3334        if (!protocol) {
3335                if (!sock_is_registered(family))
3336                        return -ENOENT;
3337
3338                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3339                                      NETLINK_SOCK_DIAG, family);
3340        }
3341
3342#ifdef CONFIG_INET
3343        if (family == AF_INET &&
3344            !rcu_access_pointer(inet_protos[protocol]))
3345                return -ENOENT;
3346#endif
3347
3348        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3349                              NETLINK_SOCK_DIAG, family, protocol);
3350}
3351EXPORT_SYMBOL(sock_load_diag_module);
3352
3353#ifdef CONFIG_PROC_FS
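    /*
     * The remainder of this block implements /proc/net/protocols: one header
     * line plus, for every registered proto (walked under proto_list_mutex),
     * its name, object size, sockets in use, memory accounting, pressure
     * state, max header size, slab usage, owning module and a y/n flag per
     * optional method.
     */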
3354static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3355        __acquires(proto_list_mutex)
3356{
3357        mutex_lock(&proto_list_mutex);
3358        return seq_list_start_head(&proto_list, *pos);
3359}
3360
3361static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3362{
3363        return seq_list_next(v, &proto_list, pos);
3364}
3365
3366static void proto_seq_stop(struct seq_file *seq, void *v)
3367        __releases(proto_list_mutex)
3368{
3369        mutex_unlock(&proto_list_mutex);
3370}
3371
3372static char proto_method_implemented(const void *method)
3373{
3374        return method == NULL ? 'n' : 'y';
3375}
3376static long sock_prot_memory_allocated(struct proto *proto)
3377{
3378        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3379}
3380
3381static char *sock_prot_memory_pressure(struct proto *proto)
3382{
3383        return proto->memory_pressure != NULL ?
3384               (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
3385}
3386
3387static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3388{
3389
3390        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3391                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3392                   proto->name,
3393                   proto->obj_size,
3394                   sock_prot_inuse_get(seq_file_net(seq), proto),
3395                   sock_prot_memory_allocated(proto),
3396                   sock_prot_memory_pressure(proto),
3397                   proto->max_header,
3398                   proto->slab == NULL ? "no" : "yes",
3399                   module_name(proto->owner),
3400                   proto_method_implemented(proto->close),
3401                   proto_method_implemented(proto->connect),
3402                   proto_method_implemented(proto->disconnect),
3403                   proto_method_implemented(proto->accept),
3404                   proto_method_implemented(proto->ioctl),
3405                   proto_method_implemented(proto->init),
3406                   proto_method_implemented(proto->destroy),
3407                   proto_method_implemented(proto->shutdown),
3408                   proto_method_implemented(proto->setsockopt),
3409                   proto_method_implemented(proto->getsockopt),
3410                   proto_method_implemented(proto->sendmsg),
3411                   proto_method_implemented(proto->recvmsg),
3412                   proto_method_implemented(proto->sendpage),
3413                   proto_method_implemented(proto->bind),
3414                   proto_method_implemented(proto->backlog_rcv),
3415                   proto_method_implemented(proto->hash),
3416                   proto_method_implemented(proto->unhash),
3417                   proto_method_implemented(proto->get_port),
3418                   proto_method_implemented(proto->enter_memory_pressure));
3419}
3420
3421static int proto_seq_show(struct seq_file *seq, void *v)
3422{
3423        if (v == &proto_list)
3424                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3425                           "protocol",
3426                           "size",
3427                           "sockets",
3428                           "memory",
3429                           "press",
3430                           "maxhdr",
3431                           "slab",
3432                           "module",
3433                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3434        else
3435                proto_seq_printf(seq, list_entry(v, struct proto, node));
3436        return 0;
3437}
3438
3439static const struct seq_operations proto_seq_ops = {
3440        .start  = proto_seq_start,
3441        .next   = proto_seq_next,
3442        .stop   = proto_seq_stop,
3443        .show   = proto_seq_show,
3444};
3445
3446static __net_init int proto_init_net(struct net *net)
3447{
3448        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3449                        sizeof(struct seq_net_private)))
3450                return -ENOMEM;
3451
3452        return 0;
3453}
3454
3455static __net_exit void proto_exit_net(struct net *net)
3456{
3457        remove_proc_entry("protocols", net->proc_net);
3458}
3459
3460
3461static __net_initdata struct pernet_operations proto_net_ops = {
3462        .init = proto_init_net,
3463        .exit = proto_exit_net,
3464};
3465
3466static int __init proto_init(void)
3467{
3468        return register_pernet_subsys(&proto_net_ops);
3469}
3470
3471subsys_initcall(proto_init);
3472
3473#endif /* PROC_FS */
3474
3475#ifdef CONFIG_NET_RX_BUSY_POLL
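    /*
     * sk_busy_loop_end() is the loop_end callback used when busy polling on
     * behalf of a socket (see sk_busy_loop()): it tells napi_busy_loop() to
     * stop once data has arrived on sk_receive_queue or the socket's
     * busy-poll time budget has expired.
     */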
3476bool sk_busy_loop_end(void *p, unsigned long start_time)
3477{
3478        struct sock *sk = p;
3479
3480        return !skb_queue_empty(&sk->sk_receive_queue) ||
3481               sk_busy_loop_timeout(sk, start_time);
3482}
3483EXPORT_SYMBOL(sk_busy_loop_end);
3484#endif /* CONFIG_NET_RX_BUSY_POLL */
3485