LXR linux/net/core/sock.c

   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/sched/mm.h>
 106#include <linux/timer.h>
 107#include <linux/string.h>
 108#include <linux/sockios.h>
 109#include <linux/net.h>
 110#include <linux/mm.h>
 111#include <linux/slab.h>
 112#include <linux/interrupt.h>
 113#include <linux/poll.h>
 114#include <linux/tcp.h>
 115#include <linux/init.h>
 116#include <linux/highmem.h>
 117#include <linux/user_namespace.h>
 118#include <linux/static_key.h>
 119#include <linux/memcontrol.h>
 120#include <linux/prefetch.h>
 121
 122#include <linux/uaccess.h>
 123
 124#include <linux/netdevice.h>
 125#include <net/protocol.h>
 126#include <linux/skbuff.h>
 127#include <net/net_namespace.h>
 128#include <net/request_sock.h>
 129#include <net/sock.h>
 130#include <linux/net_tstamp.h>
 131#include <net/xfrm.h>
 132#include <linux/ipsec.h>
 133#include <net/cls_cgroup.h>
 134#include <net/netprio_cgroup.h>
 135#include <linux/sock_diag.h>
 136
 137#include <linux/filter.h>
 138#include <net/sock_reuseport.h>
 139
 140#include <trace/events/sock.h>
 141
 142#include <net/tcp.h>
 143#include <net/busy_poll.h>
 144
 145static DEFINE_MUTEX(proto_list_mutex);
 146static LIST_HEAD(proto_list);
 147
 148/**
 149 * sk_ns_capable - General socket capability test
 150 * @sk: Socket to use a capability on or through
 151 * @user_ns: The user namespace of the capability to use
 152 * @cap: The capability to use
 153 *
 154 * Test to see if the opener of the socket had when the socket was
 155 * created and the current process has the capability @cap in the user
 156 * namespace @user_ns.
 157 */
 158bool sk_ns_capable(const struct sock *sk,
 159                   struct user_namespace *user_ns, int cap)
 160{
 161        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                ns_capable(user_ns, cap);
 163}
 164EXPORT_SYMBOL(sk_ns_capable);
 165
 166/**
 167 * sk_capable - Socket global capability test
 168 * @sk: Socket to use a capability on or through
 169 * @cap: The global capability to use
 170 *
 171 * Test to see if the opener of the socket had when the socket was
 172 * created and the current process has the capability @cap in all user
 173 * namespaces.
 174 */
 175bool sk_capable(const struct sock *sk, int cap)
 176{
 177        return sk_ns_capable(sk, &init_user_ns, cap);
 178}
 179EXPORT_SYMBOL(sk_capable);
 180
 181/**
 182 * sk_net_capable - Network namespace socket capability test
 183 * @sk: Socket to use a capability on or through
 184 * @cap: The capability to use
 185 *
 186 * Test to see if the opener of the socket had when the socket was created
 187 * and the current process has the capability @cap over the network namespace
 188 * the socket is a member of.
 189 */
 190bool sk_net_capable(const struct sock *sk, int cap)
 191{
 192        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193}
 194EXPORT_SYMBOL(sk_net_capable);
 195
 196/*
 197 * Each address family might have different locking rules, so we have
 198 * one slock key per address family and separate keys for internal and
 199 * userspace sockets.
 200 */
 201static struct lock_class_key af_family_keys[AF_MAX];
 202static struct lock_class_key af_family_kern_keys[AF_MAX];
 203static struct lock_class_key af_family_slock_keys[AF_MAX];
 204static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206/*
 207 * Make lock validator output more readable. (we pre-construct these
 208 * strings build-time, so that runtime initialization of socket
 209 * locks is fast):
 210 */
 211
 212#define _sock_locks(x)                                            \
 213  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 214  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 215  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 216  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 217  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 218  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 219  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 220  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 221  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 222  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 223  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 224  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 225  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 226  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 227  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 228
 229static const char *const af_family_key_strings[AF_MAX+1] = {
 230        _sock_locks("sk_lock-")
 231};
 232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233        _sock_locks("slock-")
 234};
 235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236        _sock_locks("clock-")
 237};
 238
 239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240        _sock_locks("k-sk_lock-")
 241};
 242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243        _sock_locks("k-slock-")
 244};
 245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246        _sock_locks("k-clock-")
 247};
 248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249  "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250  "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251  "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252  "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253  "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254  "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255  "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256  "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257  "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258  "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259  "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260  "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261  "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262  "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264};
 265static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266  "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267  "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268  "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269  "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270  "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271  "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272  "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273  "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274  "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275  "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276  "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277  "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278  "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279  "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281};
 282static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283  "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284  "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285  "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286  "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287  "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288  "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289  "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290  "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291  "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292  "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293  "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294  "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295  "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296  "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298};
 299
 300/*
 301 * sk_callback_lock and sk queues locking rules are per-address-family,
 302 * so split the lock classes by using a per-AF key:
 303 */
 304static struct lock_class_key af_callback_keys[AF_MAX];
 305static struct lock_class_key af_rlock_keys[AF_MAX];
 306static struct lock_class_key af_wlock_keys[AF_MAX];
 307static struct lock_class_key af_elock_keys[AF_MAX];
 308static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
 310/* Take into consideration the size of the struct sk_buff overhead in the
 311 * determination of these values, since that is non-constant across
 312 * platforms.  This makes socket queueing behavior and performance
 313 * not depend upon such differences.
 314 */
 315#define _SK_MEM_PACKETS         256
 316#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 317#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 318#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 319
 320/* Run time adjustable parameters. */
 321__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 322EXPORT_SYMBOL(sysctl_wmem_max);
 323__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 324EXPORT_SYMBOL(sysctl_rmem_max);
 325__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 326__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 327
 328/* Maximal space eaten by iovec or ancillary data plus some space */
 329int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 330EXPORT_SYMBOL(sysctl_optmem_max);
 331
 332int sysctl_tstamp_allow_data __read_mostly = 1;
 333
 334struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 335EXPORT_SYMBOL_GPL(memalloc_socks);
 336
 337/**
 338 * sk_set_memalloc - sets %SOCK_MEMALLOC
 339 * @sk: socket to set it on
 340 *
 341 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 342 * It's the responsibility of the admin to adjust min_free_kbytes
 343 * to meet the requirements
 344 */
 345void sk_set_memalloc(struct sock *sk)
 346{
 347        sock_set_flag(sk, SOCK_MEMALLOC);
 348        sk->sk_allocation |= __GFP_MEMALLOC;
 349        static_key_slow_inc(&memalloc_socks);
 350}
 351EXPORT_SYMBOL_GPL(sk_set_memalloc);
 352
 353void sk_clear_memalloc(struct sock *sk)
 354{
 355        sock_reset_flag(sk, SOCK_MEMALLOC);
 356        sk->sk_allocation &= ~__GFP_MEMALLOC;
 357        static_key_slow_dec(&memalloc_socks);
 358
 359        /*
 360         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 361         * progress of swapping. SOCK_MEMALLOC may be cleared while
 362         * it has rmem allocations due to the last swapfile being deactivated
 363         * but there is a risk that the socket is unusable due to exceeding
 364         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 365         */
 366        sk_mem_reclaim(sk);
 367}
 368EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 369
 370int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 371{
 372        int ret;
 373        unsigned int noreclaim_flag;
 374
 375        /* these should have been dropped before queueing */
 376        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 377
 378        noreclaim_flag = memalloc_noreclaim_save();
 379        ret = sk->sk_backlog_rcv(sk, skb);
 380        memalloc_noreclaim_restore(noreclaim_flag);
 381
 382        return ret;
 383}
 384EXPORT_SYMBOL(__sk_backlog_rcv);
 385
 386static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 387{
 388        struct timeval tv;
 389
 390        if (optlen < sizeof(tv))
 391                return -EINVAL;
 392        if (copy_from_user(&tv, optval, sizeof(tv)))
 393                return -EFAULT;
 394        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395                return -EDOM;
 396
 397        if (tv.tv_sec < 0) {
 398                static int warned __read_mostly;
 399
 400                *timeo_p = 0;
 401                if (warned < 10 && net_ratelimit()) {
 402                        warned++;
 403                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404                                __func__, current->comm, task_pid_nr(current));
 405                }
 406                return 0;
 407        }
 408        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 409        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410                return 0;
 411        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 412                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 413        return 0;
 414}
 415
 416static void sock_warn_obsolete_bsdism(const char *name)
 417{
 418        static int warned;
 419        static char warncomm[TASK_COMM_LEN];
 420        if (strcmp(warncomm, current->comm) && warned < 5) {
 421                strcpy(warncomm,  current->comm);
 422                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 423                        warncomm, name);
 424                warned++;
 425        }
 426}
 427
 428static bool sock_needs_netstamp(const struct sock *sk)
 429{
 430        switch (sk->sk_family) {
 431        case AF_UNSPEC:
 432        case AF_UNIX:
 433                return false;
 434        default:
 435                return true;
 436        }
 437}
 438
 439static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 440{
 441        if (sk->sk_flags & flags) {
 442                sk->sk_flags &= ~flags;
 443                if (sock_needs_netstamp(sk) &&
 444                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 445                        net_disable_timestamp();
 446        }
 447}
 448
 449
 450int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 451{
 452        unsigned long flags;
 453        struct sk_buff_head *list = &sk->sk_receive_queue;
 454
 455        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 456                atomic_inc(&sk->sk_drops);
 457                trace_sock_rcvqueue_full(sk, skb);
 458                return -ENOMEM;
 459        }
 460
 461        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 462                atomic_inc(&sk->sk_drops);
 463                return -ENOBUFS;
 464        }
 465
 466        skb->dev = NULL;
 467        skb_set_owner_r(skb, sk);
 468
 469        /* we escape from rcu protected region, make sure we dont leak
 470         * a norefcounted dst
 471         */
 472        skb_dst_force(skb);
 473
 474        spin_lock_irqsave(&list->lock, flags);
 475        sock_skb_set_dropcount(sk, skb);
 476        __skb_queue_tail(list, skb);
 477        spin_unlock_irqrestore(&list->lock, flags);
 478
 479        if (!sock_flag(sk, SOCK_DEAD))
 480                sk->sk_data_ready(sk);
 481        return 0;
 482}
 483EXPORT_SYMBOL(__sock_queue_rcv_skb);
 484
 485int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 486{
 487        int err;
 488
 489        err = sk_filter(sk, skb);
 490        if (err)
 491                return err;
 492
 493        return __sock_queue_rcv_skb(sk, skb);
 494}
 495EXPORT_SYMBOL(sock_queue_rcv_skb);
 496
 497int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 498                     const int nested, unsigned int trim_cap, bool refcounted)
 499{
 500        int rc = NET_RX_SUCCESS;
 501
 502        if (sk_filter_trim_cap(sk, skb, trim_cap))
 503                goto discard_and_relse;
 504
 505        skb->dev = NULL;
 506
 507        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 508                atomic_inc(&sk->sk_drops);
 509                goto discard_and_relse;
 510        }
 511        if (nested)
 512                bh_lock_sock_nested(sk);
 513        else
 514                bh_lock_sock(sk);
 515        if (!sock_owned_by_user(sk)) {
 516                /*
 517                 * trylock + unlock semantics:
 518                 */
 519                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 520
 521                rc = sk_backlog_rcv(sk, skb);
 522
 523                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 524        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 525                bh_unlock_sock(sk);
 526                atomic_inc(&sk->sk_drops);
 527                goto discard_and_relse;
 528        }
 529
 530        bh_unlock_sock(sk);
 531out:
 532        if (refcounted)
 533                sock_put(sk);
 534        return rc;
 535discard_and_relse:
 536        kfree_skb(skb);
 537        goto out;
 538}
 539EXPORT_SYMBOL(__sk_receive_skb);
 540
 541struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 542{
 543        struct dst_entry *dst = __sk_dst_get(sk);
 544
 545        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 546                sk_tx_queue_clear(sk);
 547                sk->sk_dst_pending_confirm = 0;
 548                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 549                dst_release(dst);
 550                return NULL;
 551        }
 552
 553        return dst;
 554}
 555EXPORT_SYMBOL(__sk_dst_check);
 556
 557struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 558{
 559        struct dst_entry *dst = sk_dst_get(sk);
 560
 561        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 562                sk_dst_reset(sk);
 563                dst_release(dst);
 564                return NULL;
 565        }
 566
 567        return dst;
 568}
 569EXPORT_SYMBOL(sk_dst_check);
 570
 571static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 572                                int optlen)
 573{
 574        int ret = -ENOPROTOOPT;
 575#ifdef CONFIG_NETDEVICES
 576        struct net *net = sock_net(sk);
 577        char devname[IFNAMSIZ];
 578        int index;
 579
 580        /* Sorry... */
 581        ret = -EPERM;
 582        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 583                goto out;
 584
 585        ret = -EINVAL;
 586        if (optlen < 0)
 587                goto out;
 588
 589        /* Bind this socket to a particular device like "eth0",
 590         * as specified in the passed interface name. If the
 591         * name is "" or the option length is zero the socket
 592         * is not bound.
 593         */
 594        if (optlen > IFNAMSIZ - 1)
 595                optlen = IFNAMSIZ - 1;
 596        memset(devname, 0, sizeof(devname));
 597
 598        ret = -EFAULT;
 599        if (copy_from_user(devname, optval, optlen))
 600                goto out;
 601
 602        index = 0;
 603        if (devname[0] != '\0') {
 604                struct net_device *dev;
 605
 606                rcu_read_lock();
 607                dev = dev_get_by_name_rcu(net, devname);
 608                if (dev)
 609                        index = dev->ifindex;
 610                rcu_read_unlock();
 611                ret = -ENODEV;
 612                if (!dev)
 613                        goto out;
 614        }
 615
 616        lock_sock(sk);
 617        sk->sk_bound_dev_if = index;
 618        sk_dst_reset(sk);
 619        release_sock(sk);
 620
 621        ret = 0;
 622
 623out:
 624#endif
 625
 626        return ret;
 627}
 628
 629static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 630                                int __user *optlen, int len)
 631{
 632        int ret = -ENOPROTOOPT;
 633#ifdef CONFIG_NETDEVICES
 634        struct net *net = sock_net(sk);
 635        char devname[IFNAMSIZ];
 636
 637        if (sk->sk_bound_dev_if == 0) {
 638                len = 0;
 639                goto zero;
 640        }
 641
 642        ret = -EINVAL;
 643        if (len < IFNAMSIZ)
 644                goto out;
 645
 646        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 647        if (ret)
 648                goto out;
 649
 650        len = strlen(devname) + 1;
 651
 652        ret = -EFAULT;
 653        if (copy_to_user(optval, devname, len))
 654                goto out;
 655
 656zero:
 657        ret = -EFAULT;
 658        if (put_user(len, optlen))
 659                goto out;
 660
 661        ret = 0;
 662
 663out:
 664#endif
 665
 666        return ret;
 667}
 668
 669static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 670{
 671        if (valbool)
 672                sock_set_flag(sk, bit);
 673        else
 674                sock_reset_flag(sk, bit);
 675}
 676
 677bool sk_mc_loop(struct sock *sk)
 678{
 679        if (dev_recursion_level())
 680                return false;
 681        if (!sk)
 682                return true;
 683        switch (sk->sk_family) {
 684        case AF_INET:
 685                return inet_sk(sk)->mc_loop;
 686#if IS_ENABLED(CONFIG_IPV6)
 687        case AF_INET6:
 688                return inet6_sk(sk)->mc_loop;
 689#endif
 690        }
 691        WARN_ON(1);
 692        return true;
 693}
 694EXPORT_SYMBOL(sk_mc_loop);
 695
 696/*
 697 *      This is meant for all protocols to use and covers goings on
 698 *      at the socket level. Everything here is generic.
 699 */
 700
 701int sock_setsockopt(struct socket *sock, int level, int optname,
 702                    char __user *optval, unsigned int optlen)
 703{
 704        struct sock *sk = sock->sk;
 705        int val;
 706        int valbool;
 707        struct linger ling;
 708        int ret = 0;
 709
 710        /*
 711         *      Options without arguments
 712         */
 713
 714        if (optname == SO_BINDTODEVICE)
 715                return sock_setbindtodevice(sk, optval, optlen);
 716
 717        if (optlen < sizeof(int))
 718                return -EINVAL;
 719
 720        if (get_user(val, (int __user *)optval))
 721                return -EFAULT;
 722
 723        valbool = val ? 1 : 0;
 724
 725        lock_sock(sk);
 726
 727        switch (optname) {
 728        case SO_DEBUG:
 729                if (val && !capable(CAP_NET_ADMIN))
 730                        ret = -EACCES;
 731                else
 732                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 733                break;
 734        case SO_REUSEADDR:
 735                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 736                break;
 737        case SO_REUSEPORT:
 738                sk->sk_reuseport = valbool;
 739                break;
 740        case SO_TYPE:
 741        case SO_PROTOCOL:
 742        case SO_DOMAIN:
 743        case SO_ERROR:
 744                ret = -ENOPROTOOPT;
 745                break;
 746        case SO_DONTROUTE:
 747                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 748                break;
 749        case SO_BROADCAST:
 750                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 751                break;
 752        case SO_SNDBUF:
 753                /* Don't error on this BSD doesn't and if you think
 754                 * about it this is right. Otherwise apps have to
 755                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 756                 * are treated in BSD as hints
 757                 */
 758                val = min_t(u32, val, sysctl_wmem_max);
 759set_sndbuf:
 760                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 761                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 762                /* Wake up sending tasks if we upped the value. */
 763                sk->sk_write_space(sk);
 764                break;
 765
 766        case SO_SNDBUFFORCE:
 767                if (!capable(CAP_NET_ADMIN)) {
 768                        ret = -EPERM;
 769                        break;
 770                }
 771                goto set_sndbuf;
 772
 773        case SO_RCVBUF:
 774                /* Don't error on this BSD doesn't and if you think
 775                 * about it this is right. Otherwise apps have to
 776                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 777                 * are treated in BSD as hints
 778                 */
 779                val = min_t(u32, val, sysctl_rmem_max);
 780set_rcvbuf:
 781                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 782                /*
 783                 * We double it on the way in to account for
 784                 * "struct sk_buff" etc. overhead.   Applications
 785                 * assume that the SO_RCVBUF setting they make will
 786                 * allow that much actual data to be received on that
 787                 * socket.
 788                 *
 789                 * Applications are unaware that "struct sk_buff" and
 790                 * other overheads allocate from the receive buffer
 791                 * during socket buffer allocation.
 792                 *
 793                 * And after considering the possible alternatives,
 794                 * returning the value we actually used in getsockopt
 795                 * is the most desirable behavior.
 796                 */
 797                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 798                break;
 799
 800        case SO_RCVBUFFORCE:
 801                if (!capable(CAP_NET_ADMIN)) {
 802                        ret = -EPERM;
 803                        break;
 804                }
 805                goto set_rcvbuf;
 806
 807        case SO_KEEPALIVE:
 808                if (sk->sk_prot->keepalive)
 809                        sk->sk_prot->keepalive(sk, valbool);
 810                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 811                break;
 812
 813        case SO_OOBINLINE:
 814                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 815                break;
 816
 817        case SO_NO_CHECK:
 818                sk->sk_no_check_tx = valbool;
 819                break;
 820
 821        case SO_PRIORITY:
 822                if ((val >= 0 && val <= 6) ||
 823                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 824                        sk->sk_priority = val;
 825                else
 826                        ret = -EPERM;
 827                break;
 828
 829        case SO_LINGER:
 830                if (optlen < sizeof(ling)) {
 831                        ret = -EINVAL;  /* 1003.1g */
 832                        break;
 833                }
 834                if (copy_from_user(&ling, optval, sizeof(ling))) {
 835                        ret = -EFAULT;
 836                        break;
 837                }
 838                if (!ling.l_onoff)
 839                        sock_reset_flag(sk, SOCK_LINGER);
 840                else {
 841#if (BITS_PER_LONG == 32)
 842                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 843                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 844                        else
 845#endif
 846                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 847                        sock_set_flag(sk, SOCK_LINGER);
 848                }
 849                break;
 850
 851        case SO_BSDCOMPAT:
 852                sock_warn_obsolete_bsdism("setsockopt");
 853                break;
 854
 855        case SO_PASSCRED:
 856                if (valbool)
 857                        set_bit(SOCK_PASSCRED, &sock->flags);
 858                else
 859                        clear_bit(SOCK_PASSCRED, &sock->flags);
 860                break;
 861
 862        case SO_TIMESTAMP:
 863        case SO_TIMESTAMPNS:
 864                if (valbool)  {
 865                        if (optname == SO_TIMESTAMP)
 866                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 867                        else
 868                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 869                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 870                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 871                } else {
 872                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 873                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 874                }
 875                break;
 876
 877        case SO_TIMESTAMPING:
 878                if (val & ~SOF_TIMESTAMPING_MASK) {
 879                        ret = -EINVAL;
 880                        break;
 881                }
 882
 883                if (val & SOF_TIMESTAMPING_OPT_ID &&
 884                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 885                        if (sk->sk_protocol == IPPROTO_TCP &&
 886                            sk->sk_type == SOCK_STREAM) {
 887                                if ((1 << sk->sk_state) &
 888                                    (TCPF_CLOSE | TCPF_LISTEN)) {
 889                                        ret = -EINVAL;
 890                                        break;
 891                                }
 892                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 893                        } else {
 894                                sk->sk_tskey = 0;
 895                        }
 896                }
 897
 898                if (val & SOF_TIMESTAMPING_OPT_STATS &&
 899                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 900                        ret = -EINVAL;
 901                        break;
 902                }
 903
 904                sk->sk_tsflags = val;
 905                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 906                        sock_enable_timestamp(sk,
 907                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 908                else
 909                        sock_disable_timestamp(sk,
 910                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 911                break;
 912
 913        case SO_RCVLOWAT:
 914                if (val < 0)
 915                        val = INT_MAX;
 916                sk->sk_rcvlowat = val ? : 1;
 917                break;
 918
 919        case SO_RCVTIMEO:
 920                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 921                break;
 922
 923        case SO_SNDTIMEO:
 924                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 925                break;
 926
 927        case SO_ATTACH_FILTER:
 928                ret = -EINVAL;
 929                if (optlen == sizeof(struct sock_fprog)) {
 930                        struct sock_fprog fprog;
 931
 932                        ret = -EFAULT;
 933                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 934                                break;
 935
 936                        ret = sk_attach_filter(&fprog, sk);
 937                }
 938                break;
 939
 940        case SO_ATTACH_BPF:
 941                ret = -EINVAL;
 942                if (optlen == sizeof(u32)) {
 943                        u32 ufd;
 944
 945                        ret = -EFAULT;
 946                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 947                                break;
 948
 949                        ret = sk_attach_bpf(ufd, sk);
 950                }
 951                break;
 952
 953        case SO_ATTACH_REUSEPORT_CBPF:
 954                ret = -EINVAL;
 955                if (optlen == sizeof(struct sock_fprog)) {
 956                        struct sock_fprog fprog;
 957
 958                        ret = -EFAULT;
 959                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 960                                break;
 961
 962                        ret = sk_reuseport_attach_filter(&fprog, sk);
 963                }
 964                break;
 965
 966        case SO_ATTACH_REUSEPORT_EBPF:
 967                ret = -EINVAL;
 968                if (optlen == sizeof(u32)) {
 969                        u32 ufd;
 970
 971                        ret = -EFAULT;
 972                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 973                                break;
 974
 975                        ret = sk_reuseport_attach_bpf(ufd, sk);
 976                }
 977                break;
 978
 979        case SO_DETACH_FILTER:
 980                ret = sk_detach_filter(sk);
 981                break;
 982
 983        case SO_LOCK_FILTER:
 984                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 985                        ret = -EPERM;
 986                else
 987                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 988                break;
 989
 990        case SO_PASSSEC:
 991                if (valbool)
 992                        set_bit(SOCK_PASSSEC, &sock->flags);
 993                else
 994                        clear_bit(SOCK_PASSSEC, &sock->flags);
 995                break;
 996        case SO_MARK:
 997                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 998                        ret = -EPERM;
 999                else
1000                        sk->sk_mark = val;

1001                break;
1002
1003        case SO_RXQ_OVFL:
1004                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1005                break;
1006
1007        case SO_WIFI_STATUS:
1008                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1009                break;
1010
1011        case SO_PEEK_OFF:
1012                if (sock->ops->set_peek_off)
1013                        ret = sock->ops->set_peek_off(sk, val);
1014                else
1015                        ret = -EOPNOTSUPP;
1016                break;
1017
1018        case SO_NOFCS:
1019                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1020                break;
1021
1022        case SO_SELECT_ERR_QUEUE:
1023                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1024                break;
1025
1026#ifdef CONFIG_NET_RX_BUSY_POLL
1027        case SO_BUSY_POLL:
1028                /* allow unprivileged users to decrease the value */
1029                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1030                        ret = -EPERM;
1031                else {
1032                        if (val < 0)
1033                                ret = -EINVAL;
1034                        else
1035                                sk->sk_ll_usec = val;
1036                }
1037                break;
1038#endif
1039
1040        case SO_MAX_PACING_RATE:
1041                if (val != ~0U)
1042                        cmpxchg(&sk->sk_pacing_status,
1043                                SK_PACING_NONE,
1044                                SK_PACING_NEEDED);
1045                sk->sk_max_pacing_rate = val;
1046                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1047                                         sk->sk_max_pacing_rate);
1048                break;
1049
1050        case SO_INCOMING_CPU:
1051                sk->sk_incoming_cpu = val;
1052                break;
1053
1054        case SO_CNX_ADVICE:
1055                if (val == 1)
1056                        dst_negative_advice(sk);
1057                break;
1058        default:
1059                ret = -ENOPROTOOPT;
1060                break;
1061        }
1062        release_sock(sk);
1063        return ret;
1064}
1065EXPORT_SYMBOL(sock_setsockopt);
1066
1067
1068static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1069                          struct ucred *ucred)
1070{
1071        ucred->pid = pid_vnr(pid);
1072        ucred->uid = ucred->gid = -1;
1073        if (cred) {
1074                struct user_namespace *current_ns = current_user_ns();
1075
1076                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1077                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1078        }
1079}
1080
1081static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1082{
1083        struct user_namespace *user_ns = current_user_ns();
1084        int i;
1085
1086        for (i = 0; i < src->ngroups; i++)
1087                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1088                        return -EFAULT;
1089
1090        return 0;
1091}
1092
1093int sock_getsockopt(struct socket *sock, int level, int optname,
1094                    char __user *optval, int __user *optlen)
1095{
1096        struct sock *sk = sock->sk;
1097
1098        union {
1099                int val;
1100                u64 val64;
1101                struct linger ling;
1102                struct timeval tm;
1103        } v;
1104
1105        int lv = sizeof(int);
1106        int len;
1107
1108        if (get_user(len, optlen))
1109                return -EFAULT;
1110        if (len < 0)
1111                return -EINVAL;
1112
1113        memset(&v, 0, sizeof(v));
1114
1115        switch (optname) {
1116        case SO_DEBUG:
1117                v.val = sock_flag(sk, SOCK_DBG);
1118                break;
1119
1120        case SO_DONTROUTE:
1121                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1122                break;
1123
1124        case SO_BROADCAST:
1125                v.val = sock_flag(sk, SOCK_BROADCAST);
1126                break;
1127
1128        case SO_SNDBUF:
1129                v.val = sk->sk_sndbuf;
1130                break;
1131
1132        case SO_RCVBUF:
1133                v.val = sk->sk_rcvbuf;
1134                break;
1135
1136        case SO_REUSEADDR:
1137                v.val = sk->sk_reuse;
1138                break;
1139
1140        case SO_REUSEPORT:
1141                v.val = sk->sk_reuseport;
1142                break;
1143
1144        case SO_KEEPALIVE:
1145                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1146                break;
1147
1148        case SO_TYPE:
1149                v.val = sk->sk_type;
1150                break;
1151
1152        case SO_PROTOCOL:
1153                v.val = sk->sk_protocol;
1154                break;
1155
1156        case SO_DOMAIN:
1157                v.val = sk->sk_family;
1158                break;
1159
1160        case SO_ERROR:
1161                v.val = -sock_error(sk);
1162                if (v.val == 0)
1163                        v.val = xchg(&sk->sk_err_soft, 0);
1164                break;
1165
1166        case SO_OOBINLINE:
1167                v.val = sock_flag(sk, SOCK_URGINLINE);
1168                break;
1169
1170        case SO_NO_CHECK:
1171                v.val = sk->sk_no_check_tx;
1172                break;
1173
1174        case SO_PRIORITY:
1175                v.val = sk->sk_priority;
1176                break;
1177
1178        case SO_LINGER:
1179                lv              = sizeof(v.ling);
1180                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1181                v.ling.l_linger = sk->sk_lingertime / HZ;
1182                break;
1183
1184        case SO_BSDCOMPAT:
1185                sock_warn_obsolete_bsdism("getsockopt");
1186                break;
1187
1188        case SO_TIMESTAMP:
1189                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1190                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1191                break;
1192
1193        case SO_TIMESTAMPNS:
1194                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1195                break;
1196
1197        case SO_TIMESTAMPING:
1198                v.val = sk->sk_tsflags;
1199                break;
1200
1201        case SO_RCVTIMEO:
1202                lv = sizeof(struct timeval);
1203                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1204                        v.tm.tv_sec = 0;
1205                        v.tm.tv_usec = 0;
1206                } else {
1207                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1208                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1209                }
1210                break;
1211
1212        case SO_SNDTIMEO:
1213                lv = sizeof(struct timeval);
1214                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1215                        v.tm.tv_sec = 0;
1216                        v.tm.tv_usec = 0;
1217                } else {
1218                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1219                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1220                }
1221                break;
1222
1223        case SO_RCVLOWAT:
1224                v.val = sk->sk_rcvlowat;
1225                break;
1226
1227        case SO_SNDLOWAT:
1228                v.val = 1;
1229                break;
1230
1231        case SO_PASSCRED:
1232                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1233                break;
1234
1235        case SO_PEERCRED:
1236        {
1237                struct ucred peercred;
1238                if (len > sizeof(peercred))
1239                        len = sizeof(peercred);
1240                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1241                if (copy_to_user(optval, &peercred, len))
1242                        return -EFAULT;
1243                goto lenout;
1244        }
1245
1246        case SO_PEERGROUPS:
1247        {
1248                int ret, n;
1249
1250                if (!sk->sk_peer_cred)
1251                        return -ENODATA;
1252
1253                n = sk->sk_peer_cred->group_info->ngroups;
1254                if (len < n * sizeof(gid_t)) {
1255                        len = n * sizeof(gid_t);
1256                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1257                }
1258                len = n * sizeof(gid_t);
1259
1260                ret = groups_to_user((gid_t __user *)optval,
1261                                     sk->sk_peer_cred->group_info);
1262                if (ret)
1263                        return ret;
1264                goto lenout;
1265        }
1266
1267        case SO_PEERNAME:
1268        {
1269                char address[128];
1270
1271                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1272                        return -ENOTCONN;
1273                if (lv < len)
1274                        return -EINVAL;
1275                if (copy_to_user(optval, address, len))
1276                        return -EFAULT;
1277                goto lenout;
1278        }
1279
1280        /* Dubious BSD thing... Probably nobody even uses it, but
1281         * the UNIX standard wants it for whatever reason... -DaveM
1282         */
1283        case SO_ACCEPTCONN:
1284                v.val = sk->sk_state == TCP_LISTEN;
1285                break;
1286
1287        case SO_PASSSEC:
1288                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1289                break;
1290
1291        case SO_PEERSEC:
1292                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1293
1294        case SO_MARK:
1295                v.val = sk->sk_mark;
1296                break;
1297
1298        case SO_RXQ_OVFL:
1299                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1300                break;
1301
1302        case SO_WIFI_STATUS:
1303                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1304                break;
1305
1306        case SO_PEEK_OFF:
1307                if (!sock->ops->set_peek_off)
1308                        return -EOPNOTSUPP;
1309
1310                v.val = sk->sk_peek_off;
1311                break;
1312        case SO_NOFCS:
1313                v.val = sock_flag(sk, SOCK_NOFCS);
1314                break;
1315
1316        case SO_BINDTODEVICE:
1317                return sock_getbindtodevice(sk, optval, optlen, len);
1318
1319        case SO_GET_FILTER:
1320                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1321                if (len < 0)
1322                        return len;
1323
1324                goto lenout;
1325
1326        case SO_LOCK_FILTER:
1327                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1328                break;
1329
1330        case SO_BPF_EXTENSIONS:
1331                v.val = bpf_tell_extensions();
1332                break;
1333
1334        case SO_SELECT_ERR_QUEUE:
1335                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1336                break;
1337
1338#ifdef CONFIG_NET_RX_BUSY_POLL
1339        case SO_BUSY_POLL:
1340                v.val = sk->sk_ll_usec;
1341                break;
1342#endif
1343
1344        case SO_MAX_PACING_RATE:
1345                v.val = sk->sk_max_pacing_rate;
1346                break;
1347
1348        case SO_INCOMING_CPU:
1349                v.val = sk->sk_incoming_cpu;
1350                break;
1351
1352        case SO_MEMINFO:
1353        {
1354                u32 meminfo[SK_MEMINFO_VARS];
1355
1356                if (get_user(len, optlen))
1357                        return -EFAULT;
1358
1359                sk_get_meminfo(sk, meminfo);
1360
1361                len = min_t(unsigned int, len, sizeof(meminfo));
1362                if (copy_to_user(optval, &meminfo, len))
1363                        return -EFAULT;
1364
1365                goto lenout;
1366        }
1367
1368#ifdef CONFIG_NET_RX_BUSY_POLL
1369        case SO_INCOMING_NAPI_ID:
1370                v.val = READ_ONCE(sk->sk_napi_id);
1371
1372                /* aggregate non-NAPI IDs down to 0 */
1373                if (v.val < MIN_NAPI_ID)
1374                        v.val = 0;
1375
1376                break;
1377#endif
1378
1379        case SO_COOKIE:
1380                lv = sizeof(u64);
1381                if (len < lv)
1382                        return -EINVAL;
1383                v.val64 = sock_gen_cookie(sk);
1384                break;
1385
1386        default:
1387                /* We implement the SO_SNDLOWAT etc to not be settable
1388                 * (1003.1g 7).
1389                 */
1390                return -ENOPROTOOPT;
1391        }
1392
1393        if (len > lv)
1394                len = lv;
1395        if (copy_to_user(optval, &v, len))
1396                return -EFAULT;
1397lenout:
1398        if (put_user(len, optlen))
1399                return -EFAULT;
1400        return 0;
1401}
1402
1403/*
1404 * Initialize an sk_lock.
1405 *
1406 * (We also register the sk_lock with the lock validator.)
1407 */
1408static inline void sock_lock_init(struct sock *sk)
1409{
1410        if (sk->sk_kern_sock)
1411                sock_lock_init_class_and_name(
1412                        sk,
1413                        af_family_kern_slock_key_strings[sk->sk_family],
1414                        af_family_kern_slock_keys + sk->sk_family,
1415                        af_family_kern_key_strings[sk->sk_family],
1416                        af_family_kern_keys + sk->sk_family);
1417        else
1418                sock_lock_init_class_and_name(
1419                        sk,
1420                        af_family_slock_key_strings[sk->sk_family],
1421                        af_family_slock_keys + sk->sk_family,
1422                        af_family_key_strings[sk->sk_family],
1423                        af_family_keys + sk->sk_family);
1424}
1425
1426/*
1427 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1428 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1429 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1430 */
1431static void sock_copy(struct sock *nsk, const struct sock *osk)
1432{
1433#ifdef CONFIG_SECURITY_NETWORK
1434        void *sptr = nsk->sk_security;
1435#endif
1436        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1437
1438        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1439               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1440
1441#ifdef CONFIG_SECURITY_NETWORK
1442        nsk->sk_security = sptr;
1443        security_sk_clone(osk, nsk);
1444#endif
1445}
1446
1447static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1448                int family)
1449{
1450        struct sock *sk;
1451        struct kmem_cache *slab;
1452
1453        slab = prot->slab;
1454        if (slab != NULL) {
1455                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1456                if (!sk)
1457                        return sk;
1458                if (priority & __GFP_ZERO)
1459                        sk_prot_clear_nulls(sk, prot->obj_size);
1460        } else
1461                sk = kmalloc(prot->obj_size, priority);
1462
1463        if (sk != NULL) {
1464                kmemcheck_annotate_bitfield(sk, flags);
1465
1466                if (security_sk_alloc(sk, family, priority))
1467                        goto out_free;
1468
1469                if (!try_module_get(prot->owner))
1470                        goto out_free_sec;
1471                sk_tx_queue_clear(sk);
1472        }
1473
1474        return sk;
1475
1476out_free_sec:
1477        security_sk_free(sk);
1478out_free:
1479        if (slab != NULL)
1480                kmem_cache_free(slab, sk);
1481        else
1482                kfree(sk);
1483        return NULL;
1484}
1485
1486static void sk_prot_free(struct proto *prot, struct sock *sk)
1487{
1488        struct kmem_cache *slab;
1489        struct module *owner;
1490
1491        owner = prot->owner;
1492        slab = prot->slab;
1493
1494        cgroup_sk_free(&sk->sk_cgrp_data);
1495        mem_cgroup_sk_free(sk);
1496        security_sk_free(sk);
1497        if (slab != NULL)
1498                kmem_cache_free(slab, sk);
1499        else
1500                kfree(sk);
1501        module_put(owner);
1502}
1503
1504/**
1505 *      sk_alloc - All socket objects are allocated here
1506 *      @net: the applicable net namespace
1507 *      @family: protocol family
1508 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1509 *      @prot: struct proto associated with this new sock instance
1510 *      @kern: is this to be a kernel socket?
1511 */
1512struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1513                      struct proto *prot, int kern)
1514{
1515        struct sock *sk;
1516
1517        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1518        if (sk) {
1519                sk->sk_family = family;
1520                /*
1521                 * See comment in struct sock definition to understand
1522                 * why we need sk_prot_creator -acme
1523                 */
1524                sk->sk_prot = sk->sk_prot_creator = prot;
1525                sk->sk_kern_sock = kern;
1526                sock_lock_init(sk);
1527                sk->sk_net_refcnt = kern ? 0 : 1;
1528                if (likely(sk->sk_net_refcnt))
1529                        get_net(net);
1530                sock_net_set(sk, net);
1531                refcount_set(&sk->sk_wmem_alloc, 1);
1532
1533                mem_cgroup_sk_alloc(sk);
1534                cgroup_sk_alloc(&sk->sk_cgrp_data);
1535                sock_update_classid(&sk->sk_cgrp_data);
1536                sock_update_netprioidx(&sk->sk_cgrp_data);
1537        }
1538
1539        return sk;
1540}
1541EXPORT_SYMBOL(sk_alloc);
1542
1543/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1544 * grace period. This is the case for UDP sockets and TCP listeners.
1545 */
1546static void __sk_destruct(struct rcu_head *head)
1547{
1548        struct sock *sk = container_of(head, struct sock, sk_rcu);
1549        struct sk_filter *filter;
1550
1551        if (sk->sk_destruct)
1552                sk->sk_destruct(sk);
1553
1554        filter = rcu_dereference_check(sk->sk_filter,
1555                                       refcount_read(&sk->sk_wmem_alloc) == 0);
1556        if (filter) {
1557                sk_filter_uncharge(sk, filter);
1558                RCU_INIT_POINTER(sk->sk_filter, NULL);
1559        }
1560        if (rcu_access_pointer(sk->sk_reuseport_cb))
1561                reuseport_detach_sock(sk);
1562
1563        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1564
1565        if (atomic_read(&sk->sk_omem_alloc))
1566                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1567                         __func__, atomic_read(&sk->sk_omem_alloc));
1568
1569        if (sk->sk_frag.page) {
1570                put_page(sk->sk_frag.page);
1571                sk->sk_frag.page = NULL;
1572        }
1573
1574        if (sk->sk_peer_cred)
1575                put_cred(sk->sk_peer_cred);
1576        put_pid(sk->sk_peer_pid);
1577        if (likely(sk->sk_net_refcnt))
1578                put_net(sock_net(sk));
1579        sk_prot_free(sk->sk_prot_creator, sk);
1580}
1581
1582void sk_destruct(struct sock *sk)
1583{
1584        if (sock_flag(sk, SOCK_RCU_FREE))
1585                call_rcu(&sk->sk_rcu, __sk_destruct);
1586        else
1587                __sk_destruct(&sk->sk_rcu);
1588}
1589
1590static void __sk_free(struct sock *sk)
1591{
1592        if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1593                sock_diag_broadcast_destroy(sk);
1594        else
1595                sk_destruct(sk);
1596}
1597
1598void sk_free(struct sock *sk)
1599{
1600        /*
1601         * We subtract one from sk_wmem_alloc and can know if
1602         * some packets are still in some tx queue.
1603         * If not null, sock_wfree() will call __sk_free(sk) later
1604         */
1605        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1606                __sk_free(sk);
1607}
1608EXPORT_SYMBOL(sk_free);
1609
1610static void sk_init_common(struct sock *sk)
1611{
1612        skb_queue_head_init(&sk->sk_receive_queue);
1613        skb_queue_head_init(&sk->sk_write_queue);
1614        skb_queue_head_init(&sk->sk_error_queue);
1615
1616        rwlock_init(&sk->sk_callback_lock);
1617        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1618                        af_rlock_keys + sk->sk_family,
1619                        af_family_rlock_key_strings[sk->sk_family]);
1620        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1621                        af_wlock_keys + sk->sk_family,
1622                        af_family_wlock_key_strings[sk->sk_family]);
1623        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1624                        af_elock_keys + sk->sk_family,
1625                        af_family_elock_key_strings[sk->sk_family]);
1626        lockdep_set_class_and_name(&sk->sk_callback_lock,
1627                        af_callback_keys + sk->sk_family,
1628                        af_family_clock_key_strings[sk->sk_family]);
1629}
1630
1631/**
1632 *      sk_clone_lock - clone a socket, and lock its clone
1633 *      @sk: the socket to clone
1634 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1635 *
1636 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1637 */
1638struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1639{
1640        struct sock *newsk;
1641        bool is_charged = true;
1642
1643        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1644        if (newsk != NULL) {
1645                struct sk_filter *filter;
1646
1647                sock_copy(newsk, sk);
1648
1649                /* SANITY */
1650                if (likely(newsk->sk_net_refcnt))
1651                        get_net(sock_net(newsk));
1652                sk_node_init(&newsk->sk_node);
1653                sock_lock_init(newsk);
1654                bh_lock_sock(newsk);
1655                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1656                newsk->sk_backlog.len = 0;
1657
1658                atomic_set(&newsk->sk_rmem_alloc, 0);
1659                /*
1660                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1661                 */
1662                refcount_set(&newsk->sk_wmem_alloc, 1);
1663                atomic_set(&newsk->sk_omem_alloc, 0);
1664                sk_init_common(newsk);
1665
1666                newsk->sk_dst_cache     = NULL;
1667                newsk->sk_dst_pending_confirm = 0;
1668                newsk->sk_wmem_queued   = 0;
1669                newsk->sk_forward_alloc = 0;
1670                atomic_set(&newsk->sk_drops, 0);
1671                newsk->sk_send_head     = NULL;
1672                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1673
1674                sock_reset_flag(newsk, SOCK_DONE);
1675
1676                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1677                if (filter != NULL)
1678                        /* though it's an empty new sock, the charging may fail
1679                         * if sysctl_optmem_max was changed between creation of
1680                         * original socket and cloning
1681                         */
1682                        is_charged = sk_filter_charge(newsk, filter);
1683
1684                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1685                        /* We need to make sure that we don't uncharge the new
1686                         * socket if we couldn't charge it in the first place
1687                         * as otherwise we uncharge the parent's filter.
1688                         */
1689                        if (!is_charged)
1690                                RCU_INIT_POINTER(newsk->sk_filter, NULL);
1691                        sk_free_unlock_clone(newsk);
1692                        newsk = NULL;
1693                        goto out;
1694                }
1695                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1696
1697                newsk->sk_err      = 0;
1698                newsk->sk_err_soft = 0;
1699                newsk->sk_priority = 0;
1700                newsk->sk_incoming_cpu = raw_smp_processor_id();
1701                atomic64_set(&newsk->sk_cookie, 0);
1702
1703                mem_cgroup_sk_alloc(newsk);
1704                cgroup_sk_alloc(&newsk->sk_cgrp_data);
1705
1706                /*
1707                 * Before updating sk_refcnt, we must commit prior changes to memory
1708                 * (Documentation/RCU/rculist_nulls.txt for details)
1709                 */
1710                smp_wmb();
1711                refcount_set(&newsk->sk_refcnt, 2);
1712
1713                /*
1714                 * Increment the counter in the same struct proto as the master
1715                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1716                 * is the same as sk->sk_prot->socks, as this field was copied
1717                 * with memcpy).
1718                 *
1719                 * This _changes_ the previous behaviour, where
1720                 * tcp_create_openreq_child always was incrementing the
1721                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1722                 * to be taken into account in all callers. -acme
1723                 */
1724                sk_refcnt_debug_inc(newsk);
1725                sk_set_socket(newsk, NULL);
1726                newsk->sk_wq = NULL;
1727
1728                if (newsk->sk_prot->sockets_allocated)
1729                        sk_sockets_allocated_inc(newsk);
1730
1731                if (sock_needs_netstamp(sk) &&
1732                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1733                        net_enable_timestamp();
1734        }
1735out:
1736        return newsk;
1737}
1738EXPORT_SYMBOL_GPL(sk_clone_lock);
1739
1740void sk_free_unlock_clone(struct sock *sk)
1741{
1742        /* It is still raw copy of parent, so invalidate
1743         * destructor and make plain sk_free() */
1744        sk->sk_destruct = NULL;
1745        bh_unlock_sock(sk);
1746        sk_free(sk);
1747}
1748EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1749
1750void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1751{
1752        u32 max_segs = 1;
1753
1754        sk_dst_set(sk, dst);
1755        sk->sk_route_caps = dst->dev->features;
1756        if (sk->sk_route_caps & NETIF_F_GSO)
1757                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1758        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1759        if (sk_can_gso(sk)) {
1760                if (dst->header_len) {
1761                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1762                } else {
1763                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1764                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1765                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1766                }
1767        }
1768        sk->sk_gso_max_segs = max_segs;
1769}
1770EXPORT_SYMBOL_GPL(sk_setup_caps);
1771
1772/*
1773 *      Simple resource managers for sockets.
1774 */
1775
1776
1777/*
1778 * Write buffer destructor automatically called from kfree_skb.
1779 */
1780void sock_wfree(struct sk_buff *skb)
1781{
1782        struct sock *sk = skb->sk;
1783        unsigned int len = skb->truesize;
1784
1785        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1786                /*
1787                 * Keep a reference on sk_wmem_alloc, this will be released
1788                 * after sk_write_space() call
1789                 */
1790                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1791                sk->sk_write_space(sk);
1792                len = 1;
1793        }
1794        /*
1795         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1796         * could not do because of in-flight packets
1797         */
1798        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1799                __sk_free(sk);
1800}
1801EXPORT_SYMBOL(sock_wfree);
1802
1803/* This variant of sock_wfree() is used by TCP,
1804 * since it sets SOCK_USE_WRITE_QUEUE.
1805 */
1806void __sock_wfree(struct sk_buff *skb)
1807{
1808        struct sock *sk = skb->sk;
1809
1810        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1811                __sk_free(sk);
1812}
1813
1814void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1815{
1816        skb_orphan(skb);
1817        skb->sk = sk;
1818#ifdef CONFIG_INET
1819        if (unlikely(!sk_fullsock(sk))) {
1820                skb->destructor = sock_edemux;
1821                sock_hold(sk);
1822                return;
1823        }
1824#endif
1825        skb->destructor = sock_wfree;
1826        skb_set_hash_from_sk(skb, sk);
1827        /*
1828         * We used to take a refcount on sk, but following operation
1829         * is enough to guarantee sk_free() wont free this sock until
1830         * all in-flight packets are completed
1831         */
1832        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1833}
1834EXPORT_SYMBOL(skb_set_owner_w);
1835
1836/* This helper is used by netem, as it can hold packets in its
1837 * delay queue. We want to allow the owner socket to send more
1838 * packets, as if they were already TX completed by a typical driver.
1839 * But we also want to keep skb->sk set because some packet schedulers
1840 * rely on it (sch_fq for example).
1841 */
1842void skb_orphan_partial(struct sk_buff *skb)
1843{
1844        if (skb_is_tcp_pure_ack(skb))
1845                return;
1846
1847        if (skb->destructor == sock_wfree
1848#ifdef CONFIG_INET
1849            || skb->destructor == tcp_wfree
1850#endif
1851                ) {
1852                struct sock *sk = skb->sk;
1853
1854                if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1855                        WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1856                        skb->destructor = sock_efree;
1857                }
1858        } else {
1859                skb_orphan(skb);
1860        }
1861}
1862EXPORT_SYMBOL(skb_orphan_partial);
1863
1864/*
1865 * Read buffer destructor automatically called from kfree_skb.
1866 */
1867void sock_rfree(struct sk_buff *skb)
1868{
1869        struct sock *sk = skb->sk;
1870        unsigned int len = skb->truesize;
1871
1872        atomic_sub(len, &sk->sk_rmem_alloc);
1873        sk_mem_uncharge(sk, len);
1874}
1875EXPORT_SYMBOL(sock_rfree);
1876
1877/*
1878 * Buffer destructor for skbs that are not used directly in read or write
1879 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1880 */
1881void sock_efree(struct sk_buff *skb)
1882{
1883        sock_put(skb->sk);
1884}
1885EXPORT_SYMBOL(sock_efree);
1886
1887kuid_t sock_i_uid(struct sock *sk)
1888{
1889        kuid_t uid;
1890
1891        read_lock_bh(&sk->sk_callback_lock);
1892        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1893        read_unlock_bh(&sk->sk_callback_lock);
1894        return uid;
1895}
1896EXPORT_SYMBOL(sock_i_uid);
1897
1898unsigned long sock_i_ino(struct sock *sk)
1899{
1900        unsigned long ino;
1901
1902        read_lock_bh(&sk->sk_callback_lock);
1903        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1904        read_unlock_bh(&sk->sk_callback_lock);
1905        return ino;
1906}
1907EXPORT_SYMBOL(sock_i_ino);
1908
1909/*
1910 * Allocate a skb from the socket's send buffer.
1911 */
1912struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1913                             gfp_t priority)
1914{
1915        if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1916                struct sk_buff *skb = alloc_skb(size, priority);
1917                if (skb) {
1918                        skb_set_owner_w(skb, sk);
1919                        return skb;
1920                }
1921        }
1922        return NULL;
1923}
1924EXPORT_SYMBOL(sock_wmalloc);
1925
1926/*
1927 * Allocate a memory block from the socket's option memory buffer.
1928 */
1929void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1930{
1931        if ((unsigned int)size <= sysctl_optmem_max &&
1932            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1933                void *mem;
1934                /* First do the add, to avoid the race if kmalloc
1935                 * might sleep.
1936                 */
1937                atomic_add(size, &sk->sk_omem_alloc);
1938                mem = kmalloc(size, priority);
1939                if (mem)
1940                        return mem;
1941                atomic_sub(size, &sk->sk_omem_alloc);
1942        }
1943        return NULL;
1944}
1945EXPORT_SYMBOL(sock_kmalloc);
1946
1947/* Free an option memory block. Note, we actually want the inline
1948 * here as this allows gcc to detect the nullify and fold away the
1949 * condition entirely.
1950 */
1951static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1952                                  const bool nullify)
1953{
1954        if (WARN_ON_ONCE(!mem))
1955                return;
1956        if (nullify)
1957                kzfree(mem);
1958        else
1959                kfree(mem);
1960        atomic_sub(size, &sk->sk_omem_alloc);
1961}
1962
1963void sock_kfree_s(struct sock *sk, void *mem, int size)
1964{
1965        __sock_kfree_s(sk, mem, size, false);
1966}
1967EXPORT_SYMBOL(sock_kfree_s);
1968
1969void sock_kzfree_s(struct sock *sk, void *mem, int size)
1970{
1971        __sock_kfree_s(sk, mem, size, true);
1972}
1973EXPORT_SYMBOL(sock_kzfree_s);
1974
1975/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1976   I think, these locks should be removed for datagram sockets.
1977 */
1978static long sock_wait_for_wmem(struct sock *sk, long timeo)
1979{
1980        DEFINE_WAIT(wait);
1981
1982        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1983        for (;;) {
1984                if (!timeo)
1985                        break;
1986                if (signal_pending(current))
1987                        break;
1988                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1989                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1990                if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1991                        break;
1992                if (sk->sk_shutdown & SEND_SHUTDOWN)
1993                        break;
1994                if (sk->sk_err)
1995                        break;
1996                timeo = schedule_timeout(timeo);
1997        }
1998        finish_wait(sk_sleep(sk), &wait);
1999        return timeo;
2000}

2001
2002
2003/*
2004 *      Generic send/receive buffer handlers
2005 */
2006
2007struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2008                                     unsigned long data_len, int noblock,
2009                                     int *errcode, int max_page_order)
2010{
2011        struct sk_buff *skb;
2012        long timeo;
2013        int err;
2014
2015        timeo = sock_sndtimeo(sk, noblock);
2016        for (;;) {
2017                err = sock_error(sk);
2018                if (err != 0)
2019                        goto failure;
2020
2021                err = -EPIPE;
2022                if (sk->sk_shutdown & SEND_SHUTDOWN)
2023                        goto failure;
2024
2025                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2026                        break;
2027
2028                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2029                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2030                err = -EAGAIN;
2031                if (!timeo)
2032                        goto failure;
2033                if (signal_pending(current))
2034                        goto interrupted;
2035                timeo = sock_wait_for_wmem(sk, timeo);
2036        }
2037        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2038                                   errcode, sk->sk_allocation);
2039        if (skb)
2040                skb_set_owner_w(skb, sk);
2041        return skb;
2042
2043interrupted:
2044        err = sock_intr_errno(timeo);
2045failure:
2046        *errcode = err;
2047        return NULL;
2048}
2049EXPORT_SYMBOL(sock_alloc_send_pskb);
2050
2051struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2052                                    int noblock, int *errcode)
2053{
2054        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2055}
2056EXPORT_SYMBOL(sock_alloc_send_skb);
2057
2058int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2059                     struct sockcm_cookie *sockc)
2060{
2061        u32 tsflags;
2062
2063        switch (cmsg->cmsg_type) {
2064        case SO_MARK:
2065                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2066                        return -EPERM;
2067                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2068                        return -EINVAL;
2069                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2070                break;
2071        case SO_TIMESTAMPING:
2072                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2073                        return -EINVAL;
2074
2075                tsflags = *(u32 *)CMSG_DATA(cmsg);
2076                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2077                        return -EINVAL;
2078
2079                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2080                sockc->tsflags |= tsflags;
2081                break;
2082        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2083        case SCM_RIGHTS:
2084        case SCM_CREDENTIALS:
2085                break;
2086        default:
2087                return -EINVAL;
2088        }
2089        return 0;
2090}
2091EXPORT_SYMBOL(__sock_cmsg_send);
2092
2093int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2094                   struct sockcm_cookie *sockc)
2095{
2096        struct cmsghdr *cmsg;
2097        int ret;
2098
2099        for_each_cmsghdr(cmsg, msg) {
2100                if (!CMSG_OK(msg, cmsg))
2101                        return -EINVAL;
2102                if (cmsg->cmsg_level != SOL_SOCKET)
2103                        continue;
2104                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2105                if (ret)
2106                        return ret;
2107        }
2108        return 0;
2109}
2110EXPORT_SYMBOL(sock_cmsg_send);
2111
2112static void sk_enter_memory_pressure(struct sock *sk)
2113{
2114        if (!sk->sk_prot->enter_memory_pressure)
2115                return;
2116
2117        sk->sk_prot->enter_memory_pressure(sk);
2118}
2119
2120static void sk_leave_memory_pressure(struct sock *sk)
2121{
2122        if (sk->sk_prot->leave_memory_pressure) {
2123                sk->sk_prot->leave_memory_pressure(sk);
2124        } else {
2125                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2126
2127                if (memory_pressure && *memory_pressure)
2128                        *memory_pressure = 0;
2129        }
2130}
2131
2132/* On 32bit arches, an skb frag is limited to 2^15 */
2133#define SKB_FRAG_PAGE_ORDER     get_order(32768)
2134
2135/**
2136 * skb_page_frag_refill - check that a page_frag contains enough room
2137 * @sz: minimum size of the fragment we want to get
2138 * @pfrag: pointer to page_frag
2139 * @gfp: priority for memory allocation
2140 *
2141 * Note: While this allocator tries to use high order pages, there is
2142 * no guarantee that allocations succeed. Therefore, @sz MUST be
2143 * less or equal than PAGE_SIZE.
2144 */
2145bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2146{
2147        if (pfrag->page) {
2148                if (page_ref_count(pfrag->page) == 1) {
2149                        pfrag->offset = 0;
2150                        return true;
2151                }
2152                if (pfrag->offset + sz <= pfrag->size)
2153                        return true;
2154                put_page(pfrag->page);
2155        }
2156
2157        pfrag->offset = 0;
2158        if (SKB_FRAG_PAGE_ORDER) {
2159                /* Avoid direct reclaim but allow kswapd to wake */
2160                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2161                                          __GFP_COMP | __GFP_NOWARN |
2162                                          __GFP_NORETRY,
2163                                          SKB_FRAG_PAGE_ORDER);
2164                if (likely(pfrag->page)) {
2165                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2166                        return true;
2167                }
2168        }
2169        pfrag->page = alloc_page(gfp);
2170        if (likely(pfrag->page)) {
2171                pfrag->size = PAGE_SIZE;
2172                return true;
2173        }
2174        return false;
2175}
2176EXPORT_SYMBOL(skb_page_frag_refill);
2177
2178bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2179{
2180        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2181                return true;
2182
2183        sk_enter_memory_pressure(sk);
2184        sk_stream_moderate_sndbuf(sk);
2185        return false;
2186}
2187EXPORT_SYMBOL(sk_page_frag_refill);
2188
2189static void __lock_sock(struct sock *sk)
2190        __releases(&sk->sk_lock.slock)
2191        __acquires(&sk->sk_lock.slock)
2192{
2193        DEFINE_WAIT(wait);
2194
2195        for (;;) {
2196                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2197                                        TASK_UNINTERRUPTIBLE);
2198                spin_unlock_bh(&sk->sk_lock.slock);
2199                schedule();
2200                spin_lock_bh(&sk->sk_lock.slock);
2201                if (!sock_owned_by_user(sk))
2202                        break;
2203        }
2204        finish_wait(&sk->sk_lock.wq, &wait);
2205}
2206
2207static void __release_sock(struct sock *sk)
2208        __releases(&sk->sk_lock.slock)
2209        __acquires(&sk->sk_lock.slock)
2210{
2211        struct sk_buff *skb, *next;
2212
2213        while ((skb = sk->sk_backlog.head) != NULL) {
2214                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2215
2216                spin_unlock_bh(&sk->sk_lock.slock);
2217
2218                do {
2219                        next = skb->next;
2220                        prefetch(next);
2221                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2222                        skb->next = NULL;
2223                        sk_backlog_rcv(sk, skb);
2224
2225                        cond_resched();
2226
2227                        skb = next;
2228                } while (skb != NULL);
2229
2230                spin_lock_bh(&sk->sk_lock.slock);
2231        }
2232
2233        /*
2234         * Doing the zeroing here guarantee we can not loop forever
2235         * while a wild producer attempts to flood us.
2236         */
2237        sk->sk_backlog.len = 0;
2238}
2239
2240void __sk_flush_backlog(struct sock *sk)
2241{
2242        spin_lock_bh(&sk->sk_lock.slock);
2243        __release_sock(sk);
2244        spin_unlock_bh(&sk->sk_lock.slock);
2245}
2246
2247/**
2248 * sk_wait_data - wait for data to arrive at sk_receive_queue
2249 * @sk:    sock to wait on
2250 * @timeo: for how long
2251 * @skb:   last skb seen on sk_receive_queue
2252 *
2253 * Now socket state including sk->sk_err is changed only under lock,
2254 * hence we may omit checks after joining wait queue.
2255 * We check receive queue before schedule() only as optimization;
2256 * it is very likely that release_sock() added new data.
2257 */
2258int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2259{
2260        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2261        int rc;
2262
2263        add_wait_queue(sk_sleep(sk), &wait);
2264        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2265        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2266        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2267        remove_wait_queue(sk_sleep(sk), &wait);
2268        return rc;
2269}
2270EXPORT_SYMBOL(sk_wait_data);
2271
2272/**
2273 *      __sk_mem_raise_allocated - increase memory_allocated
2274 *      @sk: socket
2275 *      @size: memory size to allocate
2276 *      @amt: pages to allocate
2277 *      @kind: allocation type
2278 *
2279 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2280 */
2281int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2282{
2283        struct proto *prot = sk->sk_prot;
2284        long allocated = sk_memory_allocated_add(sk, amt);
2285
2286        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2287            !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2288                goto suppress_allocation;
2289
2290        /* Under limit. */
2291        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2292                sk_leave_memory_pressure(sk);
2293                return 1;
2294        }
2295
2296        /* Under pressure. */
2297        if (allocated > sk_prot_mem_limits(sk, 1))
2298                sk_enter_memory_pressure(sk);
2299
2300        /* Over hard limit. */
2301        if (allocated > sk_prot_mem_limits(sk, 2))
2302                goto suppress_allocation;
2303
2304        /* guarantee minimum buffer size under pressure */
2305        if (kind == SK_MEM_RECV) {
2306                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2307                        return 1;
2308
2309        } else { /* SK_MEM_SEND */
2310                if (sk->sk_type == SOCK_STREAM) {
2311                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2312                                return 1;
2313                } else if (refcount_read(&sk->sk_wmem_alloc) <
2314                           prot->sysctl_wmem[0])
2315                                return 1;
2316        }
2317
2318        if (sk_has_memory_pressure(sk)) {
2319                int alloc;
2320
2321                if (!sk_under_memory_pressure(sk))
2322                        return 1;
2323                alloc = sk_sockets_allocated_read_positive(sk);
2324                if (sk_prot_mem_limits(sk, 2) > alloc *
2325                    sk_mem_pages(sk->sk_wmem_queued +
2326                                 atomic_read(&sk->sk_rmem_alloc) +
2327                                 sk->sk_forward_alloc))
2328                        return 1;
2329        }
2330
2331suppress_allocation:
2332
2333        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2334                sk_stream_moderate_sndbuf(sk);
2335
2336                /* Fail only if socket is _under_ its sndbuf.
2337                 * In this case we cannot block, so that we have to fail.
2338                 */
2339                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2340                        return 1;
2341        }
2342
2343        trace_sock_exceed_buf_limit(sk, prot, allocated);
2344
2345        sk_memory_allocated_sub(sk, amt);
2346
2347        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2348                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2349
2350        return 0;
2351}
2352EXPORT_SYMBOL(__sk_mem_raise_allocated);
2353
2354/**
2355 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2356 *      @sk: socket
2357 *      @size: memory size to allocate
2358 *      @kind: allocation type
2359 *
2360 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2361 *      rmem allocation. This function assumes that protocols which have
2362 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2363 */
2364int __sk_mem_schedule(struct sock *sk, int size, int kind)
2365{
2366        int ret, amt = sk_mem_pages(size);
2367
2368        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2369        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2370        if (!ret)
2371                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2372        return ret;
2373}
2374EXPORT_SYMBOL(__sk_mem_schedule);
2375
2376/**
2377 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2378 *      @sk: socket
2379 *      @amount: number of quanta
2380 *
2381 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2382 */
2383void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2384{
2385        sk_memory_allocated_sub(sk, amount);
2386
2387        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2388                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2389
2390        if (sk_under_memory_pressure(sk) &&
2391            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2392                sk_leave_memory_pressure(sk);
2393}
2394EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2395
2396/**
2397 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2398 *      @sk: socket
2399 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2400 */
2401void __sk_mem_reclaim(struct sock *sk, int amount)
2402{
2403        amount >>= SK_MEM_QUANTUM_SHIFT;
2404        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2405        __sk_mem_reduce_allocated(sk, amount);
2406}
2407EXPORT_SYMBOL(__sk_mem_reclaim);
2408
2409int sk_set_peek_off(struct sock *sk, int val)
2410{
2411        if (val < 0)
2412                return -EINVAL;
2413
2414        sk->sk_peek_off = val;
2415        return 0;
2416}
2417EXPORT_SYMBOL_GPL(sk_set_peek_off);
2418
2419/*
2420 * Set of default routines for initialising struct proto_ops when
2421 * the protocol does not support a particular function. In certain
2422 * cases where it makes no sense for a protocol to have a "do nothing"
2423 * function, some default processing is provided.
2424 */
2425
2426int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2427{
2428        return -EOPNOTSUPP;
2429}
2430EXPORT_SYMBOL(sock_no_bind);
2431
2432int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2433                    int len, int flags)
2434{
2435        return -EOPNOTSUPP;
2436}
2437EXPORT_SYMBOL(sock_no_connect);
2438
2439int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2440{
2441        return -EOPNOTSUPP;
2442}
2443EXPORT_SYMBOL(sock_no_socketpair);
2444
2445int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2446                   bool kern)
2447{
2448        return -EOPNOTSUPP;
2449}
2450EXPORT_SYMBOL(sock_no_accept);
2451
2452int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2453                    int *len, int peer)
2454{
2455        return -EOPNOTSUPP;
2456}
2457EXPORT_SYMBOL(sock_no_getname);
2458
2459unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2460{
2461        return 0;
2462}
2463EXPORT_SYMBOL(sock_no_poll);
2464
2465int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2466{
2467        return -EOPNOTSUPP;
2468}
2469EXPORT_SYMBOL(sock_no_ioctl);
2470
2471int sock_no_listen(struct socket *sock, int backlog)
2472{
2473        return -EOPNOTSUPP;
2474}
2475EXPORT_SYMBOL(sock_no_listen);
2476
2477int sock_no_shutdown(struct socket *sock, int how)
2478{
2479        return -EOPNOTSUPP;
2480}
2481EXPORT_SYMBOL(sock_no_shutdown);
2482
2483int sock_no_setsockopt(struct socket *sock, int level, int optname,
2484                    char __user *optval, unsigned int optlen)
2485{
2486        return -EOPNOTSUPP;
2487}
2488EXPORT_SYMBOL(sock_no_setsockopt);
2489
2490int sock_no_getsockopt(struct socket *sock, int level, int optname,
2491                    char __user *optval, int __user *optlen)
2492{
2493        return -EOPNOTSUPP;
2494}
2495EXPORT_SYMBOL(sock_no_getsockopt);
2496
2497int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2498{
2499        return -EOPNOTSUPP;
2500}
2501EXPORT_SYMBOL(sock_no_sendmsg);
2502
2503int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2504                    int flags)
2505{
2506        return -EOPNOTSUPP;
2507}
2508EXPORT_SYMBOL(sock_no_recvmsg);
2509
2510int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2511{
2512        /* Mirror missing mmap method error code */
2513        return -ENODEV;
2514}
2515EXPORT_SYMBOL(sock_no_mmap);
2516
2517ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2518{
2519        ssize_t res;
2520        struct msghdr msg = {.msg_flags = flags};
2521        struct kvec iov;
2522        char *kaddr = kmap(page);
2523        iov.iov_base = kaddr + offset;
2524        iov.iov_len = size;
2525        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2526        kunmap(page);
2527        return res;
2528}
2529EXPORT_SYMBOL(sock_no_sendpage);
2530
2531/*
2532 *      Default Socket Callbacks
2533 */
2534
2535static void sock_def_wakeup(struct sock *sk)
2536{
2537        struct socket_wq *wq;
2538
2539        rcu_read_lock();
2540        wq = rcu_dereference(sk->sk_wq);
2541        if (skwq_has_sleeper(wq))
2542                wake_up_interruptible_all(&wq->wait);
2543        rcu_read_unlock();
2544}
2545
2546static void sock_def_error_report(struct sock *sk)
2547{
2548        struct socket_wq *wq;
2549
2550        rcu_read_lock();
2551        wq = rcu_dereference(sk->sk_wq);
2552        if (skwq_has_sleeper(wq))
2553                wake_up_interruptible_poll(&wq->wait, POLLERR);
2554        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2555        rcu_read_unlock();
2556}
2557
2558static void sock_def_readable(struct sock *sk)
2559{
2560        struct socket_wq *wq;
2561
2562        rcu_read_lock();
2563        wq = rcu_dereference(sk->sk_wq);
2564        if (skwq_has_sleeper(wq))
2565                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2566                                                POLLRDNORM | POLLRDBAND);
2567        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2568        rcu_read_unlock();
2569}
2570
2571static void sock_def_write_space(struct sock *sk)
2572{
2573        struct socket_wq *wq;
2574
2575        rcu_read_lock();
2576
2577        /* Do not wake up a writer until he can make "significant"
2578         * progress.  --DaveM
2579         */
2580        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2581                wq = rcu_dereference(sk->sk_wq);
2582                if (skwq_has_sleeper(wq))
2583                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2584                                                POLLWRNORM | POLLWRBAND);
2585
2586                /* Should agree with poll, otherwise some programs break */
2587                if (sock_writeable(sk))
2588                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2589        }
2590
2591        rcu_read_unlock();
2592}
2593
2594static void sock_def_destruct(struct sock *sk)
2595{
2596}
2597
2598void sk_send_sigurg(struct sock *sk)
2599{
2600        if (sk->sk_socket && sk->sk_socket->file)
2601                if (send_sigurg(&sk->sk_socket->file->f_owner))
2602                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2603}
2604EXPORT_SYMBOL(sk_send_sigurg);
2605
2606void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2607                    unsigned long expires)
2608{
2609        if (!mod_timer(timer, expires))
2610                sock_hold(sk);
2611}
2612EXPORT_SYMBOL(sk_reset_timer);
2613
2614void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2615{
2616        if (del_timer(timer))
2617                __sock_put(sk);
2618}
2619EXPORT_SYMBOL(sk_stop_timer);
2620
2621void sock_init_data(struct socket *sock, struct sock *sk)
2622{
2623        sk_init_common(sk);
2624        sk->sk_send_head        =       NULL;
2625
2626        init_timer(&sk->sk_timer);
2627
2628        sk->sk_allocation       =       GFP_KERNEL;
2629        sk->sk_rcvbuf           =       sysctl_rmem_default;
2630        sk->sk_sndbuf           =       sysctl_wmem_default;
2631        sk->sk_state            =       TCP_CLOSE;
2632        sk_set_socket(sk, sock);
2633
2634        sock_set_flag(sk, SOCK_ZAPPED);
2635
2636        if (sock) {
2637                sk->sk_type     =       sock->type;
2638                sk->sk_wq       =       sock->wq;
2639                sock->sk        =       sk;
2640                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2641        } else {
2642                sk->sk_wq       =       NULL;
2643                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2644        }
2645
2646        rwlock_init(&sk->sk_callback_lock);
2647        if (sk->sk_kern_sock)
2648                lockdep_set_class_and_name(
2649                        &sk->sk_callback_lock,
2650                        af_kern_callback_keys + sk->sk_family,
2651                        af_family_kern_clock_key_strings[sk->sk_family]);
2652        else
2653                lockdep_set_class_and_name(
2654                        &sk->sk_callback_lock,
2655                        af_callback_keys + sk->sk_family,
2656                        af_family_clock_key_strings[sk->sk_family]);
2657
2658        sk->sk_state_change     =       sock_def_wakeup;
2659        sk->sk_data_ready       =       sock_def_readable;
2660        sk->sk_write_space      =       sock_def_write_space;
2661        sk->sk_error_report     =       sock_def_error_report;
2662        sk->sk_destruct         =       sock_def_destruct;
2663
2664        sk->sk_frag.page        =       NULL;
2665        sk->sk_frag.offset      =       0;
2666        sk->sk_peek_off         =       -1;
2667
2668        sk->sk_peer_pid         =       NULL;
2669        sk->sk_peer_cred        =       NULL;
2670        sk->sk_write_pending    =       0;
2671        sk->sk_rcvlowat         =       1;
2672        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2673        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2674
2675        sk->sk_stamp = SK_DEFAULT_STAMP;
2676
2677#ifdef CONFIG_NET_RX_BUSY_POLL
2678        sk->sk_napi_id          =       0;
2679        sk->sk_ll_usec          =       sysctl_net_busy_read;
2680#endif
2681
2682        sk->sk_max_pacing_rate = ~0U;
2683        sk->sk_pacing_rate = ~0U;
2684        sk->sk_incoming_cpu = -1;
2685        /*
2686         * Before updating sk_refcnt, we must commit prior changes to memory
2687         * (Documentation/RCU/rculist_nulls.txt for details)
2688         */
2689        smp_wmb();
2690        refcount_set(&sk->sk_refcnt, 1);
2691        atomic_set(&sk->sk_drops, 0);
2692}
2693EXPORT_SYMBOL(sock_init_data);
2694
2695void lock_sock_nested(struct sock *sk, int subclass)
2696{
2697        might_sleep();
2698        spin_lock_bh(&sk->sk_lock.slock);
2699        if (sk->sk_lock.owned)
2700                __lock_sock(sk);
2701        sk->sk_lock.owned = 1;
2702        spin_unlock(&sk->sk_lock.slock);
2703        /*
2704         * The sk_lock has mutex_lock() semantics here:
2705         */
2706        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2707        local_bh_enable();
2708}
2709EXPORT_SYMBOL(lock_sock_nested);
2710
2711void release_sock(struct sock *sk)
2712{
2713        spin_lock_bh(&sk->sk_lock.slock);
2714        if (sk->sk_backlog.tail)
2715                __release_sock(sk);
2716
2717        /* Warning : release_cb() might need to release sk ownership,
2718         * ie call sock_release_ownership(sk) before us.
2719         */
2720        if (sk->sk_prot->release_cb)
2721                sk->sk_prot->release_cb(sk);
2722
2723        sock_release_ownership(sk);
2724        if (waitqueue_active(&sk->sk_lock.wq))
2725                wake_up(&sk->sk_lock.wq);
2726        spin_unlock_bh(&sk->sk_lock.slock);
2727}
2728EXPORT_SYMBOL(release_sock);
2729
2730/**
2731 * lock_sock_fast - fast version of lock_sock
2732 * @sk: socket
2733 *
2734 * This version should be used for very small section, where process wont block
2735 * return false if fast path is taken:
2736 *
2737 *   sk_lock.slock locked, owned = 0, BH disabled
2738 *
2739 * return true if slow path is taken:
2740 *
2741 *   sk_lock.slock unlocked, owned = 1, BH enabled
2742 */
2743bool lock_sock_fast(struct sock *sk)
2744{
2745        might_sleep();
2746        spin_lock_bh(&sk->sk_lock.slock);
2747
2748        if (!sk->sk_lock.owned)
2749                /*
2750                 * Note : We must disable BH
2751                 */
2752                return false;
2753
2754        __lock_sock(sk);
2755        sk->sk_lock.owned = 1;
2756        spin_unlock(&sk->sk_lock.slock);
2757        /*
2758         * The sk_lock has mutex_lock() semantics here:
2759         */
2760        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2761        local_bh_enable();
2762        return true;
2763}
2764EXPORT_SYMBOL(lock_sock_fast);
2765
2766int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2767{
2768        struct timeval tv;
2769        if (!sock_flag(sk, SOCK_TIMESTAMP))
2770                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2771        tv = ktime_to_timeval(sk->sk_stamp);
2772        if (tv.tv_sec == -1)
2773                return -ENOENT;
2774        if (tv.tv_sec == 0) {
2775                sk->sk_stamp = ktime_get_real();
2776                tv = ktime_to_timeval(sk->sk_stamp);
2777        }
2778        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2779}
2780EXPORT_SYMBOL(sock_get_timestamp);
2781
2782int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2783{
2784        struct timespec ts;
2785        if (!sock_flag(sk, SOCK_TIMESTAMP))
2786                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2787        ts = ktime_to_timespec(sk->sk_stamp);
2788        if (ts.tv_sec == -1)
2789                return -ENOENT;
2790        if (ts.tv_sec == 0) {
2791                sk->sk_stamp = ktime_get_real();
2792                ts = ktime_to_timespec(sk->sk_stamp);
2793        }
2794        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2795}
2796EXPORT_SYMBOL(sock_get_timestampns);
2797
2798void sock_enable_timestamp(struct sock *sk, int flag)
2799{
2800        if (!sock_flag(sk, flag)) {
2801                unsigned long previous_flags = sk->sk_flags;
2802
2803                sock_set_flag(sk, flag);
2804                /*
2805                 * we just set one of the two flags which require net
2806                 * time stamping, but time stamping might have been on
2807                 * already because of the other one
2808                 */
2809                if (sock_needs_netstamp(sk) &&
2810                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2811                        net_enable_timestamp();
2812        }
2813}
2814
2815int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2816                       int level, int type)
2817{
2818        struct sock_exterr_skb *serr;
2819        struct sk_buff *skb;
2820        int copied, err;
2821
2822        err = -EAGAIN;
2823        skb = sock_dequeue_err_skb(sk);
2824        if (skb == NULL)
2825                goto out;
2826
2827        copied = skb->len;
2828        if (copied > len) {
2829                msg->msg_flags |= MSG_TRUNC;
2830                copied = len;
2831        }
2832        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2833        if (err)
2834                goto out_free_skb;
2835
2836        sock_recv_timestamp(msg, sk, skb);
2837
2838        serr = SKB_EXT_ERR(skb);
2839        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2840
2841        msg->msg_flags |= MSG_ERRQUEUE;
2842        err = copied;
2843
2844out_free_skb:
2845        kfree_skb(skb);
2846out:
2847        return err;
2848}
2849EXPORT_SYMBOL(sock_recv_errqueue);
2850
2851/*
2852 *      Get a socket option on an socket.
2853 *
2854 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2855 *      asynchronous errors should be reported by getsockopt. We assume
2856 *      this means if you specify SO_ERROR (otherwise whats the point of it).
2857 */
2858int sock_common_getsockopt(struct socket *sock, int level, int optname,
2859                           char __user *optval, int __user *optlen)
2860{
2861        struct sock *sk = sock->sk;
2862
2863        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2864}
2865EXPORT_SYMBOL(sock_common_getsockopt);
2866
2867#ifdef CONFIG_COMPAT
2868int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2869                                  char __user *optval, int __user *optlen)
2870{
2871        struct sock *sk = sock->sk;
2872
2873        if (sk->sk_prot->compat_getsockopt != NULL)
2874                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2875                                                      optval, optlen);
2876        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2877}
2878EXPORT_SYMBOL(compat_sock_common_getsockopt);
2879#endif
2880
2881int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2882                        int flags)
2883{
2884        struct sock *sk = sock->sk;
2885        int addr_len = 0;
2886        int err;
2887
2888        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2889                                   flags & ~MSG_DONTWAIT, &addr_len);
2890        if (err >= 0)
2891                msg->msg_namelen = addr_len;
2892        return err;
2893}
2894EXPORT_SYMBOL(sock_common_recvmsg);
2895
2896/*
2897 *      Set socket options on an inet socket.
2898 */
2899int sock_common_setsockopt(struct socket *sock, int level, int optname,
2900                           char __user *optval, unsigned int optlen)
2901{
2902        struct sock *sk = sock->sk;
2903
2904        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2905}
2906EXPORT_SYMBOL(sock_common_setsockopt);
2907
2908#ifdef CONFIG_COMPAT
2909int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2910                                  char __user *optval, unsigned int optlen)
2911{
2912        struct sock *sk = sock->sk;
2913
2914        if (sk->sk_prot->compat_setsockopt != NULL)
2915                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2916                                                      optval, optlen);
2917        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2918}
2919EXPORT_SYMBOL(compat_sock_common_setsockopt);
2920#endif
2921
2922void sk_common_release(struct sock *sk)
2923{
2924        if (sk->sk_prot->destroy)
2925                sk->sk_prot->destroy(sk);
2926
2927        /*
2928         * Observation: when sock_common_release is called, processes have
2929         * no access to socket. But net still has.
2930         * Step one, detach it from networking:
2931         *
2932         * A. Remove from hash tables.
2933         */
2934
2935        sk->sk_prot->unhash(sk);
2936
2937        /*
2938         * In this point socket cannot receive new packets, but it is possible
2939         * that some packets are in flight because some CPU runs receiver and
2940         * did hash table lookup before we unhashed socket. They will achieve
2941         * receive queue and will be purged by socket destructor.
2942         *
2943         * Also we still have packets pending on receive queue and probably,
2944         * our own packets waiting in device queues. sock_destroy will drain
2945         * receive queue, but transmitted packets will delay socket destruction
2946         * until the last reference will be released.
2947         */
2948
2949        sock_orphan(sk);
2950
2951        xfrm_sk_free_policy(sk);
2952
2953        sk_refcnt_debug_release(sk);
2954
2955        sock_put(sk);
2956}
2957EXPORT_SYMBOL(sk_common_release);
2958
2959void sk_get_meminfo(const struct sock *sk, u32 *mem)
2960{
2961        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
2962
2963        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
2964        mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
2965        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
2966        mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
2967        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
2968        mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
2969        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
2970        mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
2971        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
2972}
2973
2974#ifdef CONFIG_PROC_FS
2975#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2976struct prot_inuse {
2977        int val[PROTO_INUSE_NR];
2978};
2979
2980static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2981
2982#ifdef CONFIG_NET_NS
2983void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2984{
2985        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2986}
2987EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2988
2989int sock_prot_inuse_get(struct net *net, struct proto *prot)
2990{
2991        int cpu, idx = prot->inuse_idx;
2992        int res = 0;
2993
2994        for_each_possible_cpu(cpu)
2995                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2996
2997        return res >= 0 ? res : 0;
2998}
2999EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3000

3001static int __net_init sock_inuse_init_net(struct net *net)
3002{
3003        net->core.inuse = alloc_percpu(struct prot_inuse);
3004        return net->core.inuse ? 0 : -ENOMEM;
3005}
3006
3007static void __net_exit sock_inuse_exit_net(struct net *net)
3008{
3009        free_percpu(net->core.inuse);
3010}
3011
3012static struct pernet_operations net_inuse_ops = {
3013        .init = sock_inuse_init_net,
3014        .exit = sock_inuse_exit_net,
3015};
3016
3017static __init int net_inuse_init(void)
3018{
3019        if (register_pernet_subsys(&net_inuse_ops))
3020                panic("Cannot initialize net inuse counters");
3021
3022        return 0;
3023}
3024
3025core_initcall(net_inuse_init);
3026#else
3027static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3028
3029void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3030{
3031        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
3032}
3033EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3034
3035int sock_prot_inuse_get(struct net *net, struct proto *prot)
3036{
3037        int cpu, idx = prot->inuse_idx;
3038        int res = 0;
3039
3040        for_each_possible_cpu(cpu)
3041                res += per_cpu(prot_inuse, cpu).val[idx];
3042
3043        return res >= 0 ? res : 0;
3044}
3045EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3046#endif
3047
3048static void assign_proto_idx(struct proto *prot)
3049{
3050        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3051
3052        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3053                pr_err("PROTO_INUSE_NR exhausted\n");
3054                return;
3055        }
3056
3057        set_bit(prot->inuse_idx, proto_inuse_idx);
3058}
3059
3060static void release_proto_idx(struct proto *prot)
3061{
3062        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3063                clear_bit(prot->inuse_idx, proto_inuse_idx);
3064}
3065#else
3066static inline void assign_proto_idx(struct proto *prot)
3067{
3068}
3069
3070static inline void release_proto_idx(struct proto *prot)
3071{
3072}
3073#endif
3074
3075static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3076{
3077        if (!rsk_prot)
3078                return;
3079        kfree(rsk_prot->slab_name);
3080        rsk_prot->slab_name = NULL;
3081        kmem_cache_destroy(rsk_prot->slab);
3082        rsk_prot->slab = NULL;
3083}
3084
3085static int req_prot_init(const struct proto *prot)
3086{
3087        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3088
3089        if (!rsk_prot)
3090                return 0;
3091
3092        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3093                                        prot->name);
3094        if (!rsk_prot->slab_name)
3095                return -ENOMEM;
3096
3097        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3098                                           rsk_prot->obj_size, 0,
3099                                           prot->slab_flags, NULL);
3100
3101        if (!rsk_prot->slab) {
3102                pr_crit("%s: Can't create request sock SLAB cache!\n",
3103                        prot->name);
3104                return -ENOMEM;
3105        }
3106        return 0;
3107}
3108
3109int proto_register(struct proto *prot, int alloc_slab)
3110{
3111        if (alloc_slab) {
3112                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3113                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
3114                                        NULL);
3115
3116                if (prot->slab == NULL) {
3117                        pr_crit("%s: Can't create sock SLAB cache!\n",
3118                                prot->name);
3119                        goto out;
3120                }
3121
3122                if (req_prot_init(prot))
3123                        goto out_free_request_sock_slab;
3124
3125                if (prot->twsk_prot != NULL) {
3126                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3127
3128                        if (prot->twsk_prot->twsk_slab_name == NULL)
3129                                goto out_free_request_sock_slab;
3130
3131                        prot->twsk_prot->twsk_slab =
3132                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3133                                                  prot->twsk_prot->twsk_obj_size,
3134                                                  0,
3135                                                  prot->slab_flags,
3136                                                  NULL);
3137                        if (prot->twsk_prot->twsk_slab == NULL)
3138                                goto out_free_timewait_sock_slab_name;
3139                }
3140        }
3141
3142        mutex_lock(&proto_list_mutex);
3143        list_add(&prot->node, &proto_list);
3144        assign_proto_idx(prot);
3145        mutex_unlock(&proto_list_mutex);
3146        return 0;
3147
3148out_free_timewait_sock_slab_name:
3149        kfree(prot->twsk_prot->twsk_slab_name);
3150out_free_request_sock_slab:
3151        req_prot_cleanup(prot->rsk_prot);
3152
3153        kmem_cache_destroy(prot->slab);
3154        prot->slab = NULL;
3155out:
3156        return -ENOBUFS;
3157}
3158EXPORT_SYMBOL(proto_register);
3159
3160void proto_unregister(struct proto *prot)
3161{
3162        mutex_lock(&proto_list_mutex);
3163        release_proto_idx(prot);
3164        list_del(&prot->node);
3165        mutex_unlock(&proto_list_mutex);
3166
3167        kmem_cache_destroy(prot->slab);
3168        prot->slab = NULL;
3169
3170        req_prot_cleanup(prot->rsk_prot);
3171
3172        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3173                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3174                kfree(prot->twsk_prot->twsk_slab_name);
3175                prot->twsk_prot->twsk_slab = NULL;
3176        }
3177}
3178EXPORT_SYMBOL(proto_unregister);
3179
3180#ifdef CONFIG_PROC_FS
3181static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3182        __acquires(proto_list_mutex)
3183{
3184        mutex_lock(&proto_list_mutex);
3185        return seq_list_start_head(&proto_list, *pos);
3186}
3187
3188static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3189{
3190        return seq_list_next(v, &proto_list, pos);
3191}
3192
3193static void proto_seq_stop(struct seq_file *seq, void *v)
3194        __releases(proto_list_mutex)
3195{
3196        mutex_unlock(&proto_list_mutex);
3197}
3198
3199static char proto_method_implemented(const void *method)
3200{
3201        return method == NULL ? 'n' : 'y';
3202}
3203static long sock_prot_memory_allocated(struct proto *proto)
3204{
3205        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3206}
3207
3208static char *sock_prot_memory_pressure(struct proto *proto)
3209{
3210        return proto->memory_pressure != NULL ?
3211        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3212}
3213
3214static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3215{
3216
3217        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3218                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3219                   proto->name,
3220                   proto->obj_size,
3221                   sock_prot_inuse_get(seq_file_net(seq), proto),
3222                   sock_prot_memory_allocated(proto),
3223                   sock_prot_memory_pressure(proto),
3224                   proto->max_header,
3225                   proto->slab == NULL ? "no" : "yes",
3226                   module_name(proto->owner),
3227                   proto_method_implemented(proto->close),
3228                   proto_method_implemented(proto->connect),
3229                   proto_method_implemented(proto->disconnect),
3230                   proto_method_implemented(proto->accept),
3231                   proto_method_implemented(proto->ioctl),
3232                   proto_method_implemented(proto->init),
3233                   proto_method_implemented(proto->destroy),
3234                   proto_method_implemented(proto->shutdown),
3235                   proto_method_implemented(proto->setsockopt),
3236                   proto_method_implemented(proto->getsockopt),
3237                   proto_method_implemented(proto->sendmsg),
3238                   proto_method_implemented(proto->recvmsg),
3239                   proto_method_implemented(proto->sendpage),
3240                   proto_method_implemented(proto->bind),
3241                   proto_method_implemented(proto->backlog_rcv),
3242                   proto_method_implemented(proto->hash),
3243                   proto_method_implemented(proto->unhash),
3244                   proto_method_implemented(proto->get_port),
3245                   proto_method_implemented(proto->enter_memory_pressure));
3246}
3247
3248static int proto_seq_show(struct seq_file *seq, void *v)
3249{
3250        if (v == &proto_list)
3251                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3252                           "protocol",
3253                           "size",
3254                           "sockets",
3255                           "memory",
3256                           "press",
3257                           "maxhdr",
3258                           "slab",
3259                           "module",
3260                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3261        else
3262                proto_seq_printf(seq, list_entry(v, struct proto, node));
3263        return 0;
3264}
3265
3266static const struct seq_operations proto_seq_ops = {
3267        .start  = proto_seq_start,
3268        .next   = proto_seq_next,
3269        .stop   = proto_seq_stop,
3270        .show   = proto_seq_show,
3271};
3272
3273static int proto_seq_open(struct inode *inode, struct file *file)
3274{
3275        return seq_open_net(inode, file, &proto_seq_ops,
3276                            sizeof(struct seq_net_private));
3277}
3278
3279static const struct file_operations proto_seq_fops = {
3280        .owner          = THIS_MODULE,
3281        .open           = proto_seq_open,
3282        .read           = seq_read,
3283        .llseek         = seq_lseek,
3284        .release        = seq_release_net,
3285};
3286
3287static __net_init int proto_init_net(struct net *net)
3288{
3289        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3290                return -ENOMEM;
3291
3292        return 0;
3293}
3294
3295static __net_exit void proto_exit_net(struct net *net)
3296{
3297        remove_proc_entry("protocols", net->proc_net);
3298}
3299
3300
3301static __net_initdata struct pernet_operations proto_net_ops = {
3302        .init = proto_init_net,
3303        .exit = proto_exit_net,
3304};
3305
3306static int __init proto_init(void)
3307{
3308        return register_pernet_subsys(&proto_net_ops);
3309}
3310
3311subsys_initcall(proto_init);
3312
3313#endif /* PROC_FS */
3314
3315#ifdef CONFIG_NET_RX_BUSY_POLL
3316bool sk_busy_loop_end(void *p, unsigned long start_time)
3317{
3318        struct sock *sk = p;
3319
3320        return !skb_queue_empty(&sk->sk_receive_queue) ||
3321               sk_busy_loop_timeout(sk, start_time);
3322}
3323EXPORT_SYMBOL(sk_busy_loop_end);
3324#endif /* CONFIG_NET_RX_BUSY_POLL */
3325