linux/net/core/sock.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Generic socket support routines. Memory allocators, socket lock/release
   8 *              handler for protocols to use and generic option handler.
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116#include <linux/compat.h>
 117
 118#include <linux/uaccess.h>
 119
 120#include <linux/netdevice.h>
 121#include <net/protocol.h>
 122#include <linux/skbuff.h>
 123#include <net/net_namespace.h>
 124#include <net/request_sock.h>
 125#include <net/sock.h>
 126#include <linux/net_tstamp.h>
 127#include <net/xfrm.h>
 128#include <linux/ipsec.h>
 129#include <net/cls_cgroup.h>
 130#include <net/netprio_cgroup.h>
 131#include <linux/sock_diag.h>
 132
 133#include <linux/filter.h>
 134#include <net/sock_reuseport.h>
 135#include <net/bpf_sk_storage.h>
 136
 137#include <trace/events/sock.h>
 138
 139#include <net/tcp.h>
 140#include <net/busy_poll.h>
 141
 142#include <linux/ethtool.h>
 143
 144#include "dev.h"
 145
 146static DEFINE_MUTEX(proto_list_mutex);
 147static LIST_HEAD(proto_list);
 148
 149static void sock_def_write_space_wfree(struct sock *sk);
 150static void sock_def_write_space(struct sock *sk);
 151
 152/**
 153 * sk_ns_capable - General socket capability test
 154 * @sk: Socket to use a capability on or through
 155 * @user_ns: The user namespace of the capability to use
 156 * @cap: The capability to use
 157 *
  158 * Test to see if the opener of the socket had the capability @cap when
  159 * the socket was created and whether the current process has @cap in the
  160 * user namespace @user_ns.
 161 */
 162bool sk_ns_capable(const struct sock *sk,
 163                   struct user_namespace *user_ns, int cap)
 164{
 165        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 166                ns_capable(user_ns, cap);
 167}
 168EXPORT_SYMBOL(sk_ns_capable);
 169
 170/**
 171 * sk_capable - Socket global capability test
 172 * @sk: Socket to use a capability on or through
 173 * @cap: The global capability to use
 174 *
  175 * Test to see if the opener of the socket had the capability @cap when
  176 * the socket was created and whether the current process has @cap in all
  177 * user namespaces.
 178 */
 179bool sk_capable(const struct sock *sk, int cap)
 180{
 181        return sk_ns_capable(sk, &init_user_ns, cap);
 182}
 183EXPORT_SYMBOL(sk_capable);
 184
 185/**
 186 * sk_net_capable - Network namespace socket capability test
 187 * @sk: Socket to use a capability on or through
 188 * @cap: The capability to use
 189 *
  190 * Test to see if the opener of the socket had the capability @cap when the
  191 * socket was created and whether the current process has @cap over the
  192 * network namespace the socket is a member of.
 193 */
 194bool sk_net_capable(const struct sock *sk, int cap)
 195{
 196        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 197}
 198EXPORT_SYMBOL(sk_net_capable);
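
/*
 * Illustrative sketch (not part of the original file): a protocol that wants
 * to gate a privileged option on both the opener's credentials and the
 * calling process could use sk_net_capable(), much as the SO_MARK handling
 * further below uses ns_capable(). The function name here is hypothetical.
 *
 *	static int example_set_privileged_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 */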
 199
 200/*
 201 * Each address family might have different locking rules, so we have
 202 * one slock key per address family and separate keys for internal and
 203 * userspace sockets.
 204 */
 205static struct lock_class_key af_family_keys[AF_MAX];
 206static struct lock_class_key af_family_kern_keys[AF_MAX];
 207static struct lock_class_key af_family_slock_keys[AF_MAX];
 208static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 209
 210/*
 211 * Make lock validator output more readable. (we pre-construct these
 212 * strings build-time, so that runtime initialization of socket
 213 * locks is fast):
 214 */
 215
 216#define _sock_locks(x)                                            \
 217  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 218  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 219  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 220  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 221  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 222  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 223  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 224  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 225  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 226  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 227  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 228  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 229  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 230  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 231  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 232  x "AF_MCTP"  , \
 233  x "AF_MAX"
 234
 235static const char *const af_family_key_strings[AF_MAX+1] = {
 236        _sock_locks("sk_lock-")
 237};
 238static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 239        _sock_locks("slock-")
 240};
 241static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 242        _sock_locks("clock-")
 243};
 244
 245static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 246        _sock_locks("k-sk_lock-")
 247};
 248static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 249        _sock_locks("k-slock-")
 250};
 251static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 252        _sock_locks("k-clock-")
 253};
 254static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 255        _sock_locks("rlock-")
 256};
 257static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 258        _sock_locks("wlock-")
 259};
 260static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 261        _sock_locks("elock-")
 262};
 263
 264/*
 265 * sk_callback_lock and sk queues locking rules are per-address-family,
 266 * so split the lock classes by using a per-AF key:
 267 */
 268static struct lock_class_key af_callback_keys[AF_MAX];
 269static struct lock_class_key af_rlock_keys[AF_MAX];
 270static struct lock_class_key af_wlock_keys[AF_MAX];
 271static struct lock_class_key af_elock_keys[AF_MAX];
 272static struct lock_class_key af_kern_callback_keys[AF_MAX];
 273
 274/* Run time adjustable parameters. */
 275__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 276EXPORT_SYMBOL(sysctl_wmem_max);
 277__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 278EXPORT_SYMBOL(sysctl_rmem_max);
 279__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 280__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 281
 282/* Maximal space eaten by iovec or ancillary data plus some space */
 283int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 284EXPORT_SYMBOL(sysctl_optmem_max);
 285
 286int sysctl_tstamp_allow_data __read_mostly = 1;
 287
 288DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 289EXPORT_SYMBOL_GPL(memalloc_socks_key);
 290
 291/**
 292 * sk_set_memalloc - sets %SOCK_MEMALLOC
 293 * @sk: socket to set it on
 294 *
 295 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 296 * It's the responsibility of the admin to adjust min_free_kbytes
  297 * to meet the requirements.
 298 */
 299void sk_set_memalloc(struct sock *sk)
 300{
 301        sock_set_flag(sk, SOCK_MEMALLOC);
 302        sk->sk_allocation |= __GFP_MEMALLOC;
 303        static_branch_inc(&memalloc_socks_key);
 304}
 305EXPORT_SYMBOL_GPL(sk_set_memalloc);
 306
 307void sk_clear_memalloc(struct sock *sk)
 308{
 309        sock_reset_flag(sk, SOCK_MEMALLOC);
 310        sk->sk_allocation &= ~__GFP_MEMALLOC;
 311        static_branch_dec(&memalloc_socks_key);
 312
 313        /*
 314         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 315         * progress of swapping. SOCK_MEMALLOC may be cleared while
 316         * it has rmem allocations due to the last swapfile being deactivated
 317         * but there is a risk that the socket is unusable due to exceeding
 318         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 319         */
 320        sk_mem_reclaim(sk);
 321}
 322EXPORT_SYMBOL_GPL(sk_clear_memalloc);
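
/*
 * Illustrative sketch (not part of the original file): a kernel user doing
 * reclaim-related I/O (e.g. swap over a network transport) would mark its
 * kernel socket with sk_set_memalloc() after creating it and clear the flag
 * before releasing it. Error handling is trimmed and the variable name is
 * hypothetical.
 *
 *	struct socket *xprt_sock;
 *	int err;
 *
 *	err = sock_create_kern(&init_net, AF_INET, SOCK_STREAM, IPPROTO_TCP,
 *			       &xprt_sock);
 *	if (!err)
 *		sk_set_memalloc(xprt_sock->sk);
 *	...
 *	sk_clear_memalloc(xprt_sock->sk);
 *	sock_release(xprt_sock);
 */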
 323
 324int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 325{
 326        int ret;
 327        unsigned int noreclaim_flag;
 328
 329        /* these should have been dropped before queueing */
 330        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 331
 332        noreclaim_flag = memalloc_noreclaim_save();
 333        ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 334                                 tcp_v6_do_rcv,
 335                                 tcp_v4_do_rcv,
 336                                 sk, skb);
 337        memalloc_noreclaim_restore(noreclaim_flag);
 338
 339        return ret;
 340}
 341EXPORT_SYMBOL(__sk_backlog_rcv);
 342
 343void sk_error_report(struct sock *sk)
 344{
 345        sk->sk_error_report(sk);
 346
 347        switch (sk->sk_family) {
 348        case AF_INET:
 349                fallthrough;
 350        case AF_INET6:
 351                trace_inet_sk_error_report(sk);
 352                break;
 353        default:
 354                break;
 355        }
 356}
 357EXPORT_SYMBOL(sk_error_report);
 358
 359int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 360{
 361        struct __kernel_sock_timeval tv;
 362
 363        if (timeo == MAX_SCHEDULE_TIMEOUT) {
 364                tv.tv_sec = 0;
 365                tv.tv_usec = 0;
 366        } else {
 367                tv.tv_sec = timeo / HZ;
 368                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 369        }
 370
 371        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 372                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 373                *(struct old_timeval32 *)optval = tv32;
 374                return sizeof(tv32);
 375        }
 376
 377        if (old_timeval) {
 378                struct __kernel_old_timeval old_tv;
 379                old_tv.tv_sec = tv.tv_sec;
 380                old_tv.tv_usec = tv.tv_usec;
 381                *(struct __kernel_old_timeval *)optval = old_tv;
 382                return sizeof(old_tv);
 383        }
 384
 385        *(struct __kernel_sock_timeval *)optval = tv;
 386        return sizeof(tv);
 387}
 388EXPORT_SYMBOL(sock_get_timeout);
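
/*
 * Worked example (added for illustration): with HZ == 1000, a stored timeout
 * of 2500 jiffies is reported as tv_sec = 2500 / 1000 = 2 and
 * tv_usec = ((2500 % 1000) * 1000000) / 1000 = 500000, i.e. 2.5 seconds.
 * MAX_SCHEDULE_TIMEOUT, the "no timeout" sentinel, is reported as {0, 0}.
 */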
 389
 390int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 391                           sockptr_t optval, int optlen, bool old_timeval)
 392{
 393        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 394                struct old_timeval32 tv32;
 395
 396                if (optlen < sizeof(tv32))
 397                        return -EINVAL;
 398
 399                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 400                        return -EFAULT;
 401                tv->tv_sec = tv32.tv_sec;
 402                tv->tv_usec = tv32.tv_usec;
 403        } else if (old_timeval) {
 404                struct __kernel_old_timeval old_tv;
 405
 406                if (optlen < sizeof(old_tv))
 407                        return -EINVAL;
 408                if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 409                        return -EFAULT;
 410                tv->tv_sec = old_tv.tv_sec;
 411                tv->tv_usec = old_tv.tv_usec;
 412        } else {
 413                if (optlen < sizeof(*tv))
 414                        return -EINVAL;
 415                if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 416                        return -EFAULT;
 417        }
 418
 419        return 0;
 420}
 421EXPORT_SYMBOL(sock_copy_user_timeval);
 422
 423static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 424                            bool old_timeval)
 425{
 426        struct __kernel_sock_timeval tv;
 427        int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 428
 429        if (err)
 430                return err;
 431
 432        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 433                return -EDOM;
 434
 435        if (tv.tv_sec < 0) {
 436                static int warned __read_mostly;
 437
 438                *timeo_p = 0;
 439                if (warned < 10 && net_ratelimit()) {
 440                        warned++;
 441                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 442                                __func__, current->comm, task_pid_nr(current));
 443                }
 444                return 0;
 445        }
 446        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 447        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 448                return 0;
 449        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 450                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 451        return 0;
 452}
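
/*
 * Minimal userspace sketch (added for illustration) of how this path is
 * reached: setting SO_RCVTIMEO with a struct timeval ends up in
 * sock_set_timeout() via sock_setsockopt(). A zero timeval keeps
 * MAX_SCHEDULE_TIMEOUT, i.e. blocking reads never time out.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */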
 453
 454static bool sock_needs_netstamp(const struct sock *sk)
 455{
 456        switch (sk->sk_family) {
 457        case AF_UNSPEC:
 458        case AF_UNIX:
 459                return false;
 460        default:
 461                return true;
 462        }
 463}
 464
 465static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 466{
 467        if (sk->sk_flags & flags) {
 468                sk->sk_flags &= ~flags;
 469                if (sock_needs_netstamp(sk) &&
 470                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 471                        net_disable_timestamp();
 472        }
 473}
 474
 475
 476int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 477{
 478        unsigned long flags;
 479        struct sk_buff_head *list = &sk->sk_receive_queue;
 480
 481        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 482                atomic_inc(&sk->sk_drops);
 483                trace_sock_rcvqueue_full(sk, skb);
 484                return -ENOMEM;
 485        }
 486
 487        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 488                atomic_inc(&sk->sk_drops);
 489                return -ENOBUFS;
 490        }
 491
 492        skb->dev = NULL;
 493        skb_set_owner_r(skb, sk);
 494
  495        /* We escape from the RCU-protected region, so make sure we don't
  496         * leak a non-refcounted dst
 497         */
 498        skb_dst_force(skb);
 499
 500        spin_lock_irqsave(&list->lock, flags);
 501        sock_skb_set_dropcount(sk, skb);
 502        __skb_queue_tail(list, skb);
 503        spin_unlock_irqrestore(&list->lock, flags);
 504
 505        if (!sock_flag(sk, SOCK_DEAD))
 506                sk->sk_data_ready(sk);
 507        return 0;
 508}
 509EXPORT_SYMBOL(__sock_queue_rcv_skb);
 510
 511int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 512                              enum skb_drop_reason *reason)
 513{
 514        enum skb_drop_reason drop_reason;
 515        int err;
 516
 517        err = sk_filter(sk, skb);
 518        if (err) {
 519                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 520                goto out;
 521        }
 522        err = __sock_queue_rcv_skb(sk, skb);
 523        switch (err) {
 524        case -ENOMEM:
 525                drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 526                break;
 527        case -ENOBUFS:
 528                drop_reason = SKB_DROP_REASON_PROTO_MEM;
 529                break;
 530        default:
 531                drop_reason = SKB_NOT_DROPPED_YET;
 532                break;
 533        }
 534out:
 535        if (reason)
 536                *reason = drop_reason;
 537        return err;
 538}
 539EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
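
/*
 * Illustrative sketch (not part of the original file): a protocol receive
 * handler can use the _reason variant to report why a packet was dropped,
 * feeding the reason to kfree_skb_reason() for tracing and drop monitoring.
 * The function name is hypothetical.
 *
 *	static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */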
 540
 541int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 542                     const int nested, unsigned int trim_cap, bool refcounted)
 543{
 544        int rc = NET_RX_SUCCESS;
 545
 546        if (sk_filter_trim_cap(sk, skb, trim_cap))
 547                goto discard_and_relse;
 548
 549        skb->dev = NULL;
 550
 551        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 552                atomic_inc(&sk->sk_drops);
 553                goto discard_and_relse;
 554        }
 555        if (nested)
 556                bh_lock_sock_nested(sk);
 557        else
 558                bh_lock_sock(sk);
 559        if (!sock_owned_by_user(sk)) {
 560                /*
 561                 * trylock + unlock semantics:
 562                 */
 563                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 564
 565                rc = sk_backlog_rcv(sk, skb);
 566
 567                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 568        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 569                bh_unlock_sock(sk);
 570                atomic_inc(&sk->sk_drops);
 571                goto discard_and_relse;
 572        }
 573
 574        bh_unlock_sock(sk);
 575out:
 576        if (refcounted)
 577                sock_put(sk);
 578        return rc;
 579discard_and_relse:
 580        kfree_skb(skb);
 581        goto out;
 582}
 583EXPORT_SYMBOL(__sk_receive_skb);
 584
 585INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 586                                                          u32));
 587INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 588                                                           u32));
 589struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 590{
 591        struct dst_entry *dst = __sk_dst_get(sk);
 592
 593        if (dst && dst->obsolete &&
 594            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 595                               dst, cookie) == NULL) {
 596                sk_tx_queue_clear(sk);
 597                sk->sk_dst_pending_confirm = 0;
 598                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 599                dst_release(dst);
 600                return NULL;
 601        }
 602
 603        return dst;
 604}
 605EXPORT_SYMBOL(__sk_dst_check);
 606
 607struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 608{
 609        struct dst_entry *dst = sk_dst_get(sk);
 610
 611        if (dst && dst->obsolete &&
 612            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 613                               dst, cookie) == NULL) {
 614                sk_dst_reset(sk);
 615                dst_release(dst);
 616                return NULL;
 617        }
 618
 619        return dst;
 620}
 621EXPORT_SYMBOL(sk_dst_check);
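
/*
 * Illustrative sketch (not part of the original file): a transmit path can
 * revalidate its cached route with sk_dst_check() and fall back to a fresh
 * lookup when the cached entry has become obsolete. IPv6 callers pass their
 * stored dst cookie instead of 0. The lookup helper shown here is
 * hypothetical; real callers use protocol-specific routines such as
 * ip_route_output_flow(). The returned entry is referenced, so the caller
 * drops it with dst_release() when done.
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst) {
 *		dst = example_route_lookup(sk);
 *		if (!IS_ERR(dst))
 *			sk_dst_set(sk, dst);
 *	}
 */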
 622
 623static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 624{
 625        int ret = -ENOPROTOOPT;
 626#ifdef CONFIG_NETDEVICES
 627        struct net *net = sock_net(sk);
 628
 629        /* Sorry... */
 630        ret = -EPERM;
 631        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 632                goto out;
 633
 634        ret = -EINVAL;
 635        if (ifindex < 0)
 636                goto out;
 637
 638        /* Paired with all READ_ONCE() done locklessly. */
 639        WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 640
 641        if (sk->sk_prot->rehash)
 642                sk->sk_prot->rehash(sk);
 643        sk_dst_reset(sk);
 644
 645        ret = 0;
 646
 647out:
 648#endif
 649
 650        return ret;
 651}
 652
 653int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 654{
 655        int ret;
 656
 657        if (lock_sk)
 658                lock_sock(sk);
 659        ret = sock_bindtoindex_locked(sk, ifindex);
 660        if (lock_sk)
 661                release_sock(sk);
 662
 663        return ret;
 664}
 665EXPORT_SYMBOL(sock_bindtoindex);
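
/*
 * Illustrative sketch (not part of the original file): an in-kernel user that
 * already knows the interface index can bind its socket directly, letting
 * sock_bindtoindex() take the socket lock itself:
 *
 *	int err = sock_bindtoindex(sock->sk, ifindex, true);
 *
 *	if (err)
 *		pr_debug("bind to ifindex %d failed: %d\n", ifindex, err);
 *
 * Passing lock_sk == false is for callers that already hold the socket lock.
 */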
 666
 667static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 668{
 669        int ret = -ENOPROTOOPT;
 670#ifdef CONFIG_NETDEVICES
 671        struct net *net = sock_net(sk);
 672        char devname[IFNAMSIZ];
 673        int index;
 674
 675        ret = -EINVAL;
 676        if (optlen < 0)
 677                goto out;
 678
 679        /* Bind this socket to a particular device like "eth0",
 680         * as specified in the passed interface name. If the
 681         * name is "" or the option length is zero the socket
 682         * is not bound.
 683         */
 684        if (optlen > IFNAMSIZ - 1)
 685                optlen = IFNAMSIZ - 1;
 686        memset(devname, 0, sizeof(devname));
 687
 688        ret = -EFAULT;
 689        if (copy_from_sockptr(devname, optval, optlen))
 690                goto out;
 691
 692        index = 0;
 693        if (devname[0] != '\0') {
 694                struct net_device *dev;
 695
 696                rcu_read_lock();
 697                dev = dev_get_by_name_rcu(net, devname);
 698                if (dev)
 699                        index = dev->ifindex;
 700                rcu_read_unlock();
 701                ret = -ENODEV;
 702                if (!dev)
 703                        goto out;
 704        }
 705
 706        return sock_bindtoindex(sk, index, true);
 707out:
 708#endif
 709
 710        return ret;
 711}
 712
 713static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 714                                int __user *optlen, int len)
 715{
 716        int ret = -ENOPROTOOPT;
 717#ifdef CONFIG_NETDEVICES
 718        int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 719        struct net *net = sock_net(sk);
 720        char devname[IFNAMSIZ];
 721
 722        if (bound_dev_if == 0) {
 723                len = 0;
 724                goto zero;
 725        }
 726
 727        ret = -EINVAL;
 728        if (len < IFNAMSIZ)
 729                goto out;
 730
 731        ret = netdev_get_name(net, devname, bound_dev_if);
 732        if (ret)
 733                goto out;
 734
 735        len = strlen(devname) + 1;
 736
 737        ret = -EFAULT;
 738        if (copy_to_user(optval, devname, len))
 739                goto out;
 740
 741zero:
 742        ret = -EFAULT;
 743        if (put_user(len, optlen))
 744                goto out;
 745
 746        ret = 0;
 747
 748out:
 749#endif
 750
 751        return ret;
 752}
 753
 754bool sk_mc_loop(struct sock *sk)
 755{
 756        if (dev_recursion_level())
 757                return false;
 758        if (!sk)
 759                return true;
 760        switch (sk->sk_family) {
 761        case AF_INET:
 762                return inet_sk(sk)->mc_loop;
 763#if IS_ENABLED(CONFIG_IPV6)
 764        case AF_INET6:
 765                return inet6_sk(sk)->mc_loop;
 766#endif
 767        }
 768        WARN_ON_ONCE(1);
 769        return true;
 770}
 771EXPORT_SYMBOL(sk_mc_loop);
 772
 773void sock_set_reuseaddr(struct sock *sk)
 774{
 775        lock_sock(sk);
 776        sk->sk_reuse = SK_CAN_REUSE;
 777        release_sock(sk);
 778}
 779EXPORT_SYMBOL(sock_set_reuseaddr);
 780
 781void sock_set_reuseport(struct sock *sk)
 782{
 783        lock_sock(sk);
 784        sk->sk_reuseport = true;
 785        release_sock(sk);
 786}
 787EXPORT_SYMBOL(sock_set_reuseport);
 788
 789void sock_no_linger(struct sock *sk)
 790{
 791        lock_sock(sk);
 792        sk->sk_lingertime = 0;
 793        sock_set_flag(sk, SOCK_LINGER);
 794        release_sock(sk);
 795}
 796EXPORT_SYMBOL(sock_no_linger);
 797
 798void sock_set_priority(struct sock *sk, u32 priority)
 799{
 800        lock_sock(sk);
 801        sk->sk_priority = priority;
 802        release_sock(sk);
 803}
 804EXPORT_SYMBOL(sock_set_priority);
 805
 806void sock_set_sndtimeo(struct sock *sk, s64 secs)
 807{
 808        lock_sock(sk);
 809        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 810                sk->sk_sndtimeo = secs * HZ;
 811        else
 812                sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 813        release_sock(sk);
 814}
 815EXPORT_SYMBOL(sock_set_sndtimeo);
 816
 817static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 818{
 819        if (val)  {
 820                sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 821                sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 822                sock_set_flag(sk, SOCK_RCVTSTAMP);
 823                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 824        } else {
 825                sock_reset_flag(sk, SOCK_RCVTSTAMP);
 826                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827        }
 828}
 829
 830void sock_enable_timestamps(struct sock *sk)
 831{
 832        lock_sock(sk);
 833        __sock_set_timestamps(sk, true, false, true);
 834        release_sock(sk);
 835}
 836EXPORT_SYMBOL(sock_enable_timestamps);
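
/*
 * Minimal userspace sketch (added for illustration): the in-kernel helper
 * above corresponds to enabling nanosecond receive timestamps from userspace;
 * each received datagram then carries an SCM_TIMESTAMPNS control message that
 * can be read with recvmsg().
 *
 *	int one = 1;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS, &one, sizeof(one)) < 0)
 *		perror("setsockopt(SO_TIMESTAMPNS)");
 */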
 837
 838void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 839{
 840        switch (optname) {
 841        case SO_TIMESTAMP_OLD:
 842                __sock_set_timestamps(sk, valbool, false, false);
 843                break;
 844        case SO_TIMESTAMP_NEW:
 845                __sock_set_timestamps(sk, valbool, true, false);
 846                break;
 847        case SO_TIMESTAMPNS_OLD:
 848                __sock_set_timestamps(sk, valbool, false, true);
 849                break;
 850        case SO_TIMESTAMPNS_NEW:
 851                __sock_set_timestamps(sk, valbool, true, true);
 852                break;
 853        }
 854}
 855
 856static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 857{
 858        struct net *net = sock_net(sk);
 859        struct net_device *dev = NULL;
 860        bool match = false;
 861        int *vclock_index;
 862        int i, num;
 863
 864        if (sk->sk_bound_dev_if)
 865                dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 866
 867        if (!dev) {
  868                pr_err("%s: socket is not bound to a device\n", __func__);
 869                return -EOPNOTSUPP;
 870        }
 871
 872        num = ethtool_get_phc_vclocks(dev, &vclock_index);
 873        dev_put(dev);
 874
 875        for (i = 0; i < num; i++) {
 876                if (*(vclock_index + i) == phc_index) {
 877                        match = true;
 878                        break;
 879                }
 880        }
 881
 882        if (num > 0)
 883                kfree(vclock_index);
 884
 885        if (!match)
 886                return -EINVAL;
 887
 888        sk->sk_bind_phc = phc_index;
 889
 890        return 0;
 891}
 892
 893int sock_set_timestamping(struct sock *sk, int optname,
 894                          struct so_timestamping timestamping)
 895{
 896        int val = timestamping.flags;
 897        int ret;
 898
 899        if (val & ~SOF_TIMESTAMPING_MASK)
 900                return -EINVAL;
 901
 902        if (val & SOF_TIMESTAMPING_OPT_ID &&
 903            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 904                if (sk_is_tcp(sk)) {
 905                        if ((1 << sk->sk_state) &
 906                            (TCPF_CLOSE | TCPF_LISTEN))
 907                                return -EINVAL;
 908                        atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 909                } else {
 910                        atomic_set(&sk->sk_tskey, 0);
 911                }
 912        }
 913
 914        if (val & SOF_TIMESTAMPING_OPT_STATS &&
 915            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 916                return -EINVAL;
 917
 918        if (val & SOF_TIMESTAMPING_BIND_PHC) {
 919                ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 920                if (ret)
 921                        return ret;
 922        }
 923
 924        sk->sk_tsflags = val;
 925        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 926
 927        if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 928                sock_enable_timestamp(sk,
 929                                      SOCK_TIMESTAMPING_RX_SOFTWARE);
 930        else
 931                sock_disable_timestamp(sk,
 932                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 933        return 0;
 934}
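
/*
 * Minimal userspace sketch (added for illustration): the SO_TIMESTAMPING_*
 * cases in sock_setsockopt() below accept either a plain int of
 * SOF_TIMESTAMPING_* flags or a struct so_timestamping, which additionally
 * selects a PHC vclock via sock_timestamping_bind_phc(). For BIND_PHC the
 * socket must already be bound to a device, and the vclock index would come
 * from ethtool.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_RX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = phc_index,
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts)) < 0)
 *		perror("setsockopt(SO_TIMESTAMPING)");
 */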
 935
 936void sock_set_keepalive(struct sock *sk)
 937{
 938        lock_sock(sk);
 939        if (sk->sk_prot->keepalive)
 940                sk->sk_prot->keepalive(sk, true);
 941        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 942        release_sock(sk);
 943}
 944EXPORT_SYMBOL(sock_set_keepalive);
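
/*
 * Illustrative sketch (added for illustration): the sock_set_*() helpers in
 * this block let in-kernel users set common options on their sockets without
 * a userspace-style setsockopt() call, e.g. for a kernel TCP client:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_set_keepalive(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);
 */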
 945
 946static void __sock_set_rcvbuf(struct sock *sk, int val)
 947{
 948        /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 949         * as a negative value.
 950         */
 951        val = min_t(int, val, INT_MAX / 2);
 952        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 953
 954        /* We double it on the way in to account for "struct sk_buff" etc.
 955         * overhead.   Applications assume that the SO_RCVBUF setting they make
 956         * will allow that much actual data to be received on that socket.
 957         *
 958         * Applications are unaware that "struct sk_buff" and other overheads
 959         * allocate from the receive buffer during socket buffer allocation.
 960         *
 961         * And after considering the possible alternatives, returning the value
 962         * we actually used in getsockopt is the most desirable behavior.
 963         */
 964        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 965}
 966
 967void sock_set_rcvbuf(struct sock *sk, int val)
 968{
 969        lock_sock(sk);
 970        __sock_set_rcvbuf(sk, val);
 971        release_sock(sk);
 972}
 973EXPORT_SYMBOL(sock_set_rcvbuf);
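
/*
 * Worked example (added for illustration): because __sock_set_rcvbuf()
 * doubles the requested value to cover struct sk_buff and bookkeeping
 * overhead, a caller asking for 65536 bytes ends up with sk_rcvbuf == 131072,
 * and that doubled figure is what SO_RCVBUF later reports via getsockopt().
 * Requests below SOCK_MIN_RCVBUF / 2 are rounded up to SOCK_MIN_RCVBUF.
 */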
 974
 975static void __sock_set_mark(struct sock *sk, u32 val)
 976{
 977        if (val != sk->sk_mark) {
 978                sk->sk_mark = val;
 979                sk_dst_reset(sk);
 980        }
 981}
 982
 983void sock_set_mark(struct sock *sk, u32 val)
 984{
 985        lock_sock(sk);
 986        __sock_set_mark(sk, val);
 987        release_sock(sk);
 988}
 989EXPORT_SYMBOL(sock_set_mark);
 990
 991static void sock_release_reserved_memory(struct sock *sk, int bytes)
 992{
 993        /* Round down bytes to multiple of pages */
 994        bytes &= ~(SK_MEM_QUANTUM - 1);
 995
 996        WARN_ON(bytes > sk->sk_reserved_mem);
 997        sk->sk_reserved_mem -= bytes;
 998        sk_mem_reclaim(sk);
 999}
1000
1001static int sock_reserve_memory(struct sock *sk, int bytes)
1002{
1003        long allocated;
1004        bool charged;
1005        int pages;
1006
1007        if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1008                return -EOPNOTSUPP;
1009
1010        if (!bytes)
1011                return 0;
1012
1013        pages = sk_mem_pages(bytes);
1014
1015        /* pre-charge to memcg */
1016        charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1017                                          GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1018        if (!charged)
1019                return -ENOMEM;
1020
1021        /* pre-charge to forward_alloc */
1022        allocated = sk_memory_allocated_add(sk, pages);
1023        /* If the system goes into memory pressure with this
1024         * precharge, give up and return error.
1025         */
1026        if (allocated > sk_prot_mem_limits(sk, 1)) {
1027                sk_memory_allocated_sub(sk, pages);
1028                mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1029                return -ENOMEM;
1030        }
1031        sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1032
1033        sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1034
1035        return 0;
1036}
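
/*
 * Minimal userspace sketch (added for illustration): SO_RESERVE_MEM (handled
 * in sock_setsockopt() below) pre-charges forward_alloc and the socket's
 * memcg so later allocations are less likely to fail under memory pressure.
 * It only works for sockets accounted to a memory cgroup.
 *
 *	int bytes = 1 << 20;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes)) < 0)
 *		perror("setsockopt(SO_RESERVE_MEM)");
 */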
1037
1038/*
1039 *      This is meant for all protocols to use and covers goings on
1040 *      at the socket level. Everything here is generic.
1041 */
1042
1043int sock_setsockopt(struct socket *sock, int level, int optname,
1044                    sockptr_t optval, unsigned int optlen)
1045{
1046        struct so_timestamping timestamping;
1047        struct sock_txtime sk_txtime;
1048        struct sock *sk = sock->sk;
1049        int val;
1050        int valbool;
1051        struct linger ling;
1052        int ret = 0;
1053
1054        /*
1055         *      Options without arguments
1056         */
1057
1058        if (optname == SO_BINDTODEVICE)
1059                return sock_setbindtodevice(sk, optval, optlen);
1060
1061        if (optlen < sizeof(int))
1062                return -EINVAL;
1063
1064        if (copy_from_sockptr(&val, optval, sizeof(val)))
1065                return -EFAULT;
1066
1067        valbool = val ? 1 : 0;
1068
1069        lock_sock(sk);
1070
1071        switch (optname) {
1072        case SO_DEBUG:
1073                if (val && !capable(CAP_NET_ADMIN))
1074                        ret = -EACCES;
1075                else
1076                        sock_valbool_flag(sk, SOCK_DBG, valbool);
1077                break;
1078        case SO_REUSEADDR:
1079                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1080                break;
1081        case SO_REUSEPORT:
1082                sk->sk_reuseport = valbool;
1083                break;
1084        case SO_TYPE:
1085        case SO_PROTOCOL:
1086        case SO_DOMAIN:
1087        case SO_ERROR:
1088                ret = -ENOPROTOOPT;
1089                break;
1090        case SO_DONTROUTE:
1091                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1092                sk_dst_reset(sk);
1093                break;
1094        case SO_BROADCAST:
1095                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1096                break;
1097        case SO_SNDBUF:
 1098                /* Don't return an error here; BSD doesn't, and if you think
 1099                 * about it, this is right. Otherwise apps would have to
 1100                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 1101                 * are treated as hints in BSD.
1102                 */
1103                val = min_t(u32, val, sysctl_wmem_max);
1104set_sndbuf:
1105                /* Ensure val * 2 fits into an int, to prevent max_t()
1106                 * from treating it as a negative value.
1107                 */
1108                val = min_t(int, val, INT_MAX / 2);
1109                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1110                WRITE_ONCE(sk->sk_sndbuf,
1111                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
1112                /* Wake up sending tasks if we upped the value. */
1113                sk->sk_write_space(sk);
1114                break;
1115
1116        case SO_SNDBUFFORCE:
1117                if (!capable(CAP_NET_ADMIN)) {
1118                        ret = -EPERM;
1119                        break;
1120                }
1121
1122                /* No negative values (to prevent underflow, as val will be
1123                 * multiplied by 2).
1124                 */
1125                if (val < 0)
1126                        val = 0;
1127                goto set_sndbuf;
1128
1129        case SO_RCVBUF:
 1130                /* Don't return an error here; BSD doesn't, and if you think
 1131                 * about it, this is right. Otherwise apps would have to
 1132                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 1133                 * are treated as hints in BSD.
1134                 */
1135                __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1136                break;
1137
1138        case SO_RCVBUFFORCE:
1139                if (!capable(CAP_NET_ADMIN)) {
1140                        ret = -EPERM;
1141                        break;
1142                }
1143
1144                /* No negative values (to prevent underflow, as val will be
1145                 * multiplied by 2).
1146                 */
1147                __sock_set_rcvbuf(sk, max(val, 0));
1148                break;
1149
1150        case SO_KEEPALIVE:
1151                if (sk->sk_prot->keepalive)
1152                        sk->sk_prot->keepalive(sk, valbool);
1153                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1154                break;
1155
1156        case SO_OOBINLINE:
1157                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1158                break;
1159
1160        case SO_NO_CHECK:
1161                sk->sk_no_check_tx = valbool;
1162                break;
1163
1164        case SO_PRIORITY:
1165                if ((val >= 0 && val <= 6) ||
1166                    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1167                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1168                        sk->sk_priority = val;
1169                else
1170                        ret = -EPERM;
1171                break;
1172
1173        case SO_LINGER:
1174                if (optlen < sizeof(ling)) {
1175                        ret = -EINVAL;  /* 1003.1g */
1176                        break;
1177                }
1178                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1179                        ret = -EFAULT;
1180                        break;
1181                }
1182                if (!ling.l_onoff)
1183                        sock_reset_flag(sk, SOCK_LINGER);
1184                else {
1185#if (BITS_PER_LONG == 32)
1186                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1187                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1188                        else
1189#endif
1190                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1191                        sock_set_flag(sk, SOCK_LINGER);
1192                }
1193                break;
1194
1195        case SO_BSDCOMPAT:
1196                break;
1197
1198        case SO_PASSCRED:
1199                if (valbool)
1200                        set_bit(SOCK_PASSCRED, &sock->flags);
1201                else
1202                        clear_bit(SOCK_PASSCRED, &sock->flags);
1203                break;
1204
1205        case SO_TIMESTAMP_OLD:
1206        case SO_TIMESTAMP_NEW:
1207        case SO_TIMESTAMPNS_OLD:
1208        case SO_TIMESTAMPNS_NEW:
1209                sock_set_timestamp(sk, optname, valbool);
1210                break;
1211
1212        case SO_TIMESTAMPING_NEW:
1213        case SO_TIMESTAMPING_OLD:
1214                if (optlen == sizeof(timestamping)) {
1215                        if (copy_from_sockptr(&timestamping, optval,
1216                                              sizeof(timestamping))) {
1217                                ret = -EFAULT;
1218                                break;
1219                        }
1220                } else {
1221                        memset(&timestamping, 0, sizeof(timestamping));
1222                        timestamping.flags = val;
1223                }
1224                ret = sock_set_timestamping(sk, optname, timestamping);
1225                break;
1226
1227        case SO_RCVLOWAT:
1228                if (val < 0)
1229                        val = INT_MAX;
1230                if (sock->ops->set_rcvlowat)
1231                        ret = sock->ops->set_rcvlowat(sk, val);
1232                else
1233                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1234                break;
1235
1236        case SO_RCVTIMEO_OLD:
1237        case SO_RCVTIMEO_NEW:
1238                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1239                                       optlen, optname == SO_RCVTIMEO_OLD);
1240                break;
1241
1242        case SO_SNDTIMEO_OLD:
1243        case SO_SNDTIMEO_NEW:
1244                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1245                                       optlen, optname == SO_SNDTIMEO_OLD);
1246                break;
1247
1248        case SO_ATTACH_FILTER: {
1249                struct sock_fprog fprog;
1250
1251                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1252                if (!ret)
1253                        ret = sk_attach_filter(&fprog, sk);
1254                break;
1255        }
1256        case SO_ATTACH_BPF:
1257                ret = -EINVAL;
1258                if (optlen == sizeof(u32)) {
1259                        u32 ufd;
1260
1261                        ret = -EFAULT;
1262                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1263                                break;
1264
1265                        ret = sk_attach_bpf(ufd, sk);
1266                }
1267                break;
1268
1269        case SO_ATTACH_REUSEPORT_CBPF: {
1270                struct sock_fprog fprog;
1271
1272                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1273                if (!ret)
1274                        ret = sk_reuseport_attach_filter(&fprog, sk);
1275                break;
1276        }
1277        case SO_ATTACH_REUSEPORT_EBPF:
1278                ret = -EINVAL;
1279                if (optlen == sizeof(u32)) {
1280                        u32 ufd;
1281
1282                        ret = -EFAULT;
1283                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1284                                break;
1285
1286                        ret = sk_reuseport_attach_bpf(ufd, sk);
1287                }
1288                break;
1289
1290        case SO_DETACH_REUSEPORT_BPF:
1291                ret = reuseport_detach_prog(sk);
1292                break;
1293
1294        case SO_DETACH_FILTER:
1295                ret = sk_detach_filter(sk);
1296                break;
1297
1298        case SO_LOCK_FILTER:
1299                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1300                        ret = -EPERM;
1301                else
1302                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1303                break;
1304
1305        case SO_PASSSEC:
1306                if (valbool)
1307                        set_bit(SOCK_PASSSEC, &sock->flags);
1308                else
1309                        clear_bit(SOCK_PASSSEC, &sock->flags);
1310                break;
1311        case SO_MARK:
1312                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1313                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1314                        ret = -EPERM;
1315                        break;
1316                }
1317
1318                __sock_set_mark(sk, val);
1319                break;
1320        case SO_RCVMARK:
1321                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1322                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1323                        ret = -EPERM;
1324                        break;
1325                }
1326
1327                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1328                break;
1329
1330        case SO_RXQ_OVFL:
1331                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1332                break;
1333
1334        case SO_WIFI_STATUS:
1335                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1336                break;
1337
1338        case SO_PEEK_OFF:
1339                if (sock->ops->set_peek_off)
1340                        ret = sock->ops->set_peek_off(sk, val);
1341                else
1342                        ret = -EOPNOTSUPP;
1343                break;
1344
1345        case SO_NOFCS:
1346                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1347                break;
1348
1349        case SO_SELECT_ERR_QUEUE:
1350                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1351                break;
1352
1353#ifdef CONFIG_NET_RX_BUSY_POLL
1354        case SO_BUSY_POLL:
1355                /* allow unprivileged users to decrease the value */
1356                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1357                        ret = -EPERM;
1358                else {
1359                        if (val < 0)
1360                                ret = -EINVAL;
1361                        else
1362                                WRITE_ONCE(sk->sk_ll_usec, val);
1363                }
1364                break;
1365        case SO_PREFER_BUSY_POLL:
1366                if (valbool && !capable(CAP_NET_ADMIN))
1367                        ret = -EPERM;
1368                else
1369                        WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1370                break;
1371        case SO_BUSY_POLL_BUDGET:
1372                if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1373                        ret = -EPERM;
1374                } else {
1375                        if (val < 0 || val > U16_MAX)
1376                                ret = -EINVAL;
1377                        else
1378                                WRITE_ONCE(sk->sk_busy_poll_budget, val);
1379                }
1380                break;
1381#endif
1382
1383        case SO_MAX_PACING_RATE:
1384                {
1385                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1386
1387                if (sizeof(ulval) != sizeof(val) &&
1388                    optlen >= sizeof(ulval) &&
1389                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1390                        ret = -EFAULT;
1391                        break;
1392                }
1393                if (ulval != ~0UL)
1394                        cmpxchg(&sk->sk_pacing_status,
1395                                SK_PACING_NONE,
1396                                SK_PACING_NEEDED);
1397                sk->sk_max_pacing_rate = ulval;
1398                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1399                break;
1400                }
1401        case SO_INCOMING_CPU:
1402                WRITE_ONCE(sk->sk_incoming_cpu, val);
1403                break;
1404
1405        case SO_CNX_ADVICE:
1406                if (val == 1)
1407                        dst_negative_advice(sk);
1408                break;
1409
1410        case SO_ZEROCOPY:
1411                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1412                        if (!(sk_is_tcp(sk) ||
1413                              (sk->sk_type == SOCK_DGRAM &&
1414                               sk->sk_protocol == IPPROTO_UDP)))
1415                                ret = -EOPNOTSUPP;
1416                } else if (sk->sk_family != PF_RDS) {
1417                        ret = -EOPNOTSUPP;
1418                }
1419                if (!ret) {
1420                        if (val < 0 || val > 1)
1421                                ret = -EINVAL;
1422                        else
1423                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1424                }
1425                break;
1426
1427        case SO_TXTIME:
1428                if (optlen != sizeof(struct sock_txtime)) {
1429                        ret = -EINVAL;
1430                        break;
1431                } else if (copy_from_sockptr(&sk_txtime, optval,
1432                           sizeof(struct sock_txtime))) {
1433                        ret = -EFAULT;
1434                        break;
1435                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1436                        ret = -EINVAL;
1437                        break;
1438                }
1439                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
 1440                 * scheduler has enough safeguards.
1441                 */
1442                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1443                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1444                        ret = -EPERM;
1445                        break;
1446                }
1447                sock_valbool_flag(sk, SOCK_TXTIME, true);
1448                sk->sk_clockid = sk_txtime.clockid;
1449                sk->sk_txtime_deadline_mode =
1450                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1451                sk->sk_txtime_report_errors =
1452                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1453                break;
1454
1455        case SO_BINDTOIFINDEX:
1456                ret = sock_bindtoindex_locked(sk, val);
1457                break;
1458
1459        case SO_BUF_LOCK:
1460                if (val & ~SOCK_BUF_LOCK_MASK) {
1461                        ret = -EINVAL;
1462                        break;
1463                }
1464                sk->sk_userlocks = val | (sk->sk_userlocks &
1465                                          ~SOCK_BUF_LOCK_MASK);
1466                break;
1467
1468        case SO_RESERVE_MEM:
1469        {
1470                int delta;
1471
1472                if (val < 0) {
1473                        ret = -EINVAL;
1474                        break;
1475                }
1476
1477                delta = val - sk->sk_reserved_mem;
1478                if (delta < 0)
1479                        sock_release_reserved_memory(sk, -delta);
1480                else
1481                        ret = sock_reserve_memory(sk, delta);
1482                break;
1483        }
1484
1485        case SO_TXREHASH:
1486                if (val < -1 || val > 1) {
1487                        ret = -EINVAL;
1488                        break;
1489                }
1490                /* Paired with READ_ONCE() in tcp_rtx_synack() */
1491                WRITE_ONCE(sk->sk_txrehash, (u8)val);
1492                break;
1493
1494        default:
1495                ret = -ENOPROTOOPT;
1496                break;
1497        }
1498        release_sock(sk);
1499        return ret;
1500}
1501EXPORT_SYMBOL(sock_setsockopt);
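
/*
 * Minimal userspace sketch (added for illustration) exercising two of the
 * cases above: SO_MARK needs CAP_NET_RAW or CAP_NET_ADMIN in the socket's
 * network namespace, while SO_BINDTOIFINDEX takes the same path as
 * SO_BINDTODEVICE but with a numeric interface index (a real program would
 * obtain it with if_nametoindex(); the value below is arbitrary).
 *
 *	unsigned int mark = 0x2a;
 *	int ifindex = 2;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
 *		perror("setsockopt(SO_MARK)");
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX, &ifindex,
 *		       sizeof(ifindex)) < 0)
 *		perror("setsockopt(SO_BINDTOIFINDEX)");
 */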
1502
1503static const struct cred *sk_get_peer_cred(struct sock *sk)
1504{
1505        const struct cred *cred;
1506
1507        spin_lock(&sk->sk_peer_lock);
1508        cred = get_cred(sk->sk_peer_cred);
1509        spin_unlock(&sk->sk_peer_lock);
1510
1511        return cred;
1512}
1513
1514static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1515                          struct ucred *ucred)
1516{
1517        ucred->pid = pid_vnr(pid);
1518        ucred->uid = ucred->gid = -1;
1519        if (cred) {
1520                struct user_namespace *current_ns = current_user_ns();
1521
1522                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1523                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1524        }
1525}
1526
1527static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1528{
1529        struct user_namespace *user_ns = current_user_ns();
1530        int i;
1531
1532        for (i = 0; i < src->ngroups; i++)
1533                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1534                        return -EFAULT;
1535
1536        return 0;
1537}
1538
1539int sock_getsockopt(struct socket *sock, int level, int optname,
1540                    char __user *optval, int __user *optlen)
1541{
1542        struct sock *sk = sock->sk;
1543
1544        union {
1545                int val;
1546                u64 val64;
1547                unsigned long ulval;
1548                struct linger ling;
1549                struct old_timeval32 tm32;
1550                struct __kernel_old_timeval tm;
1551                struct  __kernel_sock_timeval stm;
1552                struct sock_txtime txtime;
1553                struct so_timestamping timestamping;
1554        } v;
1555
1556        int lv = sizeof(int);
1557        int len;
1558
1559        if (get_user(len, optlen))
1560                return -EFAULT;
1561        if (len < 0)
1562                return -EINVAL;
1563
1564        memset(&v, 0, sizeof(v));
1565
1566        switch (optname) {
1567        case SO_DEBUG:
1568                v.val = sock_flag(sk, SOCK_DBG);
1569                break;
1570
1571        case SO_DONTROUTE:
1572                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1573                break;
1574
1575        case SO_BROADCAST:
1576                v.val = sock_flag(sk, SOCK_BROADCAST);
1577                break;
1578
1579        case SO_SNDBUF:
1580                v.val = sk->sk_sndbuf;
1581                break;
1582
1583        case SO_RCVBUF:
1584                v.val = sk->sk_rcvbuf;
1585                break;
1586
1587        case SO_REUSEADDR:
1588                v.val = sk->sk_reuse;
1589                break;
1590
1591        case SO_REUSEPORT:
1592                v.val = sk->sk_reuseport;
1593                break;
1594
1595        case SO_KEEPALIVE:
1596                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1597                break;
1598
1599        case SO_TYPE:
1600                v.val = sk->sk_type;
1601                break;
1602
1603        case SO_PROTOCOL:
1604                v.val = sk->sk_protocol;
1605                break;
1606
1607        case SO_DOMAIN:
1608                v.val = sk->sk_family;
1609                break;
1610
1611        case SO_ERROR:
1612                v.val = -sock_error(sk);
1613                if (v.val == 0)
1614                        v.val = xchg(&sk->sk_err_soft, 0);
1615                break;
1616
1617        case SO_OOBINLINE:
1618                v.val = sock_flag(sk, SOCK_URGINLINE);
1619                break;
1620
1621        case SO_NO_CHECK:
1622                v.val = sk->sk_no_check_tx;
1623                break;
1624
1625        case SO_PRIORITY:
1626                v.val = sk->sk_priority;
1627                break;
1628
1629        case SO_LINGER:
1630                lv              = sizeof(v.ling);
1631                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1632                v.ling.l_linger = sk->sk_lingertime / HZ;
1633                break;
1634
1635        case SO_BSDCOMPAT:
1636                break;
1637
1638        case SO_TIMESTAMP_OLD:
1639                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1640                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1641                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1642                break;
1643
1644        case SO_TIMESTAMPNS_OLD:
1645                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1646                break;
1647
1648        case SO_TIMESTAMP_NEW:
1649                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1650                break;
1651
1652        case SO_TIMESTAMPNS_NEW:
1653                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1654                break;
1655
1656        case SO_TIMESTAMPING_OLD:
1657                lv = sizeof(v.timestamping);
1658                v.timestamping.flags = sk->sk_tsflags;
1659                v.timestamping.bind_phc = sk->sk_bind_phc;
1660                break;
1661
1662        case SO_RCVTIMEO_OLD:
1663        case SO_RCVTIMEO_NEW:
1664                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1665                break;
1666
1667        case SO_SNDTIMEO_OLD:
1668        case SO_SNDTIMEO_NEW:
1669                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1670                break;
1671
1672        case SO_RCVLOWAT:
1673                v.val = sk->sk_rcvlowat;
1674                break;
1675
1676        case SO_SNDLOWAT:
1677                v.val = 1;
1678                break;
1679
1680        case SO_PASSCRED:
1681                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1682                break;
1683
1684        case SO_PEERCRED:
1685        {
1686                struct ucred peercred;
1687                if (len > sizeof(peercred))
1688                        len = sizeof(peercred);
1689
1690                spin_lock(&sk->sk_peer_lock);
1691                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1692                spin_unlock(&sk->sk_peer_lock);
1693
1694                if (copy_to_user(optval, &peercred, len))
1695                        return -EFAULT;
1696                goto lenout;
1697        }
1698
1699        case SO_PEERGROUPS:
1700        {
1701                const struct cred *cred;
1702                int ret, n;
1703
1704                cred = sk_get_peer_cred(sk);
1705                if (!cred)
1706                        return -ENODATA;
1707
1708                n = cred->group_info->ngroups;
1709                if (len < n * sizeof(gid_t)) {
1710                        len = n * sizeof(gid_t);
1711                        put_cred(cred);
1712                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1713                }
1714                len = n * sizeof(gid_t);
1715
1716                ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1717                put_cred(cred);
1718                if (ret)
1719                        return ret;
1720                goto lenout;
1721        }
1722
1723        case SO_PEERNAME:
1724        {
1725                char address[128];
1726
1727                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1728                if (lv < 0)
1729                        return -ENOTCONN;
1730                if (lv < len)
1731                        return -EINVAL;
1732                if (copy_to_user(optval, address, len))
1733                        return -EFAULT;
1734                goto lenout;
1735        }
1736
1737        /* Dubious BSD thing... Probably nobody even uses it, but
1738         * the UNIX standard wants it for whatever reason... -DaveM
1739         */
1740        case SO_ACCEPTCONN:
1741                v.val = sk->sk_state == TCP_LISTEN;
1742                break;
1743
1744        case SO_PASSSEC:
1745                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1746                break;
1747
1748        case SO_PEERSEC:
1749                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1750
1751        case SO_MARK:
1752                v.val = sk->sk_mark;
1753                break;
1754
1755        case SO_RCVMARK:
1756                v.val = sock_flag(sk, SOCK_RCVMARK);
1757                break;
1758
1759        case SO_RXQ_OVFL:
1760                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1761                break;
1762
1763        case SO_WIFI_STATUS:
1764                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1765                break;
1766
1767        case SO_PEEK_OFF:
1768                if (!sock->ops->set_peek_off)
1769                        return -EOPNOTSUPP;
1770
1771                v.val = sk->sk_peek_off;
1772                break;
1773        case SO_NOFCS:
1774                v.val = sock_flag(sk, SOCK_NOFCS);
1775                break;
1776
1777        case SO_BINDTODEVICE:
1778                return sock_getbindtodevice(sk, optval, optlen, len);
1779
1780        case SO_GET_FILTER:
1781                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1782                if (len < 0)
1783                        return len;
1784
1785                goto lenout;
1786
1787        case SO_LOCK_FILTER:
1788                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1789                break;
1790
1791        case SO_BPF_EXTENSIONS:
1792                v.val = bpf_tell_extensions();
1793                break;
1794
1795        case SO_SELECT_ERR_QUEUE:
1796                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1797                break;
1798
1799#ifdef CONFIG_NET_RX_BUSY_POLL
1800        case SO_BUSY_POLL:
1801                v.val = sk->sk_ll_usec;
1802                break;
1803        case SO_PREFER_BUSY_POLL:
1804                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1805                break;
1806#endif
1807
1808        case SO_MAX_PACING_RATE:
1809                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1810                        lv = sizeof(v.ulval);
1811                        v.ulval = sk->sk_max_pacing_rate;
1812                } else {
1813                        /* 32bit version */
1814                        v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1815                }
1816                break;
1817
1818        case SO_INCOMING_CPU:
1819                v.val = READ_ONCE(sk->sk_incoming_cpu);
1820                break;
1821
1822        case SO_MEMINFO:
1823        {
1824                u32 meminfo[SK_MEMINFO_VARS];
1825
1826                sk_get_meminfo(sk, meminfo);
1827
1828                len = min_t(unsigned int, len, sizeof(meminfo));
1829                if (copy_to_user(optval, &meminfo, len))
1830                        return -EFAULT;
1831
1832                goto lenout;
1833        }
1834
1835#ifdef CONFIG_NET_RX_BUSY_POLL
1836        case SO_INCOMING_NAPI_ID:
1837                v.val = READ_ONCE(sk->sk_napi_id);
1838
1839                /* aggregate non-NAPI IDs down to 0 */
1840                if (v.val < MIN_NAPI_ID)
1841                        v.val = 0;
1842
1843                break;
1844#endif
1845
1846        case SO_COOKIE:
1847                lv = sizeof(u64);
1848                if (len < lv)
1849                        return -EINVAL;
1850                v.val64 = sock_gen_cookie(sk);
1851                break;
1852
1853        case SO_ZEROCOPY:
1854                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1855                break;
1856
1857        case SO_TXTIME:
1858                lv = sizeof(v.txtime);
1859                v.txtime.clockid = sk->sk_clockid;
1860                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1861                                  SOF_TXTIME_DEADLINE_MODE : 0;
1862                v.txtime.flags |= sk->sk_txtime_report_errors ?
1863                                  SOF_TXTIME_REPORT_ERRORS : 0;
1864                break;
1865
1866        case SO_BINDTOIFINDEX:
1867                v.val = READ_ONCE(sk->sk_bound_dev_if);
1868                break;
1869
1870        case SO_NETNS_COOKIE:
1871                lv = sizeof(u64);
1872                if (len != lv)
1873                        return -EINVAL;
1874                v.val64 = sock_net(sk)->net_cookie;
1875                break;
1876
1877        case SO_BUF_LOCK:
1878                v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1879                break;
1880
1881        case SO_RESERVE_MEM:
1882                v.val = sk->sk_reserved_mem;
1883                break;
1884
1885        case SO_TXREHASH:
1886                v.val = sk->sk_txrehash;
1887                break;
1888
1889        default:
1890                /* We implement the SO_SNDLOWAT etc to not be settable
1891                 * (1003.1g 7).
1892                 */
1893                return -ENOPROTOOPT;
1894        }
1895
1896        if (len > lv)
1897                len = lv;
1898        if (copy_to_user(optval, &v, len))
1899                return -EFAULT;
1900lenout:
1901        if (put_user(len, optlen))
1902                return -EFAULT;
1903        return 0;
1904}
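
/* Illustrative userspace sketch (not part of this file): reading one of the
 * fixed-size options served above, here SO_MEMINFO, which fills an array of
 * SK_MEMINFO_VARS u32 counters via sk_get_meminfo(). The fd is assumed.
 *
 *	__u32 meminfo[SK_MEMINFO_VARS];
 *	socklen_t len = sizeof(meminfo);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, meminfo, &len) == 0)
 *		printf("rmem_alloc=%u sndbuf=%u\n",
 *		       meminfo[SK_MEMINFO_RMEM_ALLOC],
 *		       meminfo[SK_MEMINFO_SNDBUF]);
 *
 * As with most options above, the kernel truncates the copy to the caller's
 * length and writes the length actually used back through optlen.
 */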
1905
1906/*
1907 * Initialize an sk_lock.
1908 *
1909 * (We also register the sk_lock with the lock validator.)
1910 */
1911static inline void sock_lock_init(struct sock *sk)
1912{
1913        if (sk->sk_kern_sock)
1914                sock_lock_init_class_and_name(
1915                        sk,
1916                        af_family_kern_slock_key_strings[sk->sk_family],
1917                        af_family_kern_slock_keys + sk->sk_family,
1918                        af_family_kern_key_strings[sk->sk_family],
1919                        af_family_kern_keys + sk->sk_family);
1920        else
1921                sock_lock_init_class_and_name(
1922                        sk,
1923                        af_family_slock_key_strings[sk->sk_family],
1924                        af_family_slock_keys + sk->sk_family,
1925                        af_family_key_strings[sk->sk_family],
1926                        af_family_keys + sk->sk_family);
1927}
1928
1929/*
1930 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1931 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1932 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1933 */
1934static void sock_copy(struct sock *nsk, const struct sock *osk)
1935{
1936        const struct proto *prot = READ_ONCE(osk->sk_prot);
1937#ifdef CONFIG_SECURITY_NETWORK
1938        void *sptr = nsk->sk_security;
1939#endif
1940
1941        /* If we move sk_tx_queue_mapping out of the private section,
1942         * we must check if sk_tx_queue_clear() is called after
1943         * sock_copy() in sk_clone_lock().
1944         */
1945        BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1946                     offsetof(struct sock, sk_dontcopy_begin) ||
1947                     offsetof(struct sock, sk_tx_queue_mapping) >=
1948                     offsetof(struct sock, sk_dontcopy_end));
1949
1950        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1951
1952        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1953               prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1954
1955#ifdef CONFIG_SECURITY_NETWORK
1956        nsk->sk_security = sptr;
1957        security_sk_clone(osk, nsk);
1958#endif
1959}
1960
1961static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1962                int family)
1963{
1964        struct sock *sk;
1965        struct kmem_cache *slab;
1966
1967        slab = prot->slab;
1968        if (slab != NULL) {
1969                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1970                if (!sk)
1971                        return sk;
1972                if (want_init_on_alloc(priority))
1973                        sk_prot_clear_nulls(sk, prot->obj_size);
1974        } else
1975                sk = kmalloc(prot->obj_size, priority);
1976
1977        if (sk != NULL) {
1978                if (security_sk_alloc(sk, family, priority))
1979                        goto out_free;
1980
1981                if (!try_module_get(prot->owner))
1982                        goto out_free_sec;
1983        }
1984
1985        return sk;
1986
1987out_free_sec:
1988        security_sk_free(sk);
1989out_free:
1990        if (slab != NULL)
1991                kmem_cache_free(slab, sk);
1992        else
1993                kfree(sk);
1994        return NULL;
1995}
1996
1997static void sk_prot_free(struct proto *prot, struct sock *sk)
1998{
1999        struct kmem_cache *slab;
2000        struct module *owner;
2001
2002        owner = prot->owner;
2003        slab = prot->slab;
2004
2005        cgroup_sk_free(&sk->sk_cgrp_data);
2006        mem_cgroup_sk_free(sk);
2007        security_sk_free(sk);
2008        if (slab != NULL)
2009                kmem_cache_free(slab, sk);
2010        else
2011                kfree(sk);
2012        module_put(owner);
2013}
2014
2015/**
2016 *      sk_alloc - All socket objects are allocated here
2017 *      @net: the applicable net namespace
2018 *      @family: protocol family
2019 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2020 *      @prot: struct proto associated with this new sock instance
2021 *      @kern: is this to be a kernel socket?
2022 */
2023struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2024                      struct proto *prot, int kern)
2025{
2026        struct sock *sk;
2027
2028        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2029        if (sk) {
2030                sk->sk_family = family;
2031                /*
2032                 * See comment in struct sock definition to understand
2033                 * why we need sk_prot_creator -acme
2034                 */
2035                sk->sk_prot = sk->sk_prot_creator = prot;
2036                sk->sk_kern_sock = kern;
2037                sock_lock_init(sk);
2038                sk->sk_net_refcnt = kern ? 0 : 1;
2039                if (likely(sk->sk_net_refcnt)) {
2040                        get_net_track(net, &sk->ns_tracker, priority);
2041                        sock_inuse_add(net, 1);
2042                }
2043
2044                sock_net_set(sk, net);
2045                refcount_set(&sk->sk_wmem_alloc, 1);
2046
2047                mem_cgroup_sk_alloc(sk);
2048                cgroup_sk_alloc(&sk->sk_cgrp_data);
2049                sock_update_classid(&sk->sk_cgrp_data);
2050                sock_update_netprioidx(&sk->sk_cgrp_data);
2051                sk_tx_queue_clear(sk);
2052        }
2053
2054        return sk;
2055}
2056EXPORT_SYMBOL(sk_alloc);
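
/* Illustrative sketch (not part of this file): the typical shape of a
 * protocol family's ->create() handler around sk_alloc(). "my_family" and
 * "my_proto" are placeholders, not real symbols.
 *
 *	static int my_create(struct net *net, struct socket *sock,
 *			     int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, my_family, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 *
 * sock_init_data() attaches sk to its struct socket and installs the
 * default callbacks defined towards the end of this file.
 */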
2057
2058/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2059 * grace period. This is the case for UDP sockets and TCP listeners.
2060 */
2061static void __sk_destruct(struct rcu_head *head)
2062{
2063        struct sock *sk = container_of(head, struct sock, sk_rcu);
2064        struct sk_filter *filter;
2065
2066        if (sk->sk_destruct)
2067                sk->sk_destruct(sk);
2068
2069        filter = rcu_dereference_check(sk->sk_filter,
2070                                       refcount_read(&sk->sk_wmem_alloc) == 0);
2071        if (filter) {
2072                sk_filter_uncharge(sk, filter);
2073                RCU_INIT_POINTER(sk->sk_filter, NULL);
2074        }
2075
2076        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2077
2078#ifdef CONFIG_BPF_SYSCALL
2079        bpf_sk_storage_free(sk);
2080#endif
2081
2082        if (atomic_read(&sk->sk_omem_alloc))
2083                pr_debug("%s: optmem leakage (%d bytes) detected\n",
2084                         __func__, atomic_read(&sk->sk_omem_alloc));
2085
2086        if (sk->sk_frag.page) {
2087                put_page(sk->sk_frag.page);
2088                sk->sk_frag.page = NULL;
2089        }
2090
2091        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2092        put_cred(sk->sk_peer_cred);
2093        put_pid(sk->sk_peer_pid);
2094
2095        if (likely(sk->sk_net_refcnt))
2096                put_net_track(sock_net(sk), &sk->ns_tracker);
2097        sk_prot_free(sk->sk_prot_creator, sk);
2098}
2099
2100void sk_destruct(struct sock *sk)
2101{
2102        bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2103
2104        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2105                reuseport_detach_sock(sk);
2106                use_call_rcu = true;
2107        }
2108
2109        if (use_call_rcu)
2110                call_rcu(&sk->sk_rcu, __sk_destruct);
2111        else
2112                __sk_destruct(&sk->sk_rcu);
2113}
2114
2115static void __sk_free(struct sock *sk)
2116{
2117        if (likely(sk->sk_net_refcnt))
2118                sock_inuse_add(sock_net(sk), -1);
2119
2120        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2121                sock_diag_broadcast_destroy(sk);
2122        else
2123                sk_destruct(sk);
2124}
2125
2126void sk_free(struct sock *sk)
2127{
2128        /*
2129         * We subtract one from sk_wmem_alloc so we can tell whether
2130         * some packets are still in a tx queue.
2131         * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
2132         */
2133        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2134                __sk_free(sk);
2135}
2136EXPORT_SYMBOL(sk_free);
2137
2138static void sk_init_common(struct sock *sk)
2139{
2140        skb_queue_head_init(&sk->sk_receive_queue);
2141        skb_queue_head_init(&sk->sk_write_queue);
2142        skb_queue_head_init(&sk->sk_error_queue);
2143
2144        rwlock_init(&sk->sk_callback_lock);
2145        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2146                        af_rlock_keys + sk->sk_family,
2147                        af_family_rlock_key_strings[sk->sk_family]);
2148        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2149                        af_wlock_keys + sk->sk_family,
2150                        af_family_wlock_key_strings[sk->sk_family]);
2151        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2152                        af_elock_keys + sk->sk_family,
2153                        af_family_elock_key_strings[sk->sk_family]);
2154        lockdep_set_class_and_name(&sk->sk_callback_lock,
2155                        af_callback_keys + sk->sk_family,
2156                        af_family_clock_key_strings[sk->sk_family]);
2157}
2158
2159/**
2160 *      sk_clone_lock - clone a socket, and lock its clone
2161 *      @sk: the socket to clone
2162 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2163 *
2164 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2165 */
2166struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2167{
2168        struct proto *prot = READ_ONCE(sk->sk_prot);
2169        struct sk_filter *filter;
2170        bool is_charged = true;
2171        struct sock *newsk;
2172
2173        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2174        if (!newsk)
2175                goto out;
2176
2177        sock_copy(newsk, sk);
2178
2179        newsk->sk_prot_creator = prot;
2180
2181        /* SANITY */
2182        if (likely(newsk->sk_net_refcnt)) {
2183                get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2184                sock_inuse_add(sock_net(newsk), 1);
2185        }
2186        sk_node_init(&newsk->sk_node);
2187        sock_lock_init(newsk);
2188        bh_lock_sock(newsk);
2189        newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2190        newsk->sk_backlog.len = 0;
2191
2192        atomic_set(&newsk->sk_rmem_alloc, 0);
2193
2194        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2195        refcount_set(&newsk->sk_wmem_alloc, 1);
2196
2197        atomic_set(&newsk->sk_omem_alloc, 0);
2198        sk_init_common(newsk);
2199
2200        newsk->sk_dst_cache     = NULL;
2201        newsk->sk_dst_pending_confirm = 0;
2202        newsk->sk_wmem_queued   = 0;
2203        newsk->sk_forward_alloc = 0;
2204        newsk->sk_reserved_mem  = 0;
2205        atomic_set(&newsk->sk_drops, 0);
2206        newsk->sk_send_head     = NULL;
2207        newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2208        atomic_set(&newsk->sk_zckey, 0);
2209
2210        sock_reset_flag(newsk, SOCK_DONE);
2211
2212        /* sk->sk_memcg will be populated at accept() time */
2213        newsk->sk_memcg = NULL;
2214
2215        cgroup_sk_clone(&newsk->sk_cgrp_data);
2216
2217        rcu_read_lock();
2218        filter = rcu_dereference(sk->sk_filter);
2219        if (filter != NULL)
2220                /* though it's an empty new sock, the charging may fail
2221                 * if sysctl_optmem_max was changed between creation of
2222                 * the original socket and cloning.
2223                 */
2224                is_charged = sk_filter_charge(newsk, filter);
2225        RCU_INIT_POINTER(newsk->sk_filter, filter);
2226        rcu_read_unlock();
2227
2228        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2229                /* We need to make sure that we don't uncharge the new
2230                 * socket if we couldn't charge it in the first place
2231                 * as otherwise we uncharge the parent's filter.
2232                 */
2233                if (!is_charged)
2234                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
2235                sk_free_unlock_clone(newsk);
2236                newsk = NULL;
2237                goto out;
2238        }
2239        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2240
2241        if (bpf_sk_storage_clone(sk, newsk)) {
2242                sk_free_unlock_clone(newsk);
2243                newsk = NULL;
2244                goto out;
2245        }
2246
2247        /* Clear sk_user_data if parent had the pointer tagged
2248         * as not suitable for copying when cloning.
2249         */
2250        if (sk_user_data_is_nocopy(newsk))
2251                newsk->sk_user_data = NULL;
2252
2253        newsk->sk_err      = 0;
2254        newsk->sk_err_soft = 0;
2255        newsk->sk_priority = 0;
2256        newsk->sk_incoming_cpu = raw_smp_processor_id();
2257
2258        /* Before updating sk_refcnt, we must commit prior changes to memory
2259         * (Documentation/RCU/rculist_nulls.rst for details)
2260         */
2261        smp_wmb();
2262        refcount_set(&newsk->sk_refcnt, 2);
2263
2264        /* Increment the counter in the same struct proto as the master
2265         * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
2266         * is the same as sk->sk_prot->socks, as this field was copied
2267         * with memcpy).
2268         *
2269         * This _changes_ the previous behaviour, where
2270         * tcp_create_openreq_child was always incrementing the
2271         * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2272         * to be taken into account in all callers. -acme
2273         */
2274        sk_refcnt_debug_inc(newsk);
2275        sk_set_socket(newsk, NULL);
2276        sk_tx_queue_clear(newsk);
2277        RCU_INIT_POINTER(newsk->sk_wq, NULL);
2278
2279        if (newsk->sk_prot->sockets_allocated)
2280                sk_sockets_allocated_inc(newsk);
2281
2282        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2283                net_enable_timestamp();
2284out:
2285        return newsk;
2286}
2287EXPORT_SYMBOL_GPL(sk_clone_lock);
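
/* Illustrative sketch (not part of this file): the caller-side contract of
 * sk_clone_lock(). The clone comes back bh-locked; the caller completes its
 * own initialisation and eventually drops the lock (compare
 * inet_csk_clone_lock() and its users).
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (!newsk)
 *		return NULL;
 *	// ... protocol-specific setup of newsk ...
 *	bh_unlock_sock(newsk);
 *	return newsk;
 */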
2288
2289void sk_free_unlock_clone(struct sock *sk)
2290{
2291        /* It is still a raw copy of the parent, so invalidate
2292         * the destructor and do a plain sk_free(). */
2293        sk->sk_destruct = NULL;
2294        bh_unlock_sock(sk);
2295        sk_free(sk);
2296}
2297EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2298
2299static void sk_trim_gso_size(struct sock *sk)
2300{
2301        if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2302                return;
2303#if IS_ENABLED(CONFIG_IPV6)
2304        if (sk->sk_family == AF_INET6 &&
2305            sk_is_tcp(sk) &&
2306            !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2307                return;
2308#endif
2309        sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2310}
2311
2312void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2313{
2314        u32 max_segs = 1;
2315
2316        sk_dst_set(sk, dst);
2317        sk->sk_route_caps = dst->dev->features;
2318        if (sk_is_tcp(sk))
2319                sk->sk_route_caps |= NETIF_F_GSO;
2320        if (sk->sk_route_caps & NETIF_F_GSO)
2321                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2322        if (unlikely(sk->sk_gso_disabled))
2323                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2324        if (sk_can_gso(sk)) {
2325                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2326                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2327                } else {
2328                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2329                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2330                        sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2331                        sk_trim_gso_size(sk);
2332                        sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2333                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2334                        max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2335                }
2336        }
2337        sk->sk_gso_max_segs = max_segs;
2338}
2339EXPORT_SYMBOL_GPL(sk_setup_caps);
2340
2341/*
2342 *      Simple resource managers for sockets.
2343 */
2344
2345
2346/*
2347 * Write buffer destructor automatically called from kfree_skb.
2348 */
2349void sock_wfree(struct sk_buff *skb)
2350{
2351        struct sock *sk = skb->sk;
2352        unsigned int len = skb->truesize;
2353        bool free;
2354
2355        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2356                if (sock_flag(sk, SOCK_RCU_FREE) &&
2357                    sk->sk_write_space == sock_def_write_space) {
2358                        rcu_read_lock();
2359                        free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2360                        sock_def_write_space_wfree(sk);
2361                        rcu_read_unlock();
2362                        if (unlikely(free))
2363                                __sk_free(sk);
2364                        return;
2365                }
2366
2367                /*
2368                 * Keep a reference on sk_wmem_alloc; it will be released
2369                 * after the sk_write_space() call.
2370                 */
2371                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2372                sk->sk_write_space(sk);
2373                len = 1;
2374        }
2375        /*
2376         * If sk_wmem_alloc reaches 0, we must finish what sk_free()
2377         * could not do because of in-flight packets.
2378         */
2379        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2380                __sk_free(sk);
2381}
2382EXPORT_SYMBOL(sock_wfree);
2383
2384/* This variant of sock_wfree() is used by TCP,
2385 * since it sets SOCK_USE_WRITE_QUEUE.
2386 */
2387void __sock_wfree(struct sk_buff *skb)
2388{
2389        struct sock *sk = skb->sk;
2390
2391        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2392                __sk_free(sk);
2393}
2394
2395void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2396{
2397        skb_orphan(skb);
2398        skb->sk = sk;
2399#ifdef CONFIG_INET
2400        if (unlikely(!sk_fullsock(sk))) {
2401                skb->destructor = sock_edemux;
2402                sock_hold(sk);
2403                return;
2404        }
2405#endif
2406        skb->destructor = sock_wfree;
2407        skb_set_hash_from_sk(skb, sk);
2408        /*
2409         * We used to take a refcount on sk, but the following operation
2410         * is enough to guarantee sk_free() won't free this sock until
2411         * all in-flight packets are completed.
2412         */
2413        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2414}
2415EXPORT_SYMBOL(skb_set_owner_w);
2416
2417static bool can_skb_orphan_partial(const struct sk_buff *skb)
2418{
2419#ifdef CONFIG_TLS_DEVICE
2420        /* Drivers depend on in-order delivery for crypto offload,
2421         * partial orphan breaks out-of-order-OK logic.
2422         */
2423        if (skb->decrypted)
2424                return false;
2425#endif
2426        return (skb->destructor == sock_wfree ||
2427                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2428}
2429
2430/* This helper is used by netem, as it can hold packets in its
2431 * delay queue. We want to allow the owner socket to send more
2432 * packets, as if they were already TX completed by a typical driver.
2433 * But we also want to keep skb->sk set because some packet schedulers
2434 * rely on it (sch_fq for example).
2435 */
2436void skb_orphan_partial(struct sk_buff *skb)
2437{
2438        if (skb_is_tcp_pure_ack(skb))
2439                return;
2440
2441        if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2442                return;
2443
2444        skb_orphan(skb);
2445}
2446EXPORT_SYMBOL(skb_orphan_partial);
2447
2448/*
2449 * Read buffer destructor automatically called from kfree_skb.
2450 */
2451void sock_rfree(struct sk_buff *skb)
2452{
2453        struct sock *sk = skb->sk;
2454        unsigned int len = skb->truesize;
2455
2456        atomic_sub(len, &sk->sk_rmem_alloc);
2457        sk_mem_uncharge(sk, len);
2458}
2459EXPORT_SYMBOL(sock_rfree);
2460
2461/*
2462 * Buffer destructor for skbs that are not used directly in read or write
2463 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2464 */
2465void sock_efree(struct sk_buff *skb)
2466{
2467        sock_put(skb->sk);
2468}
2469EXPORT_SYMBOL(sock_efree);
2470
2471/* Buffer destructor for prefetch/receive path where reference count may
2472 * not be held, e.g. for listen sockets.
2473 */
2474#ifdef CONFIG_INET
2475void sock_pfree(struct sk_buff *skb)
2476{
2477        if (sk_is_refcounted(skb->sk))
2478                sock_gen_put(skb->sk);
2479}
2480EXPORT_SYMBOL(sock_pfree);
2481#endif /* CONFIG_INET */
2482
2483kuid_t sock_i_uid(struct sock *sk)
2484{
2485        kuid_t uid;
2486
2487        read_lock_bh(&sk->sk_callback_lock);
2488        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2489        read_unlock_bh(&sk->sk_callback_lock);
2490        return uid;
2491}
2492EXPORT_SYMBOL(sock_i_uid);
2493
2494unsigned long sock_i_ino(struct sock *sk)
2495{
2496        unsigned long ino;
2497
2498        read_lock_bh(&sk->sk_callback_lock);
2499        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2500        read_unlock_bh(&sk->sk_callback_lock);
2501        return ino;
2502}
2503EXPORT_SYMBOL(sock_i_ino);
2504
2505/*
2506 * Allocate a skb from the socket's send buffer.
2507 */
2508struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2509                             gfp_t priority)
2510{
2511        if (force ||
2512            refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2513                struct sk_buff *skb = alloc_skb(size, priority);
2514
2515                if (skb) {
2516                        skb_set_owner_w(skb, sk);
2517                        return skb;
2518                }
2519        }
2520        return NULL;
2521}
2522EXPORT_SYMBOL(sock_wmalloc);
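
/* Illustrative sketch (not part of this file): allocating a small reply or
 * control skb charged against the sender's sndbuf. "data" and "len" are
 * placeholders.
 *
 *	struct sk_buff *skb;
 *
 *	skb = sock_wmalloc(sk, len, 0, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_put_data(skb, data, len);
 *
 * With force == 0 the allocation is refused once sk_wmem_alloc already
 * exceeds sk_sndbuf; the charge is dropped again by sock_wfree() when the
 * skb is freed, because skb_set_owner_w() installed that destructor.
 */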
2523
2524static void sock_ofree(struct sk_buff *skb)
2525{
2526        struct sock *sk = skb->sk;
2527
2528        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2529}
2530
2531struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2532                             gfp_t priority)
2533{
2534        struct sk_buff *skb;
2535
2536        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2537        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2538            sysctl_optmem_max)
2539                return NULL;
2540
2541        skb = alloc_skb(size, priority);
2542        if (!skb)
2543                return NULL;
2544
2545        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2546        skb->sk = sk;
2547        skb->destructor = sock_ofree;
2548        return skb;
2549}
2550
2551/*
2552 * Allocate a memory block from the socket's option memory buffer.
2553 */
2554void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2555{
2556        if ((unsigned int)size <= sysctl_optmem_max &&
2557            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2558                void *mem;
2559                /* First do the add, to avoid the race if kmalloc
2560                 * might sleep.
2561                 */
2562                atomic_add(size, &sk->sk_omem_alloc);
2563                mem = kmalloc(size, priority);
2564                if (mem)
2565                        return mem;
2566                atomic_sub(size, &sk->sk_omem_alloc);
2567        }
2568        return NULL;
2569}
2570EXPORT_SYMBOL(sock_kmalloc);
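
/* Illustrative sketch (not part of this file): memory from sock_kmalloc()
 * is charged to sk_omem_alloc and must be released with sock_kfree_s()
 * (or sock_kzfree_s() for sensitive data) using the same size.
 *
 *	struct my_opt *opt;	// hypothetical per-socket option
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	// ... use opt ...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */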
2571
2572/* Free an option memory block. Note, we actually want the inline
2573 * here as this allows gcc to detect the nullify and fold away the
2574 * condition entirely.
2575 */
2576static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2577                                  const bool nullify)
2578{
2579        if (WARN_ON_ONCE(!mem))
2580                return;
2581        if (nullify)
2582                kfree_sensitive(mem);
2583        else
2584                kfree(mem);
2585        atomic_sub(size, &sk->sk_omem_alloc);
2586}
2587
2588void sock_kfree_s(struct sock *sk, void *mem, int size)
2589{
2590        __sock_kfree_s(sk, mem, size, false);
2591}
2592EXPORT_SYMBOL(sock_kfree_s);
2593
2594void sock_kzfree_s(struct sock *sk, void *mem, int size)
2595{
2596        __sock_kfree_s(sk, mem, size, true);
2597}
2598EXPORT_SYMBOL(sock_kzfree_s);
2599
2600/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2601 * I think these locks should be removed for datagram sockets.
2602 */
2603static long sock_wait_for_wmem(struct sock *sk, long timeo)
2604{
2605        DEFINE_WAIT(wait);
2606
2607        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2608        for (;;) {
2609                if (!timeo)
2610                        break;
2611                if (signal_pending(current))
2612                        break;
2613                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2614                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2615                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2616                        break;
2617                if (sk->sk_shutdown & SEND_SHUTDOWN)
2618                        break;
2619                if (sk->sk_err)
2620                        break;
2621                timeo = schedule_timeout(timeo);
2622        }
2623        finish_wait(sk_sleep(sk), &wait);
2624        return timeo;
2625}
2626
2627
2628/*
2629 *      Generic send/receive buffer handlers
2630 */
2631
2632struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2633                                     unsigned long data_len, int noblock,
2634                                     int *errcode, int max_page_order)
2635{
2636        struct sk_buff *skb;
2637        long timeo;
2638        int err;
2639
2640        timeo = sock_sndtimeo(sk, noblock);
2641        for (;;) {
2642                err = sock_error(sk);
2643                if (err != 0)
2644                        goto failure;
2645
2646                err = -EPIPE;
2647                if (sk->sk_shutdown & SEND_SHUTDOWN)
2648                        goto failure;
2649
2650                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2651                        break;
2652
2653                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2654                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2655                err = -EAGAIN;
2656                if (!timeo)
2657                        goto failure;
2658                if (signal_pending(current))
2659                        goto interrupted;
2660                timeo = sock_wait_for_wmem(sk, timeo);
2661        }
2662        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2663                                   errcode, sk->sk_allocation);
2664        if (skb)
2665                skb_set_owner_w(skb, sk);
2666        return skb;
2667
2668interrupted:
2669        err = sock_intr_errno(timeo);
2670failure:
2671        *errcode = err;
2672        return NULL;
2673}
2674EXPORT_SYMBOL(sock_alloc_send_pskb);
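
/* Illustrative sketch (not part of this file): the common datagram sendmsg
 * pattern on top of sock_alloc_send_pskb(). "hlen" and "len" are
 * placeholders; real callers split the linear and paged parts with more
 * care.
 *
 *	struct sk_buff *skb;
 *	int err;
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, len - hlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err,
 *				   PAGE_ALLOC_COSTLY_ORDER);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *
 * The helper sleeps (honouring the socket's send timeout) until
 * sk_wmem_alloc drops below sk_sndbuf, and the returned skb is already
 * owned by sk via skb_set_owner_w().
 */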
2675
2676int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2677                     struct sockcm_cookie *sockc)
2678{
2679        u32 tsflags;
2680
2681        switch (cmsg->cmsg_type) {
2682        case SO_MARK:
2683                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2684                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2685                        return -EPERM;
2686                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2687                        return -EINVAL;
2688                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2689                break;
2690        case SO_TIMESTAMPING_OLD:
2691                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2692                        return -EINVAL;
2693
2694                tsflags = *(u32 *)CMSG_DATA(cmsg);
2695                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2696                        return -EINVAL;
2697
2698                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2699                sockc->tsflags |= tsflags;
2700                break;
2701        case SCM_TXTIME:
2702                if (!sock_flag(sk, SOCK_TXTIME))
2703                        return -EINVAL;
2704                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2705                        return -EINVAL;
2706                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2707                break;
2708        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2709        case SCM_RIGHTS:
2710        case SCM_CREDENTIALS:
2711                break;
2712        default:
2713                return -EINVAL;
2714        }
2715        return 0;
2716}
2717EXPORT_SYMBOL(__sock_cmsg_send);
2718
2719int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2720                   struct sockcm_cookie *sockc)
2721{
2722        struct cmsghdr *cmsg;
2723        int ret;
2724
2725        for_each_cmsghdr(cmsg, msg) {
2726                if (!CMSG_OK(msg, cmsg))
2727                        return -EINVAL;
2728                if (cmsg->cmsg_level != SOL_SOCKET)
2729                        continue;
2730                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2731                if (ret)
2732                        return ret;
2733        }
2734        return 0;
2735}
2736EXPORT_SYMBOL(sock_cmsg_send);
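
/* Illustrative userspace sketch (not part of this file): the SO_MARK cmsg
 * parsed by __sock_cmsg_send() above. The fd and msghdr setup are assumed.
 *
 *	char buf[CMSG_SPACE(sizeof(__u32))] = {};
 *	struct cmsghdr *cmsg;
 *	__u32 mark = 42;	// example value
 *
 *	msg.msg_control = buf;
 *	msg.msg_controllen = sizeof(buf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type	 = SO_MARK;
 *	cmsg->cmsg_len	 = CMSG_LEN(sizeof(__u32));
 *	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));
 *	sendmsg(fd, &msg, 0);
 *
 * Without CAP_NET_RAW or CAP_NET_ADMIN in the socket's user namespace this
 * is rejected with -EPERM, as enforced above.
 */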
2737
2738static void sk_enter_memory_pressure(struct sock *sk)
2739{
2740        if (!sk->sk_prot->enter_memory_pressure)
2741                return;
2742
2743        sk->sk_prot->enter_memory_pressure(sk);
2744}
2745
2746static void sk_leave_memory_pressure(struct sock *sk)
2747{
2748        if (sk->sk_prot->leave_memory_pressure) {
2749                sk->sk_prot->leave_memory_pressure(sk);
2750        } else {
2751                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2752
2753                if (memory_pressure && READ_ONCE(*memory_pressure))
2754                        WRITE_ONCE(*memory_pressure, 0);
2755        }
2756}
2757
2758DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2759
2760/**
2761 * skb_page_frag_refill - check that a page_frag contains enough room
2762 * @sz: minimum size of the fragment we want to get
2763 * @pfrag: pointer to page_frag
2764 * @gfp: priority for memory allocation
2765 *
2766 * Note: While this allocator tries to use high order pages, there is
2767 * no guarantee that allocations succeed. Therefore, @sz MUST be
2768 * less than or equal to PAGE_SIZE.
2769 */
2770bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2771{
2772        if (pfrag->page) {
2773                if (page_ref_count(pfrag->page) == 1) {
2774                        pfrag->offset = 0;
2775                        return true;
2776                }
2777                if (pfrag->offset + sz <= pfrag->size)
2778                        return true;
2779                put_page(pfrag->page);
2780        }
2781
2782        pfrag->offset = 0;
2783        if (SKB_FRAG_PAGE_ORDER &&
2784            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2785                /* Avoid direct reclaim but allow kswapd to wake */
2786                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2787                                          __GFP_COMP | __GFP_NOWARN |
2788                                          __GFP_NORETRY,
2789                                          SKB_FRAG_PAGE_ORDER);
2790                if (likely(pfrag->page)) {
2791                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2792                        return true;
2793                }
2794        }
2795        pfrag->page = alloc_page(gfp);
2796        if (likely(pfrag->page)) {
2797                pfrag->size = PAGE_SIZE;
2798                return true;
2799        }
2800        return false;
2801}
2802EXPORT_SYMBOL(skb_page_frag_refill);
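
/* Illustrative sketch (not part of this file): the usual consumer pattern,
 * copying "copy" bytes (must be <= PAGE_SIZE) into the refilled fragment
 * and advancing the offset. A lowmem page and trivial error handling are
 * assumed for brevity.
 *
 *	struct page_frag *pfrag = &current->task_frag;
 *
 *	if (!skb_page_frag_refill(copy, pfrag, GFP_KERNEL))
 *		return -ENOMEM;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, from, copy);
 *	get_page(pfrag->page);	// reference for whoever keeps the data
 *	pfrag->offset += copy;
 */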
2803
2804bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2805{
2806        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2807                return true;
2808
2809        sk_enter_memory_pressure(sk);
2810        sk_stream_moderate_sndbuf(sk);
2811        return false;
2812}
2813EXPORT_SYMBOL(sk_page_frag_refill);
2814
2815void __lock_sock(struct sock *sk)
2816        __releases(&sk->sk_lock.slock)
2817        __acquires(&sk->sk_lock.slock)
2818{
2819        DEFINE_WAIT(wait);
2820
2821        for (;;) {
2822                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2823                                        TASK_UNINTERRUPTIBLE);
2824                spin_unlock_bh(&sk->sk_lock.slock);
2825                schedule();
2826                spin_lock_bh(&sk->sk_lock.slock);
2827                if (!sock_owned_by_user(sk))
2828                        break;
2829        }
2830        finish_wait(&sk->sk_lock.wq, &wait);
2831}
2832
2833void __release_sock(struct sock *sk)
2834        __releases(&sk->sk_lock.slock)
2835        __acquires(&sk->sk_lock.slock)
2836{
2837        struct sk_buff *skb, *next;
2838
2839        while ((skb = sk->sk_backlog.head) != NULL) {
2840                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2841
2842                spin_unlock_bh(&sk->sk_lock.slock);
2843
2844                do {
2845                        next = skb->next;
2846                        prefetch(next);
2847                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2848                        skb_mark_not_on_list(skb);
2849                        sk_backlog_rcv(sk, skb);
2850
2851                        cond_resched();
2852
2853                        skb = next;
2854                } while (skb != NULL);
2855
2856                spin_lock_bh(&sk->sk_lock.slock);
2857        }
2858
2859        /*
2860         * Doing the zeroing here guarantees we cannot loop forever
2861         * while a wild producer attempts to flood us.
2862         */
2863        sk->sk_backlog.len = 0;
2864}
2865
2866void __sk_flush_backlog(struct sock *sk)
2867{
2868        spin_lock_bh(&sk->sk_lock.slock);
2869        __release_sock(sk);
2870        spin_unlock_bh(&sk->sk_lock.slock);
2871}
2872
2873/**
2874 * sk_wait_data - wait for data to arrive at sk_receive_queue
2875 * @sk:    sock to wait on
2876 * @timeo: for how long
2877 * @skb:   last skb seen on sk_receive_queue
2878 *
2879 * Now socket state including sk->sk_err is changed only under lock,
2880 * hence we may omit checks after joining the wait queue.
2881 * We check the receive queue before schedule() only as an optimization;
2882 * it is very likely that release_sock() added new data.
2883 */
2884int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2885{
2886        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2887        int rc;
2888
2889        add_wait_queue(sk_sleep(sk), &wait);
2890        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2891        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2892        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2893        remove_wait_queue(sk_sleep(sk), &wait);
2894        return rc;
2895}
2896EXPORT_SYMBOL(sk_wait_data);
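
/* Illustrative sketch (not part of this file): the receive-side wait loop
 * sk_wait_data() is meant for. The caller holds the socket lock, which is
 * released while sleeping; error and shutdown checks are elided here.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */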
2897
2898/**
2899 *      __sk_mem_raise_allocated - increase memory_allocated
2900 *      @sk: socket
2901 *      @size: memory size to allocate
2902 *      @amt: pages to allocate
2903 *      @kind: allocation type
2904 *
2905 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2906 */
2907int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2908{
2909        struct proto *prot = sk->sk_prot;
2910        long allocated = sk_memory_allocated_add(sk, amt);
2911        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2912        bool charged = true;
2913
2914        if (memcg_charge &&
2915            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2916                                                gfp_memcg_charge())))
2917                goto suppress_allocation;
2918
2919        /* Under limit. */
2920        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2921                sk_leave_memory_pressure(sk);
2922                return 1;
2923        }
2924
2925        /* Under pressure. */
2926        if (allocated > sk_prot_mem_limits(sk, 1))
2927                sk_enter_memory_pressure(sk);
2928
2929        /* Over hard limit. */
2930        if (allocated > sk_prot_mem_limits(sk, 2))
2931                goto suppress_allocation;
2932
2933        /* guarantee minimum buffer size under pressure */
2934        if (kind == SK_MEM_RECV) {
2935                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2936                        return 1;
2937
2938        } else { /* SK_MEM_SEND */
2939                int wmem0 = sk_get_wmem0(sk, prot);
2940
2941                if (sk->sk_type == SOCK_STREAM) {
2942                        if (sk->sk_wmem_queued < wmem0)
2943                                return 1;
2944                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2945                                return 1;
2946                }
2947        }
2948
2949        if (sk_has_memory_pressure(sk)) {
2950                u64 alloc;
2951
2952                if (!sk_under_memory_pressure(sk))
2953                        return 1;
2954                alloc = sk_sockets_allocated_read_positive(sk);
2955                if (sk_prot_mem_limits(sk, 2) > alloc *
2956                    sk_mem_pages(sk->sk_wmem_queued +
2957                                 atomic_read(&sk->sk_rmem_alloc) +
2958                                 sk->sk_forward_alloc))
2959                        return 1;
2960        }
2961
2962suppress_allocation:
2963
2964        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2965                sk_stream_moderate_sndbuf(sk);
2966
2967                /* Fail only if socket is _under_ its sndbuf.
2968                 * In this case we cannot block, so we have to fail.
2969                 */
2970                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2971                        /* Force charge with __GFP_NOFAIL */
2972                        if (memcg_charge && !charged) {
2973                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2974                                        gfp_memcg_charge() | __GFP_NOFAIL);
2975                        }
2976                        return 1;
2977                }
2978        }
2979
2980        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2981                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2982
2983        sk_memory_allocated_sub(sk, amt);
2984
2985        if (memcg_charge && charged)
2986                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2987
2988        return 0;
2989}
2990EXPORT_SYMBOL(__sk_mem_raise_allocated);
2991
2992/**
2993 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2994 *      @sk: socket
2995 *      @size: memory size to allocate
2996 *      @kind: allocation type
2997 *
2998 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2999 *      rmem allocation. This function assumes that protocols which have
3000 *      memory_pressure use sk_wmem_queued as write buffer accounting.
3001 */
3002int __sk_mem_schedule(struct sock *sk, int size, int kind)
3003{
3004        int ret, amt = sk_mem_pages(size);
3005
3006        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
3007        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3008        if (!ret)
3009                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
3010        return ret;
3011}
3012EXPORT_SYMBOL(__sk_mem_schedule);
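
/* Illustrative sketch (not part of this file): how a receive path typically
 * sits on top of this accounting through the sk_rmem_schedule() wrapper
 * (include/net/sock.h), which ends up here when sk_forward_alloc is too
 * small.
 *
 *	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize > sk->sk_rcvbuf ||
 *	    !sk_rmem_schedule(sk, skb, skb->truesize)) {
 *		kfree_skb(skb);
 *		return -ENOMEM;
 *	}
 *	skb_set_owner_r(skb, sk);	// charges sk_rmem_alloc and forward_alloc
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */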
3013
3014/**
3015 *      __sk_mem_reduce_allocated - reclaim memory_allocated
3016 *      @sk: socket
3017 *      @amount: number of quanta
3018 *
3019 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3020 */
3021void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3022{
3023        sk_memory_allocated_sub(sk, amount);
3024
3025        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3026                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3027
3028        if (sk_under_memory_pressure(sk) &&
3029            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3030                sk_leave_memory_pressure(sk);
3031}
3032EXPORT_SYMBOL(__sk_mem_reduce_allocated);
3033
3034/**
3035 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3036 *      @sk: socket
3037 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
3038 */
3039void __sk_mem_reclaim(struct sock *sk, int amount)
3040{
3041        amount >>= SK_MEM_QUANTUM_SHIFT;
3042        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
3043        __sk_mem_reduce_allocated(sk, amount);
3044}
3045EXPORT_SYMBOL(__sk_mem_reclaim);
3046
3047int sk_set_peek_off(struct sock *sk, int val)
3048{
3049        sk->sk_peek_off = val;
3050        return 0;
3051}
3052EXPORT_SYMBOL_GPL(sk_set_peek_off);
3053
3054/*
3055 * Set of default routines for initialising struct proto_ops when
3056 * the protocol does not support a particular function. In certain
3057 * cases where it makes no sense for a protocol to have a "do nothing"
3058 * function, some default processing is provided.
3059 */
3060
3061int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3062{
3063        return -EOPNOTSUPP;
3064}
3065EXPORT_SYMBOL(sock_no_bind);
3066
3067int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3068                    int len, int flags)
3069{
3070        return -EOPNOTSUPP;
3071}
3072EXPORT_SYMBOL(sock_no_connect);
3073
3074int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3075{
3076        return -EOPNOTSUPP;
3077}
3078EXPORT_SYMBOL(sock_no_socketpair);
3079
3080int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3081                   bool kern)
3082{
3083        return -EOPNOTSUPP;
3084}
3085EXPORT_SYMBOL(sock_no_accept);
3086
3087int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3088                    int peer)
3089{
3090        return -EOPNOTSUPP;
3091}
3092EXPORT_SYMBOL(sock_no_getname);
3093
3094int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3095{
3096        return -EOPNOTSUPP;
3097}
3098EXPORT_SYMBOL(sock_no_ioctl);
3099
3100int sock_no_listen(struct socket *sock, int backlog)
3101{
3102        return -EOPNOTSUPP;
3103}
3104EXPORT_SYMBOL(sock_no_listen);
3105
3106int sock_no_shutdown(struct socket *sock, int how)
3107{
3108        return -EOPNOTSUPP;
3109}
3110EXPORT_SYMBOL(sock_no_shutdown);
3111
3112int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3113{
3114        return -EOPNOTSUPP;
3115}
3116EXPORT_SYMBOL(sock_no_sendmsg);
3117
3118int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3119{
3120        return -EOPNOTSUPP;
3121}
3122EXPORT_SYMBOL(sock_no_sendmsg_locked);
3123
3124int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3125                    int flags)
3126{
3127        return -EOPNOTSUPP;
3128}
3129EXPORT_SYMBOL(sock_no_recvmsg);
3130
3131int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3132{
3133        /* Mirror missing mmap method error code */
3134        return -ENODEV;
3135}
3136EXPORT_SYMBOL(sock_no_mmap);
3137
3138/*
3139 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3140 * various sock-based usage counts.
3141 */
3142void __receive_sock(struct file *file)
3143{
3144        struct socket *sock;
3145
3146        sock = sock_from_file(file);
3147        if (sock) {
3148                sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3149                sock_update_classid(&sock->sk->sk_cgrp_data);
3150        }
3151}
3152
3153ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3154{
3155        ssize_t res;
3156        struct msghdr msg = {.msg_flags = flags};
3157        struct kvec iov;
3158        char *kaddr = kmap(page);
3159        iov.iov_base = kaddr + offset;
3160        iov.iov_len = size;
3161        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3162        kunmap(page);
3163        return res;
3164}
3165EXPORT_SYMBOL(sock_no_sendpage);
3166
3167ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3168                                int offset, size_t size, int flags)
3169{
3170        ssize_t res;
3171        struct msghdr msg = {.msg_flags = flags};
3172        struct kvec iov;
3173        char *kaddr = kmap(page);
3174
3175        iov.iov_base = kaddr + offset;
3176        iov.iov_len = size;
3177        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3178        kunmap(page);
3179        return res;
3180}
3181EXPORT_SYMBOL(sock_no_sendpage_locked);
3182
3183/*
3184 *      Default Socket Callbacks
3185 */
3186
3187static void sock_def_wakeup(struct sock *sk)
3188{
3189        struct socket_wq *wq;
3190
3191        rcu_read_lock();
3192        wq = rcu_dereference(sk->sk_wq);
3193        if (skwq_has_sleeper(wq))
3194                wake_up_interruptible_all(&wq->wait);
3195        rcu_read_unlock();
3196}
3197
3198static void sock_def_error_report(struct sock *sk)
3199{
3200        struct socket_wq *wq;
3201
3202        rcu_read_lock();
3203        wq = rcu_dereference(sk->sk_wq);
3204        if (skwq_has_sleeper(wq))
3205                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3206        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3207        rcu_read_unlock();
3208}
3209
3210void sock_def_readable(struct sock *sk)
3211{
3212        struct socket_wq *wq;
3213
3214        rcu_read_lock();
3215        wq = rcu_dereference(sk->sk_wq);
3216        if (skwq_has_sleeper(wq))
3217                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3218                                                EPOLLRDNORM | EPOLLRDBAND);
3219        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3220        rcu_read_unlock();
3221}
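
    /* Illustrative sketch (hypothetical in-kernel socket user, not part of
     * this file): kernel consumers of a socket commonly replace these default
     * callbacks under sk_callback_lock, saving the old one so it can be
     * chained to or restored later:
     *
     *        write_lock_bh(&sk->sk_callback_lock);
     *        conn->saved_data_ready = sk->sk_data_ready;
     *        sk->sk_user_data = conn;                // hypothetical private state
     *        sk->sk_data_ready = conn_data_ready;    // hypothetical callback
     *        write_unlock_bh(&sk->sk_callback_lock);
     */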
3222
3223static void sock_def_write_space(struct sock *sk)
3224{
3225        struct socket_wq *wq;
3226
3227        rcu_read_lock();
3228
3229        /* Do not wake up a writer until he can make "significant"
3230         * progress.  --DaveM
3231         */
3232        if (sock_writeable(sk)) {
3233                wq = rcu_dereference(sk->sk_wq);
3234                if (skwq_has_sleeper(wq))
3235                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3236                                                EPOLLWRNORM | EPOLLWRBAND);
3237
3238                /* Should agree with poll, otherwise some programs break */
3239                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3240        }
3241
3242        rcu_read_unlock();
3243}
3244
3245/* An optimised version of sock_def_write_space(). It should only be called
3246 * for SOCK_RCU_FREE sockets, inside an RCU read-side section, and after
3247 * putting ->sk_wmem_alloc.
3248 */
3249static void sock_def_write_space_wfree(struct sock *sk)
3250{
3251        /* Do not wake up a writer until he can make "significant"
3252         * progress.  --DaveM
3253         */
3254        if (sock_writeable(sk)) {
3255                struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3256
3257                /* rely on refcount_sub from sock_wfree() */
3258                smp_mb__after_atomic();
3259                if (wq && waitqueue_active(&wq->wait))
3260                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3261                                                EPOLLWRNORM | EPOLLWRBAND);
3262
3263                /* Should agree with poll, otherwise some programs break */
3264                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3265        }
3266}
3267
3268static void sock_def_destruct(struct sock *sk)
3269{
3270}
3271
3272void sk_send_sigurg(struct sock *sk)
3273{
3274        if (sk->sk_socket && sk->sk_socket->file)
3275                if (send_sigurg(&sk->sk_socket->file->f_owner))
3276                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3277}
3278EXPORT_SYMBOL(sk_send_sigurg);
3279
3280void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3281                    unsigned long expires)
3282{
3283        if (!mod_timer(timer, expires))
3284                sock_hold(sk);
3285}
3286EXPORT_SYMBOL(sk_reset_timer);
3287
3288void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3289{
3290        if (del_timer(timer))
3291                __sock_put(sk);
3292}
3293EXPORT_SYMBOL(sk_stop_timer);
3294
3295void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3296{
3297        if (del_timer_sync(timer))
3298                __sock_put(sk);
3299}
3300EXPORT_SYMBOL(sk_stop_timer_sync);
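
    /* Illustrative sketch (hypothetical protocol timer, not from this file):
     * sk_reset_timer() takes a reference when it arms a timer that was not
     * already pending, so the timer handler (or sk_stop_timer*()) must drop
     * that reference again:
     *
     *        static void foo_retrans_handler(struct timer_list *t)
     *        {
     *                struct foo_sock *fs = from_timer(fs, t, retrans_timer);
     *                struct sock *sk = &fs->sk;
     *
     *                // ...retransmit work...
     *                sock_put(sk);   // balance the hold taken when arming
     *        }
     *
     *        // arming site: sock_hold() happens inside sk_reset_timer()
     *        sk_reset_timer(sk, &fs->retrans_timer, jiffies + HZ);
     */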
3301
3302void sock_init_data(struct socket *sock, struct sock *sk)
3303{
3304        sk_init_common(sk);
3305        sk->sk_send_head        =       NULL;
3306
3307        timer_setup(&sk->sk_timer, NULL, 0);
3308
3309        sk->sk_allocation       =       GFP_KERNEL;
3310        sk->sk_rcvbuf           =       sysctl_rmem_default;
3311        sk->sk_sndbuf           =       sysctl_wmem_default;
3312        sk->sk_state            =       TCP_CLOSE;
3313        sk_set_socket(sk, sock);
3314
3315        sock_set_flag(sk, SOCK_ZAPPED);
3316
3317        if (sock) {
3318                sk->sk_type     =       sock->type;
3319                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3320                sock->sk        =       sk;
3321                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
3322        } else {
3323                RCU_INIT_POINTER(sk->sk_wq, NULL);
3324                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
3325        }
3326
3327        rwlock_init(&sk->sk_callback_lock);
3328        if (sk->sk_kern_sock)
3329                lockdep_set_class_and_name(
3330                        &sk->sk_callback_lock,
3331                        af_kern_callback_keys + sk->sk_family,
3332                        af_family_kern_clock_key_strings[sk->sk_family]);
3333        else
3334                lockdep_set_class_and_name(
3335                        &sk->sk_callback_lock,
3336                        af_callback_keys + sk->sk_family,
3337                        af_family_clock_key_strings[sk->sk_family]);
3338
3339        sk->sk_state_change     =       sock_def_wakeup;
3340        sk->sk_data_ready       =       sock_def_readable;
3341        sk->sk_write_space      =       sock_def_write_space;
3342        sk->sk_error_report     =       sock_def_error_report;
3343        sk->sk_destruct         =       sock_def_destruct;
3344
3345        sk->sk_frag.page        =       NULL;
3346        sk->sk_frag.offset      =       0;
3347        sk->sk_peek_off         =       -1;
3348
3349        sk->sk_peer_pid         =       NULL;
3350        sk->sk_peer_cred        =       NULL;
3351        spin_lock_init(&sk->sk_peer_lock);
3352
3353        sk->sk_write_pending    =       0;
3354        sk->sk_rcvlowat         =       1;
3355        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3356        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3357
3358        sk->sk_stamp = SK_DEFAULT_STAMP;
3359#if BITS_PER_LONG==32
3360        seqlock_init(&sk->sk_stamp_seq);
3361#endif
3362        atomic_set(&sk->sk_zckey, 0);
3363
3364#ifdef CONFIG_NET_RX_BUSY_POLL
3365        sk->sk_napi_id          =       0;
3366        sk->sk_ll_usec          =       sysctl_net_busy_read;
3367#endif
3368
3369        sk->sk_max_pacing_rate = ~0UL;
3370        sk->sk_pacing_rate = ~0UL;
3371        WRITE_ONCE(sk->sk_pacing_shift, 10);
3372        sk->sk_incoming_cpu = -1;
3373        sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3374
3375        sk_rx_queue_clear(sk);
3376        /*
3377         * Before updating sk_refcnt, we must commit prior changes to memory
3378         * (Documentation/RCU/rculist_nulls.rst for details)
3379         */
3380        smp_wmb();
3381        refcount_set(&sk->sk_refcnt, 1);
3382        atomic_set(&sk->sk_drops, 0);
3383}
3384EXPORT_SYMBOL(sock_init_data);
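
    /* Illustrative sketch (hypothetical "foo" family, not part of this file):
     * a protocol family's ->create() handler typically allocates the sock and
     * lets sock_init_data() fill in the generic defaults before layering its
     * own initialisation on top:
     *
     *        sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
     *        if (!sk)
     *                return -ENOMEM;
     *        sock_init_data(sock, sk);
     *        sk->sk_destruct = foo_sock_destruct;    // override chosen defaults
     */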
3385
3386void lock_sock_nested(struct sock *sk, int subclass)
3387{
3388        /* The sk_lock has mutex_lock() semantics here. */
3389        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3390
3391        might_sleep();
3392        spin_lock_bh(&sk->sk_lock.slock);
3393        if (sock_owned_by_user_nocheck(sk))
3394                __lock_sock(sk);
3395        sk->sk_lock.owned = 1;
3396        spin_unlock_bh(&sk->sk_lock.slock);
3397}
3398EXPORT_SYMBOL(lock_sock_nested);
3399
3400void release_sock(struct sock *sk)
3401{
3402        spin_lock_bh(&sk->sk_lock.slock);
3403        if (sk->sk_backlog.tail)
3404                __release_sock(sk);
3405
3406        /* Warning: release_cb() might need to release sk ownership,
3407         * i.e. call sock_release_ownership(sk) before us.
3408         */
3409        if (sk->sk_prot->release_cb)
3410                sk->sk_prot->release_cb(sk);
3411
3412        sock_release_ownership(sk);
3413        if (waitqueue_active(&sk->sk_lock.wq))
3414                wake_up(&sk->sk_lock.wq);
3415        spin_unlock_bh(&sk->sk_lock.slock);
3416}
3417EXPORT_SYMBOL(release_sock);
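
    /* Illustrative sketch (not part of this file): process-context code takes
     * the socket "mutex" around state changes, while softirq receive paths
     * that find it owned queue packets to the backlog; release_sock() above
     * replays that backlog via __release_sock() before handing the lock back:
     *
     *        lock_sock(sk);
     *        // ...modify socket state, sleeping is allowed here...
     *        release_sock(sk);
     */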
3418
3419bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3420{
3421        might_sleep();
3422        spin_lock_bh(&sk->sk_lock.slock);
3423
3424        if (!sock_owned_by_user_nocheck(sk)) {
3425                /*
3426                 * Fast path return with bottom halves disabled and
3427                 * sock::sk_lock.slock held.
3428                 *
3429                 * The 'mutex' is not contended and holding
3430                 * sock::sk_lock.slock prevents all other lockers from
3431                 * proceeding, so the corresponding unlock_sock_fast() can
3432                 * avoid the slow path of release_sock() completely and
3433                 * just release slock.
3434                 *
3435                 * From a semantic POV this is equivalent to 'acquiring'
3436                 * the 'mutex', hence the corresponding lockdep
3437                 * mutex_release() has to happen in the fast path of
3438                 * unlock_sock_fast().
3439                 */
3440                return false;
3441        }
3442
3443        __lock_sock(sk);
3444        sk->sk_lock.owned = 1;
3445        __acquire(&sk->sk_lock.slock);
3446        spin_unlock_bh(&sk->sk_lock.slock);
3447        return true;
3448}
3449EXPORT_SYMBOL(__lock_sock_fast);
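
    /* Illustrative sketch (not part of this file): callers normally use the
     * lock_sock_fast()/unlock_sock_fast() wrappers from include/net/sock.h,
     * which pair with __lock_sock_fast() above; the returned "slow" flag
     * tells the unlock side whether the slow (owned) path was taken:
     *
     *        bool slow = lock_sock_fast(sk);
     *
     *        // ...short, non-sleeping critical section...
     *        unlock_sock_fast(sk, slow);
     */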
3450
3451int sock_gettstamp(struct socket *sock, void __user *userstamp,
3452                   bool timeval, bool time32)
3453{
3454        struct sock *sk = sock->sk;
3455        struct timespec64 ts;
3456
3457        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3458        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3459        if (ts.tv_sec == -1)
3460                return -ENOENT;
3461        if (ts.tv_sec == 0) {
3462                ktime_t kt = ktime_get_real();
3463                sock_write_timestamp(sk, kt);
3464                ts = ktime_to_timespec64(kt);
3465        }
3466
3467        if (timeval)
3468                ts.tv_nsec /= 1000;
3469
3470#ifdef CONFIG_COMPAT_32BIT_TIME
3471        if (time32)
3472                return put_old_timespec32(&ts, userstamp);
3473#endif
3474#ifdef CONFIG_SPARC64
3475        /* beware of padding in sparc64 timeval */
3476        if (timeval && !in_compat_syscall()) {
3477                struct __kernel_old_timeval __user tv = {
3478                        .tv_sec = ts.tv_sec,
3479                        .tv_usec = ts.tv_nsec,
3480                };
3481                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3482                        return -EFAULT;
3483                return 0;
3484        }
3485#endif
3486        return put_timespec64(&ts, userstamp);
3487}
3488EXPORT_SYMBOL(sock_gettstamp);
3489
3490void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3491{
3492        if (!sock_flag(sk, flag)) {
3493                unsigned long previous_flags = sk->sk_flags;
3494
3495                sock_set_flag(sk, flag);
3496                /*
3497                 * We just set one of the two flags that require net
3498                 * time stamping, but time stamping might already have
3499                 * been enabled because of the other one.
3500                 */
3501                if (sock_needs_netstamp(sk) &&
3502                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3503                        net_enable_timestamp();
3504        }
3505}
3506
3507int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3508                       int level, int type)
3509{
3510        struct sock_exterr_skb *serr;
3511        struct sk_buff *skb;
3512        int copied, err;
3513
3514        err = -EAGAIN;
3515        skb = sock_dequeue_err_skb(sk);
3516        if (skb == NULL)
3517                goto out;
3518
3519        copied = skb->len;
3520        if (copied > len) {
3521                msg->msg_flags |= MSG_TRUNC;
3522                copied = len;
3523        }
3524        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3525        if (err)
3526                goto out_free_skb;
3527
3528        sock_recv_timestamp(msg, sk, skb);
3529
3530        serr = SKB_EXT_ERR(skb);
3531        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3532
3533        msg->msg_flags |= MSG_ERRQUEUE;
3534        err = copied;
3535
3536out_free_skb:
3537        kfree_skb(skb);
3538out:
3539        return err;
3540}
3541EXPORT_SYMBOL(sock_recv_errqueue);
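
    /* Illustrative userspace sketch (an assumption, not derived from this
     * file): queued socket errors (IP_RECVERR, TX timestamps, zerocopy
     * completions, ...) are drained with MSG_ERRQUEUE, and the ancillary data
     * carries the sock_extended_err that was put_cmsg()'d above:
     *
     *        char cbuf[512];         // hypothetical control buffer
     *        struct msghdr msg = {
     *                .msg_control    = cbuf,
     *                .msg_controllen = sizeof(cbuf),
     *        };
     *
     *        ssize_t n = recvmsg(fd, &msg, MSG_ERRQUEUE);
     *
     *        // on success, walk CMSG_FIRSTHDR()/CMSG_NXTHDR() looking for
     *        // the cmsg carrying struct sock_extended_err
     */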
3542
3543/*
3544 *      Get a socket option on a socket.
3545 *
3546 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3547 *      asynchronous errors should be reported by getsockopt. We assume
3548 *      this means only if you specify SO_ERROR (otherwise what's the point of it).
3549 */
3550int sock_common_getsockopt(struct socket *sock, int level, int optname,
3551                           char __user *optval, int __user *optlen)
3552{
3553        struct sock *sk = sock->sk;
3554
3555        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3556}
3557EXPORT_SYMBOL(sock_common_getsockopt);
3558
3559int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3560                        int flags)
3561{
3562        struct sock *sk = sock->sk;
3563        int addr_len = 0;
3564        int err;
3565
3566        err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3567        if (err >= 0)
3568                msg->msg_namelen = addr_len;
3569        return err;
3570}
3571EXPORT_SYMBOL(sock_common_recvmsg);
3572
3573/*
3574 *      Set socket options on an inet socket.
3575 */
3576int sock_common_setsockopt(struct socket *sock, int level, int optname,
3577                           sockptr_t optval, unsigned int optlen)
3578{
3579        struct sock *sk = sock->sk;
3580
3581        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3582}
3583EXPORT_SYMBOL(sock_common_setsockopt);
3584
3585void sk_common_release(struct sock *sk)
3586{
3587        if (sk->sk_prot->destroy)
3588                sk->sk_prot->destroy(sk);
3589
3590        /*
3591         * Observation: when sk_common_release() is called, processes have
3592         * no access to the socket, but the network stack still does.
3593         * Step one, detach it from networking:
3594         *
3595         * A. Remove it from the hash tables.
3596         */
3597
3598        sk->sk_prot->unhash(sk);
3599
3600        /*
3601         * At this point the socket cannot receive new packets, but some may
3602         * still be in flight because another CPU ran the receiver and did its
3603         * hash table lookup before we unhashed the socket. They will reach
3604         * the receive queue and be purged by the socket destructor.
3605         *
3606         * We may also still have packets pending on the receive queue and,
3607         * probably, our own packets waiting in device queues. sock_destroy
3608         * will drain the receive queue, but transmitted packets delay socket
3609         * destruction until the last reference is released.
3610         */
3611
3612        sock_orphan(sk);
3613
3614        xfrm_sk_free_policy(sk);
3615
3616        sk_refcnt_debug_release(sk);
3617
3618        sock_put(sk);
3619}
3620EXPORT_SYMBOL(sk_common_release);
3621
3622void sk_get_meminfo(const struct sock *sk, u32 *mem)
3623{
3624        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3625
3626        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3627        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3628        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3629        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3630        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3631        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3632        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3633        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3634        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3635}
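
    /* The array filled above is what sock_diag-based interfaces report to
     * userspace (for example the skmem counters shown by "ss -m"). A
     * hypothetical consumer that obtained such an array would index it with
     * the same SK_MEMINFO_* constants:
     *
     *        __u32 mem[SK_MEMINFO_VARS];     // copied from the netlink attribute
     *
     *        printf("rmem_alloc=%u rcvbuf=%u drops=%u\n",
     *               mem[SK_MEMINFO_RMEM_ALLOC], mem[SK_MEMINFO_RCVBUF],
     *               mem[SK_MEMINFO_DROPS]);
     */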
3636
3637#ifdef CONFIG_PROC_FS
3638static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3639
3640int sock_prot_inuse_get(struct net *net, struct proto *prot)
3641{
3642        int cpu, idx = prot->inuse_idx;
3643        int res = 0;
3644
3645        for_each_possible_cpu(cpu)
3646                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3647
3648        return res >= 0 ? res : 0;
3649}
3650EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3651
3652int sock_inuse_get(struct net *net)
3653{
3654        int cpu, res = 0;
3655
3656        for_each_possible_cpu(cpu)
3657                res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3658
3659        return res;
3660}
3661
3662EXPORT_SYMBOL_GPL(sock_inuse_get);
3663
3664static int __net_init sock_inuse_init_net(struct net *net)
3665{
3666        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3667        if (net->core.prot_inuse == NULL)
3668                return -ENOMEM;
3669        return 0;
3670}
3671
3672static void __net_exit sock_inuse_exit_net(struct net *net)
3673{
3674        free_percpu(net->core.prot_inuse);
3675}
3676
3677static struct pernet_operations net_inuse_ops = {
3678        .init = sock_inuse_init_net,
3679        .exit = sock_inuse_exit_net,
3680};
3681
3682static __init int net_inuse_init(void)
3683{
3684        if (register_pernet_subsys(&net_inuse_ops))
3685                panic("Cannot initialize net inuse counters");
3686
3687        return 0;
3688}
3689
3690core_initcall(net_inuse_init);
3691
3692static int assign_proto_idx(struct proto *prot)
3693{
3694        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3695
3696        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3697                pr_err("PROTO_INUSE_NR exhausted\n");
3698                return -ENOSPC;
3699        }
3700
3701        set_bit(prot->inuse_idx, proto_inuse_idx);
3702        return 0;
3703}
3704
3705static void release_proto_idx(struct proto *prot)
3706{
3707        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3708                clear_bit(prot->inuse_idx, proto_inuse_idx);
3709}
3710#else
3711static inline int assign_proto_idx(struct proto *prot)
3712{
3713        return 0;
3714}
3715
3716static inline void release_proto_idx(struct proto *prot)
3717{
3718}
3719
3720#endif
3721
3722static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3723{
3724        if (!twsk_prot)
3725                return;
3726        kfree(twsk_prot->twsk_slab_name);
3727        twsk_prot->twsk_slab_name = NULL;
3728        kmem_cache_destroy(twsk_prot->twsk_slab);
3729        twsk_prot->twsk_slab = NULL;
3730}
3731
3732static int tw_prot_init(const struct proto *prot)
3733{
3734        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3735
3736        if (!twsk_prot)
3737                return 0;
3738
3739        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3740                                              prot->name);
3741        if (!twsk_prot->twsk_slab_name)
3742                return -ENOMEM;
3743
3744        twsk_prot->twsk_slab =
3745                kmem_cache_create(twsk_prot->twsk_slab_name,
3746                                  twsk_prot->twsk_obj_size, 0,
3747                                  SLAB_ACCOUNT | prot->slab_flags,
3748                                  NULL);
3749        if (!twsk_prot->twsk_slab) {
3750                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3751                        prot->name);
3752                return -ENOMEM;
3753        }
3754
3755        return 0;
3756}
3757
3758static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3759{
3760        if (!rsk_prot)
3761                return;
3762        kfree(rsk_prot->slab_name);
3763        rsk_prot->slab_name = NULL;
3764        kmem_cache_destroy(rsk_prot->slab);
3765        rsk_prot->slab = NULL;
3766}
3767
3768static int req_prot_init(const struct proto *prot)
3769{
3770        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3771
3772        if (!rsk_prot)
3773                return 0;
3774
3775        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3776                                        prot->name);
3777        if (!rsk_prot->slab_name)
3778                return -ENOMEM;
3779
3780        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3781                                           rsk_prot->obj_size, 0,
3782                                           SLAB_ACCOUNT | prot->slab_flags,
3783                                           NULL);
3784
3785        if (!rsk_prot->slab) {
3786                pr_crit("%s: Can't create request sock SLAB cache!\n",
3787                        prot->name);
3788                return -ENOMEM;
3789        }
3790        return 0;
3791}
3792
3793int proto_register(struct proto *prot, int alloc_slab)
3794{
3795        int ret = -ENOBUFS;
3796
3797        if (prot->memory_allocated && !prot->sysctl_mem) {
3798                pr_err("%s: missing sysctl_mem\n", prot->name);
3799                return -EINVAL;
3800        }
3801        if (alloc_slab) {
3802                prot->slab = kmem_cache_create_usercopy(prot->name,
3803                                        prot->obj_size, 0,
3804                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3805                                        prot->slab_flags,
3806                                        prot->useroffset, prot->usersize,
3807                                        NULL);
3808
3809                if (prot->slab == NULL) {
3810                        pr_crit("%s: Can't create sock SLAB cache!\n",
3811                                prot->name);
3812                        goto out;
3813                }
3814
3815                if (req_prot_init(prot))
3816                        goto out_free_request_sock_slab;
3817
3818                if (tw_prot_init(prot))
3819                        goto out_free_timewait_sock_slab;
3820        }
3821
3822        mutex_lock(&proto_list_mutex);
3823        ret = assign_proto_idx(prot);
3824        if (ret) {
3825                mutex_unlock(&proto_list_mutex);
3826                goto out_free_timewait_sock_slab;
3827        }
3828        list_add(&prot->node, &proto_list);
3829        mutex_unlock(&proto_list_mutex);
3830        return ret;
3831
3832out_free_timewait_sock_slab:
3833        if (alloc_slab)
3834                tw_prot_cleanup(prot->twsk_prot);
3835out_free_request_sock_slab:
3836        if (alloc_slab) {
3837                req_prot_cleanup(prot->rsk_prot);
3838
3839                kmem_cache_destroy(prot->slab);
3840                prot->slab = NULL;
3841        }
3842out:
3843        return ret;
3844}
3845EXPORT_SYMBOL(proto_register);
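
    /* Illustrative sketch (hypothetical "foo" protocol, not part of this
     * file): a protocol module registers its struct proto at init time and
     * unregisters it on exit; passing alloc_slab=1 requests a dedicated
     * kmem_cache of obj_size bytes for its sockets:
     *
     *        static struct proto foo_proto = {
     *                .name     = "FOO",
     *                .owner    = THIS_MODULE,
     *                .obj_size = sizeof(struct foo_sock),
     *        };
     *
     *        err = proto_register(&foo_proto, 1);    // module init
     *        ...
     *        proto_unregister(&foo_proto);           // module exit
     */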
3846
3847void proto_unregister(struct proto *prot)
3848{
3849        mutex_lock(&proto_list_mutex);
3850        release_proto_idx(prot);
3851        list_del(&prot->node);
3852        mutex_unlock(&proto_list_mutex);
3853
3854        kmem_cache_destroy(prot->slab);
3855        prot->slab = NULL;
3856
3857        req_prot_cleanup(prot->rsk_prot);
3858        tw_prot_cleanup(prot->twsk_prot);
3859}
3860EXPORT_SYMBOL(proto_unregister);
3861
3862int sock_load_diag_module(int family, int protocol)
3863{
3864        if (!protocol) {
3865                if (!sock_is_registered(family))
3866                        return -ENOENT;
3867
3868                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3869                                      NETLINK_SOCK_DIAG, family);
3870        }
3871
3872#ifdef CONFIG_INET
3873        if (family == AF_INET &&
3874            protocol != IPPROTO_RAW &&
3875            protocol < MAX_INET_PROTOS &&
3876            !rcu_access_pointer(inet_protos[protocol]))
3877                return -ENOENT;
3878#endif
3879
3880        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3881                              NETLINK_SOCK_DIAG, family, protocol);
3882}
3883EXPORT_SYMBOL(sock_load_diag_module);
3884
3885#ifdef CONFIG_PROC_FS
3886static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3887        __acquires(proto_list_mutex)
3888{
3889        mutex_lock(&proto_list_mutex);
3890        return seq_list_start_head(&proto_list, *pos);
3891}
3892
3893static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3894{
3895        return seq_list_next(v, &proto_list, pos);
3896}
3897
3898static void proto_seq_stop(struct seq_file *seq, void *v)
3899        __releases(proto_list_mutex)
3900{
3901        mutex_unlock(&proto_list_mutex);
3902}
3903
3904static char proto_method_implemented(const void *method)
3905{
3906        return method == NULL ? 'n' : 'y';
3907}
3908static long sock_prot_memory_allocated(struct proto *proto)
3909{
3910        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3911}
3912
3913static const char *sock_prot_memory_pressure(struct proto *proto)
3914{
3915        return proto->memory_pressure != NULL ?
3916               proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3917}
3918
3919static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3920{
3921
3922        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3923                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3924                   proto->name,
3925                   proto->obj_size,
3926                   sock_prot_inuse_get(seq_file_net(seq), proto),
3927                   sock_prot_memory_allocated(proto),
3928                   sock_prot_memory_pressure(proto),
3929                   proto->max_header,
3930                   proto->slab == NULL ? "no" : "yes",
3931                   module_name(proto->owner),
3932                   proto_method_implemented(proto->close),
3933                   proto_method_implemented(proto->connect),
3934                   proto_method_implemented(proto->disconnect),
3935                   proto_method_implemented(proto->accept),
3936                   proto_method_implemented(proto->ioctl),
3937                   proto_method_implemented(proto->init),
3938                   proto_method_implemented(proto->destroy),
3939                   proto_method_implemented(proto->shutdown),
3940                   proto_method_implemented(proto->setsockopt),
3941                   proto_method_implemented(proto->getsockopt),
3942                   proto_method_implemented(proto->sendmsg),
3943                   proto_method_implemented(proto->recvmsg),
3944                   proto_method_implemented(proto->sendpage),
3945                   proto_method_implemented(proto->bind),
3946                   proto_method_implemented(proto->backlog_rcv),
3947                   proto_method_implemented(proto->hash),
3948                   proto_method_implemented(proto->unhash),
3949                   proto_method_implemented(proto->get_port),
3950                   proto_method_implemented(proto->enter_memory_pressure));
3951}
3952
3953static int proto_seq_show(struct seq_file *seq, void *v)
3954{
3955        if (v == &proto_list)
3956                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3957                           "protocol",
3958                           "size",
3959                           "sockets",
3960                           "memory",
3961                           "press",
3962                           "maxhdr",
3963                           "slab",
3964                           "module",
3965                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3966        else
3967                proto_seq_printf(seq, list_entry(v, struct proto, node));
3968        return 0;
3969}
3970
3971static const struct seq_operations proto_seq_ops = {
3972        .start  = proto_seq_start,
3973        .next   = proto_seq_next,
3974        .stop   = proto_seq_stop,
3975        .show   = proto_seq_show,
3976};
3977
3978static __net_init int proto_init_net(struct net *net)
3979{
3980        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3981                        sizeof(struct seq_net_private)))
3982                return -ENOMEM;
3983
3984        return 0;
3985}
3986
3987static __net_exit void proto_exit_net(struct net *net)
3988{
3989        remove_proc_entry("protocols", net->proc_net);
3990}
3991
3992
3993static __net_initdata struct pernet_operations proto_net_ops = {
3994        .init = proto_init_net,
3995        .exit = proto_exit_net,
3996};
3997
3998static int __init proto_init(void)
3999{
4000        return register_pernet_subsys(&proto_net_ops);
4001}
4002
4003subsys_initcall(proto_init);
4004
4005#endif /* PROC_FS */
4006
4007#ifdef CONFIG_NET_RX_BUSY_POLL
4008bool sk_busy_loop_end(void *p, unsigned long start_time)
4009{
4010        struct sock *sk = p;
4011
4012        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4013               sk_busy_loop_timeout(sk, start_time);
4014}
4015EXPORT_SYMBOL(sk_busy_loop_end);
4016#endif /* CONFIG_NET_RX_BUSY_POLL */
4017
4018int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4019{
4020        if (!sk->sk_prot->bind_add)
4021                return -EOPNOTSUPP;
4022        return sk->sk_prot->bind_add(sk, addr, addr_len);
4023}
4024EXPORT_SYMBOL(sock_bind_add);
4025