linux/net/core/sock.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

#include <linux/ethtool.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in the user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap)
{
        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in all user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
        return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap over the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

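/* Usage sketch (illustrative only, not part of the original file): a
 * protocol implementation gating a privileged operation on the socket
 * opener's capabilities might write:
 *
 *      if (!sk_capable(sk, CAP_NET_ADMIN))
 *              return -EPERM;
 *
 * or, when the operation only affects the socket's own network
 * namespace:
 *
 *      if (!sk_net_capable(sk, CAP_NET_RAW))
 *              return -EPERM;
 */
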
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)                                            \
  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
        sock_set_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation |= __GFP_MEMALLOC;
        static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
        sock_reset_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation &= ~__GFP_MEMALLOC;
        static_branch_dec(&memalloc_socks_key);

        /*
         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
         * progress of swapping. SOCK_MEMALLOC may be cleared while
         * it has rmem allocations due to the last swapfile being deactivated
         * but there is a risk that the socket is unusable due to exceeding
         * the rmem limits. Reclaim the reserves and obey rmem limits again.
         */
        sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

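/* Usage sketch (illustrative, hypothetical caller): a swap-over-network
 * transport would flag its socket before swap I/O starts and clear the
 * flag when the last swapfile is deactivated:
 *
 *      sk_set_memalloc(sk);     // traffic may now dip into reserves
 *      ...                      // socket carries swap I/O
 *      sk_clear_memalloc(sk);   // reclaim reserves, obey limits again
 */
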
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        int ret;
        unsigned int noreclaim_flag;

        /* these should have been dropped before queueing */
        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

        noreclaim_flag = memalloc_noreclaim_save();
        ret = sk->sk_backlog_rcv(sk, skb);
        memalloc_noreclaim_restore(noreclaim_flag);

        return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

void sk_error_report(struct sock *sk)
{
        sk->sk_error_report(sk);

        switch (sk->sk_family) {
        case AF_INET:
                fallthrough;
        case AF_INET6:
                trace_inet_sk_error_report(sk);
                break;
        default:
                break;
        }
}
EXPORT_SYMBOL(sk_error_report);

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
        struct __kernel_sock_timeval tv;

        if (timeo == MAX_SCHEDULE_TIMEOUT) {
                tv.tv_sec = 0;
                tv.tv_usec = 0;
        } else {
                tv.tv_sec = timeo / HZ;
                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
        }

        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
                *(struct old_timeval32 *)optval = tv32;
                return sizeof(tv32);
        }

        if (old_timeval) {
                struct __kernel_old_timeval old_tv;
                old_tv.tv_sec = tv.tv_sec;
                old_tv.tv_usec = tv.tv_usec;
                *(struct __kernel_old_timeval *)optval = old_tv;
                return sizeof(old_tv);
        }

        *(struct __kernel_sock_timeval *)optval = tv;
        return sizeof(tv);
}

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
                            bool old_timeval)
{
        struct __kernel_sock_timeval tv;

        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 tv32;

                if (optlen < sizeof(tv32))
                        return -EINVAL;

                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
                        return -EFAULT;
                tv.tv_sec = tv32.tv_sec;
                tv.tv_usec = tv32.tv_usec;
        } else if (old_timeval) {
                struct __kernel_old_timeval old_tv;

                if (optlen < sizeof(old_tv))
                        return -EINVAL;
                if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
                        return -EFAULT;
                tv.tv_sec = old_tv.tv_sec;
                tv.tv_usec = old_tv.tv_usec;
        } else {
                if (optlen < sizeof(tv))
                        return -EINVAL;
                if (copy_from_sockptr(&tv, optval, sizeof(tv)))
                        return -EFAULT;
        }
        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                *timeo_p = 0;
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
                                __func__, current->comm, task_pid_nr(current));
                }
                return 0;
        }
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
        return 0;
}

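/* Worked example for the conversion above (HZ value assumed): with
 * HZ == 1000, a user timeout of { .tv_sec = 2, .tv_usec = 500 } becomes
 * 2 * HZ + DIV_ROUND_UP(500, USEC_PER_SEC / HZ) = 2001 jiffies. The
 * round-up guarantees that any non-zero timeout is at least one jiffy,
 * so it can never silently truncate to an instant timeout of zero.
 */
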
static bool sock_needs_netstamp(const struct sock *sk)
{
        switch (sk->sk_family) {
        case AF_UNSPEC:
        case AF_UNIX:
                return false;
        default:
                return true;
        }
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
        if (sk->sk_flags & flags) {
                sk->sk_flags &= ~flags;
                if (sock_needs_netstamp(sk) &&
                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
                        net_disable_timestamp();
        }
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        unsigned long flags;
        struct sk_buff_head *list = &sk->sk_receive_queue;

        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
                atomic_inc(&sk->sk_drops);
                trace_sock_rcvqueue_full(sk, skb);
                return -ENOMEM;
        }

        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
                atomic_inc(&sk->sk_drops);
                return -ENOBUFS;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* We escape from the RCU-protected region, so make sure we don't
         * leak an unrefcounted dst.
         */
        skb_dst_force(skb);

        spin_lock_irqsave(&list->lock, flags);
        sock_skb_set_dropcount(sk, skb);
        __skb_queue_tail(list, skb);
        spin_unlock_irqrestore(&list->lock, flags);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);
        return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err;

        err = sk_filter(sk, skb);
        if (err)
                return err;

        return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

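/* Usage sketch (illustrative): a datagram protocol's receive path
 * typically hands each skb to sock_queue_rcv_skb() and counts a drop
 * on failure:
 *
 *      if (sock_queue_rcv_skb(sk, skb) < 0) {
 *              kfree_skb(skb);
 *              return NET_RX_DROP;
 *      }
 *
 * The attached socket filter runs first; -ENOMEM and -ENOBUFS from
 * __sock_queue_rcv_skb() signal rcvbuf or memory-accounting pressure.
 */
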
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                     const int nested, unsigned int trim_cap, bool refcounted)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter_trim_cap(sk, skb, trim_cap))
                goto discard_and_relse;

        skb->dev = NULL;

        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }
        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
                bh_unlock_sock(sk);
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }

        bh_unlock_sock(sk);
out:
        if (refcounted)
                sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                                                          u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        if (dst && dst->obsolete &&
            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie) == NULL) {
                sk_tx_queue_clear(sk);
                sk->sk_dst_pending_confirm = 0;
                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete &&
            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);

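/* Usage sketch (illustrative): an output path revalidates its cached
 * route before each transmit and re-routes when the check fails:
 *
 *      dst = sk_dst_check(sk, cookie);
 *      if (!dst)
 *              ...;    // fresh route lookup, then sk_dst_set()
 *
 * The cookie source is per-family; 0 is common for IPv4, while IPv6
 * keeps a cookie in its per-socket state.
 */
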
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);

        /* Sorry... */
        ret = -EPERM;
        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
                goto out;

        ret = -EINVAL;
        if (ifindex < 0)
                goto out;

        sk->sk_bound_dev_if = ifindex;
        if (sk->sk_prot->rehash)
                sk->sk_prot->rehash(sk);
        sk_dst_reset(sk);

        ret = 0;

out:
#endif

        return ret;
}

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
        int ret;

        if (lock_sk)
                lock_sock(sk);
        ret = sock_bindtoindex_locked(sk, ifindex);
        if (lock_sk)
                release_sock(sk);

        return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

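/* Usage sketch (illustrative): in-kernel users bind a socket to a
 * device by ifindex rather than by name:
 *
 *      err = sock_bindtoindex(sk, dev->ifindex, true);
 *
 * Pass lock_sk == false only when the caller already holds the socket
 * lock, since sock_bindtoindex_locked() must run under lock_sock().
 */
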
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index;

        ret = -EINVAL;
        if (optlen < 0)
                goto out;

        /* Bind this socket to a particular device like "eth0",
         * as specified in the passed interface name. If the
         * name is "" or the option length is zero the socket
         * is not bound.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        memset(devname, 0, sizeof(devname));

        ret = -EFAULT;
        if (copy_from_sockptr(devname, optval, optlen))
                goto out;

        index = 0;
        if (devname[0] != '\0') {
                struct net_device *dev;

                rcu_read_lock();
                dev = dev_get_by_name_rcu(net, devname);
                if (dev)
                        index = dev->ifindex;
                rcu_read_unlock();
                ret = -ENODEV;
                if (!dev)
                        goto out;
        }

        return sock_bindtoindex(sk, index, true);
out:
#endif

        return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
                                int __user *optlen, int len)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];

        if (sk->sk_bound_dev_if == 0) {
                len = 0;
                goto zero;
        }

        ret = -EINVAL;
        if (len < IFNAMSIZ)
                goto out;

        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
        if (ret)
                goto out;

        len = strlen(devname) + 1;

        ret = -EFAULT;
        if (copy_to_user(optval, devname, len))
                goto out;

zero:
        ret = -EFAULT;
        if (put_user(len, optlen))
                goto out;

        ret = 0;

out:
#endif

        return ret;
}

bool sk_mc_loop(struct sock *sk)
{
        if (dev_recursion_level())
                return false;
        if (!sk)
                return true;
        switch (sk->sk_family) {
        case AF_INET:
                return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return inet6_sk(sk)->mc_loop;
#endif
        }
        WARN_ON_ONCE(1);
        return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuse = SK_CAN_REUSE;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuseport = true;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_lingertime = 0;
        sock_set_flag(sk, SOCK_LINGER);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
        lock_sock(sk);
        sk->sk_priority = priority;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
        lock_sock(sk);
        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
                sk->sk_sndtimeo = secs * HZ;
        else
                sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
        if (val) {
                sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
                sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
                sock_set_flag(sk, SOCK_RCVTSTAMP);
                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        } else {
                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
        }
}

void sock_enable_timestamps(struct sock *sk)
{
        lock_sock(sk);
        __sock_set_timestamps(sk, true, false, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

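/* Usage sketch (illustrative): kernel consumers that want software
 * receive timestamps on their own socket can enable them directly;
 * this is the in-kernel equivalent of userspace setting SO_TIMESTAMPNS:
 *
 *      sock_enable_timestamps(sock->sk);
 */
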
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
        switch (optname) {
        case SO_TIMESTAMP_OLD:
                __sock_set_timestamps(sk, valbool, false, false);
                break;
        case SO_TIMESTAMP_NEW:
                __sock_set_timestamps(sk, valbool, true, false);
                break;
        case SO_TIMESTAMPNS_OLD:
                __sock_set_timestamps(sk, valbool, false, true);
                break;
        case SO_TIMESTAMPNS_NEW:
                __sock_set_timestamps(sk, valbool, true, true);
                break;
        }
}

static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
        struct net *net = sock_net(sk);
        struct net_device *dev = NULL;
        bool match = false;
        int *vclock_index;
        int i, num;

        if (sk->sk_bound_dev_if)
                dev = dev_get_by_index(net, sk->sk_bound_dev_if);

        if (!dev) {
                pr_err("%s: socket is not bound to a device\n", __func__);
                return -EOPNOTSUPP;
        }

        num = ethtool_get_phc_vclocks(dev, &vclock_index);
        dev_put(dev);   /* drop the reference taken by dev_get_by_index() */

        for (i = 0; i < num; i++) {
                if (*(vclock_index + i) == phc_index) {
                        match = true;
                        break;
                }
        }

        if (num > 0)
                kfree(vclock_index);

        if (!match)
                return -EINVAL;

        sk->sk_bind_phc = phc_index;

        return 0;
}

int sock_set_timestamping(struct sock *sk, int optname,
                          struct so_timestamping timestamping)
{
        int val = timestamping.flags;
        int ret;

        if (val & ~SOF_TIMESTAMPING_MASK)
                return -EINVAL;

        if (val & SOF_TIMESTAMPING_OPT_ID &&
            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
                if (sk->sk_protocol == IPPROTO_TCP &&
                    sk->sk_type == SOCK_STREAM) {
                        if ((1 << sk->sk_state) &
                            (TCPF_CLOSE | TCPF_LISTEN))
                                return -EINVAL;
                        sk->sk_tskey = tcp_sk(sk)->snd_una;
                } else {
                        sk->sk_tskey = 0;
                }
        }

        if (val & SOF_TIMESTAMPING_OPT_STATS &&
            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
                return -EINVAL;

        if (val & SOF_TIMESTAMPING_BIND_PHC) {
                ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
                if (ret)
                        return ret;
        }

        sk->sk_tsflags = val;
        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

        if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                sock_enable_timestamp(sk,
                                      SOCK_TIMESTAMPING_RX_SOFTWARE);
        else
                sock_disable_timestamp(sk,
                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
        return 0;
}

void sock_set_keepalive(struct sock *sk)
{
        lock_sock(sk);
        if (sk->sk_prot->keepalive)
                sk->sk_prot->keepalive(sk, true);
        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
        /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
         * as a negative value.
         */
        val = min_t(int, val, INT_MAX / 2);
        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

        /* We double it on the way in to account for "struct sk_buff" etc.
         * overhead.  Applications assume that the SO_RCVBUF setting they make
         * will allow that much actual data to be received on that socket.
         *
         * Applications are unaware that "struct sk_buff" and other overheads
         * allocate from the receive buffer during socket buffer allocation.
         *
         * And after considering the possible alternatives, returning the value
         * we actually used in getsockopt is the most desirable behavior.
         */
        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

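/* Worked example for the doubling above: a request of 65536 stores
 * sk_rcvbuf = 131072, and getsockopt(SO_RCVBUF) reports 131072, the
 * value actually used. The two clamps bound the result to
 * [SOCK_MIN_RCVBUF, INT_MAX), so val * 2 can never overflow an int.
 */
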
void sock_set_rcvbuf(struct sock *sk, int val)
{
        lock_sock(sk);
        __sock_set_rcvbuf(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

static void __sock_set_mark(struct sock *sk, u32 val)
{
        if (val != sk->sk_mark) {
                sk->sk_mark = val;
                sk_dst_reset(sk);
        }
}

void sock_set_mark(struct sock *sk, u32 val)
{
        lock_sock(sk);
        __sock_set_mark(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    sockptr_t optval, unsigned int optlen)
{
        struct so_timestamping timestamping;
        struct sock_txtime sk_txtime;
        struct sock *sk = sock->sk;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        valbool = val ? 1 : 0;

        lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                sk->sk_reuseport = valbool;
                break;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                ret = -ENOPROTOOPT;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                sk_dst_reset(sk);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this; BSD doesn't, and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints.
                 */
                val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
                /* Ensure val * 2 fits into an int, to prevent max_t()
                 * from treating it as a negative value.
                 */
                val = min_t(int, val, INT_MAX / 2);
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                WRITE_ONCE(sk->sk_sndbuf,
                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                if (val < 0)
                        val = 0;
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this; BSD doesn't, and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints.
                 */
                __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
                break;

        case SO_RCVBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                __sock_set_rcvbuf(sk, max(val, 0));
                break;

        case SO_KEEPALIVE:
                if (sk->sk_prot->keepalive)
                        sk->sk_prot->keepalive(sk, valbool);
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check_tx = valbool;
                break;

        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) ||
                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        sk->sk_priority = val;
                else
                        ret = -EPERM;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;  /* 1003.1g */
                        break;
                }
                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff)
                        sock_reset_flag(sk, SOCK_LINGER);
                else {
#if (BITS_PER_LONG == 32)
                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                        else
#endif
                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                break;

        case SO_PASSCRED:
                if (valbool)
                        set_bit(SOCK_PASSCRED, &sock->flags);
                else
                        clear_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
                sock_set_timestamp(sk, optname, valbool);
                break;

        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
                if (optlen == sizeof(timestamping)) {
                        if (copy_from_sockptr(&timestamping, optval,
                                              sizeof(timestamping))) {
                                ret = -EFAULT;
                                break;
                        }
                } else {
                        memset(&timestamping, 0, sizeof(timestamping));
                        timestamping.flags = val;
                }
                ret = sock_set_timestamping(sk, optname, timestamping);
                break;

        case SO_RCVLOWAT:
                if (val < 0)
                        val = INT_MAX;
                if (sock->ops->set_rcvlowat)
                        ret = sock->ops->set_rcvlowat(sk, val);
                else
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                break;

        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
                                       optlen, optname == SO_RCVTIMEO_OLD);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
                                       optlen, optname == SO_SNDTIMEO_OLD);
                break;

        case SO_ATTACH_FILTER: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_BPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_attach_bpf(ufd, sk);
                }
                break;

        case SO_ATTACH_REUSEPORT_CBPF: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_reuseport_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_REUSEPORT_EBPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_reuseport_attach_bpf(ufd, sk);
                }
                break;

        case SO_DETACH_REUSEPORT_BPF:
                ret = reuseport_detach_prog(sk);
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_PASSSEC:
                if (valbool)
                        set_bit(SOCK_PASSSEC, &sock->flags);
                else
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
        case SO_MARK:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                __sock_set_mark(sk, val);
                break;

        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_PEEK_OFF:
                if (sock->ops->set_peek_off)
                        ret = sock->ops->set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                break;

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                /* allow unprivileged users to decrease the value */
                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else {
                        if (val < 0)
                                ret = -EINVAL;
                        else
                                WRITE_ONCE(sk->sk_ll_usec, val);
                }
                break;
        case SO_PREFER_BUSY_POLL:
                if (valbool && !capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else
                        WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
                break;
        case SO_BUSY_POLL_BUDGET:
                if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                } else {
                        if (val < 0 || val > U16_MAX)
                                ret = -EINVAL;
                        else
                                WRITE_ONCE(sk->sk_busy_poll_budget, val);
                }
                break;
#endif

        case SO_MAX_PACING_RATE:
                {
                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

                if (sizeof(ulval) != sizeof(val) &&
                    optlen >= sizeof(ulval) &&
                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
                        ret = -EFAULT;
                        break;
                }
                if (ulval != ~0UL)
                        cmpxchg(&sk->sk_pacing_status,
                                SK_PACING_NONE,
                                SK_PACING_NEEDED);
                sk->sk_max_pacing_rate = ulval;
                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
                break;
                }
        case SO_INCOMING_CPU:
                WRITE_ONCE(sk->sk_incoming_cpu, val);
                break;

        case SO_CNX_ADVICE:
                if (val == 1)
                        dst_negative_advice(sk);
                break;

        case SO_ZEROCOPY:
                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
                        if (!((sk->sk_type == SOCK_STREAM &&
                               sk->sk_protocol == IPPROTO_TCP) ||
                              (sk->sk_type == SOCK_DGRAM &&
                               sk->sk_protocol == IPPROTO_UDP)))
                                ret = -ENOTSUPP;
                } else if (sk->sk_family != PF_RDS) {
                        ret = -ENOTSUPP;
                }
                if (!ret) {
                        if (val < 0 || val > 1)
                                ret = -EINVAL;
                        else
                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                }
                break;

        case SO_TXTIME:
                if (optlen != sizeof(struct sock_txtime)) {
                        ret = -EINVAL;
                        break;
                } else if (copy_from_sockptr(&sk_txtime, optval,
                           sizeof(struct sock_txtime))) {
                        ret = -EFAULT;
                        break;
                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
                 * scheduler has enough safeguards.
                 */
                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TXTIME, true);
                sk->sk_clockid = sk_txtime.clockid;
                sk->sk_txtime_deadline_mode =
                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
                sk->sk_txtime_report_errors =
                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
                break;

        case SO_BINDTOIFINDEX:
                ret = sock_bindtoindex_locked(sk, val);
                break;

        case SO_BUF_LOCK:
                if (val & ~SOCK_BUF_LOCK_MASK) {
                        ret = -EINVAL;
                        break;
                }
                sk->sk_userlocks = val | (sk->sk_userlocks &
                                          ~SOCK_BUF_LOCK_MASK);
                break;

        default:
                ret = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);
        return ret;
}
EXPORT_SYMBOL(sock_setsockopt);

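/* Usage sketch (illustrative): in-kernel callers reach the switch above
 * via sockptr_t wrappers, e.g. enabling SO_REUSEADDR on a kernel socket
 * (KERNEL_SOCKPTR() comes from <linux/sockptr.h>):
 *
 *      int one = 1;
 *
 *      err = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 *                            KERNEL_SOCKPTR(&one), sizeof(one));
 *
 * Userspace reaches it through setsockopt(2), with optval wrapped as
 * USER_SOCKPTR().
 */
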
static const struct cred *sk_get_peer_cred(struct sock *sk)
{
        const struct cred *cred;

        spin_lock(&sk->sk_peer_lock);
        cred = get_cred(sk->sk_peer_cred);
        spin_unlock(&sk->sk_peer_lock);

        return cred;
}

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
                          struct ucred *ucred)
{
        ucred->pid = pid_vnr(pid);
        ucred->uid = ucred->gid = -1;
        if (cred) {
                struct user_namespace *current_ns = current_user_ns();

                ucred->uid = from_kuid_munged(current_ns, cred->euid);
                ucred->gid = from_kgid_munged(current_ns, cred->egid);
        }
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
        struct user_namespace *user_ns = current_user_ns();
        int i;

        for (i = 0; i < src->ngroups; i++)
                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
                        return -EFAULT;

        return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union {
                int val;
                u64 val64;
                unsigned long ulval;
                struct linger ling;
                struct old_timeval32 tm32;
                struct __kernel_old_timeval tm;
                struct __kernel_sock_timeval stm;
                struct sock_txtime txtime;
                struct so_timestamping timestamping;
        } v;

        int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_REUSEPORT:
                v.val = sk->sk_reuseport;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check_tx;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv              = sizeof(v.ling);
                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                break;

        case SO_TIMESTAMP_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMP_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPNS_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPING_OLD:
                lv = sizeof(v.timestamping);
                v.timestamping.flags = sk->sk_tsflags;
                v.timestamping.bind_phc = sk->sk_bind_phc;
                break;

        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_PEERCRED:
        {
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);

                spin_lock(&sk->sk_peer_lock);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                spin_unlock(&sk->sk_peer_lock);

                if (copy_to_user(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERGROUPS:
        {
                const struct cred *cred;
                int ret, n;

                cred = sk_get_peer_cred(sk);
                if (!cred)
                        return -ENODATA;

                n = cred->group_info->ngroups;
                if (len < n * sizeof(gid_t)) {
                        len = n * sizeof(gid_t);
                        put_cred(cred);
                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
                }
                len = n * sizeof(gid_t);

                ret = groups_to_user((gid_t __user *)optval, cred->group_info);
                put_cred(cred);
                if (ret)
                        return ret;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                char address[128];

                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
                if (lv < 0)
                        return -ENOTCONN;
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;

        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                if (!sock->ops->set_peek_off)
                        return -EOPNOTSUPP;

1643                v.val = sk->sk_peek_off;
1644                break;
1645        case SO_NOFCS:
1646                v.val = sock_flag(sk, SOCK_NOFCS);
1647                break;
1648
1649        case SO_BINDTODEVICE:
1650                return sock_getbindtodevice(sk, optval, optlen, len);
1651
1652        case SO_GET_FILTER:
1653                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1654                if (len < 0)
1655                        return len;
1656
1657                goto lenout;
1658
1659        case SO_LOCK_FILTER:
1660                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1661                break;
1662
1663        case SO_BPF_EXTENSIONS:
1664                v.val = bpf_tell_extensions();
1665                break;
1666
1667        case SO_SELECT_ERR_QUEUE:
1668                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1669                break;
1670
1671#ifdef CONFIG_NET_RX_BUSY_POLL
1672        case SO_BUSY_POLL:
1673                v.val = sk->sk_ll_usec;
1674                break;
1675        case SO_PREFER_BUSY_POLL:
1676                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1677                break;
1678#endif
1679
1680        case SO_MAX_PACING_RATE:
1681                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1682                        lv = sizeof(v.ulval);
1683                        v.ulval = sk->sk_max_pacing_rate;
1684                } else {
1685                        /* 32bit version */
1686                        v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1687                }
1688                break;
1689
1690        case SO_INCOMING_CPU:
1691                v.val = READ_ONCE(sk->sk_incoming_cpu);
1692                break;
1693
1694        case SO_MEMINFO:
1695        {
1696                u32 meminfo[SK_MEMINFO_VARS];
1697
1698                sk_get_meminfo(sk, meminfo);
1699
1700                len = min_t(unsigned int, len, sizeof(meminfo));
1701                if (copy_to_user(optval, &meminfo, len))
1702                        return -EFAULT;
1703
1704                goto lenout;
1705        }
1706
1707#ifdef CONFIG_NET_RX_BUSY_POLL
1708        case SO_INCOMING_NAPI_ID:
1709                v.val = READ_ONCE(sk->sk_napi_id);
1710
1711                /* aggregate non-NAPI IDs down to 0 */
1712                if (v.val < MIN_NAPI_ID)
1713                        v.val = 0;
1714
1715                break;
1716#endif
1717
1718        case SO_COOKIE:
1719                lv = sizeof(u64);
1720                if (len < lv)
1721                        return -EINVAL;
1722                v.val64 = sock_gen_cookie(sk);
1723                break;
1724
1725        case SO_ZEROCOPY:
1726                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1727                break;
1728
1729        case SO_TXTIME:
1730                lv = sizeof(v.txtime);
1731                v.txtime.clockid = sk->sk_clockid;
1732                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1733                                  SOF_TXTIME_DEADLINE_MODE : 0;
1734                v.txtime.flags |= sk->sk_txtime_report_errors ?
1735                                  SOF_TXTIME_REPORT_ERRORS : 0;
1736                break;
1737
1738        case SO_BINDTOIFINDEX:
1739                v.val = sk->sk_bound_dev_if;
1740                break;
1741
1742        case SO_NETNS_COOKIE:
1743                lv = sizeof(u64);
1744                if (len != lv)
1745                        return -EINVAL;
1746                v.val64 = sock_net(sk)->net_cookie;
1747                break;
1748
1749        case SO_BUF_LOCK:
1750                v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1751                break;
1752
1753        default:
1754        /* We implement the SO_SNDLOWAT etc. to not be settable
1755                 * (1003.1g 7).
1756                 */
1757                return -ENOPROTOOPT;
1758        }
1759
1760        if (len > lv)
1761                len = lv;
1762        if (copy_to_user(optval, &v, len))
1763                return -EFAULT;
1764lenout:
1765        if (put_user(len, optlen))
1766                return -EFAULT;
1767        return 0;
1768}
1769
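/*
 * Illustrative only (not part of the original file): the getsockopt
 * switch above is reached from userspace via getsockopt(2) with level
 * SOL_SOCKET, e.g.:
 *
 *	int mark;
 *	socklen_t len = sizeof(mark);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_MARK, &mark, &len) == 0)
 *		printf("SO_MARK: %d\n", mark);
 *
 * On the common path, the returned *optlen is the requested length
 * clamped to lv, mirroring the "if (len > lv) len = lv;" before
 * copy_to_user() above.
 */
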
1770/*
1771 * Initialize an sk_lock.
1772 *
1773 * (We also register the sk_lock with the lock validator.)
1774 */
1775static inline void sock_lock_init(struct sock *sk)
1776{
1777        if (sk->sk_kern_sock)
1778                sock_lock_init_class_and_name(
1779                        sk,
1780                        af_family_kern_slock_key_strings[sk->sk_family],
1781                        af_family_kern_slock_keys + sk->sk_family,
1782                        af_family_kern_key_strings[sk->sk_family],
1783                        af_family_kern_keys + sk->sk_family);
1784        else
1785                sock_lock_init_class_and_name(
1786                        sk,
1787                        af_family_slock_key_strings[sk->sk_family],
1788                        af_family_slock_keys + sk->sk_family,
1789                        af_family_key_strings[sk->sk_family],
1790                        af_family_keys + sk->sk_family);
1791}
1792
1793/*
1794 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1795 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
1796 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1797 */
1798static void sock_copy(struct sock *nsk, const struct sock *osk)
1799{
1800        const struct proto *prot = READ_ONCE(osk->sk_prot);
1801#ifdef CONFIG_SECURITY_NETWORK
1802        void *sptr = nsk->sk_security;
1803#endif
1804
1805        /* If we move sk_tx_queue_mapping out of the private section,
1806         * we must check if sk_tx_queue_clear() is called after
1807         * sock_copy() in sk_clone_lock().
1808         */
1809        BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1810                     offsetof(struct sock, sk_dontcopy_begin) ||
1811                     offsetof(struct sock, sk_tx_queue_mapping) >=
1812                     offsetof(struct sock, sk_dontcopy_end));
1813
1814        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1815
1816        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1817               prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1818
1819#ifdef CONFIG_SECURITY_NETWORK
1820        nsk->sk_security = sptr;
1821        security_sk_clone(osk, nsk);
1822#endif
1823}
1824
1825static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1826                int family)
1827{
1828        struct sock *sk;
1829        struct kmem_cache *slab;
1830
1831        slab = prot->slab;
1832        if (slab != NULL) {
1833                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1834                if (!sk)
1835                        return sk;
1836                if (want_init_on_alloc(priority))
1837                        sk_prot_clear_nulls(sk, prot->obj_size);
1838        } else
1839                sk = kmalloc(prot->obj_size, priority);
1840
1841        if (sk != NULL) {
1842                if (security_sk_alloc(sk, family, priority))
1843                        goto out_free;
1844
1845                if (!try_module_get(prot->owner))
1846                        goto out_free_sec;
1847        }
1848
1849        return sk;
1850
1851out_free_sec:
1852        security_sk_free(sk);
1853out_free:
1854        if (slab != NULL)
1855                kmem_cache_free(slab, sk);
1856        else
1857                kfree(sk);
1858        return NULL;
1859}
1860
1861static void sk_prot_free(struct proto *prot, struct sock *sk)
1862{
1863        struct kmem_cache *slab;
1864        struct module *owner;
1865
1866        owner = prot->owner;
1867        slab = prot->slab;
1868
1869        cgroup_sk_free(&sk->sk_cgrp_data);
1870        mem_cgroup_sk_free(sk);
1871        security_sk_free(sk);
1872        if (slab != NULL)
1873                kmem_cache_free(slab, sk);
1874        else
1875                kfree(sk);
1876        module_put(owner);
1877}
1878
1879/**
1880 *      sk_alloc - All socket objects are allocated here
1881 *      @net: the applicable net namespace
1882 *      @family: protocol family
1883 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1884 *      @prot: struct proto associated with this new sock instance
1885 *      @kern: is this to be a kernel socket?
1886 */
1887struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1888                      struct proto *prot, int kern)
1889{
1890        struct sock *sk;
1891
1892        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1893        if (sk) {
1894                sk->sk_family = family;
1895                /*
1896                 * See comment in struct sock definition to understand
1897                 * why we need sk_prot_creator -acme
1898                 */
1899                sk->sk_prot = sk->sk_prot_creator = prot;
1900                sk->sk_kern_sock = kern;
1901                sock_lock_init(sk);
1902                sk->sk_net_refcnt = kern ? 0 : 1;
1903                if (likely(sk->sk_net_refcnt)) {
1904                        get_net(net);
1905                        sock_inuse_add(net, 1);
1906                }
1907
1908                sock_net_set(sk, net);
1909                refcount_set(&sk->sk_wmem_alloc, 1);
1910
1911                mem_cgroup_sk_alloc(sk);
1912                cgroup_sk_alloc(&sk->sk_cgrp_data);
1913                sock_update_classid(&sk->sk_cgrp_data);
1914                sock_update_netprioidx(&sk->sk_cgrp_data);
1915                sk_tx_queue_clear(sk);
1916        }
1917
1918        return sk;
1919}
1920EXPORT_SYMBOL(sk_alloc);
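
/*
 * A minimal usage sketch (illustrative, not part of this file): a
 * protocol's create handler typically pairs sk_alloc() with
 * sock_init_data(), e.g.:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *
 * my_proto, sock and protocol are placeholders for the caller's
 * struct proto, struct socket and protocol number.
 */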
1921
1922/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1923 * grace period. This is the case for UDP sockets and TCP listeners.
1924 */
1925static void __sk_destruct(struct rcu_head *head)
1926{
1927        struct sock *sk = container_of(head, struct sock, sk_rcu);
1928        struct sk_filter *filter;
1929
1930        if (sk->sk_destruct)
1931                sk->sk_destruct(sk);
1932
1933        filter = rcu_dereference_check(sk->sk_filter,
1934                                       refcount_read(&sk->sk_wmem_alloc) == 0);
1935        if (filter) {
1936                sk_filter_uncharge(sk, filter);
1937                RCU_INIT_POINTER(sk->sk_filter, NULL);
1938        }
1939
1940        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1941
1942#ifdef CONFIG_BPF_SYSCALL
1943        bpf_sk_storage_free(sk);
1944#endif
1945
1946        if (atomic_read(&sk->sk_omem_alloc))
1947                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1948                         __func__, atomic_read(&sk->sk_omem_alloc));
1949
1950        if (sk->sk_frag.page) {
1951                put_page(sk->sk_frag.page);
1952                sk->sk_frag.page = NULL;
1953        }
1954
1955        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1956        put_cred(sk->sk_peer_cred);
1957        put_pid(sk->sk_peer_pid);
1958
1959        if (likely(sk->sk_net_refcnt))
1960                put_net(sock_net(sk));
1961        sk_prot_free(sk->sk_prot_creator, sk);
1962}
1963
1964void sk_destruct(struct sock *sk)
1965{
1966        bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1967
1968        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1969                reuseport_detach_sock(sk);
1970                use_call_rcu = true;
1971        }
1972
1973        if (use_call_rcu)
1974                call_rcu(&sk->sk_rcu, __sk_destruct);
1975        else
1976                __sk_destruct(&sk->sk_rcu);
1977}
1978
1979static void __sk_free(struct sock *sk)
1980{
1981        if (likely(sk->sk_net_refcnt))
1982                sock_inuse_add(sock_net(sk), -1);
1983
1984        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1985                sock_diag_broadcast_destroy(sk);
1986        else
1987                sk_destruct(sk);
1988}
1989
1990void sk_free(struct sock *sk)
1991{
1992        /*
1993         * We subtract one from sk_wmem_alloc so we can tell whether
1994         * some packets are still in some tx queue.
1995         * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1996         */
1997        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1998                __sk_free(sk);
1999}
2000EXPORT_SYMBOL(sk_free);
2001
2002static void sk_init_common(struct sock *sk)
2003{
2004        skb_queue_head_init(&sk->sk_receive_queue);
2005        skb_queue_head_init(&sk->sk_write_queue);
2006        skb_queue_head_init(&sk->sk_error_queue);
2007
2008        rwlock_init(&sk->sk_callback_lock);
2009        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2010                        af_rlock_keys + sk->sk_family,
2011                        af_family_rlock_key_strings[sk->sk_family]);
2012        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2013                        af_wlock_keys + sk->sk_family,
2014                        af_family_wlock_key_strings[sk->sk_family]);
2015        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2016                        af_elock_keys + sk->sk_family,
2017                        af_family_elock_key_strings[sk->sk_family]);
2018        lockdep_set_class_and_name(&sk->sk_callback_lock,
2019                        af_callback_keys + sk->sk_family,
2020                        af_family_clock_key_strings[sk->sk_family]);
2021}
2022
2023/**
2024 *      sk_clone_lock - clone a socket, and lock its clone
2025 *      @sk: the socket to clone
2026 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2027 *
2028 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2029 */
2030struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2031{
2032        struct proto *prot = READ_ONCE(sk->sk_prot);
2033        struct sk_filter *filter;
2034        bool is_charged = true;
2035        struct sock *newsk;
2036
2037        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2038        if (!newsk)
2039                goto out;
2040
2041        sock_copy(newsk, sk);
2042
2043        newsk->sk_prot_creator = prot;
2044
2045        /* SANITY */
2046        if (likely(newsk->sk_net_refcnt))
2047                get_net(sock_net(newsk));
2048        sk_node_init(&newsk->sk_node);
2049        sock_lock_init(newsk);
2050        bh_lock_sock(newsk);
2051        newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2052        newsk->sk_backlog.len = 0;
2053
2054        atomic_set(&newsk->sk_rmem_alloc, 0);
2055
2056        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2057        refcount_set(&newsk->sk_wmem_alloc, 1);
2058
2059        atomic_set(&newsk->sk_omem_alloc, 0);
2060        sk_init_common(newsk);
2061
2062        newsk->sk_dst_cache     = NULL;
2063        newsk->sk_dst_pending_confirm = 0;
2064        newsk->sk_wmem_queued   = 0;
2065        newsk->sk_forward_alloc = 0;
2066        atomic_set(&newsk->sk_drops, 0);
2067        newsk->sk_send_head     = NULL;
2068        newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2069        atomic_set(&newsk->sk_zckey, 0);
2070
2071        sock_reset_flag(newsk, SOCK_DONE);
2072
2073        /* sk->sk_memcg will be populated at accept() time */
2074        newsk->sk_memcg = NULL;
2075
2076        cgroup_sk_clone(&newsk->sk_cgrp_data);
2077
2078        rcu_read_lock();
2079        filter = rcu_dereference(sk->sk_filter);
2080        if (filter != NULL)
2081                /* though it's an empty new sock, the charging may fail
2082                 * if sysctl_optmem_max was changed between the creation of
2083                 * the original socket and this cloning
2084                 */
2085                is_charged = sk_filter_charge(newsk, filter);
2086        RCU_INIT_POINTER(newsk->sk_filter, filter);
2087        rcu_read_unlock();
2088
2089        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2090                /* We need to make sure that we don't uncharge the new
2091                 * socket if we couldn't charge it in the first place
2092                 * as otherwise we uncharge the parent's filter.
2093                 */
2094                if (!is_charged)
2095                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
2096                sk_free_unlock_clone(newsk);
2097                newsk = NULL;
2098                goto out;
2099        }
2100        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2101
2102        if (bpf_sk_storage_clone(sk, newsk)) {
2103                sk_free_unlock_clone(newsk);
2104                newsk = NULL;
2105                goto out;
2106        }
2107
2108        /* Clear sk_user_data if parent had the pointer tagged
2109         * as not suitable for copying when cloning.
2110         */
2111        if (sk_user_data_is_nocopy(newsk))
2112                newsk->sk_user_data = NULL;
2113
2114        newsk->sk_err      = 0;
2115        newsk->sk_err_soft = 0;
2116        newsk->sk_priority = 0;
2117        newsk->sk_incoming_cpu = raw_smp_processor_id();
2118        if (likely(newsk->sk_net_refcnt))
2119                sock_inuse_add(sock_net(newsk), 1);
2120
2121        /* Before updating sk_refcnt, we must commit prior changes to memory
2122         * (Documentation/RCU/rculist_nulls.rst for details)
2123         */
2124        smp_wmb();
2125        refcount_set(&newsk->sk_refcnt, 2);
2126
2127        /* Increment the counter in the same struct proto as the master
2128         * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
2129         * is the same as sk->sk_prot->socks, as this field was copied
2130         * with memcpy).
2131         *
2132         * This _changes_ the previous behaviour, where
2133         * tcp_create_openreq_child was always incrementing the
2134         * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2135         * to be taken into account in all callers. -acme
2136         */
2137        sk_refcnt_debug_inc(newsk);
2138        sk_set_socket(newsk, NULL);
2139        sk_tx_queue_clear(newsk);
2140        RCU_INIT_POINTER(newsk->sk_wq, NULL);
2141
2142        if (newsk->sk_prot->sockets_allocated)
2143                sk_sockets_allocated_inc(newsk);
2144
2145        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2146                net_enable_timestamp();
2147out:
2148        return newsk;
2149}
2150EXPORT_SYMBOL_GPL(sk_clone_lock);
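
/*
 * Caller sketch (illustrative): the clone is returned locked with
 * bh_lock_sock(), so even the caller's own error paths must unlock it,
 * e.g.:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		// ... set up protocol-private state ...
 *		bh_unlock_sock(newsk);
 *	}
 */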
2151
2152void sk_free_unlock_clone(struct sock *sk)
2153{
2154        /* It is still a raw copy of the parent, so invalidate
2155         * the destructor and do a plain sk_free() */
2156        sk->sk_destruct = NULL;
2157        bh_unlock_sock(sk);
2158        sk_free(sk);
2159}
2160EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2161
2162void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2163{
2164        u32 max_segs = 1;
2165
2166        sk_dst_set(sk, dst);
2167        sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2168        if (sk->sk_route_caps & NETIF_F_GSO)
2169                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2170        sk->sk_route_caps &= ~sk->sk_route_nocaps;
2171        if (sk_can_gso(sk)) {
2172                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2173                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2174                } else {
2175                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2176                        sk->sk_gso_max_size = dst->dev->gso_max_size;
2177                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2178                }
2179        }
2180        sk->sk_gso_max_segs = max_segs;
2181}
2182EXPORT_SYMBOL_GPL(sk_setup_caps);
2183
2184/*
2185 *      Simple resource managers for sockets.
2186 */
2187
2188
2189/*
2190 * Write buffer destructor automatically called from kfree_skb.
2191 */
2192void sock_wfree(struct sk_buff *skb)
2193{
2194        struct sock *sk = skb->sk;
2195        unsigned int len = skb->truesize;
2196
2197        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2198                /*
2199                 * Keep a reference on sk_wmem_alloc; it will be released
2200                 * after the sk_write_space() call.
2201                 */
2202                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2203                sk->sk_write_space(sk);
2204                len = 1;
2205        }
2206        /*
2207         * If sk_wmem_alloc reaches 0, we must finish what sk_free()
2208         * could not do because of in-flight packets.
2209         */
2210        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2211                __sk_free(sk);
2212}
2213EXPORT_SYMBOL(sock_wfree);
2214
2215/* This variant of sock_wfree() is used by TCP,
2216 * since it sets SOCK_USE_WRITE_QUEUE.
2217 */
2218void __sock_wfree(struct sk_buff *skb)
2219{
2220        struct sock *sk = skb->sk;
2221
2222        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2223                __sk_free(sk);
2224}
2225
2226void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2227{
2228        skb_orphan(skb);
2229        skb->sk = sk;
2230#ifdef CONFIG_INET
2231        if (unlikely(!sk_fullsock(sk))) {
2232                skb->destructor = sock_edemux;
2233                sock_hold(sk);
2234                return;
2235        }
2236#endif
2237        skb->destructor = sock_wfree;
2238        skb_set_hash_from_sk(skb, sk);
2239        /*
2240         * We used to take a refcount on sk, but the following operation
2241         * is enough to guarantee sk_free() won't free this sock until
2242         * all in-flight packets are completed.
2243         */
2244        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2245}
2246EXPORT_SYMBOL(skb_set_owner_w);
2247
2248static bool can_skb_orphan_partial(const struct sk_buff *skb)
2249{
2250#ifdef CONFIG_TLS_DEVICE
2251        /* Drivers depend on in-order delivery for crypto offload,
2252         * partial orphan breaks out-of-order-OK logic.
2253         */
2254        if (skb->decrypted)
2255                return false;
2256#endif
2257        return (skb->destructor == sock_wfree ||
2258                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2259}
2260
2261/* This helper is used by netem, as it can hold packets in its
2262 * delay queue. We want to allow the owner socket to send more
2263 * packets, as if they were already TX completed by a typical driver.
2264 * But we also want to keep skb->sk set because some packet schedulers
2265 * rely on it (sch_fq for example).
2266 */
2267void skb_orphan_partial(struct sk_buff *skb)
2268{
2269        if (skb_is_tcp_pure_ack(skb))
2270                return;
2271
2272        if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2273                return;
2274
2275        skb_orphan(skb);
2276}
2277EXPORT_SYMBOL(skb_orphan_partial);
2278
2279/*
2280 * Read buffer destructor automatically called from kfree_skb.
2281 */
2282void sock_rfree(struct sk_buff *skb)
2283{
2284        struct sock *sk = skb->sk;
2285        unsigned int len = skb->truesize;
2286
2287        atomic_sub(len, &sk->sk_rmem_alloc);
2288        sk_mem_uncharge(sk, len);
2289}
2290EXPORT_SYMBOL(sock_rfree);
2291
2292/*
2293 * Buffer destructor for skbs that are not used directly in read or write
2294 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2295 */
2296void sock_efree(struct sk_buff *skb)
2297{
2298        sock_put(skb->sk);
2299}
2300EXPORT_SYMBOL(sock_efree);
2301
2302/* Buffer destructor for prefetch/receive path where reference count may
2303 * not be held, e.g. for listen sockets.
2304 */
2305#ifdef CONFIG_INET
2306void sock_pfree(struct sk_buff *skb)
2307{
2308        if (sk_is_refcounted(skb->sk))
2309                sock_gen_put(skb->sk);
2310}
2311EXPORT_SYMBOL(sock_pfree);
2312#endif /* CONFIG_INET */
2313
2314kuid_t sock_i_uid(struct sock *sk)
2315{
2316        kuid_t uid;
2317
2318        read_lock_bh(&sk->sk_callback_lock);
2319        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2320        read_unlock_bh(&sk->sk_callback_lock);
2321        return uid;
2322}
2323EXPORT_SYMBOL(sock_i_uid);
2324
2325unsigned long sock_i_ino(struct sock *sk)
2326{
2327        unsigned long ino;
2328
2329        read_lock_bh(&sk->sk_callback_lock);
2330        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2331        read_unlock_bh(&sk->sk_callback_lock);
2332        return ino;
2333}
2334EXPORT_SYMBOL(sock_i_ino);
2335
2336/*
2337 * Allocate a skb from the socket's send buffer.
2338 */
2339struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2340                             gfp_t priority)
2341{
2342        if (force ||
2343            refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2344                struct sk_buff *skb = alloc_skb(size, priority);
2345
2346                if (skb) {
2347                        skb_set_owner_w(skb, sk);
2348                        return skb;
2349                }
2350        }
2351        return NULL;
2352}
2353EXPORT_SYMBOL(sock_wmalloc);
2354
2355static void sock_ofree(struct sk_buff *skb)
2356{
2357        struct sock *sk = skb->sk;
2358
2359        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2360}
2361
2362struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2363                             gfp_t priority)
2364{
2365        struct sk_buff *skb;
2366
2367        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2368        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2369            sysctl_optmem_max)
2370                return NULL;
2371
2372        skb = alloc_skb(size, priority);
2373        if (!skb)
2374                return NULL;
2375
2376        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2377        skb->sk = sk;
2378        skb->destructor = sock_ofree;
2379        return skb;
2380}
2381
2382/*
2383 * Allocate a memory block from the socket's option memory buffer.
2384 */
2385void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2386{
2387        if ((unsigned int)size <= sysctl_optmem_max &&
2388            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2389                void *mem;
2390                /* First do the add, to avoid the race if kmalloc
2391                 * might sleep.
2392                 */
2393                atomic_add(size, &sk->sk_omem_alloc);
2394                mem = kmalloc(size, priority);
2395                if (mem)
2396                        return mem;
2397                atomic_sub(size, &sk->sk_omem_alloc);
2398        }
2399        return NULL;
2400}
2401EXPORT_SYMBOL(sock_kmalloc);
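
/*
 * Pairing sketch (illustrative): option memory obtained here is charged
 * to sk_omem_alloc and must be released with sock_kfree_s() (or
 * sock_kzfree_s() for sensitive data) using the same size, e.g.:
 *
 *	struct foo *f = sock_kmalloc(sk, sizeof(*f), GFP_KERNEL);
 *
 *	if (!f)
 *		return -ENOBUFS;
 *	// ... use f ...
 *	sock_kfree_s(sk, f, sizeof(*f));
 *
 * struct foo is a placeholder for the caller's own type.
 */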
2402
2403/* Free an option memory block. Note, we actually want the inline
2404 * here as this allows gcc to detect the nullify and fold away the
2405 * condition entirely.
2406 */
2407static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2408                                  const bool nullify)
2409{
2410        if (WARN_ON_ONCE(!mem))
2411                return;
2412        if (nullify)
2413                kfree_sensitive(mem);
2414        else
2415                kfree(mem);
2416        atomic_sub(size, &sk->sk_omem_alloc);
2417}
2418
2419void sock_kfree_s(struct sock *sk, void *mem, int size)
2420{
2421        __sock_kfree_s(sk, mem, size, false);
2422}
2423EXPORT_SYMBOL(sock_kfree_s);
2424
2425void sock_kzfree_s(struct sock *sk, void *mem, int size)
2426{
2427        __sock_kfree_s(sk, mem, size, true);
2428}
2429EXPORT_SYMBOL(sock_kzfree_s);
2430
2431/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2432 * I think these locks should be removed for datagram sockets.
2433 */
2434static long sock_wait_for_wmem(struct sock *sk, long timeo)
2435{
2436        DEFINE_WAIT(wait);
2437
2438        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2439        for (;;) {
2440                if (!timeo)
2441                        break;
2442                if (signal_pending(current))
2443                        break;
2444                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2445                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2446                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2447                        break;
2448                if (sk->sk_shutdown & SEND_SHUTDOWN)
2449                        break;
2450                if (sk->sk_err)
2451                        break;
2452                timeo = schedule_timeout(timeo);
2453        }
2454        finish_wait(sk_sleep(sk), &wait);
2455        return timeo;
2456}
2457
2458
2459/*
2460 *      Generic send/receive buffer handlers
2461 */
2462
2463struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2464                                     unsigned long data_len, int noblock,
2465                                     int *errcode, int max_page_order)
2466{
2467        struct sk_buff *skb;
2468        long timeo;
2469        int err;
2470
2471        timeo = sock_sndtimeo(sk, noblock);
2472        for (;;) {
2473                err = sock_error(sk);
2474                if (err != 0)
2475                        goto failure;
2476
2477                err = -EPIPE;
2478                if (sk->sk_shutdown & SEND_SHUTDOWN)
2479                        goto failure;
2480
2481                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2482                        break;
2483
2484                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2485                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2486                err = -EAGAIN;
2487                if (!timeo)
2488                        goto failure;
2489                if (signal_pending(current))
2490                        goto interrupted;
2491                timeo = sock_wait_for_wmem(sk, timeo);
2492        }
2493        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2494                                   errcode, sk->sk_allocation);
2495        if (skb)
2496                skb_set_owner_w(skb, sk);
2497        return skb;
2498
2499interrupted:
2500        err = sock_intr_errno(timeo);
2501failure:
2502        *errcode = err;
2503        return NULL;
2504}
2505EXPORT_SYMBOL(sock_alloc_send_pskb);
2506
2507struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2508                                    int noblock, int *errcode)
2509{
2510        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2511}
2512EXPORT_SYMBOL(sock_alloc_send_skb);
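
/*
 * Typical datagram-path sketch (illustrative): a sendmsg() handler
 * reserves protocol headroom and copies in the payload, e.g.:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 *
 * hlen stands for the protocol's header headroom.
 */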
2513
2514int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2515                     struct sockcm_cookie *sockc)
2516{
2517        u32 tsflags;
2518
2519        switch (cmsg->cmsg_type) {
2520        case SO_MARK:
2521                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2522                        return -EPERM;
2523                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2524                        return -EINVAL;
2525                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2526                break;
2527        case SO_TIMESTAMPING_OLD:
2528                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2529                        return -EINVAL;
2530
2531                tsflags = *(u32 *)CMSG_DATA(cmsg);
2532                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2533                        return -EINVAL;
2534
2535                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2536                sockc->tsflags |= tsflags;
2537                break;
2538        case SCM_TXTIME:
2539                if (!sock_flag(sk, SOCK_TXTIME))
2540                        return -EINVAL;
2541                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2542                        return -EINVAL;
2543                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2544                break;
2545        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2546        case SCM_RIGHTS:
2547        case SCM_CREDENTIALS:
2548                break;
2549        default:
2550                return -EINVAL;
2551        }
2552        return 0;
2553}
2554EXPORT_SYMBOL(__sock_cmsg_send);
2555
2556int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2557                   struct sockcm_cookie *sockc)
2558{
2559        struct cmsghdr *cmsg;
2560        int ret;
2561
2562        for_each_cmsghdr(cmsg, msg) {
2563                if (!CMSG_OK(msg, cmsg))
2564                        return -EINVAL;
2565                if (cmsg->cmsg_level != SOL_SOCKET)
2566                        continue;
2567                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2568                if (ret)
2569                        return ret;
2570        }
2571        return 0;
2572}
2573EXPORT_SYMBOL(sock_cmsg_send);
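
/*
 * Userspace sketch (illustrative): the SOL_SOCKET control messages
 * parsed above are supplied per sendmsg(2) call, e.g. for SO_MARK:
 *
 *	char buf[CMSG_SPACE(sizeof(uint32_t))];
 *	struct msghdr msg = { .msg_control = buf,
 *			      .msg_controllen = sizeof(buf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	*(uint32_t *)CMSG_DATA(cmsg) = 42;
 *
 * The sender needs CAP_NET_ADMIN, matching the ns_capable() check in
 * __sock_cmsg_send().
 */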
2574
2575static void sk_enter_memory_pressure(struct sock *sk)
2576{
2577        if (!sk->sk_prot->enter_memory_pressure)
2578                return;
2579
2580        sk->sk_prot->enter_memory_pressure(sk);
2581}
2582
2583static void sk_leave_memory_pressure(struct sock *sk)
2584{
2585        if (sk->sk_prot->leave_memory_pressure) {
2586                sk->sk_prot->leave_memory_pressure(sk);
2587        } else {
2588                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2589
2590                if (memory_pressure && READ_ONCE(*memory_pressure))
2591                        WRITE_ONCE(*memory_pressure, 0);
2592        }
2593}
2594
2595DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2596
2597/**
2598 * skb_page_frag_refill - check that a page_frag contains enough room
2599 * @sz: minimum size of the fragment we want to get
2600 * @pfrag: pointer to page_frag
2601 * @gfp: priority for memory allocation
2602 *
2603 * Note: While this allocator tries to use high order pages, there is
2604 * no guarantee that allocations succeed. Therefore, @sz MUST be
2605 * less than or equal to PAGE_SIZE.
2606 */
2607bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2608{
2609        if (pfrag->page) {
2610                if (page_ref_count(pfrag->page) == 1) {
2611                        pfrag->offset = 0;
2612                        return true;
2613                }
2614                if (pfrag->offset + sz <= pfrag->size)
2615                        return true;
2616                put_page(pfrag->page);
2617        }
2618
2619        pfrag->offset = 0;
2620        if (SKB_FRAG_PAGE_ORDER &&
2621            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2622                /* Avoid direct reclaim but allow kswapd to wake */
2623                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2624                                          __GFP_COMP | __GFP_NOWARN |
2625                                          __GFP_NORETRY,
2626                                          SKB_FRAG_PAGE_ORDER);
2627                if (likely(pfrag->page)) {
2628                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2629                        return true;
2630                }
2631        }
2632        pfrag->page = alloc_page(gfp);
2633        if (likely(pfrag->page)) {
2634                pfrag->size = PAGE_SIZE;
2635                return true;
2636        }
2637        return false;
2638}
2639EXPORT_SYMBOL(skb_page_frag_refill);
2640
2641bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2642{
2643        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2644                return true;
2645
2646        sk_enter_memory_pressure(sk);
2647        sk_stream_moderate_sndbuf(sk);
2648        return false;
2649}
2650EXPORT_SYMBOL(sk_page_frag_refill);
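
/*
 * Usage sketch (illustrative): stream senders grab the per-task or
 * per-socket frag via sk_page_frag() and append to it, e.g.:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	// copy up to pfrag->size - pfrag->offset bytes at
 *	// page_address(pfrag->page) + pfrag->offset, then
 *	// advance pfrag->offset
 */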
2651
2652void __lock_sock(struct sock *sk)
2653        __releases(&sk->sk_lock.slock)
2654        __acquires(&sk->sk_lock.slock)
2655{
2656        DEFINE_WAIT(wait);
2657
2658        for (;;) {
2659                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2660                                        TASK_UNINTERRUPTIBLE);
2661                spin_unlock_bh(&sk->sk_lock.slock);
2662                schedule();
2663                spin_lock_bh(&sk->sk_lock.slock);
2664                if (!sock_owned_by_user(sk))
2665                        break;
2666        }
2667        finish_wait(&sk->sk_lock.wq, &wait);
2668}
2669
2670void __release_sock(struct sock *sk)
2671        __releases(&sk->sk_lock.slock)
2672        __acquires(&sk->sk_lock.slock)
2673{
2674        struct sk_buff *skb, *next;
2675
2676        while ((skb = sk->sk_backlog.head) != NULL) {
2677                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2678
2679                spin_unlock_bh(&sk->sk_lock.slock);
2680
2681                do {
2682                        next = skb->next;
2683                        prefetch(next);
2684                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2685                        skb_mark_not_on_list(skb);
2686                        sk_backlog_rcv(sk, skb);
2687
2688                        cond_resched();
2689
2690                        skb = next;
2691                } while (skb != NULL);
2692
2693                spin_lock_bh(&sk->sk_lock.slock);
2694        }
2695
2696        /*
2697         * Doing the zeroing here guarantees we cannot loop forever
2698         * while a wild producer attempts to flood us.
2699         */
2700        sk->sk_backlog.len = 0;
2701}
2702
2703void __sk_flush_backlog(struct sock *sk)
2704{
2705        spin_lock_bh(&sk->sk_lock.slock);
2706        __release_sock(sk);
2707        spin_unlock_bh(&sk->sk_lock.slock);
2708}
2709
2710/**
2711 * sk_wait_data - wait for data to arrive at sk_receive_queue
2712 * @sk:    sock to wait on
2713 * @timeo: for how long
2714 * @skb:   last skb seen on sk_receive_queue
2715 *
2716 * Now socket state including sk->sk_err is changed only under lock,
2717 * hence we may omit checks after joining the wait queue.
2718 * We check the receive queue before schedule() only as an optimization;
2719 * it is very likely that release_sock() added new data.
2720 */
2721int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2722{
2723        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2724        int rc;
2725
2726        add_wait_queue(sk_sleep(sk), &wait);
2727        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2728        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2729        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2730        remove_wait_queue(sk_sleep(sk), &wait);
2731        return rc;
2732}
2733EXPORT_SYMBOL(sk_wait_data);
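
/*
 * Usage sketch (illustrative): a blocking recvmsg() handler typically
 * loops until data arrives or the timeout elapses, e.g.:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *
 * The socket lock must be held; sk_wait_event() drops and retakes it
 * around schedule().
 */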
2734
2735/**
2736 *      __sk_mem_raise_allocated - increase memory_allocated
2737 *      @sk: socket
2738 *      @size: memory size to allocate
2739 *      @amt: pages to allocate
2740 *      @kind: allocation type
2741 *
2742 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2743 */
2744int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2745{
2746        struct proto *prot = sk->sk_prot;
2747        long allocated = sk_memory_allocated_add(sk, amt);
2748        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2749        bool charged = true;
2750
2751        if (memcg_charge &&
2752            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2753                                                gfp_memcg_charge())))
2754                goto suppress_allocation;
2755
2756        /* Under limit. */
2757        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2758                sk_leave_memory_pressure(sk);
2759                return 1;
2760        }
2761
2762        /* Under pressure. */
2763        if (allocated > sk_prot_mem_limits(sk, 1))
2764                sk_enter_memory_pressure(sk);
2765
2766        /* Over hard limit. */
2767        if (allocated > sk_prot_mem_limits(sk, 2))
2768                goto suppress_allocation;
2769
2770        /* guarantee minimum buffer size under pressure */
2771        if (kind == SK_MEM_RECV) {
2772                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2773                        return 1;
2774
2775        } else { /* SK_MEM_SEND */
2776                int wmem0 = sk_get_wmem0(sk, prot);
2777
2778                if (sk->sk_type == SOCK_STREAM) {
2779                        if (sk->sk_wmem_queued < wmem0)
2780                                return 1;
2781                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2782                        return 1;
2783                }
2784        }
2785
2786        if (sk_has_memory_pressure(sk)) {
2787                u64 alloc;
2788
2789                if (!sk_under_memory_pressure(sk))
2790                        return 1;
2791                alloc = sk_sockets_allocated_read_positive(sk);
2792                if (sk_prot_mem_limits(sk, 2) > alloc *
2793                    sk_mem_pages(sk->sk_wmem_queued +
2794                                 atomic_read(&sk->sk_rmem_alloc) +
2795                                 sk->sk_forward_alloc))
2796                        return 1;
2797        }
2798
2799suppress_allocation:
2800
2801        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2802                sk_stream_moderate_sndbuf(sk);
2803
2804                /* Fail only if socket is _under_ its sndbuf.
2805                 * In this case we cannot block, so we have to fail.
2806                 */
2807                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2808                        /* Force charge with __GFP_NOFAIL */
2809                        if (memcg_charge && !charged) {
2810                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2811                                        gfp_memcg_charge() | __GFP_NOFAIL);
2812                        }
2813                        return 1;
2814                }
2815        }
2816
2817        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2818                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2819
2820        sk_memory_allocated_sub(sk, amt);
2821
2822        if (memcg_charge && charged)
2823                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2824
2825        return 0;
2826}
2827EXPORT_SYMBOL(__sk_mem_raise_allocated);
2828
2829/**
2830 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2831 *      @sk: socket
2832 *      @size: memory size to allocate
2833 *      @kind: allocation type
2834 *
2835 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2836 *      rmem allocation. This function assumes that protocols which have
2837 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2838 */
2839int __sk_mem_schedule(struct sock *sk, int size, int kind)
2840{
2841        int ret, amt = sk_mem_pages(size);
2842
2843        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2844        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2845        if (!ret)
2846                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2847        return ret;
2848}
2849EXPORT_SYMBOL(__sk_mem_schedule);
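
/*
 * Worked example (illustrative, assuming SK_MEM_QUANTUM == PAGE_SIZE
 * == 4096): __sk_mem_schedule(sk, 3000, SK_MEM_SEND) computes
 * amt = sk_mem_pages(3000) = 1, optimistically adds 1 << 12 = 4096
 * bytes to sk_forward_alloc, and rolls that back if
 * __sk_mem_raise_allocated() refuses the page.
 */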
2850
2851/**
2852 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2853 *      @sk: socket
2854 *      @amount: number of quanta
2855 *
2856 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2857 */
2858void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2859{
2860        sk_memory_allocated_sub(sk, amount);
2861
2862        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2863                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2864
2865        if (sk_under_memory_pressure(sk) &&
2866            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2867                sk_leave_memory_pressure(sk);
2868}
2869EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2870
2871/**
2872 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2873 *      @sk: socket
2874 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2875 */
2876void __sk_mem_reclaim(struct sock *sk, int amount)
2877{
2878        amount >>= SK_MEM_QUANTUM_SHIFT;
2879        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2880        __sk_mem_reduce_allocated(sk, amount);
2881}
2882EXPORT_SYMBOL(__sk_mem_reclaim);
2883
2884int sk_set_peek_off(struct sock *sk, int val)
2885{
2886        sk->sk_peek_off = val;
2887        return 0;
2888}
2889EXPORT_SYMBOL_GPL(sk_set_peek_off);
2890
2891/*
2892 * Set of default routines for initialising struct proto_ops when
2893 * the protocol does not support a particular function. In certain
2894 * cases where it makes no sense for a protocol to have a "do nothing"
2895 * function, some default processing is provided.
2896 */
2897
2898int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2899{
2900        return -EOPNOTSUPP;
2901}
2902EXPORT_SYMBOL(sock_no_bind);
2903
2904int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2905                    int len, int flags)
2906{
2907        return -EOPNOTSUPP;
2908}
2909EXPORT_SYMBOL(sock_no_connect);
2910
2911int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2912{
2913        return -EOPNOTSUPP;
2914}
2915EXPORT_SYMBOL(sock_no_socketpair);
2916
2917int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2918                   bool kern)
2919{
2920        return -EOPNOTSUPP;
2921}
2922EXPORT_SYMBOL(sock_no_accept);
2923
2924int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2925                    int peer)
2926{
2927        return -EOPNOTSUPP;
2928}
2929EXPORT_SYMBOL(sock_no_getname);
2930
2931int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2932{
2933        return -EOPNOTSUPP;
2934}
2935EXPORT_SYMBOL(sock_no_ioctl);
2936
2937int sock_no_listen(struct socket *sock, int backlog)
2938{
2939        return -EOPNOTSUPP;
2940}
2941EXPORT_SYMBOL(sock_no_listen);
2942
2943int sock_no_shutdown(struct socket *sock, int how)
2944{
2945        return -EOPNOTSUPP;
2946}
2947EXPORT_SYMBOL(sock_no_shutdown);
2948
2949int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2950{
2951        return -EOPNOTSUPP;
2952}
2953EXPORT_SYMBOL(sock_no_sendmsg);
2954
2955int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2956{
2957        return -EOPNOTSUPP;
2958}
2959EXPORT_SYMBOL(sock_no_sendmsg_locked);
2960
2961int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2962                    int flags)
2963{
2964        return -EOPNOTSUPP;
2965}
2966EXPORT_SYMBOL(sock_no_recvmsg);
2967
2968int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2969{
2970        /* Mirror missing mmap method error code */
2971        return -ENODEV;
2972}
2973EXPORT_SYMBOL(sock_no_mmap);
2974
2975/*
2976 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2977 * various sock-based usage counts.
2978 */
2979void __receive_sock(struct file *file)
2980{
2981        struct socket *sock;
2982
2983        sock = sock_from_file(file);
2984        if (sock) {
2985                sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2986                sock_update_classid(&sock->sk->sk_cgrp_data);
2987        }
2988}
2989
2990ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2991{
2992        ssize_t res;
2993        struct msghdr msg = {.msg_flags = flags};
2994        struct kvec iov;
2995        char *kaddr = kmap(page);
2996        iov.iov_base = kaddr + offset;
2997        iov.iov_len = size;
2998        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2999        kunmap(page);
3000        return res;
3001}
3002EXPORT_SYMBOL(sock_no_sendpage);
3003
3004ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3005                                int offset, size_t size, int flags)
3006{
3007        ssize_t res;
3008        struct msghdr msg = {.msg_flags = flags};
3009        struct kvec iov;
3010        char *kaddr = kmap(page);
3011
3012        iov.iov_base = kaddr + offset;
3013        iov.iov_len = size;
3014        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3015        kunmap(page);
3016        return res;
3017}
3018EXPORT_SYMBOL(sock_no_sendpage_locked);
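
/*
 * Wiring sketch (illustrative): a protocol that supports only a subset
 * of operations plugs these stubs into its proto_ops, e.g.:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_PACKET,	// placeholder family
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		// ... real handlers for the operations it supports ...
 *	};
 */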
3019
3020/*
3021 *      Default Socket Callbacks
3022 */
3023
3024static void sock_def_wakeup(struct sock *sk)
3025{
3026        struct socket_wq *wq;
3027
3028        rcu_read_lock();
3029        wq = rcu_dereference(sk->sk_wq);
3030        if (skwq_has_sleeper(wq))
3031                wake_up_interruptible_all(&wq->wait);
3032        rcu_read_unlock();
3033}
3034
3035static void sock_def_error_report(struct sock *sk)
3036{
3037        struct socket_wq *wq;
3038
3039        rcu_read_lock();
3040        wq = rcu_dereference(sk->sk_wq);
3041        if (skwq_has_sleeper(wq))
3042                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3043        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3044        rcu_read_unlock();
3045}
3046
3047void sock_def_readable(struct sock *sk)
3048{
3049        struct socket_wq *wq;
3050
3051        rcu_read_lock();
3052        wq = rcu_dereference(sk->sk_wq);
3053        if (skwq_has_sleeper(wq))
3054                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3055                                                EPOLLRDNORM | EPOLLRDBAND);
3056        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3057        rcu_read_unlock();
3058}
3059
3060static void sock_def_write_space(struct sock *sk)
3061{
3062        struct socket_wq *wq;
3063
3064        rcu_read_lock();
3065
3066        /* Do not wake up a writer until he can make "significant"
3067         * progress.  --DaveM
3068         */
3069        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3070                wq = rcu_dereference(sk->sk_wq);
3071                if (skwq_has_sleeper(wq))
3072                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3073                                                EPOLLWRNORM | EPOLLWRBAND);
3074
3075                /* Should agree with poll, otherwise some programs break */
3076                if (sock_writeable(sk))
3077                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3078        }
3079
3080        rcu_read_unlock();
3081}
3082
3083static void sock_def_destruct(struct sock *sk)
3084{
3085}
3086
3087void sk_send_sigurg(struct sock *sk)
3088{
3089        if (sk->sk_socket && sk->sk_socket->file)
3090                if (send_sigurg(&sk->sk_socket->file->f_owner))
3091                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3092}
3093EXPORT_SYMBOL(sk_send_sigurg);
3094
3095void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3096                    unsigned long expires)
3097{
3098        if (!mod_timer(timer, expires))
3099                sock_hold(sk);
3100}
3101EXPORT_SYMBOL(sk_reset_timer);
3102
3103void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3104{
3105        if (del_timer(timer))
3106                __sock_put(sk);
3107}
3108EXPORT_SYMBOL(sk_stop_timer);
3109
3110void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3111{
3112        if (del_timer_sync(timer))
3113                __sock_put(sk);
3114}
3115EXPORT_SYMBOL(sk_stop_timer_sync);
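
/*
 * Refcount sketch (illustrative): sk_reset_timer() takes a reference
 * when it arms a previously idle timer, so a timer callback that is
 * done with the socket must drop it, e.g.:
 *
 *	static void example_timer(struct timer_list *t)
 *	{
 *		struct sock *sk = from_timer(sk, t, sk_timer);
 *
 *		// ... protocol work ...
 *		sock_put(sk);	// pairs with sock_hold() in sk_reset_timer()
 *	}
 */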
3116
3117void sock_init_data(struct socket *sock, struct sock *sk)
3118{
3119        sk_init_common(sk);
3120        sk->sk_send_head        =       NULL;
3121
3122        timer_setup(&sk->sk_timer, NULL, 0);
3123
3124        sk->sk_allocation       =       GFP_KERNEL;
3125        sk->sk_rcvbuf           =       sysctl_rmem_default;
3126        sk->sk_sndbuf           =       sysctl_wmem_default;
3127        sk->sk_state            =       TCP_CLOSE;
3128        sk_set_socket(sk, sock);
3129
3130        sock_set_flag(sk, SOCK_ZAPPED);
3131
3132        if (sock) {
3133                sk->sk_type     =       sock->type;
3134                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3135                sock->sk        =       sk;
3136                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
3137        } else {
3138                RCU_INIT_POINTER(sk->sk_wq, NULL);
3139                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
3140        }
3141
3142        rwlock_init(&sk->sk_callback_lock);
3143        if (sk->sk_kern_sock)
3144                lockdep_set_class_and_name(
3145                        &sk->sk_callback_lock,
3146                        af_kern_callback_keys + sk->sk_family,
3147                        af_family_kern_clock_key_strings[sk->sk_family]);
3148        else
3149                lockdep_set_class_and_name(
3150                        &sk->sk_callback_lock,
3151                        af_callback_keys + sk->sk_family,
3152                        af_family_clock_key_strings[sk->sk_family]);
3153
3154        sk->sk_state_change     =       sock_def_wakeup;
3155        sk->sk_data_ready       =       sock_def_readable;
3156        sk->sk_write_space      =       sock_def_write_space;
3157        sk->sk_error_report     =       sock_def_error_report;
3158        sk->sk_destruct         =       sock_def_destruct;
3159
3160        sk->sk_frag.page        =       NULL;
3161        sk->sk_frag.offset      =       0;
3162        sk->sk_peek_off         =       -1;
3163
3164        sk->sk_peer_pid         =       NULL;
3165        sk->sk_peer_cred        =       NULL;
3166        spin_lock_init(&sk->sk_peer_lock);
3167
3168        sk->sk_write_pending    =       0;
3169        sk->sk_rcvlowat         =       1;
3170        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3171        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3172
3173        sk->sk_stamp = SK_DEFAULT_STAMP;
3174#if BITS_PER_LONG == 32
3175        seqlock_init(&sk->sk_stamp_seq);
3176#endif
3177        atomic_set(&sk->sk_zckey, 0);
3178
3179#ifdef CONFIG_NET_RX_BUSY_POLL
3180        sk->sk_napi_id          =       0;
3181        sk->sk_ll_usec          =       sysctl_net_busy_read;
3182#endif
3183
3184        sk->sk_max_pacing_rate = ~0UL;
3185        sk->sk_pacing_rate = ~0UL;
3186        WRITE_ONCE(sk->sk_pacing_shift, 10);
3187        sk->sk_incoming_cpu = -1;
3188
3189        sk_rx_queue_clear(sk);
3190        /*
3191         * Before updating sk_refcnt, we must commit prior changes to memory
3192         * (Documentation/RCU/rculist_nulls.rst for details)
3193         */
3194        smp_wmb();
3195        refcount_set(&sk->sk_refcnt, 1);
3196        atomic_set(&sk->sk_drops, 0);
3197}
3198EXPORT_SYMBOL(sock_init_data);
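
/*
 * Illustrative sketch: an address family's create() hook typically lets
 * sock_init_data() install the defaults above and then overrides only
 * what it needs.  The "my_*" names and PF_MY are hypothetical:
 *
 *        static int my_create(struct net *net, struct socket *sock,
 *                             int protocol, int kern)
 *        {
 *                struct sock *sk;
 *
 *                sk = sk_alloc(net, PF_MY, GFP_KERNEL, &my_proto, kern);
 *                if (!sk)
 *                        return -ENOBUFS;
 *                sock_init_data(sock, sk);
 *                sk->sk_destruct = my_sock_destruct;
 *                return 0;
 *        }
 */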
3199
3200void lock_sock_nested(struct sock *sk, int subclass)
3201{
3202        /* The sk_lock has mutex_lock() semantics here. */
3203        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3204
3205        might_sleep();
3206        spin_lock_bh(&sk->sk_lock.slock);
3207        if (sk->sk_lock.owned)
3208                __lock_sock(sk);
3209        sk->sk_lock.owned = 1;
3210        spin_unlock_bh(&sk->sk_lock.slock);
3211}
3212EXPORT_SYMBOL(lock_sock_nested);
3213
3214void release_sock(struct sock *sk)
3215{
3216        spin_lock_bh(&sk->sk_lock.slock);
3217        if (sk->sk_backlog.tail)
3218                __release_sock(sk);
3219
3220        /* Warning: release_cb() might need to release sk ownership,
3221         * i.e. call sock_release_ownership(sk) before us.
3222         */
3223        if (sk->sk_prot->release_cb)
3224                sk->sk_prot->release_cb(sk);
3225
3226        sock_release_ownership(sk);
3227        if (waitqueue_active(&sk->sk_lock.wq))
3228                wake_up(&sk->sk_lock.wq);
3229        spin_unlock_bh(&sk->sk_lock.slock);
3230}
3231EXPORT_SYMBOL(release_sock);
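
/*
 * Illustrative sketch of the canonical process-context pairing of the two
 * helpers above:
 *
 *        lock_sock(sk);           owner set; softirqs queue to the backlog
 *        ... read/modify socket state, may sleep ...
 *        release_sock(sk);        replays the backlog, wakes other lockers
 */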
3232
3233bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3234{
3235        might_sleep();
3236        spin_lock_bh(&sk->sk_lock.slock);
3237
3238        if (!sk->sk_lock.owned) {
3239                /*
3240                 * Fast path return with bottom halves disabled and
3241                 * sock::sk_lock.slock held.
3242                 *
3243                 * The 'mutex' is not contended and holding
3244                 * sock::sk_lock.slock prevents all other lockers from
3245                 * proceeding, so the corresponding unlock_sock_fast() can
3246                 * avoid the slow path of release_sock() completely and
3247                 * just release slock.
3248                 *
3249                 * From a semantic point of view this is equivalent to 'acquiring'
3250                 * the 'mutex', hence the corresponding lockdep
3251                 * mutex_release() has to happen in the fast path of
3252                 * unlock_sock_fast().
3253                 */
3254                return false;
3255        }
3256
3257        __lock_sock(sk);
3258        sk->sk_lock.owned = 1;
3259        __acquire(&sk->sk_lock.slock);
3260        spin_unlock_bh(&sk->sk_lock.slock);
3261        return true;
3262}
3263EXPORT_SYMBOL(__lock_sock_fast);
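
/*
 * Illustrative sketch of the intended pairing (lock_sock_fast() and
 * unlock_sock_fast() are the inline wrappers in include/net/sock.h):
 *
 *        bool slow = lock_sock_fast(sk);
 *        ... short critical section ...
 *        unlock_sock_fast(sk, slow);
 *
 * When "slow" is false the section runs entirely under sk_lock.slock with
 * BHs disabled, so it must not sleep.
 */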
3264
3265int sock_gettstamp(struct socket *sock, void __user *userstamp,
3266                   bool timeval, bool time32)
3267{
3268        struct sock *sk = sock->sk;
3269        struct timespec64 ts;
3270
3271        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3272        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3273        if (ts.tv_sec == -1)
3274                return -ENOENT;
3275        if (ts.tv_sec == 0) {
3276                ktime_t kt = ktime_get_real();
3277                sock_write_timestamp(sk, kt);
3278                ts = ktime_to_timespec64(kt);
3279        }
3280
3281        if (timeval)
3282                ts.tv_nsec /= 1000;
3283
3284#ifdef CONFIG_COMPAT_32BIT_TIME
3285        if (time32)
3286                return put_old_timespec32(&ts, userstamp);
3287#endif
3288#ifdef CONFIG_SPARC64
3289        /* beware of padding in sparc64 timeval */
3290        if (timeval && !in_compat_syscall()) {
3291                struct __kernel_old_timeval __user tv = {
3292                        .tv_sec = ts.tv_sec,
3293                        .tv_usec = ts.tv_nsec,
3294                };
3295                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3296                        return -EFAULT;
3297                return 0;
3298        }
3299#endif
3300        return put_timespec64(&ts, userstamp);
3301}
3302EXPORT_SYMBOL(sock_gettstamp);
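
/*
 * Illustrative sketch of the user-space consumer: sock_gettstamp() backs
 * the SIOCGSTAMP family of ioctls, which fetch the receive time of the
 * last packet read from a socket:
 *
 *        struct timeval tv;
 *
 *        recv(fd, buf, sizeof(buf), 0);
 *        if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *                printf("rx at %ld.%06ld\n", (long)tv.tv_sec,
 *                       (long)tv.tv_usec);
 *
 * The -ENOENT above is what user space sees before any packet has been
 * timestamped.
 */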
3303
3304void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3305{
3306        if (!sock_flag(sk, flag)) {
3307                unsigned long previous_flags = sk->sk_flags;
3308
3309                sock_set_flag(sk, flag);
3310                /*
3311                 * We just set one of the two flags which require net
3312                 * time stamping, but time stamping might have been on
3313                 * already because of the other one.
3314                 */
3315                if (sock_needs_netstamp(sk) &&
3316                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3317                        net_enable_timestamp();
3318        }
3319}
3320
3321int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3322                       int level, int type)
3323{
3324        struct sock_exterr_skb *serr;
3325        struct sk_buff *skb;
3326        int copied, err;
3327
3328        err = -EAGAIN;
3329        skb = sock_dequeue_err_skb(sk);
3330        if (skb == NULL)
3331                goto out;
3332
3333        copied = skb->len;
3334        if (copied > len) {
3335                msg->msg_flags |= MSG_TRUNC;
3336                copied = len;
3337        }
3338        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3339        if (err)
3340                goto out_free_skb;
3341
3342        sock_recv_timestamp(msg, sk, skb);
3343
3344        serr = SKB_EXT_ERR(skb);
3345        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3346
3347        msg->msg_flags |= MSG_ERRQUEUE;
3348        err = copied;
3349
3350out_free_skb:
3351        kfree_skb(skb);
3352out:
3353        return err;
3354}
3355EXPORT_SYMBOL(sock_recv_errqueue);
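
/*
 * Illustrative sketch of the matching user-space read: the application
 * drains the error queue with MSG_ERRQUEUE and finds the
 * sock_extended_err in the control data (the level/type depend on the
 * protocol, e.g. IP_RECVERR):
 *
 *        char cbuf[CMSG_SPACE(sizeof(struct sock_extended_err))];
 *        struct msghdr msg = { .msg_control = cbuf,
 *                              .msg_controllen = sizeof(cbuf) };
 *        struct cmsghdr *cm;
 *
 *        if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0)
 *                for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *                        ... inspect (struct sock_extended_err *)CMSG_DATA(cm) ...
 */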
3356
3357/*
3358 *      Get a socket option on a socket.
3359 *
3360 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3361 *      asynchronous errors should be reported by getsockopt. We assume
3362 *      this means if you specify SO_ERROR (otherwise what's the point of it).
3363 */
3364int sock_common_getsockopt(struct socket *sock, int level, int optname,
3365                           char __user *optval, int __user *optlen)
3366{
3367        struct sock *sk = sock->sk;
3368
3369        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3370}
3371EXPORT_SYMBOL(sock_common_getsockopt);
3372
3373int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3374                        int flags)
3375{
3376        struct sock *sk = sock->sk;
3377        int addr_len = 0;
3378        int err;
3379
3380        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3381                                   flags & ~MSG_DONTWAIT, &addr_len);
3382        if (err >= 0)
3383                msg->msg_namelen = addr_len;
3384        return err;
3385}
3386EXPORT_SYMBOL(sock_common_recvmsg);
3387
3388/*
3389 *      Set socket options on an inet socket.
3390 */
3391int sock_common_setsockopt(struct socket *sock, int level, int optname,
3392                           sockptr_t optval, unsigned int optlen)
3393{
3394        struct sock *sk = sock->sk;
3395
3396        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3397}
3398EXPORT_SYMBOL(sock_common_setsockopt);
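
/*
 * Illustrative sketch: protocols whose struct proto implements
 * getsockopt/setsockopt/recvmsg directly can point their proto_ops at the
 * three common wrappers above, as the inet stream and dgram ops do.
 * "my_ops" is a hypothetical table:
 *
 *        static const struct proto_ops my_ops = {
 *                ...
 *                .setsockopt = sock_common_setsockopt,
 *                .getsockopt = sock_common_getsockopt,
 *                .recvmsg    = sock_common_recvmsg,
 *                ...
 *        };
 */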
3399
3400void sk_common_release(struct sock *sk)
3401{
3402        if (sk->sk_prot->destroy)
3403                sk->sk_prot->destroy(sk);
3404
3405        /*
3406         * Observation: when sk_common_release() is called, processes no
3407         * longer have access to the socket, but the network stack still
3408         * does. Step one, detach it from networking:
3409         *
3410         * A. Remove from hash tables.
3411         */
3412
3413        sk->sk_prot->unhash(sk);
3414
3415        /*
3416         * At this point the socket cannot receive new packets, but some may
3417         * still be in flight, because a CPU running the receiver did its
3418         * hash table lookup before we unhashed the socket. They will reach
3419         * the receive queue and be purged by the socket destructor.
3420         *
3421         * We may also still have packets pending on the receive queue and,
3422         * probably, our own packets waiting in device queues. sock_destroy
3423         * will drain the receive queue, but transmitted packets will delay
3424         * socket destruction until the last reference is released.
3425         */
3426
3427        sock_orphan(sk);
3428
3429        xfrm_sk_free_policy(sk);
3430
3431        sk_refcnt_debug_release(sk);
3432
3433        sock_put(sk);
3434}
3435EXPORT_SYMBOL(sk_common_release);
3436
3437void sk_get_meminfo(const struct sock *sk, u32 *mem)
3438{
3439        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3440
3441        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3442        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3443        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3444        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3445        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3446        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3447        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3448        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3449        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3450}
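
/*
 * Illustrative sketch: sock_diag is the typical consumer, copying the
 * filled array straight into a netlink attribute, roughly:
 *
 *        u32 mem[SK_MEMINFO_VARS];
 *
 *        sk_get_meminfo(sk, mem);
 *        nla_put(skb, attrtype, sizeof(mem), mem);
 */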
3451
3452#ifdef CONFIG_PROC_FS
3453#define PROTO_INUSE_NR  64      /* should be enough for now */
3454struct prot_inuse {
3455        int val[PROTO_INUSE_NR];
3456};
3457
3458static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3459
3460void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3461{
3462        __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3463}
3464EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3465
3466int sock_prot_inuse_get(struct net *net, struct proto *prot)
3467{
3468        int cpu, idx = prot->inuse_idx;
3469        int res = 0;
3470
3471        for_each_possible_cpu(cpu)
3472                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3473
3474        return res >= 0 ? res : 0;
3475}
3476EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
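
/*
 * Illustrative sketch: a protocol's hash()/unhash() hooks keep this
 * counter balanced, roughly:
 *
 *        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);     on hash
 *        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);    on unhash
 *
 * Per-CPU deltas may be transiently negative, hence the clamp to zero in
 * sock_prot_inuse_get().
 */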
3477
3478static void sock_inuse_add(struct net *net, int val)
3479{
3480        this_cpu_add(*net->core.sock_inuse, val);
3481}
3482
3483int sock_inuse_get(struct net *net)
3484{
3485        int cpu, res = 0;
3486
3487        for_each_possible_cpu(cpu)
3488                res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3489
3490        return res;
3491}
3492
3493EXPORT_SYMBOL_GPL(sock_inuse_get);
3494
3495static int __net_init sock_inuse_init_net(struct net *net)
3496{
3497        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3498        if (net->core.prot_inuse == NULL)
3499                return -ENOMEM;
3500
3501        net->core.sock_inuse = alloc_percpu(int);
3502        if (net->core.sock_inuse == NULL)
3503                goto out;
3504
3505        return 0;
3506
3507out:
3508        free_percpu(net->core.prot_inuse);
3509        return -ENOMEM;
3510}
3511
3512static void __net_exit sock_inuse_exit_net(struct net *net)
3513{
3514        free_percpu(net->core.prot_inuse);
3515        free_percpu(net->core.sock_inuse);
3516}
3517
3518static struct pernet_operations net_inuse_ops = {
3519        .init = sock_inuse_init_net,
3520        .exit = sock_inuse_exit_net,
3521};
3522
3523static __init int net_inuse_init(void)
3524{
3525        if (register_pernet_subsys(&net_inuse_ops))
3526                panic("Cannot initialize net inuse counters");
3527
3528        return 0;
3529}
3530
3531core_initcall(net_inuse_init);
3532
3533static int assign_proto_idx(struct proto *prot)
3534{
3535        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3536
3537        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3538                pr_err("PROTO_INUSE_NR exhausted\n");
3539                return -ENOSPC;
3540        }
3541
3542        set_bit(prot->inuse_idx, proto_inuse_idx);
3543        return 0;
3544}
3545
3546static void release_proto_idx(struct proto *prot)
3547{
3548        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3549                clear_bit(prot->inuse_idx, proto_inuse_idx);
3550}
3551#else
3552static inline int assign_proto_idx(struct proto *prot)
3553{
3554        return 0;
3555}
3556
3557static inline void release_proto_idx(struct proto *prot)
3558{
3559}
3560
3561static void sock_inuse_add(struct net *net, int val)
3562{
3563}
3564#endif
3565
3566static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3567{
3568        if (!twsk_prot)
3569                return;
3570        kfree(twsk_prot->twsk_slab_name);
3571        twsk_prot->twsk_slab_name = NULL;
3572        kmem_cache_destroy(twsk_prot->twsk_slab);
3573        twsk_prot->twsk_slab = NULL;
3574}
3575
3576static int tw_prot_init(const struct proto *prot)
3577{
3578        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3579
3580        if (!twsk_prot)
3581                return 0;
3582
3583        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3584                                              prot->name);
3585        if (!twsk_prot->twsk_slab_name)
3586                return -ENOMEM;
3587
3588        twsk_prot->twsk_slab =
3589                kmem_cache_create(twsk_prot->twsk_slab_name,
3590                                  twsk_prot->twsk_obj_size, 0,
3591                                  SLAB_ACCOUNT | prot->slab_flags,
3592                                  NULL);
3593        if (!twsk_prot->twsk_slab) {
3594                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3595                        prot->name);
3596                return -ENOMEM;
3597        }
3598
3599        return 0;
3600}
3601
3602static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3603{
3604        if (!rsk_prot)
3605                return;
3606        kfree(rsk_prot->slab_name);
3607        rsk_prot->slab_name = NULL;
3608        kmem_cache_destroy(rsk_prot->slab);
3609        rsk_prot->slab = NULL;
3610}
3611
3612static int req_prot_init(const struct proto *prot)
3613{
3614        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3615
3616        if (!rsk_prot)
3617                return 0;
3618
3619        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3620                                        prot->name);
3621        if (!rsk_prot->slab_name)
3622                return -ENOMEM;
3623
3624        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3625                                           rsk_prot->obj_size, 0,
3626                                           SLAB_ACCOUNT | prot->slab_flags,
3627                                           NULL);
3628
3629        if (!rsk_prot->slab) {
3630                pr_crit("%s: Can't create request sock SLAB cache!\n",
3631                        prot->name);
3632                return -ENOMEM;
3633        }
3634        return 0;
3635}
3636
3637int proto_register(struct proto *prot, int alloc_slab)
3638{
3639        int ret = -ENOBUFS;
3640
3641        if (alloc_slab) {
3642                prot->slab = kmem_cache_create_usercopy(prot->name,
3643                                        prot->obj_size, 0,
3644                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3645                                        prot->slab_flags,
3646                                        prot->useroffset, prot->usersize,
3647                                        NULL);
3648
3649                if (prot->slab == NULL) {
3650                        pr_crit("%s: Can't create sock SLAB cache!\n",
3651                                prot->name);
3652                        goto out;
3653                }
3654
3655                if (req_prot_init(prot))
3656                        goto out_free_request_sock_slab;
3657
3658                if (tw_prot_init(prot))
3659                        goto out_free_timewait_sock_slab;
3660        }
3661
3662        mutex_lock(&proto_list_mutex);
3663        ret = assign_proto_idx(prot);
3664        if (ret) {
3665                mutex_unlock(&proto_list_mutex);
3666                goto out_free_timewait_sock_slab;
3667        }
3668        list_add(&prot->node, &proto_list);
3669        mutex_unlock(&proto_list_mutex);
3670        return ret;
3671
3672out_free_timewait_sock_slab:
3673        if (alloc_slab)
3674                tw_prot_cleanup(prot->twsk_prot);
3675out_free_request_sock_slab:
3676        if (alloc_slab) {
3677                req_prot_cleanup(prot->rsk_prot);
3678
3679                kmem_cache_destroy(prot->slab);
3680                prot->slab = NULL;
3681        }
3682out:
3683        return ret;
3684}
3685EXPORT_SYMBOL(proto_register);
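
/*
 * Illustrative sketch: a protocol module pairs the two calls in its
 * init/exit hooks; "my_proto", "my_init" and "my_exit" are hypothetical:
 *
 *        static int __init my_init(void)
 *        {
 *                int rc = proto_register(&my_proto, 1);    1 => allocate a slab
 *
 *                if (rc)
 *                        return rc;
 *                ... register the address family / proto_ops ...
 *                return 0;
 *        }
 *
 *        static void __exit my_exit(void)
 *        {
 *                ... unregister the address family ...
 *                proto_unregister(&my_proto);
 *        }
 */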
3686
3687void proto_unregister(struct proto *prot)
3688{
3689        mutex_lock(&proto_list_mutex);
3690        release_proto_idx(prot);
3691        list_del(&prot->node);
3692        mutex_unlock(&proto_list_mutex);
3693
3694        kmem_cache_destroy(prot->slab);
3695        prot->slab = NULL;
3696
3697        req_prot_cleanup(prot->rsk_prot);
3698        tw_prot_cleanup(prot->twsk_prot);
3699}
3700EXPORT_SYMBOL(proto_unregister);
3701
3702int sock_load_diag_module(int family, int protocol)
3703{
3704        if (!protocol) {
3705                if (!sock_is_registered(family))
3706                        return -ENOENT;
3707
3708                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3709                                      NETLINK_SOCK_DIAG, family);
3710        }
3711
3712#ifdef CONFIG_INET
3713        if (family == AF_INET &&
3714            protocol != IPPROTO_RAW &&
3715            protocol < MAX_INET_PROTOS &&
3716            !rcu_access_pointer(inet_protos[protocol]))
3717                return -ENOENT;
3718#endif
3719
3720        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3721                              NETLINK_SOCK_DIAG, family, protocol);
3722}
3723EXPORT_SYMBOL(sock_load_diag_module);
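
/*
 * Illustrative sketch: a diag module makes itself reachable through the
 * request_module() strings above via a module alias.  With PF_NETLINK=16
 * and NETLINK_SOCK_DIAG=4, the TCP handler (AF_INET=2, IPPROTO_TCP=6)
 * would match:
 *
 *        MODULE_ALIAS("net-pf-16-proto-4-type-2-6");
 *
 * In-tree diag modules spell this with the MODULE_ALIAS_NET_PF_PROTO_TYPE()
 * helper macro.
 */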
3724
3725#ifdef CONFIG_PROC_FS
3726static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3727        __acquires(proto_list_mutex)
3728{
3729        mutex_lock(&proto_list_mutex);
3730        return seq_list_start_head(&proto_list, *pos);
3731}
3732
3733static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3734{
3735        return seq_list_next(v, &proto_list, pos);
3736}
3737
3738static void proto_seq_stop(struct seq_file *seq, void *v)
3739        __releases(proto_list_mutex)
3740{
3741        mutex_unlock(&proto_list_mutex);
3742}
3743
3744static char proto_method_implemented(const void *method)
3745{
3746        return method == NULL ? 'n' : 'y';
3747}
3748static long sock_prot_memory_allocated(struct proto *proto)
3749{
3750        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3751}
3752
3753static const char *sock_prot_memory_pressure(struct proto *proto)
3754{
3755        return proto->memory_pressure != NULL ?
3756               proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3757}
3758
3759static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3760{
3761
3762        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3763                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3764                   proto->name,
3765                   proto->obj_size,
3766                   sock_prot_inuse_get(seq_file_net(seq), proto),
3767                   sock_prot_memory_allocated(proto),
3768                   sock_prot_memory_pressure(proto),
3769                   proto->max_header,
3770                   proto->slab == NULL ? "no" : "yes",
3771                   module_name(proto->owner),
3772                   proto_method_implemented(proto->close),
3773                   proto_method_implemented(proto->connect),
3774                   proto_method_implemented(proto->disconnect),
3775                   proto_method_implemented(proto->accept),
3776                   proto_method_implemented(proto->ioctl),
3777                   proto_method_implemented(proto->init),
3778                   proto_method_implemented(proto->destroy),
3779                   proto_method_implemented(proto->shutdown),
3780                   proto_method_implemented(proto->setsockopt),
3781                   proto_method_implemented(proto->getsockopt),
3782                   proto_method_implemented(proto->sendmsg),
3783                   proto_method_implemented(proto->recvmsg),
3784                   proto_method_implemented(proto->sendpage),
3785                   proto_method_implemented(proto->bind),
3786                   proto_method_implemented(proto->backlog_rcv),
3787                   proto_method_implemented(proto->hash),
3788                   proto_method_implemented(proto->unhash),
3789                   proto_method_implemented(proto->get_port),
3790                   proto_method_implemented(proto->enter_memory_pressure));
3791}
3792
3793static int proto_seq_show(struct seq_file *seq, void *v)
3794{
3795        if (v == &proto_list)
3796                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3797                           "protocol",
3798                           "size",
3799                           "sockets",
3800                           "memory",
3801                           "press",
3802                           "maxhdr",
3803                           "slab",
3804                           "module",
3805                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3806        else
3807                proto_seq_printf(seq, list_entry(v, struct proto, node));
3808        return 0;
3809}
3810
3811static const struct seq_operations proto_seq_ops = {
3812        .start  = proto_seq_start,
3813        .next   = proto_seq_next,
3814        .stop   = proto_seq_stop,
3815        .show   = proto_seq_show,
3816};
3817
3818static __net_init int proto_init_net(struct net *net)
3819{
3820        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3821                        sizeof(struct seq_net_private)))
3822                return -ENOMEM;
3823
3824        return 0;
3825}
3826
3827static __net_exit void proto_exit_net(struct net *net)
3828{
3829        remove_proc_entry("protocols", net->proc_net);
3830}
3831
3832
3833static __net_initdata struct pernet_operations proto_net_ops = {
3834        .init = proto_init_net,
3835        .exit = proto_exit_net,
3836};
3837
3838static int __init proto_init(void)
3839{
3840        return register_pernet_subsys(&proto_net_ops);
3841}
3842
3843subsys_initcall(proto_init);
3844
3845#endif /* PROC_FS */
3846
3847#ifdef CONFIG_NET_RX_BUSY_POLL
3848bool sk_busy_loop_end(void *p, unsigned long start_time)
3849{
3850        struct sock *sk = p;
3851
3852        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3853               sk_busy_loop_timeout(sk, start_time);
3854}
3855EXPORT_SYMBOL(sk_busy_loop_end);
3856#endif /* CONFIG_NET_RX_BUSY_POLL */
3857
3858int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3859{
3860        if (!sk->sk_prot->bind_add)
3861                return -EOPNOTSUPP;
3862        return sk->sk_prot->bind_add(sk, addr, addr_len);
3863}
3864EXPORT_SYMBOL(sock_bind_add);
3865