linux/net/core/sock.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Generic socket support routines. Memory allocators, socket lock/release
   8 *              handler for protocols to use and generic option handler.
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116#include <linux/compat.h>
 117
 118#include <linux/uaccess.h>
 119
 120#include <linux/netdevice.h>
 121#include <net/protocol.h>
 122#include <linux/skbuff.h>
 123#include <net/net_namespace.h>
 124#include <net/request_sock.h>
 125#include <net/sock.h>
 126#include <linux/net_tstamp.h>
 127#include <net/xfrm.h>
 128#include <linux/ipsec.h>
 129#include <net/cls_cgroup.h>
 130#include <net/netprio_cgroup.h>
 131#include <linux/sock_diag.h>
 132
 133#include <linux/filter.h>
 134#include <net/sock_reuseport.h>
 135#include <net/bpf_sk_storage.h>
 136
 137#include <trace/events/sock.h>
 138
 139#include <net/tcp.h>
 140#include <net/busy_poll.h>
 141
 142static DEFINE_MUTEX(proto_list_mutex);
 143static LIST_HEAD(proto_list);
 144
 145static void sock_inuse_add(struct net *net, int val);
 146
 147/**
 148 * sk_ns_capable - General socket capability test
 149 * @sk: Socket to use a capability on or through
 150 * @user_ns: The user namespace of the capability to use
 151 * @cap: The capability to use
 152 *
  153 * Test to see if the opener of the socket had the capability @cap when
  154 * the socket was created and if the current process has it in the user
  155 * namespace @user_ns.
 156 */
 157bool sk_ns_capable(const struct sock *sk,
 158                   struct user_namespace *user_ns, int cap)
 159{
 160        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 161                ns_capable(user_ns, cap);
 162}
 163EXPORT_SYMBOL(sk_ns_capable);
 164
 165/**
 166 * sk_capable - Socket global capability test
 167 * @sk: Socket to use a capability on or through
 168 * @cap: The global capability to use
 169 *
  170 * Test to see if the opener of the socket had the capability @cap when
  171 * the socket was created and if the current process has it in all user
  172 * namespaces.
 173 */
 174bool sk_capable(const struct sock *sk, int cap)
 175{
 176        return sk_ns_capable(sk, &init_user_ns, cap);
 177}
 178EXPORT_SYMBOL(sk_capable);
 179
 180/**
 181 * sk_net_capable - Network namespace socket capability test
 182 * @sk: Socket to use a capability on or through
 183 * @cap: The capability to use
 184 *
  185 * Test to see if the opener of the socket had the capability @cap when the
  186 * socket was created and if the current process has it over the network
  187 * namespace the socket is a member of.
 188 */
 189bool sk_net_capable(const struct sock *sk, int cap)
 190{
 191        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 192}
 193EXPORT_SYMBOL(sk_net_capable);
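/*
 * Hedged illustration (not part of this file): a protocol handler could use
 * the helpers above to gate a privileged, per-netns option.  The function
 * name below is hypothetical.
 */
#if 0
static int example_set_privileged_mark(struct sock *sk, u32 mark)
{
	/* require CAP_NET_ADMIN over the socket's network namespace */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_mark = mark;
	sk_dst_reset(sk);
	return 0;
}
#endif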
 194
 195/*
 196 * Each address family might have different locking rules, so we have
 197 * one slock key per address family and separate keys for internal and
 198 * userspace sockets.
 199 */
 200static struct lock_class_key af_family_keys[AF_MAX];
 201static struct lock_class_key af_family_kern_keys[AF_MAX];
 202static struct lock_class_key af_family_slock_keys[AF_MAX];
 203static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 204
 205/*
  206 * Make lock validator output more readable. (We pre-construct these
  207 * strings at build time, so that runtime initialization of socket
  208 * locks is fast):
 209 */
 210
 211#define _sock_locks(x)                                            \
 212  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 213  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 214  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 215  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 216  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 217  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 218  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 219  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 220  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 221  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 222  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 223  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 224  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 225  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 226  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 227  x "AF_MAX"
 228
 229static const char *const af_family_key_strings[AF_MAX+1] = {
 230        _sock_locks("sk_lock-")
 231};
 232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233        _sock_locks("slock-")
 234};
 235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236        _sock_locks("clock-")
 237};
 238
 239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240        _sock_locks("k-sk_lock-")
 241};
 242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243        _sock_locks("k-slock-")
 244};
 245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246        _sock_locks("k-clock-")
 247};
 248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249        _sock_locks("rlock-")
 250};
 251static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 252        _sock_locks("wlock-")
 253};
 254static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 255        _sock_locks("elock-")
 256};
 257
 258/*
 259 * sk_callback_lock and sk queues locking rules are per-address-family,
 260 * so split the lock classes by using a per-AF key:
 261 */
 262static struct lock_class_key af_callback_keys[AF_MAX];
 263static struct lock_class_key af_rlock_keys[AF_MAX];
 264static struct lock_class_key af_wlock_keys[AF_MAX];
 265static struct lock_class_key af_elock_keys[AF_MAX];
 266static struct lock_class_key af_kern_callback_keys[AF_MAX];
 267
 268/* Run time adjustable parameters. */
 269__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 270EXPORT_SYMBOL(sysctl_wmem_max);
 271__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 272EXPORT_SYMBOL(sysctl_rmem_max);
 273__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 274__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 275
 276/* Maximal space eaten by iovec or ancillary data plus some space */
 277int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 278EXPORT_SYMBOL(sysctl_optmem_max);
 279
 280int sysctl_tstamp_allow_data __read_mostly = 1;
 281
 282DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 283EXPORT_SYMBOL_GPL(memalloc_socks_key);
 284
 285/**
 286 * sk_set_memalloc - sets %SOCK_MEMALLOC
 287 * @sk: socket to set it on
 288 *
 289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 290 * It's the responsibility of the admin to adjust min_free_kbytes
 291 * to meet the requirements
 292 */
 293void sk_set_memalloc(struct sock *sk)
 294{
 295        sock_set_flag(sk, SOCK_MEMALLOC);
 296        sk->sk_allocation |= __GFP_MEMALLOC;
 297        static_branch_inc(&memalloc_socks_key);
 298}
 299EXPORT_SYMBOL_GPL(sk_set_memalloc);
 300
 301void sk_clear_memalloc(struct sock *sk)
 302{
 303        sock_reset_flag(sk, SOCK_MEMALLOC);
 304        sk->sk_allocation &= ~__GFP_MEMALLOC;
 305        static_branch_dec(&memalloc_socks_key);
 306
 307        /*
 308         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 309         * progress of swapping. SOCK_MEMALLOC may be cleared while
 310         * it has rmem allocations due to the last swapfile being deactivated
 311         * but there is a risk that the socket is unusable due to exceeding
 312         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 313         */
 314        sk_mem_reclaim(sk);
 315}
 316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
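/*
 * Hedged sketch: a swap-over-network transport (an nbd- or NFS-swap-style
 * user) could mark its socket this way so that reclaim-critical traffic may
 * dip into the emergency reserves.  The function names are hypothetical.
 */
#if 0
static void example_attach_swap_transport(struct socket *sock)
{
	sk_set_memalloc(sock->sk);
}

static void example_detach_swap_transport(struct socket *sock)
{
	/* drops back to normal rmem accounting, see sk_clear_memalloc() */
	sk_clear_memalloc(sock->sk);
}
#endif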
 317
 318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 319{
 320        int ret;
 321        unsigned int noreclaim_flag;
 322
 323        /* these should have been dropped before queueing */
 324        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 325
 326        noreclaim_flag = memalloc_noreclaim_save();
 327        ret = sk->sk_backlog_rcv(sk, skb);
 328        memalloc_noreclaim_restore(noreclaim_flag);
 329
 330        return ret;
 331}
 332EXPORT_SYMBOL(__sk_backlog_rcv);
 333
 334static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 335{
 336        struct __kernel_sock_timeval tv;
 337
 338        if (timeo == MAX_SCHEDULE_TIMEOUT) {
 339                tv.tv_sec = 0;
 340                tv.tv_usec = 0;
 341        } else {
 342                tv.tv_sec = timeo / HZ;
 343                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 344        }
 345
 346        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 347                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 348                *(struct old_timeval32 *)optval = tv32;
 349                return sizeof(tv32);
 350        }
 351
 352        if (old_timeval) {
 353                struct __kernel_old_timeval old_tv;
 354                old_tv.tv_sec = tv.tv_sec;
 355                old_tv.tv_usec = tv.tv_usec;
 356                *(struct __kernel_old_timeval *)optval = old_tv;
 357                return sizeof(old_tv);
 358        }
 359
 360        *(struct __kernel_sock_timeval *)optval = tv;
 361        return sizeof(tv);
 362}
 363
 364static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 365                            bool old_timeval)
 366{
 367        struct __kernel_sock_timeval tv;
 368
 369        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 370                struct old_timeval32 tv32;
 371
 372                if (optlen < sizeof(tv32))
 373                        return -EINVAL;
 374
 375                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 376                        return -EFAULT;
 377                tv.tv_sec = tv32.tv_sec;
 378                tv.tv_usec = tv32.tv_usec;
 379        } else if (old_timeval) {
 380                struct __kernel_old_timeval old_tv;
 381
 382                if (optlen < sizeof(old_tv))
 383                        return -EINVAL;
 384                if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 385                        return -EFAULT;
 386                tv.tv_sec = old_tv.tv_sec;
 387                tv.tv_usec = old_tv.tv_usec;
 388        } else {
 389                if (optlen < sizeof(tv))
 390                        return -EINVAL;
 391                if (copy_from_sockptr(&tv, optval, sizeof(tv)))
 392                        return -EFAULT;
 393        }
 394        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395                return -EDOM;
 396
 397        if (tv.tv_sec < 0) {
 398                static int warned __read_mostly;
 399
 400                *timeo_p = 0;
 401                if (warned < 10 && net_ratelimit()) {
 402                        warned++;
 403                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404                                __func__, current->comm, task_pid_nr(current));
 405                }
 406                return 0;
 407        }
 408        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 409        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410                return 0;
 411        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 412                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 413        return 0;
 414}
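/*
 * Worked example (illustrative, assuming HZ == 1000): a timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } is converted above to
 *
 *	*timeo_p = 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *	         = 2000 + 500 = 2500 jiffies
 *
 * while { 0, 0 } is left as MAX_SCHEDULE_TIMEOUT, i.e. "wait forever".
 */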
 415
 416static void sock_warn_obsolete_bsdism(const char *name)
 417{
 418        static int warned;
 419        static char warncomm[TASK_COMM_LEN];
 420        if (strcmp(warncomm, current->comm) && warned < 5) {
 421                strcpy(warncomm,  current->comm);
 422                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 423                        warncomm, name);
 424                warned++;
 425        }
 426}
 427
 428static bool sock_needs_netstamp(const struct sock *sk)
 429{
 430        switch (sk->sk_family) {
 431        case AF_UNSPEC:
 432        case AF_UNIX:
 433                return false;
 434        default:
 435                return true;
 436        }
 437}
 438
 439static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 440{
 441        if (sk->sk_flags & flags) {
 442                sk->sk_flags &= ~flags;
 443                if (sock_needs_netstamp(sk) &&
 444                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 445                        net_disable_timestamp();
 446        }
 447}
 448
 449
 450int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 451{
 452        unsigned long flags;
 453        struct sk_buff_head *list = &sk->sk_receive_queue;
 454
 455        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 456                atomic_inc(&sk->sk_drops);
 457                trace_sock_rcvqueue_full(sk, skb);
 458                return -ENOMEM;
 459        }
 460
 461        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 462                atomic_inc(&sk->sk_drops);
 463                return -ENOBUFS;
 464        }
 465
 466        skb->dev = NULL;
 467        skb_set_owner_r(skb, sk);
 468
  469        /* we escape from the RCU protected region, make sure we don't
  470         * leak a non-refcounted dst
  471         */
 472        skb_dst_force(skb);
 473
 474        spin_lock_irqsave(&list->lock, flags);
 475        sock_skb_set_dropcount(sk, skb);
 476        __skb_queue_tail(list, skb);
 477        spin_unlock_irqrestore(&list->lock, flags);
 478
 479        if (!sock_flag(sk, SOCK_DEAD))
 480                sk->sk_data_ready(sk);
 481        return 0;
 482}
 483EXPORT_SYMBOL(__sock_queue_rcv_skb);
 484
 485int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 486{
 487        int err;
 488
 489        err = sk_filter(sk, skb);
 490        if (err)
 491                return err;
 492
 493        return __sock_queue_rcv_skb(sk, skb);
 494}
 495EXPORT_SYMBOL(sock_queue_rcv_skb);
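/*
 * Hedged sketch: a simple datagram protocol's delivery path would typically
 * hand a looked-up skb to its owning socket like this (the handler name is
 * hypothetical).
 */
#if 0
static int example_proto_deliver(struct sock *sk, struct sk_buff *skb)
{
	if (sock_queue_rcv_skb(sk, skb) < 0) {
		/* socket filter rejected it or the receive queue is full */
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}
#endif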
 496
 497int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 498                     const int nested, unsigned int trim_cap, bool refcounted)
 499{
 500        int rc = NET_RX_SUCCESS;
 501
 502        if (sk_filter_trim_cap(sk, skb, trim_cap))
 503                goto discard_and_relse;
 504
 505        skb->dev = NULL;
 506
 507        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 508                atomic_inc(&sk->sk_drops);
 509                goto discard_and_relse;
 510        }
 511        if (nested)
 512                bh_lock_sock_nested(sk);
 513        else
 514                bh_lock_sock(sk);
 515        if (!sock_owned_by_user(sk)) {
 516                /*
 517                 * trylock + unlock semantics:
 518                 */
 519                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 520
 521                rc = sk_backlog_rcv(sk, skb);
 522
 523                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 524        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 525                bh_unlock_sock(sk);
 526                atomic_inc(&sk->sk_drops);
 527                goto discard_and_relse;
 528        }
 529
 530        bh_unlock_sock(sk);
 531out:
 532        if (refcounted)
 533                sock_put(sk);
 534        return rc;
 535discard_and_relse:
 536        kfree_skb(skb);
 537        goto out;
 538}
 539EXPORT_SYMBOL(__sk_receive_skb);
 540
 541struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 542{
 543        struct dst_entry *dst = __sk_dst_get(sk);
 544
 545        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 546                sk_tx_queue_clear(sk);
 547                sk->sk_dst_pending_confirm = 0;
 548                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 549                dst_release(dst);
 550                return NULL;
 551        }
 552
 553        return dst;
 554}
 555EXPORT_SYMBOL(__sk_dst_check);
 556
 557struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 558{
 559        struct dst_entry *dst = sk_dst_get(sk);
 560
 561        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 562                sk_dst_reset(sk);
 563                dst_release(dst);
 564                return NULL;
 565        }
 566
 567        return dst;
 568}
 569EXPORT_SYMBOL(sk_dst_check);
 570
 571static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 572{
 573        int ret = -ENOPROTOOPT;
 574#ifdef CONFIG_NETDEVICES
 575        struct net *net = sock_net(sk);
 576
 577        /* Sorry... */
 578        ret = -EPERM;
 579        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 580                goto out;
 581
 582        ret = -EINVAL;
 583        if (ifindex < 0)
 584                goto out;
 585
 586        sk->sk_bound_dev_if = ifindex;
 587        if (sk->sk_prot->rehash)
 588                sk->sk_prot->rehash(sk);
 589        sk_dst_reset(sk);
 590
 591        ret = 0;
 592
 593out:
 594#endif
 595
 596        return ret;
 597}
 598
 599int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 600{
 601        int ret;
 602
 603        if (lock_sk)
 604                lock_sock(sk);
 605        ret = sock_bindtoindex_locked(sk, ifindex);
 606        if (lock_sk)
 607                release_sock(sk);
 608
 609        return ret;
 610}
 611EXPORT_SYMBOL(sock_bindtoindex);
 612
 613static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 614{
 615        int ret = -ENOPROTOOPT;
 616#ifdef CONFIG_NETDEVICES
 617        struct net *net = sock_net(sk);
 618        char devname[IFNAMSIZ];
 619        int index;
 620
 621        ret = -EINVAL;
 622        if (optlen < 0)
 623                goto out;
 624
 625        /* Bind this socket to a particular device like "eth0",
 626         * as specified in the passed interface name. If the
 627         * name is "" or the option length is zero the socket
 628         * is not bound.
 629         */
 630        if (optlen > IFNAMSIZ - 1)
 631                optlen = IFNAMSIZ - 1;
 632        memset(devname, 0, sizeof(devname));
 633
 634        ret = -EFAULT;
 635        if (copy_from_sockptr(devname, optval, optlen))
 636                goto out;
 637
 638        index = 0;
 639        if (devname[0] != '\0') {
 640                struct net_device *dev;
 641
 642                rcu_read_lock();
 643                dev = dev_get_by_name_rcu(net, devname);
 644                if (dev)
 645                        index = dev->ifindex;
 646                rcu_read_unlock();
 647                ret = -ENODEV;
 648                if (!dev)
 649                        goto out;
 650        }
 651
 652        return sock_bindtoindex(sk, index, true);
 653out:
 654#endif
 655
 656        return ret;
 657}
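/*
 * Hedged userspace illustration of the semantics handled above (the device
 * name is an example only):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * Passing an empty name (or a zero option length) unbinds the socket again:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */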
 658
 659static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 660                                int __user *optlen, int len)
 661{
 662        int ret = -ENOPROTOOPT;
 663#ifdef CONFIG_NETDEVICES
 664        struct net *net = sock_net(sk);
 665        char devname[IFNAMSIZ];
 666
 667        if (sk->sk_bound_dev_if == 0) {
 668                len = 0;
 669                goto zero;
 670        }
 671
 672        ret = -EINVAL;
 673        if (len < IFNAMSIZ)
 674                goto out;
 675
 676        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 677        if (ret)
 678                goto out;
 679
 680        len = strlen(devname) + 1;
 681
 682        ret = -EFAULT;
 683        if (copy_to_user(optval, devname, len))
 684                goto out;
 685
 686zero:
 687        ret = -EFAULT;
 688        if (put_user(len, optlen))
 689                goto out;
 690
 691        ret = 0;
 692
 693out:
 694#endif
 695
 696        return ret;
 697}
 698
 699bool sk_mc_loop(struct sock *sk)
 700{
 701        if (dev_recursion_level())
 702                return false;
 703        if (!sk)
 704                return true;
 705        switch (sk->sk_family) {
 706        case AF_INET:
 707                return inet_sk(sk)->mc_loop;
 708#if IS_ENABLED(CONFIG_IPV6)
 709        case AF_INET6:
 710                return inet6_sk(sk)->mc_loop;
 711#endif
 712        }
 713        WARN_ON_ONCE(1);
 714        return true;
 715}
 716EXPORT_SYMBOL(sk_mc_loop);
 717
 718void sock_set_reuseaddr(struct sock *sk)
 719{
 720        lock_sock(sk);
 721        sk->sk_reuse = SK_CAN_REUSE;
 722        release_sock(sk);
 723}
 724EXPORT_SYMBOL(sock_set_reuseaddr);
 725
 726void sock_set_reuseport(struct sock *sk)
 727{
 728        lock_sock(sk);
 729        sk->sk_reuseport = true;
 730        release_sock(sk);
 731}
 732EXPORT_SYMBOL(sock_set_reuseport);
 733
 734void sock_no_linger(struct sock *sk)
 735{
 736        lock_sock(sk);
 737        sk->sk_lingertime = 0;
 738        sock_set_flag(sk, SOCK_LINGER);
 739        release_sock(sk);
 740}
 741EXPORT_SYMBOL(sock_no_linger);
 742
 743void sock_set_priority(struct sock *sk, u32 priority)
 744{
 745        lock_sock(sk);
 746        sk->sk_priority = priority;
 747        release_sock(sk);
 748}
 749EXPORT_SYMBOL(sock_set_priority);
 750
 751void sock_set_sndtimeo(struct sock *sk, s64 secs)
 752{
 753        lock_sock(sk);
 754        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 755                sk->sk_sndtimeo = secs * HZ;
 756        else
 757                sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 758        release_sock(sk);
 759}
 760EXPORT_SYMBOL(sock_set_sndtimeo);
 761
 762static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 763{
 764        if (val)  {
 765                sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 766                sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 767                sock_set_flag(sk, SOCK_RCVTSTAMP);
 768                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 769        } else {
 770                sock_reset_flag(sk, SOCK_RCVTSTAMP);
 771                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 772                sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 773        }
 774}
 775
 776void sock_enable_timestamps(struct sock *sk)
 777{
 778        lock_sock(sk);
 779        __sock_set_timestamps(sk, true, false, true);
 780        release_sock(sk);
 781}
 782EXPORT_SYMBOL(sock_enable_timestamps);
 783
 784void sock_set_keepalive(struct sock *sk)
 785{
 786        lock_sock(sk);
 787        if (sk->sk_prot->keepalive)
 788                sk->sk_prot->keepalive(sk, true);
 789        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 790        release_sock(sk);
 791}
 792EXPORT_SYMBOL(sock_set_keepalive);
 793
 794static void __sock_set_rcvbuf(struct sock *sk, int val)
 795{
 796        /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 797         * as a negative value.
 798         */
 799        val = min_t(int, val, INT_MAX / 2);
 800        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 801
 802        /* We double it on the way in to account for "struct sk_buff" etc.
 803         * overhead.   Applications assume that the SO_RCVBUF setting they make
 804         * will allow that much actual data to be received on that socket.
 805         *
 806         * Applications are unaware that "struct sk_buff" and other overheads
 807         * allocate from the receive buffer during socket buffer allocation.
 808         *
 809         * And after considering the possible alternatives, returning the value
 810         * we actually used in getsockopt is the most desirable behavior.
 811         */
 812        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 813}
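/*
 * Worked example of the doubling above (illustrative): a userspace
 * setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){ 65536 }, sizeof(int))
 * ends up storing sk->sk_rcvbuf = 131072, and a later SO_RCVBUF
 * getsockopt reports 131072, i.e. the value the kernel actually uses.
 */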
 814
 815void sock_set_rcvbuf(struct sock *sk, int val)
 816{
 817        lock_sock(sk);
 818        __sock_set_rcvbuf(sk, val);
 819        release_sock(sk);
 820}
 821EXPORT_SYMBOL(sock_set_rcvbuf);
 822
 823void sock_set_mark(struct sock *sk, u32 val)
 824{
 825        lock_sock(sk);
 826        sk->sk_mark = val;
 827        release_sock(sk);
 828}
 829EXPORT_SYMBOL(sock_set_mark);
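/*
 * Hedged sketch: the sock_set_*() helpers above exist so in-kernel socket
 * users can tune a socket without going through sock_setsockopt().  The
 * function name and the particular tuning choices are illustrative only.
 */
#if 0
static int example_open_tuned_socket(struct net *net, struct socket **res)
{
	int err;

	err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, res);
	if (err)
		return err;

	sock_set_reuseaddr((*res)->sk);
	sock_no_linger((*res)->sk);
	sock_set_rcvbuf((*res)->sk, 1 << 20);	/* stored internally as 2 MiB */
	return 0;
}
#endif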
 830
 831/*
 832 *      This is meant for all protocols to use and covers goings on
 833 *      at the socket level. Everything here is generic.
 834 */
 835
 836int sock_setsockopt(struct socket *sock, int level, int optname,
 837                    sockptr_t optval, unsigned int optlen)
 838{
 839        struct sock_txtime sk_txtime;
 840        struct sock *sk = sock->sk;
 841        int val;
 842        int valbool;
 843        struct linger ling;
 844        int ret = 0;
 845
 846        /*
 847         *      Options without arguments
 848         */
 849
 850        if (optname == SO_BINDTODEVICE)
 851                return sock_setbindtodevice(sk, optval, optlen);
 852
 853        if (optlen < sizeof(int))
 854                return -EINVAL;
 855
 856        if (copy_from_sockptr(&val, optval, sizeof(val)))
 857                return -EFAULT;
 858
 859        valbool = val ? 1 : 0;
 860
 861        lock_sock(sk);
 862
 863        switch (optname) {
 864        case SO_DEBUG:
 865                if (val && !capable(CAP_NET_ADMIN))
 866                        ret = -EACCES;
 867                else
 868                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 869                break;
 870        case SO_REUSEADDR:
 871                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 872                break;
 873        case SO_REUSEPORT:
 874                sk->sk_reuseport = valbool;
 875                break;
 876        case SO_TYPE:
 877        case SO_PROTOCOL:
 878        case SO_DOMAIN:
 879        case SO_ERROR:
 880                ret = -ENOPROTOOPT;
 881                break;
 882        case SO_DONTROUTE:
 883                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 884                sk_dst_reset(sk);
 885                break;
 886        case SO_BROADCAST:
 887                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 888                break;
 889        case SO_SNDBUF:
  890                /* Don't error on this; BSD doesn't, and if you think
  891                 * about it this is right. Otherwise apps have to
  892                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  893                 * are treated in BSD as hints.
  894                 */
 895                val = min_t(u32, val, sysctl_wmem_max);
 896set_sndbuf:
 897                /* Ensure val * 2 fits into an int, to prevent max_t()
 898                 * from treating it as a negative value.
 899                 */
 900                val = min_t(int, val, INT_MAX / 2);
 901                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 902                WRITE_ONCE(sk->sk_sndbuf,
 903                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
 904                /* Wake up sending tasks if we upped the value. */
 905                sk->sk_write_space(sk);
 906                break;
 907
 908        case SO_SNDBUFFORCE:
 909                if (!capable(CAP_NET_ADMIN)) {
 910                        ret = -EPERM;
 911                        break;
 912                }
 913
 914                /* No negative values (to prevent underflow, as val will be
 915                 * multiplied by 2).
 916                 */
 917                if (val < 0)
 918                        val = 0;
 919                goto set_sndbuf;
 920
 921        case SO_RCVBUF:
  922                /* Don't error on this; BSD doesn't, and if you think
  923                 * about it this is right. Otherwise apps have to
  924                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  925                 * are treated in BSD as hints.
  926                 */
 927                __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
 928                break;
 929
 930        case SO_RCVBUFFORCE:
 931                if (!capable(CAP_NET_ADMIN)) {
 932                        ret = -EPERM;
 933                        break;
 934                }
 935
 936                /* No negative values (to prevent underflow, as val will be
 937                 * multiplied by 2).
 938                 */
 939                __sock_set_rcvbuf(sk, max(val, 0));
 940                break;
 941
 942        case SO_KEEPALIVE:
 943                if (sk->sk_prot->keepalive)
 944                        sk->sk_prot->keepalive(sk, valbool);
 945                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 946                break;
 947
 948        case SO_OOBINLINE:
 949                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 950                break;
 951
 952        case SO_NO_CHECK:
 953                sk->sk_no_check_tx = valbool;
 954                break;
 955
 956        case SO_PRIORITY:
 957                if ((val >= 0 && val <= 6) ||
 958                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 959                        sk->sk_priority = val;
 960                else
 961                        ret = -EPERM;
 962                break;
 963
 964        case SO_LINGER:
 965                if (optlen < sizeof(ling)) {
 966                        ret = -EINVAL;  /* 1003.1g */
 967                        break;
 968                }
 969                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
 970                        ret = -EFAULT;
 971                        break;
 972                }
 973                if (!ling.l_onoff)
 974                        sock_reset_flag(sk, SOCK_LINGER);
 975                else {
 976#if (BITS_PER_LONG == 32)
 977                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 978                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 979                        else
 980#endif
 981                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 982                        sock_set_flag(sk, SOCK_LINGER);
 983                }
 984                break;
 985
 986        case SO_BSDCOMPAT:
 987                sock_warn_obsolete_bsdism("setsockopt");
 988                break;
 989
 990        case SO_PASSCRED:
 991                if (valbool)
 992                        set_bit(SOCK_PASSCRED, &sock->flags);
 993                else
 994                        clear_bit(SOCK_PASSCRED, &sock->flags);
 995                break;
 996
 997        case SO_TIMESTAMP_OLD:
 998                __sock_set_timestamps(sk, valbool, false, false);
 999                break;
1000        case SO_TIMESTAMP_NEW:
1001                __sock_set_timestamps(sk, valbool, true, false);
1002                break;
1003        case SO_TIMESTAMPNS_OLD:
1004                __sock_set_timestamps(sk, valbool, false, true);
1005                break;
1006        case SO_TIMESTAMPNS_NEW:
1007                __sock_set_timestamps(sk, valbool, true, true);
1008                break;
1009        case SO_TIMESTAMPING_NEW:
1010                sock_set_flag(sk, SOCK_TSTAMP_NEW);
1011                fallthrough;
1012        case SO_TIMESTAMPING_OLD:
1013                if (val & ~SOF_TIMESTAMPING_MASK) {
1014                        ret = -EINVAL;
1015                        break;
1016                }
1017
1018                if (val & SOF_TIMESTAMPING_OPT_ID &&
1019                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1020                        if (sk->sk_protocol == IPPROTO_TCP &&
1021                            sk->sk_type == SOCK_STREAM) {
1022                                if ((1 << sk->sk_state) &
1023                                    (TCPF_CLOSE | TCPF_LISTEN)) {
1024                                        ret = -EINVAL;
1025                                        break;
1026                                }
1027                                sk->sk_tskey = tcp_sk(sk)->snd_una;
1028                        } else {
1029                                sk->sk_tskey = 0;
1030                        }
1031                }
1032
1033                if (val & SOF_TIMESTAMPING_OPT_STATS &&
1034                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1035                        ret = -EINVAL;
1036                        break;
1037                }
1038
1039                sk->sk_tsflags = val;
1040                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1041                        sock_enable_timestamp(sk,
1042                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
1043                else {
1044                        if (optname == SO_TIMESTAMPING_NEW)
1045                                sock_reset_flag(sk, SOCK_TSTAMP_NEW);
1046
1047                        sock_disable_timestamp(sk,
1048                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1049                }
1050                break;
1051
1052        case SO_RCVLOWAT:
1053                if (val < 0)
1054                        val = INT_MAX;
1055                if (sock->ops->set_rcvlowat)
1056                        ret = sock->ops->set_rcvlowat(sk, val);
1057                else
1058                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1059                break;
1060
1061        case SO_RCVTIMEO_OLD:
1062        case SO_RCVTIMEO_NEW:
1063                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1064                                       optlen, optname == SO_RCVTIMEO_OLD);
1065                break;
1066
1067        case SO_SNDTIMEO_OLD:
1068        case SO_SNDTIMEO_NEW:
1069                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1070                                       optlen, optname == SO_SNDTIMEO_OLD);
1071                break;
1072
1073        case SO_ATTACH_FILTER: {
1074                struct sock_fprog fprog;
1075
1076                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1077                if (!ret)
1078                        ret = sk_attach_filter(&fprog, sk);
1079                break;
1080        }
1081        case SO_ATTACH_BPF:
1082                ret = -EINVAL;
1083                if (optlen == sizeof(u32)) {
1084                        u32 ufd;
1085
1086                        ret = -EFAULT;
1087                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1088                                break;
1089
1090                        ret = sk_attach_bpf(ufd, sk);
1091                }
1092                break;
1093
1094        case SO_ATTACH_REUSEPORT_CBPF: {
1095                struct sock_fprog fprog;
1096
1097                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1098                if (!ret)
1099                        ret = sk_reuseport_attach_filter(&fprog, sk);
1100                break;
1101        }
1102        case SO_ATTACH_REUSEPORT_EBPF:
1103                ret = -EINVAL;
1104                if (optlen == sizeof(u32)) {
1105                        u32 ufd;
1106
1107                        ret = -EFAULT;
1108                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1109                                break;
1110
1111                        ret = sk_reuseport_attach_bpf(ufd, sk);
1112                }
1113                break;
1114
1115        case SO_DETACH_REUSEPORT_BPF:
1116                ret = reuseport_detach_prog(sk);
1117                break;
1118
1119        case SO_DETACH_FILTER:
1120                ret = sk_detach_filter(sk);
1121                break;
1122
1123        case SO_LOCK_FILTER:
1124                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1125                        ret = -EPERM;
1126                else
1127                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1128                break;
1129
1130        case SO_PASSSEC:
1131                if (valbool)
1132                        set_bit(SOCK_PASSSEC, &sock->flags);
1133                else
1134                        clear_bit(SOCK_PASSSEC, &sock->flags);
1135                break;
1136        case SO_MARK:
1137                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1138                        ret = -EPERM;
1139                } else if (val != sk->sk_mark) {
1140                        sk->sk_mark = val;
1141                        sk_dst_reset(sk);
1142                }
1143                break;
1144
1145        case SO_RXQ_OVFL:
1146                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1147                break;
1148
1149        case SO_WIFI_STATUS:
1150                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1151                break;
1152
1153        case SO_PEEK_OFF:
1154                if (sock->ops->set_peek_off)
1155                        ret = sock->ops->set_peek_off(sk, val);
1156                else
1157                        ret = -EOPNOTSUPP;
1158                break;
1159
1160        case SO_NOFCS:
1161                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1162                break;
1163
1164        case SO_SELECT_ERR_QUEUE:
1165                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1166                break;
1167
1168#ifdef CONFIG_NET_RX_BUSY_POLL
1169        case SO_BUSY_POLL:
1170                /* allow unprivileged users to decrease the value */
1171                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1172                        ret = -EPERM;
1173                else {
1174                        if (val < 0)
1175                                ret = -EINVAL;
1176                        else
1177                                sk->sk_ll_usec = val;
1178                }
1179                break;
1180#endif
1181
1182        case SO_MAX_PACING_RATE:
1183                {
1184                unsigned long ulval = (val == ~0U) ? ~0UL : val;
1185
1186                if (sizeof(ulval) != sizeof(val) &&
1187                    optlen >= sizeof(ulval) &&
1188                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1189                        ret = -EFAULT;
1190                        break;
1191                }
1192                if (ulval != ~0UL)
1193                        cmpxchg(&sk->sk_pacing_status,
1194                                SK_PACING_NONE,
1195                                SK_PACING_NEEDED);
1196                sk->sk_max_pacing_rate = ulval;
1197                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1198                break;
1199                }
1200        case SO_INCOMING_CPU:
1201                WRITE_ONCE(sk->sk_incoming_cpu, val);
1202                break;
1203
1204        case SO_CNX_ADVICE:
1205                if (val == 1)
1206                        dst_negative_advice(sk);
1207                break;
1208
1209        case SO_ZEROCOPY:
1210                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1211                        if (!((sk->sk_type == SOCK_STREAM &&
1212                               sk->sk_protocol == IPPROTO_TCP) ||
1213                              (sk->sk_type == SOCK_DGRAM &&
1214                               sk->sk_protocol == IPPROTO_UDP)))
1215                                ret = -ENOTSUPP;
1216                } else if (sk->sk_family != PF_RDS) {
1217                        ret = -ENOTSUPP;
1218                }
1219                if (!ret) {
1220                        if (val < 0 || val > 1)
1221                                ret = -EINVAL;
1222                        else
1223                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1224                }
1225                break;
1226
1227        case SO_TXTIME:
1228                if (optlen != sizeof(struct sock_txtime)) {
1229                        ret = -EINVAL;
1230                        break;
1231                } else if (copy_from_sockptr(&sk_txtime, optval,
1232                           sizeof(struct sock_txtime))) {
1233                        ret = -EFAULT;
1234                        break;
1235                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1236                        ret = -EINVAL;
1237                        break;
1238                }
1239                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
 1240                 * scheduler has enough safeguards.
1241                 */
1242                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1243                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1244                        ret = -EPERM;
1245                        break;
1246                }
1247                sock_valbool_flag(sk, SOCK_TXTIME, true);
1248                sk->sk_clockid = sk_txtime.clockid;
1249                sk->sk_txtime_deadline_mode =
1250                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1251                sk->sk_txtime_report_errors =
1252                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1253                break;
1254
1255        case SO_BINDTOIFINDEX:
1256                ret = sock_bindtoindex_locked(sk, val);
1257                break;
1258
1259        default:
1260                ret = -ENOPROTOOPT;
1261                break;
1262        }
1263        release_sock(sk);
1264        return ret;
1265}
1266EXPORT_SYMBOL(sock_setsockopt);
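/*
 * Hedged userspace illustration of reaching the handler above: SO_MARK is
 * refused with -EPERM unless the caller has CAP_NET_ADMIN in the socket's
 * network namespace, as checked in the SO_MARK case.
 *
 *	unsigned int mark = 42;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
 *		perror("setsockopt(SO_MARK)");
 */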
1267
1268
1269static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1270                          struct ucred *ucred)
1271{
1272        ucred->pid = pid_vnr(pid);
1273        ucred->uid = ucred->gid = -1;
1274        if (cred) {
1275                struct user_namespace *current_ns = current_user_ns();
1276
1277                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1278                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1279        }
1280}
1281
1282static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1283{
1284        struct user_namespace *user_ns = current_user_ns();
1285        int i;
1286
1287        for (i = 0; i < src->ngroups; i++)
1288                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1289                        return -EFAULT;
1290
1291        return 0;
1292}
1293
1294int sock_getsockopt(struct socket *sock, int level, int optname,
1295                    char __user *optval, int __user *optlen)
1296{
1297        struct sock *sk = sock->sk;
1298
1299        union {
1300                int val;
1301                u64 val64;
1302                unsigned long ulval;
1303                struct linger ling;
1304                struct old_timeval32 tm32;
1305                struct __kernel_old_timeval tm;
1306                struct  __kernel_sock_timeval stm;
1307                struct sock_txtime txtime;
1308        } v;
1309
1310        int lv = sizeof(int);
1311        int len;
1312
1313        if (get_user(len, optlen))
1314                return -EFAULT;
1315        if (len < 0)
1316                return -EINVAL;
1317
1318        memset(&v, 0, sizeof(v));
1319
1320        switch (optname) {
1321        case SO_DEBUG:
1322                v.val = sock_flag(sk, SOCK_DBG);
1323                break;
1324
1325        case SO_DONTROUTE:
1326                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1327                break;
1328
1329        case SO_BROADCAST:
1330                v.val = sock_flag(sk, SOCK_BROADCAST);
1331                break;
1332
1333        case SO_SNDBUF:
1334                v.val = sk->sk_sndbuf;
1335                break;
1336
1337        case SO_RCVBUF:
1338                v.val = sk->sk_rcvbuf;
1339                break;
1340
1341        case SO_REUSEADDR:
1342                v.val = sk->sk_reuse;
1343                break;
1344
1345        case SO_REUSEPORT:
1346                v.val = sk->sk_reuseport;
1347                break;
1348
1349        case SO_KEEPALIVE:
1350                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1351                break;
1352
1353        case SO_TYPE:
1354                v.val = sk->sk_type;
1355                break;
1356
1357        case SO_PROTOCOL:
1358                v.val = sk->sk_protocol;
1359                break;
1360
1361        case SO_DOMAIN:
1362                v.val = sk->sk_family;
1363                break;
1364
1365        case SO_ERROR:
1366                v.val = -sock_error(sk);
1367                if (v.val == 0)
1368                        v.val = xchg(&sk->sk_err_soft, 0);
1369                break;
1370
1371        case SO_OOBINLINE:
1372                v.val = sock_flag(sk, SOCK_URGINLINE);
1373                break;
1374
1375        case SO_NO_CHECK:
1376                v.val = sk->sk_no_check_tx;
1377                break;
1378
1379        case SO_PRIORITY:
1380                v.val = sk->sk_priority;
1381                break;
1382
1383        case SO_LINGER:
1384                lv              = sizeof(v.ling);
1385                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1386                v.ling.l_linger = sk->sk_lingertime / HZ;
1387                break;
1388
1389        case SO_BSDCOMPAT:
1390                sock_warn_obsolete_bsdism("getsockopt");
1391                break;
1392
1393        case SO_TIMESTAMP_OLD:
1394                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1395                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1396                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1397                break;
1398
1399        case SO_TIMESTAMPNS_OLD:
1400                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1401                break;
1402
1403        case SO_TIMESTAMP_NEW:
1404                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1405                break;
1406
1407        case SO_TIMESTAMPNS_NEW:
1408                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1409                break;
1410
1411        case SO_TIMESTAMPING_OLD:
1412                v.val = sk->sk_tsflags;
1413                break;
1414
1415        case SO_RCVTIMEO_OLD:
1416        case SO_RCVTIMEO_NEW:
1417                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1418                break;
1419
1420        case SO_SNDTIMEO_OLD:
1421        case SO_SNDTIMEO_NEW:
1422                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1423                break;
1424
1425        case SO_RCVLOWAT:
1426                v.val = sk->sk_rcvlowat;
1427                break;
1428
1429        case SO_SNDLOWAT:
1430                v.val = 1;
1431                break;
1432
1433        case SO_PASSCRED:
1434                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1435                break;
1436
1437        case SO_PEERCRED:
1438        {
1439                struct ucred peercred;
1440                if (len > sizeof(peercred))
1441                        len = sizeof(peercred);
1442                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1443                if (copy_to_user(optval, &peercred, len))
1444                        return -EFAULT;
1445                goto lenout;
1446        }
1447
1448        case SO_PEERGROUPS:
1449        {
1450                int ret, n;
1451
1452                if (!sk->sk_peer_cred)
1453                        return -ENODATA;
1454
1455                n = sk->sk_peer_cred->group_info->ngroups;
1456                if (len < n * sizeof(gid_t)) {
1457                        len = n * sizeof(gid_t);
1458                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1459                }
1460                len = n * sizeof(gid_t);
1461
1462                ret = groups_to_user((gid_t __user *)optval,
1463                                     sk->sk_peer_cred->group_info);
1464                if (ret)
1465                        return ret;
1466                goto lenout;
1467        }
1468
1469        case SO_PEERNAME:
1470        {
1471                char address[128];
1472
1473                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1474                if (lv < 0)
1475                        return -ENOTCONN;
1476                if (lv < len)
1477                        return -EINVAL;
1478                if (copy_to_user(optval, address, len))
1479                        return -EFAULT;
1480                goto lenout;
1481        }
1482
1483        /* Dubious BSD thing... Probably nobody even uses it, but
1484         * the UNIX standard wants it for whatever reason... -DaveM
1485         */
1486        case SO_ACCEPTCONN:
1487                v.val = sk->sk_state == TCP_LISTEN;
1488                break;
1489
1490        case SO_PASSSEC:
1491                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1492                break;
1493
1494        case SO_PEERSEC:
1495                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1496
1497        case SO_MARK:
1498                v.val = sk->sk_mark;
1499                break;
1500
1501        case SO_RXQ_OVFL:
1502                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1503                break;
1504
1505        case SO_WIFI_STATUS:
1506                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1507                break;
1508
1509        case SO_PEEK_OFF:
1510                if (!sock->ops->set_peek_off)
1511                        return -EOPNOTSUPP;
1512
1513                v.val = sk->sk_peek_off;
1514                break;
1515        case SO_NOFCS:
1516                v.val = sock_flag(sk, SOCK_NOFCS);
1517                break;
1518
1519        case SO_BINDTODEVICE:
1520                return sock_getbindtodevice(sk, optval, optlen, len);
1521
1522        case SO_GET_FILTER:
1523                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1524                if (len < 0)
1525                        return len;
1526
1527                goto lenout;
1528
1529        case SO_LOCK_FILTER:
1530                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1531                break;
1532
1533        case SO_BPF_EXTENSIONS:
1534                v.val = bpf_tell_extensions();
1535                break;
1536
1537        case SO_SELECT_ERR_QUEUE:
1538                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1539                break;
1540
1541#ifdef CONFIG_NET_RX_BUSY_POLL
1542        case SO_BUSY_POLL:
1543                v.val = sk->sk_ll_usec;
1544                break;
1545#endif
1546
1547        case SO_MAX_PACING_RATE:
1548                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1549                        lv = sizeof(v.ulval);
1550                        v.ulval = sk->sk_max_pacing_rate;
1551                } else {
1552                        /* 32bit version */
1553                        v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1554                }
1555                break;
1556
1557        case SO_INCOMING_CPU:
1558                v.val = READ_ONCE(sk->sk_incoming_cpu);
1559                break;
1560
1561        case SO_MEMINFO:
1562        {
1563                u32 meminfo[SK_MEMINFO_VARS];
1564
1565                sk_get_meminfo(sk, meminfo);
1566
1567                len = min_t(unsigned int, len, sizeof(meminfo));
1568                if (copy_to_user(optval, &meminfo, len))
1569                        return -EFAULT;
1570
1571                goto lenout;
1572        }
1573
1574#ifdef CONFIG_NET_RX_BUSY_POLL
1575        case SO_INCOMING_NAPI_ID:
1576                v.val = READ_ONCE(sk->sk_napi_id);
1577
1578                /* aggregate non-NAPI IDs down to 0 */
1579                if (v.val < MIN_NAPI_ID)
1580                        v.val = 0;
1581
1582                break;
1583#endif
1584
1585        case SO_COOKIE:
1586                lv = sizeof(u64);
1587                if (len < lv)
1588                        return -EINVAL;
1589                v.val64 = sock_gen_cookie(sk);
1590                break;
1591
1592        case SO_ZEROCOPY:
1593                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1594                break;
1595
1596        case SO_TXTIME:
1597                lv = sizeof(v.txtime);
1598                v.txtime.clockid = sk->sk_clockid;
1599                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1600                                  SOF_TXTIME_DEADLINE_MODE : 0;
1601                v.txtime.flags |= sk->sk_txtime_report_errors ?
1602                                  SOF_TXTIME_REPORT_ERRORS : 0;
1603                break;
1604
1605        case SO_BINDTOIFINDEX:
1606                v.val = sk->sk_bound_dev_if;
1607                break;
1608
1609        default:
1610                /* We implement SO_SNDLOWAT etc. to not be settable
1611                 * (1003.1g 7).
1612                 */
1613                return -ENOPROTOOPT;
1614        }
1615
1616        if (len > lv)
1617                len = lv;
1618        if (copy_to_user(optval, &v, len))
1619                return -EFAULT;
1620lenout:
1621        if (put_user(len, optlen))
1622                return -EFAULT;
1623        return 0;
1624}
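
/* Editorial example, not part of the kernel source: reading SO_PEERCRED
 * from user space on a connected AF_UNIX socket. As implemented above, the
 * kernel clamps the copy to min(optlen, sizeof(struct ucred)) and reports
 * the copied length back through optlen:
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len) == 0)
 *		printf("pid=%d uid=%d gid=%d\n", cr.pid, cr.uid, cr.gid);
 */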
1625
1626/*
1627 * Initialize an sk_lock.
1628 *
1629 * (We also register the sk_lock with the lock validator.)
1630 */
1631static inline void sock_lock_init(struct sock *sk)
1632{
1633        if (sk->sk_kern_sock)
1634                sock_lock_init_class_and_name(
1635                        sk,
1636                        af_family_kern_slock_key_strings[sk->sk_family],
1637                        af_family_kern_slock_keys + sk->sk_family,
1638                        af_family_kern_key_strings[sk->sk_family],
1639                        af_family_kern_keys + sk->sk_family);
1640        else
1641                sock_lock_init_class_and_name(
1642                        sk,
1643                        af_family_slock_key_strings[sk->sk_family],
1644                        af_family_slock_keys + sk->sk_family,
1645                        af_family_key_strings[sk->sk_family],
1646                        af_family_keys + sk->sk_family);
1647}
1648
1649/*
1650 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1651 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1652 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1653 */
1654static void sock_copy(struct sock *nsk, const struct sock *osk)
1655{
1656        const struct proto *prot = READ_ONCE(osk->sk_prot);
1657#ifdef CONFIG_SECURITY_NETWORK
1658        void *sptr = nsk->sk_security;
1659#endif
1660        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1661
1662        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1663               prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1664
1665#ifdef CONFIG_SECURITY_NETWORK
1666        nsk->sk_security = sptr;
1667        security_sk_clone(osk, nsk);
1668#endif
1669}
1670
1671static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1672                int family)
1673{
1674        struct sock *sk;
1675        struct kmem_cache *slab;
1676
1677        slab = prot->slab;
1678        if (slab != NULL) {
1679                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1680                if (!sk)
1681                        return sk;
1682                if (want_init_on_alloc(priority))
1683                        sk_prot_clear_nulls(sk, prot->obj_size);
1684        } else
1685                sk = kmalloc(prot->obj_size, priority);
1686
1687        if (sk != NULL) {
1688                if (security_sk_alloc(sk, family, priority))
1689                        goto out_free;
1690
1691                if (!try_module_get(prot->owner))
1692                        goto out_free_sec;
1693                sk_tx_queue_clear(sk);
1694        }
1695
1696        return sk;
1697
1698out_free_sec:
1699        security_sk_free(sk);
1700out_free:
1701        if (slab != NULL)
1702                kmem_cache_free(slab, sk);
1703        else
1704                kfree(sk);
1705        return NULL;
1706}
1707
1708static void sk_prot_free(struct proto *prot, struct sock *sk)
1709{
1710        struct kmem_cache *slab;
1711        struct module *owner;
1712
1713        owner = prot->owner;
1714        slab = prot->slab;
1715
1716        cgroup_sk_free(&sk->sk_cgrp_data);
1717        mem_cgroup_sk_free(sk);
1718        security_sk_free(sk);
1719        if (slab != NULL)
1720                kmem_cache_free(slab, sk);
1721        else
1722                kfree(sk);
1723        module_put(owner);
1724}
1725
1726/**
1727 *      sk_alloc - All socket objects are allocated here
1728 *      @net: the applicable net namespace
1729 *      @family: protocol family
1730 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1731 *      @prot: struct proto associated with this new sock instance
1732 *      @kern: is this to be a kernel socket?
1733 */
1734struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1735                      struct proto *prot, int kern)
1736{
1737        struct sock *sk;
1738
1739        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1740        if (sk) {
1741                sk->sk_family = family;
1742                /*
1743                 * See comment in struct sock definition to understand
1744                 * why we need sk_prot_creator -acme
1745                 */
1746                sk->sk_prot = sk->sk_prot_creator = prot;
1747                sk->sk_kern_sock = kern;
1748                sock_lock_init(sk);
1749                sk->sk_net_refcnt = kern ? 0 : 1;
1750                if (likely(sk->sk_net_refcnt)) {
1751                        get_net(net);
1752                        sock_inuse_add(net, 1);
1753                }
1754
1755                sock_net_set(sk, net);
1756                refcount_set(&sk->sk_wmem_alloc, 1);
1757
1758                mem_cgroup_sk_alloc(sk);
1759                cgroup_sk_alloc(&sk->sk_cgrp_data);
1760                sock_update_classid(&sk->sk_cgrp_data);
1761                sock_update_netprioidx(&sk->sk_cgrp_data);
1762                sk_tx_queue_clear(sk);
1763        }
1764
1765        return sk;
1766}
1767EXPORT_SYMBOL(sk_alloc);
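
/* Editorial sketch, not part of this file: a protocol family's ->create()
 * handler typically pairs sk_alloc() with sock_init_data(); "answer_prot"
 * stands for whatever struct proto the caller selected:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	... protocol specific initialisation ...
 */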
1768
1769/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1770 * grace period. This is the case for UDP sockets and TCP listeners.
1771 */
1772static void __sk_destruct(struct rcu_head *head)
1773{
1774        struct sock *sk = container_of(head, struct sock, sk_rcu);
1775        struct sk_filter *filter;
1776
1777        if (sk->sk_destruct)
1778                sk->sk_destruct(sk);
1779
1780        filter = rcu_dereference_check(sk->sk_filter,
1781                                       refcount_read(&sk->sk_wmem_alloc) == 0);
1782        if (filter) {
1783                sk_filter_uncharge(sk, filter);
1784                RCU_INIT_POINTER(sk->sk_filter, NULL);
1785        }
1786
1787        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1788
1789#ifdef CONFIG_BPF_SYSCALL
1790        bpf_sk_storage_free(sk);
1791#endif
1792
1793        if (atomic_read(&sk->sk_omem_alloc))
1794                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1795                         __func__, atomic_read(&sk->sk_omem_alloc));
1796
1797        if (sk->sk_frag.page) {
1798                put_page(sk->sk_frag.page);
1799                sk->sk_frag.page = NULL;
1800        }
1801
1802        if (sk->sk_peer_cred)
1803                put_cred(sk->sk_peer_cred);
1804        put_pid(sk->sk_peer_pid);
1805        if (likely(sk->sk_net_refcnt))
1806                put_net(sock_net(sk));
1807        sk_prot_free(sk->sk_prot_creator, sk);
1808}
1809
1810void sk_destruct(struct sock *sk)
1811{
1812        bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1813
1814        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1815                reuseport_detach_sock(sk);
1816                use_call_rcu = true;
1817        }
1818
1819        if (use_call_rcu)
1820                call_rcu(&sk->sk_rcu, __sk_destruct);
1821        else
1822                __sk_destruct(&sk->sk_rcu);
1823}
1824
1825static void __sk_free(struct sock *sk)
1826{
1827        if (likely(sk->sk_net_refcnt))
1828                sock_inuse_add(sock_net(sk), -1);
1829
1830        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1831                sock_diag_broadcast_destroy(sk);
1832        else
1833                sk_destruct(sk);
1834}
1835
1836void sk_free(struct sock *sk)
1837{
1838        /*
1839         * We subtract one from sk_wmem_alloc so we can tell whether
1840         * some packets are still in some tx queue.
1841         * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1842         */
1843        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1844                __sk_free(sk);
1845}
1846EXPORT_SYMBOL(sk_free);
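
/* Editorial worked example: the "+1 bias" kept in sk_wmem_alloc, assuming a
 * freshly allocated sock and a single queued skb:
 *
 *	sk = sk_alloc(...);		// sk_wmem_alloc == 1
 *	skb_set_owner_w(skb, sk);	// sk_wmem_alloc == 1 + skb->truesize
 *	sk_free(sk);			// drops the bias, sock stays alive
 *	kfree_skb(skb);			// sock_wfree() reaches 0 -> __sk_free(sk)
 */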
1847
1848static void sk_init_common(struct sock *sk)
1849{
1850        skb_queue_head_init(&sk->sk_receive_queue);
1851        skb_queue_head_init(&sk->sk_write_queue);
1852        skb_queue_head_init(&sk->sk_error_queue);
1853
1854        rwlock_init(&sk->sk_callback_lock);
1855        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1856                        af_rlock_keys + sk->sk_family,
1857                        af_family_rlock_key_strings[sk->sk_family]);
1858        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1859                        af_wlock_keys + sk->sk_family,
1860                        af_family_wlock_key_strings[sk->sk_family]);
1861        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1862                        af_elock_keys + sk->sk_family,
1863                        af_family_elock_key_strings[sk->sk_family]);
1864        lockdep_set_class_and_name(&sk->sk_callback_lock,
1865                        af_callback_keys + sk->sk_family,
1866                        af_family_clock_key_strings[sk->sk_family]);
1867}
1868
1869/**
1870 *      sk_clone_lock - clone a socket, and lock its clone
1871 *      @sk: the socket to clone
1872 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1873 *
1874 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1875 */
1876struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1877{
1878        struct proto *prot = READ_ONCE(sk->sk_prot);
1879        struct sock *newsk;
1880        bool is_charged = true;
1881
1882        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1883        if (newsk != NULL) {
1884                struct sk_filter *filter;
1885
1886                sock_copy(newsk, sk);
1887
1888                newsk->sk_prot_creator = prot;
1889
1890                /* SANITY */
1891                if (likely(newsk->sk_net_refcnt))
1892                        get_net(sock_net(newsk));
1893                sk_node_init(&newsk->sk_node);
1894                sock_lock_init(newsk);
1895                bh_lock_sock(newsk);
1896                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1897                newsk->sk_backlog.len = 0;
1898
1899                atomic_set(&newsk->sk_rmem_alloc, 0);
1900                /*
1901                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1902                 */
1903                refcount_set(&newsk->sk_wmem_alloc, 1);
1904                atomic_set(&newsk->sk_omem_alloc, 0);
1905                sk_init_common(newsk);
1906
1907                newsk->sk_dst_cache     = NULL;
1908                newsk->sk_dst_pending_confirm = 0;
1909                newsk->sk_wmem_queued   = 0;
1910                newsk->sk_forward_alloc = 0;
1911                atomic_set(&newsk->sk_drops, 0);
1912                newsk->sk_send_head     = NULL;
1913                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1914                atomic_set(&newsk->sk_zckey, 0);
1915
1916                sock_reset_flag(newsk, SOCK_DONE);
1917
1918                /* sk->sk_memcg will be populated at accept() time */
1919                newsk->sk_memcg = NULL;
1920
1921                cgroup_sk_clone(&newsk->sk_cgrp_data);
1922
1923                rcu_read_lock();
1924                filter = rcu_dereference(sk->sk_filter);
1925                if (filter != NULL)
1926                        /* Though it's an empty new sock, the charging may fail
1927                         * if sysctl_optmem_max was changed between the creation
1928                         * of the original socket and this clone.
1929                         */
1930                        is_charged = sk_filter_charge(newsk, filter);
1931                RCU_INIT_POINTER(newsk->sk_filter, filter);
1932                rcu_read_unlock();
1933
1934                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1935                        /* We need to make sure that we don't uncharge the new
1936                         * socket if we couldn't charge it in the first place
1937                         * as otherwise we uncharge the parent's filter.
1938                         */
1939                        if (!is_charged)
1940                                RCU_INIT_POINTER(newsk->sk_filter, NULL);
1941                        sk_free_unlock_clone(newsk);
1942                        newsk = NULL;
1943                        goto out;
1944                }
1945                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1946
1947                if (bpf_sk_storage_clone(sk, newsk)) {
1948                        sk_free_unlock_clone(newsk);
1949                        newsk = NULL;
1950                        goto out;
1951                }
1952
1953                /* Clear sk_user_data if parent had the pointer tagged
1954                 * as not suitable for copying when cloning.
1955                 */
1956                if (sk_user_data_is_nocopy(newsk))
1957                        newsk->sk_user_data = NULL;
1958
1959                newsk->sk_err      = 0;
1960                newsk->sk_err_soft = 0;
1961                newsk->sk_priority = 0;
1962                newsk->sk_incoming_cpu = raw_smp_processor_id();
1963                if (likely(newsk->sk_net_refcnt))
1964                        sock_inuse_add(sock_net(newsk), 1);
1965
1966                /*
1967                 * Before updating sk_refcnt, we must commit prior changes to memory
1968                 * (Documentation/RCU/rculist_nulls.rst for details)
1969                 */
1970                smp_wmb();
1971                refcount_set(&newsk->sk_refcnt, 2);
1972
1973                /*
1974                 * Increment the counter in the same struct proto as the master
1975                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1976                 * is the same as sk->sk_prot->socks, as this field was copied
1977                 * with memcpy).
1978                 *
1979                 * This _changes_ the previous behaviour, where
1980                 * tcp_create_openreq_child always incremented the
1981                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1982                 * to be taken into account by all callers. -acme
1983                 */
1984                sk_refcnt_debug_inc(newsk);
1985                sk_set_socket(newsk, NULL);
1986                sk_tx_queue_clear(newsk);
1987                RCU_INIT_POINTER(newsk->sk_wq, NULL);
1988
1989                if (newsk->sk_prot->sockets_allocated)
1990                        sk_sockets_allocated_inc(newsk);
1991
1992                if (sock_needs_netstamp(sk) &&
1993                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1994                        net_enable_timestamp();
1995        }
1996out:
1997        return newsk;
1998}
1999EXPORT_SYMBOL_GPL(sk_clone_lock);
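
/* Editorial sketch: the expected caller pattern. The clone is returned with
 * bh_lock_sock() held, so the caller must unlock it on every path:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol specific setup of the child ...
 *		bh_unlock_sock(newsk);
 *	}
 */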
2000
2001void sk_free_unlock_clone(struct sock *sk)
2002{
2003        /* It is still a raw copy of the parent, so invalidate
2004         * the destructor and do a plain sk_free() */
2005        sk->sk_destruct = NULL;
2006        bh_unlock_sock(sk);
2007        sk_free(sk);
2008}
2009EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2010
2011void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2012{
2013        u32 max_segs = 1;
2014
2015        sk_dst_set(sk, dst);
2016        sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2017        if (sk->sk_route_caps & NETIF_F_GSO)
2018                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2019        sk->sk_route_caps &= ~sk->sk_route_nocaps;
2020        if (sk_can_gso(sk)) {
2021                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2022                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2023                } else {
2024                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2025                        sk->sk_gso_max_size = dst->dev->gso_max_size;
2026                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2027                }
2028        }
2029        sk->sk_gso_max_segs = max_segs;
2030}
2031EXPORT_SYMBOL_GPL(sk_setup_caps);
2032
2033/*
2034 *      Simple resource managers for sockets.
2035 */
2036
2037
2038/*
2039 * Write buffer destructor automatically called from kfree_skb.
2040 */
2041void sock_wfree(struct sk_buff *skb)
2042{
2043        struct sock *sk = skb->sk;
2044        unsigned int len = skb->truesize;
2045
2046        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2047                /*
2048                 * Keep a reference on sk_wmem_alloc; it will be released
2049                 * after the sk_write_space() call.
2050                 */
2051                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2052                sk->sk_write_space(sk);
2053                len = 1;
2054        }
2055        /*
2056         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2057         * could not do because of in-flight packets
2058         */
2059        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2060                __sk_free(sk);
2061}
2062EXPORT_SYMBOL(sock_wfree);
2063
2064/* This variant of sock_wfree() is used by TCP,
2065 * since it sets SOCK_USE_WRITE_QUEUE.
2066 */
2067void __sock_wfree(struct sk_buff *skb)
2068{
2069        struct sock *sk = skb->sk;
2070
2071        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2072                __sk_free(sk);
2073}
2074
2075void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2076{
2077        skb_orphan(skb);
2078        skb->sk = sk;
2079#ifdef CONFIG_INET
2080        if (unlikely(!sk_fullsock(sk))) {
2081                skb->destructor = sock_edemux;
2082                sock_hold(sk);
2083                return;
2084        }
2085#endif
2086        skb->destructor = sock_wfree;
2087        skb_set_hash_from_sk(skb, sk);
2088        /*
2089         * We used to take a refcount on sk, but the following operation
2090         * is enough to guarantee sk_free() won't free this sock until
2091         * all in-flight packets are completed.
2092         */
2093        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2094}
2095EXPORT_SYMBOL(skb_set_owner_w);
2096
2097static bool can_skb_orphan_partial(const struct sk_buff *skb)
2098{
2099#ifdef CONFIG_TLS_DEVICE
2100        /* Drivers depend on in-order delivery for crypto offload,
2101         * partial orphan breaks out-of-order-OK logic.
2102         */
2103        if (skb->decrypted)
2104                return false;
2105#endif
2106        return (skb->destructor == sock_wfree ||
2107                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2108}
2109
2110/* This helper is used by netem, as it can hold packets in its
2111 * delay queue. We want to allow the owner socket to send more
2112 * packets, as if they were already TX completed by a typical driver.
2113 * But we also want to keep skb->sk set because some packet schedulers
2114 * rely on it (sch_fq for example).
2115 */
2116void skb_orphan_partial(struct sk_buff *skb)
2117{
2118        if (skb_is_tcp_pure_ack(skb))
2119                return;
2120
2121        if (can_skb_orphan_partial(skb)) {
2122                struct sock *sk = skb->sk;
2123
2124                if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2125                        WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2126                        skb->destructor = sock_efree;
2127                }
2128        } else {
2129                skb_orphan(skb);
2130        }
2131}
2132EXPORT_SYMBOL(skb_orphan_partial);
2133
2134/*
2135 * Read buffer destructor automatically called from kfree_skb.
2136 */
2137void sock_rfree(struct sk_buff *skb)
2138{
2139        struct sock *sk = skb->sk;
2140        unsigned int len = skb->truesize;
2141
2142        atomic_sub(len, &sk->sk_rmem_alloc);
2143        sk_mem_uncharge(sk, len);
2144}
2145EXPORT_SYMBOL(sock_rfree);
2146
2147/*
2148 * Buffer destructor for skbs that are not used directly in read or write
2149 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2150 */
2151void sock_efree(struct sk_buff *skb)
2152{
2153        sock_put(skb->sk);
2154}
2155EXPORT_SYMBOL(sock_efree);
2156
2157/* Buffer destructor for prefetch/receive path where reference count may
2158 * not be held, e.g. for listen sockets.
2159 */
2160#ifdef CONFIG_INET
2161void sock_pfree(struct sk_buff *skb)
2162{
2163        if (sk_is_refcounted(skb->sk))
2164                sock_gen_put(skb->sk);
2165}
2166EXPORT_SYMBOL(sock_pfree);
2167#endif /* CONFIG_INET */
2168
2169kuid_t sock_i_uid(struct sock *sk)
2170{
2171        kuid_t uid;
2172
2173        read_lock_bh(&sk->sk_callback_lock);
2174        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2175        read_unlock_bh(&sk->sk_callback_lock);
2176        return uid;
2177}
2178EXPORT_SYMBOL(sock_i_uid);
2179
2180unsigned long sock_i_ino(struct sock *sk)
2181{
2182        unsigned long ino;
2183
2184        read_lock_bh(&sk->sk_callback_lock);
2185        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2186        read_unlock_bh(&sk->sk_callback_lock);
2187        return ino;
2188}
2189EXPORT_SYMBOL(sock_i_ino);
2190
2191/*
2192 * Allocate a skb from the socket's send buffer.
2193 */
2194struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2195                             gfp_t priority)
2196{
2197        if (force ||
2198            refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2199                struct sk_buff *skb = alloc_skb(size, priority);
2200
2201                if (skb) {
2202                        skb_set_owner_w(skb, sk);
2203                        return skb;
2204                }
2205        }
2206        return NULL;
2207}
2208EXPORT_SYMBOL(sock_wmalloc);
2209
2210static void sock_ofree(struct sk_buff *skb)
2211{
2212        struct sock *sk = skb->sk;
2213
2214        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2215}
2216
2217struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2218                             gfp_t priority)
2219{
2220        struct sk_buff *skb;
2221
2222        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2223        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2224            sysctl_optmem_max)
2225                return NULL;
2226
2227        skb = alloc_skb(size, priority);
2228        if (!skb)
2229                return NULL;
2230
2231        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2232        skb->sk = sk;
2233        skb->destructor = sock_ofree;
2234        return skb;
2235}
2236
2237/*
2238 * Allocate a memory block from the socket's option memory buffer.
2239 */
2240void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2241{
2242        if ((unsigned int)size <= sysctl_optmem_max &&
2243            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2244                void *mem;
2245                /* First do the add, to avoid the race if kmalloc
2246                 * might sleep.
2247                 */
2248                atomic_add(size, &sk->sk_omem_alloc);
2249                mem = kmalloc(size, priority);
2250                if (mem)
2251                        return mem;
2252                atomic_sub(size, &sk->sk_omem_alloc);
2253        }
2254        return NULL;
2255}
2256EXPORT_SYMBOL(sock_kmalloc);
2257
2258/* Free an option memory block. Note that we actually want the inline
2259 * here, as it allows gcc to detect the nullify and fold away the
2260 * condition entirely.
2261 */
2262static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2263                                  const bool nullify)
2264{
2265        if (WARN_ON_ONCE(!mem))
2266                return;
2267        if (nullify)
2268                kfree_sensitive(mem);
2269        else
2270                kfree(mem);
2271        atomic_sub(size, &sk->sk_omem_alloc);
2272}
2273
2274void sock_kfree_s(struct sock *sk, void *mem, int size)
2275{
2276        __sock_kfree_s(sk, mem, size, false);
2277}
2278EXPORT_SYMBOL(sock_kfree_s);
2279
2280void sock_kzfree_s(struct sock *sk, void *mem, int size)
2281{
2282        __sock_kfree_s(sk, mem, size, true);
2283}
2284EXPORT_SYMBOL(sock_kzfree_s);
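
/* Editorial sketch: option memory must be released with the same size it
 * was charged with, so sock_kmalloc()/sock_kfree_s() come in matched pairs:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	... use buf ...
 *	sock_kfree_s(sk, buf, len);
 */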
2285
2286/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2287   I think these locks should be removed for datagram sockets.
2288 */
2289static long sock_wait_for_wmem(struct sock *sk, long timeo)
2290{
2291        DEFINE_WAIT(wait);
2292
2293        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2294        for (;;) {
2295                if (!timeo)
2296                        break;
2297                if (signal_pending(current))
2298                        break;
2299                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2300                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2301                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2302                        break;
2303                if (sk->sk_shutdown & SEND_SHUTDOWN)
2304                        break;
2305                if (sk->sk_err)
2306                        break;
2307                timeo = schedule_timeout(timeo);
2308        }
2309        finish_wait(sk_sleep(sk), &wait);
2310        return timeo;
2311}
2312
2313
2314/*
2315 *      Generic send/receive buffer handlers
2316 */
2317
2318struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2319                                     unsigned long data_len, int noblock,
2320                                     int *errcode, int max_page_order)
2321{
2322        struct sk_buff *skb;
2323        long timeo;
2324        int err;
2325
2326        timeo = sock_sndtimeo(sk, noblock);
2327        for (;;) {
2328                err = sock_error(sk);
2329                if (err != 0)
2330                        goto failure;
2331
2332                err = -EPIPE;
2333                if (sk->sk_shutdown & SEND_SHUTDOWN)
2334                        goto failure;
2335
2336                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2337                        break;
2338
2339                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2340                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2341                err = -EAGAIN;
2342                if (!timeo)
2343                        goto failure;
2344                if (signal_pending(current))
2345                        goto interrupted;
2346                timeo = sock_wait_for_wmem(sk, timeo);
2347        }
2348        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2349                                   errcode, sk->sk_allocation);
2350        if (skb)
2351                skb_set_owner_w(skb, sk);
2352        return skb;
2353
2354interrupted:
2355        err = sock_intr_errno(timeo);
2356failure:
2357        *errcode = err;
2358        return NULL;
2359}
2360EXPORT_SYMBOL(sock_alloc_send_pskb);
2361
2362struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2363                                    int noblock, int *errcode)
2364{
2365        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2366}
2367EXPORT_SYMBOL(sock_alloc_send_skb);
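
/* Editorial sketch, greatly simplified: how a datagram protocol typically
 * uses this helper in its sendmsg() path; "hlen" stands for whatever header
 * room the protocol needs:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;
 *	skb_reserve(skb, hlen);
 *	... copy the payload and hand the skb to the lower layer ...
 */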
2368
2369int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2370                     struct sockcm_cookie *sockc)
2371{
2372        u32 tsflags;
2373
2374        switch (cmsg->cmsg_type) {
2375        case SO_MARK:
2376                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2377                        return -EPERM;
2378                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2379                        return -EINVAL;
2380                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2381                break;
2382        case SO_TIMESTAMPING_OLD:
2383                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2384                        return -EINVAL;
2385
2386                tsflags = *(u32 *)CMSG_DATA(cmsg);
2387                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2388                        return -EINVAL;
2389
2390                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2391                sockc->tsflags |= tsflags;
2392                break;
2393        case SCM_TXTIME:
2394                if (!sock_flag(sk, SOCK_TXTIME))
2395                        return -EINVAL;
2396                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2397                        return -EINVAL;
2398                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2399                break;
2400        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2401        case SCM_RIGHTS:
2402        case SCM_CREDENTIALS:
2403                break;
2404        default:
2405                return -EINVAL;
2406        }
2407        return 0;
2408}
2409EXPORT_SYMBOL(__sock_cmsg_send);
2410
2411int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2412                   struct sockcm_cookie *sockc)
2413{
2414        struct cmsghdr *cmsg;
2415        int ret;
2416
2417        for_each_cmsghdr(cmsg, msg) {
2418                if (!CMSG_OK(msg, cmsg))
2419                        return -EINVAL;
2420                if (cmsg->cmsg_level != SOL_SOCKET)
2421                        continue;
2422                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2423                if (ret)
2424                        return ret;
2425        }
2426        return 0;
2427}
2428EXPORT_SYMBOL(sock_cmsg_send);
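
/* Editorial sketch (user space, illustrative only): attaching a transmit
 * time via SCM_TXTIME. This assumes SO_TXTIME was already enabled on the
 * socket, otherwise the code above returns -EINVAL; "txtime_ns" is just an
 * illustrative variable name:
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint64_t))] = {};
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_TXTIME;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(uint64_t));
 *	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(uint64_t));
 *	sendmsg(fd, &msg, 0);
 */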
2429
2430static void sk_enter_memory_pressure(struct sock *sk)
2431{
2432        if (!sk->sk_prot->enter_memory_pressure)
2433                return;
2434
2435        sk->sk_prot->enter_memory_pressure(sk);
2436}
2437
2438static void sk_leave_memory_pressure(struct sock *sk)
2439{
2440        if (sk->sk_prot->leave_memory_pressure) {
2441                sk->sk_prot->leave_memory_pressure(sk);
2442        } else {
2443                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2444
2445                if (memory_pressure && READ_ONCE(*memory_pressure))
2446                        WRITE_ONCE(*memory_pressure, 0);
2447        }
2448}
2449
2450#define SKB_FRAG_PAGE_ORDER     get_order(32768)
2451DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2452
2453/**
2454 * skb_page_frag_refill - check that a page_frag contains enough room
2455 * @sz: minimum size of the fragment we want to get
2456 * @pfrag: pointer to page_frag
2457 * @gfp: priority for memory allocation
2458 *
2459 * Note: While this allocator tries to use high order pages, there is
2460 * no guarantee that allocations succeed. Therefore, @sz MUST be
2461 * less than or equal to PAGE_SIZE.
2462 */
2463bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2464{
2465        if (pfrag->page) {
2466                if (page_ref_count(pfrag->page) == 1) {
2467                        pfrag->offset = 0;
2468                        return true;
2469                }
2470                if (pfrag->offset + sz <= pfrag->size)
2471                        return true;
2472                put_page(pfrag->page);
2473        }
2474
2475        pfrag->offset = 0;
2476        if (SKB_FRAG_PAGE_ORDER &&
2477            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2478                /* Avoid direct reclaim but allow kswapd to wake */
2479                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2480                                          __GFP_COMP | __GFP_NOWARN |
2481                                          __GFP_NORETRY,
2482                                          SKB_FRAG_PAGE_ORDER);
2483                if (likely(pfrag->page)) {
2484                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2485                        return true;
2486                }
2487        }
2488        pfrag->page = alloc_page(gfp);
2489        if (likely(pfrag->page)) {
2490                pfrag->size = PAGE_SIZE;
2491                return true;
2492        }
2493        return false;
2494}
2495EXPORT_SYMBOL(skb_page_frag_refill);
2496
2497bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2498{
2499        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2500                return true;
2501
2502        sk_enter_memory_pressure(sk);
2503        sk_stream_moderate_sndbuf(sk);
2504        return false;
2505}
2506EXPORT_SYMBOL(sk_page_frag_refill);
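
/* Editorial sketch, simplified from the stream sendmsg path: the usual
 * consumer pattern for the per-socket page frag:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */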
2507
2508static void __lock_sock(struct sock *sk)
2509        __releases(&sk->sk_lock.slock)
2510        __acquires(&sk->sk_lock.slock)
2511{
2512        DEFINE_WAIT(wait);
2513
2514        for (;;) {
2515                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2516                                        TASK_UNINTERRUPTIBLE);
2517                spin_unlock_bh(&sk->sk_lock.slock);
2518                schedule();
2519                spin_lock_bh(&sk->sk_lock.slock);
2520                if (!sock_owned_by_user(sk))
2521                        break;
2522        }
2523        finish_wait(&sk->sk_lock.wq, &wait);
2524}
2525
2526void __release_sock(struct sock *sk)
2527        __releases(&sk->sk_lock.slock)
2528        __acquires(&sk->sk_lock.slock)
2529{
2530        struct sk_buff *skb, *next;
2531
2532        while ((skb = sk->sk_backlog.head) != NULL) {
2533                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2534
2535                spin_unlock_bh(&sk->sk_lock.slock);
2536
2537                do {
2538                        next = skb->next;
2539                        prefetch(next);
2540                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2541                        skb_mark_not_on_list(skb);
2542                        sk_backlog_rcv(sk, skb);
2543
2544                        cond_resched();
2545
2546                        skb = next;
2547                } while (skb != NULL);
2548
2549                spin_lock_bh(&sk->sk_lock.slock);
2550        }
2551
2552        /*
2553         * Doing the zeroing here guarantees we cannot loop forever
2554         * while a wild producer attempts to flood us.
2555         */
2556        sk->sk_backlog.len = 0;
2557}
2558
2559void __sk_flush_backlog(struct sock *sk)
2560{
2561        spin_lock_bh(&sk->sk_lock.slock);
2562        __release_sock(sk);
2563        spin_unlock_bh(&sk->sk_lock.slock);
2564}
2565
2566/**
2567 * sk_wait_data - wait for data to arrive at sk_receive_queue
2568 * @sk:    sock to wait on
2569 * @timeo: for how long
2570 * @skb:   last skb seen on sk_receive_queue
2571 *
2572 * Now socket state including sk->sk_err is changed only under the lock,
2573 * hence we may omit checks after joining the wait queue.
2574 * We check the receive queue before schedule() only as an optimization;
2575 * it is very likely that release_sock() added new data.
2576 */
2577int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2578{
2579        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2580        int rc;
2581
2582        add_wait_queue(sk_sleep(sk), &wait);
2583        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2584        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2585        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2586        remove_wait_queue(sk_sleep(sk), &wait);
2587        return rc;
2588}
2589EXPORT_SYMBOL(sk_wait_data);
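
/* Editorial sketch, greatly simplified: a blocking receive loop built on
 * sk_wait_data(); the socket lock is held by the caller and dropped while
 * sleeping inside sk_wait_event():
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */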
2590
2591/**
2592 *      __sk_mem_raise_allocated - increase memory_allocated
2593 *      @sk: socket
2594 *      @size: memory size to allocate
2595 *      @amt: pages to allocate
2596 *      @kind: allocation type
2597 *
2598 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2599 */
2600int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2601{
2602        struct proto *prot = sk->sk_prot;
2603        long allocated = sk_memory_allocated_add(sk, amt);
2604        bool charged = true;
2605
2606        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2607            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2608                goto suppress_allocation;
2609
2610        /* Under limit. */
2611        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2612                sk_leave_memory_pressure(sk);
2613                return 1;
2614        }
2615
2616        /* Under pressure. */
2617        if (allocated > sk_prot_mem_limits(sk, 1))
2618                sk_enter_memory_pressure(sk);
2619
2620        /* Over hard limit. */
2621        if (allocated > sk_prot_mem_limits(sk, 2))
2622                goto suppress_allocation;
2623
2624        /* guarantee minimum buffer size under pressure */
2625        if (kind == SK_MEM_RECV) {
2626                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2627                        return 1;
2628
2629        } else { /* SK_MEM_SEND */
2630                int wmem0 = sk_get_wmem0(sk, prot);
2631
2632                if (sk->sk_type == SOCK_STREAM) {
2633                        if (sk->sk_wmem_queued < wmem0)
2634                                return 1;
2635                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2636                        return 1;
2637                }
2638        }
2639
2640        if (sk_has_memory_pressure(sk)) {
2641                u64 alloc;
2642
2643                if (!sk_under_memory_pressure(sk))
2644                        return 1;
2645                alloc = sk_sockets_allocated_read_positive(sk);
2646                if (sk_prot_mem_limits(sk, 2) > alloc *
2647                    sk_mem_pages(sk->sk_wmem_queued +
2648                                 atomic_read(&sk->sk_rmem_alloc) +
2649                                 sk->sk_forward_alloc))
2650                        return 1;
2651        }
2652
2653suppress_allocation:
2654
2655        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2656                sk_stream_moderate_sndbuf(sk);
2657
2658                /* Fail only if socket is _under_ its sndbuf.
2659                 * In this case we cannot block, so we have to fail.
2660                 */
2661                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2662                        return 1;
2663        }
2664
2665        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2666                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2667
2668        sk_memory_allocated_sub(sk, amt);
2669
2670        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2671                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2672
2673        return 0;
2674}
2675EXPORT_SYMBOL(__sk_mem_raise_allocated);
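
/* Editorial worked example with hypothetical limits: if the protocol's
 * memory sysctl (e.g. net.ipv4.tcp_mem) were "10000 20000 40000" pages,
 * then a charge bringing memory_allocated to 9000 pages is under the limit
 * and granted outright; 25000 pages is above the pressure threshold and is
 * only granted while the socket stays within its per-socket minima or its
 * fair share; 45000 pages is over the hard limit and is normally suppressed.
 */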
2676
2677/**
2678 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2679 *      @sk: socket
2680 *      @size: memory size to allocate
2681 *      @kind: allocation type
2682 *
2683 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2684 *      rmem allocation. This function assumes that protocols which have
2685 *      memory_pressure use sk_wmem_queued for write buffer accounting.
2686 */
2687int __sk_mem_schedule(struct sock *sk, int size, int kind)
2688{
2689        int ret, amt = sk_mem_pages(size);
2690
2691        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2692        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2693        if (!ret)
2694                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2695        return ret;
2696}
2697EXPORT_SYMBOL(__sk_mem_schedule);
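
/* Editorial worked example, assuming SK_MEM_QUANTUM == PAGE_SIZE == 4096:
 * a request of size = 3000 bytes gives sk_mem_pages(3000) == 1, so
 * sk_forward_alloc is advanced by 1 << SK_MEM_QUANTUM_SHIFT == 4096 bytes;
 * size = 5000 would charge two quanta (8192 bytes) instead.
 */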
2698
2699/**
2700 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2701 *      @sk: socket
2702 *      @amount: number of quanta
2703 *
2704 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2705 */
2706void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2707{
2708        sk_memory_allocated_sub(sk, amount);
2709
2710        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2711                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2712
2713        if (sk_under_memory_pressure(sk) &&
2714            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2715                sk_leave_memory_pressure(sk);
2716}
2717EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2718
2719/**
2720 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2721 *      @sk: socket
2722 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2723 */
2724void __sk_mem_reclaim(struct sock *sk, int amount)
2725{
2726        amount >>= SK_MEM_QUANTUM_SHIFT;
2727        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2728        __sk_mem_reduce_allocated(sk, amount);
2729}
2730EXPORT_SYMBOL(__sk_mem_reclaim);
2731
2732int sk_set_peek_off(struct sock *sk, int val)
2733{
2734        sk->sk_peek_off = val;
2735        return 0;
2736}
2737EXPORT_SYMBOL_GPL(sk_set_peek_off);
2738
2739/*
2740 * Set of default routines for initialising struct proto_ops when
2741 * the protocol does not support a particular function. In certain
2742 * cases where it makes no sense for a protocol to have a "do nothing"
2743 * function, some default processing is provided.
2744 */
2745
2746int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2747{
2748        return -EOPNOTSUPP;
2749}
2750EXPORT_SYMBOL(sock_no_bind);
2751
2752int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2753                    int len, int flags)
2754{
2755        return -EOPNOTSUPP;
2756}
2757EXPORT_SYMBOL(sock_no_connect);
2758
2759int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2760{
2761        return -EOPNOTSUPP;
2762}
2763EXPORT_SYMBOL(sock_no_socketpair);
2764
2765int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2766                   bool kern)
2767{
2768        return -EOPNOTSUPP;
2769}
2770EXPORT_SYMBOL(sock_no_accept);
2771
2772int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2773                    int peer)
2774{
2775        return -EOPNOTSUPP;
2776}
2777EXPORT_SYMBOL(sock_no_getname);
2778
2779int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2780{
2781        return -EOPNOTSUPP;
2782}
2783EXPORT_SYMBOL(sock_no_ioctl);
2784
2785int sock_no_listen(struct socket *sock, int backlog)
2786{
2787        return -EOPNOTSUPP;
2788}
2789EXPORT_SYMBOL(sock_no_listen);
2790
2791int sock_no_shutdown(struct socket *sock, int how)
2792{
2793        return -EOPNOTSUPP;
2794}
2795EXPORT_SYMBOL(sock_no_shutdown);
2796
2797int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2798{
2799        return -EOPNOTSUPP;
2800}
2801EXPORT_SYMBOL(sock_no_sendmsg);
2802
2803int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2804{
2805        return -EOPNOTSUPP;
2806}
2807EXPORT_SYMBOL(sock_no_sendmsg_locked);
2808
2809int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2810                    int flags)
2811{
2812        return -EOPNOTSUPP;
2813}
2814EXPORT_SYMBOL(sock_no_recvmsg);
2815
2816int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2817{
2818        /* Mirror missing mmap method error code */
2819        return -ENODEV;
2820}
2821EXPORT_SYMBOL(sock_no_mmap);
2822
2823/*
2824 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2825 * various sock-based usage counts.
2826 */
2827void __receive_sock(struct file *file)
2828{
2829        struct socket *sock;
2830        int error;
2831
2832        /*
2833         * The resulting value of "error" is ignored here since we only
2834         * need to take action when the file is a socket and testing
2835         * "sock" for NULL is sufficient.
2836         */
2837        sock = sock_from_file(file, &error);
2838        if (sock) {
2839                sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2840                sock_update_classid(&sock->sk->sk_cgrp_data);
2841        }
2842}
2843
2844ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2845{
2846        ssize_t res;
2847        struct msghdr msg = {.msg_flags = flags};
2848        struct kvec iov;
2849        char *kaddr = kmap(page);
2850        iov.iov_base = kaddr + offset;
2851        iov.iov_len = size;
2852        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2853        kunmap(page);
2854        return res;
2855}
2856EXPORT_SYMBOL(sock_no_sendpage);
2857
2858ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2859                                int offset, size_t size, int flags)
2860{
2861        ssize_t res;
2862        struct msghdr msg = {.msg_flags = flags};
2863        struct kvec iov;
2864        char *kaddr = kmap(page);
2865
2866        iov.iov_base = kaddr + offset;
2867        iov.iov_len = size;
2868        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2869        kunmap(page);
2870        return res;
2871}
2872EXPORT_SYMBOL(sock_no_sendpage_locked);
2873
2874/*
2875 *      Default Socket Callbacks
2876 */
2877
2878static void sock_def_wakeup(struct sock *sk)
2879{
2880        struct socket_wq *wq;
2881
2882        rcu_read_lock();
2883        wq = rcu_dereference(sk->sk_wq);
2884        if (skwq_has_sleeper(wq))
2885                wake_up_interruptible_all(&wq->wait);
2886        rcu_read_unlock();
2887}
2888
2889static void sock_def_error_report(struct sock *sk)
2890{
2891        struct socket_wq *wq;
2892
2893        rcu_read_lock();
2894        wq = rcu_dereference(sk->sk_wq);
2895        if (skwq_has_sleeper(wq))
2896                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2897        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2898        rcu_read_unlock();
2899}
2900
2901void sock_def_readable(struct sock *sk)
2902{
2903        struct socket_wq *wq;
2904
2905        rcu_read_lock();
2906        wq = rcu_dereference(sk->sk_wq);
2907        if (skwq_has_sleeper(wq))
2908                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2909                                                EPOLLRDNORM | EPOLLRDBAND);
2910        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2911        rcu_read_unlock();
2912}
2913
2914static void sock_def_write_space(struct sock *sk)
2915{
2916        struct socket_wq *wq;
2917
2918        rcu_read_lock();
2919
2920        /* Do not wake up a writer until he can make "significant"
2921         * progress.  --DaveM
2922         */
2923        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2924                wq = rcu_dereference(sk->sk_wq);
2925                if (skwq_has_sleeper(wq))
2926                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2927                                                EPOLLWRNORM | EPOLLWRBAND);
2928
2929                /* Should agree with poll, otherwise some programs break */
2930                if (sock_writeable(sk))
2931                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2932        }
2933
2934        rcu_read_unlock();
2935}
2936
2937static void sock_def_destruct(struct sock *sk)
2938{
2939}
2940
2941void sk_send_sigurg(struct sock *sk)
2942{
2943        if (sk->sk_socket && sk->sk_socket->file)
2944                if (send_sigurg(&sk->sk_socket->file->f_owner))
2945                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2946}
2947EXPORT_SYMBOL(sk_send_sigurg);
2948
2949void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2950                    unsigned long expires)
2951{
2952        if (!mod_timer(timer, expires))
2953                sock_hold(sk);
2954}
2955EXPORT_SYMBOL(sk_reset_timer);
2956
2957void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2958{
2959        if (del_timer(timer))
2960                __sock_put(sk);
2961}
2962EXPORT_SYMBOL(sk_stop_timer);
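
/* Editorial sketch: the reference taken by sk_reset_timer() when it arms a
 * previously idle timer is dropped either by sk_stop_timer() or by a
 * sock_put() in the timer handler once it has run:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);	// __sock_put() if still pending
 */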
2963
2964void sock_init_data(struct socket *sock, struct sock *sk)
2965{
2966        sk_init_common(sk);
2967        sk->sk_send_head        =       NULL;
2968
2969        timer_setup(&sk->sk_timer, NULL, 0);
2970
2971        sk->sk_allocation       =       GFP_KERNEL;
2972        sk->sk_rcvbuf           =       sysctl_rmem_default;
2973        sk->sk_sndbuf           =       sysctl_wmem_default;
2974        sk->sk_state            =       TCP_CLOSE;
2975        sk_set_socket(sk, sock);
2976
2977        sock_set_flag(sk, SOCK_ZAPPED);
2978
2979        if (sock) {
2980                sk->sk_type     =       sock->type;
2981                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2982                sock->sk        =       sk;
2983                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2984        } else {
2985                RCU_INIT_POINTER(sk->sk_wq, NULL);
2986                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2987        }
2988
2989        rwlock_init(&sk->sk_callback_lock);
2990        if (sk->sk_kern_sock)
2991                lockdep_set_class_and_name(
2992                        &sk->sk_callback_lock,
2993                        af_kern_callback_keys + sk->sk_family,
2994                        af_family_kern_clock_key_strings[sk->sk_family]);
2995        else
2996                lockdep_set_class_and_name(
2997                        &sk->sk_callback_lock,
2998                        af_callback_keys + sk->sk_family,
2999                        af_family_clock_key_strings[sk->sk_family]);
3000
3001        sk->sk_state_change     =       sock_def_wakeup;
3002        sk->sk_data_ready       =       sock_def_readable;
3003        sk->sk_write_space      =       sock_def_write_space;
3004        sk->sk_error_report     =       sock_def_error_report;
3005        sk->sk_destruct         =       sock_def_destruct;
3006
3007        sk->sk_frag.page        =       NULL;
3008        sk->sk_frag.offset      =       0;
3009        sk->sk_peek_off         =       -1;
3010
3011        sk->sk_peer_pid         =       NULL;
3012        sk->sk_peer_cred        =       NULL;
3013        sk->sk_write_pending    =       0;
3014        sk->sk_rcvlowat         =       1;
3015        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3016        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3017
3018        sk->sk_stamp = SK_DEFAULT_STAMP;
3019#if BITS_PER_LONG==32
3020        seqlock_init(&sk->sk_stamp_seq);
3021#endif
3022        atomic_set(&sk->sk_zckey, 0);
3023
3024#ifdef CONFIG_NET_RX_BUSY_POLL
3025        sk->sk_napi_id          =       0;
3026        sk->sk_ll_usec          =       sysctl_net_busy_read;
3027#endif
3028
3029        sk->sk_max_pacing_rate = ~0UL;
3030        sk->sk_pacing_rate = ~0UL;
3031        WRITE_ONCE(sk->sk_pacing_shift, 10);
3032        sk->sk_incoming_cpu = -1;
3033
3034        sk_rx_queue_clear(sk);
3035        /*
3036         * Before updating sk_refcnt, we must commit prior changes to memory
3037         * (Documentation/RCU/rculist_nulls.rst for details)
3038         */
3039        smp_wmb();
3040        refcount_set(&sk->sk_refcnt, 1);
3041        atomic_set(&sk->sk_drops, 0);
3042}
3043EXPORT_SYMBOL(sock_init_data);
3044
3045void lock_sock_nested(struct sock *sk, int subclass)
3046{
3047        might_sleep();
3048        spin_lock_bh(&sk->sk_lock.slock);
3049        if (sk->sk_lock.owned)
3050                __lock_sock(sk);
3051        sk->sk_lock.owned = 1;
3052        spin_unlock(&sk->sk_lock.slock);
3053        /*
3054         * The sk_lock has mutex_lock() semantics here:
3055         */
3056        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3057        local_bh_enable();
3058}
3059EXPORT_SYMBOL(lock_sock_nested);
3060
3061void release_sock(struct sock *sk)
3062{
3063        spin_lock_bh(&sk->sk_lock.slock);
3064        if (sk->sk_backlog.tail)
3065                __release_sock(sk);
3066
3067        /* Warning : release_cb() might need to release sk ownership,
3068         * i.e. call sock_release_ownership(sk) before us.
3069         */
3070        if (sk->sk_prot->release_cb)
3071                sk->sk_prot->release_cb(sk);
3072
3073        sock_release_ownership(sk);
3074        if (waitqueue_active(&sk->sk_lock.wq))
3075                wake_up(&sk->sk_lock.wq);
3076        spin_unlock_bh(&sk->sk_lock.slock);
3077}
3078EXPORT_SYMBOL(release_sock);
3079
3080/**
3081 * lock_sock_fast - fast version of lock_sock
3082 * @sk: socket
3083 *
3084 * This version should be used for very small sections, where the process won't block.
3085 * Returns false if the fast path is taken:
3086 *
3087 *   sk_lock.slock locked, owned = 0, BH disabled
3088 *
3089 * Returns true if the slow path is taken:
3090 *
3091 *   sk_lock.slock unlocked, owned = 1, BH enabled
3092 */
3093bool lock_sock_fast(struct sock *sk)
3094{
3095        might_sleep();
3096        spin_lock_bh(&sk->sk_lock.slock);
3097
3098        if (!sk->sk_lock.owned)
3099                /*
3100                 * Note : We must disable BH
3101                 */
3102                return false;
3103
3104        __lock_sock(sk);
3105        sk->sk_lock.owned = 1;
3106        spin_unlock(&sk->sk_lock.slock);
3107        /*
3108         * The sk_lock has mutex_lock() semantics here:
3109         */
3110        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3111        local_bh_enable();
3112        return true;
3113}
3114EXPORT_SYMBOL(lock_sock_fast);
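
/* Editorial sketch: the intended pairing, keeping the fast/slow decision in
 * a local so the matching unlock_sock_fast() releases whichever form of the
 * lock was actually taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */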
3115
3116int sock_gettstamp(struct socket *sock, void __user *userstamp,
3117                   bool timeval, bool time32)
3118{
3119        struct sock *sk = sock->sk;
3120        struct timespec64 ts;
3121
3122        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3123        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3124        if (ts.tv_sec == -1)
3125                return -ENOENT;
3126        if (ts.tv_sec == 0) {
3127                ktime_t kt = ktime_get_real();
3128                sock_write_timestamp(sk, kt);
3129                ts = ktime_to_timespec64(kt);
3130        }
3131
3132        if (timeval)
3133                ts.tv_nsec /= 1000;
3134
3135#ifdef CONFIG_COMPAT_32BIT_TIME
3136        if (time32)
3137                return put_old_timespec32(&ts, userstamp);
3138#endif
3139#ifdef CONFIG_SPARC64
3140        /* beware of padding in sparc64 timeval */
3141        if (timeval && !in_compat_syscall()) {
3142                struct __kernel_old_timeval __user tv = {
3143                        .tv_sec = ts.tv_sec,
3144                        .tv_usec = ts.tv_nsec,
3145                };
3146                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3147                        return -EFAULT;
3148                return 0;
3149        }
3150#endif
3151        return put_timespec64(&ts, userstamp);
3152}
3153EXPORT_SYMBOL(sock_gettstamp);
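/* Userspace sketch (assumption: this helper backs the SIOCGSTAMP family of
 * ioctls): fetch the timestamp of the last packet received on a socket:
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("%lld.%06ld\n", (long long)tv.tv_sec, (long)tv.tv_usec);
 */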
3154
3155void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3156{
3157        if (!sock_flag(sk, flag)) {
3158                unsigned long previous_flags = sk->sk_flags;
3159
3160                sock_set_flag(sk, flag);
3161                /*
3162                 * we just set one of the two flags which require net
3163                 * time stamping, but time stamping might have been on
3164                 * already because of the other one
3165                 */
3166                if (sock_needs_netstamp(sk) &&
3167                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3168                        net_enable_timestamp();
3169        }
3170}
3171
3172int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3173                       int level, int type)
3174{
3175        struct sock_exterr_skb *serr;
3176        struct sk_buff *skb;
3177        int copied, err;
3178
3179        err = -EAGAIN;
3180        skb = sock_dequeue_err_skb(sk);
3181        if (skb == NULL)
3182                goto out;
3183
3184        copied = skb->len;
3185        if (copied > len) {
3186                msg->msg_flags |= MSG_TRUNC;
3187                copied = len;
3188        }
3189        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3190        if (err)
3191                goto out_free_skb;
3192
3193        sock_recv_timestamp(msg, sk, skb);
3194
3195        serr = SKB_EXT_ERR(skb);
3196        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3197
3198        msg->msg_flags |= MSG_ERRQUEUE;
3199        err = copied;
3200
3201out_free_skb:
3202        kfree_skb(skb);
3203out:
3204        return err;
3205}
3206EXPORT_SYMBOL(sock_recv_errqueue);
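/* Userspace sketch (illustrative): queued error messages are read back with
 * recvmsg() and MSG_ERRQUEUE; the struct sock_extended_err built above
 * arrives as a control message (see e.g. IP_RECVERR in ip(7)):
 *
 *	char ctrl[512];
 *	struct msghdr msg = { .msg_control = ctrl, .msg_controllen = sizeof(ctrl) };
 *
 *	ssize_t n = recvmsg(fd, &msg, MSG_ERRQUEUE);
 */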
3207
3208/*
3209 *      Get a socket option on a socket.
3210 *
3211 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3212 *      asynchronous errors should be reported by getsockopt. We assume
3213 *      this means if you specify SO_ERROR (otherwise what's the point of it).
3214 */
3215int sock_common_getsockopt(struct socket *sock, int level, int optname,
3216                           char __user *optval, int __user *optlen)
3217{
3218        struct sock *sk = sock->sk;
3219
3220        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3221}
3222EXPORT_SYMBOL(sock_common_getsockopt);
3223
3224int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3225                        int flags)
3226{
3227        struct sock *sk = sock->sk;
3228        int addr_len = 0;
3229        int err;
3230
3231        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3232                                   flags & ~MSG_DONTWAIT, &addr_len);
3233        if (err >= 0)
3234                msg->msg_namelen = addr_len;
3235        return err;
3236}
3237EXPORT_SYMBOL(sock_common_recvmsg);
3238
3239/*
3240 *      Set socket options on an inet socket.
3241 */
3242int sock_common_setsockopt(struct socket *sock, int level, int optname,
3243                           sockptr_t optval, unsigned int optlen)
3244{
3245        struct sock *sk = sock->sk;
3246
3247        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3248}
3249EXPORT_SYMBOL(sock_common_setsockopt);
3250
3251void sk_common_release(struct sock *sk)
3252{
3253        if (sk->sk_prot->destroy)
3254                sk->sk_prot->destroy(sk);
3255
3256        /*
3257         * Observation: when sk_common_release is called, processes no
3258         * longer have access to the socket, but the network stack still does.
3259         * Step one, detach it from networking:
3260         *
3261         * A. Remove from hash tables.
3262         */
3263
3264        sk->sk_prot->unhash(sk);
3265
3266        /*
3267         * At this point the socket cannot receive new packets, but some packets
3268         * may still be in flight because another CPU is running the receiver and
3269         * did its hash table lookup before we unhashed the socket. Those packets
3270         * will reach the receive queue and be purged by the socket destructor.
3271         *
3272         * We also still have packets pending on the receive queue and, probably,
3273         * our own packets waiting in device queues. sock_destroy will drain the
3274         * receive queue, but transmitted packets will delay socket destruction
3275         * until the last reference is released.
3276         */
3277
3278        sock_orphan(sk);
3279
3280        xfrm_sk_free_policy(sk);
3281
3282        sk_refcnt_debug_release(sk);
3283
3284        sock_put(sk);
3285}
3286EXPORT_SYMBOL(sk_common_release);
3287
3288void sk_get_meminfo(const struct sock *sk, u32 *mem)
3289{
3290        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3291
3292        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3293        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3294        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3295        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3296        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3297        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3298        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3299        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3300        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3301}
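/* Consumer sketch (assumption about the call site): sock_diag fills a
 * SK_MEMINFO_VARS-sized array with this helper and exports it as a netlink
 * attribute, roughly:
 *
 *	u32 mem[SK_MEMINFO_VARS];
 *
 *	sk_get_meminfo(sk, mem);
 *	nla_put(skb, attrtype, sizeof(mem), &mem);
 */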
3302
3303#ifdef CONFIG_PROC_FS
3304#define PROTO_INUSE_NR  64      /* should be enough for the first time */
3305struct prot_inuse {
3306        int val[PROTO_INUSE_NR];
3307};
3308
3309static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3310
3311void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3312{
3313        __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3314}
3315EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3316
3317int sock_prot_inuse_get(struct net *net, struct proto *prot)
3318{
3319        int cpu, idx = prot->inuse_idx;
3320        int res = 0;
3321
3322        for_each_possible_cpu(cpu)
3323                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3324
3325        return res >= 0 ? res : 0;
3326}
3327EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
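/* Usage sketch (illustrative): protocols account sockets when they are
 * hashed/unhashed, and the per-cpu sums show up in /proc/net/protocols:
 *
 *	sock_prot_inuse_add(net, sk->sk_prot, 1);	when hashing
 *	sock_prot_inuse_add(net, sk->sk_prot, -1);	when unhashing
 */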
3328
3329static void sock_inuse_add(struct net *net, int val)
3330{
3331        this_cpu_add(*net->core.sock_inuse, val);
3332}
3333
3334int sock_inuse_get(struct net *net)
3335{
3336        int cpu, res = 0;
3337
3338        for_each_possible_cpu(cpu)
3339                res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3340
3341        return res;
3342}
3343
3344EXPORT_SYMBOL_GPL(sock_inuse_get);
3345
3346static int __net_init sock_inuse_init_net(struct net *net)
3347{
3348        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3349        if (net->core.prot_inuse == NULL)
3350                return -ENOMEM;
3351
3352        net->core.sock_inuse = alloc_percpu(int);
3353        if (net->core.sock_inuse == NULL)
3354                goto out;
3355
3356        return 0;
3357
3358out:
3359        free_percpu(net->core.prot_inuse);
3360        return -ENOMEM;
3361}
3362
3363static void __net_exit sock_inuse_exit_net(struct net *net)
3364{
3365        free_percpu(net->core.prot_inuse);
3366        free_percpu(net->core.sock_inuse);
3367}
3368
3369static struct pernet_operations net_inuse_ops = {
3370        .init = sock_inuse_init_net,
3371        .exit = sock_inuse_exit_net,
3372};
3373
3374static __init int net_inuse_init(void)
3375{
3376        if (register_pernet_subsys(&net_inuse_ops))
3377                panic("Cannot initialize net inuse counters");
3378
3379        return 0;
3380}
3381
3382core_initcall(net_inuse_init);
3383
3384static int assign_proto_idx(struct proto *prot)
3385{
3386        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3387
3388        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3389                pr_err("PROTO_INUSE_NR exhausted\n");
3390                return -ENOSPC;
3391        }
3392
3393        set_bit(prot->inuse_idx, proto_inuse_idx);
3394        return 0;
3395}
3396
3397static void release_proto_idx(struct proto *prot)
3398{
3399        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3400                clear_bit(prot->inuse_idx, proto_inuse_idx);
3401}
3402#else
3403static inline int assign_proto_idx(struct proto *prot)
3404{
3405        return 0;
3406}
3407
3408static inline void release_proto_idx(struct proto *prot)
3409{
3410}
3411
3412static void sock_inuse_add(struct net *net, int val)
3413{
3414}
3415#endif
3416
3417static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3418{
3419        if (!twsk_prot)
3420                return;
3421        kfree(twsk_prot->twsk_slab_name);
3422        twsk_prot->twsk_slab_name = NULL;
3423        kmem_cache_destroy(twsk_prot->twsk_slab);
3424        twsk_prot->twsk_slab = NULL;
3425}
3426
3427static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3428{
3429        if (!rsk_prot)
3430                return;
3431        kfree(rsk_prot->slab_name);
3432        rsk_prot->slab_name = NULL;
3433        kmem_cache_destroy(rsk_prot->slab);
3434        rsk_prot->slab = NULL;
3435}
3436
3437static int req_prot_init(const struct proto *prot)
3438{
3439        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3440
3441        if (!rsk_prot)
3442                return 0;
3443
3444        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3445                                        prot->name);
3446        if (!rsk_prot->slab_name)
3447                return -ENOMEM;
3448
3449        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3450                                           rsk_prot->obj_size, 0,
3451                                           SLAB_ACCOUNT | prot->slab_flags,
3452                                           NULL);
3453
3454        if (!rsk_prot->slab) {
3455                pr_crit("%s: Can't create request sock SLAB cache!\n",
3456                        prot->name);
3457                return -ENOMEM;
3458        }
3459        return 0;
3460}
3461
3462int proto_register(struct proto *prot, int alloc_slab)
3463{
3464        int ret = -ENOBUFS;
3465
3466        if (alloc_slab) {
3467                prot->slab = kmem_cache_create_usercopy(prot->name,
3468                                        prot->obj_size, 0,
3469                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3470                                        prot->slab_flags,
3471                                        prot->useroffset, prot->usersize,
3472                                        NULL);
3473
3474                if (prot->slab == NULL) {
3475                        pr_crit("%s: Can't create sock SLAB cache!\n",
3476                                prot->name);
3477                        goto out;
3478                }
3479
3480                if (req_prot_init(prot))
3481                        goto out_free_request_sock_slab;
3482
3483                if (prot->twsk_prot != NULL) {
3484                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3485
3486                        if (prot->twsk_prot->twsk_slab_name == NULL)
3487                                goto out_free_request_sock_slab;
3488
3489                        prot->twsk_prot->twsk_slab =
3490                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3491                                                  prot->twsk_prot->twsk_obj_size,
3492                                                  0,
3493                                                  SLAB_ACCOUNT |
3494                                                  prot->slab_flags,
3495                                                  NULL);
3496                        if (prot->twsk_prot->twsk_slab == NULL)
3497                                goto out_free_timewait_sock_slab;
3498                }
3499        }
3500
3501        mutex_lock(&proto_list_mutex);
3502        ret = assign_proto_idx(prot);
3503        if (ret) {
3504                mutex_unlock(&proto_list_mutex);
3505                goto out_free_timewait_sock_slab;
3506        }
3507        list_add(&prot->node, &proto_list);
3508        mutex_unlock(&proto_list_mutex);
3509        return ret;
3510
3511out_free_timewait_sock_slab:
3512        if (alloc_slab && prot->twsk_prot)
3513                tw_prot_cleanup(prot->twsk_prot);
3514out_free_request_sock_slab:
3515        if (alloc_slab) {
3516                req_prot_cleanup(prot->rsk_prot);
3517
3518                kmem_cache_destroy(prot->slab);
3519                prot->slab = NULL;
3520        }
3521out:
3522        return ret;
3523}
3524EXPORT_SYMBOL(proto_register);
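/* Registration sketch (hypothetical protocol "foo", for illustration only):
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_prot, 1);	with a backing slab
 *	...
 *	proto_unregister(&foo_prot);
 */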
3525
3526void proto_unregister(struct proto *prot)
3527{
3528        mutex_lock(&proto_list_mutex);
3529        release_proto_idx(prot);
3530        list_del(&prot->node);
3531        mutex_unlock(&proto_list_mutex);
3532
3533        kmem_cache_destroy(prot->slab);
3534        prot->slab = NULL;
3535
3536        req_prot_cleanup(prot->rsk_prot);
3537        tw_prot_cleanup(prot->twsk_prot);
3538}
3539EXPORT_SYMBOL(proto_unregister);
3540
3541int sock_load_diag_module(int family, int protocol)
3542{
3543        if (!protocol) {
3544                if (!sock_is_registered(family))
3545                        return -ENOENT;
3546
3547                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3548                                      NETLINK_SOCK_DIAG, family);
3549        }
3550
3551#ifdef CONFIG_INET
3552        if (family == AF_INET &&
3553            protocol != IPPROTO_RAW &&
3554            protocol < MAX_INET_PROTOS &&
3555            !rcu_access_pointer(inet_protos[protocol]))
3556                return -ENOENT;
3557#endif
3558
3559        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3560                              NETLINK_SOCK_DIAG, family, protocol);
3561}
3562EXPORT_SYMBOL(sock_load_diag_module);
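/* Note (sketch of the matching side): diag modules advertise themselves with
 * aliases such as MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG,
 * 2 (AF_INET)), which the request_module() strings built above resolve against.
 */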
3563
3564#ifdef CONFIG_PROC_FS
3565static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3566        __acquires(proto_list_mutex)
3567{
3568        mutex_lock(&proto_list_mutex);
3569        return seq_list_start_head(&proto_list, *pos);
3570}
3571
3572static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3573{
3574        return seq_list_next(v, &proto_list, pos);
3575}
3576
3577static void proto_seq_stop(struct seq_file *seq, void *v)
3578        __releases(proto_list_mutex)
3579{
3580        mutex_unlock(&proto_list_mutex);
3581}
3582
3583static char proto_method_implemented(const void *method)
3584{
3585        return method == NULL ? 'n' : 'y';
3586}
3587static long sock_prot_memory_allocated(struct proto *proto)
3588{
3589        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3590}
3591
3592static const char *sock_prot_memory_pressure(struct proto *proto)
3593{
3594        return proto->memory_pressure != NULL ?
3595        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3596}
3597
3598static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3599{
3600
3601        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3602                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3603                   proto->name,
3604                   proto->obj_size,
3605                   sock_prot_inuse_get(seq_file_net(seq), proto),
3606                   sock_prot_memory_allocated(proto),
3607                   sock_prot_memory_pressure(proto),
3608                   proto->max_header,
3609                   proto->slab == NULL ? "no" : "yes",
3610                   module_name(proto->owner),
3611                   proto_method_implemented(proto->close),
3612                   proto_method_implemented(proto->connect),
3613                   proto_method_implemented(proto->disconnect),
3614                   proto_method_implemented(proto->accept),
3615                   proto_method_implemented(proto->ioctl),
3616                   proto_method_implemented(proto->init),
3617                   proto_method_implemented(proto->destroy),
3618                   proto_method_implemented(proto->shutdown),
3619                   proto_method_implemented(proto->setsockopt),
3620                   proto_method_implemented(proto->getsockopt),
3621                   proto_method_implemented(proto->sendmsg),
3622                   proto_method_implemented(proto->recvmsg),
3623                   proto_method_implemented(proto->sendpage),
3624                   proto_method_implemented(proto->bind),
3625                   proto_method_implemented(proto->backlog_rcv),
3626                   proto_method_implemented(proto->hash),
3627                   proto_method_implemented(proto->unhash),
3628                   proto_method_implemented(proto->get_port),
3629                   proto_method_implemented(proto->enter_memory_pressure));
3630}
3631
3632static int proto_seq_show(struct seq_file *seq, void *v)
3633{
3634        if (v == &proto_list)
3635                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3636                           "protocol",
3637                           "size",
3638                           "sockets",
3639                           "memory",
3640                           "press",
3641                           "maxhdr",
3642                           "slab",
3643                           "module",
3644                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3645        else
3646                proto_seq_printf(seq, list_entry(v, struct proto, node));
3647        return 0;
3648}
3649
3650static const struct seq_operations proto_seq_ops = {
3651        .start  = proto_seq_start,
3652        .next   = proto_seq_next,
3653        .stop   = proto_seq_stop,
3654        .show   = proto_seq_show,
3655};
3656
3657static __net_init int proto_init_net(struct net *net)
3658{
3659        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3660                        sizeof(struct seq_net_private)))
3661                return -ENOMEM;
3662
3663        return 0;
3664}
3665
3666static __net_exit void proto_exit_net(struct net *net)
3667{
3668        remove_proc_entry("protocols", net->proc_net);
3669}
3670
3671
3672static __net_initdata struct pernet_operations proto_net_ops = {
3673        .init = proto_init_net,
3674        .exit = proto_exit_net,
3675};
3676
3677static int __init proto_init(void)
3678{
3679        return register_pernet_subsys(&proto_net_ops);
3680}
3681
3682subsys_initcall(proto_init);
3683
3684#endif /* CONFIG_PROC_FS */
3685
3686#ifdef CONFIG_NET_RX_BUSY_POLL
3687bool sk_busy_loop_end(void *p, unsigned long start_time)
3688{
3689        struct sock *sk = p;
3690
3691        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3692               sk_busy_loop_timeout(sk, start_time);
3693}
3694EXPORT_SYMBOL(sk_busy_loop_end);
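/* Caller sketch (assumption about the call site): sk_busy_loop() passes this
 * as the loop_end callback, roughly:
 *
 *	napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
 */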
3695#endif /* CONFIG_NET_RX_BUSY_POLL */
3696
3697int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3698{
3699        if (!sk->sk_prot->bind_add)
3700                return -EOPNOTSUPP;
3701        return sk->sk_prot->bind_add(sk, addr, addr_len);
3702}
3703EXPORT_SYMBOL(sock_bind_add);
3704