linux/net/core/sock.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Generic socket support routines. Memory allocators, socket lock/release
   8 *              handler for protocols to use and generic option handler.
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116
 117#include <linux/uaccess.h>
 118
 119#include <linux/netdevice.h>
 120#include <net/protocol.h>
 121#include <linux/skbuff.h>
 122#include <net/net_namespace.h>
 123#include <net/request_sock.h>
 124#include <net/sock.h>
 125#include <linux/net_tstamp.h>
 126#include <net/xfrm.h>
 127#include <linux/ipsec.h>
 128#include <net/cls_cgroup.h>
 129#include <net/netprio_cgroup.h>
 130#include <linux/sock_diag.h>
 131
 132#include <linux/filter.h>
 133#include <net/sock_reuseport.h>
 134#include <net/bpf_sk_storage.h>
 135
 136#include <trace/events/sock.h>
 137
 138#include <net/tcp.h>
 139#include <net/busy_poll.h>
 140
 141static DEFINE_MUTEX(proto_list_mutex);
 142static LIST_HEAD(proto_list);
 143
 144static void sock_inuse_add(struct net *net, int val);
 145
 146/**
 147 * sk_ns_capable - General socket capability test
 148 * @sk: Socket to use a capability on or through
 149 * @user_ns: The user namespace of the capability to use
 150 * @cap: The capability to use
 151 *
  152 * Test to see if the opener of the socket had the capability @cap in the
  153 * user namespace @user_ns when the socket was created and whether the
  154 * current process has it now.
 155 */
 156bool sk_ns_capable(const struct sock *sk,
 157                   struct user_namespace *user_ns, int cap)
 158{
 159        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 160                ns_capable(user_ns, cap);
 161}
 162EXPORT_SYMBOL(sk_ns_capable);
 163
 164/**
 165 * sk_capable - Socket global capability test
 166 * @sk: Socket to use a capability on or through
 167 * @cap: The global capability to use
 168 *
  169 * Test to see if the opener of the socket had the capability @cap in all
  170 * user namespaces when the socket was created and whether the current
  171 * process has it now.
 172 */
 173bool sk_capable(const struct sock *sk, int cap)
 174{
 175        return sk_ns_capable(sk, &init_user_ns, cap);
 176}
 177EXPORT_SYMBOL(sk_capable);
 178
 179/**
 180 * sk_net_capable - Network namespace socket capability test
 181 * @sk: Socket to use a capability on or through
 182 * @cap: The capability to use
 183 *
  184 * Test to see if the opener of the socket had the capability @cap over the
  185 * network namespace the socket is a member of when the socket was created
  186 * and whether the current process has it now.
 187 */
 188bool sk_net_capable(const struct sock *sk, int cap)
 189{
 190        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 191}
 192EXPORT_SYMBOL(sk_net_capable);
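/*
 * Illustrative sketch (not part of this file): a protocol handler might
 * gate a privileged, per-netns operation on the helpers above. The
 * function below is hypothetical.
 *
 *	static int example_proto_enable_debug(struct sock *sk)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sock_set_flag(sk, SOCK_DBG);
 *		return 0;
 *	}
 */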
 193
 194/*
 195 * Each address family might have different locking rules, so we have
 196 * one slock key per address family and separate keys for internal and
 197 * userspace sockets.
 198 */
 199static struct lock_class_key af_family_keys[AF_MAX];
 200static struct lock_class_key af_family_kern_keys[AF_MAX];
 201static struct lock_class_key af_family_slock_keys[AF_MAX];
 202static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 203
 204/*
 205 * Make lock validator output more readable. (we pre-construct these
 206 * strings build-time, so that runtime initialization of socket
 207 * locks is fast):
 208 */
 209
 210#define _sock_locks(x)                                            \
 211  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 212  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 213  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 214  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 215  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 216  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 217  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 218  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 219  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 220  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 221  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 222  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 223  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 224  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 225  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 226  x "AF_MAX"
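/*
 * For reference: _sock_locks("sk_lock-") expands to the initializer list
 * "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 * "sk_lock-AF_MAX", which are the lock class names lockdep prints for
 * the per-family socket locks defined below.
 */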
 227
 228static const char *const af_family_key_strings[AF_MAX+1] = {
 229        _sock_locks("sk_lock-")
 230};
 231static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 232        _sock_locks("slock-")
 233};
 234static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 235        _sock_locks("clock-")
 236};
 237
 238static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 239        _sock_locks("k-sk_lock-")
 240};
 241static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 242        _sock_locks("k-slock-")
 243};
 244static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 245        _sock_locks("k-clock-")
 246};
 247static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 248        _sock_locks("rlock-")
 249};
 250static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 251        _sock_locks("wlock-")
 252};
 253static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 254        _sock_locks("elock-")
 255};
 256
 257/*
 258 * sk_callback_lock and sk queues locking rules are per-address-family,
 259 * so split the lock classes by using a per-AF key:
 260 */
 261static struct lock_class_key af_callback_keys[AF_MAX];
 262static struct lock_class_key af_rlock_keys[AF_MAX];
 263static struct lock_class_key af_wlock_keys[AF_MAX];
 264static struct lock_class_key af_elock_keys[AF_MAX];
 265static struct lock_class_key af_kern_callback_keys[AF_MAX];
 266
 267/* Run time adjustable parameters. */
 268__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 269EXPORT_SYMBOL(sysctl_wmem_max);
 270__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 271EXPORT_SYMBOL(sysctl_rmem_max);
 272__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 273__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 274
 275/* Maximal space eaten by iovec or ancillary data plus some space */
 276int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 277EXPORT_SYMBOL(sysctl_optmem_max);
 278
 279int sysctl_tstamp_allow_data __read_mostly = 1;
 280
 281DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 282EXPORT_SYMBOL_GPL(memalloc_socks_key);
 283
 284/**
 285 * sk_set_memalloc - sets %SOCK_MEMALLOC
 286 * @sk: socket to set it on
 287 *
 288 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 289 * It's the responsibility of the admin to adjust min_free_kbytes
 290 * to meet the requirements
 291 */
 292void sk_set_memalloc(struct sock *sk)
 293{
 294        sock_set_flag(sk, SOCK_MEMALLOC);
 295        sk->sk_allocation |= __GFP_MEMALLOC;
 296        static_branch_inc(&memalloc_socks_key);
 297}
 298EXPORT_SYMBOL_GPL(sk_set_memalloc);
 299
 300void sk_clear_memalloc(struct sock *sk)
 301{
 302        sock_reset_flag(sk, SOCK_MEMALLOC);
 303        sk->sk_allocation &= ~__GFP_MEMALLOC;
 304        static_branch_dec(&memalloc_socks_key);
 305
 306        /*
 307         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
  308          * progress of swapping. SOCK_MEMALLOC may be cleared while the
  309          * socket still has rmem allocations, due to the last swapfile being
  310          * deactivated, but there is a risk that the socket is unusable due to exceeding
 311         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 312         */
 313        sk_mem_reclaim(sk);
 314}
 315EXPORT_SYMBOL_GPL(sk_clear_memalloc);
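/*
 * Illustrative sketch (not part of this file): a transport whose socket
 * backs swap (e.g. swap over NBD/NFS) turns the emergency reserves on
 * for the lifetime of the swap device. The hook names are hypothetical.
 *
 *	static void example_swapdev_activate(struct socket *sock)
 *	{
 *		sk_set_memalloc(sock->sk);
 *	}
 *
 *	static void example_swapdev_deactivate(struct socket *sock)
 *	{
 *		sk_clear_memalloc(sock->sk);
 *	}
 */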
 316
 317int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 318{
 319        int ret;
 320        unsigned int noreclaim_flag;
 321
 322        /* these should have been dropped before queueing */
 323        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 324
 325        noreclaim_flag = memalloc_noreclaim_save();
 326        ret = sk->sk_backlog_rcv(sk, skb);
 327        memalloc_noreclaim_restore(noreclaim_flag);
 328
 329        return ret;
 330}
 331EXPORT_SYMBOL(__sk_backlog_rcv);
 332
 333static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 334{
 335        struct __kernel_sock_timeval tv;
 336
 337        if (timeo == MAX_SCHEDULE_TIMEOUT) {
 338                tv.tv_sec = 0;
 339                tv.tv_usec = 0;
 340        } else {
 341                tv.tv_sec = timeo / HZ;
 342                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 343        }
 344
 345        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 346                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 347                *(struct old_timeval32 *)optval = tv32;
 348                return sizeof(tv32);
 349        }
 350
 351        if (old_timeval) {
 352                struct __kernel_old_timeval old_tv;
 353                old_tv.tv_sec = tv.tv_sec;
 354                old_tv.tv_usec = tv.tv_usec;
 355                *(struct __kernel_old_timeval *)optval = old_tv;
 356                return sizeof(old_tv);
 357        }
 358
 359        *(struct __kernel_sock_timeval *)optval = tv;
 360        return sizeof(tv);
 361}
 362
 363static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
 364{
 365        struct __kernel_sock_timeval tv;
 366
 367        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 368                struct old_timeval32 tv32;
 369
 370                if (optlen < sizeof(tv32))
 371                        return -EINVAL;
 372
 373                if (copy_from_user(&tv32, optval, sizeof(tv32)))
 374                        return -EFAULT;
 375                tv.tv_sec = tv32.tv_sec;
 376                tv.tv_usec = tv32.tv_usec;
 377        } else if (old_timeval) {
 378                struct __kernel_old_timeval old_tv;
 379
 380                if (optlen < sizeof(old_tv))
 381                        return -EINVAL;
 382                if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
 383                        return -EFAULT;
 384                tv.tv_sec = old_tv.tv_sec;
 385                tv.tv_usec = old_tv.tv_usec;
 386        } else {
 387                if (optlen < sizeof(tv))
 388                        return -EINVAL;
 389                if (copy_from_user(&tv, optval, sizeof(tv)))
 390                        return -EFAULT;
 391        }
 392        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 393                return -EDOM;
 394
 395        if (tv.tv_sec < 0) {
 396                static int warned __read_mostly;
 397
 398                *timeo_p = 0;
 399                if (warned < 10 && net_ratelimit()) {
 400                        warned++;
 401                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 402                                __func__, current->comm, task_pid_nr(current));
 403                }
 404                return 0;
 405        }
 406        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 407        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 408                return 0;
 409        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 410                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 411        return 0;
 412}
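/*
 * Illustrative userspace sketch (not part of this file): the timeouts
 * parsed above are set with SO_RCVTIMEO/SO_SNDTIMEO, e.g.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A zero timeout selects MAX_SCHEDULE_TIMEOUT (block indefinitely), and
 * a tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM.
 */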
 413
 414static void sock_warn_obsolete_bsdism(const char *name)
 415{
 416        static int warned;
 417        static char warncomm[TASK_COMM_LEN];
 418        if (strcmp(warncomm, current->comm) && warned < 5) {
 419                strcpy(warncomm,  current->comm);
 420                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 421                        warncomm, name);
 422                warned++;
 423        }
 424}
 425
 426static bool sock_needs_netstamp(const struct sock *sk)
 427{
 428        switch (sk->sk_family) {
 429        case AF_UNSPEC:
 430        case AF_UNIX:
 431                return false;
 432        default:
 433                return true;
 434        }
 435}
 436
 437static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 438{
 439        if (sk->sk_flags & flags) {
 440                sk->sk_flags &= ~flags;
 441                if (sock_needs_netstamp(sk) &&
 442                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 443                        net_disable_timestamp();
 444        }
 445}
 446
 447
 448int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 449{
 450        unsigned long flags;
 451        struct sk_buff_head *list = &sk->sk_receive_queue;
 452
 453        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 454                atomic_inc(&sk->sk_drops);
 455                trace_sock_rcvqueue_full(sk, skb);
 456                return -ENOMEM;
 457        }
 458
 459        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 460                atomic_inc(&sk->sk_drops);
 461                return -ENOBUFS;
 462        }
 463
 464        skb->dev = NULL;
 465        skb_set_owner_r(skb, sk);
 466
  467        /* We escape from the RCU protected region, so make sure we don't
  468         * leak a non-refcounted dst.
 469         */
 470        skb_dst_force(skb);
 471
 472        spin_lock_irqsave(&list->lock, flags);
 473        sock_skb_set_dropcount(sk, skb);
 474        __skb_queue_tail(list, skb);
 475        spin_unlock_irqrestore(&list->lock, flags);
 476
 477        if (!sock_flag(sk, SOCK_DEAD))
 478                sk->sk_data_ready(sk);
 479        return 0;
 480}
 481EXPORT_SYMBOL(__sock_queue_rcv_skb);
 482
 483int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 484{
 485        int err;
 486
 487        err = sk_filter(sk, skb);
 488        if (err)
 489                return err;
 490
 491        return __sock_queue_rcv_skb(sk, skb);
 492}
 493EXPORT_SYMBOL(sock_queue_rcv_skb);
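/*
 * Illustrative sketch (not part of this file): a datagram protocol's
 * receive path would typically hand packets to the socket through
 * sock_queue_rcv_skb(), which applies the socket filter, charges rmem
 * and wakes up readers. The handler name is hypothetical.
 *
 *	static int example_proto_queue(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */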
 494
 495int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 496                     const int nested, unsigned int trim_cap, bool refcounted)
 497{
 498        int rc = NET_RX_SUCCESS;
 499
 500        if (sk_filter_trim_cap(sk, skb, trim_cap))
 501                goto discard_and_relse;
 502
 503        skb->dev = NULL;
 504
 505        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 506                atomic_inc(&sk->sk_drops);
 507                goto discard_and_relse;
 508        }
 509        if (nested)
 510                bh_lock_sock_nested(sk);
 511        else
 512                bh_lock_sock(sk);
 513        if (!sock_owned_by_user(sk)) {
 514                /*
 515                 * trylock + unlock semantics:
 516                 */
 517                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 518
 519                rc = sk_backlog_rcv(sk, skb);
 520
 521                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 522        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 523                bh_unlock_sock(sk);
 524                atomic_inc(&sk->sk_drops);
 525                goto discard_and_relse;
 526        }
 527
 528        bh_unlock_sock(sk);
 529out:
 530        if (refcounted)
 531                sock_put(sk);
 532        return rc;
 533discard_and_relse:
 534        kfree_skb(skb);
 535        goto out;
 536}
 537EXPORT_SYMBOL(__sk_receive_skb);
 538
 539struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 540{
 541        struct dst_entry *dst = __sk_dst_get(sk);
 542
 543        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 544                sk_tx_queue_clear(sk);
 545                sk->sk_dst_pending_confirm = 0;
 546                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 547                dst_release(dst);
 548                return NULL;
 549        }
 550
 551        return dst;
 552}
 553EXPORT_SYMBOL(__sk_dst_check);
 554
 555struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 556{
 557        struct dst_entry *dst = sk_dst_get(sk);
 558
 559        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 560                sk_dst_reset(sk);
 561                dst_release(dst);
 562                return NULL;
 563        }
 564
 565        return dst;
 566}
 567EXPORT_SYMBOL(sk_dst_check);
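/*
 * Illustrative sketch (not part of this file): an output path would
 * revalidate its cached route with sk_dst_check() and fall back to a
 * fresh lookup when the entry has gone stale. The lookup helper is a
 * placeholder.
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst) {
 *		dst = example_route_lookup(sk);
 *		if (!IS_ERR(dst))
 *			sk_dst_set(sk, dst);
 *	}
 */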
 568
 569static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
 570{
 571        int ret = -ENOPROTOOPT;
 572#ifdef CONFIG_NETDEVICES
 573        struct net *net = sock_net(sk);
 574
 575        /* Sorry... */
 576        ret = -EPERM;
 577        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 578                goto out;
 579
 580        ret = -EINVAL;
 581        if (ifindex < 0)
 582                goto out;
 583
 584        sk->sk_bound_dev_if = ifindex;
 585        if (sk->sk_prot->rehash)
 586                sk->sk_prot->rehash(sk);
 587        sk_dst_reset(sk);
 588
 589        ret = 0;
 590
 591out:
 592#endif
 593
 594        return ret;
 595}
 596
 597static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 598                                int optlen)
 599{
 600        int ret = -ENOPROTOOPT;
 601#ifdef CONFIG_NETDEVICES
 602        struct net *net = sock_net(sk);
 603        char devname[IFNAMSIZ];
 604        int index;
 605
 606        ret = -EINVAL;
 607        if (optlen < 0)
 608                goto out;
 609
 610        /* Bind this socket to a particular device like "eth0",
 611         * as specified in the passed interface name. If the
 612         * name is "" or the option length is zero the socket
 613         * is not bound.
 614         */
 615        if (optlen > IFNAMSIZ - 1)
 616                optlen = IFNAMSIZ - 1;
 617        memset(devname, 0, sizeof(devname));
 618
 619        ret = -EFAULT;
 620        if (copy_from_user(devname, optval, optlen))
 621                goto out;
 622
 623        index = 0;
 624        if (devname[0] != '\0') {
 625                struct net_device *dev;
 626
 627                rcu_read_lock();
 628                dev = dev_get_by_name_rcu(net, devname);
 629                if (dev)
 630                        index = dev->ifindex;
 631                rcu_read_unlock();
 632                ret = -ENODEV;
 633                if (!dev)
 634                        goto out;
 635        }
 636
 637        lock_sock(sk);
 638        ret = sock_setbindtodevice_locked(sk, index);
 639        release_sock(sk);
 640
 641out:
 642#endif
 643
 644        return ret;
 645}
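/*
 * Illustrative userspace sketch (not part of this file): binding a
 * socket to a device, which ends up in sock_setbindtodevice() above.
 * Requires CAP_NET_RAW, and assumes an interface named "eth0" exists;
 * an empty name (or zero length) unbinds.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	const char ifname[] = "eth0";
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname));
 */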
 646
 647static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 648                                int __user *optlen, int len)
 649{
 650        int ret = -ENOPROTOOPT;
 651#ifdef CONFIG_NETDEVICES
 652        struct net *net = sock_net(sk);
 653        char devname[IFNAMSIZ];
 654
 655        if (sk->sk_bound_dev_if == 0) {
 656                len = 0;
 657                goto zero;
 658        }
 659
 660        ret = -EINVAL;
 661        if (len < IFNAMSIZ)
 662                goto out;
 663
 664        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 665        if (ret)
 666                goto out;
 667
 668        len = strlen(devname) + 1;
 669
 670        ret = -EFAULT;
 671        if (copy_to_user(optval, devname, len))
 672                goto out;
 673
 674zero:
 675        ret = -EFAULT;
 676        if (put_user(len, optlen))
 677                goto out;
 678
 679        ret = 0;
 680
 681out:
 682#endif
 683
 684        return ret;
 685}
 686
 687static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
 688                                     int valbool)
 689{
 690        if (valbool)
 691                sock_set_flag(sk, bit);
 692        else
 693                sock_reset_flag(sk, bit);
 694}
 695
 696bool sk_mc_loop(struct sock *sk)
 697{
 698        if (dev_recursion_level())
 699                return false;
 700        if (!sk)
 701                return true;
 702        switch (sk->sk_family) {
 703        case AF_INET:
 704                return inet_sk(sk)->mc_loop;
 705#if IS_ENABLED(CONFIG_IPV6)
 706        case AF_INET6:
 707                return inet6_sk(sk)->mc_loop;
 708#endif
 709        }
 710        WARN_ON(1);
 711        return true;
 712}
 713EXPORT_SYMBOL(sk_mc_loop);
 714
 715/*
 716 *      This is meant for all protocols to use and covers goings on
 717 *      at the socket level. Everything here is generic.
 718 */
 719
 720int sock_setsockopt(struct socket *sock, int level, int optname,
 721                    char __user *optval, unsigned int optlen)
 722{
 723        struct sock_txtime sk_txtime;
 724        struct sock *sk = sock->sk;
 725        int val;
 726        int valbool;
 727        struct linger ling;
 728        int ret = 0;
 729
 730        /*
  731         *      Options that do not take a plain int argument are handled first
 732         */
 733
 734        if (optname == SO_BINDTODEVICE)
 735                return sock_setbindtodevice(sk, optval, optlen);
 736
 737        if (optlen < sizeof(int))
 738                return -EINVAL;
 739
 740        if (get_user(val, (int __user *)optval))
 741                return -EFAULT;
 742
 743        valbool = val ? 1 : 0;
 744
 745        lock_sock(sk);
 746
 747        switch (optname) {
 748        case SO_DEBUG:
 749                if (val && !capable(CAP_NET_ADMIN))
 750                        ret = -EACCES;
 751                else
 752                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 753                break;
 754        case SO_REUSEADDR:
 755                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 756                break;
 757        case SO_REUSEPORT:
 758                sk->sk_reuseport = valbool;
 759                break;
 760        case SO_TYPE:
 761        case SO_PROTOCOL:
 762        case SO_DOMAIN:
 763        case SO_ERROR:
 764                ret = -ENOPROTOOPT;
 765                break;
 766        case SO_DONTROUTE:
 767                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 768                sk_dst_reset(sk);
 769                break;
 770        case SO_BROADCAST:
 771                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 772                break;
 773        case SO_SNDBUF:
  774                /* Don't return an error here; BSD doesn't, and if you think
  775                 * about it this is right. Otherwise apps have to
  776                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  777                 * are treated as hints in BSD.
 778                 */
 779                val = min_t(u32, val, sysctl_wmem_max);
 780set_sndbuf:
 781                /* Ensure val * 2 fits into an int, to prevent max_t()
 782                 * from treating it as a negative value.
 783                 */
 784                val = min_t(int, val, INT_MAX / 2);
 785                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 786                WRITE_ONCE(sk->sk_sndbuf,
 787                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
 788                /* Wake up sending tasks if we upped the value. */
 789                sk->sk_write_space(sk);
 790                break;
 791
 792        case SO_SNDBUFFORCE:
 793                if (!capable(CAP_NET_ADMIN)) {
 794                        ret = -EPERM;
 795                        break;
 796                }
 797
 798                /* No negative values (to prevent underflow, as val will be
 799                 * multiplied by 2).
 800                 */
 801                if (val < 0)
 802                        val = 0;
 803                goto set_sndbuf;
 804
 805        case SO_RCVBUF:
  806                /* Don't return an error here; BSD doesn't, and if you think
  807                 * about it this is right. Otherwise apps have to
  808                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  809                 * are treated as hints in BSD.
 810                 */
 811                val = min_t(u32, val, sysctl_rmem_max);
 812set_rcvbuf:
 813                /* Ensure val * 2 fits into an int, to prevent max_t()
 814                 * from treating it as a negative value.
 815                 */
 816                val = min_t(int, val, INT_MAX / 2);
 817                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 818                /*
 819                 * We double it on the way in to account for
 820                 * "struct sk_buff" etc. overhead.   Applications
 821                 * assume that the SO_RCVBUF setting they make will
 822                 * allow that much actual data to be received on that
 823                 * socket.
 824                 *
 825                 * Applications are unaware that "struct sk_buff" and
 826                 * other overheads allocate from the receive buffer
 827                 * during socket buffer allocation.
 828                 *
 829                 * And after considering the possible alternatives,
 830                 * returning the value we actually used in getsockopt
 831                 * is the most desirable behavior.
 832                 */
 833                WRITE_ONCE(sk->sk_rcvbuf,
 834                           max_t(int, val * 2, SOCK_MIN_RCVBUF));
 835                break;
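        /* Illustrative userspace view (not part of this file): the
         * doubling above is visible to applications, e.g.
         *
         *	int val = 65536;
         *	socklen_t len = sizeof(val);
         *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
         *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
         *
         * now reads back 131072, assuming sysctl_rmem_max allowed the
         * requested 65536 in the first place.
         */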
 836
 837        case SO_RCVBUFFORCE:
 838                if (!capable(CAP_NET_ADMIN)) {
 839                        ret = -EPERM;
 840                        break;
 841                }
 842
 843                /* No negative values (to prevent underflow, as val will be
 844                 * multiplied by 2).
 845                 */
 846                if (val < 0)
 847                        val = 0;
 848                goto set_rcvbuf;
 849
 850        case SO_KEEPALIVE:
 851                if (sk->sk_prot->keepalive)
 852                        sk->sk_prot->keepalive(sk, valbool);
 853                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 854                break;
 855
 856        case SO_OOBINLINE:
 857                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 858                break;
 859
 860        case SO_NO_CHECK:
 861                sk->sk_no_check_tx = valbool;
 862                break;
 863
 864        case SO_PRIORITY:
 865                if ((val >= 0 && val <= 6) ||
 866                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 867                        sk->sk_priority = val;
 868                else
 869                        ret = -EPERM;
 870                break;
 871
 872        case SO_LINGER:
 873                if (optlen < sizeof(ling)) {
 874                        ret = -EINVAL;  /* 1003.1g */
 875                        break;
 876                }
 877                if (copy_from_user(&ling, optval, sizeof(ling))) {
 878                        ret = -EFAULT;
 879                        break;
 880                }
 881                if (!ling.l_onoff)
 882                        sock_reset_flag(sk, SOCK_LINGER);
 883                else {
 884#if (BITS_PER_LONG == 32)
 885                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 886                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 887                        else
 888#endif
 889                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 890                        sock_set_flag(sk, SOCK_LINGER);
 891                }
 892                break;
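        /* Illustrative userspace view (not part of this file):
         *
         *	struct linger lin = { .l_onoff = 1, .l_linger = 10 };
         *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
         *
         * makes close() linger for up to 10 seconds while unsent data
         * drains; the seconds value is converted to jiffies above.
         */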
 893
 894        case SO_BSDCOMPAT:
 895                sock_warn_obsolete_bsdism("setsockopt");
 896                break;
 897
 898        case SO_PASSCRED:
 899                if (valbool)
 900                        set_bit(SOCK_PASSCRED, &sock->flags);
 901                else
 902                        clear_bit(SOCK_PASSCRED, &sock->flags);
 903                break;
 904
 905        case SO_TIMESTAMP_OLD:
 906        case SO_TIMESTAMP_NEW:
 907        case SO_TIMESTAMPNS_OLD:
 908        case SO_TIMESTAMPNS_NEW:
 909                if (valbool)  {
 910                        if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
 911                                sock_set_flag(sk, SOCK_TSTAMP_NEW);
 912                        else
 913                                sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 914
 915                        if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
 916                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 917                        else
 918                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 919                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 920                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 921                } else {
 922                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 923                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 924                        sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 925                }
 926                break;
 927
 928        case SO_TIMESTAMPING_NEW:
 929                sock_set_flag(sk, SOCK_TSTAMP_NEW);
 930                /* fall through */
 931        case SO_TIMESTAMPING_OLD:
 932                if (val & ~SOF_TIMESTAMPING_MASK) {
 933                        ret = -EINVAL;
 934                        break;
 935                }
 936
 937                if (val & SOF_TIMESTAMPING_OPT_ID &&
 938                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 939                        if (sk->sk_protocol == IPPROTO_TCP &&
 940                            sk->sk_type == SOCK_STREAM) {
 941                                if ((1 << sk->sk_state) &
 942                                    (TCPF_CLOSE | TCPF_LISTEN)) {
 943                                        ret = -EINVAL;
 944                                        break;
 945                                }
 946                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 947                        } else {
 948                                sk->sk_tskey = 0;
 949                        }
 950                }
 951
 952                if (val & SOF_TIMESTAMPING_OPT_STATS &&
 953                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 954                        ret = -EINVAL;
 955                        break;
 956                }
 957
 958                sk->sk_tsflags = val;
 959                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 960                        sock_enable_timestamp(sk,
 961                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 962                else {
 963                        if (optname == SO_TIMESTAMPING_NEW)
 964                                sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 965
 966                        sock_disable_timestamp(sk,
 967                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 968                }
 969                break;
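        /* Illustrative userspace view (not part of this file): a typical
         * software transmit-timestamping setup handled by the case above:
         *
         *	unsigned int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
         *			     SOF_TIMESTAMPING_SOFTWARE |
         *			     SOF_TIMESTAMPING_OPT_ID;
         *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
         *		   &flags, sizeof(flags));
         *
         * Completions are then read from the error queue with
         * recvmsg(fd, &msg, MSG_ERRQUEUE).
         */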
 970
 971        case SO_RCVLOWAT:
 972                if (val < 0)
 973                        val = INT_MAX;
 974                if (sock->ops->set_rcvlowat)
 975                        ret = sock->ops->set_rcvlowat(sk, val);
 976                else
 977                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 978                break;
 979
 980        case SO_RCVTIMEO_OLD:
 981        case SO_RCVTIMEO_NEW:
 982                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
 983                break;
 984
 985        case SO_SNDTIMEO_OLD:
 986        case SO_SNDTIMEO_NEW:
 987                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
 988                break;
 989
 990        case SO_ATTACH_FILTER:
 991                ret = -EINVAL;
 992                if (optlen == sizeof(struct sock_fprog)) {
 993                        struct sock_fprog fprog;
 994
 995                        ret = -EFAULT;
 996                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 997                                break;
 998
 999                        ret = sk_attach_filter(&fprog, sk);
1000                }
1001                break;
1002
1003        case SO_ATTACH_BPF:
1004                ret = -EINVAL;
1005                if (optlen == sizeof(u32)) {
1006                        u32 ufd;
1007
1008                        ret = -EFAULT;
1009                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
1010                                break;
1011
1012                        ret = sk_attach_bpf(ufd, sk);
1013                }
1014                break;
1015
1016        case SO_ATTACH_REUSEPORT_CBPF:
1017                ret = -EINVAL;
1018                if (optlen == sizeof(struct sock_fprog)) {
1019                        struct sock_fprog fprog;
1020
1021                        ret = -EFAULT;
1022                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
1023                                break;
1024
1025                        ret = sk_reuseport_attach_filter(&fprog, sk);
1026                }
1027                break;
1028
1029        case SO_ATTACH_REUSEPORT_EBPF:
1030                ret = -EINVAL;
1031                if (optlen == sizeof(u32)) {
1032                        u32 ufd;
1033
1034                        ret = -EFAULT;
1035                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
1036                                break;
1037
1038                        ret = sk_reuseport_attach_bpf(ufd, sk);
1039                }
1040                break;
1041
1042        case SO_DETACH_REUSEPORT_BPF:
1043                ret = reuseport_detach_prog(sk);
1044                break;
1045
1046        case SO_DETACH_FILTER:
1047                ret = sk_detach_filter(sk);
1048                break;
1049
1050        case SO_LOCK_FILTER:
1051                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1052                        ret = -EPERM;
1053                else
1054                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1055                break;
1056
1057        case SO_PASSSEC:
1058                if (valbool)
1059                        set_bit(SOCK_PASSSEC, &sock->flags);
1060                else
1061                        clear_bit(SOCK_PASSSEC, &sock->flags);
1062                break;
1063        case SO_MARK:
1064                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1065                        ret = -EPERM;
1066                } else if (val != sk->sk_mark) {
1067                        sk->sk_mark = val;
1068                        sk_dst_reset(sk);
1069                }
1070                break;
1071
1072        case SO_RXQ_OVFL:
1073                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1074                break;
1075
1076        case SO_WIFI_STATUS:
1077                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1078                break;
1079
1080        case SO_PEEK_OFF:
1081                if (sock->ops->set_peek_off)
1082                        ret = sock->ops->set_peek_off(sk, val);
1083                else
1084                        ret = -EOPNOTSUPP;
1085                break;
1086
1087        case SO_NOFCS:
1088                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1089                break;
1090
1091        case SO_SELECT_ERR_QUEUE:
1092                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1093                break;
1094
1095#ifdef CONFIG_NET_RX_BUSY_POLL
1096        case SO_BUSY_POLL:
1097                /* allow unprivileged users to decrease the value */
1098                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1099                        ret = -EPERM;
1100                else {
1101                        if (val < 0)
1102                                ret = -EINVAL;
1103                        else
1104                                sk->sk_ll_usec = val;
1105                }
1106                break;
1107#endif
1108
1109        case SO_MAX_PACING_RATE:
1110                {
1111                unsigned long ulval = (val == ~0U) ? ~0UL : val;
1112
1113                if (sizeof(ulval) != sizeof(val) &&
1114                    optlen >= sizeof(ulval) &&
1115                    get_user(ulval, (unsigned long __user *)optval)) {
1116                        ret = -EFAULT;
1117                        break;
1118                }
1119                if (ulval != ~0UL)
1120                        cmpxchg(&sk->sk_pacing_status,
1121                                SK_PACING_NONE,
1122                                SK_PACING_NEEDED);
1123                sk->sk_max_pacing_rate = ulval;
1124                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1125                break;
1126                }
1127        case SO_INCOMING_CPU:
1128                WRITE_ONCE(sk->sk_incoming_cpu, val);
1129                break;
1130
1131        case SO_CNX_ADVICE:
1132                if (val == 1)
1133                        dst_negative_advice(sk);
1134                break;
1135
1136        case SO_ZEROCOPY:
1137                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1138                        if (!((sk->sk_type == SOCK_STREAM &&
1139                               sk->sk_protocol == IPPROTO_TCP) ||
1140                              (sk->sk_type == SOCK_DGRAM &&
1141                               sk->sk_protocol == IPPROTO_UDP)))
1142                                ret = -ENOTSUPP;
1143                } else if (sk->sk_family != PF_RDS) {
1144                        ret = -ENOTSUPP;
1145                }
1146                if (!ret) {
1147                        if (val < 0 || val > 1)
1148                                ret = -EINVAL;
1149                        else
1150                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1151                }
1152                break;
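        /* Illustrative userspace view (not part of this file): opting in
         * to zerocopy, which the checks above restrict to TCP, UDP and
         * RDS sockets, and then sending with MSG_ZEROCOPY:
         *
         *	int one = 1;
         *	setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
         *	send(fd, buf, buflen, MSG_ZEROCOPY);
         *
         * Completion notifications arrive on the socket's error queue.
         */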
1153
1154        case SO_TXTIME:
1155                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1156                        ret = -EPERM;
1157                } else if (optlen != sizeof(struct sock_txtime)) {
1158                        ret = -EINVAL;
1159                } else if (copy_from_user(&sk_txtime, optval,
1160                           sizeof(struct sock_txtime))) {
1161                        ret = -EFAULT;
1162                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1163                        ret = -EINVAL;
1164                } else {
1165                        sock_valbool_flag(sk, SOCK_TXTIME, true);
1166                        sk->sk_clockid = sk_txtime.clockid;
1167                        sk->sk_txtime_deadline_mode =
1168                                !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1169                        sk->sk_txtime_report_errors =
1170                                !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1171                }
1172                break;
1173
1174        case SO_BINDTOIFINDEX:
1175                ret = sock_setbindtodevice_locked(sk, val);
1176                break;
1177
1178        default:
1179                ret = -ENOPROTOOPT;
1180                break;
1181        }
1182        release_sock(sk);
1183        return ret;
1184}
1185EXPORT_SYMBOL(sock_setsockopt);
1186
1187
1188static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1189                          struct ucred *ucred)
1190{
1191        ucred->pid = pid_vnr(pid);
1192        ucred->uid = ucred->gid = -1;
1193        if (cred) {
1194                struct user_namespace *current_ns = current_user_ns();
1195
1196                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1197                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1198        }
1199}
1200
1201static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1202{
1203        struct user_namespace *user_ns = current_user_ns();
1204        int i;
1205
1206        for (i = 0; i < src->ngroups; i++)
1207                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1208                        return -EFAULT;
1209
1210        return 0;
1211}
1212
1213int sock_getsockopt(struct socket *sock, int level, int optname,
1214                    char __user *optval, int __user *optlen)
1215{
1216        struct sock *sk = sock->sk;
1217
1218        union {
1219                int val;
1220                u64 val64;
1221                unsigned long ulval;
1222                struct linger ling;
1223                struct old_timeval32 tm32;
1224                struct __kernel_old_timeval tm;
1225                struct  __kernel_sock_timeval stm;
1226                struct sock_txtime txtime;
1227        } v;
1228
1229        int lv = sizeof(int);
1230        int len;
1231
1232        if (get_user(len, optlen))
1233                return -EFAULT;
1234        if (len < 0)
1235                return -EINVAL;
1236
1237        memset(&v, 0, sizeof(v));
1238
1239        switch (optname) {
1240        case SO_DEBUG:
1241                v.val = sock_flag(sk, SOCK_DBG);
1242                break;
1243
1244        case SO_DONTROUTE:
1245                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1246                break;
1247
1248        case SO_BROADCAST:
1249                v.val = sock_flag(sk, SOCK_BROADCAST);
1250                break;
1251
1252        case SO_SNDBUF:
1253                v.val = sk->sk_sndbuf;
1254                break;
1255
1256        case SO_RCVBUF:
1257                v.val = sk->sk_rcvbuf;
1258                break;
1259
1260        case SO_REUSEADDR:
1261                v.val = sk->sk_reuse;
1262                break;
1263
1264        case SO_REUSEPORT:
1265                v.val = sk->sk_reuseport;
1266                break;
1267
1268        case SO_KEEPALIVE:
1269                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1270                break;
1271
1272        case SO_TYPE:
1273                v.val = sk->sk_type;
1274                break;
1275
1276        case SO_PROTOCOL:
1277                v.val = sk->sk_protocol;
1278                break;
1279
1280        case SO_DOMAIN:
1281                v.val = sk->sk_family;
1282                break;
1283
1284        case SO_ERROR:
1285                v.val = -sock_error(sk);
1286                if (v.val == 0)
1287                        v.val = xchg(&sk->sk_err_soft, 0);
1288                break;
1289
1290        case SO_OOBINLINE:
1291                v.val = sock_flag(sk, SOCK_URGINLINE);
1292                break;
1293
1294        case SO_NO_CHECK:
1295                v.val = sk->sk_no_check_tx;
1296                break;
1297
1298        case SO_PRIORITY:
1299                v.val = sk->sk_priority;
1300                break;
1301
1302        case SO_LINGER:
1303                lv              = sizeof(v.ling);
1304                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1305                v.ling.l_linger = sk->sk_lingertime / HZ;
1306                break;
1307
1308        case SO_BSDCOMPAT:
1309                sock_warn_obsolete_bsdism("getsockopt");
1310                break;
1311
1312        case SO_TIMESTAMP_OLD:
1313                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1314                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1315                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1316                break;
1317
1318        case SO_TIMESTAMPNS_OLD:
1319                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1320                break;
1321
1322        case SO_TIMESTAMP_NEW:
1323                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1324                break;
1325
1326        case SO_TIMESTAMPNS_NEW:
1327                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1328                break;
1329
1330        case SO_TIMESTAMPING_OLD:
1331                v.val = sk->sk_tsflags;
1332                break;
1333
1334        case SO_RCVTIMEO_OLD:
1335        case SO_RCVTIMEO_NEW:
1336                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1337                break;
1338
1339        case SO_SNDTIMEO_OLD:
1340        case SO_SNDTIMEO_NEW:
1341                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1342                break;
1343
1344        case SO_RCVLOWAT:
1345                v.val = sk->sk_rcvlowat;
1346                break;
1347
1348        case SO_SNDLOWAT:
1349                v.val = 1;
1350                break;
1351
1352        case SO_PASSCRED:
1353                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1354                break;
1355
1356        case SO_PEERCRED:
1357        {
1358                struct ucred peercred;
1359                if (len > sizeof(peercred))
1360                        len = sizeof(peercred);
1361                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1362                if (copy_to_user(optval, &peercred, len))
1363                        return -EFAULT;
1364                goto lenout;
1365        }
1366
1367        case SO_PEERGROUPS:
1368        {
1369                int ret, n;
1370
1371                if (!sk->sk_peer_cred)
1372                        return -ENODATA;
1373
1374                n = sk->sk_peer_cred->group_info->ngroups;
1375                if (len < n * sizeof(gid_t)) {
1376                        len = n * sizeof(gid_t);
1377                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1378                }
1379                len = n * sizeof(gid_t);
1380
1381                ret = groups_to_user((gid_t __user *)optval,
1382                                     sk->sk_peer_cred->group_info);
1383                if (ret)
1384                        return ret;
1385                goto lenout;
1386        }
1387
1388        case SO_PEERNAME:
1389        {
1390                char address[128];
1391
1392                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1393                if (lv < 0)
1394                        return -ENOTCONN;
1395                if (lv < len)
1396                        return -EINVAL;
1397                if (copy_to_user(optval, address, len))
1398                        return -EFAULT;
1399                goto lenout;
1400        }
1401
1402        /* Dubious BSD thing... Probably nobody even uses it, but
1403         * the UNIX standard wants it for whatever reason... -DaveM
1404         */
1405        case SO_ACCEPTCONN:
1406                v.val = sk->sk_state == TCP_LISTEN;
1407                break;
1408
1409        case SO_PASSSEC:
1410                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1411                break;
1412
1413        case SO_PEERSEC:
1414                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1415
1416        case SO_MARK:
1417                v.val = sk->sk_mark;
1418                break;
1419
1420        case SO_RXQ_OVFL:
1421                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1422                break;
1423
1424        case SO_WIFI_STATUS:
1425                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1426                break;
1427
1428        case SO_PEEK_OFF:
1429                if (!sock->ops->set_peek_off)
1430                        return -EOPNOTSUPP;
1431
1432                v.val = sk->sk_peek_off;
1433                break;
1434        case SO_NOFCS:
1435                v.val = sock_flag(sk, SOCK_NOFCS);
1436                break;
1437
1438        case SO_BINDTODEVICE:
1439                return sock_getbindtodevice(sk, optval, optlen, len);
1440
1441        case SO_GET_FILTER:
1442                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1443                if (len < 0)
1444                        return len;
1445
1446                goto lenout;
1447
1448        case SO_LOCK_FILTER:
1449                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1450                break;
1451
1452        case SO_BPF_EXTENSIONS:
1453                v.val = bpf_tell_extensions();
1454                break;
1455
1456        case SO_SELECT_ERR_QUEUE:
1457                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1458                break;
1459
1460#ifdef CONFIG_NET_RX_BUSY_POLL
1461        case SO_BUSY_POLL:
1462                v.val = sk->sk_ll_usec;
1463                break;
1464#endif
1465
1466        case SO_MAX_PACING_RATE:
1467                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1468                        lv = sizeof(v.ulval);
1469                        v.ulval = sk->sk_max_pacing_rate;
1470                } else {
1471                        /* 32bit version */
1472                        v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1473                }
1474                break;
1475
1476        case SO_INCOMING_CPU:
1477                v.val = READ_ONCE(sk->sk_incoming_cpu);
1478                break;
1479
1480        case SO_MEMINFO:
1481        {
1482                u32 meminfo[SK_MEMINFO_VARS];
1483
1484                sk_get_meminfo(sk, meminfo);
1485
1486                len = min_t(unsigned int, len, sizeof(meminfo));
1487                if (copy_to_user(optval, &meminfo, len))
1488                        return -EFAULT;
1489
1490                goto lenout;
1491        }
1492
1493#ifdef CONFIG_NET_RX_BUSY_POLL
1494        case SO_INCOMING_NAPI_ID:
1495                v.val = READ_ONCE(sk->sk_napi_id);
1496
1497                /* aggregate non-NAPI IDs down to 0 */
1498                if (v.val < MIN_NAPI_ID)
1499                        v.val = 0;
1500
1501                break;
1502#endif
1503
1504        case SO_COOKIE:
1505                lv = sizeof(u64);
1506                if (len < lv)
1507                        return -EINVAL;
1508                v.val64 = sock_gen_cookie(sk);
1509                break;
1510
1511        case SO_ZEROCOPY:
1512                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1513                break;
1514
1515        case SO_TXTIME:
1516                lv = sizeof(v.txtime);
1517                v.txtime.clockid = sk->sk_clockid;
1518                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1519                                  SOF_TXTIME_DEADLINE_MODE : 0;
1520                v.txtime.flags |= sk->sk_txtime_report_errors ?
1521                                  SOF_TXTIME_REPORT_ERRORS : 0;
1522                break;
1523
1524        case SO_BINDTOIFINDEX:
1525                v.val = sk->sk_bound_dev_if;
1526                break;
1527
1528        default:
1529                /* We implement the SO_SNDLOWAT etc to not be settable
1530                 * (1003.1g 7).
1531                 */
1532                return -ENOPROTOOPT;
1533        }
1534
1535        if (len > lv)
1536                len = lv;
1537        if (copy_to_user(optval, &v, len))
1538                return -EFAULT;
1539lenout:
1540        if (put_user(len, optlen))
1541                return -EFAULT;
1542        return 0;
1543}
1544
1545/*
1546 * Initialize an sk_lock.
1547 *
1548 * (We also register the sk_lock with the lock validator.)
1549 */
1550static inline void sock_lock_init(struct sock *sk)
1551{
1552        if (sk->sk_kern_sock)
1553                sock_lock_init_class_and_name(
1554                        sk,
1555                        af_family_kern_slock_key_strings[sk->sk_family],
1556                        af_family_kern_slock_keys + sk->sk_family,
1557                        af_family_kern_key_strings[sk->sk_family],
1558                        af_family_kern_keys + sk->sk_family);
1559        else
1560                sock_lock_init_class_and_name(
1561                        sk,
1562                        af_family_slock_key_strings[sk->sk_family],
1563                        af_family_slock_keys + sk->sk_family,
1564                        af_family_key_strings[sk->sk_family],
1565                        af_family_keys + sk->sk_family);
1566}
1567
1568/*
1569 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1570 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1571 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1572 */
1573static void sock_copy(struct sock *nsk, const struct sock *osk)
1574{
1575#ifdef CONFIG_SECURITY_NETWORK
1576        void *sptr = nsk->sk_security;
1577#endif
1578        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1579
1580        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1581               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1582
1583#ifdef CONFIG_SECURITY_NETWORK
1584        nsk->sk_security = sptr;
1585        security_sk_clone(osk, nsk);
1586#endif
1587}
1588
1589static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1590                int family)
1591{
1592        struct sock *sk;
1593        struct kmem_cache *slab;
1594
1595        slab = prot->slab;
1596        if (slab != NULL) {
1597                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1598                if (!sk)
1599                        return sk;
1600                if (want_init_on_alloc(priority))
1601                        sk_prot_clear_nulls(sk, prot->obj_size);
1602        } else
1603                sk = kmalloc(prot->obj_size, priority);
1604
1605        if (sk != NULL) {
1606                if (security_sk_alloc(sk, family, priority))
1607                        goto out_free;
1608
1609                if (!try_module_get(prot->owner))
1610                        goto out_free_sec;
1611                sk_tx_queue_clear(sk);
1612        }
1613
1614        return sk;
1615
1616out_free_sec:
1617        security_sk_free(sk);
1618out_free:
1619        if (slab != NULL)
1620                kmem_cache_free(slab, sk);
1621        else
1622                kfree(sk);
1623        return NULL;
1624}
1625
1626static void sk_prot_free(struct proto *prot, struct sock *sk)
1627{
1628        struct kmem_cache *slab;
1629        struct module *owner;
1630
1631        owner = prot->owner;
1632        slab = prot->slab;
1633
1634        cgroup_sk_free(&sk->sk_cgrp_data);
1635        mem_cgroup_sk_free(sk);
1636        security_sk_free(sk);
1637        if (slab != NULL)
1638                kmem_cache_free(slab, sk);
1639        else
1640                kfree(sk);
1641        module_put(owner);
1642}
1643
1644/**
1645 *      sk_alloc - All socket objects are allocated here
1646 *      @net: the applicable net namespace
1647 *      @family: protocol family
1648 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1649 *      @prot: struct proto associated with this new sock instance
1650 *      @kern: is this to be a kernel socket?
1651 */
1652struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1653                      struct proto *prot, int kern)
1654{
1655        struct sock *sk;
1656
1657        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1658        if (sk) {
1659                sk->sk_family = family;
1660                /*
1661                 * See comment in struct sock definition to understand
1662                 * why we need sk_prot_creator -acme
1663                 */
1664                sk->sk_prot = sk->sk_prot_creator = prot;
1665                sk->sk_kern_sock = kern;
1666                sock_lock_init(sk);
1667                sk->sk_net_refcnt = kern ? 0 : 1;
1668                if (likely(sk->sk_net_refcnt)) {
1669                        get_net(net);
1670                        sock_inuse_add(net, 1);
1671                }
1672
1673                sock_net_set(sk, net);
1674                refcount_set(&sk->sk_wmem_alloc, 1);
1675
1676                mem_cgroup_sk_alloc(sk);
1677                cgroup_sk_alloc(&sk->sk_cgrp_data);
1678                sock_update_classid(&sk->sk_cgrp_data);
1679                sock_update_netprioidx(&sk->sk_cgrp_data);
1680        }
1681
1682        return sk;
1683}
1684EXPORT_SYMBOL(sk_alloc);
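
/* Illustrative caller (hedged sketch, not code from this file): a
 * protocol's create() handler typically does something like
 *
 *        sk = sk_alloc(net, PF_INET, GFP_KERNEL, prot, kern);
 *        if (!sk)
 *                return -ENOBUFS;
 *        sock_init_data(sock, sk);
 *
 * and drops the reference again with sk_free()/sock_put() on its
 * error paths.
 */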
1685
1686/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1687 * grace period. This is the case for UDP sockets and TCP listeners.
1688 */
1689static void __sk_destruct(struct rcu_head *head)
1690{
1691        struct sock *sk = container_of(head, struct sock, sk_rcu);
1692        struct sk_filter *filter;
1693
1694        if (sk->sk_destruct)
1695                sk->sk_destruct(sk);
1696
1697        filter = rcu_dereference_check(sk->sk_filter,
1698                                       refcount_read(&sk->sk_wmem_alloc) == 0);
1699        if (filter) {
1700                sk_filter_uncharge(sk, filter);
1701                RCU_INIT_POINTER(sk->sk_filter, NULL);
1702        }
1703
1704        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1705
1706#ifdef CONFIG_BPF_SYSCALL
1707        bpf_sk_storage_free(sk);
1708#endif
1709
1710        if (atomic_read(&sk->sk_omem_alloc))
1711                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1712                         __func__, atomic_read(&sk->sk_omem_alloc));
1713
1714        if (sk->sk_frag.page) {
1715                put_page(sk->sk_frag.page);
1716                sk->sk_frag.page = NULL;
1717        }
1718
1719        if (sk->sk_peer_cred)
1720                put_cred(sk->sk_peer_cred);
1721        put_pid(sk->sk_peer_pid);
1722        if (likely(sk->sk_net_refcnt))
1723                put_net(sock_net(sk));
1724        sk_prot_free(sk->sk_prot_creator, sk);
1725}
1726
1727void sk_destruct(struct sock *sk)
1728{
1729        bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1730
1731        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1732                reuseport_detach_sock(sk);
1733                use_call_rcu = true;
1734        }
1735
1736        if (use_call_rcu)
1737                call_rcu(&sk->sk_rcu, __sk_destruct);
1738        else
1739                __sk_destruct(&sk->sk_rcu);
1740}
1741
1742static void __sk_free(struct sock *sk)
1743{
1744        if (likely(sk->sk_net_refcnt))
1745                sock_inuse_add(sock_net(sk), -1);
1746
1747        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1748                sock_diag_broadcast_destroy(sk);
1749        else
1750                sk_destruct(sk);
1751}
1752
1753void sk_free(struct sock *sk)
1754{
1755        /*
1756         * We subtract one from sk_wmem_alloc and can tell whether
1757         * some packets are still in some tx queue.
1758         * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1759         */
1760        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1761                __sk_free(sk);
1762}
1763EXPORT_SYMBOL(sk_free);
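
/* Lifecycle note (summary of the code above): sk_alloc() starts
 * sk_wmem_alloc at 1, skb_set_owner_w() adds each queued tx skb's
 * truesize, and sock_wfree() subtracts it again on completion.
 * sk_free() only drops that initial "+1", so the struct sock is not
 * actually freed until the last in-flight packet has been destructed.
 */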
1764
1765static void sk_init_common(struct sock *sk)
1766{
1767        skb_queue_head_init(&sk->sk_receive_queue);
1768        skb_queue_head_init(&sk->sk_write_queue);
1769        skb_queue_head_init(&sk->sk_error_queue);
1770
1771        rwlock_init(&sk->sk_callback_lock);
1772        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1773                        af_rlock_keys + sk->sk_family,
1774                        af_family_rlock_key_strings[sk->sk_family]);
1775        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1776                        af_wlock_keys + sk->sk_family,
1777                        af_family_wlock_key_strings[sk->sk_family]);
1778        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1779                        af_elock_keys + sk->sk_family,
1780                        af_family_elock_key_strings[sk->sk_family]);
1781        lockdep_set_class_and_name(&sk->sk_callback_lock,
1782                        af_callback_keys + sk->sk_family,
1783                        af_family_clock_key_strings[sk->sk_family]);
1784}
1785
1786/**
1787 *      sk_clone_lock - clone a socket, and lock its clone
1788 *      @sk: the socket to clone
1789 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1790 *
1791 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1792 */
1793struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1794{
1795        struct sock *newsk;
1796        bool is_charged = true;
1797
1798        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1799        if (newsk != NULL) {
1800                struct sk_filter *filter;
1801
1802                sock_copy(newsk, sk);
1803
1804                newsk->sk_prot_creator = sk->sk_prot;
1805
1806                /* SANITY */
1807                if (likely(newsk->sk_net_refcnt))
1808                        get_net(sock_net(newsk));
1809                sk_node_init(&newsk->sk_node);
1810                sock_lock_init(newsk);
1811                bh_lock_sock(newsk);
1812                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1813                newsk->sk_backlog.len = 0;
1814
1815                atomic_set(&newsk->sk_rmem_alloc, 0);
1816                /*
1817                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1818                 */
1819                refcount_set(&newsk->sk_wmem_alloc, 1);
1820                atomic_set(&newsk->sk_omem_alloc, 0);
1821                sk_init_common(newsk);
1822
1823                newsk->sk_dst_cache     = NULL;
1824                newsk->sk_dst_pending_confirm = 0;
1825                newsk->sk_wmem_queued   = 0;
1826                newsk->sk_forward_alloc = 0;
1827                atomic_set(&newsk->sk_drops, 0);
1828                newsk->sk_send_head     = NULL;
1829                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1830                atomic_set(&newsk->sk_zckey, 0);
1831
1832                sock_reset_flag(newsk, SOCK_DONE);
1833                mem_cgroup_sk_alloc(newsk);
1834                cgroup_sk_alloc(&newsk->sk_cgrp_data);
1835
1836                rcu_read_lock();
1837                filter = rcu_dereference(sk->sk_filter);
1838                if (filter != NULL)
1839                        /* though it's an empty new sock, the charging may fail
1840                         * if sysctl_optmem_max was changed between creation of
1841                         * original socket and cloning
1842                         */
1843                        is_charged = sk_filter_charge(newsk, filter);
1844                RCU_INIT_POINTER(newsk->sk_filter, filter);
1845                rcu_read_unlock();
1846
1847                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1848                        /* We need to make sure that we don't uncharge the new
1849                         * socket if we couldn't charge it in the first place
1850                         * as otherwise we uncharge the parent's filter.
1851                         */
1852                        if (!is_charged)
1853                                RCU_INIT_POINTER(newsk->sk_filter, NULL);
1854                        sk_free_unlock_clone(newsk);
1855                        newsk = NULL;
1856                        goto out;
1857                }
1858                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1859
1860                if (bpf_sk_storage_clone(sk, newsk)) {
1861                        sk_free_unlock_clone(newsk);
1862                        newsk = NULL;
1863                        goto out;
1864                }
1865
1866                newsk->sk_err      = 0;
1867                newsk->sk_err_soft = 0;
1868                newsk->sk_priority = 0;
1869                newsk->sk_incoming_cpu = raw_smp_processor_id();
1870                if (likely(newsk->sk_net_refcnt))
1871                        sock_inuse_add(sock_net(newsk), 1);
1872
1873                /*
1874                 * Before updating sk_refcnt, we must commit prior changes to memory
1875                 * (Documentation/RCU/rculist_nulls.txt for details)
1876                 */
1877                smp_wmb();
1878                refcount_set(&newsk->sk_refcnt, 2);
1879
1880                /*
1881                 * Increment the counter in the same struct proto as the master
1882                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1883                 * is the same as sk->sk_prot->socks, as this field was copied
1884                 * with memcpy).
1885                 *
1886                 * This _changes_ the previous behaviour, where
1887                 * tcp_create_openreq_child always was incrementing the
1888                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1889                 * to be taken into account in all callers. -acme
1890                 */
1891                sk_refcnt_debug_inc(newsk);
1892                sk_set_socket(newsk, NULL);
1893                RCU_INIT_POINTER(newsk->sk_wq, NULL);
1894
1895                if (newsk->sk_prot->sockets_allocated)
1896                        sk_sockets_allocated_inc(newsk);
1897
1898                if (sock_needs_netstamp(sk) &&
1899                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1900                        net_enable_timestamp();
1901        }
1902out:
1903        return newsk;
1904}
1905EXPORT_SYMBOL_GPL(sk_clone_lock);
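
/* Illustrative caller pattern (hedged sketch): the clone is returned
 * locked and with two references, and the caller must unlock it even
 * when it decides to discard it, e.g.
 *
 *        newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *        if (newsk) {
 *                ... protocol specific initialisation ...
 *                bh_unlock_sock(newsk);
 *        }
 *
 * Error paths that want to drop the clone can use sk_free_unlock_clone()
 * below, which unlocks before freeing.
 */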
1906
1907void sk_free_unlock_clone(struct sock *sk)
1908{
1909        /* It is still a raw copy of the parent, so invalidate
1910         * the destructor and do a plain sk_free(). */
1911        sk->sk_destruct = NULL;
1912        bh_unlock_sock(sk);
1913        sk_free(sk);
1914}
1915EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1916
1917void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1918{
1919        u32 max_segs = 1;
1920
1921        sk_dst_set(sk, dst);
1922        sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1923        if (sk->sk_route_caps & NETIF_F_GSO)
1924                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1925        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1926        if (sk_can_gso(sk)) {
1927                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1928                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1929                } else {
1930                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1931                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1932                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1933                }
1934        }
1935        sk->sk_gso_max_segs = max_segs;
1936}
1937EXPORT_SYMBOL_GPL(sk_setup_caps);
1938
1939/*
1940 *      Simple resource managers for sockets.
1941 */
1942
1943
1944/*
1945 * Write buffer destructor automatically called from kfree_skb.
1946 */
1947void sock_wfree(struct sk_buff *skb)
1948{
1949        struct sock *sk = skb->sk;
1950        unsigned int len = skb->truesize;
1951
1952        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1953                /*
1954                 * Keep a reference on sk_wmem_alloc, this will be released
1955                 * after sk_write_space() call
1956                 */
1957                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1958                sk->sk_write_space(sk);
1959                len = 1;
1960        }
1961        /*
1962         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1963         * could not do because of in-flight packets
1964         */
1965        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1966                __sk_free(sk);
1967}
1968EXPORT_SYMBOL(sock_wfree);
1969
1970/* This variant of sock_wfree() is used by TCP,
1971 * since it sets SOCK_USE_WRITE_QUEUE.
1972 */
1973void __sock_wfree(struct sk_buff *skb)
1974{
1975        struct sock *sk = skb->sk;
1976
1977        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1978                __sk_free(sk);
1979}
1980
1981void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1982{
1983        skb_orphan(skb);
1984        skb->sk = sk;
1985#ifdef CONFIG_INET
1986        if (unlikely(!sk_fullsock(sk))) {
1987                skb->destructor = sock_edemux;
1988                sock_hold(sk);
1989                return;
1990        }
1991#endif
1992        skb->destructor = sock_wfree;
1993        skb_set_hash_from_sk(skb, sk);
1994        /*
1995         * We used to take a refcount on sk, but the following operation
1996         * is enough to guarantee sk_free() won't free this sock until
1997         * all in-flight packets are completed.
1998         */
1999        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2000}
2001EXPORT_SYMBOL(skb_set_owner_w);
2002
2003static bool can_skb_orphan_partial(const struct sk_buff *skb)
2004{
2005#ifdef CONFIG_TLS_DEVICE
2006        /* Drivers depend on in-order delivery for crypto offload;
2007         * partial orphan breaks out-of-order-OK logic.
2008         */
2009        if (skb->decrypted)
2010                return false;
2011#endif
2012        return (skb->destructor == sock_wfree ||
2013                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2014}
2015
2016/* This helper is used by netem, as it can hold packets in its
2017 * delay queue. We want to allow the owner socket to send more
2018 * packets, as if they were already TX completed by a typical driver.
2019 * But we also want to keep skb->sk set because some packet schedulers
2020 * rely on it (sch_fq for example).
2021 */
2022void skb_orphan_partial(struct sk_buff *skb)
2023{
2024        if (skb_is_tcp_pure_ack(skb))
2025                return;
2026
2027        if (can_skb_orphan_partial(skb)) {
2028                struct sock *sk = skb->sk;
2029
2030                if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2031                        WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2032                        skb->destructor = sock_efree;
2033                }
2034        } else {
2035                skb_orphan(skb);
2036        }
2037}
2038EXPORT_SYMBOL(skb_orphan_partial);
2039
2040/*
2041 * Read buffer destructor automatically called from kfree_skb.
2042 */
2043void sock_rfree(struct sk_buff *skb)
2044{
2045        struct sock *sk = skb->sk;
2046        unsigned int len = skb->truesize;
2047
2048        atomic_sub(len, &sk->sk_rmem_alloc);
2049        sk_mem_uncharge(sk, len);
2050}
2051EXPORT_SYMBOL(sock_rfree);
2052
2053/*
2054 * Buffer destructor for skbs that are not used directly in read or write
2055 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2056 */
2057void sock_efree(struct sk_buff *skb)
2058{
2059        sock_put(skb->sk);
2060}
2061EXPORT_SYMBOL(sock_efree);
2062
2063kuid_t sock_i_uid(struct sock *sk)
2064{
2065        kuid_t uid;
2066
2067        read_lock_bh(&sk->sk_callback_lock);
2068        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2069        read_unlock_bh(&sk->sk_callback_lock);
2070        return uid;
2071}
2072EXPORT_SYMBOL(sock_i_uid);
2073
2074unsigned long sock_i_ino(struct sock *sk)
2075{
2076        unsigned long ino;
2077
2078        read_lock_bh(&sk->sk_callback_lock);
2079        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2080        read_unlock_bh(&sk->sk_callback_lock);
2081        return ino;
2082}
2083EXPORT_SYMBOL(sock_i_ino);
2084
2085/*
2086 * Allocate a skb from the socket's send buffer.
2087 */
2088struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2089                             gfp_t priority)
2090{
2091        if (force ||
2092            refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2093                struct sk_buff *skb = alloc_skb(size, priority);
2094
2095                if (skb) {
2096                        skb_set_owner_w(skb, sk);
2097                        return skb;
2098                }
2099        }
2100        return NULL;
2101}
2102EXPORT_SYMBOL(sock_wmalloc);
2103
2104static void sock_ofree(struct sk_buff *skb)
2105{
2106        struct sock *sk = skb->sk;
2107
2108        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2109}
2110
2111struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2112                             gfp_t priority)
2113{
2114        struct sk_buff *skb;
2115
2116        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2117        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2118            sysctl_optmem_max)
2119                return NULL;
2120
2121        skb = alloc_skb(size, priority);
2122        if (!skb)
2123                return NULL;
2124
2125        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2126        skb->sk = sk;
2127        skb->destructor = sock_ofree;
2128        return skb;
2129}
2130
2131/*
2132 * Allocate a memory block from the socket's option memory buffer.
2133 */
2134void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2135{
2136        if ((unsigned int)size <= sysctl_optmem_max &&
2137            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2138                void *mem;
2139                /* First do the add, to avoid the race if kmalloc
2140                 * might sleep.
2141                 */
2142                atomic_add(size, &sk->sk_omem_alloc);
2143                mem = kmalloc(size, priority);
2144                if (mem)
2145                        return mem;
2146                atomic_sub(size, &sk->sk_omem_alloc);
2147        }
2148        return NULL;
2149}
2150EXPORT_SYMBOL(sock_kmalloc);
2151
2152/* Free an option memory block. Note, we actually want the inline
2153 * here as this allows gcc to detect the nullify and fold away the
2154 * condition entirely.
2155 */
2156static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2157                                  const bool nullify)
2158{
2159        if (WARN_ON_ONCE(!mem))
2160                return;
2161        if (nullify)
2162                kzfree(mem);
2163        else
2164                kfree(mem);
2165        atomic_sub(size, &sk->sk_omem_alloc);
2166}
2167
2168void sock_kfree_s(struct sock *sk, void *mem, int size)
2169{
2170        __sock_kfree_s(sk, mem, size, false);
2171}
2172EXPORT_SYMBOL(sock_kfree_s);
2173
2174void sock_kzfree_s(struct sock *sk, void *mem, int size)
2175{
2176        __sock_kfree_s(sk, mem, size, true);
2177}
2178EXPORT_SYMBOL(sock_kzfree_s);
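
/* Illustrative pairing (hedged sketch): option memory is accounted in
 * sk_omem_alloc, so a buffer obtained from sock_kmalloc() must be
 * released with the size it was allocated with, e.g.
 *
 *        opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *        if (!opt)
 *                return -ENOBUFS;
 *        ...
 *        sock_kfree_s(sk, opt, optlen);
 *
 * sock_kzfree_s() is the variant for buffers that held sensitive data.
 */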
2179
2180/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2181 * I think these locks should be removed for datagram sockets.
2182 */
2183static long sock_wait_for_wmem(struct sock *sk, long timeo)
2184{
2185        DEFINE_WAIT(wait);
2186
2187        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2188        for (;;) {
2189                if (!timeo)
2190                        break;
2191                if (signal_pending(current))
2192                        break;
2193                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2194                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2195                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2196                        break;
2197                if (sk->sk_shutdown & SEND_SHUTDOWN)
2198                        break;
2199                if (sk->sk_err)
2200                        break;
2201                timeo = schedule_timeout(timeo);
2202        }
2203        finish_wait(sk_sleep(sk), &wait);
2204        return timeo;
2205}
2206
2207
2208/*
2209 *      Generic send/receive buffer handlers
2210 */
2211
2212struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2213                                     unsigned long data_len, int noblock,
2214                                     int *errcode, int max_page_order)
2215{
2216        struct sk_buff *skb;
2217        long timeo;
2218        int err;
2219
2220        timeo = sock_sndtimeo(sk, noblock);
2221        for (;;) {
2222                err = sock_error(sk);
2223                if (err != 0)
2224                        goto failure;
2225
2226                err = -EPIPE;
2227                if (sk->sk_shutdown & SEND_SHUTDOWN)
2228                        goto failure;
2229
2230                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2231                        break;
2232
2233                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2234                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2235                err = -EAGAIN;
2236                if (!timeo)
2237                        goto failure;
2238                if (signal_pending(current))
2239                        goto interrupted;
2240                timeo = sock_wait_for_wmem(sk, timeo);
2241        }
2242        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2243                                   errcode, sk->sk_allocation);
2244        if (skb)
2245                skb_set_owner_w(skb, sk);
2246        return skb;
2247
2248interrupted:
2249        err = sock_intr_errno(timeo);
2250failure:
2251        *errcode = err;
2252        return NULL;
2253}
2254EXPORT_SYMBOL(sock_alloc_send_pskb);
2255
2256struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2257                                    int noblock, int *errcode)
2258{
2259        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2260}
2261EXPORT_SYMBOL(sock_alloc_send_skb);
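
/* Illustrative use (hedged sketch): datagram sendmsg() paths typically call
 *
 *        skb = sock_alloc_send_skb(sk, len,
 *                                  msg->msg_flags & MSG_DONTWAIT, &err);
 *        if (!skb)
 *                goto out;
 *
 * so the allocation blocks (subject to sk_sndtimeo) until write space is
 * available, unless MSG_DONTWAIT was given.
 */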
2262
2263int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2264                     struct sockcm_cookie *sockc)
2265{
2266        u32 tsflags;
2267
2268        switch (cmsg->cmsg_type) {
2269        case SO_MARK:
2270                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2271                        return -EPERM;
2272                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2273                        return -EINVAL;
2274                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2275                break;
2276        case SO_TIMESTAMPING_OLD:
2277                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2278                        return -EINVAL;
2279
2280                tsflags = *(u32 *)CMSG_DATA(cmsg);
2281                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2282                        return -EINVAL;
2283
2284                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2285                sockc->tsflags |= tsflags;
2286                break;
2287        case SCM_TXTIME:
2288                if (!sock_flag(sk, SOCK_TXTIME))
2289                        return -EINVAL;
2290                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2291                        return -EINVAL;
2292                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2293                break;
2294        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2295        case SCM_RIGHTS:
2296        case SCM_CREDENTIALS:
2297                break;
2298        default:
2299                return -EINVAL;
2300        }
2301        return 0;
2302}
2303EXPORT_SYMBOL(__sock_cmsg_send);
2304
2305int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2306                   struct sockcm_cookie *sockc)
2307{
2308        struct cmsghdr *cmsg;
2309        int ret;
2310
2311        for_each_cmsghdr(cmsg, msg) {
2312                if (!CMSG_OK(msg, cmsg))
2313                        return -EINVAL;
2314                if (cmsg->cmsg_level != SOL_SOCKET)
2315                        continue;
2316                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2317                if (ret)
2318                        return ret;
2319        }
2320        return 0;
2321}
2322EXPORT_SYMBOL(sock_cmsg_send);
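
/* Illustrative use (hedged sketch): a sendmsg() implementation usually
 * initialises the cookie from the socket defaults and only then lets
 * control messages override it, roughly
 *
 *        sockcm_init(&sockc, sk);
 *        if (msg->msg_controllen) {
 *                err = sock_cmsg_send(sk, msg, &sockc);
 *                if (err)
 *                        goto out;
 *        }
 *
 * so SO_MARK, SO_TIMESTAMPING and SCM_TXTIME can be set per call.
 */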
2323
2324static void sk_enter_memory_pressure(struct sock *sk)
2325{
2326        if (!sk->sk_prot->enter_memory_pressure)
2327                return;
2328
2329        sk->sk_prot->enter_memory_pressure(sk);
2330}
2331
2332static void sk_leave_memory_pressure(struct sock *sk)
2333{
2334        if (sk->sk_prot->leave_memory_pressure) {
2335                sk->sk_prot->leave_memory_pressure(sk);
2336        } else {
2337                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2338
2339                if (memory_pressure && READ_ONCE(*memory_pressure))
2340                        WRITE_ONCE(*memory_pressure, 0);
2341        }
2342}
2343
2344/* On 32bit arches, an skb frag is limited to 2^15 */
2345#define SKB_FRAG_PAGE_ORDER     get_order(32768)
2346DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2347
2348/**
2349 * skb_page_frag_refill - check that a page_frag contains enough room
2350 * @sz: minimum size of the fragment we want to get
2351 * @pfrag: pointer to page_frag
2352 * @gfp: priority for memory allocation
2353 *
2354 * Note: While this allocator tries to use high order pages, there is
2355 * no guarantee that allocations succeed. Therefore, @sz MUST be
2356 * less than or equal to PAGE_SIZE.
2357 */
2358bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2359{
2360        if (pfrag->page) {
2361                if (page_ref_count(pfrag->page) == 1) {
2362                        pfrag->offset = 0;
2363                        return true;
2364                }
2365                if (pfrag->offset + sz <= pfrag->size)
2366                        return true;
2367                put_page(pfrag->page);
2368        }
2369
2370        pfrag->offset = 0;
2371        if (SKB_FRAG_PAGE_ORDER &&
2372            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2373                /* Avoid direct reclaim but allow kswapd to wake */
2374                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2375                                          __GFP_COMP | __GFP_NOWARN |
2376                                          __GFP_NORETRY,
2377                                          SKB_FRAG_PAGE_ORDER);
2378                if (likely(pfrag->page)) {
2379                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2380                        return true;
2381                }
2382        }
2383        pfrag->page = alloc_page(gfp);
2384        if (likely(pfrag->page)) {
2385                pfrag->size = PAGE_SIZE;
2386                return true;
2387        }
2388        return false;
2389}
2390EXPORT_SYMBOL(skb_page_frag_refill);
2391
2392bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2393{
2394        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2395                return true;
2396
2397        sk_enter_memory_pressure(sk);
2398        sk_stream_moderate_sndbuf(sk);
2399        return false;
2400}
2401EXPORT_SYMBOL(sk_page_frag_refill);
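
/* Illustrative use (hedged sketch): callers copy into the page at
 * pfrag->offset and advance the offset themselves, e.g.
 *
 *        if (!sk_page_frag_refill(sk, pfrag))
 *                goto wait_for_memory;
 *        copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *        ... copy "copy" bytes to page_address(pfrag->page) + pfrag->offset ...
 *        pfrag->offset += copy;
 *
 * Note that the refill only rewinds pfrag->offset to 0 when the page is
 * exclusively owned or freshly allocated.
 */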
2402
2403static void __lock_sock(struct sock *sk)
2404        __releases(&sk->sk_lock.slock)
2405        __acquires(&sk->sk_lock.slock)
2406{
2407        DEFINE_WAIT(wait);
2408
2409        for (;;) {
2410                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2411                                        TASK_UNINTERRUPTIBLE);
2412                spin_unlock_bh(&sk->sk_lock.slock);
2413                schedule();
2414                spin_lock_bh(&sk->sk_lock.slock);
2415                if (!sock_owned_by_user(sk))
2416                        break;
2417        }
2418        finish_wait(&sk->sk_lock.wq, &wait);
2419}
2420
2421void __release_sock(struct sock *sk)
2422        __releases(&sk->sk_lock.slock)
2423        __acquires(&sk->sk_lock.slock)
2424{
2425        struct sk_buff *skb, *next;
2426
2427        while ((skb = sk->sk_backlog.head) != NULL) {
2428                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2429
2430                spin_unlock_bh(&sk->sk_lock.slock);
2431
2432                do {
2433                        next = skb->next;
2434                        prefetch(next);
2435                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2436                        skb_mark_not_on_list(skb);
2437                        sk_backlog_rcv(sk, skb);
2438
2439                        cond_resched();
2440
2441                        skb = next;
2442                } while (skb != NULL);
2443
2444                spin_lock_bh(&sk->sk_lock.slock);
2445        }
2446
2447        /*
2448         * Doing the zeroing here guarantees we cannot loop forever
2449         * while a wild producer attempts to flood us.
2450         */
2451        sk->sk_backlog.len = 0;
2452}
2453
2454void __sk_flush_backlog(struct sock *sk)
2455{
2456        spin_lock_bh(&sk->sk_lock.slock);
2457        __release_sock(sk);
2458        spin_unlock_bh(&sk->sk_lock.slock);
2459}
2460
2461/**
2462 * sk_wait_data - wait for data to arrive at sk_receive_queue
2463 * @sk:    sock to wait on
2464 * @timeo: for how long
2465 * @skb:   last skb seen on sk_receive_queue
2466 *
2467 * Now socket state including sk->sk_err is changed only under lock,
2468 * hence we may omit checks after joining wait queue.
2469 * We check receive queue before schedule() only as optimization;
2470 * it is very likely that release_sock() added new data.
2471 */
2472int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2473{
2474        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2475        int rc;
2476
2477        add_wait_queue(sk_sleep(sk), &wait);
2478        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2479        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2480        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2481        remove_wait_queue(sk_sleep(sk), &wait);
2482        return rc;
2483}
2484EXPORT_SYMBOL(sk_wait_data);
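
/* Illustrative use (hedged sketch): a recvmsg() loop passes the last skb
 * it has already seen, so it only sleeps until something new arrives:
 *
 *        last = skb_peek_tail(&sk->sk_receive_queue);
 *        if (copied >= target || !timeo)
 *                break;
 *        sk_wait_data(sk, &timeo, last);
 *
 * The return value reflects whether the receive queue changed before the
 * timeout expired.
 */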
2485
2486/**
2487 *      __sk_mem_raise_allocated - increase memory_allocated
2488 *      @sk: socket
2489 *      @size: memory size to allocate
2490 *      @amt: pages to allocate
2491 *      @kind: allocation type
2492 *
2493 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2494 */
2495int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2496{
2497        struct proto *prot = sk->sk_prot;
2498        long allocated = sk_memory_allocated_add(sk, amt);
2499        bool charged = true;
2500
2501        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2502            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2503                goto suppress_allocation;
2504
2505        /* Under limit. */
2506        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2507                sk_leave_memory_pressure(sk);
2508                return 1;
2509        }
2510
2511        /* Under pressure. */
2512        if (allocated > sk_prot_mem_limits(sk, 1))
2513                sk_enter_memory_pressure(sk);
2514
2515        /* Over hard limit. */
2516        if (allocated > sk_prot_mem_limits(sk, 2))
2517                goto suppress_allocation;
2518
2519        /* guarantee minimum buffer size under pressure */
2520        if (kind == SK_MEM_RECV) {
2521                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2522                        return 1;
2523
2524        } else { /* SK_MEM_SEND */
2525                int wmem0 = sk_get_wmem0(sk, prot);
2526
2527                if (sk->sk_type == SOCK_STREAM) {
2528                        if (sk->sk_wmem_queued < wmem0)
2529                                return 1;
2530                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2531                        return 1;
2532                }
2533        }
2534
2535        if (sk_has_memory_pressure(sk)) {
2536                u64 alloc;
2537
2538                if (!sk_under_memory_pressure(sk))
2539                        return 1;
2540                alloc = sk_sockets_allocated_read_positive(sk);
2541                if (sk_prot_mem_limits(sk, 2) > alloc *
2542                    sk_mem_pages(sk->sk_wmem_queued +
2543                                 atomic_read(&sk->sk_rmem_alloc) +
2544                                 sk->sk_forward_alloc))
2545                        return 1;
2546        }
2547
2548suppress_allocation:
2549
2550        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2551                sk_stream_moderate_sndbuf(sk);
2552
2553                /* Fail only if socket is _under_ its sndbuf.
2554                 * In this case we cannot block, so we have to fail.
2555                 */
2556                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2557                        return 1;
2558        }
2559
2560        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2561                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2562
2563        sk_memory_allocated_sub(sk, amt);
2564
2565        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2566                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2567
2568        return 0;
2569}
2570EXPORT_SYMBOL(__sk_mem_raise_allocated);
2571
2572/**
2573 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2574 *      @sk: socket
2575 *      @size: memory size to allocate
2576 *      @kind: allocation type
2577 *
2578 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2579 *      rmem allocation. This function assumes that protocols which have
2580 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2581 */
2582int __sk_mem_schedule(struct sock *sk, int size, int kind)
2583{
2584        int ret, amt = sk_mem_pages(size);
2585
2586        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2587        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2588        if (!ret)
2589                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2590        return ret;
2591}
2592EXPORT_SYMBOL(__sk_mem_schedule);
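
/* Worked example (assuming SK_MEM_QUANTUM == PAGE_SIZE == 4096): charging
 * size = 1500 bytes gives amt = sk_mem_pages(1500) = 1 quantum, so
 * sk_forward_alloc grows by 4096 bytes while memory_allocated grows by one
 * quantum.  If __sk_mem_raise_allocated() refuses the charge, the same
 * quantum is subtracted again and 0 is returned to the caller.
 */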
2593
2594/**
2595 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2596 *      @sk: socket
2597 *      @amount: number of quanta
2598 *
2599 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2600 */
2601void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2602{
2603        sk_memory_allocated_sub(sk, amount);
2604
2605        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2606                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2607
2608        if (sk_under_memory_pressure(sk) &&
2609            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2610                sk_leave_memory_pressure(sk);
2611}
2612EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2613
2614/**
2615 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2616 *      @sk: socket
2617 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2618 */
2619void __sk_mem_reclaim(struct sock *sk, int amount)
2620{
2621        amount >>= SK_MEM_QUANTUM_SHIFT;
2622        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2623        __sk_mem_reduce_allocated(sk, amount);
2624}
2625EXPORT_SYMBOL(__sk_mem_reclaim);
2626
2627int sk_set_peek_off(struct sock *sk, int val)
2628{
2629        sk->sk_peek_off = val;
2630        return 0;
2631}
2632EXPORT_SYMBOL_GPL(sk_set_peek_off);
2633
2634/*
2635 * Set of default routines for initialising struct proto_ops when
2636 * the protocol does not support a particular function. In certain
2637 * cases where it makes no sense for a protocol to have a "do nothing"
2638 * function, some default processing is provided.
2639 */
2640
2641int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2642{
2643        return -EOPNOTSUPP;
2644}
2645EXPORT_SYMBOL(sock_no_bind);
2646
2647int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2648                    int len, int flags)
2649{
2650        return -EOPNOTSUPP;
2651}
2652EXPORT_SYMBOL(sock_no_connect);
2653
2654int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2655{
2656        return -EOPNOTSUPP;
2657}
2658EXPORT_SYMBOL(sock_no_socketpair);
2659
2660int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2661                   bool kern)
2662{
2663        return -EOPNOTSUPP;
2664}
2665EXPORT_SYMBOL(sock_no_accept);
2666
2667int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2668                    int peer)
2669{
2670        return -EOPNOTSUPP;
2671}
2672EXPORT_SYMBOL(sock_no_getname);
2673
2674int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2675{
2676        return -EOPNOTSUPP;
2677}
2678EXPORT_SYMBOL(sock_no_ioctl);
2679
2680int sock_no_listen(struct socket *sock, int backlog)
2681{
2682        return -EOPNOTSUPP;
2683}
2684EXPORT_SYMBOL(sock_no_listen);
2685
2686int sock_no_shutdown(struct socket *sock, int how)
2687{
2688        return -EOPNOTSUPP;
2689}
2690EXPORT_SYMBOL(sock_no_shutdown);
2691
2692int sock_no_setsockopt(struct socket *sock, int level, int optname,
2693                    char __user *optval, unsigned int optlen)
2694{
2695        return -EOPNOTSUPP;
2696}
2697EXPORT_SYMBOL(sock_no_setsockopt);
2698
2699int sock_no_getsockopt(struct socket *sock, int level, int optname,
2700                    char __user *optval, int __user *optlen)
2701{
2702        return -EOPNOTSUPP;
2703}
2704EXPORT_SYMBOL(sock_no_getsockopt);
2705
2706int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2707{
2708        return -EOPNOTSUPP;
2709}
2710EXPORT_SYMBOL(sock_no_sendmsg);
2711
2712int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2713{
2714        return -EOPNOTSUPP;
2715}
2716EXPORT_SYMBOL(sock_no_sendmsg_locked);
2717
2718int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2719                    int flags)
2720{
2721        return -EOPNOTSUPP;
2722}
2723EXPORT_SYMBOL(sock_no_recvmsg);
2724
2725int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2726{
2727        /* Mirror missing mmap method error code */
2728        return -ENODEV;
2729}
2730EXPORT_SYMBOL(sock_no_mmap);
2731
2732ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2733{
2734        ssize_t res;
2735        struct msghdr msg = {.msg_flags = flags};
2736        struct kvec iov;
2737        char *kaddr = kmap(page);
2738        iov.iov_base = kaddr + offset;
2739        iov.iov_len = size;
2740        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2741        kunmap(page);
2742        return res;
2743}
2744EXPORT_SYMBOL(sock_no_sendpage);
2745
2746ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2747                                int offset, size_t size, int flags)
2748{
2749        ssize_t res;
2750        struct msghdr msg = {.msg_flags = flags};
2751        struct kvec iov;
2752        char *kaddr = kmap(page);
2753
2754        iov.iov_base = kaddr + offset;
2755        iov.iov_len = size;
2756        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2757        kunmap(page);
2758        return res;
2759}
2760EXPORT_SYMBOL(sock_no_sendpage_locked);
2761
2762/*
2763 *      Default Socket Callbacks
2764 */
2765
2766static void sock_def_wakeup(struct sock *sk)
2767{
2768        struct socket_wq *wq;
2769
2770        rcu_read_lock();
2771        wq = rcu_dereference(sk->sk_wq);
2772        if (skwq_has_sleeper(wq))
2773                wake_up_interruptible_all(&wq->wait);
2774        rcu_read_unlock();
2775}
2776
2777static void sock_def_error_report(struct sock *sk)
2778{
2779        struct socket_wq *wq;
2780
2781        rcu_read_lock();
2782        wq = rcu_dereference(sk->sk_wq);
2783        if (skwq_has_sleeper(wq))
2784                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2785        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2786        rcu_read_unlock();
2787}
2788
2789static void sock_def_readable(struct sock *sk)
2790{
2791        struct socket_wq *wq;
2792
2793        rcu_read_lock();
2794        wq = rcu_dereference(sk->sk_wq);
2795        if (skwq_has_sleeper(wq))
2796                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2797                                                EPOLLRDNORM | EPOLLRDBAND);
2798        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2799        rcu_read_unlock();
2800}
2801
2802static void sock_def_write_space(struct sock *sk)
2803{
2804        struct socket_wq *wq;
2805
2806        rcu_read_lock();
2807
2808        /* Do not wake up a writer until he can make "significant"
2809         * progress.  --DaveM
2810         */
2811        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2812                wq = rcu_dereference(sk->sk_wq);
2813                if (skwq_has_sleeper(wq))
2814                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2815                                                EPOLLWRNORM | EPOLLWRBAND);
2816
2817                /* Should agree with poll, otherwise some programs break */
2818                if (sock_writeable(sk))
2819                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2820        }
2821
2822        rcu_read_unlock();
2823}
2824
2825static void sock_def_destruct(struct sock *sk)
2826{
2827}
2828
2829void sk_send_sigurg(struct sock *sk)
2830{
2831        if (sk->sk_socket && sk->sk_socket->file)
2832                if (send_sigurg(&sk->sk_socket->file->f_owner))
2833                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2834}
2835EXPORT_SYMBOL(sk_send_sigurg);
2836
2837void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2838                    unsigned long expires)
2839{
2840        if (!mod_timer(timer, expires))
2841                sock_hold(sk);
2842}
2843EXPORT_SYMBOL(sk_reset_timer);
2844
2845void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2846{
2847        if (del_timer(timer))
2848                __sock_put(sk);
2849}
2850EXPORT_SYMBOL(sk_stop_timer);
2851
2852void sock_init_data(struct socket *sock, struct sock *sk)
2853{
2854        sk_init_common(sk);
2855        sk->sk_send_head        =       NULL;
2856
2857        timer_setup(&sk->sk_timer, NULL, 0);
2858
2859        sk->sk_allocation       =       GFP_KERNEL;
2860        sk->sk_rcvbuf           =       sysctl_rmem_default;
2861        sk->sk_sndbuf           =       sysctl_wmem_default;
2862        sk->sk_state            =       TCP_CLOSE;
2863        sk_set_socket(sk, sock);
2864
2865        sock_set_flag(sk, SOCK_ZAPPED);
2866
2867        if (sock) {
2868                sk->sk_type     =       sock->type;
2869                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2870                sock->sk        =       sk;
2871                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2872        } else {
2873                RCU_INIT_POINTER(sk->sk_wq, NULL);
2874                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2875        }
2876
2877        rwlock_init(&sk->sk_callback_lock);
2878        if (sk->sk_kern_sock)
2879                lockdep_set_class_and_name(
2880                        &sk->sk_callback_lock,
2881                        af_kern_callback_keys + sk->sk_family,
2882                        af_family_kern_clock_key_strings[sk->sk_family]);
2883        else
2884                lockdep_set_class_and_name(
2885                        &sk->sk_callback_lock,
2886                        af_callback_keys + sk->sk_family,
2887                        af_family_clock_key_strings[sk->sk_family]);
2888
2889        sk->sk_state_change     =       sock_def_wakeup;
2890        sk->sk_data_ready       =       sock_def_readable;
2891        sk->sk_write_space      =       sock_def_write_space;
2892        sk->sk_error_report     =       sock_def_error_report;
2893        sk->sk_destruct         =       sock_def_destruct;
2894
2895        sk->sk_frag.page        =       NULL;
2896        sk->sk_frag.offset      =       0;
2897        sk->sk_peek_off         =       -1;
2898
2899        sk->sk_peer_pid         =       NULL;
2900        sk->sk_peer_cred        =       NULL;
2901        sk->sk_write_pending    =       0;
2902        sk->sk_rcvlowat         =       1;
2903        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2904        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2905
2906        sk->sk_stamp = SK_DEFAULT_STAMP;
2907#if BITS_PER_LONG==32
2908        seqlock_init(&sk->sk_stamp_seq);
2909#endif
2910        atomic_set(&sk->sk_zckey, 0);
2911
2912#ifdef CONFIG_NET_RX_BUSY_POLL
2913        sk->sk_napi_id          =       0;
2914        sk->sk_ll_usec          =       sysctl_net_busy_read;
2915#endif
2916
2917        sk->sk_max_pacing_rate = ~0UL;
2918        sk->sk_pacing_rate = ~0UL;
2919        WRITE_ONCE(sk->sk_pacing_shift, 10);
2920        sk->sk_incoming_cpu = -1;
2921
2922        sk_rx_queue_clear(sk);
2923        /*
2924         * Before updating sk_refcnt, we must commit prior changes to memory
2925         * (Documentation/RCU/rculist_nulls.txt for details)
2926         */
2927        smp_wmb();
2928        refcount_set(&sk->sk_refcnt, 1);
2929        atomic_set(&sk->sk_drops, 0);
2930}
2931EXPORT_SYMBOL(sock_init_data);
2932
2933void lock_sock_nested(struct sock *sk, int subclass)
2934{
2935        might_sleep();
2936        spin_lock_bh(&sk->sk_lock.slock);
2937        if (sk->sk_lock.owned)
2938                __lock_sock(sk);
2939        sk->sk_lock.owned = 1;
2940        spin_unlock(&sk->sk_lock.slock);
2941        /*
2942         * The sk_lock has mutex_lock() semantics here:
2943         */
2944        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2945        local_bh_enable();
2946}
2947EXPORT_SYMBOL(lock_sock_nested);
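
/* Note (not code from this file): the common lock_sock() helper in
 * include/net/sock.h is simply lock_sock_nested(sk, 0); the subclass only
 * matters to lockdep when one socket lock is legitimately taken while
 * another socket of the same class is already held.
 */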
2948
2949void release_sock(struct sock *sk)
2950{
2951        spin_lock_bh(&sk->sk_lock.slock);
2952        if (sk->sk_backlog.tail)
2953                __release_sock(sk);
2954
2955        /* Warning: release_cb() might need to release sk ownership,
2956         * i.e. call sock_release_ownership(sk) before us.
2957         */
2958        if (sk->sk_prot->release_cb)
2959                sk->sk_prot->release_cb(sk);
2960
2961        sock_release_ownership(sk);
2962        if (waitqueue_active(&sk->sk_lock.wq))
2963                wake_up(&sk->sk_lock.wq);
2964        spin_unlock_bh(&sk->sk_lock.slock);
2965}
2966EXPORT_SYMBOL(release_sock);
2967
2968/**
2969 * lock_sock_fast - fast version of lock_sock
2970 * @sk: socket
2971 *
2972 * This version should be used for very small sections, where the process won't block.
2973 * Returns false if the fast path is taken:
2974 *
2975 *   sk_lock.slock locked, owned = 0, BH disabled
2976 *
2977 * Returns true if the slow path is taken:
2978 *
2979 *   sk_lock.slock unlocked, owned = 1, BH enabled
2980 */
2981bool lock_sock_fast(struct sock *sk)
2982{
2983        might_sleep();
2984        spin_lock_bh(&sk->sk_lock.slock);
2985
2986        if (!sk->sk_lock.owned)
2987                /*
2988                 * Note : We must disable BH
2989                 */
2990                return false;
2991
2992        __lock_sock(sk);
2993        sk->sk_lock.owned = 1;
2994        spin_unlock(&sk->sk_lock.slock);
2995        /*
2996         * The sk_lock has mutex_lock() semantics here:
2997         */
2998        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2999        local_bh_enable();
3000        return true;
3001}
3002EXPORT_SYMBOL(lock_sock_fast);
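
/* Illustrative use (hedged sketch): the return value must be carried to
 * unlock_sock_fast() so that the matching unlock path is taken, e.g.
 *
 *        bool slow = lock_sock_fast(sk);
 *        ... short critical section ...
 *        unlock_sock_fast(sk, slow);
 *
 * unlock_sock_fast() (include/net/sock.h) calls release_sock() when the
 * slow path was taken, and otherwise drops the spinlock and re-enables BH.
 */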
3003
3004int sock_gettstamp(struct socket *sock, void __user *userstamp,
3005                   bool timeval, bool time32)
3006{
3007        struct sock *sk = sock->sk;
3008        struct timespec64 ts;
3009
3010        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3011        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3012        if (ts.tv_sec == -1)
3013                return -ENOENT;
3014        if (ts.tv_sec == 0) {
3015                ktime_t kt = ktime_get_real();
3016                sock_write_timestamp(sk, kt);
3017                ts = ktime_to_timespec64(kt);
3018        }
3019
3020        if (timeval)
3021                ts.tv_nsec /= 1000;
3022
3023#ifdef CONFIG_COMPAT_32BIT_TIME
3024        if (time32)
3025                return put_old_timespec32(&ts, userstamp);
3026#endif
3027#ifdef CONFIG_SPARC64
3028        /* beware of padding in sparc64 timeval */
3029        if (timeval && !in_compat_syscall()) {
3030                struct __kernel_old_timeval __user tv = {
3031                        .tv_sec = ts.tv_sec,
3032                        .tv_usec = ts.tv_nsec,
3033                };
3034                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3035                        return -EFAULT;
3036                return 0;
3037        }
3038#endif
3039        return put_timespec64(&ts, userstamp);
3040}
3041EXPORT_SYMBOL(sock_gettstamp);
3042
3043void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3044{
3045        if (!sock_flag(sk, flag)) {
3046                unsigned long previous_flags = sk->sk_flags;
3047
3048                sock_set_flag(sk, flag);
3049                /*
3050                 * we just set one of the two flags which require net
3051                 * time stamping, but time stamping might have been on
3052                 * already because of the other one
3053                 */
3054                if (sock_needs_netstamp(sk) &&
3055                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3056                        net_enable_timestamp();
3057        }
3058}
3059
3060int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3061                       int level, int type)
3062{
3063        struct sock_exterr_skb *serr;
3064        struct sk_buff *skb;
3065        int copied, err;
3066
3067        err = -EAGAIN;
3068        skb = sock_dequeue_err_skb(sk);
3069        if (skb == NULL)
3070                goto out;
3071
3072        copied = skb->len;
3073        if (copied > len) {
3074                msg->msg_flags |= MSG_TRUNC;
3075                copied = len;
3076        }
3077        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3078        if (err)
3079                goto out_free_skb;
3080
3081        sock_recv_timestamp(msg, sk, skb);
3082
3083        serr = SKB_EXT_ERR(skb);
3084        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3085
3086        msg->msg_flags |= MSG_ERRQUEUE;
3087        err = copied;
3088
3089out_free_skb:
3090        kfree_skb(skb);
3091out:
3092        return err;
3093}
3094EXPORT_SYMBOL(sock_recv_errqueue);
3095
3096/*
3097 *      Get a socket option on a socket.
3098 *
3099 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3100 *      asynchronous errors should be reported by getsockopt. We assume
3101 *      this means if you specify SO_ERROR (otherwise what's the point of it).
3102 */
3103int sock_common_getsockopt(struct socket *sock, int level, int optname,
3104                           char __user *optval, int __user *optlen)
3105{
3106        struct sock *sk = sock->sk;
3107
3108        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3109}
3110EXPORT_SYMBOL(sock_common_getsockopt);
3111
3112#ifdef CONFIG_COMPAT
3113int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3114                                  char __user *optval, int __user *optlen)
3115{
3116        struct sock *sk = sock->sk;
3117
3118        if (sk->sk_prot->compat_getsockopt != NULL)
3119                return sk->sk_prot->compat_getsockopt(sk, level, optname,
3120                                                      optval, optlen);
3121        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3122}
3123EXPORT_SYMBOL(compat_sock_common_getsockopt);
3124#endif
3125
3126int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3127                        int flags)
3128{
3129        struct sock *sk = sock->sk;
3130        int addr_len = 0;
3131        int err;
3132
3133        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3134                                   flags & ~MSG_DONTWAIT, &addr_len);
3135        if (err >= 0)
3136                msg->msg_namelen = addr_len;
3137        return err;
3138}
3139EXPORT_SYMBOL(sock_common_recvmsg);
3140
3141/*
3142 *      Set socket options on a socket.
3143 */
3144int sock_common_setsockopt(struct socket *sock, int level, int optname,
3145                           char __user *optval, unsigned int optlen)
3146{
3147        struct sock *sk = sock->sk;
3148
3149        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3150}
3151EXPORT_SYMBOL(sock_common_setsockopt);
3152
3153#ifdef CONFIG_COMPAT
3154int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3155                                  char __user *optval, unsigned int optlen)
3156{
3157        struct sock *sk = sock->sk;
3158
3159        if (sk->sk_prot->compat_setsockopt != NULL)
3160                return sk->sk_prot->compat_setsockopt(sk, level, optname,
3161                                                      optval, optlen);
3162        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3163}
3164EXPORT_SYMBOL(compat_sock_common_setsockopt);
3165#endif
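
/*
 * Illustrative sketch, hypothetical and not part of this file: the
 * sock_common_* helpers above exist so an address family can plug them
 * straight into its struct proto_ops and let the underlying struct proto
 * methods do the real work. "PF_FOO" and "foo_proto_ops" are placeholder
 * names; unrelated fields are elided:
 *
 *        static const struct proto_ops foo_proto_ops = {
 *                .family         = PF_FOO,
 *                .owner          = THIS_MODULE,
 *                .setsockopt     = sock_common_setsockopt,
 *                .getsockopt     = sock_common_getsockopt,
 *                .recvmsg        = sock_common_recvmsg,
 *        };
 */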
3166
3167void sk_common_release(struct sock *sk)
3168{
3169        if (sk->sk_prot->destroy)
3170                sk->sk_prot->destroy(sk);
3171
3172        /*
3173         * Observation: when sk_common_release() is called, processes have
3174         * no access to the socket, but the network stack still does.
3175         * Step one, detach it from networking:
3176         *
3177         * A. Remove it from the hash tables.
3178         */
3179
3180        sk->sk_prot->unhash(sk);
3181
3182        /*
3183         * At this point the socket cannot receive new packets, but it is
3184         * possible that some packets are still in flight because some CPU
3185         * did a hash table lookup before we unhashed the socket. They will
3186         * reach the receive queue and be purged by the socket destructor.
3187         *
3188         * We may also still have packets pending on the receive queue and,
3189         * probably, our own packets waiting in device queues. The socket
3190         * destructor will drain the receive queue, but transmitted packets
3191         * will delay socket destruction until the last reference is released.
3192         */
3193
3194        sock_orphan(sk);
3195
3196        xfrm_sk_free_policy(sk);
3197
3198        sk_refcnt_debug_release(sk);
3199
3200        sock_put(sk);
3201}
3202EXPORT_SYMBOL(sk_common_release);
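
/*
 * Illustrative sketch, hypothetical: a protocol with no special teardown
 * requirements can simply call sk_common_release() from its struct proto
 * .close handler; "foo_close" is a placeholder name:
 *
 *        static void foo_close(struct sock *sk, long timeout)
 *        {
 *                sk_common_release(sk);
 *        }
 */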
3203
3204void sk_get_meminfo(const struct sock *sk, u32 *mem)
3205{
3206        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3207
3208        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3209        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3210        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3211        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3212        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3213        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3214        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3215        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3216        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3217}
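
/*
 * Illustrative sketch, not part of this file: callers (the sock_diag code,
 * for instance) hand in a u32 array with SK_MEMINFO_VARS entries and then
 * typically export it as a netlink attribute, roughly:
 *
 *        u32 mem[SK_MEMINFO_VARS];
 *
 *        sk_get_meminfo(sk, mem);
 *        nla_put(skb, attrtype, sizeof(mem), mem);
 */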
3218
3219#ifdef CONFIG_PROC_FS
3220#define PROTO_INUSE_NR  64      /* should be enough for now */
3221struct prot_inuse {
3222        int val[PROTO_INUSE_NR];
3223};
3224
3225static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3226
3227void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3228{
3229        __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3230}
3231EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3232
3233int sock_prot_inuse_get(struct net *net, struct proto *prot)
3234{
3235        int cpu, idx = prot->inuse_idx;
3236        int res = 0;
3237
3238        for_each_possible_cpu(cpu)
3239                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3240
3241        return res >= 0 ? res : 0;
3242}
3243EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
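
/*
 * Illustrative sketch, not part of this file: protocol hash/unhash paths
 * account each socket with +1/-1 so that sock_prot_inuse_get() can report
 * the per-protocol totals shown in the "sockets" column of
 * /proc/net/protocols, roughly:
 *
 *        On hash:   sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *        On unhash: sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 */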
3244
3245static void sock_inuse_add(struct net *net, int val)
3246{
3247        this_cpu_add(*net->core.sock_inuse, val);
3248}
3249
3250int sock_inuse_get(struct net *net)
3251{
3252        int cpu, res = 0;
3253
3254        for_each_possible_cpu(cpu)
3255                res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3256
3257        return res;
3258}
3259
3260EXPORT_SYMBOL_GPL(sock_inuse_get);
3261
3262static int __net_init sock_inuse_init_net(struct net *net)
3263{
3264        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3265        if (net->core.prot_inuse == NULL)
3266                return -ENOMEM;
3267
3268        net->core.sock_inuse = alloc_percpu(int);
3269        if (net->core.sock_inuse == NULL)
3270                goto out;
3271
3272        return 0;
3273
3274out:
3275        free_percpu(net->core.prot_inuse);
3276        return -ENOMEM;
3277}
3278
3279static void __net_exit sock_inuse_exit_net(struct net *net)
3280{
3281        free_percpu(net->core.prot_inuse);
3282        free_percpu(net->core.sock_inuse);
3283}
3284
3285static struct pernet_operations net_inuse_ops = {
3286        .init = sock_inuse_init_net,
3287        .exit = sock_inuse_exit_net,
3288};
3289
3290static __init int net_inuse_init(void)
3291{
3292        if (register_pernet_subsys(&net_inuse_ops))
3293                panic("Cannot initialize net inuse counters");
3294
3295        return 0;
3296}
3297
3298core_initcall(net_inuse_init);
3299
3300static int assign_proto_idx(struct proto *prot)
3301{
3302        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3303
3304        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3305                pr_err("PROTO_INUSE_NR exhausted\n");
3306                return -ENOSPC;
3307        }
3308
3309        set_bit(prot->inuse_idx, proto_inuse_idx);
3310        return 0;
3311}
3312
3313static void release_proto_idx(struct proto *prot)
3314{
3315        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3316                clear_bit(prot->inuse_idx, proto_inuse_idx);
3317}
3318#else
3319static inline int assign_proto_idx(struct proto *prot)
3320{
3321        return 0;
3322}
3323
3324static inline void release_proto_idx(struct proto *prot)
3325{
3326}
3327
3328static void sock_inuse_add(struct net *net, int val)
3329{
3330}
3331#endif
3332
3333static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3334{
3335        if (!rsk_prot)
3336                return;
3337        kfree(rsk_prot->slab_name);
3338        rsk_prot->slab_name = NULL;
3339        kmem_cache_destroy(rsk_prot->slab);
3340        rsk_prot->slab = NULL;
3341}
3342
3343static int req_prot_init(const struct proto *prot)
3344{
3345        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3346
3347        if (!rsk_prot)
3348                return 0;
3349
3350        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3351                                        prot->name);
3352        if (!rsk_prot->slab_name)
3353                return -ENOMEM;
3354
3355        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3356                                           rsk_prot->obj_size, 0,
3357                                           SLAB_ACCOUNT | prot->slab_flags,
3358                                           NULL);
3359
3360        if (!rsk_prot->slab) {
3361                pr_crit("%s: Can't create request sock SLAB cache!\n",
3362                        prot->name);
3363                return -ENOMEM;
3364        }
3365        return 0;
3366}
3367
3368int proto_register(struct proto *prot, int alloc_slab)
3369{
3370        int ret = -ENOBUFS;
3371
3372        if (alloc_slab) {
3373                prot->slab = kmem_cache_create_usercopy(prot->name,
3374                                        prot->obj_size, 0,
3375                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3376                                        prot->slab_flags,
3377                                        prot->useroffset, prot->usersize,
3378                                        NULL);
3379
3380                if (prot->slab == NULL) {
3381                        pr_crit("%s: Can't create sock SLAB cache!\n",
3382                                prot->name);
3383                        goto out;
3384                }
3385
3386                if (req_prot_init(prot))
3387                        goto out_free_request_sock_slab;
3388
3389                if (prot->twsk_prot != NULL) {
3390                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3391
3392                        if (prot->twsk_prot->twsk_slab_name == NULL)
3393                                goto out_free_request_sock_slab;
3394
3395                        prot->twsk_prot->twsk_slab =
3396                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3397                                                  prot->twsk_prot->twsk_obj_size,
3398                                                  0,
3399                                                  SLAB_ACCOUNT |
3400                                                  prot->slab_flags,
3401                                                  NULL);
3402                        if (prot->twsk_prot->twsk_slab == NULL)
3403                                goto out_free_timewait_sock_slab_name;
3404                }
3405        }
3406
3407        mutex_lock(&proto_list_mutex);
3408        ret = assign_proto_idx(prot);
3409        if (ret) {
3410                mutex_unlock(&proto_list_mutex);
3411                goto out_free_timewait_sock_slab_name;
3412        }
3413        list_add(&prot->node, &proto_list);
3414        mutex_unlock(&proto_list_mutex);
3415        return ret;
3416
3417out_free_timewait_sock_slab_name:
3418        if (alloc_slab && prot->twsk_prot)
3419                kfree(prot->twsk_prot->twsk_slab_name);
3420out_free_request_sock_slab:
3421        if (alloc_slab) {
3422                req_prot_cleanup(prot->rsk_prot);
3423
3424                kmem_cache_destroy(prot->slab);
3425                prot->slab = NULL;
3426        }
3427out:
3428        return ret;
3429}
3430EXPORT_SYMBOL(proto_register);
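
/*
 * Illustrative sketch, hypothetical and not part of this file: a protocol
 * module typically registers its struct proto once at init time, asking for
 * a dedicated slab cache; "foo_prot", "struct foo_sock" and "foo_init" are
 * placeholder names:
 *
 *        static struct proto foo_prot = {
 *                .name           = "FOO",
 *                .owner          = THIS_MODULE,
 *                .obj_size       = sizeof(struct foo_sock),
 *        };
 *
 *        static int __init foo_init(void)
 *        {
 *                return proto_register(&foo_prot, 1);
 *        }
 */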
3431
3432void proto_unregister(struct proto *prot)
3433{
3434        mutex_lock(&proto_list_mutex);
3435        release_proto_idx(prot);
3436        list_del(&prot->node);
3437        mutex_unlock(&proto_list_mutex);
3438
3439        kmem_cache_destroy(prot->slab);
3440        prot->slab = NULL;
3441
3442        req_prot_cleanup(prot->rsk_prot);
3443
3444        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3445                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3446                kfree(prot->twsk_prot->twsk_slab_name);
3447                prot->twsk_prot->twsk_slab = NULL;
3448        }
3449}
3450EXPORT_SYMBOL(proto_unregister);
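
/*
 * Illustrative sketch, hypothetical: the matching module exit path simply
 * undoes the registration shown after proto_register() above:
 *
 *        static void __exit foo_exit(void)
 *        {
 *                proto_unregister(&foo_prot);
 *        }
 */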
3451
3452int sock_load_diag_module(int family, int protocol)
3453{
3454        if (!protocol) {
3455                if (!sock_is_registered(family))
3456                        return -ENOENT;
3457
3458                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3459                                      NETLINK_SOCK_DIAG, family);
3460        }
3461
3462#ifdef CONFIG_INET
3463        if (family == AF_INET &&
3464            protocol != IPPROTO_RAW &&
3465            !rcu_access_pointer(inet_protos[protocol]))
3466                return -ENOENT;
3467#endif
3468
3469        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3470                              NETLINK_SOCK_DIAG, family, protocol);
3471}
3472EXPORT_SYMBOL(sock_load_diag_module);
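
/*
 * Illustrative sketch, not part of this file: the request_module() strings
 * above are resolved by aliases that the diag modules declare for
 * themselves, along the lines of (shown for the INET diag case, from
 * memory and therefore approximate):
 *
 *        MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET);
 */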
3473
3474#ifdef CONFIG_PROC_FS
3475static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3476        __acquires(proto_list_mutex)
3477{
3478        mutex_lock(&proto_list_mutex);
3479        return seq_list_start_head(&proto_list, *pos);
3480}
3481
3482static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3483{
3484        return seq_list_next(v, &proto_list, pos);
3485}
3486
3487static void proto_seq_stop(struct seq_file *seq, void *v)
3488        __releases(proto_list_mutex)
3489{
3490        mutex_unlock(&proto_list_mutex);
3491}
3492
3493static char proto_method_implemented(const void *method)
3494{
3495        return method == NULL ? 'n' : 'y';
3496}
3497static long sock_prot_memory_allocated(struct proto *proto)
3498{
3499        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3500}
3501
3502static const char *sock_prot_memory_pressure(struct proto *proto)
3503{
3504        return proto->memory_pressure != NULL ?
3505        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3506}
3507
3508static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3509{
3510
3511        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3512                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3513                   proto->name,
3514                   proto->obj_size,
3515                   sock_prot_inuse_get(seq_file_net(seq), proto),
3516                   sock_prot_memory_allocated(proto),
3517                   sock_prot_memory_pressure(proto),
3518                   proto->max_header,
3519                   proto->slab == NULL ? "no" : "yes",
3520                   module_name(proto->owner),
3521                   proto_method_implemented(proto->close),
3522                   proto_method_implemented(proto->connect),
3523                   proto_method_implemented(proto->disconnect),
3524                   proto_method_implemented(proto->accept),
3525                   proto_method_implemented(proto->ioctl),
3526                   proto_method_implemented(proto->init),
3527                   proto_method_implemented(proto->destroy),
3528                   proto_method_implemented(proto->shutdown),
3529                   proto_method_implemented(proto->setsockopt),
3530                   proto_method_implemented(proto->getsockopt),
3531                   proto_method_implemented(proto->sendmsg),
3532                   proto_method_implemented(proto->recvmsg),
3533                   proto_method_implemented(proto->sendpage),
3534                   proto_method_implemented(proto->bind),
3535                   proto_method_implemented(proto->backlog_rcv),
3536                   proto_method_implemented(proto->hash),
3537                   proto_method_implemented(proto->unhash),
3538                   proto_method_implemented(proto->get_port),
3539                   proto_method_implemented(proto->enter_memory_pressure));
3540}
3541
3542static int proto_seq_show(struct seq_file *seq, void *v)
3543{
3544        if (v == &proto_list)
3545                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3546                           "protocol",
3547                           "size",
3548                           "sockets",
3549                           "memory",
3550                           "press",
3551                           "maxhdr",
3552                           "slab",
3553                           "module",
3554                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3555        else
3556                proto_seq_printf(seq, list_entry(v, struct proto, node));
3557        return 0;
3558}
3559
3560static const struct seq_operations proto_seq_ops = {
3561        .start  = proto_seq_start,
3562        .next   = proto_seq_next,
3563        .stop   = proto_seq_stop,
3564        .show   = proto_seq_show,
3565};
3566
3567static __net_init int proto_init_net(struct net *net)
3568{
3569        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3570                        sizeof(struct seq_net_private)))
3571                return -ENOMEM;
3572
3573        return 0;
3574}
3575
3576static __net_exit void proto_exit_net(struct net *net)
3577{
3578        remove_proc_entry("protocols", net->proc_net);
3579}
3580
3581
3582static __net_initdata struct pernet_operations proto_net_ops = {
3583        .init = proto_init_net,
3584        .exit = proto_exit_net,
3585};
3586
3587static int __init proto_init(void)
3588{
3589        return register_pernet_subsys(&proto_net_ops);
3590}
3591
3592subsys_initcall(proto_init);
3593
3594#endif /* PROC_FS */
3595
3596#ifdef CONFIG_NET_RX_BUSY_POLL
3597bool sk_busy_loop_end(void *p, unsigned long start_time)
3598{
3599        struct sock *sk = p;
3600
3601        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3602               sk_busy_loop_timeout(sk, start_time);
3603}
3604EXPORT_SYMBOL(sk_busy_loop_end);
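
/*
 * Illustrative sketch (based on the caller in include/net/busy_poll.h,
 * restated from memory and therefore approximate): sk_busy_loop_end() is
 * the loop_end callback handed to napi_busy_loop(), which spins until data
 * shows up on the receive queue or the busy-poll time limit expires:
 *
 *        napi_busy_loop(READ_ONCE(sk->sk_napi_id),
 *                       nonblock ? NULL : sk_busy_loop_end, sk);
 */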
3605#endif /* CONFIG_NET_RX_BUSY_POLL */
3606