linux/net/core/sock.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Generic socket support routines. Memory allocators, socket lock/release
   8 *              handler for protocols to use and generic option handler.
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116
 117#include <linux/uaccess.h>
 118
 119#include <linux/netdevice.h>
 120#include <net/protocol.h>
 121#include <linux/skbuff.h>
 122#include <net/net_namespace.h>
 123#include <net/request_sock.h>
 124#include <net/sock.h>
 125#include <linux/net_tstamp.h>
 126#include <net/xfrm.h>
 127#include <linux/ipsec.h>
 128#include <net/cls_cgroup.h>
 129#include <net/netprio_cgroup.h>
 130#include <linux/sock_diag.h>
 131
 132#include <linux/filter.h>
 133#include <net/sock_reuseport.h>
 134#include <net/bpf_sk_storage.h>
 135
 136#include <trace/events/sock.h>
 137
 138#include <net/tcp.h>
 139#include <net/busy_poll.h>
 140
 141static DEFINE_MUTEX(proto_list_mutex);
 142static LIST_HEAD(proto_list);
 143
 144static void sock_inuse_add(struct net *net, int val);
 145
 146/**
 147 * sk_ns_capable - General socket capability test
 148 * @sk: Socket to use a capability on or through
 149 * @user_ns: The user namespace of the capability to use
 150 * @cap: The capability to use
 151 *
  152 * Test to see if the opener of the socket had the capability @cap in the
  153 * user namespace @user_ns when the socket was created, and that the
  154 * current process has it as well.
 155 */
 156bool sk_ns_capable(const struct sock *sk,
 157                   struct user_namespace *user_ns, int cap)
 158{
 159        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 160                ns_capable(user_ns, cap);
 161}
 162EXPORT_SYMBOL(sk_ns_capable);
 163
 164/**
 165 * sk_capable - Socket global capability test
 166 * @sk: Socket to use a capability on or through
 167 * @cap: The global capability to use
 168 *
  169 * Test to see if the opener of the socket had the capability @cap in all
  170 * user namespaces when the socket was created, and that the current
  171 * process has it as well.
 172 */
 173bool sk_capable(const struct sock *sk, int cap)
 174{
 175        return sk_ns_capable(sk, &init_user_ns, cap);
 176}
 177EXPORT_SYMBOL(sk_capable);
 178
 179/**
 180 * sk_net_capable - Network namespace socket capability test
 181 * @sk: Socket to use a capability on or through
 182 * @cap: The capability to use
 183 *
  184 * Test to see if the opener of the socket had the capability @cap over the
  185 * network namespace the socket is a member of when the socket was created,
  186 * and that the current process has it as well.
 187 */
 188bool sk_net_capable(const struct sock *sk, int cap)
 189{
 190        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 191}
 192EXPORT_SYMBOL(sk_net_capable);
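
/* Illustrative caller pattern (a sketch, not code from this file): a
 * protocol handler gating a privileged socket option would typically do
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * i.e. both the process that opened the socket and the current process
 * must hold CAP_NET_ADMIN in the socket's network namespace.  sk_capable()
 * is the same test against &init_user_ns, and sk_ns_capable() lets the
 * caller pick the user namespace explicitly.
 */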
 193
 194/*
 195 * Each address family might have different locking rules, so we have
 196 * one slock key per address family and separate keys for internal and
 197 * userspace sockets.
 198 */
 199static struct lock_class_key af_family_keys[AF_MAX];
 200static struct lock_class_key af_family_kern_keys[AF_MAX];
 201static struct lock_class_key af_family_slock_keys[AF_MAX];
 202static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 203
 204/*
  205 * Make lock validator output more readable. (We pre-construct these
  206 * strings at build time, so that runtime initialization of socket
  207 * locks is fast):
 208 */
 209
 210#define _sock_locks(x)                                            \
 211  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 212  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 213  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 214  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 215  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 216  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 217  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 218  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 219  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 220  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 221  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 222  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 223  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 224  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 225  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 226  x "AF_MAX"
 227
 228static const char *const af_family_key_strings[AF_MAX+1] = {
 229        _sock_locks("sk_lock-")
 230};
 231static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 232        _sock_locks("slock-")
 233};
 234static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 235        _sock_locks("clock-")
 236};
 237
 238static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 239        _sock_locks("k-sk_lock-")
 240};
 241static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 242        _sock_locks("k-slock-")
 243};
 244static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 245        _sock_locks("k-clock-")
 246};
 247static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 248        _sock_locks("rlock-")
 249};
 250static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 251        _sock_locks("wlock-")
 252};
 253static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 254        _sock_locks("elock-")
 255};
 256
 257/*
 258 * sk_callback_lock and sk queues locking rules are per-address-family,
 259 * so split the lock classes by using a per-AF key:
 260 */
 261static struct lock_class_key af_callback_keys[AF_MAX];
 262static struct lock_class_key af_rlock_keys[AF_MAX];
 263static struct lock_class_key af_wlock_keys[AF_MAX];
 264static struct lock_class_key af_elock_keys[AF_MAX];
 265static struct lock_class_key af_kern_callback_keys[AF_MAX];
 266
 267/* Run time adjustable parameters. */
 268__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 269EXPORT_SYMBOL(sysctl_wmem_max);
 270__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 271EXPORT_SYMBOL(sysctl_rmem_max);
 272__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 273__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 274
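/* These four appear to userspace as the net.core.{wmem,rmem}_{max,default}
 * sysctls (registered elsewhere, in net/core/sysctl_net_core.c), e.g.
 *
 *	sysctl -w net.core.rmem_max=8388608
 *
 * which raises the ceiling that SO_RCVBUF below is clamped against.
 */
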
 275/* Maximal space eaten by iovec or ancillary data plus some space */
 276int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 277EXPORT_SYMBOL(sysctl_optmem_max);
 278
 279int sysctl_tstamp_allow_data __read_mostly = 1;
 280
 281DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 282EXPORT_SYMBOL_GPL(memalloc_socks_key);
 283
 284/**
 285 * sk_set_memalloc - sets %SOCK_MEMALLOC
 286 * @sk: socket to set it on
 287 *
 288 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 289 * It's the responsibility of the admin to adjust min_free_kbytes
  290 * to meet the requirements.
 291 */
 292void sk_set_memalloc(struct sock *sk)
 293{
 294        sock_set_flag(sk, SOCK_MEMALLOC);
 295        sk->sk_allocation |= __GFP_MEMALLOC;
 296        static_branch_inc(&memalloc_socks_key);
 297}
 298EXPORT_SYMBOL_GPL(sk_set_memalloc);
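
/* Illustrative use (a sketch, not code from this file): a transport that
 * carries swap traffic (e.g. swap over NFS, or a network block device)
 * marks its socket so that receive-path allocations may dip into the
 * emergency reserves instead of deadlocking during reclaim:
 *
 *	sk_set_memalloc(sock->sk);
 *	...				(socket is used for swap I/O)
 *	sk_clear_memalloc(sock->sk);
 */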
 299
 300void sk_clear_memalloc(struct sock *sk)
 301{
 302        sock_reset_flag(sk, SOCK_MEMALLOC);
 303        sk->sk_allocation &= ~__GFP_MEMALLOC;
 304        static_branch_dec(&memalloc_socks_key);
 305
 306        /*
 307         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 308         * progress of swapping. SOCK_MEMALLOC may be cleared while
 309         * it has rmem allocations due to the last swapfile being deactivated
 310         * but there is a risk that the socket is unusable due to exceeding
 311         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 312         */
 313        sk_mem_reclaim(sk);
 314}
 315EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 316
 317int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 318{
 319        int ret;
 320        unsigned int noreclaim_flag;
 321
 322        /* these should have been dropped before queueing */
 323        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 324
 325        noreclaim_flag = memalloc_noreclaim_save();
 326        ret = sk->sk_backlog_rcv(sk, skb);
 327        memalloc_noreclaim_restore(noreclaim_flag);
 328
 329        return ret;
 330}
 331EXPORT_SYMBOL(__sk_backlog_rcv);
 332
 333static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 334{
 335        struct __kernel_sock_timeval tv;
 336        int size;
 337
 338        if (timeo == MAX_SCHEDULE_TIMEOUT) {
 339                tv.tv_sec = 0;
 340                tv.tv_usec = 0;
 341        } else {
 342                tv.tv_sec = timeo / HZ;
 343                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 344        }
 345
 346        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 347                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 348                *(struct old_timeval32 *)optval = tv32;
 349                return sizeof(tv32);
 350        }
 351
 352        if (old_timeval) {
 353                struct __kernel_old_timeval old_tv;
 354                old_tv.tv_sec = tv.tv_sec;
 355                old_tv.tv_usec = tv.tv_usec;
 356                *(struct __kernel_old_timeval *)optval = old_tv;
 357                size = sizeof(old_tv);
 358        } else {
 359                *(struct __kernel_sock_timeval *)optval = tv;
 360                size = sizeof(tv);
 361        }
 362
 363        return size;
 364}
 365
 366static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
 367{
 368        struct __kernel_sock_timeval tv;
 369
 370        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 371                struct old_timeval32 tv32;
 372
 373                if (optlen < sizeof(tv32))
 374                        return -EINVAL;
 375
 376                if (copy_from_user(&tv32, optval, sizeof(tv32)))
 377                        return -EFAULT;
 378                tv.tv_sec = tv32.tv_sec;
 379                tv.tv_usec = tv32.tv_usec;
 380        } else if (old_timeval) {
 381                struct __kernel_old_timeval old_tv;
 382
 383                if (optlen < sizeof(old_tv))
 384                        return -EINVAL;
 385                if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
 386                        return -EFAULT;
 387                tv.tv_sec = old_tv.tv_sec;
 388                tv.tv_usec = old_tv.tv_usec;
 389        } else {
 390                if (optlen < sizeof(tv))
 391                        return -EINVAL;
 392                if (copy_from_user(&tv, optval, sizeof(tv)))
 393                        return -EFAULT;
 394        }
 395        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 396                return -EDOM;
 397
 398        if (tv.tv_sec < 0) {
 399                static int warned __read_mostly;
 400
 401                *timeo_p = 0;
 402                if (warned < 10 && net_ratelimit()) {
 403                        warned++;
 404                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 405                                __func__, current->comm, task_pid_nr(current));
 406                }
 407                return 0;
 408        }
 409        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 410        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 411                return 0;
 412        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 413                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 414        return 0;
 415}
 416
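/* Worked example of the conversion above (illustrative): with HZ == 1000,
 * a user value of { .tv_sec = 2, .tv_usec = 500000 } is stored as
 * 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2500 jiffies,
 * while { 0, 0 } is kept as MAX_SCHEDULE_TIMEOUT, i.e. "block forever".
 * sock_get_timeout() performs the inverse mapping on getsockopt().
 */
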
 417static void sock_warn_obsolete_bsdism(const char *name)
 418{
 419        static int warned;
 420        static char warncomm[TASK_COMM_LEN];
 421        if (strcmp(warncomm, current->comm) && warned < 5) {
 422                strcpy(warncomm,  current->comm);
 423                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 424                        warncomm, name);
 425                warned++;
 426        }
 427}
 428
 429static bool sock_needs_netstamp(const struct sock *sk)
 430{
 431        switch (sk->sk_family) {
 432        case AF_UNSPEC:
 433        case AF_UNIX:
 434                return false;
 435        default:
 436                return true;
 437        }
 438}
 439
 440static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 441{
 442        if (sk->sk_flags & flags) {
 443                sk->sk_flags &= ~flags;
 444                if (sock_needs_netstamp(sk) &&
 445                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 446                        net_disable_timestamp();
 447        }
 448}
 449
 450
 451int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 452{
 453        unsigned long flags;
 454        struct sk_buff_head *list = &sk->sk_receive_queue;
 455
 456        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 457                atomic_inc(&sk->sk_drops);
 458                trace_sock_rcvqueue_full(sk, skb);
 459                return -ENOMEM;
 460        }
 461
 462        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 463                atomic_inc(&sk->sk_drops);
 464                return -ENOBUFS;
 465        }
 466
 467        skb->dev = NULL;
 468        skb_set_owner_r(skb, sk);
 469
  470        /* We escape from the RCU-protected region; make sure we don't
  471         * leak a non-refcounted dst.
  472         */
 473        skb_dst_force(skb);
 474
 475        spin_lock_irqsave(&list->lock, flags);
 476        sock_skb_set_dropcount(sk, skb);
 477        __skb_queue_tail(list, skb);
 478        spin_unlock_irqrestore(&list->lock, flags);
 479
 480        if (!sock_flag(sk, SOCK_DEAD))
 481                sk->sk_data_ready(sk);
 482        return 0;
 483}
 484EXPORT_SYMBOL(__sock_queue_rcv_skb);
 485
 486int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 487{
 488        int err;
 489
 490        err = sk_filter(sk, skb);
 491        if (err)
 492                return err;
 493
 494        return __sock_queue_rcv_skb(sk, skb);
 495}
 496EXPORT_SYMBOL(sock_queue_rcv_skb);
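
/* Illustrative caller pattern (a sketch, not code from this file): a
 * datagram protocol's rcv handler queues the skb and must free it itself
 * when queueing fails, since -ENOMEM/-ENOBUFS only report the drop:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return 0;
 *	}
 */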
 497
 498int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 499                     const int nested, unsigned int trim_cap, bool refcounted)
 500{
 501        int rc = NET_RX_SUCCESS;
 502
 503        if (sk_filter_trim_cap(sk, skb, trim_cap))
 504                goto discard_and_relse;
 505
 506        skb->dev = NULL;
 507
 508        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 509                atomic_inc(&sk->sk_drops);
 510                goto discard_and_relse;
 511        }
 512        if (nested)
 513                bh_lock_sock_nested(sk);
 514        else
 515                bh_lock_sock(sk);
 516        if (!sock_owned_by_user(sk)) {
 517                /*
 518                 * trylock + unlock semantics:
 519                 */
 520                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 521
 522                rc = sk_backlog_rcv(sk, skb);
 523
 524                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 525        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 526                bh_unlock_sock(sk);
 527                atomic_inc(&sk->sk_drops);
 528                goto discard_and_relse;
 529        }
 530
 531        bh_unlock_sock(sk);
 532out:
 533        if (refcounted)
 534                sock_put(sk);
 535        return rc;
 536discard_and_relse:
 537        kfree_skb(skb);
 538        goto out;
 539}
 540EXPORT_SYMBOL(__sk_receive_skb);
 541
 542struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 543{
 544        struct dst_entry *dst = __sk_dst_get(sk);
 545
 546        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 547                sk_tx_queue_clear(sk);
 548                sk->sk_dst_pending_confirm = 0;
 549                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 550                dst_release(dst);
 551                return NULL;
 552        }
 553
 554        return dst;
 555}
 556EXPORT_SYMBOL(__sk_dst_check);
 557
 558struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 559{
 560        struct dst_entry *dst = sk_dst_get(sk);
 561
 562        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 563                sk_dst_reset(sk);
 564                dst_release(dst);
 565                return NULL;
 566        }
 567
 568        return dst;
 569}
 570EXPORT_SYMBOL(sk_dst_check);
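
/* Illustrative caller pattern (a sketch, not code from this file): an
 * output path revalidates the cached route before each transmit and
 * falls back to a fresh, protocol-specific lookup when it has become
 * obsolete (many IPv4 callers pass a cookie of 0):
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst) {
 *		dst = ...;		(protocol-specific route lookup)
 *		sk_dst_set(sk, dst);
 *	}
 */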
 571
 572static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
 573{
 574        int ret = -ENOPROTOOPT;
 575#ifdef CONFIG_NETDEVICES
 576        struct net *net = sock_net(sk);
 577
 578        /* Sorry... */
 579        ret = -EPERM;
 580        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 581                goto out;
 582
 583        ret = -EINVAL;
 584        if (ifindex < 0)
 585                goto out;
 586
 587        sk->sk_bound_dev_if = ifindex;
 588        if (sk->sk_prot->rehash)
 589                sk->sk_prot->rehash(sk);
 590        sk_dst_reset(sk);
 591
 592        ret = 0;
 593
 594out:
 595#endif
 596
 597        return ret;
 598}
 599
 600static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 601                                int optlen)
 602{
 603        int ret = -ENOPROTOOPT;
 604#ifdef CONFIG_NETDEVICES
 605        struct net *net = sock_net(sk);
 606        char devname[IFNAMSIZ];
 607        int index;
 608
 609        ret = -EINVAL;
 610        if (optlen < 0)
 611                goto out;
 612
 613        /* Bind this socket to a particular device like "eth0",
 614         * as specified in the passed interface name. If the
 615         * name is "" or the option length is zero the socket
 616         * is not bound.
 617         */
 618        if (optlen > IFNAMSIZ - 1)
 619                optlen = IFNAMSIZ - 1;
 620        memset(devname, 0, sizeof(devname));
 621
 622        ret = -EFAULT;
 623        if (copy_from_user(devname, optval, optlen))
 624                goto out;
 625
 626        index = 0;
 627        if (devname[0] != '\0') {
 628                struct net_device *dev;
 629
 630                rcu_read_lock();
 631                dev = dev_get_by_name_rcu(net, devname);
 632                if (dev)
 633                        index = dev->ifindex;
 634                rcu_read_unlock();
 635                ret = -ENODEV;
 636                if (!dev)
 637                        goto out;
 638        }
 639
 640        lock_sock(sk);
 641        ret = sock_setbindtodevice_locked(sk, index);
 642        release_sock(sk);
 643
 644out:
 645#endif
 646
 647        return ret;
 648}
 649
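/* Illustrative userspace use of the option above (a sketch):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 *
 * requires CAP_NET_RAW in the socket's network namespace; passing an
 * empty name (or a zero option length) removes the binding again.
 */
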
 650static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 651                                int __user *optlen, int len)
 652{
 653        int ret = -ENOPROTOOPT;
 654#ifdef CONFIG_NETDEVICES
 655        struct net *net = sock_net(sk);
 656        char devname[IFNAMSIZ];
 657
 658        if (sk->sk_bound_dev_if == 0) {
 659                len = 0;
 660                goto zero;
 661        }
 662
 663        ret = -EINVAL;
 664        if (len < IFNAMSIZ)
 665                goto out;
 666
 667        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 668        if (ret)
 669                goto out;
 670
 671        len = strlen(devname) + 1;
 672
 673        ret = -EFAULT;
 674        if (copy_to_user(optval, devname, len))
 675                goto out;
 676
 677zero:
 678        ret = -EFAULT;
 679        if (put_user(len, optlen))
 680                goto out;
 681
 682        ret = 0;
 683
 684out:
 685#endif
 686
 687        return ret;
 688}
 689
 690static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 691{
 692        if (valbool)
 693                sock_set_flag(sk, bit);
 694        else
 695                sock_reset_flag(sk, bit);
 696}
 697
 698bool sk_mc_loop(struct sock *sk)
 699{
 700        if (dev_recursion_level())
 701                return false;
 702        if (!sk)
 703                return true;
 704        switch (sk->sk_family) {
 705        case AF_INET:
 706                return inet_sk(sk)->mc_loop;
 707#if IS_ENABLED(CONFIG_IPV6)
 708        case AF_INET6:
 709                return inet6_sk(sk)->mc_loop;
 710#endif
 711        }
 712        WARN_ON(1);
 713        return true;
 714}
 715EXPORT_SYMBOL(sk_mc_loop);
 716
 717/*
 718 *      This is meant for all protocols to use and covers goings on
 719 *      at the socket level. Everything here is generic.
 720 */
 721
 722int sock_setsockopt(struct socket *sock, int level, int optname,
 723                    char __user *optval, unsigned int optlen)
 724{
 725        struct sock_txtime sk_txtime;
 726        struct sock *sk = sock->sk;
 727        int val;
 728        int valbool;
 729        struct linger ling;
 730        int ret = 0;
 731
 732        /*
 733         *      Options without arguments
 734         */
 735
 736        if (optname == SO_BINDTODEVICE)
 737                return sock_setbindtodevice(sk, optval, optlen);
 738
 739        if (optlen < sizeof(int))
 740                return -EINVAL;
 741
 742        if (get_user(val, (int __user *)optval))
 743                return -EFAULT;
 744
 745        valbool = val ? 1 : 0;
 746
 747        lock_sock(sk);
 748
 749        switch (optname) {
 750        case SO_DEBUG:
 751                if (val && !capable(CAP_NET_ADMIN))
 752                        ret = -EACCES;
 753                else
 754                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 755                break;
 756        case SO_REUSEADDR:
 757                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 758                break;
 759        case SO_REUSEPORT:
 760                sk->sk_reuseport = valbool;
 761                break;
 762        case SO_TYPE:
 763        case SO_PROTOCOL:
 764        case SO_DOMAIN:
 765        case SO_ERROR:
 766                ret = -ENOPROTOOPT;
 767                break;
 768        case SO_DONTROUTE:
 769                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 770                sk_dst_reset(sk);
 771                break;
 772        case SO_BROADCAST:
 773                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 774                break;
 775        case SO_SNDBUF:
  776                /* Don't return an error here; BSD doesn't, and if you
  777                 * think about it this is right. Otherwise apps have to
  778                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  779                 * are treated in BSD as hints.
  780                 */
 781                val = min_t(u32, val, sysctl_wmem_max);
 782set_sndbuf:
 783                /* Ensure val * 2 fits into an int, to prevent max_t()
 784                 * from treating it as a negative value.
 785                 */
 786                val = min_t(int, val, INT_MAX / 2);
 787                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 788                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 789                /* Wake up sending tasks if we upped the value. */
 790                sk->sk_write_space(sk);
 791                break;
 792
 793        case SO_SNDBUFFORCE:
 794                if (!capable(CAP_NET_ADMIN)) {
 795                        ret = -EPERM;
 796                        break;
 797                }
 798
 799                /* No negative values (to prevent underflow, as val will be
 800                 * multiplied by 2).
 801                 */
 802                if (val < 0)
 803                        val = 0;
 804                goto set_sndbuf;
 805
 806        case SO_RCVBUF:
  807                /* Don't return an error here; BSD doesn't, and if you
  808                 * think about it this is right. Otherwise apps have to
  809                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  810                 * are treated in BSD as hints.
  811                 */
 812                val = min_t(u32, val, sysctl_rmem_max);
 813set_rcvbuf:
 814                /* Ensure val * 2 fits into an int, to prevent max_t()
 815                 * from treating it as a negative value.
 816                 */
 817                val = min_t(int, val, INT_MAX / 2);
 818                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 819                /*
 820                 * We double it on the way in to account for
 821                 * "struct sk_buff" etc. overhead.   Applications
 822                 * assume that the SO_RCVBUF setting they make will
 823                 * allow that much actual data to be received on that
 824                 * socket.
 825                 *
 826                 * Applications are unaware that "struct sk_buff" and
 827                 * other overheads allocate from the receive buffer
 828                 * during socket buffer allocation.
 829                 *
 830                 * And after considering the possible alternatives,
 831                 * returning the value we actually used in getsockopt
 832                 * is the most desirable behavior.
 833                 */
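                /* Worked example for the doubling above (illustrative): a
                 * setsockopt(SO_RCVBUF) of 65536 stores sk_rcvbuf = 131072,
                 * and a later getsockopt(SO_RCVBUF) reports that doubled
                 * 131072 back to the application.
                 */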
 834                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 835                break;
 836
 837        case SO_RCVBUFFORCE:
 838                if (!capable(CAP_NET_ADMIN)) {
 839                        ret = -EPERM;
 840                        break;
 841                }
 842
 843                /* No negative values (to prevent underflow, as val will be
 844                 * multiplied by 2).
 845                 */
 846                if (val < 0)
 847                        val = 0;
 848                goto set_rcvbuf;
 849
 850        case SO_KEEPALIVE:
 851                if (sk->sk_prot->keepalive)
 852                        sk->sk_prot->keepalive(sk, valbool);
 853                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 854                break;
 855
 856        case SO_OOBINLINE:
 857                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 858                break;
 859
 860        case SO_NO_CHECK:
 861                sk->sk_no_check_tx = valbool;
 862                break;
 863
 864        case SO_PRIORITY:
 865                if ((val >= 0 && val <= 6) ||
 866                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 867                        sk->sk_priority = val;
 868                else
 869                        ret = -EPERM;
 870                break;
 871
 872        case SO_LINGER:
 873                if (optlen < sizeof(ling)) {
 874                        ret = -EINVAL;  /* 1003.1g */
 875                        break;
 876                }
 877                if (copy_from_user(&ling, optval, sizeof(ling))) {
 878                        ret = -EFAULT;
 879                        break;
 880                }
 881                if (!ling.l_onoff)
 882                        sock_reset_flag(sk, SOCK_LINGER);
 883                else {
 884#if (BITS_PER_LONG == 32)
 885                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 886                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 887                        else
 888#endif
 889                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 890                        sock_set_flag(sk, SOCK_LINGER);
 891                }
 892                break;
 893
 894        case SO_BSDCOMPAT:
 895                sock_warn_obsolete_bsdism("setsockopt");
 896                break;
 897
 898        case SO_PASSCRED:
 899                if (valbool)
 900                        set_bit(SOCK_PASSCRED, &sock->flags);
 901                else
 902                        clear_bit(SOCK_PASSCRED, &sock->flags);
 903                break;
 904
 905        case SO_TIMESTAMP_OLD:
 906        case SO_TIMESTAMP_NEW:
 907        case SO_TIMESTAMPNS_OLD:
 908        case SO_TIMESTAMPNS_NEW:
 909                if (valbool)  {
 910                        if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
 911                                sock_set_flag(sk, SOCK_TSTAMP_NEW);
 912                        else
 913                                sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 914
 915                        if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
 916                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 917                        else
 918                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 919                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 920                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 921                } else {
 922                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 923                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 924                        sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 925                }
 926                break;
 927
 928        case SO_TIMESTAMPING_NEW:
 929                sock_set_flag(sk, SOCK_TSTAMP_NEW);
 930                /* fall through */
 931        case SO_TIMESTAMPING_OLD:
 932                if (val & ~SOF_TIMESTAMPING_MASK) {
 933                        ret = -EINVAL;
 934                        break;
 935                }
 936
 937                if (val & SOF_TIMESTAMPING_OPT_ID &&
 938                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 939                        if (sk->sk_protocol == IPPROTO_TCP &&
 940                            sk->sk_type == SOCK_STREAM) {
 941                                if ((1 << sk->sk_state) &
 942                                    (TCPF_CLOSE | TCPF_LISTEN)) {
 943                                        ret = -EINVAL;
 944                                        break;
 945                                }
 946                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 947                        } else {
 948                                sk->sk_tskey = 0;
 949                        }
 950                }
 951
 952                if (val & SOF_TIMESTAMPING_OPT_STATS &&
 953                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 954                        ret = -EINVAL;
 955                        break;
 956                }
 957
 958                sk->sk_tsflags = val;
 959                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 960                        sock_enable_timestamp(sk,
 961                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 962                else {
 963                        if (optname == SO_TIMESTAMPING_NEW)
 964                                sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 965
 966                        sock_disable_timestamp(sk,
 967                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 968                }
 969                break;
 970
 971        case SO_RCVLOWAT:
 972                if (val < 0)
 973                        val = INT_MAX;
 974                if (sock->ops->set_rcvlowat)
 975                        ret = sock->ops->set_rcvlowat(sk, val);
 976                else
 977                        sk->sk_rcvlowat = val ? : 1;
 978                break;
 979
 980        case SO_RCVTIMEO_OLD:
 981        case SO_RCVTIMEO_NEW:
 982                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
 983                break;
 984
 985        case SO_SNDTIMEO_OLD:
 986        case SO_SNDTIMEO_NEW:
 987                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
 988                break;
 989
 990        case SO_ATTACH_FILTER:
 991                ret = -EINVAL;
 992                if (optlen == sizeof(struct sock_fprog)) {
 993                        struct sock_fprog fprog;
 994
 995                        ret = -EFAULT;
 996                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 997                                break;
 998
 999                        ret = sk_attach_filter(&fprog, sk);
1000                }
1001                break;
1002
1003        case SO_ATTACH_BPF:
1004                ret = -EINVAL;
1005                if (optlen == sizeof(u32)) {
1006                        u32 ufd;
1007
1008                        ret = -EFAULT;
1009                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
1010                                break;
1011
1012                        ret = sk_attach_bpf(ufd, sk);
1013                }
1014                break;
1015
1016        case SO_ATTACH_REUSEPORT_CBPF:
1017                ret = -EINVAL;
1018                if (optlen == sizeof(struct sock_fprog)) {
1019                        struct sock_fprog fprog;
1020
1021                        ret = -EFAULT;
1022                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
1023                                break;
1024
1025                        ret = sk_reuseport_attach_filter(&fprog, sk);
1026                }
1027                break;
1028
1029        case SO_ATTACH_REUSEPORT_EBPF:
1030                ret = -EINVAL;
1031                if (optlen == sizeof(u32)) {
1032                        u32 ufd;
1033
1034                        ret = -EFAULT;
1035                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
1036                                break;
1037
1038                        ret = sk_reuseport_attach_bpf(ufd, sk);
1039                }
1040                break;
1041
1042        case SO_DETACH_REUSEPORT_BPF:
1043                ret = reuseport_detach_prog(sk);
1044                break;
1045
1046        case SO_DETACH_FILTER:
1047                ret = sk_detach_filter(sk);
1048                break;
1049
1050        case SO_LOCK_FILTER:
1051                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1052                        ret = -EPERM;
1053                else
1054                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1055                break;
1056
1057        case SO_PASSSEC:
1058                if (valbool)
1059                        set_bit(SOCK_PASSSEC, &sock->flags);
1060                else
1061                        clear_bit(SOCK_PASSSEC, &sock->flags);
1062                break;
1063        case SO_MARK:
1064                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1065                        ret = -EPERM;
1066                } else if (val != sk->sk_mark) {
1067                        sk->sk_mark = val;
1068                        sk_dst_reset(sk);
1069                }
1070                break;
1071
1072        case SO_RXQ_OVFL:
1073                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1074                break;
1075
1076        case SO_WIFI_STATUS:
1077                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1078                break;
1079
1080        case SO_PEEK_OFF:
1081                if (sock->ops->set_peek_off)
1082                        ret = sock->ops->set_peek_off(sk, val);
1083                else
1084                        ret = -EOPNOTSUPP;
1085                break;
1086
1087        case SO_NOFCS:
1088                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1089                break;
1090
1091        case SO_SELECT_ERR_QUEUE:
1092                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1093                break;
1094
1095#ifdef CONFIG_NET_RX_BUSY_POLL
1096        case SO_BUSY_POLL:
1097                /* allow unprivileged users to decrease the value */
1098                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1099                        ret = -EPERM;
1100                else {
1101                        if (val < 0)
1102                                ret = -EINVAL;
1103                        else
1104                                sk->sk_ll_usec = val;
1105                }
1106                break;
1107#endif
1108
1109        case SO_MAX_PACING_RATE:
1110                {
1111                unsigned long ulval = (val == ~0U) ? ~0UL : val;
1112
1113                if (sizeof(ulval) != sizeof(val) &&
1114                    optlen >= sizeof(ulval) &&
1115                    get_user(ulval, (unsigned long __user *)optval)) {
1116                        ret = -EFAULT;
1117                        break;
1118                }
1119                if (ulval != ~0UL)
1120                        cmpxchg(&sk->sk_pacing_status,
1121                                SK_PACING_NONE,
1122                                SK_PACING_NEEDED);
1123                sk->sk_max_pacing_rate = ulval;
1124                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1125                break;
1126                }
1127        case SO_INCOMING_CPU:
1128                sk->sk_incoming_cpu = val;
1129                break;
1130
1131        case SO_CNX_ADVICE:
1132                if (val == 1)
1133                        dst_negative_advice(sk);
1134                break;
1135
1136        case SO_ZEROCOPY:
1137                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1138                        if (!((sk->sk_type == SOCK_STREAM &&
1139                               sk->sk_protocol == IPPROTO_TCP) ||
1140                              (sk->sk_type == SOCK_DGRAM &&
1141                               sk->sk_protocol == IPPROTO_UDP)))
1142                                ret = -ENOTSUPP;
1143                } else if (sk->sk_family != PF_RDS) {
1144                        ret = -ENOTSUPP;
1145                }
1146                if (!ret) {
1147                        if (val < 0 || val > 1)
1148                                ret = -EINVAL;
1149                        else
1150                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1151                }
1152                break;
1153
1154        case SO_TXTIME:
1155                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1156                        ret = -EPERM;
1157                } else if (optlen != sizeof(struct sock_txtime)) {
1158                        ret = -EINVAL;
1159                } else if (copy_from_user(&sk_txtime, optval,
1160                           sizeof(struct sock_txtime))) {
1161                        ret = -EFAULT;
1162                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1163                        ret = -EINVAL;
1164                } else {
1165                        sock_valbool_flag(sk, SOCK_TXTIME, true);
1166                        sk->sk_clockid = sk_txtime.clockid;
1167                        sk->sk_txtime_deadline_mode =
1168                                !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1169                        sk->sk_txtime_report_errors =
1170                                !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1171                }
1172                break;
1173
1174        case SO_BINDTOIFINDEX:
1175                ret = sock_setbindtodevice_locked(sk, val);
1176                break;
1177
1178        default:
1179                ret = -ENOPROTOOPT;
1180                break;
1181        }
1182        release_sock(sk);
1183        return ret;
1184}
1185EXPORT_SYMBOL(sock_setsockopt);
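
/* Illustrative userspace use of the generic option path above (a sketch):
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 *
 * sets SOCK_LINGER and stores sk->sk_lingertime = 5 * HZ jiffies, so
 * close() will wait up to five seconds for queued data to drain.
 */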
1186
1187
1188static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1189                          struct ucred *ucred)
1190{
1191        ucred->pid = pid_vnr(pid);
1192        ucred->uid = ucred->gid = -1;
1193        if (cred) {
1194                struct user_namespace *current_ns = current_user_ns();
1195
1196                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1197                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1198        }
1199}
1200
1201static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1202{
1203        struct user_namespace *user_ns = current_user_ns();
1204        int i;
1205
1206        for (i = 0; i < src->ngroups; i++)
1207                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1208                        return -EFAULT;
1209
1210        return 0;
1211}
1212
1213int sock_getsockopt(struct socket *sock, int level, int optname,
1214                    char __user *optval, int __user *optlen)
1215{
1216        struct sock *sk = sock->sk;
1217
1218        union {
1219                int val;
1220                u64 val64;
1221                unsigned long ulval;
1222                struct linger ling;
1223                struct old_timeval32 tm32;
1224                struct __kernel_old_timeval tm;
1225                struct  __kernel_sock_timeval stm;
1226                struct sock_txtime txtime;
1227        } v;
1228
1229        int lv = sizeof(int);
1230        int len;
1231
1232        if (get_user(len, optlen))
1233                return -EFAULT;
1234        if (len < 0)
1235                return -EINVAL;
1236
1237        memset(&v, 0, sizeof(v));
1238
1239        switch (optname) {
1240        case SO_DEBUG:
1241                v.val = sock_flag(sk, SOCK_DBG);
1242                break;
1243
1244        case SO_DONTROUTE:
1245                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1246                break;
1247
1248        case SO_BROADCAST:
1249                v.val = sock_flag(sk, SOCK_BROADCAST);
1250                break;
1251
1252        case SO_SNDBUF:
1253                v.val = sk->sk_sndbuf;
1254                break;
1255
1256        case SO_RCVBUF:
1257                v.val = sk->sk_rcvbuf;
1258                break;
1259
1260        case SO_REUSEADDR:
1261                v.val = sk->sk_reuse;
1262                break;
1263
1264        case SO_REUSEPORT:
1265                v.val = sk->sk_reuseport;
1266                break;
1267
1268        case SO_KEEPALIVE:
1269                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1270                break;
1271
1272        case SO_TYPE:
1273                v.val = sk->sk_type;
1274                break;
1275
1276        case SO_PROTOCOL:
1277                v.val = sk->sk_protocol;
1278                break;
1279
1280        case SO_DOMAIN:
1281                v.val = sk->sk_family;
1282                break;
1283
1284        case SO_ERROR:
1285                v.val = -sock_error(sk);
1286                if (v.val == 0)
1287                        v.val = xchg(&sk->sk_err_soft, 0);
1288                break;
1289
1290        case SO_OOBINLINE:
1291                v.val = sock_flag(sk, SOCK_URGINLINE);
1292                break;
1293
1294        case SO_NO_CHECK:
1295                v.val = sk->sk_no_check_tx;
1296                break;
1297
1298        case SO_PRIORITY:
1299                v.val = sk->sk_priority;
1300                break;
1301
1302        case SO_LINGER:
1303                lv              = sizeof(v.ling);
1304                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1305                v.ling.l_linger = sk->sk_lingertime / HZ;
1306                break;
1307
1308        case SO_BSDCOMPAT:
1309                sock_warn_obsolete_bsdism("getsockopt");
1310                break;
1311
1312        case SO_TIMESTAMP_OLD:
1313                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1314                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1315                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1316                break;
1317
1318        case SO_TIMESTAMPNS_OLD:
1319                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1320                break;
1321
1322        case SO_TIMESTAMP_NEW:
1323                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1324                break;
1325
1326        case SO_TIMESTAMPNS_NEW:
1327                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1328                break;
1329
1330        case SO_TIMESTAMPING_OLD:
1331                v.val = sk->sk_tsflags;
1332                break;
1333
1334        case SO_RCVTIMEO_OLD:
1335        case SO_RCVTIMEO_NEW:
1336                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1337                break;
1338
1339        case SO_SNDTIMEO_OLD:
1340        case SO_SNDTIMEO_NEW:
1341                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1342                break;
1343
1344        case SO_RCVLOWAT:
1345                v.val = sk->sk_rcvlowat;
1346                break;
1347
1348        case SO_SNDLOWAT:
1349                v.val = 1;
1350                break;
1351
1352        case SO_PASSCRED:
1353                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1354                break;
1355
1356        case SO_PEERCRED:
1357        {
1358                struct ucred peercred;
1359                if (len > sizeof(peercred))
1360                        len = sizeof(peercred);
1361                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1362                if (copy_to_user(optval, &peercred, len))
1363                        return -EFAULT;
1364                goto lenout;
1365        }
1366
1367        case SO_PEERGROUPS:
1368        {
1369                int ret, n;
1370
1371                if (!sk->sk_peer_cred)
1372                        return -ENODATA;
1373
1374                n = sk->sk_peer_cred->group_info->ngroups;
1375                if (len < n * sizeof(gid_t)) {
1376                        len = n * sizeof(gid_t);
1377                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1378                }
1379                len = n * sizeof(gid_t);
1380
1381                ret = groups_to_user((gid_t __user *)optval,
1382                                     sk->sk_peer_cred->group_info);
1383                if (ret)
1384                        return ret;
1385                goto lenout;
1386        }
1387
1388        case SO_PEERNAME:
1389        {
1390                char address[128];
1391
1392                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1393                if (lv < 0)
1394                        return -ENOTCONN;
1395                if (lv < len)
1396                        return -EINVAL;
1397                if (copy_to_user(optval, address, len))
1398                        return -EFAULT;
1399                goto lenout;
1400        }
1401
1402        /* Dubious BSD thing... Probably nobody even uses it, but
1403         * the UNIX standard wants it for whatever reason... -DaveM
1404         */
1405        case SO_ACCEPTCONN:
1406                v.val = sk->sk_state == TCP_LISTEN;
1407                break;
1408
1409        case SO_PASSSEC:
1410                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1411                break;
1412
1413        case SO_PEERSEC:
1414                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1415
1416        case SO_MARK:
1417                v.val = sk->sk_mark;
1418                break;
1419
1420        case SO_RXQ_OVFL:
1421                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1422                break;
1423
1424        case SO_WIFI_STATUS:
1425                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1426                break;
1427
1428        case SO_PEEK_OFF:
1429                if (!sock->ops->set_peek_off)
1430                        return -EOPNOTSUPP;
1431
1432                v.val = sk->sk_peek_off;
1433                break;
1434        case SO_NOFCS:
1435                v.val = sock_flag(sk, SOCK_NOFCS);
1436                break;
1437
1438        case SO_BINDTODEVICE:
1439                return sock_getbindtodevice(sk, optval, optlen, len);
1440
1441        case SO_GET_FILTER:
1442                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1443                if (len < 0)
1444                        return len;
1445
1446                goto lenout;
1447
1448        case SO_LOCK_FILTER:
1449                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1450                break;
1451
1452        case SO_BPF_EXTENSIONS:
1453                v.val = bpf_tell_extensions();
1454                break;
1455
1456        case SO_SELECT_ERR_QUEUE:
1457                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1458                break;
1459
1460#ifdef CONFIG_NET_RX_BUSY_POLL
1461        case SO_BUSY_POLL:
1462                v.val = sk->sk_ll_usec;
1463                break;
1464#endif
1465
1466        case SO_MAX_PACING_RATE:
1467                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1468                        lv = sizeof(v.ulval);
1469                        v.ulval = sk->sk_max_pacing_rate;
1470                } else {
1471                        /* 32bit version */
1472                        v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1473                }
1474                break;
1475
1476        case SO_INCOMING_CPU:
1477                v.val = sk->sk_incoming_cpu;
1478                break;
1479
1480        case SO_MEMINFO:
1481        {
1482                u32 meminfo[SK_MEMINFO_VARS];
1483
1484                sk_get_meminfo(sk, meminfo);
1485
1486                len = min_t(unsigned int, len, sizeof(meminfo));
1487                if (copy_to_user(optval, &meminfo, len))
1488                        return -EFAULT;
1489
1490                goto lenout;
1491        }
1492
1493#ifdef CONFIG_NET_RX_BUSY_POLL
1494        case SO_INCOMING_NAPI_ID:
1495                v.val = READ_ONCE(sk->sk_napi_id);
1496
1497                /* aggregate non-NAPI IDs down to 0 */
1498                if (v.val < MIN_NAPI_ID)
1499                        v.val = 0;
1500
1501                break;
1502#endif
1503
1504        case SO_COOKIE:
1505                lv = sizeof(u64);
1506                if (len < lv)
1507                        return -EINVAL;
1508                v.val64 = sock_gen_cookie(sk);
1509                break;
1510
1511        case SO_ZEROCOPY:
1512                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1513                break;
1514
1515        case SO_TXTIME:
1516                lv = sizeof(v.txtime);
1517                v.txtime.clockid = sk->sk_clockid;
1518                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1519                                  SOF_TXTIME_DEADLINE_MODE : 0;
1520                v.txtime.flags |= sk->sk_txtime_report_errors ?
1521                                  SOF_TXTIME_REPORT_ERRORS : 0;
1522                break;
1523
1524        case SO_BINDTOIFINDEX:
1525                v.val = sk->sk_bound_dev_if;
1526                break;
1527
1528        default:
1529                /* We implement the SO_SNDLOWAT etc to not be settable
1530                 * (1003.1g 7).
1531                 */
1532                return -ENOPROTOOPT;
1533        }
1534
1535        if (len > lv)
1536                len = lv;
1537        if (copy_to_user(optval, &v, len))
1538                return -EFAULT;
1539lenout:
1540        if (put_user(len, optlen))
1541                return -EFAULT;
1542        return 0;
1543}
1544
1545/*
1546 * Initialize an sk_lock.
1547 *
1548 * (We also register the sk_lock with the lock validator.)
1549 */
1550static inline void sock_lock_init(struct sock *sk)
1551{
1552        if (sk->sk_kern_sock)
1553                sock_lock_init_class_and_name(
1554                        sk,
1555                        af_family_kern_slock_key_strings[sk->sk_family],
1556                        af_family_kern_slock_keys + sk->sk_family,
1557                        af_family_kern_key_strings[sk->sk_family],
1558                        af_family_kern_keys + sk->sk_family);
1559        else
1560                sock_lock_init_class_and_name(
1561                        sk,
1562                        af_family_slock_key_strings[sk->sk_family],
1563                        af_family_slock_keys + sk->sk_family,
1564                        af_family_key_strings[sk->sk_family],
1565                        af_family_keys + sk->sk_family);
1566}
1567
1568/*
1569 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1570 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1571 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1572 */
1573static void sock_copy(struct sock *nsk, const struct sock *osk)
1574{
1575#ifdef CONFIG_SECURITY_NETWORK
1576        void *sptr = nsk->sk_security;
1577#endif
1578        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1579
1580        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1581               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1582
1583#ifdef CONFIG_SECURITY_NETWORK
1584        nsk->sk_security = sptr;
1585        security_sk_clone(osk, nsk);
1586#endif
1587}
1588
1589static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1590                int family)
1591{
1592        struct sock *sk;
1593        struct kmem_cache *slab;
1594
1595        slab = prot->slab;
1596        if (slab != NULL) {
1597                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1598                if (!sk)
1599                        return sk;
1600                if (want_init_on_alloc(priority))
1601                        sk_prot_clear_nulls(sk, prot->obj_size);
1602        } else
1603                sk = kmalloc(prot->obj_size, priority);
1604
1605        if (sk != NULL) {
1606                if (security_sk_alloc(sk, family, priority))
1607                        goto out_free;
1608
1609                if (!try_module_get(prot->owner))
1610                        goto out_free_sec;
1611                sk_tx_queue_clear(sk);
1612        }
1613
1614        return sk;
1615
1616out_free_sec:
1617        security_sk_free(sk);
1618out_free:
1619        if (slab != NULL)
1620                kmem_cache_free(slab, sk);
1621        else
1622                kfree(sk);
1623        return NULL;
1624}
1625
1626static void sk_prot_free(struct proto *prot, struct sock *sk)
1627{
1628        struct kmem_cache *slab;
1629        struct module *owner;
1630
1631        owner = prot->owner;
1632        slab = prot->slab;
1633
1634        cgroup_sk_free(&sk->sk_cgrp_data);
1635        mem_cgroup_sk_free(sk);
1636        security_sk_free(sk);
1637        if (slab != NULL)
1638                kmem_cache_free(slab, sk);
1639        else
1640                kfree(sk);
1641        module_put(owner);
1642}
1643
1644/**
1645 *      sk_alloc - All socket objects are allocated here
1646 *      @net: the applicable net namespace
1647 *      @family: protocol family
1648 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1649 *      @prot: struct proto associated with this new sock instance
1650 *      @kern: is this to be a kernel socket?
1651 */
1652struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1653                      struct proto *prot, int kern)
1654{
1655        struct sock *sk;
1656
1657        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1658        if (sk) {
1659                sk->sk_family = family;
1660                /*
1661                 * See comment in struct sock definition to understand
1662                 * why we need sk_prot_creator -acme
1663                 */
1664                sk->sk_prot = sk->sk_prot_creator = prot;
1665                sk->sk_kern_sock = kern;
1666                sock_lock_init(sk);
1667                sk->sk_net_refcnt = kern ? 0 : 1;
1668                if (likely(sk->sk_net_refcnt)) {
1669                        get_net(net);
1670                        sock_inuse_add(net, 1);
1671                }
1672
1673                sock_net_set(sk, net);
1674                refcount_set(&sk->sk_wmem_alloc, 1);
1675
1676                mem_cgroup_sk_alloc(sk);
1677                cgroup_sk_alloc(&sk->sk_cgrp_data);
1678                sock_update_classid(&sk->sk_cgrp_data);
1679                sock_update_netprioidx(&sk->sk_cgrp_data);
1680        }
1681
1682        return sk;
1683}
1684EXPORT_SYMBOL(sk_alloc);
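/* Editor's sketch (not part of sock.c): how a protocol's ->create() handler
 * typically pairs sk_alloc() with sock_init_data().  The names "my_create",
 * "my_proto" and "my_ops" are hypothetical placeholders; inet_create() and
 * packet_create() are real users of this pattern.
 */
#if 0
static int my_create(struct net *net, struct socket *sock, int protocol,
                     int kern)
{
        struct sock *sk;

        sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock->ops = &my_ops;
        sock_init_data(sock, sk);       /* default callbacks, sk_refcnt = 1 */
        return 0;
}
#endif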
1685
1686/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1687 * grace period. This is the case for UDP sockets and TCP listeners.
1688 */
1689static void __sk_destruct(struct rcu_head *head)
1690{
1691        struct sock *sk = container_of(head, struct sock, sk_rcu);
1692        struct sk_filter *filter;
1693
1694        if (sk->sk_destruct)
1695                sk->sk_destruct(sk);
1696
1697        filter = rcu_dereference_check(sk->sk_filter,
1698                                       refcount_read(&sk->sk_wmem_alloc) == 0);
1699        if (filter) {
1700                sk_filter_uncharge(sk, filter);
1701                RCU_INIT_POINTER(sk->sk_filter, NULL);
1702        }
1703        if (rcu_access_pointer(sk->sk_reuseport_cb))
1704                reuseport_detach_sock(sk);
1705
1706        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1707
1708#ifdef CONFIG_BPF_SYSCALL
1709        bpf_sk_storage_free(sk);
1710#endif
1711
1712        if (atomic_read(&sk->sk_omem_alloc))
1713                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1714                         __func__, atomic_read(&sk->sk_omem_alloc));
1715
1716        if (sk->sk_frag.page) {
1717                put_page(sk->sk_frag.page);
1718                sk->sk_frag.page = NULL;
1719        }
1720
1721        if (sk->sk_peer_cred)
1722                put_cred(sk->sk_peer_cred);
1723        put_pid(sk->sk_peer_pid);
1724        if (likely(sk->sk_net_refcnt))
1725                put_net(sock_net(sk));
1726        sk_prot_free(sk->sk_prot_creator, sk);
1727}
1728
1729void sk_destruct(struct sock *sk)
1730{
1731        if (sock_flag(sk, SOCK_RCU_FREE))
1732                call_rcu(&sk->sk_rcu, __sk_destruct);
1733        else
1734                __sk_destruct(&sk->sk_rcu);
1735}
1736
1737static void __sk_free(struct sock *sk)
1738{
1739        if (likely(sk->sk_net_refcnt))
1740                sock_inuse_add(sock_net(sk), -1);
1741
1742        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1743                sock_diag_broadcast_destroy(sk);
1744        else
1745                sk_destruct(sk);
1746}
1747
1748void sk_free(struct sock *sk)
1749{
1750        /*
1751         * We subtract one from sk_wmem_alloc so we can tell whether
1752         * some packets are still in some tx queue.
1753         * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1754         */
1755        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1756                __sk_free(sk);
1757}
1758EXPORT_SYMBOL(sk_free);
1759
1760static void sk_init_common(struct sock *sk)
1761{
1762        skb_queue_head_init(&sk->sk_receive_queue);
1763        skb_queue_head_init(&sk->sk_write_queue);
1764        skb_queue_head_init(&sk->sk_error_queue);
1765
1766        rwlock_init(&sk->sk_callback_lock);
1767        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1768                        af_rlock_keys + sk->sk_family,
1769                        af_family_rlock_key_strings[sk->sk_family]);
1770        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1771                        af_wlock_keys + sk->sk_family,
1772                        af_family_wlock_key_strings[sk->sk_family]);
1773        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1774                        af_elock_keys + sk->sk_family,
1775                        af_family_elock_key_strings[sk->sk_family]);
1776        lockdep_set_class_and_name(&sk->sk_callback_lock,
1777                        af_callback_keys + sk->sk_family,
1778                        af_family_clock_key_strings[sk->sk_family]);
1779}
1780
1781/**
1782 *      sk_clone_lock - clone a socket, and lock its clone
1783 *      @sk: the socket to clone
1784 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1785 *
1786 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1787 */
1788struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1789{
1790        struct sock *newsk;
1791        bool is_charged = true;
1792
1793        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1794        if (newsk != NULL) {
1795                struct sk_filter *filter;
1796
1797                sock_copy(newsk, sk);
1798
1799                newsk->sk_prot_creator = sk->sk_prot;
1800
1801                /* SANITY */
1802                if (likely(newsk->sk_net_refcnt))
1803                        get_net(sock_net(newsk));
1804                sk_node_init(&newsk->sk_node);
1805                sock_lock_init(newsk);
1806                bh_lock_sock(newsk);
1807                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1808                newsk->sk_backlog.len = 0;
1809
1810                atomic_set(&newsk->sk_rmem_alloc, 0);
1811                /*
1812                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1813                 */
1814                refcount_set(&newsk->sk_wmem_alloc, 1);
1815                atomic_set(&newsk->sk_omem_alloc, 0);
1816                sk_init_common(newsk);
1817
1818                newsk->sk_dst_cache     = NULL;
1819                newsk->sk_dst_pending_confirm = 0;
1820                newsk->sk_wmem_queued   = 0;
1821                newsk->sk_forward_alloc = 0;
1822                atomic_set(&newsk->sk_drops, 0);
1823                newsk->sk_send_head     = NULL;
1824                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1825                atomic_set(&newsk->sk_zckey, 0);
1826
1827                sock_reset_flag(newsk, SOCK_DONE);
1828                mem_cgroup_sk_alloc(newsk);
1829                cgroup_sk_alloc(&newsk->sk_cgrp_data);
1830
1831                rcu_read_lock();
1832                filter = rcu_dereference(sk->sk_filter);
1833                if (filter != NULL)
1834                        /* Though it's an empty new sock, the charging may fail
1835                         * if sysctl_optmem_max was changed between creation of the
1836                         * original socket and the cloning.
1837                         */
1838                        is_charged = sk_filter_charge(newsk, filter);
1839                RCU_INIT_POINTER(newsk->sk_filter, filter);
1840                rcu_read_unlock();
1841
1842                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1843                        /* We need to make sure that we don't uncharge the new
1844                         * socket if we couldn't charge it in the first place
1845                         * as otherwise we uncharge the parent's filter.
1846                         */
1847                        if (!is_charged)
1848                                RCU_INIT_POINTER(newsk->sk_filter, NULL);
1849                        sk_free_unlock_clone(newsk);
1850                        newsk = NULL;
1851                        goto out;
1852                }
1853                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1854#ifdef CONFIG_BPF_SYSCALL
1855                RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
1856#endif
1857
1858                newsk->sk_err      = 0;
1859                newsk->sk_err_soft = 0;
1860                newsk->sk_priority = 0;
1861                newsk->sk_incoming_cpu = raw_smp_processor_id();
1862                if (likely(newsk->sk_net_refcnt))
1863                        sock_inuse_add(sock_net(newsk), 1);
1864
1865                /*
1866                 * Before updating sk_refcnt, we must commit prior changes to memory
1867                 * (Documentation/RCU/rculist_nulls.txt for details)
1868                 */
1869                smp_wmb();
1870                refcount_set(&newsk->sk_refcnt, 2);
1871
1872                /*
1873                 * Increment the counter in the same struct proto as the master
1874                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1875                 * is the same as sk->sk_prot->socks, as this field was copied
1876                 * with memcpy).
1877                 *
1878                 * This _changes_ the previous behaviour, where
1879                 * tcp_create_openreq_child was always incrementing the
1880                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1881                 * to be taken into account by all callers. -acme
1882                 */
1883                sk_refcnt_debug_inc(newsk);
1884                sk_set_socket(newsk, NULL);
1885                RCU_INIT_POINTER(newsk->sk_wq, NULL);
1886
1887                if (newsk->sk_prot->sockets_allocated)
1888                        sk_sockets_allocated_inc(newsk);
1889
1890                if (sock_needs_netstamp(sk) &&
1891                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1892                        net_enable_timestamp();
1893        }
1894out:
1895        return newsk;
1896}
1897EXPORT_SYMBOL_GPL(sk_clone_lock);
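/* Editor's sketch (not part of sock.c): the caller contract noted in the
 * kerneldoc above -- the clone comes back locked with bh_lock_sock(), so the
 * caller must bh_unlock_sock() it on every path.  "my_accept_clone" is a
 * hypothetical caller; inet_csk_clone_lock() is a real one.
 */
#if 0
static struct sock *my_accept_clone(struct sock *sk)
{
        struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

        if (!newsk)
                return NULL;

        /* ... protocol specific initialisation of newsk ... */

        bh_unlock_sock(newsk);          /* mandatory, even on error paths */
        return newsk;
}
#endif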
1898
1899void sk_free_unlock_clone(struct sock *sk)
1900{
1901        /* It is still a raw copy of the parent, so invalidate
1902         * the destructor and do a plain sk_free() */
1903        sk->sk_destruct = NULL;
1904        bh_unlock_sock(sk);
1905        sk_free(sk);
1906}
1907EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1908
1909void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1910{
1911        u32 max_segs = 1;
1912
1913        sk_dst_set(sk, dst);
1914        sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1915        if (sk->sk_route_caps & NETIF_F_GSO)
1916                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1917        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1918        if (sk_can_gso(sk)) {
1919                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1920                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1921                } else {
1922                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1923                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1924                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1925                }
1926        }
1927        sk->sk_gso_max_segs = max_segs;
1928}
1929EXPORT_SYMBOL_GPL(sk_setup_caps);
1930
1931/*
1932 *      Simple resource managers for sockets.
1933 */
1934
1935
1936/*
1937 * Write buffer destructor automatically called from kfree_skb.
1938 */
1939void sock_wfree(struct sk_buff *skb)
1940{
1941        struct sock *sk = skb->sk;
1942        unsigned int len = skb->truesize;
1943
1944        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1945                /*
1946                 * Keep a reference on sk_wmem_alloc; it will be released
1947                 * after the sk_write_space() call.
1948                 */
1949                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1950                sk->sk_write_space(sk);
1951                len = 1;
1952        }
1953        /*
1954         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1955         * could not do because of in-flight packets
1956         */
1957        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1958                __sk_free(sk);
1959}
1960EXPORT_SYMBOL(sock_wfree);
1961
1962/* This variant of sock_wfree() is used by TCP,
1963 * since it sets SOCK_USE_WRITE_QUEUE.
1964 */
1965void __sock_wfree(struct sk_buff *skb)
1966{
1967        struct sock *sk = skb->sk;
1968
1969        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1970                __sk_free(sk);
1971}
1972
1973void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1974{
1975        skb_orphan(skb);
1976        skb->sk = sk;
1977#ifdef CONFIG_INET
1978        if (unlikely(!sk_fullsock(sk))) {
1979                skb->destructor = sock_edemux;
1980                sock_hold(sk);
1981                return;
1982        }
1983#endif
1984        skb->destructor = sock_wfree;
1985        skb_set_hash_from_sk(skb, sk);
1986        /*
1987         * We used to take a refcount on sk, but the following operation
1988         * is enough to guarantee sk_free() won't free this sock until
1989         * all in-flight packets are completed.
1990         */
1991        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1992}
1993EXPORT_SYMBOL(skb_set_owner_w);
1994
1995static bool can_skb_orphan_partial(const struct sk_buff *skb)
1996{
1997#ifdef CONFIG_TLS_DEVICE
1998        /* Drivers depend on in-order delivery for crypto offload;
1999         * a partial orphan breaks the out-of-order-OK logic.
2000         */
2001        if (skb->decrypted)
2002                return false;
2003#endif
2004        return (skb->destructor == sock_wfree ||
2005                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2006}
2007
2008/* This helper is used by netem, as it can hold packets in its
2009 * delay queue. We want to allow the owner socket to send more
2010 * packets, as if they were already TX completed by a typical driver.
2011 * But we also want to keep skb->sk set because some packet schedulers
2012 * rely on it (sch_fq for example).
2013 */
2014void skb_orphan_partial(struct sk_buff *skb)
2015{
2016        if (skb_is_tcp_pure_ack(skb))
2017                return;
2018
2019        if (can_skb_orphan_partial(skb)) {
2020                struct sock *sk = skb->sk;
2021
2022                if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2023                        WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2024                        skb->destructor = sock_efree;
2025                }
2026        } else {
2027                skb_orphan(skb);
2028        }
2029}
2030EXPORT_SYMBOL(skb_orphan_partial);
2031
2032/*
2033 * Read buffer destructor automatically called from kfree_skb.
2034 */
2035void sock_rfree(struct sk_buff *skb)
2036{
2037        struct sock *sk = skb->sk;
2038        unsigned int len = skb->truesize;
2039
2040        atomic_sub(len, &sk->sk_rmem_alloc);
2041        sk_mem_uncharge(sk, len);
2042}
2043EXPORT_SYMBOL(sock_rfree);
2044
2045/*
2046 * Buffer destructor for skbs that are not used directly in read or write
2047 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2048 */
2049void sock_efree(struct sk_buff *skb)
2050{
2051        sock_put(skb->sk);
2052}
2053EXPORT_SYMBOL(sock_efree);
2054
2055kuid_t sock_i_uid(struct sock *sk)
2056{
2057        kuid_t uid;
2058
2059        read_lock_bh(&sk->sk_callback_lock);
2060        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2061        read_unlock_bh(&sk->sk_callback_lock);
2062        return uid;
2063}
2064EXPORT_SYMBOL(sock_i_uid);
2065
2066unsigned long sock_i_ino(struct sock *sk)
2067{
2068        unsigned long ino;
2069
2070        read_lock_bh(&sk->sk_callback_lock);
2071        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2072        read_unlock_bh(&sk->sk_callback_lock);
2073        return ino;
2074}
2075EXPORT_SYMBOL(sock_i_ino);
2076
2077/*
2078 * Allocate a skb from the socket's send buffer.
2079 */
2080struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2081                             gfp_t priority)
2082{
2083        if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2084                struct sk_buff *skb = alloc_skb(size, priority);
2085                if (skb) {
2086                        skb_set_owner_w(skb, sk);
2087                        return skb;
2088                }
2089        }
2090        return NULL;
2091}
2092EXPORT_SYMBOL(sock_wmalloc);
2093
2094static void sock_ofree(struct sk_buff *skb)
2095{
2096        struct sock *sk = skb->sk;
2097
2098        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2099}
2100
2101struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2102                             gfp_t priority)
2103{
2104        struct sk_buff *skb;
2105
2106        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2107        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2108            sysctl_optmem_max)
2109                return NULL;
2110
2111        skb = alloc_skb(size, priority);
2112        if (!skb)
2113                return NULL;
2114
2115        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2116        skb->sk = sk;
2117        skb->destructor = sock_ofree;
2118        return skb;
2119}
2120
2121/*
2122 * Allocate a memory block from the socket's option memory buffer.
2123 */
2124void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2125{
2126        if ((unsigned int)size <= sysctl_optmem_max &&
2127            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2128                void *mem;
2129                /* First do the add, to avoid the race if kmalloc
2130                 * might sleep.
2131                 */
2132                atomic_add(size, &sk->sk_omem_alloc);
2133                mem = kmalloc(size, priority);
2134                if (mem)
2135                        return mem;
2136                atomic_sub(size, &sk->sk_omem_alloc);
2137        }
2138        return NULL;
2139}
2140EXPORT_SYMBOL(sock_kmalloc);
2141
2142/* Free an option memory block. Note, we actually want the inline
2143 * here as this allows gcc to detect the nullify and fold away the
2144 * condition entirely.
2145 */
2146static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2147                                  const bool nullify)
2148{
2149        if (WARN_ON_ONCE(!mem))
2150                return;
2151        if (nullify)
2152                kzfree(mem);
2153        else
2154                kfree(mem);
2155        atomic_sub(size, &sk->sk_omem_alloc);
2156}
2157
2158void sock_kfree_s(struct sock *sk, void *mem, int size)
2159{
2160        __sock_kfree_s(sk, mem, size, false);
2161}
2162EXPORT_SYMBOL(sock_kfree_s);
2163
2164void sock_kzfree_s(struct sock *sk, void *mem, int size)
2165{
2166        __sock_kfree_s(sk, mem, size, true);
2167}
2168EXPORT_SYMBOL(sock_kzfree_s);
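/* Editor's sketch (not part of sock.c): option memory obtained with
 * sock_kmalloc() must be released with sock_kfree_s() (or sock_kzfree_s()
 * for sensitive data), passing the same size, so the sk_omem_alloc charge
 * stays balanced.  "my_set_option" is a hypothetical helper.
 */
#if 0
static int my_set_option(struct sock *sk, char __user *optval, int optlen)
{
        void *opt;

        opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
        if (!opt)
                return -ENOMEM;

        if (copy_from_user(opt, optval, optlen)) {
                sock_kfree_s(sk, opt, optlen);  /* uncharge on error, too */
                return -EFAULT;
        }

        /* ... install the option ... */

        sock_kfree_s(sk, opt, optlen);          /* uncharges sk_omem_alloc */
        return 0;
}
#endif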
2169
2170/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2171   I think these locks should be removed for datagram sockets.
2172 */
2173static long sock_wait_for_wmem(struct sock *sk, long timeo)
2174{
2175        DEFINE_WAIT(wait);
2176
2177        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2178        for (;;) {
2179                if (!timeo)
2180                        break;
2181                if (signal_pending(current))
2182                        break;
2183                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2184                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2185                if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2186                        break;
2187                if (sk->sk_shutdown & SEND_SHUTDOWN)
2188                        break;
2189                if (sk->sk_err)
2190                        break;
2191                timeo = schedule_timeout(timeo);
2192        }
2193        finish_wait(sk_sleep(sk), &wait);
2194        return timeo;
2195}
2196
2197
2198/*
2199 *      Generic send/receive buffer handlers
2200 */
2201
2202struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2203                                     unsigned long data_len, int noblock,
2204                                     int *errcode, int max_page_order)
2205{
2206        struct sk_buff *skb;
2207        long timeo;
2208        int err;
2209
2210        timeo = sock_sndtimeo(sk, noblock);
2211        for (;;) {
2212                err = sock_error(sk);
2213                if (err != 0)
2214                        goto failure;
2215
2216                err = -EPIPE;
2217                if (sk->sk_shutdown & SEND_SHUTDOWN)
2218                        goto failure;
2219
2220                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2221                        break;
2222
2223                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2224                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2225                err = -EAGAIN;
2226                if (!timeo)
2227                        goto failure;
2228                if (signal_pending(current))
2229                        goto interrupted;
2230                timeo = sock_wait_for_wmem(sk, timeo);
2231        }
2232        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2233                                   errcode, sk->sk_allocation);
2234        if (skb)
2235                skb_set_owner_w(skb, sk);
2236        return skb;
2237
2238interrupted:
2239        err = sock_intr_errno(timeo);
2240failure:
2241        *errcode = err;
2242        return NULL;
2243}
2244EXPORT_SYMBOL(sock_alloc_send_pskb);
2245
2246struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2247                                    int noblock, int *errcode)
2248{
2249        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2250}
2251EXPORT_SYMBOL(sock_alloc_send_skb);
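/* Editor's sketch (not part of sock.c): a datagram sendmsg path usually lets
 * sock_alloc_send_skb() do the sndbuf accounting and the blocking.
 * "my_sendmsg" is hypothetical; packet_snd() and unix_dgram_sendmsg() are
 * real examples of this pattern.
 */
#if 0
static int my_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct sk_buff *skb;
        int err;

        skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
                                  msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb)
                return err;

        skb_reserve(skb, MAX_HEADER);
        err = memcpy_from_msg(skb_put(skb, len), msg, len);
        if (err) {
                kfree_skb(skb);
                return err;
        }

        /* ... hand skb to the transmit path ... */
        return len;
}
#endif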
2252
2253int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2254                     struct sockcm_cookie *sockc)
2255{
2256        u32 tsflags;
2257
2258        switch (cmsg->cmsg_type) {
2259        case SO_MARK:
2260                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2261                        return -EPERM;
2262                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2263                        return -EINVAL;
2264                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2265                break;
2266        case SO_TIMESTAMPING_OLD:
2267                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2268                        return -EINVAL;
2269
2270                tsflags = *(u32 *)CMSG_DATA(cmsg);
2271                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2272                        return -EINVAL;
2273
2274                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2275                sockc->tsflags |= tsflags;
2276                break;
2277        case SCM_TXTIME:
2278                if (!sock_flag(sk, SOCK_TXTIME))
2279                        return -EINVAL;
2280                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2281                        return -EINVAL;
2282                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2283                break;
2284        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2285        case SCM_RIGHTS:
2286        case SCM_CREDENTIALS:
2287                break;
2288        default:
2289                return -EINVAL;
2290        }
2291        return 0;
2292}
2293EXPORT_SYMBOL(__sock_cmsg_send);
2294
2295int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2296                   struct sockcm_cookie *sockc)
2297{
2298        struct cmsghdr *cmsg;
2299        int ret;
2300
2301        for_each_cmsghdr(cmsg, msg) {
2302                if (!CMSG_OK(msg, cmsg))
2303                        return -EINVAL;
2304                if (cmsg->cmsg_level != SOL_SOCKET)
2305                        continue;
2306                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2307                if (ret)
2308                        return ret;
2309        }
2310        return 0;
2311}
2312EXPORT_SYMBOL(sock_cmsg_send);
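/* Editor's sketch (not part of sock.c): a protocol sendmsg handler initialises
 * a sockcm_cookie from the socket defaults and then lets sock_cmsg_send()
 * override it from SOL_SOCKET control messages (SO_MARK, SO_TIMESTAMPING,
 * SCM_TXTIME).  udp_sendmsg() follows this pattern; "my_sendmsg_cmsg" is
 * hypothetical.
 */
#if 0
static int my_sendmsg_cmsg(struct sock *sk, struct msghdr *msg)
{
        struct sockcm_cookie sockc;
        int err;

        sockcm_init(&sockc, sk);        /* mark/tsflags from the socket */
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (unlikely(err))
                        return err;
        }

        /* ... use sockc.mark / sockc.tsflags / sockc.transmit_time ... */
        return 0;
}
#endif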
2313
2314static void sk_enter_memory_pressure(struct sock *sk)
2315{
2316        if (!sk->sk_prot->enter_memory_pressure)
2317                return;
2318
2319        sk->sk_prot->enter_memory_pressure(sk);
2320}
2321
2322static void sk_leave_memory_pressure(struct sock *sk)
2323{
2324        if (sk->sk_prot->leave_memory_pressure) {
2325                sk->sk_prot->leave_memory_pressure(sk);
2326        } else {
2327                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2328
2329                if (memory_pressure && *memory_pressure)
2330                        *memory_pressure = 0;
2331        }
2332}
2333
2334/* On 32bit arches, an skb frag is limited to 2^15 */
2335#define SKB_FRAG_PAGE_ORDER     get_order(32768)
2336DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2337
2338/**
2339 * skb_page_frag_refill - check that a page_frag contains enough room
2340 * @sz: minimum size of the fragment we want to get
2341 * @pfrag: pointer to page_frag
2342 * @gfp: priority for memory allocation
2343 *
2344 * Note: While this allocator tries to use high order pages, there is
2345 * no guarantee that allocations succeed. Therefore, @sz MUST be
2346 * less than or equal to PAGE_SIZE.
2347 */
2348bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2349{
2350        if (pfrag->page) {
2351                if (page_ref_count(pfrag->page) == 1) {
2352                        pfrag->offset = 0;
2353                        return true;
2354                }
2355                if (pfrag->offset + sz <= pfrag->size)
2356                        return true;
2357                put_page(pfrag->page);
2358        }
2359
2360        pfrag->offset = 0;
2361        if (SKB_FRAG_PAGE_ORDER &&
2362            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2363                /* Avoid direct reclaim but allow kswapd to wake */
2364                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2365                                          __GFP_COMP | __GFP_NOWARN |
2366                                          __GFP_NORETRY,
2367                                          SKB_FRAG_PAGE_ORDER);
2368                if (likely(pfrag->page)) {
2369                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2370                        return true;
2371                }
2372        }
2373        pfrag->page = alloc_page(gfp);
2374        if (likely(pfrag->page)) {
2375                pfrag->size = PAGE_SIZE;
2376                return true;
2377        }
2378        return false;
2379}
2380EXPORT_SYMBOL(skb_page_frag_refill);
2381
2382bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2383{
2384        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2385                return true;
2386
2387        sk_enter_memory_pressure(sk);
2388        sk_stream_moderate_sndbuf(sk);
2389        return false;
2390}
2391EXPORT_SYMBOL(sk_page_frag_refill);
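/* Editor's sketch (not part of sock.c): the usual producer loop around
 * sk_page_frag_refill(), similar to what tcp_sendmsg_locked() does.  The
 * fragment is shared, so each chunk handed to an skb takes a page reference
 * and advances pfrag->offset.  "my_append" is hypothetical.
 */
#if 0
static int my_append(struct sock *sk, struct iov_iter *from, int bytes)
{
        struct page_frag *pfrag = sk_page_frag(sk);
        int copy;

        if (!sk_page_frag_refill(sk, pfrag))
                return -ENOMEM;                 /* caller waits for memory */

        copy = min_t(int, bytes, pfrag->size - pfrag->offset);
        if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
                           copy, from) != copy)
                return -EFAULT;

        get_page(pfrag->page);                  /* reference for the skb frag */
        /* ... attach pfrag->page / pfrag->offset / copy to an skb ... */
        pfrag->offset += copy;
        return copy;
}
#endif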
2392
2393static void __lock_sock(struct sock *sk)
2394        __releases(&sk->sk_lock.slock)
2395        __acquires(&sk->sk_lock.slock)
2396{
2397        DEFINE_WAIT(wait);
2398
2399        for (;;) {
2400                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2401                                        TASK_UNINTERRUPTIBLE);
2402                spin_unlock_bh(&sk->sk_lock.slock);
2403                schedule();
2404                spin_lock_bh(&sk->sk_lock.slock);
2405                if (!sock_owned_by_user(sk))
2406                        break;
2407        }
2408        finish_wait(&sk->sk_lock.wq, &wait);
2409}
2410
2411void __release_sock(struct sock *sk)
2412        __releases(&sk->sk_lock.slock)
2413        __acquires(&sk->sk_lock.slock)
2414{
2415        struct sk_buff *skb, *next;
2416
2417        while ((skb = sk->sk_backlog.head) != NULL) {
2418                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2419
2420                spin_unlock_bh(&sk->sk_lock.slock);
2421
2422                do {
2423                        next = skb->next;
2424                        prefetch(next);
2425                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2426                        skb_mark_not_on_list(skb);
2427                        sk_backlog_rcv(sk, skb);
2428
2429                        cond_resched();
2430
2431                        skb = next;
2432                } while (skb != NULL);
2433
2434                spin_lock_bh(&sk->sk_lock.slock);
2435        }
2436
2437        /*
2438         * Doing the zeroing here guarantees we cannot loop forever
2439         * while a wild producer attempts to flood us.
2440         */
2441        sk->sk_backlog.len = 0;
2442}
2443
2444void __sk_flush_backlog(struct sock *sk)
2445{
2446        spin_lock_bh(&sk->sk_lock.slock);
2447        __release_sock(sk);
2448        spin_unlock_bh(&sk->sk_lock.slock);
2449}
2450
2451/**
2452 * sk_wait_data - wait for data to arrive at sk_receive_queue
2453 * @sk:    sock to wait on
2454 * @timeo: for how long
2455 * @skb:   last skb seen on sk_receive_queue
2456 *
2457 * Now socket state, including sk->sk_err, is changed only under the lock,
2458 * hence we may omit checks after joining the wait queue.
2459 * We check the receive queue before schedule() only as an optimization;
2460 * it is very likely that release_sock() added new data.
2461 */
2462int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2463{
2464        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2465        int rc;
2466
2467        add_wait_queue(sk_sleep(sk), &wait);
2468        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2469        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2470        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2471        remove_wait_queue(sk_sleep(sk), &wait);
2472        return rc;
2473}
2474EXPORT_SYMBOL(sk_wait_data);
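/* Editor's sketch (not part of sock.c): sk_wait_data() is called with the
 * socket lock held from a recvmsg() implementation, typically in a loop that
 * re-checks the receive queue after every wakeup.  tcp_recvmsg() and
 * dccp_recvmsg() use this pattern; "my_wait_for_data" is hypothetical.
 */
#if 0
static struct sk_buff *my_wait_for_data(struct sock *sk, int nonblock, int *err)
{
        long timeo = sock_rcvtimeo(sk, nonblock);
        struct sk_buff *skb;

        while (!(skb = skb_peek(&sk->sk_receive_queue))) {
                if (sk->sk_err || !timeo) {
                        *err = sk->sk_err ? sock_error(sk) : -EAGAIN;
                        return NULL;
                }
                if (signal_pending(current)) {
                        *err = sock_intr_errno(timeo);
                        return NULL;
                }
                sk_wait_data(sk, &timeo, NULL); /* drops and retakes sk lock */
        }
        return skb;
}
#endif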
2475
2476/**
2477 *      __sk_mem_raise_allocated - increase memory_allocated
2478 *      @sk: socket
2479 *      @size: memory size to allocate
2480 *      @amt: pages to allocate
2481 *      @kind: allocation type
2482 *
2483 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2484 */
2485int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2486{
2487        struct proto *prot = sk->sk_prot;
2488        long allocated = sk_memory_allocated_add(sk, amt);
2489        bool charged = true;
2490
2491        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2492            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2493                goto suppress_allocation;
2494
2495        /* Under limit. */
2496        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2497                sk_leave_memory_pressure(sk);
2498                return 1;
2499        }
2500
2501        /* Under pressure. */
2502        if (allocated > sk_prot_mem_limits(sk, 1))
2503                sk_enter_memory_pressure(sk);
2504
2505        /* Over hard limit. */
2506        if (allocated > sk_prot_mem_limits(sk, 2))
2507                goto suppress_allocation;
2508
2509        /* guarantee minimum buffer size under pressure */
2510        if (kind == SK_MEM_RECV) {
2511                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2512                        return 1;
2513
2514        } else { /* SK_MEM_SEND */
2515                int wmem0 = sk_get_wmem0(sk, prot);
2516
2517                if (sk->sk_type == SOCK_STREAM) {
2518                        if (sk->sk_wmem_queued < wmem0)
2519                                return 1;
2520                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2521                                return 1;
2522                }
2523        }
2524
2525        if (sk_has_memory_pressure(sk)) {
2526                u64 alloc;
2527
2528                if (!sk_under_memory_pressure(sk))
2529                        return 1;
2530                alloc = sk_sockets_allocated_read_positive(sk);
2531                if (sk_prot_mem_limits(sk, 2) > alloc *
2532                    sk_mem_pages(sk->sk_wmem_queued +
2533                                 atomic_read(&sk->sk_rmem_alloc) +
2534                                 sk->sk_forward_alloc))
2535                        return 1;
2536        }
2537
2538suppress_allocation:
2539
2540        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2541                sk_stream_moderate_sndbuf(sk);
2542
2543                /* Fail only if socket is _under_ its sndbuf.
2544                 * In this case we cannot block, so we have to fail.
2545                 */
2546                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2547                        return 1;
2548        }
2549
2550        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2551                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2552
2553        sk_memory_allocated_sub(sk, amt);
2554
2555        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2556                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2557
2558        return 0;
2559}
2560EXPORT_SYMBOL(__sk_mem_raise_allocated);
2561
2562/**
2563 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2564 *      @sk: socket
2565 *      @size: memory size to allocate
2566 *      @kind: allocation type
2567 *
2568 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2569 *      rmem allocation. This function assumes that protocols which have
2570 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2571 */
2572int __sk_mem_schedule(struct sock *sk, int size, int kind)
2573{
2574        int ret, amt = sk_mem_pages(size);
2575
2576        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2577        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2578        if (!ret)
2579                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2580        return ret;
2581}
2582EXPORT_SYMBOL(__sk_mem_schedule);
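/* Editor's sketch (not part of sock.c): protocols normally go through the
 * inline wrappers in <net/sock.h> -- sk_wmem_schedule()/sk_rmem_schedule()
 * consume sk_forward_alloc and only fall back to __sk_mem_schedule() when the
 * forward allocation runs out; sk_mem_charge() then records the usage.
 * "my_queue_for_tx" is hypothetical.
 */
#if 0
static int my_queue_for_tx(struct sock *sk, struct sk_buff *skb)
{
        /* Reserve forward-alloc quanta before queueing, charge afterwards. */
        if (!sk_wmem_schedule(sk, skb->truesize))
                return -ENOBUFS;

        sk_mem_charge(sk, skb->truesize);
        skb_queue_tail(&sk->sk_write_queue, skb);
        return 0;
}
#endif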
2583
2584/**
2585 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2586 *      @sk: socket
2587 *      @amount: number of quanta
2588 *
2589 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2590 */
2591void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2592{
2593        sk_memory_allocated_sub(sk, amount);
2594
2595        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2596                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2597
2598        if (sk_under_memory_pressure(sk) &&
2599            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2600                sk_leave_memory_pressure(sk);
2601}
2602EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2603
2604/**
2605 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2606 *      @sk: socket
2607 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2608 */
2609void __sk_mem_reclaim(struct sock *sk, int amount)
2610{
2611        amount >>= SK_MEM_QUANTUM_SHIFT;
2612        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2613        __sk_mem_reduce_allocated(sk, amount);
2614}
2615EXPORT_SYMBOL(__sk_mem_reclaim);
2616
2617int sk_set_peek_off(struct sock *sk, int val)
2618{
2619        sk->sk_peek_off = val;
2620        return 0;
2621}
2622EXPORT_SYMBOL_GPL(sk_set_peek_off);
2623
2624/*
2625 * Set of default routines for initialising struct proto_ops when
2626 * the protocol does not support a particular function. In certain
2627 * cases where it makes no sense for a protocol to have a "do nothing"
2628 * function, some default processing is provided.
2629 */
2630
2631int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2632{
2633        return -EOPNOTSUPP;
2634}
2635EXPORT_SYMBOL(sock_no_bind);
2636
2637int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2638                    int len, int flags)
2639{
2640        return -EOPNOTSUPP;
2641}
2642EXPORT_SYMBOL(sock_no_connect);
2643
2644int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2645{
2646        return -EOPNOTSUPP;
2647}
2648EXPORT_SYMBOL(sock_no_socketpair);
2649
2650int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2651                   bool kern)
2652{
2653        return -EOPNOTSUPP;
2654}
2655EXPORT_SYMBOL(sock_no_accept);
2656
2657int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2658                    int peer)
2659{
2660        return -EOPNOTSUPP;
2661}
2662EXPORT_SYMBOL(sock_no_getname);
2663
2664int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2665{
2666        return -EOPNOTSUPP;
2667}
2668EXPORT_SYMBOL(sock_no_ioctl);
2669
2670int sock_no_listen(struct socket *sock, int backlog)
2671{
2672        return -EOPNOTSUPP;
2673}
2674EXPORT_SYMBOL(sock_no_listen);
2675
2676int sock_no_shutdown(struct socket *sock, int how)
2677{
2678        return -EOPNOTSUPP;
2679}
2680EXPORT_SYMBOL(sock_no_shutdown);
2681
2682int sock_no_setsockopt(struct socket *sock, int level, int optname,
2683                    char __user *optval, unsigned int optlen)
2684{
2685        return -EOPNOTSUPP;
2686}
2687EXPORT_SYMBOL(sock_no_setsockopt);
2688
2689int sock_no_getsockopt(struct socket *sock, int level, int optname,
2690                    char __user *optval, int __user *optlen)
2691{
2692        return -EOPNOTSUPP;
2693}
2694EXPORT_SYMBOL(sock_no_getsockopt);
2695
2696int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2697{
2698        return -EOPNOTSUPP;
2699}
2700EXPORT_SYMBOL(sock_no_sendmsg);
2701
2702int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2703{
2704        return -EOPNOTSUPP;
2705}
2706EXPORT_SYMBOL(sock_no_sendmsg_locked);
2707
2708int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2709                    int flags)
2710{
2711        return -EOPNOTSUPP;
2712}
2713EXPORT_SYMBOL(sock_no_recvmsg);
2714
2715int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2716{
2717        /* Mirror missing mmap method error code */
2718        return -ENODEV;
2719}
2720EXPORT_SYMBOL(sock_no_mmap);
2721
2722ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2723{
2724        ssize_t res;
2725        struct msghdr msg = {.msg_flags = flags};
2726        struct kvec iov;
2727        char *kaddr = kmap(page);
2728        iov.iov_base = kaddr + offset;
2729        iov.iov_len = size;
2730        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2731        kunmap(page);
2732        return res;
2733}
2734EXPORT_SYMBOL(sock_no_sendpage);
2735
2736ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2737                                int offset, size_t size, int flags)
2738{
2739        ssize_t res;
2740        struct msghdr msg = {.msg_flags = flags};
2741        struct kvec iov;
2742        char *kaddr = kmap(page);
2743
2744        iov.iov_base = kaddr + offset;
2745        iov.iov_len = size;
2746        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2747        kunmap(page);
2748        return res;
2749}
2750EXPORT_SYMBOL(sock_no_sendpage_locked);
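/* Editor's sketch (not part of sock.c): the sock_no_*() helpers above are
 * meant to fill the slots of a struct proto_ops that a protocol does not
 * support.  "my_dgram_ops" and the my_*() handlers are hypothetical; packet_ops
 * in af_packet.c is a real example that wires up several of these stubs.
 */
#if 0
static const struct proto_ops my_dgram_ops = {
        .family         = PF_INET,
        .owner          = THIS_MODULE,
        .release        = my_release,           /* protocol-specific */
        .bind           = sock_no_bind,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = my_getname,           /* protocol-specific */
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .sendmsg        = my_sendmsg,           /* protocol-specific */
        .recvmsg        = my_recvmsg,           /* protocol-specific */
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};
#endif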
2751
2752/*
2753 *      Default Socket Callbacks
2754 */
2755
2756static void sock_def_wakeup(struct sock *sk)
2757{
2758        struct socket_wq *wq;
2759
2760        rcu_read_lock();
2761        wq = rcu_dereference(sk->sk_wq);
2762        if (skwq_has_sleeper(wq))
2763                wake_up_interruptible_all(&wq->wait);
2764        rcu_read_unlock();
2765}
2766
2767static void sock_def_error_report(struct sock *sk)
2768{
2769        struct socket_wq *wq;
2770
2771        rcu_read_lock();
2772        wq = rcu_dereference(sk->sk_wq);
2773        if (skwq_has_sleeper(wq))
2774                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2775        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2776        rcu_read_unlock();
2777}
2778
2779static void sock_def_readable(struct sock *sk)
2780{
2781        struct socket_wq *wq;
2782
2783        rcu_read_lock();
2784        wq = rcu_dereference(sk->sk_wq);
2785        if (skwq_has_sleeper(wq))
2786                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2787                                                EPOLLRDNORM | EPOLLRDBAND);
2788        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2789        rcu_read_unlock();
2790}
2791
2792static void sock_def_write_space(struct sock *sk)
2793{
2794        struct socket_wq *wq;
2795
2796        rcu_read_lock();
2797
2798        /* Do not wake up a writer until he can make "significant"
2799         * progress.  --DaveM
2800         */
2801        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2802                wq = rcu_dereference(sk->sk_wq);
2803                if (skwq_has_sleeper(wq))
2804                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2805                                                EPOLLWRNORM | EPOLLWRBAND);
2806
2807                /* Should agree with poll, otherwise some programs break */
2808                if (sock_writeable(sk))
2809                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2810        }
2811
2812        rcu_read_unlock();
2813}
2814
2815static void sock_def_destruct(struct sock *sk)
2816{
2817}
2818
2819void sk_send_sigurg(struct sock *sk)
2820{
2821        if (sk->sk_socket && sk->sk_socket->file)
2822                if (send_sigurg(&sk->sk_socket->file->f_owner))
2823                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2824}
2825EXPORT_SYMBOL(sk_send_sigurg);
2826
2827void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2828                    unsigned long expires)
2829{
2830        if (!mod_timer(timer, expires))
2831                sock_hold(sk);
2832}
2833EXPORT_SYMBOL(sk_reset_timer);
2834
2835void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2836{
2837        if (del_timer(timer))
2838                __sock_put(sk);
2839}
2840EXPORT_SYMBOL(sk_stop_timer);
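/* Editor's sketch (not part of sock.c): sk_reset_timer() takes a sock
 * reference when it newly arms the timer and sk_stop_timer() drops it when a
 * pending timer is cancelled, so a handler that runs to completion must drop
 * the reference itself with sock_put().  "my_timer_handler" is hypothetical;
 * the TCP keepalive timer follows this pattern.
 */
#if 0
static void my_timer_handler(struct timer_list *t)
{
        struct sock *sk = from_timer(sk, t, sk_timer);

        bh_lock_sock(sk);
        /* ... protocol work; possibly sk_reset_timer(sk, &sk->sk_timer, ...) ... */
        bh_unlock_sock(sk);
        sock_put(sk);                   /* pairs with sk_reset_timer()'s hold */
}
#endif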
2841
2842void sock_init_data(struct socket *sock, struct sock *sk)
2843{
2844        sk_init_common(sk);
2845        sk->sk_send_head        =       NULL;
2846
2847        timer_setup(&sk->sk_timer, NULL, 0);
2848
2849        sk->sk_allocation       =       GFP_KERNEL;
2850        sk->sk_rcvbuf           =       sysctl_rmem_default;
2851        sk->sk_sndbuf           =       sysctl_wmem_default;
2852        sk->sk_state            =       TCP_CLOSE;
2853        sk_set_socket(sk, sock);
2854
2855        sock_set_flag(sk, SOCK_ZAPPED);
2856
2857        if (sock) {
2858                sk->sk_type     =       sock->type;
2859                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2860                sock->sk        =       sk;
2861                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2862        } else {
2863                RCU_INIT_POINTER(sk->sk_wq, NULL);
2864                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2865        }
2866
2867        rwlock_init(&sk->sk_callback_lock);
2868        if (sk->sk_kern_sock)
2869                lockdep_set_class_and_name(
2870                        &sk->sk_callback_lock,
2871                        af_kern_callback_keys + sk->sk_family,
2872                        af_family_kern_clock_key_strings[sk->sk_family]);
2873        else
2874                lockdep_set_class_and_name(
2875                        &sk->sk_callback_lock,
2876                        af_callback_keys + sk->sk_family,
2877                        af_family_clock_key_strings[sk->sk_family]);
2878
2879        sk->sk_state_change     =       sock_def_wakeup;
2880        sk->sk_data_ready       =       sock_def_readable;
2881        sk->sk_write_space      =       sock_def_write_space;
2882        sk->sk_error_report     =       sock_def_error_report;
2883        sk->sk_destruct         =       sock_def_destruct;
2884
2885        sk->sk_frag.page        =       NULL;
2886        sk->sk_frag.offset      =       0;
2887        sk->sk_peek_off         =       -1;
2888
2889        sk->sk_peer_pid         =       NULL;
2890        sk->sk_peer_cred        =       NULL;
2891        sk->sk_write_pending    =       0;
2892        sk->sk_rcvlowat         =       1;
2893        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2894        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2895
2896        sk->sk_stamp = SK_DEFAULT_STAMP;
2897#if BITS_PER_LONG==32
2898        seqlock_init(&sk->sk_stamp_seq);
2899#endif
2900        atomic_set(&sk->sk_zckey, 0);
2901
2902#ifdef CONFIG_NET_RX_BUSY_POLL
2903        sk->sk_napi_id          =       0;
2904        sk->sk_ll_usec          =       sysctl_net_busy_read;
2905#endif
2906
2907        sk->sk_max_pacing_rate = ~0UL;
2908        sk->sk_pacing_rate = ~0UL;
2909        sk->sk_pacing_shift = 10;
2910        sk->sk_incoming_cpu = -1;
2911
2912        sk_rx_queue_clear(sk);
2913        /*
2914         * Before updating sk_refcnt, we must commit prior changes to memory
2915         * (Documentation/RCU/rculist_nulls.txt for details)
2916         */
2917        smp_wmb();
2918        refcount_set(&sk->sk_refcnt, 1);
2919        atomic_set(&sk->sk_drops, 0);
2920}
2921EXPORT_SYMBOL(sock_init_data);
2922
2923void lock_sock_nested(struct sock *sk, int subclass)
2924{
2925        might_sleep();
2926        spin_lock_bh(&sk->sk_lock.slock);
2927        if (sk->sk_lock.owned)
2928                __lock_sock(sk);
2929        sk->sk_lock.owned = 1;
2930        spin_unlock(&sk->sk_lock.slock);
2931        /*
2932         * The sk_lock has mutex_lock() semantics here:
2933         */
2934        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2935        local_bh_enable();
2936}
2937EXPORT_SYMBOL(lock_sock_nested);
2938
2939void release_sock(struct sock *sk)
2940{
2941        spin_lock_bh(&sk->sk_lock.slock);
2942        if (sk->sk_backlog.tail)
2943                __release_sock(sk);
2944
2945        /* Warning: release_cb() might need to release sk ownership,
2946         * i.e. call sock_release_ownership(sk) before us.
2947         */
2948        if (sk->sk_prot->release_cb)
2949                sk->sk_prot->release_cb(sk);
2950
2951        sock_release_ownership(sk);
2952        if (waitqueue_active(&sk->sk_lock.wq))
2953                wake_up(&sk->sk_lock.wq);
2954        spin_unlock_bh(&sk->sk_lock.slock);
2955}
2956EXPORT_SYMBOL(release_sock);
2957
2958/**
2959 * lock_sock_fast - fast version of lock_sock
2960 * @sk: socket
2961 *
2962 * This version should be used for very small sections, where the process won't block.
2963 * Returns false if the fast path is taken:
2964 *
2965 *   sk_lock.slock locked, owned = 0, BH disabled
2966 *
2967 * Returns true if the slow path is taken:
2968 *
2969 *   sk_lock.slock unlocked, owned = 1, BH enabled
2970 */
2971bool lock_sock_fast(struct sock *sk)
2972{
2973        might_sleep();
2974        spin_lock_bh(&sk->sk_lock.slock);
2975
2976        if (!sk->sk_lock.owned)
2977                /*
2978                 * Note : We must disable BH
2979                 */
2980                return false;
2981
2982        __lock_sock(sk);
2983        sk->sk_lock.owned = 1;
2984        spin_unlock(&sk->sk_lock.slock);
2985        /*
2986         * The sk_lock has mutex_lock() semantics here:
2987         */
2988        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2989        local_bh_enable();
2990        return true;
2991}
2992EXPORT_SYMBOL(lock_sock_fast);
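/* Editor's sketch (not part of sock.c): the return value of lock_sock_fast()
 * must be handed back to unlock_sock_fast() so the matching unlock
 * (spin_unlock_bh vs. release_sock) is used.  "my_peek_queue_len" is
 * hypothetical.
 */
#if 0
static int my_peek_queue_len(struct sock *sk)
{
        bool slow = lock_sock_fast(sk);
        int len = skb_queue_len(&sk->sk_receive_queue);

        unlock_sock_fast(sk, slow);
        return len;
}
#endif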
2993
2994int sock_gettstamp(struct socket *sock, void __user *userstamp,
2995                   bool timeval, bool time32)
2996{
2997        struct sock *sk = sock->sk;
2998        struct timespec64 ts;
2999
3000        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3001        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3002        if (ts.tv_sec == -1)
3003                return -ENOENT;
3004        if (ts.tv_sec == 0) {
3005                ktime_t kt = ktime_get_real();
3006                sock_write_timestamp(sk, kt);
3007                ts = ktime_to_timespec64(kt);
3008        }
3009
3010        if (timeval)
3011                ts.tv_nsec /= 1000;
3012
3013#ifdef CONFIG_COMPAT_32BIT_TIME
3014        if (time32)
3015                return put_old_timespec32(&ts, userstamp);
3016#endif
3017#ifdef CONFIG_SPARC64
3018        /* beware of padding in sparc64 timeval */
3019        if (timeval && !in_compat_syscall()) {
3020                struct __kernel_old_timeval __user tv = {
3021                        .tv_sec = ts.tv_sec,
3022                        .tv_usec = ts.tv_nsec,
3023                };
3024                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3025                        return -EFAULT;
3026                return 0;
3027        }
3028#endif
3029        return put_timespec64(&ts, userstamp);
3030}
3031EXPORT_SYMBOL(sock_gettstamp);
3032
3033void sock_enable_timestamp(struct sock *sk, int flag)
3034{
3035        if (!sock_flag(sk, flag)) {
3036                unsigned long previous_flags = sk->sk_flags;
3037
3038                sock_set_flag(sk, flag);
3039                /*
3040                 * we just set one of the two flags which require net
3041                 * time stamping, but time stamping might have been on
3042                 * already because of the other one
3043                 */
3044                if (sock_needs_netstamp(sk) &&
3045                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3046                        net_enable_timestamp();
3047        }
3048}
3049
3050int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3051                       int level, int type)
3052{
3053        struct sock_exterr_skb *serr;
3054        struct sk_buff *skb;
3055        int copied, err;
3056
3057        err = -EAGAIN;
3058        skb = sock_dequeue_err_skb(sk);
3059        if (skb == NULL)
3060                goto out;
3061
3062        copied = skb->len;
3063        if (copied > len) {
3064                msg->msg_flags |= MSG_TRUNC;
3065                copied = len;
3066        }
3067        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3068        if (err)
3069                goto out_free_skb;
3070
3071        sock_recv_timestamp(msg, sk, skb);
3072
3073        serr = SKB_EXT_ERR(skb);
3074        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3075
3076        msg->msg_flags |= MSG_ERRQUEUE;
3077        err = copied;
3078
3079out_free_skb:
3080        kfree_skb(skb);
3081out:
3082        return err;
3083}
3084EXPORT_SYMBOL(sock_recv_errqueue);
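/* Editor's sketch (not part of sock.c): a recvmsg() handler dispatches
 * MSG_ERRQUEUE requests to sock_recv_errqueue() with its own cmsg level/type;
 * packet_recvmsg() does this with SOL_PACKET/PACKET_TX_TIMESTAMP for TX
 * timestamps.  "my_recvmsg" is hypothetical.
 */
#if 0
static int my_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                      int flags)
{
        struct sock *sk = sock->sk;

        if (flags & MSG_ERRQUEUE)
                return sock_recv_errqueue(sk, msg, len,
                                          SOL_PACKET, PACKET_TX_TIMESTAMP);

        /* ... normal data path ... */
        return 0;
}
#endif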
3085
3086/*
3087 *      Get a socket option on a socket.
3088 *
3089 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3090 *      asynchronous errors should be reported by getsockopt. We assume
3091 *      this means if you specify SO_ERROR (otherwise what's the point of it).
3092 */
3093int sock_common_getsockopt(struct socket *sock, int level, int optname,
3094                           char __user *optval, int __user *optlen)
3095{
3096        struct sock *sk = sock->sk;
3097
3098        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3099}
3100EXPORT_SYMBOL(sock_common_getsockopt);
3101
3102#ifdef CONFIG_COMPAT
3103int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3104                                  char __user *optval, int __user *optlen)
3105{
3106        struct sock *sk = sock->sk;
3107
3108        if (sk->sk_prot->compat_getsockopt != NULL)
3109                return sk->sk_prot->compat_getsockopt(sk, level, optname,
3110                                                      optval, optlen);
3111        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3112}
3113EXPORT_SYMBOL(compat_sock_common_getsockopt);
3114#endif
3115
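/*
 * Generic recvmsg: call the protocol's ->recvmsg() and, on success, report
 * the source address length back through msg->msg_namelen.
 */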
3116int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3117                        int flags)
3118{
3119        struct sock *sk = sock->sk;
3120        int addr_len = 0;
3121        int err;
3122
3123        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3124                                   flags & ~MSG_DONTWAIT, &addr_len);
3125        if (err >= 0)
3126                msg->msg_namelen = addr_len;
3127        return err;
3128}
3129EXPORT_SYMBOL(sock_common_recvmsg);
3130
3131/*
3132 *      Set socket options on a socket.
3133 */
3134int sock_common_setsockopt(struct socket *sock, int level, int optname,
3135                           char __user *optval, unsigned int optlen)
3136{
3137        struct sock *sk = sock->sk;
3138
3139        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3140}
3141EXPORT_SYMBOL(sock_common_setsockopt);
3142
3143#ifdef CONFIG_COMPAT
3144int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3145                                  char __user *optval, unsigned int optlen)
3146{
3147        struct sock *sk = sock->sk;
3148
3149        if (sk->sk_prot->compat_setsockopt != NULL)
3150                return sk->sk_prot->compat_setsockopt(sk, level, optname,
3151                                                      optval, optlen);
3152        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3153}
3154EXPORT_SYMBOL(compat_sock_common_setsockopt);
3155#endif
3156
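/*
 * Common release path for protocols: run the protocol's destructor, unhash
 * the socket, detach it from its struct socket and drop the caller's
 * reference.
 */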
3157void sk_common_release(struct sock *sk)
3158{
3159        if (sk->sk_prot->destroy)
3160                sk->sk_prot->destroy(sk);
3161
3162        /*
3163         * Observation: when sk_common_release is called, processes have
3164         * no access to the socket, but the network stack still does.
3165         * Step one, detach it from networking:
3166         *
3167         * A. Remove it from the hash tables.
3168         */
3169
3170        sk->sk_prot->unhash(sk);
3171
3172        /*
3173         * At this point the socket cannot receive new packets, but it is
3174         * possible that some packets are still in flight because another CPU
3175         * did the hash table lookup before we unhashed the socket. They will
3176         * reach the receive queue and be purged by the socket destructor.
3177         *
3178         * We may also still have packets pending on the receive queue and
3179         * probably our own packets waiting in device queues. sock_destroy will
3180         * drain the receive queue, but transmitted packets will delay socket
3181         * destruction until the last reference is released.
3182         */
3183
3184        sock_orphan(sk);
3185
3186        xfrm_sk_free_policy(sk);
3187
3188        sk_refcnt_debug_release(sk);
3189
3190        sock_put(sk);
3191}
3192EXPORT_SYMBOL(sk_common_release);
3193
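/*
 * Snapshot the SK_MEMINFO_* memory counters of a socket, as exposed to
 * userspace by the sock_diag interface.
 */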
3194void sk_get_meminfo(const struct sock *sk, u32 *mem)
3195{
3196        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3197
3198        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3199        mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3200        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3201        mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3202        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3203        mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3204        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3205        mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3206        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3207}
3208
3209#ifdef CONFIG_PROC_FS
3210#define PROTO_INUSE_NR  64      /* should be enough for the first time */
3211struct prot_inuse {
3212        int val[PROTO_INUSE_NR];
3213};
3214
3215static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3216
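/*
 * Per-netns, per-cpu accounting of sockets in use by each protocol.  Every
 * registered proto gets an index into the prot_inuse array (assigned in
 * assign_proto_idx() below); readers sum the per-cpu counters over all
 * possible CPUs.
 */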
3217void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3218{
3219        __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3220}
3221EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3222
3223int sock_prot_inuse_get(struct net *net, struct proto *prot)
3224{
3225        int cpu, idx = prot->inuse_idx;
3226        int res = 0;
3227
3228        for_each_possible_cpu(cpu)
3229                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3230
3231        return res >= 0 ? res : 0;
3232}
3233EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3234
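/*
 * Per-netns count of all sockets in use, independent of protocol; summed
 * over all possible CPUs by sock_inuse_get() (reported, for example, by
 * /proc/net/sockstat).
 */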
3235static void sock_inuse_add(struct net *net, int val)
3236{
3237        this_cpu_add(*net->core.sock_inuse, val);
3238}
3239
3240int sock_inuse_get(struct net *net)
3241{
3242        int cpu, res = 0;
3243
3244        for_each_possible_cpu(cpu)
3245                res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3246
3247        return res;
3248}
3249
3250EXPORT_SYMBOL_GPL(sock_inuse_get);
3251
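/*
 * Per-netns allocation and freeing of the counters above; registered as a
 * pernet subsystem from a core_initcall (net_inuse_init() below) so the
 * counters are available early in the life of every namespace.
 */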
3252static int __net_init sock_inuse_init_net(struct net *net)
3253{
3254        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3255        if (net->core.prot_inuse == NULL)
3256                return -ENOMEM;
3257
3258        net->core.sock_inuse = alloc_percpu(int);
3259        if (net->core.sock_inuse == NULL)
3260                goto out;
3261
3262        return 0;
3263
3264out:
3265        free_percpu(net->core.prot_inuse);
3266        return -ENOMEM;
3267}
3268
3269static void __net_exit sock_inuse_exit_net(struct net *net)
3270{
3271        free_percpu(net->core.prot_inuse);
3272        free_percpu(net->core.sock_inuse);
3273}
3274
3275static struct pernet_operations net_inuse_ops = {
3276        .init = sock_inuse_init_net,
3277        .exit = sock_inuse_exit_net,
3278};
3279
3280static __init int net_inuse_init(void)
3281{
3282        if (register_pernet_subsys(&net_inuse_ops))
3283                panic("Cannot initialize net inuse counters");
3284
3285        return 0;
3286}
3287
3288core_initcall(net_inuse_init);
3289
3290static int assign_proto_idx(struct proto *prot)
3291{
3292        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3293
3294        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3295                pr_err("PROTO_INUSE_NR exhausted\n");
3296                return -ENOSPC;
3297        }
3298
3299        set_bit(prot->inuse_idx, proto_inuse_idx);
3300        return 0;
3301}
3302
3303static void release_proto_idx(struct proto *prot)
3304{
3305        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3306                clear_bit(prot->inuse_idx, proto_inuse_idx);
3307}
3308#else
3309static inline int assign_proto_idx(struct proto *prot)
3310{
3311        return 0;
3312}
3313
3314static inline void release_proto_idx(struct proto *prot)
3315{
3316}
3317
3318static void sock_inuse_add(struct net *net, int val)
3319{
3320}
3321#endif
3322
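/*
 * Helpers for a protocol's optional request_sock slab cache (used by
 * connection-oriented protocols for embryonic sockets, e.g. TCP request
 * minisockets): created from proto_register() via req_prot_init() and
 * destroyed again via req_prot_cleanup().
 */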
3323static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3324{
3325        if (!rsk_prot)
3326                return;
3327        kfree(rsk_prot->slab_name);
3328        rsk_prot->slab_name = NULL;
3329        kmem_cache_destroy(rsk_prot->slab);
3330        rsk_prot->slab = NULL;
3331}
3332
3333static int req_prot_init(const struct proto *prot)
3334{
3335        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3336
3337        if (!rsk_prot)
3338                return 0;
3339
3340        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3341                                        prot->name);
3342        if (!rsk_prot->slab_name)
3343                return -ENOMEM;
3344
3345        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3346                                           rsk_prot->obj_size, 0,
3347                                           SLAB_ACCOUNT | prot->slab_flags,
3348                                           NULL);
3349
3350        if (!rsk_prot->slab) {
3351                pr_crit("%s: Can't create request sock SLAB cache!\n",
3352                        prot->name);
3353                return -ENOMEM;
3354        }
3355        return 0;
3356}
3357
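/*
 * Register a protocol with the networking core.  When alloc_slab is set,
 * slab caches are created for the protocol's socket objects and, if the
 * protocol provides them, for its request_sock and timewait_sock objects;
 * the protocol is then assigned an inuse index and added to proto_list.
 * A typical caller (illustrative sketch only, my_prot is hypothetical):
 *
 *	err = proto_register(&my_prot, 1);
 *	if (err)
 *		return err;
 *	...
 *	proto_unregister(&my_prot);
 */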
3358int proto_register(struct proto *prot, int alloc_slab)
3359{
3360        int ret = -ENOBUFS;
3361
3362        if (alloc_slab) {
3363                prot->slab = kmem_cache_create_usercopy(prot->name,
3364                                        prot->obj_size, 0,
3365                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3366                                        prot->slab_flags,
3367                                        prot->useroffset, prot->usersize,
3368                                        NULL);
3369
3370                if (prot->slab == NULL) {
3371                        pr_crit("%s: Can't create sock SLAB cache!\n",
3372                                prot->name);
3373                        goto out;
3374                }
3375
3376                if (req_prot_init(prot))
3377                        goto out_free_request_sock_slab;
3378
3379                if (prot->twsk_prot != NULL) {
3380                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3381
3382                        if (prot->twsk_prot->twsk_slab_name == NULL)
3383                                goto out_free_request_sock_slab;
3384
3385                        prot->twsk_prot->twsk_slab =
3386                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3387                                                  prot->twsk_prot->twsk_obj_size,
3388                                                  0,
3389                                                  SLAB_ACCOUNT |
3390                                                  prot->slab_flags,
3391                                                  NULL);
3392                        if (prot->twsk_prot->twsk_slab == NULL)
3393                                goto out_free_timewait_sock_slab_name;
3394                }
3395        }
3396
3397        mutex_lock(&proto_list_mutex);
3398        ret = assign_proto_idx(prot);
3399        if (ret) {
3400                mutex_unlock(&proto_list_mutex);
3401                goto out_free_timewait_sock_slab_name;
3402        }
3403        list_add(&prot->node, &proto_list);
3404        mutex_unlock(&proto_list_mutex);
3405        return ret;
3406
3407out_free_timewait_sock_slab_name:
3408        if (alloc_slab && prot->twsk_prot)
3409                kfree(prot->twsk_prot->twsk_slab_name);
3410out_free_request_sock_slab:
3411        if (alloc_slab) {
3412                req_prot_cleanup(prot->rsk_prot);
3413
3414                kmem_cache_destroy(prot->slab);
3415                prot->slab = NULL;
3416        }
3417out:
3418        return ret;
3419}
3420EXPORT_SYMBOL(proto_register);
3421
3422void proto_unregister(struct proto *prot)
3423{
3424        mutex_lock(&proto_list_mutex);
3425        release_proto_idx(prot);
3426        list_del(&prot->node);
3427        mutex_unlock(&proto_list_mutex);
3428
3429        kmem_cache_destroy(prot->slab);
3430        prot->slab = NULL;
3431
3432        req_prot_cleanup(prot->rsk_prot);
3433
3434        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3435                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3436                kfree(prot->twsk_prot->twsk_slab_name);
3437                prot->twsk_prot->twsk_slab = NULL;
3438        }
3439}
3440EXPORT_SYMBOL(proto_unregister);
3441
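/*
 * Ask for the sock_diag handler module for the given family (and, when
 * non-zero, protocol) to be loaded via netlink module aliases, after
 * checking that the family (and, for AF_INET, the protocol) is actually
 * registered; returns -ENOENT otherwise.
 */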
3442int sock_load_diag_module(int family, int protocol)
3443{
3444        if (!protocol) {
3445                if (!sock_is_registered(family))
3446                        return -ENOENT;
3447
3448                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3449                                      NETLINK_SOCK_DIAG, family);
3450        }
3451
3452#ifdef CONFIG_INET
3453        if (family == AF_INET &&
3454            protocol != IPPROTO_RAW &&
3455            !rcu_access_pointer(inet_protos[protocol]))
3456                return -ENOENT;
3457#endif
3458
3459        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3460                              NETLINK_SOCK_DIAG, family, protocol);
3461}
3462EXPORT_SYMBOL(sock_load_diag_module);
3463
3464#ifdef CONFIG_PROC_FS
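/* seq_file implementation behind /proc/net/protocols */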
3465static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3466        __acquires(proto_list_mutex)
3467{
3468        mutex_lock(&proto_list_mutex);
3469        return seq_list_start_head(&proto_list, *pos);
3470}
3471
3472static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3473{
3474        return seq_list_next(v, &proto_list, pos);
3475}
3476
3477static void proto_seq_stop(struct seq_file *seq, void *v)
3478        __releases(proto_list_mutex)
3479{
3480        mutex_unlock(&proto_list_mutex);
3481}
3482
3483static char proto_method_implemented(const void *method)
3484{
3485        return method == NULL ? 'n' : 'y';
3486}
3487static long sock_prot_memory_allocated(struct proto *proto)
3488{
3489        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3490}
3491
3492static char *sock_prot_memory_pressure(struct proto *proto)
3493{
3494        return proto->memory_pressure != NULL ?
3495        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3496}
3497
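/*
 * Print one /proc/net/protocols row.  The trailing single-character columns
 * are 'y'/'n' flags indicating whether the protocol implements the
 * corresponding struct proto method (the column names are printed by
 * proto_seq_show() in the header line).
 */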
3498static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3499{
3500
3501        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3502                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3503                   proto->name,
3504                   proto->obj_size,
3505                   sock_prot_inuse_get(seq_file_net(seq), proto),
3506                   sock_prot_memory_allocated(proto),
3507                   sock_prot_memory_pressure(proto),
3508                   proto->max_header,
3509                   proto->slab == NULL ? "no" : "yes",
3510                   module_name(proto->owner),
3511                   proto_method_implemented(proto->close),
3512                   proto_method_implemented(proto->connect),
3513                   proto_method_implemented(proto->disconnect),
3514                   proto_method_implemented(proto->accept),
3515                   proto_method_implemented(proto->ioctl),
3516                   proto_method_implemented(proto->init),
3517                   proto_method_implemented(proto->destroy),
3518                   proto_method_implemented(proto->shutdown),
3519                   proto_method_implemented(proto->setsockopt),
3520                   proto_method_implemented(proto->getsockopt),
3521                   proto_method_implemented(proto->sendmsg),
3522                   proto_method_implemented(proto->recvmsg),
3523                   proto_method_implemented(proto->sendpage),
3524                   proto_method_implemented(proto->bind),
3525                   proto_method_implemented(proto->backlog_rcv),
3526                   proto_method_implemented(proto->hash),
3527                   proto_method_implemented(proto->unhash),
3528                   proto_method_implemented(proto->get_port),
3529                   proto_method_implemented(proto->enter_memory_pressure));
3530}
3531
3532static int proto_seq_show(struct seq_file *seq, void *v)
3533{
3534        if (v == &proto_list)
3535                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3536                           "protocol",
3537                           "size",
3538                           "sockets",
3539                           "memory",
3540                           "press",
3541                           "maxhdr",
3542                           "slab",
3543                           "module",
3544                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3545        else
3546                proto_seq_printf(seq, list_entry(v, struct proto, node));
3547        return 0;
3548}
3549
3550static const struct seq_operations proto_seq_ops = {
3551        .start  = proto_seq_start,
3552        .next   = proto_seq_next,
3553        .stop   = proto_seq_stop,
3554        .show   = proto_seq_show,
3555};
3556
3557static __net_init int proto_init_net(struct net *net)
3558{
3559        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3560                        sizeof(struct seq_net_private)))
3561                return -ENOMEM;
3562
3563        return 0;
3564}
3565
3566static __net_exit void proto_exit_net(struct net *net)
3567{
3568        remove_proc_entry("protocols", net->proc_net);
3569}
3570
3571
3572static __net_initdata struct pernet_operations proto_net_ops = {
3573        .init = proto_init_net,
3574        .exit = proto_exit_net,
3575};
3576
3577static int __init proto_init(void)
3578{
3579        return register_pernet_subsys(&proto_net_ops);
3580}
3581
3582subsys_initcall(proto_init);
3583
3584#endif /* PROC_FS */
3585
3586#ifdef CONFIG_NET_RX_BUSY_POLL
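/*
 * Busy-poll loop termination check: stop polling once data is queued on
 * the receive queue or the socket's busy-poll timeout has elapsed.
 */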
3587bool sk_busy_loop_end(void *p, unsigned long start_time)
3588{
3589        struct sock *sk = p;
3590
3591        return !skb_queue_empty(&sk->sk_receive_queue) ||
3592               sk_busy_loop_timeout(sk, start_time);
3593}
3594EXPORT_SYMBOL(sk_busy_loop_end);
3595#endif /* CONFIG_NET_RX_BUSY_POLL */
3596