linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/timer.h>
 106#include <linux/string.h>
 107#include <linux/sockios.h>
 108#include <linux/net.h>
 109#include <linux/mm.h>
 110#include <linux/slab.h>
 111#include <linux/interrupt.h>
 112#include <linux/poll.h>
 113#include <linux/tcp.h>
 114#include <linux/init.h>
 115#include <linux/highmem.h>
 116#include <linux/user_namespace.h>
 117#include <linux/static_key.h>
 118#include <linux/memcontrol.h>
 119#include <linux/prefetch.h>
 120
 121#include <linux/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134#include <linux/sock_diag.h>
 135
 136#include <linux/filter.h>
 137#include <net/sock_reuseport.h>
 138
 139#include <trace/events/sock.h>
 140
 141#ifdef CONFIG_INET
 142#include <net/tcp.h>
 143#endif
 144
 145#include <net/busy_poll.h>
 146
 147static DEFINE_MUTEX(proto_list_mutex);
 148static LIST_HEAD(proto_list);
 149
 150/**
 151 * sk_ns_capable - General socket capability test
 152 * @sk: Socket to use a capability on or through
 153 * @user_ns: The user namespace of the capability to use
 154 * @cap: The capability to use
 155 *
  156 * Test to see if the opener of the socket had the capability @cap
  157 * when the socket was created and the current process has the
  158 * capability @cap in the user namespace @user_ns.
 159 */
 160bool sk_ns_capable(const struct sock *sk,
 161                   struct user_namespace *user_ns, int cap)
 162{
 163        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                ns_capable(user_ns, cap);
 165}
 166EXPORT_SYMBOL(sk_ns_capable);
 167
 168/**
 169 * sk_capable - Socket global capability test
 170 * @sk: Socket to use a capability on or through
 171 * @cap: The global capability to use
 172 *
  173 * Test to see if the opener of the socket had the capability @cap
  174 * when the socket was created and the current process has the
  175 * capability @cap in all user namespaces.
 176 */
 177bool sk_capable(const struct sock *sk, int cap)
 178{
 179        return sk_ns_capable(sk, &init_user_ns, cap);
 180}
 181EXPORT_SYMBOL(sk_capable);
 182
 183/**
 184 * sk_net_capable - Network namespace socket capability test
 185 * @sk: Socket to use a capability on or through
 186 * @cap: The capability to use
 187 *
  188 * Test to see if the opener of the socket had the capability @cap when the
  189 * socket was created and the current process has the capability @cap over
  190 * the network namespace the socket is a member of.
 191 */
 192bool sk_net_capable(const struct sock *sk, int cap)
 193{
 194        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195}
 196EXPORT_SYMBOL(sk_net_capable);
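     /* Illustrative use of the three helpers above (a sketch, not a quote from
      * elsewhere in the tree): a protocol handler gating a privileged
      * operation on the socket's network namespace could do
      *
      *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
      *		return -EPERM;
      *
      * sk_capable() performs the same test against the initial user
      * namespace, and sk_ns_capable() lets the caller name the namespace
      * explicitly.
      */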
 197
 198/*
 199 * Each address family might have different locking rules, so we have
 200 * one slock key per address family:
 201 */
 202static struct lock_class_key af_family_keys[AF_MAX];
 203static struct lock_class_key af_family_slock_keys[AF_MAX];
 204
 205/*
 206 * Make lock validator output more readable. (we pre-construct these
 207 * strings build-time, so that runtime initialization of socket
 208 * locks is fast):
 209 */
 210static const char *const af_family_key_strings[AF_MAX+1] = {
 211  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 212  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 213  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 214  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 215  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 216  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 217  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 218  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 219  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 220  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  221  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
 222  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 223  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 224  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 225  "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
 226};
 227static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 228  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 229  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 230  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 231  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 232  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 233  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 234  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 235  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 236  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 237  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 238  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 239  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 240  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  241  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_KCM"      ,
 242  "slock-AF_QIPCRTR", "slock-AF_MAX"
 243};
 244static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 245  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 246  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 247  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 248  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 249  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 250  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 251  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 252  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 253  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 254  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 255  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 256  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 257  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 258  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 259  "clock-AF_QIPCRTR", "clock-AF_MAX"
 260};
 261
 262/*
 263 * sk_callback_lock locking rules are per-address-family,
 264 * so split the lock classes by using a per-AF key:
 265 */
 266static struct lock_class_key af_callback_keys[AF_MAX];
 267
 268/* Take into consideration the size of the struct sk_buff overhead in the
 269 * determination of these values, since that is non-constant across
 270 * platforms.  This makes socket queueing behavior and performance
 271 * not depend upon such differences.
 272 */
 273#define _SK_MEM_PACKETS         256
 274#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 275#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 276#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 277
 278/* Run time adjustable parameters. */
 279__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280EXPORT_SYMBOL(sysctl_wmem_max);
 281__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282EXPORT_SYMBOL(sysctl_rmem_max);
 283__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286/* Maximal space eaten by iovec or ancillary data plus some space */
 287int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 293EXPORT_SYMBOL_GPL(memalloc_socks);
 294
 295/**
 296 * sk_set_memalloc - sets %SOCK_MEMALLOC
 297 * @sk: socket to set it on
 298 *
 299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300 * It's the responsibility of the admin to adjust min_free_kbytes
  301 * to meet the requirements.
 302 */
 303void sk_set_memalloc(struct sock *sk)
 304{
 305        sock_set_flag(sk, SOCK_MEMALLOC);
 306        sk->sk_allocation |= __GFP_MEMALLOC;
 307        static_key_slow_inc(&memalloc_socks);
 308}
 309EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311void sk_clear_memalloc(struct sock *sk)
 312{
 313        sock_reset_flag(sk, SOCK_MEMALLOC);
 314        sk->sk_allocation &= ~__GFP_MEMALLOC;
 315        static_key_slow_dec(&memalloc_socks);
 316
 317        /*
 318         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319         * progress of swapping. SOCK_MEMALLOC may be cleared while
 320         * it has rmem allocations due to the last swapfile being deactivated
 321         * but there is a risk that the socket is unusable due to exceeding
 322         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323         */
 324        sk_mem_reclaim(sk);
 325}
 326EXPORT_SYMBOL_GPL(sk_clear_memalloc);
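     /* Intended pairing of the two helpers above (illustrative sketch): a
      * transport carrying swap traffic, e.g. swap over a network block device,
      * marks its socket while the swapfile is active and clears the flag again
      * afterwards:
      *
      *	sk_set_memalloc(sk);     - allocations may dip into emergency reserves
      *	...swap traffic...
      *	sk_clear_memalloc(sk);   - reclaim reserves, obey rmem limits again
      */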
 327
 328int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329{
 330        int ret;
 331        unsigned long pflags = current->flags;
 332
 333        /* these should have been dropped before queueing */
 334        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336        current->flags |= PF_MEMALLOC;
 337        ret = sk->sk_backlog_rcv(sk, skb);
 338        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 339
 340        return ret;
 341}
 342EXPORT_SYMBOL(__sk_backlog_rcv);
 343
 344static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 345{
 346        struct timeval tv;
 347
 348        if (optlen < sizeof(tv))
 349                return -EINVAL;
 350        if (copy_from_user(&tv, optval, sizeof(tv)))
 351                return -EFAULT;
 352        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 353                return -EDOM;
 354
 355        if (tv.tv_sec < 0) {
 356                static int warned __read_mostly;
 357
 358                *timeo_p = 0;
 359                if (warned < 10 && net_ratelimit()) {
 360                        warned++;
 361                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 362                                __func__, current->comm, task_pid_nr(current));
 363                }
 364                return 0;
 365        }
 366        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 367        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 368                return 0;
 369        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 370                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 371        return 0;
 372}
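     /* Worked example of the conversion above (illustrative): with HZ == 1000,
      * a timeval of { .tv_sec = 2, .tv_usec = 500000 } yields
      *
      *	*timeo_p = 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies
      *
      * i.e. microseconds are rounded up to whole ticks.  A zero timeval leaves
      * *timeo_p at MAX_SCHEDULE_TIMEOUT, which means "no timeout".
      */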
 373
 374static void sock_warn_obsolete_bsdism(const char *name)
 375{
 376        static int warned;
 377        static char warncomm[TASK_COMM_LEN];
 378        if (strcmp(warncomm, current->comm) && warned < 5) {
 379                strcpy(warncomm,  current->comm);
 380                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 381                        warncomm, name);
 382                warned++;
 383        }
 384}
 385
 386static bool sock_needs_netstamp(const struct sock *sk)
 387{
 388        switch (sk->sk_family) {
 389        case AF_UNSPEC:
 390        case AF_UNIX:
 391                return false;
 392        default:
 393                return true;
 394        }
 395}
 396
 397static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 398{
 399        if (sk->sk_flags & flags) {
 400                sk->sk_flags &= ~flags;
 401                if (sock_needs_netstamp(sk) &&
 402                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 403                        net_disable_timestamp();
 404        }
 405}
 406
 407
 408int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 409{
 410        unsigned long flags;
 411        struct sk_buff_head *list = &sk->sk_receive_queue;
 412
 413        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 414                atomic_inc(&sk->sk_drops);
 415                trace_sock_rcvqueue_full(sk, skb);
 416                return -ENOMEM;
 417        }
 418
 419        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 420                atomic_inc(&sk->sk_drops);
 421                return -ENOBUFS;
 422        }
 423
 424        skb->dev = NULL;
 425        skb_set_owner_r(skb, sk);
 426
  427        /* we escape from the RCU-protected region, make sure we don't
  428         * leak a non-refcounted dst
 429         */
 430        skb_dst_force(skb);
 431
 432        spin_lock_irqsave(&list->lock, flags);
 433        sock_skb_set_dropcount(sk, skb);
 434        __skb_queue_tail(list, skb);
 435        spin_unlock_irqrestore(&list->lock, flags);
 436
 437        if (!sock_flag(sk, SOCK_DEAD))
 438                sk->sk_data_ready(sk);
 439        return 0;
 440}
 441EXPORT_SYMBOL(__sock_queue_rcv_skb);
 442
 443int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 444{
 445        int err;
 446
 447        err = sk_filter(sk, skb);
 448        if (err)
 449                return err;
 450
 451        return __sock_queue_rcv_skb(sk, skb);
 452}
 453EXPORT_SYMBOL(sock_queue_rcv_skb);
 454
 455int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 456                     const int nested, unsigned int trim_cap, bool refcounted)
 457{
 458        int rc = NET_RX_SUCCESS;
 459
 460        if (sk_filter_trim_cap(sk, skb, trim_cap))
 461                goto discard_and_relse;
 462
 463        skb->dev = NULL;
 464
 465        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 466                atomic_inc(&sk->sk_drops);
 467                goto discard_and_relse;
 468        }
 469        if (nested)
 470                bh_lock_sock_nested(sk);
 471        else
 472                bh_lock_sock(sk);
 473        if (!sock_owned_by_user(sk)) {
 474                /*
 475                 * trylock + unlock semantics:
 476                 */
 477                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 478
 479                rc = sk_backlog_rcv(sk, skb);
 480
 481                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 482        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 483                bh_unlock_sock(sk);
 484                atomic_inc(&sk->sk_drops);
 485                goto discard_and_relse;
 486        }
 487
 488        bh_unlock_sock(sk);
 489out:
 490        if (refcounted)
 491                sock_put(sk);
 492        return rc;
 493discard_and_relse:
 494        kfree_skb(skb);
 495        goto out;
 496}
 497EXPORT_SYMBOL(__sk_receive_skb);
 498
 499struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 500{
 501        struct dst_entry *dst = __sk_dst_get(sk);
 502
 503        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 504                sk_tx_queue_clear(sk);
 505                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 506                dst_release(dst);
 507                return NULL;
 508        }
 509
 510        return dst;
 511}
 512EXPORT_SYMBOL(__sk_dst_check);
 513
 514struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 515{
 516        struct dst_entry *dst = sk_dst_get(sk);
 517
 518        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 519                sk_dst_reset(sk);
 520                dst_release(dst);
 521                return NULL;
 522        }
 523
 524        return dst;
 525}
 526EXPORT_SYMBOL(sk_dst_check);
 527
 528static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 529                                int optlen)
 530{
 531        int ret = -ENOPROTOOPT;
 532#ifdef CONFIG_NETDEVICES
 533        struct net *net = sock_net(sk);
 534        char devname[IFNAMSIZ];
 535        int index;
 536
 537        /* Sorry... */
 538        ret = -EPERM;
 539        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 540                goto out;
 541
 542        ret = -EINVAL;
 543        if (optlen < 0)
 544                goto out;
 545
 546        /* Bind this socket to a particular device like "eth0",
 547         * as specified in the passed interface name. If the
 548         * name is "" or the option length is zero the socket
 549         * is not bound.
 550         */
 551        if (optlen > IFNAMSIZ - 1)
 552                optlen = IFNAMSIZ - 1;
 553        memset(devname, 0, sizeof(devname));
 554
 555        ret = -EFAULT;
 556        if (copy_from_user(devname, optval, optlen))
 557                goto out;
 558
 559        index = 0;
 560        if (devname[0] != '\0') {
 561                struct net_device *dev;
 562
 563                rcu_read_lock();
 564                dev = dev_get_by_name_rcu(net, devname);
 565                if (dev)
 566                        index = dev->ifindex;
 567                rcu_read_unlock();
 568                ret = -ENODEV;
 569                if (!dev)
 570                        goto out;
 571        }
 572
 573        lock_sock(sk);
 574        sk->sk_bound_dev_if = index;
 575        sk_dst_reset(sk);
 576        release_sock(sk);
 577
 578        ret = 0;
 579
 580out:
 581#endif
 582
 583        return ret;
 584}
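     /* Userspace view of the option handled above (illustrative): a process
      * with CAP_NET_RAW can pin a socket to one interface with
      *
      *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
      *
      * and remove the binding again by passing an empty name or a zero
      * option length, as the comment above describes.
      */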
 585
 586static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 587                                int __user *optlen, int len)
 588{
 589        int ret = -ENOPROTOOPT;
 590#ifdef CONFIG_NETDEVICES
 591        struct net *net = sock_net(sk);
 592        char devname[IFNAMSIZ];
 593
 594        if (sk->sk_bound_dev_if == 0) {
 595                len = 0;
 596                goto zero;
 597        }
 598
 599        ret = -EINVAL;
 600        if (len < IFNAMSIZ)
 601                goto out;
 602
 603        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 604        if (ret)
 605                goto out;
 606
 607        len = strlen(devname) + 1;
 608
 609        ret = -EFAULT;
 610        if (copy_to_user(optval, devname, len))
 611                goto out;
 612
 613zero:
 614        ret = -EFAULT;
 615        if (put_user(len, optlen))
 616                goto out;
 617
 618        ret = 0;
 619
 620out:
 621#endif
 622
 623        return ret;
 624}
 625
 626static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 627{
 628        if (valbool)
 629                sock_set_flag(sk, bit);
 630        else
 631                sock_reset_flag(sk, bit);
 632}
 633
 634bool sk_mc_loop(struct sock *sk)
 635{
 636        if (dev_recursion_level())
 637                return false;
 638        if (!sk)
 639                return true;
 640        switch (sk->sk_family) {
 641        case AF_INET:
 642                return inet_sk(sk)->mc_loop;
 643#if IS_ENABLED(CONFIG_IPV6)
 644        case AF_INET6:
 645                return inet6_sk(sk)->mc_loop;
 646#endif
 647        }
 648        WARN_ON(1);
 649        return true;
 650}
 651EXPORT_SYMBOL(sk_mc_loop);
 652
 653/*
 654 *      This is meant for all protocols to use and covers goings on
 655 *      at the socket level. Everything here is generic.
 656 */
 657
 658int sock_setsockopt(struct socket *sock, int level, int optname,
 659                    char __user *optval, unsigned int optlen)
 660{
 661        struct sock *sk = sock->sk;
 662        int val;
 663        int valbool;
 664        struct linger ling;
 665        int ret = 0;
 666
 667        /*
  668         *      Options that do not take a plain integer argument
 669         */
 670
 671        if (optname == SO_BINDTODEVICE)
 672                return sock_setbindtodevice(sk, optval, optlen);
 673
 674        if (optlen < sizeof(int))
 675                return -EINVAL;
 676
 677        if (get_user(val, (int __user *)optval))
 678                return -EFAULT;
 679
 680        valbool = val ? 1 : 0;
 681
 682        lock_sock(sk);
 683
 684        switch (optname) {
 685        case SO_DEBUG:
 686                if (val && !capable(CAP_NET_ADMIN))
 687                        ret = -EACCES;
 688                else
 689                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 690                break;
 691        case SO_REUSEADDR:
 692                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 693                break;
 694        case SO_REUSEPORT:
 695                sk->sk_reuseport = valbool;
 696                break;
 697        case SO_TYPE:
 698        case SO_PROTOCOL:
 699        case SO_DOMAIN:
 700        case SO_ERROR:
 701                ret = -ENOPROTOOPT;
 702                break;
 703        case SO_DONTROUTE:
 704                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 705                break;
 706        case SO_BROADCAST:
 707                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 708                break;
 709        case SO_SNDBUF:
  710                /* Don't error on this; BSD doesn't, and if you think
  711                 * about it this is right. Otherwise apps have to
  712                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  713                 * are treated in BSD as hints.
 714                 */
 715                val = min_t(u32, val, sysctl_wmem_max);
 716set_sndbuf:
 717                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 718                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 719                /* Wake up sending tasks if we upped the value. */
 720                sk->sk_write_space(sk);
 721                break;
 722
 723        case SO_SNDBUFFORCE:
 724                if (!capable(CAP_NET_ADMIN)) {
 725                        ret = -EPERM;
 726                        break;
 727                }
 728                goto set_sndbuf;
 729
 730        case SO_RCVBUF:
  731                /* Don't error on this; BSD doesn't, and if you think
  732                 * about it this is right. Otherwise apps have to
  733                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  734                 * are treated in BSD as hints.
 735                 */
 736                val = min_t(u32, val, sysctl_rmem_max);
 737set_rcvbuf:
 738                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 739                /*
 740                 * We double it on the way in to account for
 741                 * "struct sk_buff" etc. overhead.   Applications
 742                 * assume that the SO_RCVBUF setting they make will
 743                 * allow that much actual data to be received on that
 744                 * socket.
 745                 *
 746                 * Applications are unaware that "struct sk_buff" and
 747                 * other overheads allocate from the receive buffer
 748                 * during socket buffer allocation.
 749                 *
 750                 * And after considering the possible alternatives,
 751                 * returning the value we actually used in getsockopt
 752                 * is the most desirable behavior.
 753                 */
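                /* Concrete example (illustrative): with sysctl_rmem_max of at
                 * least 64 KiB, an SO_RCVBUF value of 65536 is stored as
                 * sk->sk_rcvbuf = 131072, and a later getsockopt(SO_RCVBUF)
                 * reports 131072, not the requested 65536.
                 */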
 754                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 755                break;
 756
 757        case SO_RCVBUFFORCE:
 758                if (!capable(CAP_NET_ADMIN)) {
 759                        ret = -EPERM;
 760                        break;
 761                }
 762                goto set_rcvbuf;
 763
 764        case SO_KEEPALIVE:
 765#ifdef CONFIG_INET
 766                if (sk->sk_protocol == IPPROTO_TCP &&
 767                    sk->sk_type == SOCK_STREAM)
 768                        tcp_set_keepalive(sk, valbool);
 769#endif
 770                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 771                break;
 772
 773        case SO_OOBINLINE:
 774                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 775                break;
 776
 777        case SO_NO_CHECK:
 778                sk->sk_no_check_tx = valbool;
 779                break;
 780
 781        case SO_PRIORITY:
 782                if ((val >= 0 && val <= 6) ||
 783                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 784                        sk->sk_priority = val;
 785                else
 786                        ret = -EPERM;
 787                break;
 788
 789        case SO_LINGER:
 790                if (optlen < sizeof(ling)) {
 791                        ret = -EINVAL;  /* 1003.1g */
 792                        break;
 793                }
 794                if (copy_from_user(&ling, optval, sizeof(ling))) {
 795                        ret = -EFAULT;
 796                        break;
 797                }
 798                if (!ling.l_onoff)
 799                        sock_reset_flag(sk, SOCK_LINGER);
 800                else {
 801#if (BITS_PER_LONG == 32)
 802                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 803                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 804                        else
 805#endif
 806                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 807                        sock_set_flag(sk, SOCK_LINGER);
 808                }
 809                break;
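                /* Units for SO_LINGER above (illustrative note): l_linger is
                 * given in seconds and stored as jiffies (l_linger * HZ);
                 * sock_getsockopt() converts it back with sk_lingertime / HZ.
                 */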
 810
 811        case SO_BSDCOMPAT:
 812                sock_warn_obsolete_bsdism("setsockopt");
 813                break;
 814
 815        case SO_PASSCRED:
 816                if (valbool)
 817                        set_bit(SOCK_PASSCRED, &sock->flags);
 818                else
 819                        clear_bit(SOCK_PASSCRED, &sock->flags);
 820                break;
 821
 822        case SO_TIMESTAMP:
 823        case SO_TIMESTAMPNS:
 824                if (valbool)  {
 825                        if (optname == SO_TIMESTAMP)
 826                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827                        else
 828                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 829                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 830                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 831                } else {
 832                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 833                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 834                }
 835                break;
 836
 837        case SO_TIMESTAMPING:
 838                if (val & ~SOF_TIMESTAMPING_MASK) {
 839                        ret = -EINVAL;
 840                        break;
 841                }
 842
 843                if (val & SOF_TIMESTAMPING_OPT_ID &&
 844                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 845                        if (sk->sk_protocol == IPPROTO_TCP &&
 846                            sk->sk_type == SOCK_STREAM) {
 847                                if ((1 << sk->sk_state) &
 848                                    (TCPF_CLOSE | TCPF_LISTEN)) {
 849                                        ret = -EINVAL;
 850                                        break;
 851                                }
 852                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 853                        } else {
 854                                sk->sk_tskey = 0;
 855                        }
 856                }
 857
 858                if (val & SOF_TIMESTAMPING_OPT_STATS &&
 859                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 860                        ret = -EINVAL;
 861                        break;
 862                }
 863
 864                sk->sk_tsflags = val;
 865                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 866                        sock_enable_timestamp(sk,
 867                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 868                else
 869                        sock_disable_timestamp(sk,
 870                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 871                break;
 872
 873        case SO_RCVLOWAT:
 874                if (val < 0)
 875                        val = INT_MAX;
 876                sk->sk_rcvlowat = val ? : 1;
 877                break;
 878
 879        case SO_RCVTIMEO:
 880                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 881                break;
 882
 883        case SO_SNDTIMEO:
 884                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 885                break;
 886
 887        case SO_ATTACH_FILTER:
 888                ret = -EINVAL;
 889                if (optlen == sizeof(struct sock_fprog)) {
 890                        struct sock_fprog fprog;
 891
 892                        ret = -EFAULT;
 893                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 894                                break;
 895
 896                        ret = sk_attach_filter(&fprog, sk);
 897                }
 898                break;
 899
 900        case SO_ATTACH_BPF:
 901                ret = -EINVAL;
 902                if (optlen == sizeof(u32)) {
 903                        u32 ufd;
 904
 905                        ret = -EFAULT;
 906                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 907                                break;
 908
 909                        ret = sk_attach_bpf(ufd, sk);
 910                }
 911                break;
 912
 913        case SO_ATTACH_REUSEPORT_CBPF:
 914                ret = -EINVAL;
 915                if (optlen == sizeof(struct sock_fprog)) {
 916                        struct sock_fprog fprog;
 917
 918                        ret = -EFAULT;
 919                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 920                                break;
 921
 922                        ret = sk_reuseport_attach_filter(&fprog, sk);
 923                }
 924                break;
 925
 926        case SO_ATTACH_REUSEPORT_EBPF:
 927                ret = -EINVAL;
 928                if (optlen == sizeof(u32)) {
 929                        u32 ufd;
 930
 931                        ret = -EFAULT;
 932                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 933                                break;
 934
 935                        ret = sk_reuseport_attach_bpf(ufd, sk);
 936                }
 937                break;
 938
 939        case SO_DETACH_FILTER:
 940                ret = sk_detach_filter(sk);
 941                break;
 942
 943        case SO_LOCK_FILTER:
 944                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 945                        ret = -EPERM;
 946                else
 947                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 948                break;
 949
 950        case SO_PASSSEC:
 951                if (valbool)
 952                        set_bit(SOCK_PASSSEC, &sock->flags);
 953                else
 954                        clear_bit(SOCK_PASSSEC, &sock->flags);
 955                break;
 956        case SO_MARK:
 957                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 958                        ret = -EPERM;
 959                else
 960                        sk->sk_mark = val;
 961                break;
 962
 963        case SO_RXQ_OVFL:
 964                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 965                break;
 966
 967        case SO_WIFI_STATUS:
 968                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 969                break;
 970
 971        case SO_PEEK_OFF:
 972                if (sock->ops->set_peek_off)
 973                        ret = sock->ops->set_peek_off(sk, val);
 974                else
 975                        ret = -EOPNOTSUPP;
 976                break;
 977
 978        case SO_NOFCS:
 979                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 980                break;
 981
 982        case SO_SELECT_ERR_QUEUE:
 983                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 984                break;
 985
 986#ifdef CONFIG_NET_RX_BUSY_POLL
 987        case SO_BUSY_POLL:
 988                /* allow unprivileged users to decrease the value */
 989                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 990                        ret = -EPERM;
 991                else {
 992                        if (val < 0)
 993                                ret = -EINVAL;
 994                        else
 995                                sk->sk_ll_usec = val;
 996                }
 997                break;
 998#endif
 999
1000        case SO_MAX_PACING_RATE:
1001                sk->sk_max_pacing_rate = val;
1002                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1003                                         sk->sk_max_pacing_rate);
1004                break;
1005
1006        case SO_INCOMING_CPU:
1007                sk->sk_incoming_cpu = val;
1008                break;
1009
1010        case SO_CNX_ADVICE:
1011                if (val == 1)
1012                        dst_negative_advice(sk);
1013                break;
1014        default:
1015                ret = -ENOPROTOOPT;
1016                break;
1017        }
1018        release_sock(sk);
1019        return ret;
1020}
1021EXPORT_SYMBOL(sock_setsockopt);
1022
1023
1024static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1025                          struct ucred *ucred)
1026{
1027        ucred->pid = pid_vnr(pid);
1028        ucred->uid = ucred->gid = -1;
1029        if (cred) {
1030                struct user_namespace *current_ns = current_user_ns();
1031
1032                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1033                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1034        }
1035}
1036
1037int sock_getsockopt(struct socket *sock, int level, int optname,
1038                    char __user *optval, int __user *optlen)
1039{
1040        struct sock *sk = sock->sk;
1041
1042        union {
1043                int val;
1044                struct linger ling;
1045                struct timeval tm;
1046        } v;
1047
1048        int lv = sizeof(int);
1049        int len;
1050
1051        if (get_user(len, optlen))
1052                return -EFAULT;
1053        if (len < 0)
1054                return -EINVAL;
1055
1056        memset(&v, 0, sizeof(v));
1057
1058        switch (optname) {
1059        case SO_DEBUG:
1060                v.val = sock_flag(sk, SOCK_DBG);
1061                break;
1062
1063        case SO_DONTROUTE:
1064                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1065                break;
1066
1067        case SO_BROADCAST:
1068                v.val = sock_flag(sk, SOCK_BROADCAST);
1069                break;
1070
1071        case SO_SNDBUF:
1072                v.val = sk->sk_sndbuf;
1073                break;
1074
1075        case SO_RCVBUF:
1076                v.val = sk->sk_rcvbuf;
1077                break;
1078
1079        case SO_REUSEADDR:
1080                v.val = sk->sk_reuse;
1081                break;
1082
1083        case SO_REUSEPORT:
1084                v.val = sk->sk_reuseport;
1085                break;
1086
1087        case SO_KEEPALIVE:
1088                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1089                break;
1090
1091        case SO_TYPE:
1092                v.val = sk->sk_type;
1093                break;
1094
1095        case SO_PROTOCOL:
1096                v.val = sk->sk_protocol;
1097                break;
1098
1099        case SO_DOMAIN:
1100                v.val = sk->sk_family;
1101                break;
1102
1103        case SO_ERROR:
1104                v.val = -sock_error(sk);
1105                if (v.val == 0)
1106                        v.val = xchg(&sk->sk_err_soft, 0);
1107                break;
1108
1109        case SO_OOBINLINE:
1110                v.val = sock_flag(sk, SOCK_URGINLINE);
1111                break;
1112
1113        case SO_NO_CHECK:
1114                v.val = sk->sk_no_check_tx;
1115                break;
1116
1117        case SO_PRIORITY:
1118                v.val = sk->sk_priority;
1119                break;
1120
1121        case SO_LINGER:
1122                lv              = sizeof(v.ling);
1123                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1124                v.ling.l_linger = sk->sk_lingertime / HZ;
1125                break;
1126
1127        case SO_BSDCOMPAT:
1128                sock_warn_obsolete_bsdism("getsockopt");
1129                break;
1130
1131        case SO_TIMESTAMP:
1132                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1133                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1134                break;
1135
1136        case SO_TIMESTAMPNS:
1137                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1138                break;
1139
1140        case SO_TIMESTAMPING:
1141                v.val = sk->sk_tsflags;
1142                break;
1143
1144        case SO_RCVTIMEO:
1145                lv = sizeof(struct timeval);
1146                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1147                        v.tm.tv_sec = 0;
1148                        v.tm.tv_usec = 0;
1149                } else {
1150                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1151                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1152                }
1153                break;
1154
1155        case SO_SNDTIMEO:
1156                lv = sizeof(struct timeval);
1157                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1158                        v.tm.tv_sec = 0;
1159                        v.tm.tv_usec = 0;
1160                } else {
1161                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1162                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1163                }
1164                break;
1165
1166        case SO_RCVLOWAT:
1167                v.val = sk->sk_rcvlowat;
1168                break;
1169
1170        case SO_SNDLOWAT:
1171                v.val = 1;
1172                break;
1173
1174        case SO_PASSCRED:
1175                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1176                break;
1177
1178        case SO_PEERCRED:
1179        {
1180                struct ucred peercred;
1181                if (len > sizeof(peercred))
1182                        len = sizeof(peercred);
1183                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1184                if (copy_to_user(optval, &peercred, len))
1185                        return -EFAULT;
1186                goto lenout;
1187        }
1188
1189        case SO_PEERNAME:
1190        {
1191                char address[128];
1192
1193                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1194                        return -ENOTCONN;
1195                if (lv < len)
1196                        return -EINVAL;
1197                if (copy_to_user(optval, address, len))
1198                        return -EFAULT;
1199                goto lenout;
1200        }
1201
1202        /* Dubious BSD thing... Probably nobody even uses it, but
1203         * the UNIX standard wants it for whatever reason... -DaveM
1204         */
1205        case SO_ACCEPTCONN:
1206                v.val = sk->sk_state == TCP_LISTEN;
1207                break;
1208
1209        case SO_PASSSEC:
1210                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1211                break;
1212
1213        case SO_PEERSEC:
1214                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1215
1216        case SO_MARK:
1217                v.val = sk->sk_mark;
1218                break;
1219
1220        case SO_RXQ_OVFL:
1221                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1222                break;
1223
1224        case SO_WIFI_STATUS:
1225                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1226                break;
1227
1228        case SO_PEEK_OFF:
1229                if (!sock->ops->set_peek_off)
1230                        return -EOPNOTSUPP;
1231
1232                v.val = sk->sk_peek_off;
1233                break;
1234        case SO_NOFCS:
1235                v.val = sock_flag(sk, SOCK_NOFCS);
1236                break;
1237
1238        case SO_BINDTODEVICE:
1239                return sock_getbindtodevice(sk, optval, optlen, len);
1240
1241        case SO_GET_FILTER:
1242                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1243                if (len < 0)
1244                        return len;
1245
1246                goto lenout;
1247
1248        case SO_LOCK_FILTER:
1249                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1250                break;
1251
1252        case SO_BPF_EXTENSIONS:
1253                v.val = bpf_tell_extensions();
1254                break;
1255
1256        case SO_SELECT_ERR_QUEUE:
1257                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1258                break;
1259
1260#ifdef CONFIG_NET_RX_BUSY_POLL
1261        case SO_BUSY_POLL:
1262                v.val = sk->sk_ll_usec;
1263                break;
1264#endif
1265
1266        case SO_MAX_PACING_RATE:
1267                v.val = sk->sk_max_pacing_rate;
1268                break;
1269
1270        case SO_INCOMING_CPU:
1271                v.val = sk->sk_incoming_cpu;
1272                break;
1273
1274        default:
 1275                /* We implement SO_SNDLOWAT etc. to not be settable
1276                 * (1003.1g 7).
1277                 */
1278                return -ENOPROTOOPT;
1279        }
1280
1281        if (len > lv)
1282                len = lv;
1283        if (copy_to_user(optval, &v, len))
1284                return -EFAULT;
1285lenout:
1286        if (put_user(len, optlen))
1287                return -EFAULT;
1288        return 0;
1289}
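     /* Common userspace pattern served by the SO_ERROR branch above
      * (illustrative): after a non-blocking connect() has finished, the
      * pending status is collected with
      *
      *	int err = 0;
      *	socklen_t len = sizeof(err);
      *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
      *
      * which reads and clears sk->sk_err via sock_error() as seen above.
      */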
1290
1291/*
1292 * Initialize an sk_lock.
1293 *
1294 * (We also register the sk_lock with the lock validator.)
1295 */
1296static inline void sock_lock_init(struct sock *sk)
1297{
1298        sock_lock_init_class_and_name(sk,
1299                        af_family_slock_key_strings[sk->sk_family],
1300                        af_family_slock_keys + sk->sk_family,
1301                        af_family_key_strings[sk->sk_family],
1302                        af_family_keys + sk->sk_family);
1303}
1304
1305/*
1306 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1307 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1308 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1309 */
1310static void sock_copy(struct sock *nsk, const struct sock *osk)
1311{
1312#ifdef CONFIG_SECURITY_NETWORK
1313        void *sptr = nsk->sk_security;
1314#endif
1315        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1316
1317        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1318               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1319
1320#ifdef CONFIG_SECURITY_NETWORK
1321        nsk->sk_security = sptr;
1322        security_sk_clone(osk, nsk);
1323#endif
1324}
1325
1326static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1327                int family)
1328{
1329        struct sock *sk;
1330        struct kmem_cache *slab;
1331
1332        slab = prot->slab;
1333        if (slab != NULL) {
1334                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1335                if (!sk)
1336                        return sk;
1337                if (priority & __GFP_ZERO)
1338                        sk_prot_clear_nulls(sk, prot->obj_size);
1339        } else
1340                sk = kmalloc(prot->obj_size, priority);
1341
1342        if (sk != NULL) {
1343                kmemcheck_annotate_bitfield(sk, flags);
1344
1345                if (security_sk_alloc(sk, family, priority))
1346                        goto out_free;
1347
1348                if (!try_module_get(prot->owner))
1349                        goto out_free_sec;
1350                sk_tx_queue_clear(sk);
1351        }
1352
1353        return sk;
1354
1355out_free_sec:
1356        security_sk_free(sk);
1357out_free:
1358        if (slab != NULL)
1359                kmem_cache_free(slab, sk);
1360        else
1361                kfree(sk);
1362        return NULL;
1363}
1364
1365static void sk_prot_free(struct proto *prot, struct sock *sk)
1366{
1367        struct kmem_cache *slab;
1368        struct module *owner;
1369
1370        owner = prot->owner;
1371        slab = prot->slab;
1372
1373        cgroup_sk_free(&sk->sk_cgrp_data);
1374        mem_cgroup_sk_free(sk);
1375        security_sk_free(sk);
1376        if (slab != NULL)
1377                kmem_cache_free(slab, sk);
1378        else
1379                kfree(sk);
1380        module_put(owner);
1381}
1382
1383/**
1384 *      sk_alloc - All socket objects are allocated here
1385 *      @net: the applicable net namespace
1386 *      @family: protocol family
1387 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1388 *      @prot: struct proto associated with this new sock instance
1389 *      @kern: is this to be a kernel socket?
1390 */
1391struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1392                      struct proto *prot, int kern)
1393{
1394        struct sock *sk;
1395
1396        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1397        if (sk) {
1398                sk->sk_family = family;
1399                /*
1400                 * See comment in struct sock definition to understand
1401                 * why we need sk_prot_creator -acme
1402                 */
1403                sk->sk_prot = sk->sk_prot_creator = prot;
1404                sock_lock_init(sk);
1405                sk->sk_net_refcnt = kern ? 0 : 1;
1406                if (likely(sk->sk_net_refcnt))
1407                        get_net(net);
1408                sock_net_set(sk, net);
1409                atomic_set(&sk->sk_wmem_alloc, 1);
1410
1411                mem_cgroup_sk_alloc(sk);
1412                cgroup_sk_alloc(&sk->sk_cgrp_data);
1413                sock_update_classid(&sk->sk_cgrp_data);
1414                sock_update_netprioidx(&sk->sk_cgrp_data);
1415        }
1416
1417        return sk;
1418}
1419EXPORT_SYMBOL(sk_alloc);
1420
1421/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1422 * grace period. This is the case for UDP sockets and TCP listeners.
1423 */
1424static void __sk_destruct(struct rcu_head *head)
1425{
1426        struct sock *sk = container_of(head, struct sock, sk_rcu);
1427        struct sk_filter *filter;
1428
1429        if (sk->sk_destruct)
1430                sk->sk_destruct(sk);
1431
1432        filter = rcu_dereference_check(sk->sk_filter,
1433                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1434        if (filter) {
1435                sk_filter_uncharge(sk, filter);
1436                RCU_INIT_POINTER(sk->sk_filter, NULL);
1437        }
1438        if (rcu_access_pointer(sk->sk_reuseport_cb))
1439                reuseport_detach_sock(sk);
1440
1441        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1442
1443        if (atomic_read(&sk->sk_omem_alloc))
1444                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1445                         __func__, atomic_read(&sk->sk_omem_alloc));
1446
1447        if (sk->sk_peer_cred)
1448                put_cred(sk->sk_peer_cred);
1449        put_pid(sk->sk_peer_pid);
1450        if (likely(sk->sk_net_refcnt))
1451                put_net(sock_net(sk));
1452        sk_prot_free(sk->sk_prot_creator, sk);
1453}
1454
1455void sk_destruct(struct sock *sk)
1456{
1457        if (sock_flag(sk, SOCK_RCU_FREE))
1458                call_rcu(&sk->sk_rcu, __sk_destruct);
1459        else
1460                __sk_destruct(&sk->sk_rcu);
1461}
1462
1463static void __sk_free(struct sock *sk)
1464{
1465        if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1466                sock_diag_broadcast_destroy(sk);
1467        else
1468                sk_destruct(sk);
1469}
1470
1471void sk_free(struct sock *sk)
1472{
1473        /*
 1474         * We subtract one from sk_wmem_alloc so we can tell whether
 1475         * some packets are still in some tx queue.
 1476         * If the count is not zero, sock_wfree() will call __sk_free(sk) later
1477         */
1478        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1479                __sk_free(sk);
1480}
1481EXPORT_SYMBOL(sk_free);
1482
1483/**
1484 *      sk_clone_lock - clone a socket, and lock its clone
1485 *      @sk: the socket to clone
1486 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1487 *
1488 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1489 */
1490struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1491{
1492        struct sock *newsk;
1493        bool is_charged = true;
1494
1495        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1496        if (newsk != NULL) {
1497                struct sk_filter *filter;
1498
1499                sock_copy(newsk, sk);
1500
1501                /* SANITY */
1502                if (likely(newsk->sk_net_refcnt))
1503                        get_net(sock_net(newsk));
1504                sk_node_init(&newsk->sk_node);
1505                sock_lock_init(newsk);
1506                bh_lock_sock(newsk);
1507                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1508                newsk->sk_backlog.len = 0;
1509
1510                atomic_set(&newsk->sk_rmem_alloc, 0);
1511                /*
1512                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1513                 */
1514                atomic_set(&newsk->sk_wmem_alloc, 1);
1515                atomic_set(&newsk->sk_omem_alloc, 0);
1516                skb_queue_head_init(&newsk->sk_receive_queue);
1517                skb_queue_head_init(&newsk->sk_write_queue);
1518
1519                rwlock_init(&newsk->sk_callback_lock);
1520                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1521                                af_callback_keys + newsk->sk_family,
1522                                af_family_clock_key_strings[newsk->sk_family]);
1523
1524                newsk->sk_dst_cache     = NULL;
1525                newsk->sk_wmem_queued   = 0;
1526                newsk->sk_forward_alloc = 0;
1527                atomic_set(&newsk->sk_drops, 0);
1528                newsk->sk_send_head     = NULL;
1529                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1530
1531                sock_reset_flag(newsk, SOCK_DONE);
1532                skb_queue_head_init(&newsk->sk_error_queue);
1533
1534                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1535                if (filter != NULL)
1536                        /* though it's an empty new sock, the charging may fail
1537                         * if sysctl_optmem_max was changed between creation of
1538                         * original socket and cloning
1539                         */
1540                        is_charged = sk_filter_charge(newsk, filter);
1541
1542                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1543                        /* It is still a raw copy of the parent, so invalidate
1544                         * the destructor and do a plain sk_free() */
1545                        newsk->sk_destruct = NULL;
1546                        bh_unlock_sock(newsk);
1547                        sk_free(newsk);
1548                        newsk = NULL;
1549                        goto out;
1550                }
1551                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1552
1553                newsk->sk_err      = 0;
1554                newsk->sk_err_soft = 0;
1555                newsk->sk_priority = 0;
1556                newsk->sk_incoming_cpu = raw_smp_processor_id();
1557                atomic64_set(&newsk->sk_cookie, 0);
1558
1559                mem_cgroup_sk_alloc(newsk);
1560                cgroup_sk_alloc(&newsk->sk_cgrp_data);
1561
1562                /*
1563                 * Before updating sk_refcnt, we must commit prior changes to memory
1564                 * (Documentation/RCU/rculist_nulls.txt for details)
1565                 */
1566                smp_wmb();
1567                atomic_set(&newsk->sk_refcnt, 2);
1568
1569                /*
1570                 * Increment the counter in the same struct proto as the master
1571                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1572                 * is the same as sk->sk_prot->socks, as this field was copied
1573                 * with memcpy).
1574                 *
1575                 * This _changes_ the previous behaviour, where
1576                 * tcp_create_openreq_child always was incrementing the
1577                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1578                 * to be taken into account in all callers. -acme
1579                 */
1580                sk_refcnt_debug_inc(newsk);
1581                sk_set_socket(newsk, NULL);
1582                newsk->sk_wq = NULL;
1583
1584                if (newsk->sk_prot->sockets_allocated)
1585                        sk_sockets_allocated_inc(newsk);
1586
1587                if (sock_needs_netstamp(sk) &&
1588                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1589                        net_enable_timestamp();
1590        }
1591out:
1592        return newsk;
1593}
1594EXPORT_SYMBOL_GPL(sk_clone_lock);
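/*
 * Editorial sketch (not part of the upstream file): a hypothetical caller of
 * sk_clone_lock().  The clone is returned bh-locked, so the caller balances
 * it with bh_unlock_sock() on every path, per the kerneldoc above.
 */
static struct sock *example_clone_sock(const struct sock *parent)
{
        struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);

        if (!child)
                return NULL;

        /* ... protocol-specific initialisation of the clone ... */

        bh_unlock_sock(child);
        return child;
}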
1595
1596void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1597{
1598        u32 max_segs = 1;
1599
1600        sk_dst_set(sk, dst);
1601        sk->sk_route_caps = dst->dev->features;
1602        if (sk->sk_route_caps & NETIF_F_GSO)
1603                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1604        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1605        if (sk_can_gso(sk)) {
1606                if (dst->header_len) {
1607                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1608                } else {
1609                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1610                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1611                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1612                }
1613        }
1614        sk->sk_gso_max_segs = max_segs;
1615}
1616EXPORT_SYMBOL_GPL(sk_setup_caps);
1617
1618/*
1619 *      Simple resource managers for sockets.
1620 */
1621
1622
1623/*
1624 * Write buffer destructor automatically called from kfree_skb.
1625 */
1626void sock_wfree(struct sk_buff *skb)
1627{
1628        struct sock *sk = skb->sk;
1629        unsigned int len = skb->truesize;
1630
1631        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1632                /*
1633                 * Keep a reference on sk_wmem_alloc, this will be released
1634                 * after sk_write_space() call
1635                 */
1636                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1637                sk->sk_write_space(sk);
1638                len = 1;
1639        }
1640        /*
1641         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1642         * could not do because of in-flight packets
1643         */
1644        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1645                __sk_free(sk);
1646}
1647EXPORT_SYMBOL(sock_wfree);
1648
1649/* This variant of sock_wfree() is used by TCP,
1650 * since it sets SOCK_USE_WRITE_QUEUE.
1651 */
1652void __sock_wfree(struct sk_buff *skb)
1653{
1654        struct sock *sk = skb->sk;
1655
1656        if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1657                __sk_free(sk);
1658}
1659
1660void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1661{
1662        skb_orphan(skb);
1663        skb->sk = sk;
1664#ifdef CONFIG_INET
1665        if (unlikely(!sk_fullsock(sk))) {
1666                skb->destructor = sock_edemux;
1667                sock_hold(sk);
1668                return;
1669        }
1670#endif
1671        skb->destructor = sock_wfree;
1672        skb_set_hash_from_sk(skb, sk);
1673        /*
1674         * We used to take a refcount on sk, but the following operation
1675         * is enough to guarantee sk_free() won't free this sock until
1676         * all in-flight packets are completed
1677         */
1678        atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1679}
1680EXPORT_SYMBOL(skb_set_owner_w);
1681
1682/* This helper is used by netem, as it can hold packets in its
1683 * delay queue. We want to allow the owner socket to send more
1684 * packets, as if they were already TX completed by a typical driver.
1685 * But we also want to keep skb->sk set because some packet schedulers
1686 * rely on it (sch_fq for example). So we set skb->truesize to a small
1687 * amount (1) and decrease sk_wmem_alloc accordingly.
1688 */
1689void skb_orphan_partial(struct sk_buff *skb)
1690{
1691        /* If this skb is a TCP pure ACK or already went here,
1692         * we have nothing to do. 2 is already a very small truesize.
1693         */
1694        if (skb->truesize <= 2)
1695                return;
1696
1697        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1698         * so we do not completely orphan the skb, but transfer all
1699         * accounted bytes but one, to avoid unexpected reorders.
1700         */
1701        if (skb->destructor == sock_wfree
1702#ifdef CONFIG_INET
1703            || skb->destructor == tcp_wfree
1704#endif
1705                ) {
1706                atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1707                skb->truesize = 1;
1708        } else {
1709                skb_orphan(skb);
1710        }
1711}
1712EXPORT_SYMBOL(skb_orphan_partial);
1713
1714/*
1715 * Read buffer destructor automatically called from kfree_skb.
1716 */
1717void sock_rfree(struct sk_buff *skb)
1718{
1719        struct sock *sk = skb->sk;
1720        unsigned int len = skb->truesize;
1721
1722        atomic_sub(len, &sk->sk_rmem_alloc);
1723        sk_mem_uncharge(sk, len);
1724}
1725EXPORT_SYMBOL(sock_rfree);
1726
1727/*
1728 * Buffer destructor for skbs that are not used directly in read or write
1729 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1730 */
1731void sock_efree(struct sk_buff *skb)
1732{
1733        sock_put(skb->sk);
1734}
1735EXPORT_SYMBOL(sock_efree);
1736
1737kuid_t sock_i_uid(struct sock *sk)
1738{
1739        kuid_t uid;
1740
1741        read_lock_bh(&sk->sk_callback_lock);
1742        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1743        read_unlock_bh(&sk->sk_callback_lock);
1744        return uid;
1745}
1746EXPORT_SYMBOL(sock_i_uid);
1747
1748unsigned long sock_i_ino(struct sock *sk)
1749{
1750        unsigned long ino;
1751
1752        read_lock_bh(&sk->sk_callback_lock);
1753        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1754        read_unlock_bh(&sk->sk_callback_lock);
1755        return ino;
1756}
1757EXPORT_SYMBOL(sock_i_ino);
1758
1759/*
1760 * Allocate a skb from the socket's send buffer.
1761 */
1762struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1763                             gfp_t priority)
1764{
1765        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1766                struct sk_buff *skb = alloc_skb(size, priority);
1767                if (skb) {
1768                        skb_set_owner_w(skb, sk);
1769                        return skb;
1770                }
1771        }
1772        return NULL;
1773}
1774EXPORT_SYMBOL(sock_wmalloc);
1775
1776/*
1777 * Allocate a memory block from the socket's option memory buffer.
1778 */
1779void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1780{
1781        if ((unsigned int)size <= sysctl_optmem_max &&
1782            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1783                void *mem;
1784                /* First do the add, to avoid the race if kmalloc
1785                 * might sleep.
1786                 */
1787                atomic_add(size, &sk->sk_omem_alloc);
1788                mem = kmalloc(size, priority);
1789                if (mem)
1790                        return mem;
1791                atomic_sub(size, &sk->sk_omem_alloc);
1792        }
1793        return NULL;
1794}
1795EXPORT_SYMBOL(sock_kmalloc);
1796
1797/* Free an option memory block. Note, we actually want the inline
1798 * here as this allows gcc to detect the nullify and fold away the
1799 * condition entirely.
1800 */
1801static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1802                                  const bool nullify)
1803{
1804        if (WARN_ON_ONCE(!mem))
1805                return;
1806        if (nullify)
1807                kzfree(mem);
1808        else
1809                kfree(mem);
1810        atomic_sub(size, &sk->sk_omem_alloc);
1811}
1812
1813void sock_kfree_s(struct sock *sk, void *mem, int size)
1814{
1815        __sock_kfree_s(sk, mem, size, false);
1816}
1817EXPORT_SYMBOL(sock_kfree_s);
1818
1819void sock_kzfree_s(struct sock *sk, void *mem, int size)
1820{
1821        __sock_kfree_s(sk, mem, size, true);
1822}
1823EXPORT_SYMBOL(sock_kzfree_s);
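/*
 * Editorial sketch: the usual pairing of sock_kmalloc() and sock_kfree_s()
 * for per-socket option data charged to sk_omem_alloc.  "example_opt" and
 * the helper are hypothetical.
 */
struct example_opt {
        u32     flags;
        u32     value;
};

static int example_replace_opt(struct sock *sk, u32 flags, u32 value)
{
        struct example_opt *opt;

        opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
        if (!opt)
                return -ENOBUFS;

        opt->flags = flags;
        opt->value = value;

        /* ... publish opt under the socket lock, pick up the old one ... */

        sock_kfree_s(sk, opt, sizeof(*opt));    /* matching release */
        return 0;
}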
1824
1825/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1826   I think these locks should be removed for datagram sockets.
1827 */
1828static long sock_wait_for_wmem(struct sock *sk, long timeo)
1829{
1830        DEFINE_WAIT(wait);
1831
1832        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1833        for (;;) {
1834                if (!timeo)
1835                        break;
1836                if (signal_pending(current))
1837                        break;
1838                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1839                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1840                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1841                        break;
1842                if (sk->sk_shutdown & SEND_SHUTDOWN)
1843                        break;
1844                if (sk->sk_err)
1845                        break;
1846                timeo = schedule_timeout(timeo);
1847        }
1848        finish_wait(sk_sleep(sk), &wait);
1849        return timeo;
1850}
1851
1852
1853/*
1854 *      Generic send/receive buffer handlers
1855 */
1856
1857struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1858                                     unsigned long data_len, int noblock,
1859                                     int *errcode, int max_page_order)
1860{
1861        struct sk_buff *skb;
1862        long timeo;
1863        int err;
1864
1865        timeo = sock_sndtimeo(sk, noblock);
1866        for (;;) {
1867                err = sock_error(sk);
1868                if (err != 0)
1869                        goto failure;
1870
1871                err = -EPIPE;
1872                if (sk->sk_shutdown & SEND_SHUTDOWN)
1873                        goto failure;
1874
1875                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1876                        break;
1877
1878                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1879                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1880                err = -EAGAIN;
1881                if (!timeo)
1882                        goto failure;
1883                if (signal_pending(current))
1884                        goto interrupted;
1885                timeo = sock_wait_for_wmem(sk, timeo);
1886        }
1887        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1888                                   errcode, sk->sk_allocation);
1889        if (skb)
1890                skb_set_owner_w(skb, sk);
1891        return skb;
1892
1893interrupted:
1894        err = sock_intr_errno(timeo);
1895failure:
1896        *errcode = err;
1897        return NULL;
1898}
1899EXPORT_SYMBOL(sock_alloc_send_pskb);
1900
1901struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1902                                    int noblock, int *errcode)
1903{
1904        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1905}
1906EXPORT_SYMBOL(sock_alloc_send_skb);
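/*
 * Editorial sketch: a datagram-style sendmsg() using sock_alloc_send_skb()
 * to get a buffer charged to the socket, then copying the payload from the
 * message iterator.  Header reservation and transmit are illustrative.
 */
static int example_dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct sk_buff *skb;
        int err;

        skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
                                  msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb)
                return err;

        skb_reserve(skb, MAX_HEADER);
        err = memcpy_from_msg(skb_put(skb, len), msg, len);
        if (err) {
                kfree_skb(skb);
                return err;
        }

        /* ... a real protocol would queue skb for transmission here ... */
        kfree_skb(skb);
        return len;
}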
1907
1908int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1909                     struct sockcm_cookie *sockc)
1910{
1911        u32 tsflags;
1912
1913        switch (cmsg->cmsg_type) {
1914        case SO_MARK:
1915                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1916                        return -EPERM;
1917                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1918                        return -EINVAL;
1919                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1920                break;
1921        case SO_TIMESTAMPING:
1922                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1923                        return -EINVAL;
1924
1925                tsflags = *(u32 *)CMSG_DATA(cmsg);
1926                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1927                        return -EINVAL;
1928
1929                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1930                sockc->tsflags |= tsflags;
1931                break;
1932        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1933        case SCM_RIGHTS:
1934        case SCM_CREDENTIALS:
1935                break;
1936        default:
1937                return -EINVAL;
1938        }
1939        return 0;
1940}
1941EXPORT_SYMBOL(__sock_cmsg_send);
1942
1943int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1944                   struct sockcm_cookie *sockc)
1945{
1946        struct cmsghdr *cmsg;
1947        int ret;
1948
1949        for_each_cmsghdr(cmsg, msg) {
1950                if (!CMSG_OK(msg, cmsg))
1951                        return -EINVAL;
1952                if (cmsg->cmsg_level != SOL_SOCKET)
1953                        continue;
1954                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1955                if (ret)
1956                        return ret;
1957        }
1958        return 0;
1959}
1960EXPORT_SYMBOL(sock_cmsg_send);
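/*
 * Editorial sketch: consuming SOL_SOCKET control messages in a sendmsg()
 * path.  The cookie is seeded from the socket defaults and then overridden
 * by sock_cmsg_send() for this call only.
 */
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg,
                               struct sockcm_cookie *sockc)
{
        sockc->tsflags = sk->sk_tsflags;
        sockc->mark = sk->sk_mark;

        if (msg->msg_controllen)
                return sock_cmsg_send(sk, msg, sockc);
        return 0;
}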
1961
1962/* On 32bit arches, an skb frag is limited to 2^15 */
1963#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1964
1965/**
1966 * skb_page_frag_refill - check that a page_frag contains enough room
1967 * @sz: minimum size of the fragment we want to get
1968 * @pfrag: pointer to page_frag
1969 * @gfp: priority for memory allocation
1970 *
1971 * Note: While this allocator tries to use high order pages, there is
1972 * no guarantee that allocations succeed. Therefore, @sz MUST be
1973 * less than or equal to PAGE_SIZE.
1974 */
1975bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1976{
1977        if (pfrag->page) {
1978                if (page_ref_count(pfrag->page) == 1) {
1979                        pfrag->offset = 0;
1980                        return true;
1981                }
1982                if (pfrag->offset + sz <= pfrag->size)
1983                        return true;
1984                put_page(pfrag->page);
1985        }
1986
1987        pfrag->offset = 0;
1988        if (SKB_FRAG_PAGE_ORDER) {
1989                /* Avoid direct reclaim but allow kswapd to wake */
1990                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1991                                          __GFP_COMP | __GFP_NOWARN |
1992                                          __GFP_NORETRY,
1993                                          SKB_FRAG_PAGE_ORDER);
1994                if (likely(pfrag->page)) {
1995                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1996                        return true;
1997                }
1998        }
1999        pfrag->page = alloc_page(gfp);
2000        if (likely(pfrag->page)) {
2001                pfrag->size = PAGE_SIZE;
2002                return true;
2003        }
2004        return false;
2005}
2006EXPORT_SYMBOL(skb_page_frag_refill);
2007
2008bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2009{
2010        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2011                return true;
2012
2013        sk_enter_memory_pressure(sk);
2014        sk_stream_moderate_sndbuf(sk);
2015        return false;
2016}
2017EXPORT_SYMBOL(sk_page_frag_refill);
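/*
 * Editorial sketch: the common stream-send pattern of refilling the
 * per-socket page fragment and copying user data into it.  Error handling
 * is reduced to the minimum; "copy" is the number of bytes requested.
 */
static int example_copy_to_page_frag(struct sock *sk, struct msghdr *msg, int copy)
{
        struct page_frag *pfrag = sk_page_frag(sk);
        int err;

        if (!sk_page_frag_refill(sk, pfrag))
                return -ENOMEM; /* a real caller would wait for memory */

        copy = min_t(int, copy, pfrag->size - pfrag->offset);
        err = memcpy_from_msg(page_address(pfrag->page) + pfrag->offset,
                              msg, copy);
        if (err)
                return err;

        /* ... attach the fragment to an skb, then advance the offset ... */
        pfrag->offset += copy;
        return copy;
}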
2018
2019static void __lock_sock(struct sock *sk)
2020        __releases(&sk->sk_lock.slock)
2021        __acquires(&sk->sk_lock.slock)
2022{
2023        DEFINE_WAIT(wait);
2024
2025        for (;;) {
2026                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2027                                        TASK_UNINTERRUPTIBLE);
2028                spin_unlock_bh(&sk->sk_lock.slock);
2029                schedule();
2030                spin_lock_bh(&sk->sk_lock.slock);
2031                if (!sock_owned_by_user(sk))
2032                        break;
2033        }
2034        finish_wait(&sk->sk_lock.wq, &wait);
2035}
2036
2037static void __release_sock(struct sock *sk)
2038        __releases(&sk->sk_lock.slock)
2039        __acquires(&sk->sk_lock.slock)
2040{
2041        struct sk_buff *skb, *next;
2042
2043        while ((skb = sk->sk_backlog.head) != NULL) {
2044                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2045
2046                spin_unlock_bh(&sk->sk_lock.slock);
2047
2048                do {
2049                        next = skb->next;
2050                        prefetch(next);
2051                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2052                        skb->next = NULL;
2053                        sk_backlog_rcv(sk, skb);
2054
2055                        cond_resched();
2056
2057                        skb = next;
2058                } while (skb != NULL);
2059
2060                spin_lock_bh(&sk->sk_lock.slock);
2061        }
2062
2063        /*
2064         * Doing the zeroing here guarantees we cannot loop forever
2065         * while a wild producer attempts to flood us.
2066         */
2067        sk->sk_backlog.len = 0;
2068}
2069
2070void __sk_flush_backlog(struct sock *sk)
2071{
2072        spin_lock_bh(&sk->sk_lock.slock);
2073        __release_sock(sk);
2074        spin_unlock_bh(&sk->sk_lock.slock);
2075}
2076
2077/**
2078 * sk_wait_data - wait for data to arrive at sk_receive_queue
2079 * @sk:    sock to wait on
2080 * @timeo: for how long
2081 * @skb:   last skb seen on sk_receive_queue
2082 *
2083 * Now socket state including sk->sk_err is changed only under lock,
2084 * hence we may omit checks after joining wait queue.
2085 * We check the receive queue before schedule() only as an optimization;
2086 * it is very likely that release_sock() added new data.
2087 */
2088int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2089{
2090        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2091        int rc;
2092
2093        add_wait_queue(sk_sleep(sk), &wait);
2094        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2095        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2096        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2097        remove_wait_queue(sk_sleep(sk), &wait);
2098        return rc;
2099}
2100EXPORT_SYMBOL(sk_wait_data);
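/*
 * Editorial sketch: a blocking receive loop built on sk_wait_data().  The
 * socket lock is held by the caller; sk_wait_data() releases it while the
 * task sleeps and re-takes it before returning.
 */
static struct sk_buff *example_wait_for_packet(struct sock *sk, int noblock,
                                               int *err)
{
        struct sk_buff *skb;
        long timeo = sock_rcvtimeo(sk, noblock);

        *err = 0;
        lock_sock(sk);
        while (!(skb = skb_peek(&sk->sk_receive_queue))) {
                *err = sock_error(sk);
                if (*err)
                        break;
                *err = -EAGAIN;
                if (!timeo)
                        break;
                *err = 0;
                sk_wait_data(sk, &timeo, NULL);
                if (signal_pending(current)) {
                        *err = sock_intr_errno(timeo);
                        break;
                }
        }
        release_sock(sk);
        return skb;
}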
2101
2102/**
2103 *      __sk_mem_raise_allocated - increase memory_allocated
2104 *      @sk: socket
2105 *      @size: memory size to allocate
2106 *      @amt: pages to allocate
2107 *      @kind: allocation type
2108 *
2109 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2110 */
2111int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2112{
2113        struct proto *prot = sk->sk_prot;
2114        long allocated = sk_memory_allocated_add(sk, amt);
2115
2116        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2117            !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2118                goto suppress_allocation;
2119
2120        /* Under limit. */
2121        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2122                sk_leave_memory_pressure(sk);
2123                return 1;
2124        }
2125
2126        /* Under pressure. */
2127        if (allocated > sk_prot_mem_limits(sk, 1))
2128                sk_enter_memory_pressure(sk);
2129
2130        /* Over hard limit. */
2131        if (allocated > sk_prot_mem_limits(sk, 2))
2132                goto suppress_allocation;
2133
2134        /* guarantee minimum buffer size under pressure */
2135        if (kind == SK_MEM_RECV) {
2136                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2137                        return 1;
2138
2139        } else { /* SK_MEM_SEND */
2140                if (sk->sk_type == SOCK_STREAM) {
2141                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2142                                return 1;
2143                } else if (atomic_read(&sk->sk_wmem_alloc) <
2144                           prot->sysctl_wmem[0])
2145                                return 1;
2146        }
2147
2148        if (sk_has_memory_pressure(sk)) {
2149                int alloc;
2150
2151                if (!sk_under_memory_pressure(sk))
2152                        return 1;
2153                alloc = sk_sockets_allocated_read_positive(sk);
2154                if (sk_prot_mem_limits(sk, 2) > alloc *
2155                    sk_mem_pages(sk->sk_wmem_queued +
2156                                 atomic_read(&sk->sk_rmem_alloc) +
2157                                 sk->sk_forward_alloc))
2158                        return 1;
2159        }
2160
2161suppress_allocation:
2162
2163        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2164                sk_stream_moderate_sndbuf(sk);
2165
2166                /* Fail only if socket is _under_ its sndbuf.
2167                 * In this case we cannot block, so we have to fail.
2168                 */
2169                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2170                        return 1;
2171        }
2172
2173        trace_sock_exceed_buf_limit(sk, prot, allocated);
2174
2175        sk_memory_allocated_sub(sk, amt);
2176
2177        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2178                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2179
2180        return 0;
2181}
2182EXPORT_SYMBOL(__sk_mem_raise_allocated);
2183
2184/**
2185 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2186 *      @sk: socket
2187 *      @size: memory size to allocate
2188 *      @kind: allocation type
2189 *
2190 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2191 *      rmem allocation. This function assumes that protocols which have
2192 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2193 */
2194int __sk_mem_schedule(struct sock *sk, int size, int kind)
2195{
2196        int ret, amt = sk_mem_pages(size);
2197
2198        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2199        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2200        if (!ret)
2201                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2202        return ret;
2203}
2204EXPORT_SYMBOL(__sk_mem_schedule);
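/*
 * Editorial sketch: receive-side charging in the spirit of the
 * sk_rmem_schedule()/sk_mem_charge() helpers.  Forward allocation is topped
 * up via __sk_mem_schedule() before the skb's truesize is accounted.
 */
static bool example_charge_receive(struct sock *sk, struct sk_buff *skb)
{
        int size = skb->truesize;

        if (sk->sk_forward_alloc < size &&
            !__sk_mem_schedule(sk, size, SK_MEM_RECV))
                return false;

        sk->sk_forward_alloc -= size;
        atomic_add(size, &sk->sk_rmem_alloc);
        return true;
}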
2205
2206/**
2207 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2208 *      @sk: socket
2209 *      @amount: number of quanta
2210 *
2211 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2212 */
2213void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2214{
2215        sk_memory_allocated_sub(sk, amount);
2216
2217        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2218                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2219
2220        if (sk_under_memory_pressure(sk) &&
2221            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2222                sk_leave_memory_pressure(sk);
2223}
2224EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2225
2226/**
2227 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2228 *      @sk: socket
2229 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2230 */
2231void __sk_mem_reclaim(struct sock *sk, int amount)
2232{
2233        amount >>= SK_MEM_QUANTUM_SHIFT;
2234        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2235        __sk_mem_reduce_allocated(sk, amount);
2236}
2237EXPORT_SYMBOL(__sk_mem_reclaim);
2238
2239int sk_set_peek_off(struct sock *sk, int val)
2240{
2241        if (val < 0)
2242                return -EINVAL;
2243
2244        sk->sk_peek_off = val;
2245        return 0;
2246}
2247EXPORT_SYMBOL_GPL(sk_set_peek_off);
2248
2249/*
2250 * Set of default routines for initialising struct proto_ops when
2251 * the protocol does not support a particular function. In certain
2252 * cases where it makes no sense for a protocol to have a "do nothing"
2253 * function, some default processing is provided.
2254 */
2255
2256int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2257{
2258        return -EOPNOTSUPP;
2259}
2260EXPORT_SYMBOL(sock_no_bind);
2261
2262int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2263                    int len, int flags)
2264{
2265        return -EOPNOTSUPP;
2266}
2267EXPORT_SYMBOL(sock_no_connect);
2268
2269int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2270{
2271        return -EOPNOTSUPP;
2272}
2273EXPORT_SYMBOL(sock_no_socketpair);
2274
2275int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2276{
2277        return -EOPNOTSUPP;
2278}
2279EXPORT_SYMBOL(sock_no_accept);
2280
2281int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2282                    int *len, int peer)
2283{
2284        return -EOPNOTSUPP;
2285}
2286EXPORT_SYMBOL(sock_no_getname);
2287
2288unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2289{
2290        return 0;
2291}
2292EXPORT_SYMBOL(sock_no_poll);
2293
2294int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2295{
2296        return -EOPNOTSUPP;
2297}
2298EXPORT_SYMBOL(sock_no_ioctl);
2299
2300int sock_no_listen(struct socket *sock, int backlog)
2301{
2302        return -EOPNOTSUPP;
2303}
2304EXPORT_SYMBOL(sock_no_listen);
2305
2306int sock_no_shutdown(struct socket *sock, int how)
2307{
2308        return -EOPNOTSUPP;
2309}
2310EXPORT_SYMBOL(sock_no_shutdown);
2311
2312int sock_no_setsockopt(struct socket *sock, int level, int optname,
2313                    char __user *optval, unsigned int optlen)
2314{
2315        return -EOPNOTSUPP;
2316}
2317EXPORT_SYMBOL(sock_no_setsockopt);
2318
2319int sock_no_getsockopt(struct socket *sock, int level, int optname,
2320                    char __user *optval, int __user *optlen)
2321{
2322        return -EOPNOTSUPP;
2323}
2324EXPORT_SYMBOL(sock_no_getsockopt);
2325
2326int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2327{
2328        return -EOPNOTSUPP;
2329}
2330EXPORT_SYMBOL(sock_no_sendmsg);
2331
2332int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2333                    int flags)
2334{
2335        return -EOPNOTSUPP;
2336}
2337EXPORT_SYMBOL(sock_no_recvmsg);
2338
2339int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2340{
2341        /* Mirror missing mmap method error code */
2342        return -ENODEV;
2343}
2344EXPORT_SYMBOL(sock_no_mmap);
2345
2346ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2347{
2348        ssize_t res;
2349        struct msghdr msg = {.msg_flags = flags};
2350        struct kvec iov;
2351        char *kaddr = kmap(page);
2352        iov.iov_base = kaddr + offset;
2353        iov.iov_len = size;
2354        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2355        kunmap(page);
2356        return res;
2357}
2358EXPORT_SYMBOL(sock_no_sendpage);
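/*
 * Editorial sketch: how the sock_no_*() defaults above are typically wired
 * into a proto_ops for operations a family does not support.  The family
 * value and the omitted handlers (release, getname, sendmsg, ...) are
 * placeholders a real protocol would supply.
 */
static const struct proto_ops example_dgram_ops = {
        .family         = PF_PACKET,            /* illustrative only */
        .owner          = THIS_MODULE,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .poll           = datagram_poll,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_common_setsockopt,
        .getsockopt     = sock_common_getsockopt,
        .recvmsg        = sock_common_recvmsg,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};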
2359
2360/*
2361 *      Default Socket Callbacks
2362 */
2363
2364static void sock_def_wakeup(struct sock *sk)
2365{
2366        struct socket_wq *wq;
2367
2368        rcu_read_lock();
2369        wq = rcu_dereference(sk->sk_wq);
2370        if (skwq_has_sleeper(wq))
2371                wake_up_interruptible_all(&wq->wait);
2372        rcu_read_unlock();
2373}
2374
2375static void sock_def_error_report(struct sock *sk)
2376{
2377        struct socket_wq *wq;
2378
2379        rcu_read_lock();
2380        wq = rcu_dereference(sk->sk_wq);
2381        if (skwq_has_sleeper(wq))
2382                wake_up_interruptible_poll(&wq->wait, POLLERR);
2383        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2384        rcu_read_unlock();
2385}
2386
2387static void sock_def_readable(struct sock *sk)
2388{
2389        struct socket_wq *wq;
2390
2391        rcu_read_lock();
2392        wq = rcu_dereference(sk->sk_wq);
2393        if (skwq_has_sleeper(wq))
2394                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2395                                                POLLRDNORM | POLLRDBAND);
2396        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2397        rcu_read_unlock();
2398}
2399
2400static void sock_def_write_space(struct sock *sk)
2401{
2402        struct socket_wq *wq;
2403
2404        rcu_read_lock();
2405
2406        /* Do not wake up a writer until he can make "significant"
2407         * progress.  --DaveM
2408         */
2409        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2410                wq = rcu_dereference(sk->sk_wq);
2411                if (skwq_has_sleeper(wq))
2412                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2413                                                POLLWRNORM | POLLWRBAND);
2414
2415                /* Should agree with poll, otherwise some programs break */
2416                if (sock_writeable(sk))
2417                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2418        }
2419
2420        rcu_read_unlock();
2421}
2422
2423static void sock_def_destruct(struct sock *sk)
2424{
2425}
2426
2427void sk_send_sigurg(struct sock *sk)
2428{
2429        if (sk->sk_socket && sk->sk_socket->file)
2430                if (send_sigurg(&sk->sk_socket->file->f_owner))
2431                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2432}
2433EXPORT_SYMBOL(sk_send_sigurg);
2434
2435void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2436                    unsigned long expires)
2437{
2438        if (!mod_timer(timer, expires))
2439                sock_hold(sk);
2440}
2441EXPORT_SYMBOL(sk_reset_timer);
2442
2443void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2444{
2445        if (del_timer(timer))
2446                __sock_put(sk);
2447}
2448EXPORT_SYMBOL(sk_stop_timer);
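/*
 * Editorial sketch: sk_reset_timer()/sk_stop_timer() manage the socket
 * reference that keeps the sock alive while a timer is pending.  A
 * hypothetical keepalive-style user:
 */
static void example_arm_keepalive(struct sock *sk, unsigned long delay)
{
        sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_disarm_keepalive(struct sock *sk)
{
        sk_stop_timer(sk, &sk->sk_timer);
}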
2449
2450void sock_init_data(struct socket *sock, struct sock *sk)
2451{
2452        skb_queue_head_init(&sk->sk_receive_queue);
2453        skb_queue_head_init(&sk->sk_write_queue);
2454        skb_queue_head_init(&sk->sk_error_queue);
2455
2456        sk->sk_send_head        =       NULL;
2457
2458        init_timer(&sk->sk_timer);
2459
2460        sk->sk_allocation       =       GFP_KERNEL;
2461        sk->sk_rcvbuf           =       sysctl_rmem_default;
2462        sk->sk_sndbuf           =       sysctl_wmem_default;
2463        sk->sk_state            =       TCP_CLOSE;
2464        sk_set_socket(sk, sock);
2465
2466        sock_set_flag(sk, SOCK_ZAPPED);
2467
2468        if (sock) {
2469                sk->sk_type     =       sock->type;
2470                sk->sk_wq       =       sock->wq;
2471                sock->sk        =       sk;
2472                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2473        } else {
2474                sk->sk_wq       =       NULL;
2475                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2476        }
2477
2478        rwlock_init(&sk->sk_callback_lock);
2479        lockdep_set_class_and_name(&sk->sk_callback_lock,
2480                        af_callback_keys + sk->sk_family,
2481                        af_family_clock_key_strings[sk->sk_family]);
2482
2483        sk->sk_state_change     =       sock_def_wakeup;
2484        sk->sk_data_ready       =       sock_def_readable;
2485        sk->sk_write_space      =       sock_def_write_space;
2486        sk->sk_error_report     =       sock_def_error_report;
2487        sk->sk_destruct         =       sock_def_destruct;
2488
2489        sk->sk_frag.page        =       NULL;
2490        sk->sk_frag.offset      =       0;
2491        sk->sk_peek_off         =       -1;
2492
2493        sk->sk_peer_pid         =       NULL;
2494        sk->sk_peer_cred        =       NULL;
2495        sk->sk_write_pending    =       0;
2496        sk->sk_rcvlowat         =       1;
2497        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2498        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2499
2500        sk->sk_stamp = ktime_set(-1L, 0);
2501
2502#ifdef CONFIG_NET_RX_BUSY_POLL
2503        sk->sk_napi_id          =       0;
2504        sk->sk_ll_usec          =       sysctl_net_busy_read;
2505#endif
2506
2507        sk->sk_max_pacing_rate = ~0U;
2508        sk->sk_pacing_rate = ~0U;
2509        sk->sk_incoming_cpu = -1;
2510        /*
2511         * Before updating sk_refcnt, we must commit prior changes to memory
2512         * (Documentation/RCU/rculist_nulls.txt for details)
2513         */
2514        smp_wmb();
2515        atomic_set(&sk->sk_refcnt, 1);
2516        atomic_set(&sk->sk_drops, 0);
2517}
2518EXPORT_SYMBOL(sock_init_data);
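/*
 * Editorial sketch: the usual shape of an address family's ->create() hook,
 * pairing sk_alloc() with sock_init_data().  "example_proto" stands in for
 * a struct proto registered elsewhere via proto_register(); the family
 * constant is illustrative.
 */
static struct proto example_proto;      /* hypothetical, defined by the family */

static int example_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        struct sock *sk;

        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &example_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock_init_data(sock, sk);
        sk->sk_protocol = protocol;

        /* ... family-specific initialisation, hashing, etc. ... */
        return 0;
}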
2519
2520void lock_sock_nested(struct sock *sk, int subclass)
2521{
2522        might_sleep();
2523        spin_lock_bh(&sk->sk_lock.slock);
2524        if (sk->sk_lock.owned)
2525                __lock_sock(sk);
2526        sk->sk_lock.owned = 1;
2527        spin_unlock(&sk->sk_lock.slock);
2528        /*
2529         * The sk_lock has mutex_lock() semantics here:
2530         */
2531        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2532        local_bh_enable();
2533}
2534EXPORT_SYMBOL(lock_sock_nested);
2535
2536void release_sock(struct sock *sk)
2537{
2538        spin_lock_bh(&sk->sk_lock.slock);
2539        if (sk->sk_backlog.tail)
2540                __release_sock(sk);
2541
2542        /* Warning : release_cb() might need to release sk ownership,
2543         * ie call sock_release_ownership(sk) before us.
2544         */
2545        if (sk->sk_prot->release_cb)
2546                sk->sk_prot->release_cb(sk);
2547
2548        sock_release_ownership(sk);
2549        if (waitqueue_active(&sk->sk_lock.wq))
2550                wake_up(&sk->sk_lock.wq);
2551        spin_unlock_bh(&sk->sk_lock.slock);
2552}
2553EXPORT_SYMBOL(release_sock);
2554
2555/**
2556 * lock_sock_fast - fast version of lock_sock
2557 * @sk: socket
2558 *
2559 * This version should be used for very small sections, where the process won't block
2560 * return false if fast path is taken
2561 *   sk_lock.slock locked, owned = 0, BH disabled
2562 * return true if slow path is taken
2563 *   sk_lock.slock unlocked, owned = 1, BH enabled
2564 */
2565bool lock_sock_fast(struct sock *sk)
2566{
2567        might_sleep();
2568        spin_lock_bh(&sk->sk_lock.slock);
2569
2570        if (!sk->sk_lock.owned)
2571                /*
2572                 * Note : We must disable BH
2573                 */
2574                return false;
2575
2576        __lock_sock(sk);
2577        sk->sk_lock.owned = 1;
2578        spin_unlock(&sk->sk_lock.slock);
2579        /*
2580         * The sk_lock has mutex_lock() semantics here:
2581         */
2582        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2583        local_bh_enable();
2584        return true;
2585}
2586EXPORT_SYMBOL(lock_sock_fast);
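/*
 * Editorial sketch: lock_sock_fast() is paired with unlock_sock_fast(),
 * which must be told whether the slow path (full ownership) was taken.
 */
static int example_read_drops(struct sock *sk)
{
        bool slow = lock_sock_fast(sk);
        int drops = atomic_read(&sk->sk_drops);

        unlock_sock_fast(sk, slow);
        return drops;
}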
2587
2588int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2589{
2590        struct timeval tv;
2591        if (!sock_flag(sk, SOCK_TIMESTAMP))
2592                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2593        tv = ktime_to_timeval(sk->sk_stamp);
2594        if (tv.tv_sec == -1)
2595                return -ENOENT;
2596        if (tv.tv_sec == 0) {
2597                sk->sk_stamp = ktime_get_real();
2598                tv = ktime_to_timeval(sk->sk_stamp);
2599        }
2600        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2601}
2602EXPORT_SYMBOL(sock_get_timestamp);
2603
2604int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2605{
2606        struct timespec ts;
2607        if (!sock_flag(sk, SOCK_TIMESTAMP))
2608                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2609        ts = ktime_to_timespec(sk->sk_stamp);
2610        if (ts.tv_sec == -1)
2611                return -ENOENT;
2612        if (ts.tv_sec == 0) {
2613                sk->sk_stamp = ktime_get_real();
2614                ts = ktime_to_timespec(sk->sk_stamp);
2615        }
2616        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2617}
2618EXPORT_SYMBOL(sock_get_timestampns);
2619
2620void sock_enable_timestamp(struct sock *sk, int flag)
2621{
2622        if (!sock_flag(sk, flag)) {
2623                unsigned long previous_flags = sk->sk_flags;
2624
2625                sock_set_flag(sk, flag);
2626                /*
2627                 * we just set one of the two flags which require net
2628                 * time stamping, but time stamping might have been on
2629                 * already because of the other one
2630                 */
2631                if (sock_needs_netstamp(sk) &&
2632                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2633                        net_enable_timestamp();
2634        }
2635}
2636
2637int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2638                       int level, int type)
2639{
2640        struct sock_exterr_skb *serr;
2641        struct sk_buff *skb;
2642        int copied, err;
2643
2644        err = -EAGAIN;
2645        skb = sock_dequeue_err_skb(sk);
2646        if (skb == NULL)
2647                goto out;
2648
2649        copied = skb->len;
2650        if (copied > len) {
2651                msg->msg_flags |= MSG_TRUNC;
2652                copied = len;
2653        }
2654        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2655        if (err)
2656                goto out_free_skb;
2657
2658        sock_recv_timestamp(msg, sk, skb);
2659
2660        serr = SKB_EXT_ERR(skb);
2661        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2662
2663        msg->msg_flags |= MSG_ERRQUEUE;
2664        err = copied;
2665
2666out_free_skb:
2667        kfree_skb(skb);
2668out:
2669        return err;
2670}
2671EXPORT_SYMBOL(sock_recv_errqueue);
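/*
 * Editorial sketch: a recvmsg() implementation dispatching MSG_ERRQUEUE
 * requests to sock_recv_errqueue().  The cmsg level/type shown (SOL_IP /
 * IP_RECVERR) are illustrative; each protocol picks its own.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                           int flags)
{
        if (flags & MSG_ERRQUEUE)
                return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);

        /* ... normal receive path ... */
        return -EAGAIN;
}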
2672
2673/*
2674 *      Get a socket option on a socket.
2675 *
2676 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2677 *      asynchronous errors should be reported by getsockopt. We assume
2678 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2679 */
2680int sock_common_getsockopt(struct socket *sock, int level, int optname,
2681                           char __user *optval, int __user *optlen)
2682{
2683        struct sock *sk = sock->sk;
2684
2685        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2686}
2687EXPORT_SYMBOL(sock_common_getsockopt);
2688
2689#ifdef CONFIG_COMPAT
2690int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2691                                  char __user *optval, int __user *optlen)
2692{
2693        struct sock *sk = sock->sk;
2694
2695        if (sk->sk_prot->compat_getsockopt != NULL)
2696                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2697                                                      optval, optlen);
2698        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2699}
2700EXPORT_SYMBOL(compat_sock_common_getsockopt);
2701#endif
2702
2703int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2704                        int flags)
2705{
2706        struct sock *sk = sock->sk;
2707        int addr_len = 0;
2708        int err;
2709
2710        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2711                                   flags & ~MSG_DONTWAIT, &addr_len);
2712        if (err >= 0)
2713                msg->msg_namelen = addr_len;
2714        return err;
2715}
2716EXPORT_SYMBOL(sock_common_recvmsg);
2717
2718/*
2719 *      Set socket options on an inet socket.
2720 */
2721int sock_common_setsockopt(struct socket *sock, int level, int optname,
2722                           char __user *optval, unsigned int optlen)
2723{
2724        struct sock *sk = sock->sk;
2725
2726        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2727}
2728EXPORT_SYMBOL(sock_common_setsockopt);
2729
2730#ifdef CONFIG_COMPAT
2731int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2732                                  char __user *optval, unsigned int optlen)
2733{
2734        struct sock *sk = sock->sk;
2735
2736        if (sk->sk_prot->compat_setsockopt != NULL)
2737                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2738                                                      optval, optlen);
2739        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2740}
2741EXPORT_SYMBOL(compat_sock_common_setsockopt);
2742#endif
2743
2744void sk_common_release(struct sock *sk)
2745{
2746        if (sk->sk_prot->destroy)
2747                sk->sk_prot->destroy(sk);
2748
2749        /*
2750         * Observation: when sk_common_release() is called, processes have
2751         * no access to the socket, but the network stack still does.
2752         * Step one, detach it from networking:
2753         *
2754         * A. Remove from hash tables.
2755         */
2756
2757        sk->sk_prot->unhash(sk);
2758
2759        /*
2760         * At this point the socket cannot receive new packets, but it is
2761         * possible that some packets are in flight because some CPU runs the
2762         * receiver and did the hash table lookup before we unhashed the socket.
2763         * They will reach the receive queue and be purged by the socket destructor.
2764         *
2765         * Also, we still have packets pending on the receive queue and probably
2766         * our own packets waiting in device queues. sock_destroy will drain the
2767         * receive queue, but transmitted packets will delay socket destruction
2768         * until the last reference is released.
2769         */
2770
2771        sock_orphan(sk);
2772
2773        xfrm_sk_free_policy(sk);
2774
2775        sk_refcnt_debug_release(sk);
2776
2777        if (sk->sk_frag.page) {
2778                put_page(sk->sk_frag.page);
2779                sk->sk_frag.page = NULL;
2780        }
2781
2782        sock_put(sk);
2783}
2784EXPORT_SYMBOL(sk_common_release);
2785
2786#ifdef CONFIG_PROC_FS
2787#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2788struct prot_inuse {
2789        int val[PROTO_INUSE_NR];
2790};
2791
2792static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2793
2794#ifdef CONFIG_NET_NS
2795void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2796{
2797        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2798}
2799EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2800
2801int sock_prot_inuse_get(struct net *net, struct proto *prot)
2802{
2803        int cpu, idx = prot->inuse_idx;
2804        int res = 0;
2805
2806        for_each_possible_cpu(cpu)
2807                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2808
2809        return res >= 0 ? res : 0;
2810}
2811EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2812
2813static int __net_init sock_inuse_init_net(struct net *net)
2814{
2815        net->core.inuse = alloc_percpu(struct prot_inuse);
2816        return net->core.inuse ? 0 : -ENOMEM;
2817}
2818
2819static void __net_exit sock_inuse_exit_net(struct net *net)
2820{
2821        free_percpu(net->core.inuse);
2822}
2823
2824static struct pernet_operations net_inuse_ops = {
2825        .init = sock_inuse_init_net,
2826        .exit = sock_inuse_exit_net,
2827};
2828
2829static __init int net_inuse_init(void)
2830{
2831        if (register_pernet_subsys(&net_inuse_ops))
2832                panic("Cannot initialize net inuse counters");
2833
2834        return 0;
2835}
2836
2837core_initcall(net_inuse_init);
2838#else
2839static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2840
2841void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2842{
2843        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2844}
2845EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2846
2847int sock_prot_inuse_get(struct net *net, struct proto *prot)
2848{
2849        int cpu, idx = prot->inuse_idx;
2850        int res = 0;
2851
2852        for_each_possible_cpu(cpu)
2853                res += per_cpu(prot_inuse, cpu).val[idx];
2854
2855        return res >= 0 ? res : 0;
2856}
2857EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2858#endif
2859
2860static void assign_proto_idx(struct proto *prot)
2861{
2862        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2863
2864        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2865                pr_err("PROTO_INUSE_NR exhausted\n");
2866                return;
2867        }
2868
2869        set_bit(prot->inuse_idx, proto_inuse_idx);
2870}
2871
2872static void release_proto_idx(struct proto *prot)
2873{
2874        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2875                clear_bit(prot->inuse_idx, proto_inuse_idx);
2876}
2877#else
2878static inline void assign_proto_idx(struct proto *prot)
2879{
2880}
2881
2882static inline void release_proto_idx(struct proto *prot)
2883{
2884}
2885#endif
2886
2887static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2888{
2889        if (!rsk_prot)
2890                return;
2891        kfree(rsk_prot->slab_name);
2892        rsk_prot->slab_name = NULL;
2893        kmem_cache_destroy(rsk_prot->slab);
2894        rsk_prot->slab = NULL;
2895}
2896
2897static int req_prot_init(const struct proto *prot)
2898{
2899        struct request_sock_ops *rsk_prot = prot->rsk_prot;
2900
2901        if (!rsk_prot)
2902                return 0;
2903
2904        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2905                                        prot->name);
2906        if (!rsk_prot->slab_name)
2907                return -ENOMEM;
2908
2909        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2910                                           rsk_prot->obj_size, 0,
2911                                           prot->slab_flags, NULL);
2912
2913        if (!rsk_prot->slab) {
2914                pr_crit("%s: Can't create request sock SLAB cache!\n",
2915                        prot->name);
2916                return -ENOMEM;
2917        }
2918        return 0;
2919}
2920
2921int proto_register(struct proto *prot, int alloc_slab)
2922{
2923        if (alloc_slab) {
2924                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2925                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2926                                        NULL);
2927
2928                if (prot->slab == NULL) {
2929                        pr_crit("%s: Can't create sock SLAB cache!\n",
2930                                prot->name);
2931                        goto out;
2932                }
2933
2934                if (req_prot_init(prot))
2935                        goto out_free_request_sock_slab;
2936
2937                if (prot->twsk_prot != NULL) {
2938                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2939
2940                        if (prot->twsk_prot->twsk_slab_name == NULL)
2941                                goto out_free_request_sock_slab;
2942
2943                        prot->twsk_prot->twsk_slab =
2944                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2945                                                  prot->twsk_prot->twsk_obj_size,
2946                                                  0,
2947                                                  prot->slab_flags,
2948                                                  NULL);
2949                        if (prot->twsk_prot->twsk_slab == NULL)
2950                                goto out_free_timewait_sock_slab_name;
2951                }
2952        }
2953
2954        mutex_lock(&proto_list_mutex);
2955        list_add(&prot->node, &proto_list);
2956        assign_proto_idx(prot);
2957        mutex_unlock(&proto_list_mutex);
2958        return 0;
2959
2960out_free_timewait_sock_slab_name:
2961        kfree(prot->twsk_prot->twsk_slab_name);
2962out_free_request_sock_slab:
2963        req_prot_cleanup(prot->rsk_prot);
2964
2965        kmem_cache_destroy(prot->slab);
2966        prot->slab = NULL;
2967out:
2968        return -ENOBUFS;
2969}
2970EXPORT_SYMBOL(proto_register);
2971
2972void proto_unregister(struct proto *prot)
2973{
2974        mutex_lock(&proto_list_mutex);
2975        release_proto_idx(prot);
2976        list_del(&prot->node);
2977        mutex_unlock(&proto_list_mutex);
2978
2979        kmem_cache_destroy(prot->slab);
2980        prot->slab = NULL;
2981
2982        req_prot_cleanup(prot->rsk_prot);
2983
2984        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2985                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2986                kfree(prot->twsk_prot->twsk_slab_name);
2987                prot->twsk_prot->twsk_slab = NULL;
2988        }
2989}
2990EXPORT_SYMBOL(proto_unregister);
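/*
 * Editorial sketch: module init/exit for a hypothetical protocol using
 * proto_register()/proto_unregister().  A real struct proto also supplies
 * the method table (close, sendmsg, recvmsg, ...).
 */
static struct proto example_family_proto = {
        .name           = "EXAMPLE",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct sock),  /* real protos embed struct sock */
};

static int __init example_proto_init(void)
{
        return proto_register(&example_family_proto, 1);
}

static void __exit example_proto_exit(void)
{
        proto_unregister(&example_family_proto);
}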
2991
2992#ifdef CONFIG_PROC_FS
2993static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2994        __acquires(proto_list_mutex)
2995{
2996        mutex_lock(&proto_list_mutex);
2997        return seq_list_start_head(&proto_list, *pos);
2998}
2999
3000static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3001{
3002        return seq_list_next(v, &proto_list, pos);
3003}
3004
3005static void proto_seq_stop(struct seq_file *seq, void *v)
3006        __releases(proto_list_mutex)
3007{
3008        mutex_unlock(&proto_list_mutex);
3009}
3010
3011static char proto_method_implemented(const void *method)
3012{
3013        return method == NULL ? 'n' : 'y';
3014}
3015static long sock_prot_memory_allocated(struct proto *proto)
3016{
3017        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3018}
3019
3020static char *sock_prot_memory_pressure(struct proto *proto)
3021{
3022        return proto->memory_pressure != NULL ?
3023        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3024}
3025
3026static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3027{
3028
3029        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3030                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3031                   proto->name,
3032                   proto->obj_size,
3033                   sock_prot_inuse_get(seq_file_net(seq), proto),
3034                   sock_prot_memory_allocated(proto),
3035                   sock_prot_memory_pressure(proto),
3036                   proto->max_header,
3037                   proto->slab == NULL ? "no" : "yes",
3038                   module_name(proto->owner),
3039                   proto_method_implemented(proto->close),
3040                   proto_method_implemented(proto->connect),
3041                   proto_method_implemented(proto->disconnect),
3042                   proto_method_implemented(proto->accept),
3043                   proto_method_implemented(proto->ioctl),
3044                   proto_method_implemented(proto->init),
3045                   proto_method_implemented(proto->destroy),
3046                   proto_method_implemented(proto->shutdown),
3047                   proto_method_implemented(proto->setsockopt),
3048                   proto_method_implemented(proto->getsockopt),
3049                   proto_method_implemented(proto->sendmsg),
3050                   proto_method_implemented(proto->recvmsg),
3051                   proto_method_implemented(proto->sendpage),
3052                   proto_method_implemented(proto->bind),
3053                   proto_method_implemented(proto->backlog_rcv),
3054                   proto_method_implemented(proto->hash),
3055                   proto_method_implemented(proto->unhash),
3056                   proto_method_implemented(proto->get_port),
3057                   proto_method_implemented(proto->enter_memory_pressure));
3058}
3059
3060static int proto_seq_show(struct seq_file *seq, void *v)
3061{
3062        if (v == &proto_list)
3063                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3064                           "protocol",
3065                           "size",
3066                           "sockets",
3067                           "memory",
3068                           "press",
3069                           "maxhdr",
3070                           "slab",
3071                           "module",
3072                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3073        else
3074                proto_seq_printf(seq, list_entry(v, struct proto, node));
3075        return 0;
3076}
3077
3078static const struct seq_operations proto_seq_ops = {
3079        .start  = proto_seq_start,
3080        .next   = proto_seq_next,
3081        .stop   = proto_seq_stop,
3082        .show   = proto_seq_show,
3083};
3084
3085static int proto_seq_open(struct inode *inode, struct file *file)
3086{
3087        return seq_open_net(inode, file, &proto_seq_ops,
3088                            sizeof(struct seq_net_private));
3089}
3090
3091static const struct file_operations proto_seq_fops = {
3092        .owner          = THIS_MODULE,
3093        .open           = proto_seq_open,
3094        .read           = seq_read,
3095        .llseek         = seq_lseek,
3096        .release        = seq_release_net,
3097};
3098
3099static __net_init int proto_init_net(struct net *net)
3100{
3101        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3102                return -ENOMEM;
3103
3104        return 0;
3105}
3106
3107static __net_exit void proto_exit_net(struct net *net)
3108{
3109        remove_proc_entry("protocols", net->proc_net);
3110}
3111
3112
3113static __net_initdata struct pernet_operations proto_net_ops = {
3114        .init = proto_init_net,
3115        .exit = proto_exit_net,
3116};
3117
3118static int __init proto_init(void)
3119{
3120        return register_pernet_subsys(&proto_net_ops);
3121}
3122
3123subsys_initcall(proto_init);
3124
3125#endif /* CONFIG_PROC_FS */
3126