linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/timer.h>
 106#include <linux/string.h>
 107#include <linux/sockios.h>
 108#include <linux/net.h>
 109#include <linux/mm.h>
 110#include <linux/slab.h>
 111#include <linux/interrupt.h>
 112#include <linux/poll.h>
 113#include <linux/tcp.h>
 114#include <linux/init.h>
 115#include <linux/highmem.h>
 116#include <linux/user_namespace.h>
 117#include <linux/static_key.h>
 118#include <linux/memcontrol.h>
 119#include <linux/prefetch.h>
 120
 121#include <asm/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134#include <linux/sock_diag.h>
 135
 136#include <linux/filter.h>
 137#include <net/sock_reuseport.h>
 138
 139#include <trace/events/sock.h>
 140
 141#ifdef CONFIG_INET
 142#include <net/tcp.h>
 143#endif
 144
 145#include <net/busy_poll.h>
 146
 147static DEFINE_MUTEX(proto_list_mutex);
 148static LIST_HEAD(proto_list);
 149
 150/**
 151 * sk_ns_capable - General socket capability test
 152 * @sk: Socket to use a capability on or through
 153 * @user_ns: The user namespace of the capability to use
 154 * @cap: The capability to use
 155 *
  156 * Test to see if the opener of the socket had the capability @cap in the
  157 * user namespace @user_ns when the socket was created and that the
  158 * current process has it as well.
 159 */
 160bool sk_ns_capable(const struct sock *sk,
 161                   struct user_namespace *user_ns, int cap)
 162{
 163        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                ns_capable(user_ns, cap);
 165}
 166EXPORT_SYMBOL(sk_ns_capable);
 167
 168/**
 169 * sk_capable - Socket global capability test
 170 * @sk: Socket to use a capability on or through
 171 * @cap: The global capability to use
 172 *
  173 * Test to see if the opener of the socket had the capability @cap in all
  174 * user namespaces (i.e. in &init_user_ns) when the socket was created
  175 * and that the current process has it as well.
 176 */
 177bool sk_capable(const struct sock *sk, int cap)
 178{
 179        return sk_ns_capable(sk, &init_user_ns, cap);
 180}
 181EXPORT_SYMBOL(sk_capable);
 182
 183/**
 184 * sk_net_capable - Network namespace socket capability test
 185 * @sk: Socket to use a capability on or through
 186 * @cap: The capability to use
 187 *
  188 * Test to see if the opener of the socket had the capability @cap over the
  189 * network namespace the socket is a member of when the socket was created
  190 * and that the current process has it as well.
 191 */
 192bool sk_net_capable(const struct sock *sk, int cap)
 193{
 194        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195}
 196EXPORT_SYMBOL(sk_net_capable);
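
/*
 * Illustrative sketch (not part of the original file): a protocol that
 * wants to gate a privileged operation on both the socket opener's and
 * the caller's capabilities could use the helpers above like this:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() checks against &init_user_ns, sk_net_capable() against
 * the user namespace that owns the socket's network namespace.
 */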
 197
 198/*
 199 * Each address family might have different locking rules, so we have
 200 * one slock key per address family:
 201 */
 202static struct lock_class_key af_family_keys[AF_MAX];
 203static struct lock_class_key af_family_slock_keys[AF_MAX];
 204
 205/*
  206 * Make lock validator output more readable. (We pre-construct these
  207 * strings at build time, so that runtime initialization of socket
  208 * locks is fast.)
 209 */
 210static const char *const af_family_key_strings[AF_MAX+1] = {
 211  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 212  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 213  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 214  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 215  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 216  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 217  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 218  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 219  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 220  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 221  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 222  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 223  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 224  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 225  "sk_lock-AF_MAX"
 226};
 227static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 228  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 229  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 230  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 231  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 232  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 233  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 234  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 235  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 236  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 237  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 238  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 239  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 240  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 241  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
 242  "slock-AF_MAX"
 243};
 244static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 245  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 246  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 247  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 248  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 249  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 250  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 251  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 252  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 253  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 254  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 255  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 256  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 257  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 258  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 259  "clock-AF_MAX"
 260};
 261
 262/*
 263 * sk_callback_lock locking rules are per-address-family,
 264 * so split the lock classes by using a per-AF key:
 265 */
 266static struct lock_class_key af_callback_keys[AF_MAX];
 267
 268/* Take into consideration the size of the struct sk_buff overhead in the
 269 * determination of these values, since that is non-constant across
 270 * platforms.  This makes socket queueing behavior and performance
 271 * not depend upon such differences.
 272 */
 273#define _SK_MEM_PACKETS         256
 274#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 275#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 276#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
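
/* Rough arithmetic (illustrative): SKB_TRUESIZE(256) is 256 bytes of
 * payload plus the aligned sizes of struct sk_buff and struct
 * skb_shared_info, so the defaults above come out at roughly
 * 256 packets * SKB_TRUESIZE(256) bytes - on the order of a couple of
 * hundred kilobytes per socket on 64-bit builds.
 */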
 277
 278/* Run time adjustable parameters. */
 279__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280EXPORT_SYMBOL(sysctl_wmem_max);
 281__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282EXPORT_SYMBOL(sysctl_rmem_max);
 283__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286/* Maximal space eaten by iovec or ancillary data plus some space */
 287int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288EXPORT_SYMBOL(sysctl_optmem_max);
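
/* Worked example (assuming UIO_MAXIOV == 1024 and 8-byte unsigned longs,
 * i.e. a typical 64-bit build): 8 * (2 * 1024 + 512) = 20480 bytes, so
 * the default per-socket optmem limit is 20 KB.
 */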
 289
 290int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 293EXPORT_SYMBOL_GPL(memalloc_socks);
 294
 295/**
 296 * sk_set_memalloc - sets %SOCK_MEMALLOC
 297 * @sk: socket to set it on
 298 *
 299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
  300 * It's the responsibility of the admin to adjust min_free_kbytes
  301 * to meet the requirements.
 302 */
 303void sk_set_memalloc(struct sock *sk)
 304{
 305        sock_set_flag(sk, SOCK_MEMALLOC);
 306        sk->sk_allocation |= __GFP_MEMALLOC;
 307        static_key_slow_inc(&memalloc_socks);
 308}
 309EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311void sk_clear_memalloc(struct sock *sk)
 312{
 313        sock_reset_flag(sk, SOCK_MEMALLOC);
 314        sk->sk_allocation &= ~__GFP_MEMALLOC;
 315        static_key_slow_dec(&memalloc_socks);
 316
 317        /*
 318         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319         * progress of swapping. SOCK_MEMALLOC may be cleared while
 320         * it has rmem allocations due to the last swapfile being deactivated
 321         * but there is a risk that the socket is unusable due to exceeding
 322         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323         */
 324        sk_mem_reclaim(sk);
 325}
 326EXPORT_SYMBOL_GPL(sk_clear_memalloc);
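
/*
 * Usage sketch (illustrative, based on the swap-over-network case these
 * helpers exist for): a driver whose socket must keep making progress
 * under memory pressure pairs the calls around the life of that socket:
 *
 *	sk_set_memalloc(sock->sk);
 *	... socket carries swap I/O and may dip into emergency reserves ...
 *	sk_clear_memalloc(sock->sk);
 *
 * Ordinary sockets should never set SOCK_MEMALLOC.
 */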
 327
 328int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329{
 330        int ret;
 331        unsigned long pflags = current->flags;
 332
 333        /* these should have been dropped before queueing */
 334        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336        current->flags |= PF_MEMALLOC;
 337        ret = sk->sk_backlog_rcv(sk, skb);
 338        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 339
 340        return ret;
 341}
 342EXPORT_SYMBOL(__sk_backlog_rcv);
 343
 344static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 345{
 346        struct timeval tv;
 347
 348        if (optlen < sizeof(tv))
 349                return -EINVAL;
 350        if (copy_from_user(&tv, optval, sizeof(tv)))
 351                return -EFAULT;
 352        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 353                return -EDOM;
 354
 355        if (tv.tv_sec < 0) {
 356                static int warned __read_mostly;
 357
 358                *timeo_p = 0;
 359                if (warned < 10 && net_ratelimit()) {
 360                        warned++;
 361                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 362                                __func__, current->comm, task_pid_nr(current));
 363                }
 364                return 0;
 365        }
 366        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 367        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 368                return 0;
 369        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 370                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 371        return 0;
 372}
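
/* Worked example for sock_set_timeout() (assuming HZ == 1000): a timeval
 * of {2, 500000} becomes 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies;
 * {0, 0} leaves MAX_SCHEDULE_TIMEOUT in place, i.e. block forever; a
 * negative tv_sec is clamped to 0 (non-blocking) with a rate-limited
 * warning.
 */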
 373
 374static void sock_warn_obsolete_bsdism(const char *name)
 375{
 376        static int warned;
 377        static char warncomm[TASK_COMM_LEN];
 378        if (strcmp(warncomm, current->comm) && warned < 5) {
 379                strcpy(warncomm,  current->comm);
 380                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 381                        warncomm, name);
 382                warned++;
 383        }
 384}
 385
 386static bool sock_needs_netstamp(const struct sock *sk)
 387{
 388        switch (sk->sk_family) {
 389        case AF_UNSPEC:
 390        case AF_UNIX:
 391                return false;
 392        default:
 393                return true;
 394        }
 395}
 396
 397static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 398{
 399        if (sk->sk_flags & flags) {
 400                sk->sk_flags &= ~flags;
 401                if (sock_needs_netstamp(sk) &&
 402                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 403                        net_disable_timestamp();
 404        }
 405}
 406
 407
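/*
 * Queue an skb on the socket's receive queue: the skb is dropped (and
 * counted in sk_drops) if the receive budget is already exceeded or the
 * socket filter rejects it; otherwise it is charged to sk_rmem_alloc, its
 * dst is made refcounted, it is appended to sk_receive_queue under the
 * queue lock and the reader is woken via sk_data_ready().
 */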
 408int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 409{
 410        int err;
 411        unsigned long flags;
 412        struct sk_buff_head *list = &sk->sk_receive_queue;
 413
 414        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 415                atomic_inc(&sk->sk_drops);
 416                trace_sock_rcvqueue_full(sk, skb);
 417                return -ENOMEM;
 418        }
 419
 420        err = sk_filter(sk, skb);
 421        if (err)
 422                return err;
 423
 424        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 425                atomic_inc(&sk->sk_drops);
 426                return -ENOBUFS;
 427        }
 428
 429        skb->dev = NULL;
 430        skb_set_owner_r(skb, sk);
 431
  432        /* We escape from the RCU-protected region, so make sure we don't
  433         * leak a non-refcounted dst.
  434         */
 435        skb_dst_force(skb);
 436
 437        spin_lock_irqsave(&list->lock, flags);
 438        sock_skb_set_dropcount(sk, skb);
 439        __skb_queue_tail(list, skb);
 440        spin_unlock_irqrestore(&list->lock, flags);
 441
 442        if (!sock_flag(sk, SOCK_DEAD))
 443                sk->sk_data_ready(sk);
 444        return 0;
 445}
 446EXPORT_SYMBOL(sock_queue_rcv_skb);
 447
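/*
 * Filter and deliver an skb from softirq context.  If the socket is not
 * owned by user space the packet is processed immediately through
 * sk_backlog_rcv() (with trylock/unlock style lockdep annotations);
 * otherwise it is appended to the backlog, bounded by sk_rcvbuf.  The
 * reference the caller holds on @sk is dropped via sock_put() in all
 * cases.
 */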
 448int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 449{
 450        int rc = NET_RX_SUCCESS;
 451
 452        if (sk_filter(sk, skb))
 453                goto discard_and_relse;
 454
 455        skb->dev = NULL;
 456
 457        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 458                atomic_inc(&sk->sk_drops);
 459                goto discard_and_relse;
 460        }
 461        if (nested)
 462                bh_lock_sock_nested(sk);
 463        else
 464                bh_lock_sock(sk);
 465        if (!sock_owned_by_user(sk)) {
 466                /*
 467                 * trylock + unlock semantics:
 468                 */
 469                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 470
 471                rc = sk_backlog_rcv(sk, skb);
 472
 473                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 474        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 475                bh_unlock_sock(sk);
 476                atomic_inc(&sk->sk_drops);
 477                goto discard_and_relse;
 478        }
 479
 480        bh_unlock_sock(sk);
 481out:
 482        sock_put(sk);
 483        return rc;
 484discard_and_relse:
 485        kfree_skb(skb);
 486        goto out;
 487}
 488EXPORT_SYMBOL(sk_receive_skb);
 489
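/*
 * __sk_dst_check()/sk_dst_check() validate the socket's cached route: if
 * the dst has been marked obsolete and its ->check() hook no longer
 * accepts the stored cookie, the cache is cleared and NULL is returned so
 * that the caller re-resolves the route.
 */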
 490struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 491{
 492        struct dst_entry *dst = __sk_dst_get(sk);
 493
 494        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 495                sk_tx_queue_clear(sk);
 496                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 497                dst_release(dst);
 498                return NULL;
 499        }
 500
 501        return dst;
 502}
 503EXPORT_SYMBOL(__sk_dst_check);
 504
 505struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 506{
 507        struct dst_entry *dst = sk_dst_get(sk);
 508
 509        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 510                sk_dst_reset(sk);
 511                dst_release(dst);
 512                return NULL;
 513        }
 514
 515        return dst;
 516}
 517EXPORT_SYMBOL(sk_dst_check);
 518
 519static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 520                                int optlen)
 521{
 522        int ret = -ENOPROTOOPT;
 523#ifdef CONFIG_NETDEVICES
 524        struct net *net = sock_net(sk);
 525        char devname[IFNAMSIZ];
 526        int index;
 527
 528        /* Sorry... */
 529        ret = -EPERM;
 530        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 531                goto out;
 532
 533        ret = -EINVAL;
 534        if (optlen < 0)
 535                goto out;
 536
 537        /* Bind this socket to a particular device like "eth0",
 538         * as specified in the passed interface name. If the
 539         * name is "" or the option length is zero the socket
 540         * is not bound.
 541         */
 542        if (optlen > IFNAMSIZ - 1)
 543                optlen = IFNAMSIZ - 1;
 544        memset(devname, 0, sizeof(devname));
 545
 546        ret = -EFAULT;
 547        if (copy_from_user(devname, optval, optlen))
 548                goto out;
 549
 550        index = 0;
 551        if (devname[0] != '\0') {
 552                struct net_device *dev;
 553
 554                rcu_read_lock();
 555                dev = dev_get_by_name_rcu(net, devname);
 556                if (dev)
 557                        index = dev->ifindex;
 558                rcu_read_unlock();
 559                ret = -ENODEV;
 560                if (!dev)
 561                        goto out;
 562        }
 563
 564        lock_sock(sk);
 565        sk->sk_bound_dev_if = index;
 566        sk_dst_reset(sk);
 567        release_sock(sk);
 568
 569        ret = 0;
 570
 571out:
 572#endif
 573
 574        return ret;
 575}
 576
 577static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 578                                int __user *optlen, int len)
 579{
 580        int ret = -ENOPROTOOPT;
 581#ifdef CONFIG_NETDEVICES
 582        struct net *net = sock_net(sk);
 583        char devname[IFNAMSIZ];
 584
 585        if (sk->sk_bound_dev_if == 0) {
 586                len = 0;
 587                goto zero;
 588        }
 589
 590        ret = -EINVAL;
 591        if (len < IFNAMSIZ)
 592                goto out;
 593
 594        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 595        if (ret)
 596                goto out;
 597
 598        len = strlen(devname) + 1;
 599
 600        ret = -EFAULT;
 601        if (copy_to_user(optval, devname, len))
 602                goto out;
 603
 604zero:
 605        ret = -EFAULT;
 606        if (put_user(len, optlen))
 607                goto out;
 608
 609        ret = 0;
 610
 611out:
 612#endif
 613
 614        return ret;
 615}
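
/*
 * Userspace view (illustrative): binding a socket to "eth0" and reading
 * the binding back could look like
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 *	char name[IFNAMSIZ];
 *	socklen_t len = sizeof(name);
 *	getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len);
 *
 * Setting the option requires CAP_NET_RAW in the socket's network
 * namespace; an empty name (or a zero option length) removes the binding.
 */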
 616
 617static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 618{
 619        if (valbool)
 620                sock_set_flag(sk, bit);
 621        else
 622                sock_reset_flag(sk, bit);
 623}
 624
 625bool sk_mc_loop(struct sock *sk)
 626{
 627        if (dev_recursion_level())
 628                return false;
 629        if (!sk)
 630                return true;
 631        switch (sk->sk_family) {
 632        case AF_INET:
 633                return inet_sk(sk)->mc_loop;
 634#if IS_ENABLED(CONFIG_IPV6)
 635        case AF_INET6:
 636                return inet6_sk(sk)->mc_loop;
 637#endif
 638        }
 639        WARN_ON(1);
 640        return true;
 641}
 642EXPORT_SYMBOL(sk_mc_loop);
 643
 644/*
 645 *      This is meant for all protocols to use and covers goings on
 646 *      at the socket level. Everything here is generic.
 647 */
 648
 649int sock_setsockopt(struct socket *sock, int level, int optname,
 650                    char __user *optval, unsigned int optlen)
 651{
 652        struct sock *sk = sock->sk;
 653        int val;
 654        int valbool;
 655        struct linger ling;
 656        int ret = 0;
 657
 658        /*
 659         *      Options without arguments
 660         */
 661
 662        if (optname == SO_BINDTODEVICE)
 663                return sock_setbindtodevice(sk, optval, optlen);
 664
 665        if (optlen < sizeof(int))
 666                return -EINVAL;
 667
 668        if (get_user(val, (int __user *)optval))
 669                return -EFAULT;
 670
 671        valbool = val ? 1 : 0;
 672
 673        lock_sock(sk);
 674
 675        switch (optname) {
 676        case SO_DEBUG:
 677                if (val && !capable(CAP_NET_ADMIN))
 678                        ret = -EACCES;
 679                else
 680                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 681                break;
 682        case SO_REUSEADDR:
 683                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 684                break;
 685        case SO_REUSEPORT:
 686                sk->sk_reuseport = valbool;
 687                break;
 688        case SO_TYPE:
 689        case SO_PROTOCOL:
 690        case SO_DOMAIN:
 691        case SO_ERROR:
 692                ret = -ENOPROTOOPT;
 693                break;
 694        case SO_DONTROUTE:
 695                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 696                break;
 697        case SO_BROADCAST:
 698                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 699                break;
 700        case SO_SNDBUF:
  701                /* Don't error on this; BSD doesn't, and if you think
  702                 * about it this is right. Otherwise apps have to
  703                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  704                 * are treated in BSD as hints.
 705                 */
 706                val = min_t(u32, val, sysctl_wmem_max);
 707set_sndbuf:
 708                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 709                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 710                /* Wake up sending tasks if we upped the value. */
 711                sk->sk_write_space(sk);
 712                break;
 713
 714        case SO_SNDBUFFORCE:
 715                if (!capable(CAP_NET_ADMIN)) {
 716                        ret = -EPERM;
 717                        break;
 718                }
 719                goto set_sndbuf;
 720
 721        case SO_RCVBUF:
  722                /* Don't error on this; BSD doesn't, and if you think
  723                 * about it this is right. Otherwise apps have to
  724                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  725                 * are treated in BSD as hints.
 726                 */
 727                val = min_t(u32, val, sysctl_rmem_max);
 728set_rcvbuf:
 729                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 730                /*
 731                 * We double it on the way in to account for
 732                 * "struct sk_buff" etc. overhead.   Applications
 733                 * assume that the SO_RCVBUF setting they make will
 734                 * allow that much actual data to be received on that
 735                 * socket.
 736                 *
 737                 * Applications are unaware that "struct sk_buff" and
 738                 * other overheads allocate from the receive buffer
 739                 * during socket buffer allocation.
 740                 *
 741                 * And after considering the possible alternatives,
 742                 * returning the value we actually used in getsockopt
 743                 * is the most desirable behavior.
 744                 */
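                /* For example (illustrative, assuming sysctl_rmem_max permits
                 * it): an application requesting SO_RCVBUF = 65536 ends up
                 * with sk_rcvbuf = 131072, and that doubled value is what a
                 * later getsockopt(SO_RCVBUF) reports.
                 */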
 745                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 746                break;
 747
 748        case SO_RCVBUFFORCE:
 749                if (!capable(CAP_NET_ADMIN)) {
 750                        ret = -EPERM;
 751                        break;
 752                }
 753                goto set_rcvbuf;
 754
 755        case SO_KEEPALIVE:
 756#ifdef CONFIG_INET
 757                if (sk->sk_protocol == IPPROTO_TCP &&
 758                    sk->sk_type == SOCK_STREAM)
 759                        tcp_set_keepalive(sk, valbool);
 760#endif
 761                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 762                break;
 763
 764        case SO_OOBINLINE:
 765                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 766                break;
 767
 768        case SO_NO_CHECK:
 769                sk->sk_no_check_tx = valbool;
 770                break;
 771
 772        case SO_PRIORITY:
 773                if ((val >= 0 && val <= 6) ||
 774                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 775                        sk->sk_priority = val;
 776                else
 777                        ret = -EPERM;
 778                break;
 779
 780        case SO_LINGER:
 781                if (optlen < sizeof(ling)) {
 782                        ret = -EINVAL;  /* 1003.1g */
 783                        break;
 784                }
 785                if (copy_from_user(&ling, optval, sizeof(ling))) {
 786                        ret = -EFAULT;
 787                        break;
 788                }
 789                if (!ling.l_onoff)
 790                        sock_reset_flag(sk, SOCK_LINGER);
 791                else {
 792#if (BITS_PER_LONG == 32)
 793                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 794                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 795                        else
 796#endif
 797                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 798                        sock_set_flag(sk, SOCK_LINGER);
 799                }
 800                break;
 801
 802        case SO_BSDCOMPAT:
 803                sock_warn_obsolete_bsdism("setsockopt");
 804                break;
 805
 806        case SO_PASSCRED:
 807                if (valbool)
 808                        set_bit(SOCK_PASSCRED, &sock->flags);
 809                else
 810                        clear_bit(SOCK_PASSCRED, &sock->flags);
 811                break;
 812
 813        case SO_TIMESTAMP:
 814        case SO_TIMESTAMPNS:
 815                if (valbool)  {
 816                        if (optname == SO_TIMESTAMP)
 817                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 818                        else
 819                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 820                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 821                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 822                } else {
 823                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 824                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 825                }
 826                break;
 827
 828        case SO_TIMESTAMPING:
 829                if (val & ~SOF_TIMESTAMPING_MASK) {
 830                        ret = -EINVAL;
 831                        break;
 832                }
 833
 834                if (val & SOF_TIMESTAMPING_OPT_ID &&
 835                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 836                        if (sk->sk_protocol == IPPROTO_TCP &&
 837                            sk->sk_type == SOCK_STREAM) {
 838                                if (sk->sk_state != TCP_ESTABLISHED) {
 839                                        ret = -EINVAL;
 840                                        break;
 841                                }
 842                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 843                        } else {
 844                                sk->sk_tskey = 0;
 845                        }
 846                }
 847                sk->sk_tsflags = val;
 848                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 849                        sock_enable_timestamp(sk,
 850                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 851                else
 852                        sock_disable_timestamp(sk,
 853                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 854                break;
 855
 856        case SO_RCVLOWAT:
 857                if (val < 0)
 858                        val = INT_MAX;
 859                sk->sk_rcvlowat = val ? : 1;
 860                break;
 861
 862        case SO_RCVTIMEO:
 863                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 864                break;
 865
 866        case SO_SNDTIMEO:
 867                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 868                break;
 869
 870        case SO_ATTACH_FILTER:
 871                ret = -EINVAL;
 872                if (optlen == sizeof(struct sock_fprog)) {
 873                        struct sock_fprog fprog;
 874
 875                        ret = -EFAULT;
 876                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 877                                break;
 878
 879                        ret = sk_attach_filter(&fprog, sk);
 880                }
 881                break;
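                /* Userspace sketch (illustrative): attaching a trivial
                 * classic-BPF filter that accepts every packet would look
                 * roughly like
                 *
                 *	struct sock_filter code[] = {
                 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
                 *	};
                 *	struct sock_fprog fprog = { .len = 1, .filter = code };
                 *
                 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
                 *		   &fprog, sizeof(fprog));
                 *
                 * SO_ATTACH_BPF (below) instead takes the u32 file descriptor
                 * of an already loaded eBPF program.
                 */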
 882
 883        case SO_ATTACH_BPF:
 884                ret = -EINVAL;
 885                if (optlen == sizeof(u32)) {
 886                        u32 ufd;
 887
 888                        ret = -EFAULT;
 889                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 890                                break;
 891
 892                        ret = sk_attach_bpf(ufd, sk);
 893                }
 894                break;
 895
 896        case SO_ATTACH_REUSEPORT_CBPF:
 897                ret = -EINVAL;
 898                if (optlen == sizeof(struct sock_fprog)) {
 899                        struct sock_fprog fprog;
 900
 901                        ret = -EFAULT;
 902                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 903                                break;
 904
 905                        ret = sk_reuseport_attach_filter(&fprog, sk);
 906                }
 907                break;
 908
 909        case SO_ATTACH_REUSEPORT_EBPF:
 910                ret = -EINVAL;
 911                if (optlen == sizeof(u32)) {
 912                        u32 ufd;
 913
 914                        ret = -EFAULT;
 915                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 916                                break;
 917
 918                        ret = sk_reuseport_attach_bpf(ufd, sk);
 919                }
 920                break;
 921
 922        case SO_DETACH_FILTER:
 923                ret = sk_detach_filter(sk);
 924                break;
 925
 926        case SO_LOCK_FILTER:
 927                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 928                        ret = -EPERM;
 929                else
 930                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 931                break;
 932
 933        case SO_PASSSEC:
 934                if (valbool)
 935                        set_bit(SOCK_PASSSEC, &sock->flags);
 936                else
 937                        clear_bit(SOCK_PASSSEC, &sock->flags);
 938                break;
 939        case SO_MARK:
 940                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 941                        ret = -EPERM;
 942                else
 943                        sk->sk_mark = val;
 944                break;
 945
 946        case SO_RXQ_OVFL:
 947                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 948                break;
 949
 950        case SO_WIFI_STATUS:
 951                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 952                break;
 953
 954        case SO_PEEK_OFF:
 955                if (sock->ops->set_peek_off)
 956                        ret = sock->ops->set_peek_off(sk, val);
 957                else
 958                        ret = -EOPNOTSUPP;
 959                break;
 960
 961        case SO_NOFCS:
 962                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 963                break;
 964
 965        case SO_SELECT_ERR_QUEUE:
 966                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 967                break;
 968
 969#ifdef CONFIG_NET_RX_BUSY_POLL
 970        case SO_BUSY_POLL:
 971                /* allow unprivileged users to decrease the value */
 972                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 973                        ret = -EPERM;
 974                else {
 975                        if (val < 0)
 976                                ret = -EINVAL;
 977                        else
 978                                sk->sk_ll_usec = val;
 979                }
 980                break;
 981#endif
 982
 983        case SO_MAX_PACING_RATE:
 984                sk->sk_max_pacing_rate = val;
 985                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 986                                         sk->sk_max_pacing_rate);
 987                break;
 988
 989        case SO_INCOMING_CPU:
 990                sk->sk_incoming_cpu = val;
 991                break;
 992
 993        case SO_CNX_ADVICE:
 994                if (val == 1)
 995                        dst_negative_advice(sk);
 996                break;
 997        default:
 998                ret = -ENOPROTOOPT;
 999                break;
1000        }
1001        release_sock(sk);
1002        return ret;
1003}
1004EXPORT_SYMBOL(sock_setsockopt);
1005
1006
1007static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1008                          struct ucred *ucred)
1009{
1010        ucred->pid = pid_vnr(pid);
1011        ucred->uid = ucred->gid = -1;
1012        if (cred) {
1013                struct user_namespace *current_ns = current_user_ns();
1014
1015                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1016                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1017        }
1018}
1019
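/*
 * Generic SOL_SOCKET getsockopt(): most options are answered from the
 * union below (an int unless noted otherwise), the result is truncated to
 * the caller-supplied length, and the length actually used is written
 * back through @optlen.
 */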
1020int sock_getsockopt(struct socket *sock, int level, int optname,
1021                    char __user *optval, int __user *optlen)
1022{
1023        struct sock *sk = sock->sk;
1024
1025        union {
1026                int val;
1027                struct linger ling;
1028                struct timeval tm;
1029        } v;
1030
1031        int lv = sizeof(int);
1032        int len;
1033
1034        if (get_user(len, optlen))
1035                return -EFAULT;
1036        if (len < 0)
1037                return -EINVAL;
1038
1039        memset(&v, 0, sizeof(v));
1040
1041        switch (optname) {
1042        case SO_DEBUG:
1043                v.val = sock_flag(sk, SOCK_DBG);
1044                break;
1045
1046        case SO_DONTROUTE:
1047                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1048                break;
1049
1050        case SO_BROADCAST:
1051                v.val = sock_flag(sk, SOCK_BROADCAST);
1052                break;
1053
1054        case SO_SNDBUF:
1055                v.val = sk->sk_sndbuf;
1056                break;
1057
1058        case SO_RCVBUF:
1059                v.val = sk->sk_rcvbuf;
1060                break;
1061
1062        case SO_REUSEADDR:
1063                v.val = sk->sk_reuse;
1064                break;
1065
1066        case SO_REUSEPORT:
1067                v.val = sk->sk_reuseport;
1068                break;
1069
1070        case SO_KEEPALIVE:
1071                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1072                break;
1073
1074        case SO_TYPE:
1075                v.val = sk->sk_type;
1076                break;
1077
1078        case SO_PROTOCOL:
1079                v.val = sk->sk_protocol;
1080                break;
1081
1082        case SO_DOMAIN:
1083                v.val = sk->sk_family;
1084                break;
1085
1086        case SO_ERROR:
1087                v.val = -sock_error(sk);
1088                if (v.val == 0)
1089                        v.val = xchg(&sk->sk_err_soft, 0);
1090                break;
1091
1092        case SO_OOBINLINE:
1093                v.val = sock_flag(sk, SOCK_URGINLINE);
1094                break;
1095
1096        case SO_NO_CHECK:
1097                v.val = sk->sk_no_check_tx;
1098                break;
1099
1100        case SO_PRIORITY:
1101                v.val = sk->sk_priority;
1102                break;
1103
1104        case SO_LINGER:
1105                lv              = sizeof(v.ling);
1106                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1107                v.ling.l_linger = sk->sk_lingertime / HZ;
1108                break;
1109
1110        case SO_BSDCOMPAT:
1111                sock_warn_obsolete_bsdism("getsockopt");
1112                break;
1113
1114        case SO_TIMESTAMP:
1115                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1116                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1117                break;
1118
1119        case SO_TIMESTAMPNS:
1120                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1121                break;
1122
1123        case SO_TIMESTAMPING:
1124                v.val = sk->sk_tsflags;
1125                break;
1126
1127        case SO_RCVTIMEO:
1128                lv = sizeof(struct timeval);
1129                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1130                        v.tm.tv_sec = 0;
1131                        v.tm.tv_usec = 0;
1132                } else {
1133                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1134                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1135                }
1136                break;
1137
1138        case SO_SNDTIMEO:
1139                lv = sizeof(struct timeval);
1140                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1141                        v.tm.tv_sec = 0;
1142                        v.tm.tv_usec = 0;
1143                } else {
1144                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1145                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1146                }
1147                break;
1148
1149        case SO_RCVLOWAT:
1150                v.val = sk->sk_rcvlowat;
1151                break;
1152
1153        case SO_SNDLOWAT:
1154                v.val = 1;
1155                break;
1156
1157        case SO_PASSCRED:
1158                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1159                break;
1160
1161        case SO_PEERCRED:
1162        {
1163                struct ucred peercred;
1164                if (len > sizeof(peercred))
1165                        len = sizeof(peercred);
1166                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1167                if (copy_to_user(optval, &peercred, len))
1168                        return -EFAULT;
1169                goto lenout;
1170        }
1171
1172        case SO_PEERNAME:
1173        {
1174                char address[128];
1175
1176                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1177                        return -ENOTCONN;
1178                if (lv < len)
1179                        return -EINVAL;
1180                if (copy_to_user(optval, address, len))
1181                        return -EFAULT;
1182                goto lenout;
1183        }
1184
1185        /* Dubious BSD thing... Probably nobody even uses it, but
1186         * the UNIX standard wants it for whatever reason... -DaveM
1187         */
1188        case SO_ACCEPTCONN:
1189                v.val = sk->sk_state == TCP_LISTEN;
1190                break;
1191
1192        case SO_PASSSEC:
1193                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1194                break;
1195
1196        case SO_PEERSEC:
1197                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1198
1199        case SO_MARK:
1200                v.val = sk->sk_mark;
1201                break;
1202
1203        case SO_RXQ_OVFL:
1204                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1205                break;
1206
1207        case SO_WIFI_STATUS:
1208                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1209                break;
1210
1211        case SO_PEEK_OFF:
1212                if (!sock->ops->set_peek_off)
1213                        return -EOPNOTSUPP;
1214
1215                v.val = sk->sk_peek_off;
1216                break;
1217        case SO_NOFCS:
1218                v.val = sock_flag(sk, SOCK_NOFCS);
1219                break;
1220
1221        case SO_BINDTODEVICE:
1222                return sock_getbindtodevice(sk, optval, optlen, len);
1223
1224        case SO_GET_FILTER:
1225                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1226                if (len < 0)
1227                        return len;
1228
1229                goto lenout;
1230
1231        case SO_LOCK_FILTER:
1232                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1233                break;
1234
1235        case SO_BPF_EXTENSIONS:
1236                v.val = bpf_tell_extensions();
1237                break;
1238
1239        case SO_SELECT_ERR_QUEUE:
1240                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1241                break;
1242
1243#ifdef CONFIG_NET_RX_BUSY_POLL
1244        case SO_BUSY_POLL:
1245                v.val = sk->sk_ll_usec;
1246                break;
1247#endif
1248
1249        case SO_MAX_PACING_RATE:
1250                v.val = sk->sk_max_pacing_rate;
1251                break;
1252
1253        case SO_INCOMING_CPU:
1254                v.val = sk->sk_incoming_cpu;
1255                break;
1256
1257        default:
1258                /* We implement the SO_SNDLOWAT etc to not be settable
1259                 * (1003.1g 7).
1260                 */
1261                return -ENOPROTOOPT;
1262        }
1263
1264        if (len > lv)
1265                len = lv;
1266        if (copy_to_user(optval, &v, len))
1267                return -EFAULT;
1268lenout:
1269        if (put_user(len, optlen))
1270                return -EFAULT;
1271        return 0;
1272}
1273
1274/*
1275 * Initialize an sk_lock.
1276 *
1277 * (We also register the sk_lock with the lock validator.)
1278 */
1279static inline void sock_lock_init(struct sock *sk)
1280{
1281        sock_lock_init_class_and_name(sk,
1282                        af_family_slock_key_strings[sk->sk_family],
1283                        af_family_slock_keys + sk->sk_family,
1284                        af_family_key_strings[sk->sk_family],
1285                        af_family_keys + sk->sk_family);
1286}
1287
1288/*
 1289 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
 1290 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 1291 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1292 */
1293static void sock_copy(struct sock *nsk, const struct sock *osk)
1294{
1295#ifdef CONFIG_SECURITY_NETWORK
1296        void *sptr = nsk->sk_security;
1297#endif
1298        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1299
1300        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1301               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1302
1303#ifdef CONFIG_SECURITY_NETWORK
1304        nsk->sk_security = sptr;
1305        security_sk_clone(osk, nsk);
1306#endif
1307}
1308
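/*
 * Zero a socket for a __GFP_ZERO allocation while preserving the two list
 * 'next' pointers (skc_node.next and skc_portaddr_node.next): lockless,
 * RCU-based lookups may still be walking those nulls lists, so their next
 * pointers must survive (see Documentation/RCU/rculist_nulls.txt).
 */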
1309void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1310{
1311        unsigned long nulls1, nulls2;
1312
1313        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1314        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1315        if (nulls1 > nulls2)
1316                swap(nulls1, nulls2);
1317
1318        if (nulls1 != 0)
1319                memset((char *)sk, 0, nulls1);
1320        memset((char *)sk + nulls1 + sizeof(void *), 0,
1321               nulls2 - nulls1 - sizeof(void *));
1322        memset((char *)sk + nulls2 + sizeof(void *), 0,
1323               size - nulls2 - sizeof(void *));
1324}
1325EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1326
1327static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1328                int family)
1329{
1330        struct sock *sk;
1331        struct kmem_cache *slab;
1332
1333        slab = prot->slab;
1334        if (slab != NULL) {
1335                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1336                if (!sk)
1337                        return sk;
1338                if (priority & __GFP_ZERO) {
1339                        if (prot->clear_sk)
1340                                prot->clear_sk(sk, prot->obj_size);
1341                        else
1342                                sk_prot_clear_nulls(sk, prot->obj_size);
1343                }
1344        } else
1345                sk = kmalloc(prot->obj_size, priority);
1346
1347        if (sk != NULL) {
1348                kmemcheck_annotate_bitfield(sk, flags);
1349
1350                if (security_sk_alloc(sk, family, priority))
1351                        goto out_free;
1352
1353                if (!try_module_get(prot->owner))
1354                        goto out_free_sec;
1355                sk_tx_queue_clear(sk);
1356                cgroup_sk_alloc(&sk->sk_cgrp_data);
1357        }
1358
1359        return sk;
1360
1361out_free_sec:
1362        security_sk_free(sk);
1363out_free:
1364        if (slab != NULL)
1365                kmem_cache_free(slab, sk);
1366        else
1367                kfree(sk);
1368        return NULL;
1369}
1370
1371static void sk_prot_free(struct proto *prot, struct sock *sk)
1372{
1373        struct kmem_cache *slab;
1374        struct module *owner;
1375
1376        owner = prot->owner;
1377        slab = prot->slab;
1378
1379        cgroup_sk_free(&sk->sk_cgrp_data);
1380        security_sk_free(sk);
1381        if (slab != NULL)
1382                kmem_cache_free(slab, sk);
1383        else
1384                kfree(sk);
1385        module_put(owner);
1386}
1387
1388/**
1389 *      sk_alloc - All socket objects are allocated here
1390 *      @net: the applicable net namespace
1391 *      @family: protocol family
1392 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1393 *      @prot: struct proto associated with this new sock instance
1394 *      @kern: is this to be a kernel socket?
1395 */
1396struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1397                      struct proto *prot, int kern)
1398{
1399        struct sock *sk;
1400
1401        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1402        if (sk) {
1403                sk->sk_family = family;
1404                /*
1405                 * See comment in struct sock definition to understand
1406                 * why we need sk_prot_creator -acme
1407                 */
1408                sk->sk_prot = sk->sk_prot_creator = prot;
1409                sock_lock_init(sk);
1410                sk->sk_net_refcnt = kern ? 0 : 1;
1411                if (likely(sk->sk_net_refcnt))
1412                        get_net(net);
1413                sock_net_set(sk, net);
1414                atomic_set(&sk->sk_wmem_alloc, 1);
1415
1416                sock_update_classid(&sk->sk_cgrp_data);
1417                sock_update_netprioidx(&sk->sk_cgrp_data);
1418        }
1419
1420        return sk;
1421}
1422EXPORT_SYMBOL(sk_alloc);
1423
1424void sk_destruct(struct sock *sk)
1425{
1426        struct sk_filter *filter;
1427
1428        if (sk->sk_destruct)
1429                sk->sk_destruct(sk);
1430
1431        filter = rcu_dereference_check(sk->sk_filter,
1432                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1433        if (filter) {
1434                sk_filter_uncharge(sk, filter);
1435                RCU_INIT_POINTER(sk->sk_filter, NULL);
1436        }
1437        if (rcu_access_pointer(sk->sk_reuseport_cb))
1438                reuseport_detach_sock(sk);
1439
1440        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1441
1442        if (atomic_read(&sk->sk_omem_alloc))
1443                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1444                         __func__, atomic_read(&sk->sk_omem_alloc));
1445
1446        if (sk->sk_peer_cred)
1447                put_cred(sk->sk_peer_cred);
1448        put_pid(sk->sk_peer_pid);
1449        if (likely(sk->sk_net_refcnt))
1450                put_net(sock_net(sk));
1451        sk_prot_free(sk->sk_prot_creator, sk);
1452}
1453
1454static void __sk_free(struct sock *sk)
1455{
1456        if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1457                sock_diag_broadcast_destroy(sk);
1458        else
1459                sk_destruct(sk);
1460}
1461
1462void sk_free(struct sock *sk)
1463{
1464        /*
 1465         * We subtract one from sk_wmem_alloc; if the result is not zero,
 1466         * some packets are still in a tx queue and sock_wfree() will call
 1467         * __sk_free(sk) later, once the last of them has been freed.
1468         */
1469        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1470                __sk_free(sk);
1471}
1472EXPORT_SYMBOL(sk_free);
1473
1474/**
1475 *      sk_clone_lock - clone a socket, and lock its clone
1476 *      @sk: the socket to clone
1477 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1478 *
1479 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1480 */
1481struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1482{
1483        struct sock *newsk;
1484        bool is_charged = true;
1485
1486        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1487        if (newsk != NULL) {
1488                struct sk_filter *filter;
1489
1490                sock_copy(newsk, sk);
1491
1492                /* SANITY */
1493                if (likely(newsk->sk_net_refcnt))
1494                        get_net(sock_net(newsk));
1495                sk_node_init(&newsk->sk_node);
1496                sock_lock_init(newsk);
1497                bh_lock_sock(newsk);
1498                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1499                newsk->sk_backlog.len = 0;
1500
1501                atomic_set(&newsk->sk_rmem_alloc, 0);
1502                /*
1503                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1504                 */
1505                atomic_set(&newsk->sk_wmem_alloc, 1);
1506                atomic_set(&newsk->sk_omem_alloc, 0);
1507                skb_queue_head_init(&newsk->sk_receive_queue);
1508                skb_queue_head_init(&newsk->sk_write_queue);
1509
1510                rwlock_init(&newsk->sk_callback_lock);
1511                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1512                                af_callback_keys + newsk->sk_family,
1513                                af_family_clock_key_strings[newsk->sk_family]);
1514
1515                newsk->sk_dst_cache     = NULL;
1516                newsk->sk_wmem_queued   = 0;
1517                newsk->sk_forward_alloc = 0;
1518                newsk->sk_send_head     = NULL;
1519                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1520
1521                sock_reset_flag(newsk, SOCK_DONE);
1522                skb_queue_head_init(&newsk->sk_error_queue);
1523
1524                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1525                if (filter != NULL)
1526                        /* though it's an empty new sock, the charging may fail
1527                         * if sysctl_optmem_max was changed between the creation of
1528                         * the original socket and the cloning
1529                         */
1530                        is_charged = sk_filter_charge(newsk, filter);
1531
1532                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1533                        /* It is still a raw copy of the parent, so invalidate
1534                         * its destructor and do a plain sk_free() */
1535                        newsk->sk_destruct = NULL;
1536                        bh_unlock_sock(newsk);
1537                        sk_free(newsk);
1538                        newsk = NULL;
1539                        goto out;
1540                }
1541                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1542
1543                newsk->sk_err      = 0;
1544                newsk->sk_priority = 0;
1545                newsk->sk_incoming_cpu = raw_smp_processor_id();
1546                atomic64_set(&newsk->sk_cookie, 0);
1547                /*
1548                 * Before updating sk_refcnt, we must commit prior changes to memory
1549                 * (Documentation/RCU/rculist_nulls.txt for details)
1550                 */
1551                smp_wmb();
1552                atomic_set(&newsk->sk_refcnt, 2);
1553
1554                /*
1555                 * Increment the counter in the same struct proto as the master
1556                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1557                 * is the same as sk->sk_prot->socks, as this field was copied
1558                 * with memcpy).
1559                 *
1560                 * This _changes_ the previous behaviour, where
1561                 * tcp_create_openreq_child always incremented the
1562                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1563                 * to be taken into account in all callers. -acme
1564                 */
1565                sk_refcnt_debug_inc(newsk);
1566                sk_set_socket(newsk, NULL);
1567                newsk->sk_wq = NULL;
1568
1569                if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1570                        sock_update_memcg(newsk);
1571
1572                if (newsk->sk_prot->sockets_allocated)
1573                        sk_sockets_allocated_inc(newsk);
1574
1575                if (sock_needs_netstamp(sk) &&
1576                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1577                        net_enable_timestamp();
1578        }
1579out:
1580        return newsk;
1581}
1582EXPORT_SYMBOL_GPL(sk_clone_lock);
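/*
 * Editor's illustration (not part of the original file): a minimal sketch of
 * how a caller might use sk_clone_lock().  The clone is returned locked with
 * bottom halves disabled, so the caller must bh_unlock_sock() it on every
 * path once it holds a non-NULL child.  The function name and the empty
 * "protocol specific" step are hypothetical.
 */
static __maybe_unused struct sock *example_clone_child(const struct sock *parent)
{
        struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);

        if (!child)
                return NULL;    /* allocation failed, nothing is locked */

        /* ... protocol specific initialisation of 'child' would go here ... */

        bh_unlock_sock(child);  /* required even if the caller then drops it */
        return child;
}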
1583
1584void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1585{
1586        u32 max_segs = 1;
1587
1588        sk_dst_set(sk, dst);
1589        sk->sk_route_caps = dst->dev->features;
1590        if (sk->sk_route_caps & NETIF_F_GSO)
1591                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1592        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1593        if (sk_can_gso(sk)) {
1594                if (dst->header_len) {
1595                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1596                } else {
1597                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1598                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1599                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1600                }
1601        }
1602        sk->sk_gso_max_segs = max_segs;
1603}
1604EXPORT_SYMBOL_GPL(sk_setup_caps);
1605
1606/*
1607 *      Simple resource managers for sockets.
1608 */
1609
1610
1611/*
1612 * Write buffer destructor automatically called from kfree_skb.
1613 */
1614void sock_wfree(struct sk_buff *skb)
1615{
1616        struct sock *sk = skb->sk;
1617        unsigned int len = skb->truesize;
1618
1619        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1620                /*
1621                 * Keep a reference on sk_wmem_alloc; it will be released
1622                 * after the sk_write_space() call
1623                 */
1624                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1625                sk->sk_write_space(sk);
1626                len = 1;
1627        }
1628        /*
1629         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1630         * could not do because of in-flight packets
1631         */
1632        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1633                __sk_free(sk);
1634}
1635EXPORT_SYMBOL(sock_wfree);
1636
1637void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1638{
1639        skb_orphan(skb);
1640        skb->sk = sk;
1641#ifdef CONFIG_INET
1642        if (unlikely(!sk_fullsock(sk))) {
1643                skb->destructor = sock_edemux;
1644                sock_hold(sk);
1645                return;
1646        }
1647#endif
1648        skb->destructor = sock_wfree;
1649        skb_set_hash_from_sk(skb, sk);
1650        /*
1651         * We used to take a refcount on sk, but the following operation
1652         * is enough to guarantee sk_free() won't free this sock until
1653         * all in-flight packets have completed
1654         */
1655        atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1656}
1657EXPORT_SYMBOL(skb_set_owner_w);
1658
1659void skb_orphan_partial(struct sk_buff *skb)
1660{
1661        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1662         * so we do not completely orphan the skb, but transfer all
1663         * accounted bytes but one, to avoid unexpected reorders.
1664         */
1665        if (skb->destructor == sock_wfree
1666#ifdef CONFIG_INET
1667            || skb->destructor == tcp_wfree
1668#endif
1669                ) {
1670                atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1671                skb->truesize = 1;
1672        } else {
1673                skb_orphan(skb);
1674        }
1675}
1676EXPORT_SYMBOL(skb_orphan_partial);
1677
1678/*
1679 * Read buffer destructor automatically called from kfree_skb.
1680 */
1681void sock_rfree(struct sk_buff *skb)
1682{
1683        struct sock *sk = skb->sk;
1684        unsigned int len = skb->truesize;
1685
1686        atomic_sub(len, &sk->sk_rmem_alloc);
1687        sk_mem_uncharge(sk, len);
1688}
1689EXPORT_SYMBOL(sock_rfree);
1690
1691/*
1692 * Buffer destructor for skbs that are not used directly in read or write
1693 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1694 */
1695void sock_efree(struct sk_buff *skb)
1696{
1697        sock_put(skb->sk);
1698}
1699EXPORT_SYMBOL(sock_efree);
1700
1701kuid_t sock_i_uid(struct sock *sk)
1702{
1703        kuid_t uid;
1704
1705        read_lock_bh(&sk->sk_callback_lock);
1706        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1707        read_unlock_bh(&sk->sk_callback_lock);
1708        return uid;
1709}
1710EXPORT_SYMBOL(sock_i_uid);
1711
1712unsigned long sock_i_ino(struct sock *sk)
1713{
1714        unsigned long ino;
1715
1716        read_lock_bh(&sk->sk_callback_lock);
1717        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1718        read_unlock_bh(&sk->sk_callback_lock);
1719        return ino;
1720}
1721EXPORT_SYMBOL(sock_i_ino);
1722
1723/*
1724 * Allocate a skb from the socket's send buffer.
1725 */
1726struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1727                             gfp_t priority)
1728{
1729        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1730                struct sk_buff *skb = alloc_skb(size, priority);
1731                if (skb) {
1732                        skb_set_owner_w(skb, sk);
1733                        return skb;
1734                }
1735        }
1736        return NULL;
1737}
1738EXPORT_SYMBOL(sock_wmalloc);
1739
1740/*
1741 * Allocate a memory block from the socket's option memory buffer.
1742 */
1743void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1744{
1745        if ((unsigned int)size <= sysctl_optmem_max &&
1746            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1747                void *mem;
1748                /* First do the add, to avoid the race if kmalloc
1749                 * might sleep.
1750                 */
1751                atomic_add(size, &sk->sk_omem_alloc);
1752                mem = kmalloc(size, priority);
1753                if (mem)
1754                        return mem;
1755                atomic_sub(size, &sk->sk_omem_alloc);
1756        }
1757        return NULL;
1758}
1759EXPORT_SYMBOL(sock_kmalloc);
1760
1761/* Free an option memory block. Note, we actually want the inline
1762 * here as this allows gcc to detect the nullify and fold away the
1763 * condition entirely.
1764 */
1765static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1766                                  const bool nullify)
1767{
1768        if (WARN_ON_ONCE(!mem))
1769                return;
1770        if (nullify)
1771                kzfree(mem);
1772        else
1773                kfree(mem);
1774        atomic_sub(size, &sk->sk_omem_alloc);
1775}
1776
1777void sock_kfree_s(struct sock *sk, void *mem, int size)
1778{
1779        __sock_kfree_s(sk, mem, size, false);
1780}
1781EXPORT_SYMBOL(sock_kfree_s);
1782
1783void sock_kzfree_s(struct sock *sk, void *mem, int size)
1784{
1785        __sock_kfree_s(sk, mem, size, true);
1786}
1787EXPORT_SYMBOL(sock_kzfree_s);
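/*
 * Editor's illustration (not part of the original file): a hedged sketch of
 * the usual sock_kmalloc()/sock_kfree_s() pairing, as a setsockopt-style
 * handler might use it.  The allocation is charged to sk_omem_alloc and must
 * be released with the same size.  Function name and error handling are
 * illustrative only.
 */
static __maybe_unused int example_copy_opt(struct sock *sk,
                                           char __user *optval, int optlen)
{
        void *buf;
        int err = 0;

        buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
        if (!buf)
                return -ENOBUFS;

        if (copy_from_user(buf, optval, optlen))
                err = -EFAULT;

        /* ... parse the option from 'buf' here ... */

        sock_kfree_s(sk, buf, optlen);  /* uncharges sk_omem_alloc */
        return err;
}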
1788
1789/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1790   I think these locks should be removed for datagram sockets.
1791 */
1792static long sock_wait_for_wmem(struct sock *sk, long timeo)
1793{
1794        DEFINE_WAIT(wait);
1795
1796        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1797        for (;;) {
1798                if (!timeo)
1799                        break;
1800                if (signal_pending(current))
1801                        break;
1802                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1803                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1804                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1805                        break;
1806                if (sk->sk_shutdown & SEND_SHUTDOWN)
1807                        break;
1808                if (sk->sk_err)
1809                        break;
1810                timeo = schedule_timeout(timeo);
1811        }
1812        finish_wait(sk_sleep(sk), &wait);
1813        return timeo;
1814}
1815
1816
1817/*
1818 *      Generic send/receive buffer handlers
1819 */
1820
1821struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1822                                     unsigned long data_len, int noblock,
1823                                     int *errcode, int max_page_order)
1824{
1825        struct sk_buff *skb;
1826        long timeo;
1827        int err;
1828
1829        timeo = sock_sndtimeo(sk, noblock);
1830        for (;;) {
1831                err = sock_error(sk);
1832                if (err != 0)
1833                        goto failure;
1834
1835                err = -EPIPE;
1836                if (sk->sk_shutdown & SEND_SHUTDOWN)
1837                        goto failure;
1838
1839                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1840                        break;
1841
1842                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1843                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1844                err = -EAGAIN;
1845                if (!timeo)
1846                        goto failure;
1847                if (signal_pending(current))
1848                        goto interrupted;
1849                timeo = sock_wait_for_wmem(sk, timeo);
1850        }
1851        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1852                                   errcode, sk->sk_allocation);
1853        if (skb)
1854                skb_set_owner_w(skb, sk);
1855        return skb;
1856
1857interrupted:
1858        err = sock_intr_errno(timeo);
1859failure:
1860        *errcode = err;
1861        return NULL;
1862}
1863EXPORT_SYMBOL(sock_alloc_send_pskb);
1864
1865struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1866                                    int noblock, int *errcode)
1867{
1868        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1869}
1870EXPORT_SYMBOL(sock_alloc_send_skb);
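/*
 * Editor's illustration (not part of the original file): a sketch of a
 * datagram-style transmit path built on sock_alloc_send_skb().  The call
 * blocks (subject to sk_sndtimeo) until sndbuf space is available, and the
 * returned skb is already charged to the socket via skb_set_owner_w().
 * The name and the flat memcpy_from_msg() copy are illustrative assumptions.
 */
static __maybe_unused struct sk_buff *example_build_dgram(struct sock *sk,
                                                          struct msghdr *msg,
                                                          size_t len,
                                                          int noblock, int *errp)
{
        struct sk_buff *skb;

        skb = sock_alloc_send_skb(sk, len, noblock, errp);
        if (!skb)
                return NULL;

        if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
                kfree_skb(skb);
                *errp = -EFAULT;
                return NULL;
        }
        return skb;
}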
1871
1872int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1873                   struct sockcm_cookie *sockc)
1874{
1875        struct cmsghdr *cmsg;
1876
1877        for_each_cmsghdr(cmsg, msg) {
1878                if (!CMSG_OK(msg, cmsg))
1879                        return -EINVAL;
1880                if (cmsg->cmsg_level != SOL_SOCKET)
1881                        continue;
1882                switch (cmsg->cmsg_type) {
1883                case SO_MARK:
1884                        if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1885                                return -EPERM;
1886                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1887                                return -EINVAL;
1888                        sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1889                        break;
1890                default:
1891                        return -EINVAL;
1892                }
1893        }
1894        return 0;
1895}
1896EXPORT_SYMBOL(sock_cmsg_send);
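/*
 * Editor's illustration (not part of the original file): how a sendmsg
 * implementation might consume control messages through sock_cmsg_send().
 * The cookie starts from the socket defaults and an SO_MARK cmsg can
 * override it for this call only.  The function name is hypothetical.
 */
static __maybe_unused int example_apply_cmsgs(struct sock *sk,
                                              struct msghdr *msg, u32 *mark)
{
        struct sockcm_cookie sockc = { .mark = sk->sk_mark };
        int err;

        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (err)
                        return err;
        }
        *mark = sockc.mark;     /* per-packet mark to use for transmission */
        return 0;
}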
1897
1898/* On 32bit arches, an skb frag is limited to 2^15 */
1899#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1900
1901/**
1902 * skb_page_frag_refill - check that a page_frag contains enough room
1903 * @sz: minimum size of the fragment we want to get
1904 * @pfrag: pointer to page_frag
1905 * @gfp: priority for memory allocation
1906 *
1907 * Note: While this allocator tries to use high order pages, there is
1908 * no guarantee that allocations succeed. Therefore, @sz MUST be
1909 * less than or equal to PAGE_SIZE.
1910 */
1911bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1912{
1913        if (pfrag->page) {
1914                if (page_ref_count(pfrag->page) == 1) {
1915                        pfrag->offset = 0;
1916                        return true;
1917                }
1918                if (pfrag->offset + sz <= pfrag->size)
1919                        return true;
1920                put_page(pfrag->page);
1921        }
1922
1923        pfrag->offset = 0;
1924        if (SKB_FRAG_PAGE_ORDER) {
1925                /* Avoid direct reclaim but allow kswapd to wake */
1926                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1927                                          __GFP_COMP | __GFP_NOWARN |
1928                                          __GFP_NORETRY,
1929                                          SKB_FRAG_PAGE_ORDER);
1930                if (likely(pfrag->page)) {
1931                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1932                        return true;
1933                }
1934        }
1935        pfrag->page = alloc_page(gfp);
1936        if (likely(pfrag->page)) {
1937                pfrag->size = PAGE_SIZE;
1938                return true;
1939        }
1940        return false;
1941}
1942EXPORT_SYMBOL(skb_page_frag_refill);
1943
1944bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1945{
1946        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1947                return true;
1948
1949        sk_enter_memory_pressure(sk);
1950        sk_stream_moderate_sndbuf(sk);
1951        return false;
1952}
1953EXPORT_SYMBOL(sk_page_frag_refill);
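/*
 * Editor's illustration (not part of the original file): a sketch of the
 * refill-then-copy pattern used by stream protocols, assuming the iov_iter
 * helpers from linux/uio.h are available.  sk_page_frag_refill() only
 * guarantees a small amount of room, so the copy is clamped to what is left
 * in the fragment.  The helper name is hypothetical.
 */
static __maybe_unused int example_copy_to_frag(struct sock *sk,
                                               struct iov_iter *from,
                                               unsigned int copy)
{
        struct page_frag *pfrag = sk_page_frag(sk);

        if (!sk_page_frag_refill(sk, pfrag))
                return -ENOBUFS;

        copy = min_t(unsigned int, copy, pfrag->size - pfrag->offset);
        if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
                return -EFAULT;

        pfrag->offset += copy;
        return copy;
}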
1954
1955static void __lock_sock(struct sock *sk)
1956        __releases(&sk->sk_lock.slock)
1957        __acquires(&sk->sk_lock.slock)
1958{
1959        DEFINE_WAIT(wait);
1960
1961        for (;;) {
1962                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1963                                        TASK_UNINTERRUPTIBLE);
1964                spin_unlock_bh(&sk->sk_lock.slock);
1965                schedule();
1966                spin_lock_bh(&sk->sk_lock.slock);
1967                if (!sock_owned_by_user(sk))
1968                        break;
1969        }
1970        finish_wait(&sk->sk_lock.wq, &wait);
1971}
1972
1973static void __release_sock(struct sock *sk)
1974        __releases(&sk->sk_lock.slock)
1975        __acquires(&sk->sk_lock.slock)
1976{
1977        struct sk_buff *skb = sk->sk_backlog.head;
1978
1979        do {
1980                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1981                bh_unlock_sock(sk);
1982
1983                do {
1984                        struct sk_buff *next = skb->next;
1985
1986                        prefetch(next);
1987                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1988                        skb->next = NULL;
1989                        sk_backlog_rcv(sk, skb);
1990
1991                        /*
1992                         * We are in process context here with softirqs
1993                         * disabled, use cond_resched_softirq() to preempt.
1994                         * This is safe to do because we've taken the backlog
1995                         * queue private:
1996                         */
1997                        cond_resched_softirq();
1998
1999                        skb = next;
2000                } while (skb != NULL);
2001
2002                bh_lock_sock(sk);
2003        } while ((skb = sk->sk_backlog.head) != NULL);
2004
2005        /*
2006         * Doing the zeroing here guarantees we cannot loop forever
2007         * while a wild producer attempts to flood us.
2008         */
2009        sk->sk_backlog.len = 0;
2010}
2011
2012/**
2013 * sk_wait_data - wait for data to arrive at sk_receive_queue
2014 * @sk:    sock to wait on
2015 * @timeo: for how long
2016 * @skb:   last skb seen on sk_receive_queue
2017 *
2018 * Socket state, including sk->sk_err, is now changed only under the lock,
2019 * hence we may omit checks after joining the wait queue.
2020 * We check the receive queue before schedule() only as an optimization;
2021 * it is very likely that release_sock() added new data.
2022 */
2023int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2024{
2025        int rc;
2026        DEFINE_WAIT(wait);
2027
2028        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2029        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2030        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2031        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2032        finish_wait(sk_sleep(sk), &wait);
2033        return rc;
2034}
2035EXPORT_SYMBOL(sk_wait_data);
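/*
 * Editor's illustration (not part of the original file): a minimal receive
 * wait loop around sk_wait_data(), as a recvmsg path might write it.  The
 * caller is assumed to hold the socket lock; sk_wait_data() drops and
 * re-acquires it while sleeping.  Error and shutdown checks are trimmed.
 */
static __maybe_unused struct sk_buff *example_wait_for_skb(struct sock *sk,
                                                           int noblock, int *err)
{
        long timeo = sock_rcvtimeo(sk, noblock);
        struct sk_buff *skb;

        while (!(skb = skb_peek(&sk->sk_receive_queue))) {
                if (!timeo) {
                        *err = -EAGAIN;
                        return NULL;
                }
                if (signal_pending(current)) {
                        *err = sock_intr_errno(timeo);
                        return NULL;
                }
                sk_wait_data(sk, &timeo, NULL);
        }
        return skb;
}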
2036
2037/**
2038 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2039 *      @sk: socket
2040 *      @size: memory size to allocate
2041 *      @kind: allocation type
2042 *
2043 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2044 *      rmem allocation. This function assumes that protocols which have
2045 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2046 */
2047int __sk_mem_schedule(struct sock *sk, int size, int kind)
2048{
2049        struct proto *prot = sk->sk_prot;
2050        int amt = sk_mem_pages(size);
2051        long allocated;
2052
2053        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2054
2055        allocated = sk_memory_allocated_add(sk, amt);
2056
2057        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2058            !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2059                goto suppress_allocation;
2060
2061        /* Under limit. */
2062        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2063                sk_leave_memory_pressure(sk);
2064                return 1;
2065        }
2066
2067        /* Under pressure. */
2068        if (allocated > sk_prot_mem_limits(sk, 1))
2069                sk_enter_memory_pressure(sk);
2070
2071        /* Over hard limit. */
2072        if (allocated > sk_prot_mem_limits(sk, 2))
2073                goto suppress_allocation;
2074
2075        /* guarantee minimum buffer size under pressure */
2076        if (kind == SK_MEM_RECV) {
2077                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2078                        return 1;
2079
2080        } else { /* SK_MEM_SEND */
2081                if (sk->sk_type == SOCK_STREAM) {
2082                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2083                                return 1;
2084                } else if (atomic_read(&sk->sk_wmem_alloc) <
2085                           prot->sysctl_wmem[0])
2086                                return 1;
2087        }
2088
2089        if (sk_has_memory_pressure(sk)) {
2090                int alloc;
2091
2092                if (!sk_under_memory_pressure(sk))
2093                        return 1;
2094                alloc = sk_sockets_allocated_read_positive(sk);
2095                if (sk_prot_mem_limits(sk, 2) > alloc *
2096                    sk_mem_pages(sk->sk_wmem_queued +
2097                                 atomic_read(&sk->sk_rmem_alloc) +
2098                                 sk->sk_forward_alloc))
2099                        return 1;
2100        }
2101
2102suppress_allocation:
2103
2104        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2105                sk_stream_moderate_sndbuf(sk);
2106
2107                /* Fail only if socket is _under_ its sndbuf.
2108                 * In this case we cannot block, so we have to fail.
2109                 */
2110                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2111                        return 1;
2112        }
2113
2114        trace_sock_exceed_buf_limit(sk, prot, allocated);
2115
2116        /* Alas. Undo changes. */
2117        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2118
2119        sk_memory_allocated_sub(sk, amt);
2120
2121        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2122                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2123
2124        return 0;
2125}
2126EXPORT_SYMBOL(__sk_mem_schedule);
2127
2128/**
2129 *      __sk_mem_reclaim - reclaim memory_allocated
2130 *      @sk: socket
2131 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2132 */
2133void __sk_mem_reclaim(struct sock *sk, int amount)
2134{
2135        amount >>= SK_MEM_QUANTUM_SHIFT;
2136        sk_memory_allocated_sub(sk, amount);
2137        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2138
2139        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2140                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2141
2142        if (sk_under_memory_pressure(sk) &&
2143            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2144                sk_leave_memory_pressure(sk);
2145}
2146EXPORT_SYMBOL(__sk_mem_reclaim);
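/*
 * Editor's illustration (not part of the original file): protocol code
 * usually reaches __sk_mem_schedule() through the sk_wmem_schedule() and
 * sk_mem_charge() inline helpers from net/sock.h.  A hedged sketch of
 * charging an skb that is about to be queued for transmit:
 */
static __maybe_unused bool example_account_tx(struct sock *sk,
                                              struct sk_buff *skb)
{
        /* Grows sk_forward_alloc via __sk_mem_schedule() if needed. */
        if (!sk_wmem_schedule(sk, skb->truesize))
                return false;

        /* Consume part of the forward allocation for this skb. */
        sk_mem_charge(sk, skb->truesize);
        return true;
}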
2147
2148
2149/*
2150 * Set of default routines for initialising struct proto_ops when
2151 * the protocol does not support a particular function. In certain
2152 * cases where it makes no sense for a protocol to have a "do nothing"
2153 * function, some default processing is provided.
2154 */
2155
2156int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2157{
2158        return -EOPNOTSUPP;
2159}
2160EXPORT_SYMBOL(sock_no_bind);
2161
2162int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2163                    int len, int flags)
2164{
2165        return -EOPNOTSUPP;
2166}
2167EXPORT_SYMBOL(sock_no_connect);
2168
2169int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2170{
2171        return -EOPNOTSUPP;
2172}
2173EXPORT_SYMBOL(sock_no_socketpair);
2174
2175int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2176{
2177        return -EOPNOTSUPP;
2178}
2179EXPORT_SYMBOL(sock_no_accept);
2180
2181int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2182                    int *len, int peer)
2183{
2184        return -EOPNOTSUPP;
2185}
2186EXPORT_SYMBOL(sock_no_getname);
2187
2188unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2189{
2190        return 0;
2191}
2192EXPORT_SYMBOL(sock_no_poll);
2193
2194int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2195{
2196        return -EOPNOTSUPP;
2197}
2198EXPORT_SYMBOL(sock_no_ioctl);
2199
2200int sock_no_listen(struct socket *sock, int backlog)
2201{
2202        return -EOPNOTSUPP;
2203}
2204EXPORT_SYMBOL(sock_no_listen);
2205
2206int sock_no_shutdown(struct socket *sock, int how)
2207{
2208        return -EOPNOTSUPP;
2209}
2210EXPORT_SYMBOL(sock_no_shutdown);
2211
2212int sock_no_setsockopt(struct socket *sock, int level, int optname,
2213                    char __user *optval, unsigned int optlen)
2214{
2215        return -EOPNOTSUPP;
2216}
2217EXPORT_SYMBOL(sock_no_setsockopt);
2218
2219int sock_no_getsockopt(struct socket *sock, int level, int optname,
2220                    char __user *optval, int __user *optlen)
2221{
2222        return -EOPNOTSUPP;
2223}
2224EXPORT_SYMBOL(sock_no_getsockopt);
2225
2226int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2227{
2228        return -EOPNOTSUPP;
2229}
2230EXPORT_SYMBOL(sock_no_sendmsg);
2231
2232int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2233                    int flags)
2234{
2235        return -EOPNOTSUPP;
2236}
2237EXPORT_SYMBOL(sock_no_recvmsg);
2238
2239int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2240{
2241        /* Mirror missing mmap method error code */
2242        return -ENODEV;
2243}
2244EXPORT_SYMBOL(sock_no_mmap);
2245
2246ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2247{
2248        ssize_t res;
2249        struct msghdr msg = {.msg_flags = flags};
2250        struct kvec iov;
2251        char *kaddr = kmap(page);
2252        iov.iov_base = kaddr + offset;
2253        iov.iov_len = size;
2254        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2255        kunmap(page);
2256        return res;
2257}
2258EXPORT_SYMBOL(sock_no_sendpage);
2259
2260/*
2261 *      Default Socket Callbacks
2262 */
2263
2264static void sock_def_wakeup(struct sock *sk)
2265{
2266        struct socket_wq *wq;
2267
2268        rcu_read_lock();
2269        wq = rcu_dereference(sk->sk_wq);
2270        if (skwq_has_sleeper(wq))
2271                wake_up_interruptible_all(&wq->wait);
2272        rcu_read_unlock();
2273}
2274
2275static void sock_def_error_report(struct sock *sk)
2276{
2277        struct socket_wq *wq;
2278
2279        rcu_read_lock();
2280        wq = rcu_dereference(sk->sk_wq);
2281        if (skwq_has_sleeper(wq))
2282                wake_up_interruptible_poll(&wq->wait, POLLERR);
2283        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2284        rcu_read_unlock();
2285}
2286
2287static void sock_def_readable(struct sock *sk)
2288{
2289        struct socket_wq *wq;
2290
2291        rcu_read_lock();
2292        wq = rcu_dereference(sk->sk_wq);
2293        if (skwq_has_sleeper(wq))
2294                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2295                                                POLLRDNORM | POLLRDBAND);
2296        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2297        rcu_read_unlock();
2298}
2299
2300static void sock_def_write_space(struct sock *sk)
2301{
2302        struct socket_wq *wq;
2303
2304        rcu_read_lock();
2305
2306        /* Do not wake up a writer until he can make "significant"
2307         * progress.  --DaveM
2308         */
2309        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2310                wq = rcu_dereference(sk->sk_wq);
2311                if (skwq_has_sleeper(wq))
2312                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2313                                                POLLWRNORM | POLLWRBAND);
2314
2315                /* Should agree with poll, otherwise some programs break */
2316                if (sock_writeable(sk))
2317                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2318        }
2319
2320        rcu_read_unlock();
2321}
2322
2323static void sock_def_destruct(struct sock *sk)
2324{
2325}
2326
2327void sk_send_sigurg(struct sock *sk)
2328{
2329        if (sk->sk_socket && sk->sk_socket->file)
2330                if (send_sigurg(&sk->sk_socket->file->f_owner))
2331                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2332}
2333EXPORT_SYMBOL(sk_send_sigurg);
2334
2335void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2336                    unsigned long expires)
2337{
2338        if (!mod_timer(timer, expires))
2339                sock_hold(sk);
2340}
2341EXPORT_SYMBOL(sk_reset_timer);
2342
2343void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2344{
2345        if (del_timer(timer))
2346                __sock_put(sk);
2347}
2348EXPORT_SYMBOL(sk_stop_timer);
2349
2350void sock_init_data(struct socket *sock, struct sock *sk)
2351{
2352        skb_queue_head_init(&sk->sk_receive_queue);
2353        skb_queue_head_init(&sk->sk_write_queue);
2354        skb_queue_head_init(&sk->sk_error_queue);
2355
2356        sk->sk_send_head        =       NULL;
2357
2358        init_timer(&sk->sk_timer);
2359
2360        sk->sk_allocation       =       GFP_KERNEL;
2361        sk->sk_rcvbuf           =       sysctl_rmem_default;
2362        sk->sk_sndbuf           =       sysctl_wmem_default;
2363        sk->sk_state            =       TCP_CLOSE;
2364        sk_set_socket(sk, sock);
2365
2366        sock_set_flag(sk, SOCK_ZAPPED);
2367
2368        if (sock) {
2369                sk->sk_type     =       sock->type;
2370                sk->sk_wq       =       sock->wq;
2371                sock->sk        =       sk;
2372        } else
2373                sk->sk_wq       =       NULL;
2374
2375        rwlock_init(&sk->sk_callback_lock);
2376        lockdep_set_class_and_name(&sk->sk_callback_lock,
2377                        af_callback_keys + sk->sk_family,
2378                        af_family_clock_key_strings[sk->sk_family]);
2379
2380        sk->sk_state_change     =       sock_def_wakeup;
2381        sk->sk_data_ready       =       sock_def_readable;
2382        sk->sk_write_space      =       sock_def_write_space;
2383        sk->sk_error_report     =       sock_def_error_report;
2384        sk->sk_destruct         =       sock_def_destruct;
2385
2386        sk->sk_frag.page        =       NULL;
2387        sk->sk_frag.offset      =       0;
2388        sk->sk_peek_off         =       -1;
2389
2390        sk->sk_peer_pid         =       NULL;
2391        sk->sk_peer_cred        =       NULL;
2392        sk->sk_write_pending    =       0;
2393        sk->sk_rcvlowat         =       1;
2394        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2395        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2396
2397        sk->sk_stamp = ktime_set(-1L, 0);
2398
2399#ifdef CONFIG_NET_RX_BUSY_POLL
2400        sk->sk_napi_id          =       0;
2401        sk->sk_ll_usec          =       sysctl_net_busy_read;
2402#endif
2403
2404        sk->sk_max_pacing_rate = ~0U;
2405        sk->sk_pacing_rate = ~0U;
2406        sk->sk_incoming_cpu = -1;
2407        /*
2408         * Before updating sk_refcnt, we must commit prior changes to memory
2409         * (Documentation/RCU/rculist_nulls.txt for details)
2410         */
2411        smp_wmb();
2412        atomic_set(&sk->sk_refcnt, 1);
2413        atomic_set(&sk->sk_drops, 0);
2414}
2415EXPORT_SYMBOL(sock_init_data);
2416
2417void lock_sock_nested(struct sock *sk, int subclass)
2418{
2419        might_sleep();
2420        spin_lock_bh(&sk->sk_lock.slock);
2421        if (sk->sk_lock.owned)
2422                __lock_sock(sk);
2423        sk->sk_lock.owned = 1;
2424        spin_unlock(&sk->sk_lock.slock);
2425        /*
2426         * The sk_lock has mutex_lock() semantics here:
2427         */
2428        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2429        local_bh_enable();
2430}
2431EXPORT_SYMBOL(lock_sock_nested);
2432
2433void release_sock(struct sock *sk)
2434{
2435        /*
2436         * The sk_lock has mutex_unlock() semantics:
2437         */
2438        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2439
2440        spin_lock_bh(&sk->sk_lock.slock);
2441        if (sk->sk_backlog.tail)
2442                __release_sock(sk);
2443
2444        /* Warning : release_cb() might need to release sk ownership,
2445         * i.e. call sock_release_ownership(sk) before us.
2446         */
2447        if (sk->sk_prot->release_cb)
2448                sk->sk_prot->release_cb(sk);
2449
2450        sock_release_ownership(sk);
2451        if (waitqueue_active(&sk->sk_lock.wq))
2452                wake_up(&sk->sk_lock.wq);
2453        spin_unlock_bh(&sk->sk_lock.slock);
2454}
2455EXPORT_SYMBOL(release_sock);
2456
2457/**
2458 * lock_sock_fast - fast version of lock_sock
2459 * @sk: socket
2460 *
2461 * This version should be used for very small sections, where the process won't block
2462 * return false if fast path is taken
2463 *   sk_lock.slock locked, owned = 0, BH disabled
2464 * return true if slow path is taken
2465 *   sk_lock.slock unlocked, owned = 1, BH enabled
2466 */
2467bool lock_sock_fast(struct sock *sk)
2468{
2469        might_sleep();
2470        spin_lock_bh(&sk->sk_lock.slock);
2471
2472        if (!sk->sk_lock.owned)
2473                /*
2474                 * Note: the fast path returns with BH still disabled
2475                 */
2476                return false;
2477
2478        __lock_sock(sk);
2479        sk->sk_lock.owned = 1;
2480        spin_unlock(&sk->sk_lock.slock);
2481        /*
2482         * The sk_lock has mutex_lock() semantics here:
2483         */
2484        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2485        local_bh_enable();
2486        return true;
2487}
2488EXPORT_SYMBOL(lock_sock_fast);
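/*
 * Editor's illustration (not part of the original file): the intended
 * lock_sock_fast()/unlock_sock_fast() pairing for a short critical section.
 * The boolean returned by lock_sock_fast() must be handed back so the
 * matching unlock takes the right path.  The field update is illustrative.
 */
static __maybe_unused void example_short_section(struct sock *sk, u32 prio)
{
        bool slow = lock_sock_fast(sk);

        /* Fast path: slock held, BH disabled.  Slow path: owned like lock_sock(). */
        sk->sk_priority = prio;

        unlock_sock_fast(sk, slow);
}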
2489
2490int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2491{
2492        struct timeval tv;
2493        if (!sock_flag(sk, SOCK_TIMESTAMP))
2494                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2495        tv = ktime_to_timeval(sk->sk_stamp);
2496        if (tv.tv_sec == -1)
2497                return -ENOENT;
2498        if (tv.tv_sec == 0) {
2499                sk->sk_stamp = ktime_get_real();
2500                tv = ktime_to_timeval(sk->sk_stamp);
2501        }
2502        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2503}
2504EXPORT_SYMBOL(sock_get_timestamp);
2505
2506int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2507{
2508        struct timespec ts;
2509        if (!sock_flag(sk, SOCK_TIMESTAMP))
2510                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2511        ts = ktime_to_timespec(sk->sk_stamp);
2512        if (ts.tv_sec == -1)
2513                return -ENOENT;
2514        if (ts.tv_sec == 0) {
2515                sk->sk_stamp = ktime_get_real();
2516                ts = ktime_to_timespec(sk->sk_stamp);
2517        }
2518        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2519}
2520EXPORT_SYMBOL(sock_get_timestampns);
2521
2522void sock_enable_timestamp(struct sock *sk, int flag)
2523{
2524        if (!sock_flag(sk, flag)) {
2525                unsigned long previous_flags = sk->sk_flags;
2526
2527                sock_set_flag(sk, flag);
2528                /*
2529                 * we just set one of the two flags which require net
2530                 * time stamping, but time stamping might have been on
2531                 * already because of the other one
2532                 */
2533                if (sock_needs_netstamp(sk) &&
2534                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2535                        net_enable_timestamp();
2536        }
2537}
2538
2539int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2540                       int level, int type)
2541{
2542        struct sock_exterr_skb *serr;
2543        struct sk_buff *skb;
2544        int copied, err;
2545
2546        err = -EAGAIN;
2547        skb = sock_dequeue_err_skb(sk);
2548        if (skb == NULL)
2549                goto out;
2550
2551        copied = skb->len;
2552        if (copied > len) {
2553                msg->msg_flags |= MSG_TRUNC;
2554                copied = len;
2555        }
2556        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2557        if (err)
2558                goto out_free_skb;
2559
2560        sock_recv_timestamp(msg, sk, skb);
2561
2562        serr = SKB_EXT_ERR(skb);
2563        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2564
2565        msg->msg_flags |= MSG_ERRQUEUE;
2566        err = copied;
2567
2568out_free_skb:
2569        kfree_skb(skb);
2570out:
2571        return err;
2572}
2573EXPORT_SYMBOL(sock_recv_errqueue);
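/*
 * Editor's illustration (not part of the original file): a sketch of how an
 * IPv4 protocol's recvmsg might divert MSG_ERRQUEUE reads to
 * sock_recv_errqueue(), roughly as ping/raw sockets do.  The SOL_IP /
 * IP_RECVERR pair and the fall-through return are illustrative assumptions.
 */
static __maybe_unused int example_recvmsg(struct sock *sk, struct msghdr *msg,
                                          size_t len, int flags)
{
        if (flags & MSG_ERRQUEUE)
                return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);

        /* ... normal receive path would follow here ... */
        return -EAGAIN;
}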
2574
2575/*
2576 *      Get a socket option on a socket.
2577 *
2578 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2579 *      asynchronous errors should be reported by getsockopt. We assume
2580 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2581 */
2582int sock_common_getsockopt(struct socket *sock, int level, int optname,
2583                           char __user *optval, int __user *optlen)
2584{
2585        struct sock *sk = sock->sk;
2586
2587        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2588}
2589EXPORT_SYMBOL(sock_common_getsockopt);
2590
2591#ifdef CONFIG_COMPAT
2592int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2593                                  char __user *optval, int __user *optlen)
2594{
2595        struct sock *sk = sock->sk;
2596
2597        if (sk->sk_prot->compat_getsockopt != NULL)
2598                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2599                                                      optval, optlen);
2600        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2601}
2602EXPORT_SYMBOL(compat_sock_common_getsockopt);
2603#endif
2604
2605int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2606                        int flags)
2607{
2608        struct sock *sk = sock->sk;
2609        int addr_len = 0;
2610        int err;
2611
2612        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2613                                   flags & ~MSG_DONTWAIT, &addr_len);
2614        if (err >= 0)
2615                msg->msg_namelen = addr_len;
2616        return err;
2617}
2618EXPORT_SYMBOL(sock_common_recvmsg);
2619
2620/*
2621 *      Set socket options on an inet socket.
2622 */
2623int sock_common_setsockopt(struct socket *sock, int level, int optname,
2624                           char __user *optval, unsigned int optlen)
2625{
2626        struct sock *sk = sock->sk;
2627
2628        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2629}
2630EXPORT_SYMBOL(sock_common_setsockopt);
2631
2632#ifdef CONFIG_COMPAT
2633int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2634                                  char __user *optval, unsigned int optlen)
2635{
2636        struct sock *sk = sock->sk;
2637
2638        if (sk->sk_prot->compat_setsockopt != NULL)
2639                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2640                                                      optval, optlen);
2641        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2642}
2643EXPORT_SYMBOL(compat_sock_common_setsockopt);
2644#endif
2645
2646void sk_common_release(struct sock *sk)
2647{
2648        if (sk->sk_prot->destroy)
2649                sk->sk_prot->destroy(sk);
2650
2651        /*
2652         * Observation: when sk_common_release() is called, processes have
2653         * no access to the socket, but the network stack still does.
2654         * Step one, detach it from networking:
2655         *
2656         * A. Remove from hash tables.
2657         */
2658
2659        sk->sk_prot->unhash(sk);
2660
2661        /*
2662         * At this point the socket cannot receive new packets, but it is possible
2663         * that some packets are in flight because some CPU ran the receiver and
2664         * did the hash table lookup before we unhashed the socket. They will reach
2665         * the receive queue and be purged by the socket destructor.
2666         *
2667         * Also, we may still have packets pending on the receive queue and, probably,
2668         * our own packets waiting in device queues. sock_destroy will drain the
2669         * receive queue, but transmitted packets will delay socket destruction
2670         * until the last reference is released.
2671         */
2672
2673        sock_orphan(sk);
2674
2675        xfrm_sk_free_policy(sk);
2676
2677        sk_refcnt_debug_release(sk);
2678
2679        if (sk->sk_frag.page) {
2680                put_page(sk->sk_frag.page);
2681                sk->sk_frag.page = NULL;
2682        }
2683
2684        sock_put(sk);
2685}
2686EXPORT_SYMBOL(sk_common_release);
2687
2688#ifdef CONFIG_PROC_FS
2689#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2690struct prot_inuse {
2691        int val[PROTO_INUSE_NR];
2692};
2693
2694static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2695
2696#ifdef CONFIG_NET_NS
2697void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2698{
2699        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2700}
2701EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2702
2703int sock_prot_inuse_get(struct net *net, struct proto *prot)
2704{
2705        int cpu, idx = prot->inuse_idx;
2706        int res = 0;
2707
2708        for_each_possible_cpu(cpu)
2709                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2710
2711        return res >= 0 ? res : 0;
2712}
2713EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2714
2715static int __net_init sock_inuse_init_net(struct net *net)
2716{
2717        net->core.inuse = alloc_percpu(struct prot_inuse);
2718        return net->core.inuse ? 0 : -ENOMEM;
2719}
2720
2721static void __net_exit sock_inuse_exit_net(struct net *net)
2722{
2723        free_percpu(net->core.inuse);
2724}
2725
2726static struct pernet_operations net_inuse_ops = {
2727        .init = sock_inuse_init_net,
2728        .exit = sock_inuse_exit_net,
2729};
2730
2731static __init int net_inuse_init(void)
2732{
2733        if (register_pernet_subsys(&net_inuse_ops))
2734                panic("Cannot initialize net inuse counters");
2735
2736        return 0;
2737}
2738
2739core_initcall(net_inuse_init);
2740#else
2741static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2742
2743void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2744{
2745        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2746}
2747EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2748
2749int sock_prot_inuse_get(struct net *net, struct proto *prot)
2750{
2751        int cpu, idx = prot->inuse_idx;
2752        int res = 0;
2753
2754        for_each_possible_cpu(cpu)
2755                res += per_cpu(prot_inuse, cpu).val[idx];
2756
2757        return res >= 0 ? res : 0;
2758}
2759EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2760#endif
2761
2762static void assign_proto_idx(struct proto *prot)
2763{
2764        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2765
2766        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2767                pr_err("PROTO_INUSE_NR exhausted\n");
2768                return;
2769        }
2770
2771        set_bit(prot->inuse_idx, proto_inuse_idx);
2772}
2773
2774static void release_proto_idx(struct proto *prot)
2775{
2776        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2777                clear_bit(prot->inuse_idx, proto_inuse_idx);
2778}
2779#else
2780static inline void assign_proto_idx(struct proto *prot)
2781{
2782}
2783
2784static inline void release_proto_idx(struct proto *prot)
2785{
2786}
2787#endif
2788
2789static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2790{
2791        if (!rsk_prot)
2792                return;
2793        kfree(rsk_prot->slab_name);
2794        rsk_prot->slab_name = NULL;
2795        kmem_cache_destroy(rsk_prot->slab);
2796        rsk_prot->slab = NULL;
2797}
2798
2799static int req_prot_init(const struct proto *prot)
2800{
2801        struct request_sock_ops *rsk_prot = prot->rsk_prot;
2802
2803        if (!rsk_prot)
2804                return 0;
2805
2806        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2807                                        prot->name);
2808        if (!rsk_prot->slab_name)
2809                return -ENOMEM;
2810
2811        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2812                                           rsk_prot->obj_size, 0,
2813                                           prot->slab_flags, NULL);
2814
2815        if (!rsk_prot->slab) {
2816                pr_crit("%s: Can't create request sock SLAB cache!\n",
2817                        prot->name);
2818                return -ENOMEM;
2819        }
2820        return 0;
2821}
2822
2823int proto_register(struct proto *prot, int alloc_slab)
2824{
2825        if (alloc_slab) {
2826                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2827                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2828                                        NULL);
2829
2830                if (prot->slab == NULL) {
2831                        pr_crit("%s: Can't create sock SLAB cache!\n",
2832                                prot->name);
2833                        goto out;
2834                }
2835
2836                if (req_prot_init(prot))
2837                        goto out_free_request_sock_slab;
2838
2839                if (prot->twsk_prot != NULL) {
2840                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2841
2842                        if (prot->twsk_prot->twsk_slab_name == NULL)
2843                                goto out_free_request_sock_slab;
2844
2845                        prot->twsk_prot->twsk_slab =
2846                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2847                                                  prot->twsk_prot->twsk_obj_size,
2848                                                  0,
2849                                                  prot->slab_flags,
2850                                                  NULL);
2851                        if (prot->twsk_prot->twsk_slab == NULL)
2852                                goto out_free_timewait_sock_slab_name;
2853                }
2854        }
2855
2856        mutex_lock(&proto_list_mutex);
2857        list_add(&prot->node, &proto_list);
2858        assign_proto_idx(prot);
2859        mutex_unlock(&proto_list_mutex);
2860        return 0;
2861
2862out_free_timewait_sock_slab_name:
2863        kfree(prot->twsk_prot->twsk_slab_name);
2864out_free_request_sock_slab:
2865        req_prot_cleanup(prot->rsk_prot);
2866
2867        kmem_cache_destroy(prot->slab);
2868        prot->slab = NULL;
2869out:
2870        return -ENOBUFS;
2871}
2872EXPORT_SYMBOL(proto_register);
2873
2874void proto_unregister(struct proto *prot)
2875{
2876        mutex_lock(&proto_list_mutex);
2877        release_proto_idx(prot);
2878        list_del(&prot->node);
2879        mutex_unlock(&proto_list_mutex);
2880
2881        kmem_cache_destroy(prot->slab);
2882        prot->slab = NULL;
2883
2884        req_prot_cleanup(prot->rsk_prot);
2885
2886        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2887                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2888                kfree(prot->twsk_prot->twsk_slab_name);
2889                prot->twsk_prot->twsk_slab = NULL;
2890        }
2891}
2892EXPORT_SYMBOL(proto_unregister);
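/*
 * Editor's illustration (not part of the original file): a hedged, minimal
 * proto_register()/proto_unregister() pair for a hypothetical protocol.
 * Passing alloc_slab=1 creates the per-protocol kmem cache used for sock
 * allocation; real protocols embed struct sock in a larger structure and
 * set obj_size accordingly.
 */
static struct proto example_proto __maybe_unused = {
        .name     = "EXAMPLE",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct sock),
};

static __maybe_unused int example_proto_init(void)
{
        return proto_register(&example_proto, 1);
}

static __maybe_unused void example_proto_exit(void)
{
        proto_unregister(&example_proto);
}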
2893
2894#ifdef CONFIG_PROC_FS
2895static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2896        __acquires(proto_list_mutex)
2897{
2898        mutex_lock(&proto_list_mutex);
2899        return seq_list_start_head(&proto_list, *pos);
2900}
2901
2902static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2903{
2904        return seq_list_next(v, &proto_list, pos);
2905}
2906
2907static void proto_seq_stop(struct seq_file *seq, void *v)
2908        __releases(proto_list_mutex)
2909{
2910        mutex_unlock(&proto_list_mutex);
2911}
2912
2913static char proto_method_implemented(const void *method)
2914{
2915        return method == NULL ? 'n' : 'y';
2916}
2917static long sock_prot_memory_allocated(struct proto *proto)
2918{
2919        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2920}
2921
2922static char *sock_prot_memory_pressure(struct proto *proto)
2923{
2924        return proto->memory_pressure != NULL ?
2925        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2926}
2927
2928static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2929{
2930
2931        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2932                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2933                   proto->name,
2934                   proto->obj_size,
2935                   sock_prot_inuse_get(seq_file_net(seq), proto),
2936                   sock_prot_memory_allocated(proto),
2937                   sock_prot_memory_pressure(proto),
2938                   proto->max_header,
2939                   proto->slab == NULL ? "no" : "yes",
2940                   module_name(proto->owner),
2941                   proto_method_implemented(proto->close),
2942                   proto_method_implemented(proto->connect),
2943                   proto_method_implemented(proto->disconnect),
2944                   proto_method_implemented(proto->accept),
2945                   proto_method_implemented(proto->ioctl),
2946                   proto_method_implemented(proto->init),
2947                   proto_method_implemented(proto->destroy),
2948                   proto_method_implemented(proto->shutdown),
2949                   proto_method_implemented(proto->setsockopt),
2950                   proto_method_implemented(proto->getsockopt),
2951                   proto_method_implemented(proto->sendmsg),
2952                   proto_method_implemented(proto->recvmsg),
2953                   proto_method_implemented(proto->sendpage),
2954                   proto_method_implemented(proto->bind),
2955                   proto_method_implemented(proto->backlog_rcv),
2956                   proto_method_implemented(proto->hash),
2957                   proto_method_implemented(proto->unhash),
2958                   proto_method_implemented(proto->get_port),
2959                   proto_method_implemented(proto->enter_memory_pressure));
2960}
2961
2962static int proto_seq_show(struct seq_file *seq, void *v)
2963{
2964        if (v == &proto_list)
2965                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2966                           "protocol",
2967                           "size",
2968                           "sockets",
2969                           "memory",
2970                           "press",
2971                           "maxhdr",
2972                           "slab",
2973                           "module",
2974                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2975        else
2976                proto_seq_printf(seq, list_entry(v, struct proto, node));
2977        return 0;
2978}
2979
2980static const struct seq_operations proto_seq_ops = {
2981        .start  = proto_seq_start,
2982        .next   = proto_seq_next,
2983        .stop   = proto_seq_stop,
2984        .show   = proto_seq_show,
2985};
2986
2987static int proto_seq_open(struct inode *inode, struct file *file)
2988{
2989        return seq_open_net(inode, file, &proto_seq_ops,
2990                            sizeof(struct seq_net_private));
2991}
2992
2993static const struct file_operations proto_seq_fops = {
2994        .owner          = THIS_MODULE,
2995        .open           = proto_seq_open,
2996        .read           = seq_read,
2997        .llseek         = seq_lseek,
2998        .release        = seq_release_net,
2999};
3000
3001static __net_init int proto_init_net(struct net *net)
3002{
3003        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3004                return -ENOMEM;
3005
3006        return 0;
3007}
3008
3009static __net_exit void proto_exit_net(struct net *net)
3010{
3011        remove_proc_entry("protocols", net->proc_net);
3012}
3013
3014
3015static __net_initdata struct pernet_operations proto_net_ops = {
3016        .init = proto_init_net,
3017        .exit = proto_exit_net,
3018};
3019
3020static int __init proto_init(void)
3021{
3022        return register_pernet_subsys(&proto_net_ops);
3023}
3024
3025subsys_initcall(proto_init);
3026
3027#endif /* PROC_FS */
3028