linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/timer.h>
 106#include <linux/string.h>
 107#include <linux/sockios.h>
 108#include <linux/net.h>
 109#include <linux/mm.h>
 110#include <linux/slab.h>
 111#include <linux/interrupt.h>
 112#include <linux/poll.h>
 113#include <linux/tcp.h>
 114#include <linux/init.h>
 115#include <linux/highmem.h>
 116#include <linux/user_namespace.h>
 117#include <linux/static_key.h>
 118#include <linux/memcontrol.h>
 119#include <linux/prefetch.h>
 120
 121#include <asm/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134
 135#include <linux/filter.h>
 136
 137#include <trace/events/sock.h>
 138
 139#ifdef CONFIG_INET
 140#include <net/tcp.h>
 141#endif
 142
 143#include <net/busy_poll.h>
 144
 145static DEFINE_MUTEX(proto_list_mutex);
 146static LIST_HEAD(proto_list);
 147
 148/**
 149 * sk_ns_capable - General socket capability test
 150 * @sk: Socket to use a capability on or through
 151 * @user_ns: The user namespace of the capability to use
 152 * @cap: The capability to use
 153 *
  154 * Test to see if the opener of the socket had the capability @cap in the
  155 * user namespace @user_ns when the socket was created, and whether the
  156 * current process has it as well.
 157 */
 158bool sk_ns_capable(const struct sock *sk,
 159                   struct user_namespace *user_ns, int cap)
 160{
 161        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                ns_capable(user_ns, cap);
 163}
 164EXPORT_SYMBOL(sk_ns_capable);
 165
 166/**
 167 * sk_capable - Socket global capability test
 168 * @sk: Socket to use a capability on or through
 169 * @cap: The global capability to use
 170 *
  171 * Test to see if the opener of the socket had the capability @cap in all
  172 * user namespaces when the socket was created, and whether the current
  173 * process has it as well.
 174 */
 175bool sk_capable(const struct sock *sk, int cap)
 176{
 177        return sk_ns_capable(sk, &init_user_ns, cap);
 178}
 179EXPORT_SYMBOL(sk_capable);
 180
 181/**
 182 * sk_net_capable - Network namespace socket capability test
 183 * @sk: Socket to use a capability on or through
 184 * @cap: The capability to use
 185 *
  186 * Test to see if the opener of the socket had the capability @cap over the
  187 * network namespace the socket is a member of when the socket was created,
  188 * and whether the current process has it as well.
 189 */
 190bool sk_net_capable(const struct sock *sk, int cap)
 191{
 192        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193}
 194EXPORT_SYMBOL(sk_net_capable);
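/*
 * A hedged usage sketch (illustrative, not taken from this file): code that
 * acts on behalf of a socket, such as a netlink request handler, would
 * typically gate a privileged operation on one of the helpers above, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */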
 195
 196
 197#ifdef CONFIG_MEMCG_KMEM
 198int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 199{
 200        struct proto *proto;
 201        int ret = 0;
 202
 203        mutex_lock(&proto_list_mutex);
 204        list_for_each_entry(proto, &proto_list, node) {
 205                if (proto->init_cgroup) {
 206                        ret = proto->init_cgroup(memcg, ss);
 207                        if (ret)
 208                                goto out;
 209                }
 210        }
 211
 212        mutex_unlock(&proto_list_mutex);
 213        return ret;
 214out:
 215        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 216                if (proto->destroy_cgroup)
 217                        proto->destroy_cgroup(memcg);
 218        mutex_unlock(&proto_list_mutex);
 219        return ret;
 220}
 221
 222void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 223{
 224        struct proto *proto;
 225
 226        mutex_lock(&proto_list_mutex);
 227        list_for_each_entry_reverse(proto, &proto_list, node)
 228                if (proto->destroy_cgroup)
 229                        proto->destroy_cgroup(memcg);
 230        mutex_unlock(&proto_list_mutex);
 231}
 232#endif
 233
 234/*
 235 * Each address family might have different locking rules, so we have
 236 * one slock key per address family:
 237 */
 238static struct lock_class_key af_family_keys[AF_MAX];
 239static struct lock_class_key af_family_slock_keys[AF_MAX];
 240
 241#if defined(CONFIG_MEMCG_KMEM)
 242struct static_key memcg_socket_limit_enabled;
 243EXPORT_SYMBOL(memcg_socket_limit_enabled);
 244#endif
 245
 246/*
  247 * Make lock validator output more readable. (We pre-construct these
  248 * strings at build time, so that runtime initialization of socket
  249 * locks is fast):
 250 */
 251static const char *const af_family_key_strings[AF_MAX+1] = {
 252  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 253  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 254  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 255  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 256  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 257  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 258  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 259  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 260  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 261  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 262  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 263  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 264  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 265  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 266};
 267static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 268  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 269  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 270  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 271  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 272  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 273  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 274  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 275  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 276  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 277  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 278  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 279  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 280  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  281  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
 282};
 283static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 284  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 285  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 286  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 287  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 288  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 289  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 290  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 291  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 292  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 293  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 294  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 295  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 296  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 297  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 298};
 299
 300/*
 301 * sk_callback_lock locking rules are per-address-family,
 302 * so split the lock classes by using a per-AF key:
 303 */
 304static struct lock_class_key af_callback_keys[AF_MAX];
 305
 306/* Take into consideration the size of the struct sk_buff overhead in the
 307 * determination of these values, since that is non-constant across
 308 * platforms.  This makes socket queueing behavior and performance
 309 * not depend upon such differences.
 310 */
 311#define _SK_MEM_PACKETS         256
 312#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 313#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 314#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
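/*
 * Rough arithmetic behind the defaults above (a sketch, not a guarantee):
 * SKB_TRUESIZE(256) is 256 bytes of payload plus the cache-aligned sizes of
 * struct sk_buff and struct skb_shared_info, i.e. somewhere around 800 bytes
 * to 1 KB on a typical 64-bit build. Multiplied by _SK_MEM_PACKETS (256),
 * SK_WMEM_MAX and SK_RMEM_MAX therefore default to a couple of hundred
 * kilobytes per socket, the exact figure varying with architecture and
 * kernel configuration.
 */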
 315
 316/* Run time adjustable parameters. */
 317__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 318EXPORT_SYMBOL(sysctl_wmem_max);
 319__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 320EXPORT_SYMBOL(sysctl_rmem_max);
 321__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 322__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 323
 324/* Maximal space eaten by iovec or ancillary data plus some space */
 325int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 326EXPORT_SYMBOL(sysctl_optmem_max);
 327
 328int sysctl_tstamp_allow_data __read_mostly = 1;
 329
 330struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 331EXPORT_SYMBOL_GPL(memalloc_socks);
 332
 333/**
 334 * sk_set_memalloc - sets %SOCK_MEMALLOC
 335 * @sk: socket to set it on
 336 *
 337 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 338 * It's the responsibility of the admin to adjust min_free_kbytes
  339 * to meet the requirements.
 340 */
 341void sk_set_memalloc(struct sock *sk)
 342{
 343        sock_set_flag(sk, SOCK_MEMALLOC);
 344        sk->sk_allocation |= __GFP_MEMALLOC;
 345        static_key_slow_inc(&memalloc_socks);
 346}
 347EXPORT_SYMBOL_GPL(sk_set_memalloc);
 348
 349void sk_clear_memalloc(struct sock *sk)
 350{
 351        sock_reset_flag(sk, SOCK_MEMALLOC);
 352        sk->sk_allocation &= ~__GFP_MEMALLOC;
 353        static_key_slow_dec(&memalloc_socks);
 354
 355        /*
  356         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
  357         * progress of swapping. SOCK_MEMALLOC may be cleared while the socket
  358         * still holds rmem allocations (e.g. when the last swapfile is
  359         * deactivated), leaving it at risk of being unusable because it already
  360         * exceeds the rmem limits. Reclaim the reserves and obey rmem limits again.
 361         */
 362        sk_mem_reclaim(sk);
 363}
 364EXPORT_SYMBOL_GPL(sk_clear_memalloc);
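/*
 * A hedged pairing sketch (illustrative, not from this file): a transport
 * that carries swap traffic, e.g. swap over NBD or NFS, brackets the life
 * of its kernel socket with these helpers so that it may dip into the
 * PF_MEMALLOC reserves while servicing swap I/O:
 *
 *	sk_set_memalloc(sock->sk);
 *	... service swap-out/swap-in requests ...
 *	sk_clear_memalloc(sock->sk);
 */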
 365
 366int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 367{
 368        int ret;
 369        unsigned long pflags = current->flags;
 370
 371        /* these should have been dropped before queueing */
 372        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 373
 374        current->flags |= PF_MEMALLOC;
 375        ret = sk->sk_backlog_rcv(sk, skb);
 376        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 377
 378        return ret;
 379}
 380EXPORT_SYMBOL(__sk_backlog_rcv);
 381
 382static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 383{
 384        struct timeval tv;
 385
 386        if (optlen < sizeof(tv))
 387                return -EINVAL;
 388        if (copy_from_user(&tv, optval, sizeof(tv)))
 389                return -EFAULT;
 390        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 391                return -EDOM;
 392
 393        if (tv.tv_sec < 0) {
 394                static int warned __read_mostly;
 395
 396                *timeo_p = 0;
 397                if (warned < 10 && net_ratelimit()) {
 398                        warned++;
 399                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 400                                __func__, current->comm, task_pid_nr(current));
 401                }
 402                return 0;
 403        }
 404        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 405        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 406                return 0;
 407        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 408                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 409        return 0;
 410}
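/*
 * For reference, a minimal (hypothetical) user-space caller that ends up in
 * sock_set_timeout() via SO_RCVTIMEO; the timeval is converted to jiffies,
 * with the microseconds rounded up to the next tick:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */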
 411
 412static void sock_warn_obsolete_bsdism(const char *name)
 413{
 414        static int warned;
 415        static char warncomm[TASK_COMM_LEN];
 416        if (strcmp(warncomm, current->comm) && warned < 5) {
 417                strcpy(warncomm,  current->comm);
 418                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 419                        warncomm, name);
 420                warned++;
 421        }
 422}
 423
 424#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 425
 426static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 427{
 428        if (sk->sk_flags & flags) {
 429                sk->sk_flags &= ~flags;
 430                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 431                        net_disable_timestamp();
 432        }
 433}
 434
 435
 436int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 437{
 438        int err;
 439        unsigned long flags;
 440        struct sk_buff_head *list = &sk->sk_receive_queue;
 441
 442        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 443                atomic_inc(&sk->sk_drops);
 444                trace_sock_rcvqueue_full(sk, skb);
 445                return -ENOMEM;
 446        }
 447
 448        err = sk_filter(sk, skb);
 449        if (err)
 450                return err;
 451
 452        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 453                atomic_inc(&sk->sk_drops);
 454                return -ENOBUFS;
 455        }
 456
 457        skb->dev = NULL;
 458        skb_set_owner_r(skb, sk);
 459
  460        /* We escape from the RCU-protected region, so make sure we don't
  461         * leak a non-refcounted dst.
  462         */
 463        skb_dst_force(skb);
 464
 465        spin_lock_irqsave(&list->lock, flags);
 466        sock_skb_set_dropcount(sk, skb);
 467        __skb_queue_tail(list, skb);
 468        spin_unlock_irqrestore(&list->lock, flags);
 469
 470        if (!sock_flag(sk, SOCK_DEAD))
 471                sk->sk_data_ready(sk);
 472        return 0;
 473}
 474EXPORT_SYMBOL(sock_queue_rcv_skb);
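/*
 * A hedged sketch of the usual caller contract: a datagram protocol's
 * receive path hands ownership of the skb to sock_queue_rcv_skb() only on
 * success; on a negative return the caller still owns it and typically
 * frees it and accounts the drop, e.g.:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		... bump the protocol's drop counters ...
 *	}
 */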
 475
 476int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 477{
 478        int rc = NET_RX_SUCCESS;
 479
 480        if (sk_filter(sk, skb))
 481                goto discard_and_relse;
 482
 483        skb->dev = NULL;
 484
 485        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 486                atomic_inc(&sk->sk_drops);
 487                goto discard_and_relse;
 488        }
 489        if (nested)
 490                bh_lock_sock_nested(sk);
 491        else
 492                bh_lock_sock(sk);
 493        if (!sock_owned_by_user(sk)) {
 494                /*
 495                 * trylock + unlock semantics:
 496                 */
 497                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 498
 499                rc = sk_backlog_rcv(sk, skb);
 500
 501                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 502        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 503                bh_unlock_sock(sk);
 504                atomic_inc(&sk->sk_drops);
 505                goto discard_and_relse;
 506        }
 507
 508        bh_unlock_sock(sk);
 509out:
 510        sock_put(sk);
 511        return rc;
 512discard_and_relse:
 513        kfree_skb(skb);
 514        goto out;
 515}
 516EXPORT_SYMBOL(sk_receive_skb);
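/*
 * A hedged note on the calling convention visible above: sk_receive_skb()
 * unconditionally drops one socket reference (the sock_put() at "out:"),
 * so a caller is expected to hold a reference across the call, typically
 * the one taken by its socket lookup; an explicit sketch would be:
 *
 *	sock_hold(sk);
 *	ret = sk_receive_skb(sk, skb, 0);	// the reference is dropped for us
 */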
 517
 518struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 519{
 520        struct dst_entry *dst = __sk_dst_get(sk);
 521
 522        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 523                sk_tx_queue_clear(sk);
 524                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 525                dst_release(dst);
 526                return NULL;
 527        }
 528
 529        return dst;
 530}
 531EXPORT_SYMBOL(__sk_dst_check);
 532
 533struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 534{
 535        struct dst_entry *dst = sk_dst_get(sk);
 536
 537        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 538                sk_dst_reset(sk);
 539                dst_release(dst);
 540                return NULL;
 541        }
 542
 543        return dst;
 544}
 545EXPORT_SYMBOL(sk_dst_check);
 546
 547static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 548                                int optlen)
 549{
 550        int ret = -ENOPROTOOPT;
 551#ifdef CONFIG_NETDEVICES
 552        struct net *net = sock_net(sk);
 553        char devname[IFNAMSIZ];
 554        int index;
 555
 556        /* Sorry... */
 557        ret = -EPERM;
 558        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 559                goto out;
 560
 561        ret = -EINVAL;
 562        if (optlen < 0)
 563                goto out;
 564
 565        /* Bind this socket to a particular device like "eth0",
 566         * as specified in the passed interface name. If the
 567         * name is "" or the option length is zero the socket
 568         * is not bound.
 569         */
 570        if (optlen > IFNAMSIZ - 1)
 571                optlen = IFNAMSIZ - 1;
 572        memset(devname, 0, sizeof(devname));
 573
 574        ret = -EFAULT;
 575        if (copy_from_user(devname, optval, optlen))
 576                goto out;
 577
 578        index = 0;
 579        if (devname[0] != '\0') {
 580                struct net_device *dev;
 581
 582                rcu_read_lock();
 583                dev = dev_get_by_name_rcu(net, devname);
 584                if (dev)
 585                        index = dev->ifindex;
 586                rcu_read_unlock();
 587                ret = -ENODEV;
 588                if (!dev)
 589                        goto out;
 590        }
 591
 592        lock_sock(sk);
 593        sk->sk_bound_dev_if = index;
 594        sk_dst_reset(sk);
 595        release_sock(sk);
 596
 597        ret = 0;
 598
 599out:
 600#endif
 601
 602        return ret;
 603}
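/*
 * A minimal (hypothetical) user-space sketch that reaches the helper above;
 * the caller needs CAP_NET_RAW in the socket's network namespace, and an
 * empty device name unbinds the socket again:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 */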
 604
 605static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 606                                int __user *optlen, int len)
 607{
 608        int ret = -ENOPROTOOPT;
 609#ifdef CONFIG_NETDEVICES
 610        struct net *net = sock_net(sk);
 611        char devname[IFNAMSIZ];
 612
 613        if (sk->sk_bound_dev_if == 0) {
 614                len = 0;
 615                goto zero;
 616        }
 617
 618        ret = -EINVAL;
 619        if (len < IFNAMSIZ)
 620                goto out;
 621
 622        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 623        if (ret)
 624                goto out;
 625
 626        len = strlen(devname) + 1;
 627
 628        ret = -EFAULT;
 629        if (copy_to_user(optval, devname, len))
 630                goto out;
 631
 632zero:
 633        ret = -EFAULT;
 634        if (put_user(len, optlen))
 635                goto out;
 636
 637        ret = 0;
 638
 639out:
 640#endif
 641
 642        return ret;
 643}
 644
 645static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 646{
 647        if (valbool)
 648                sock_set_flag(sk, bit);
 649        else
 650                sock_reset_flag(sk, bit);
 651}
 652
 653bool sk_mc_loop(struct sock *sk)
 654{
 655        if (dev_recursion_level())
 656                return false;
 657        if (!sk)
 658                return true;
 659        switch (sk->sk_family) {
 660        case AF_INET:
 661                return inet_sk(sk)->mc_loop;
 662#if IS_ENABLED(CONFIG_IPV6)
 663        case AF_INET6:
 664                return inet6_sk(sk)->mc_loop;
 665#endif
 666        }
 667        WARN_ON(1);
 668        return true;
 669}
 670EXPORT_SYMBOL(sk_mc_loop);
 671
 672/*
 673 *      This is meant for all protocols to use and covers goings on
 674 *      at the socket level. Everything here is generic.
 675 */
 676
 677int sock_setsockopt(struct socket *sock, int level, int optname,
 678                    char __user *optval, unsigned int optlen)
 679{
 680        struct sock *sk = sock->sk;
 681        int val;
 682        int valbool;
 683        struct linger ling;
 684        int ret = 0;
 685
 686        /*
 687         *      Options without arguments
 688         */
 689
 690        if (optname == SO_BINDTODEVICE)
 691                return sock_setbindtodevice(sk, optval, optlen);
 692
 693        if (optlen < sizeof(int))
 694                return -EINVAL;
 695
 696        if (get_user(val, (int __user *)optval))
 697                return -EFAULT;
 698
 699        valbool = val ? 1 : 0;
 700
 701        lock_sock(sk);
 702
 703        switch (optname) {
 704        case SO_DEBUG:
 705                if (val && !capable(CAP_NET_ADMIN))
 706                        ret = -EACCES;
 707                else
 708                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 709                break;
 710        case SO_REUSEADDR:
 711                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 712                break;
 713        case SO_REUSEPORT:
 714                sk->sk_reuseport = valbool;
 715                break;
 716        case SO_TYPE:
 717        case SO_PROTOCOL:
 718        case SO_DOMAIN:
 719        case SO_ERROR:
 720                ret = -ENOPROTOOPT;
 721                break;
 722        case SO_DONTROUTE:
 723                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 724                break;
 725        case SO_BROADCAST:
 726                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 727                break;
 728        case SO_SNDBUF:
  729                /* Don't error on this; BSD doesn't, and if you think
  730                 * about it, this is right. Otherwise apps have to
  731                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  732                 * are treated in BSD as hints.
 733                 */
 734                val = min_t(u32, val, sysctl_wmem_max);
 735set_sndbuf:
 736                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 737                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 738                /* Wake up sending tasks if we upped the value. */
 739                sk->sk_write_space(sk);
 740                break;
 741
 742        case SO_SNDBUFFORCE:
 743                if (!capable(CAP_NET_ADMIN)) {
 744                        ret = -EPERM;
 745                        break;
 746                }
 747                goto set_sndbuf;
 748
 749        case SO_RCVBUF:
  750                /* Don't error on this; BSD doesn't, and if you think
  751                 * about it, this is right. Otherwise apps have to
  752                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
  753                 * are treated in BSD as hints.
 754                 */
 755                val = min_t(u32, val, sysctl_rmem_max);
 756set_rcvbuf:
 757                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 758                /*
 759                 * We double it on the way in to account for
 760                 * "struct sk_buff" etc. overhead.   Applications
 761                 * assume that the SO_RCVBUF setting they make will
 762                 * allow that much actual data to be received on that
 763                 * socket.
 764                 *
 765                 * Applications are unaware that "struct sk_buff" and
 766                 * other overheads allocate from the receive buffer
 767                 * during socket buffer allocation.
 768                 *
 769                 * And after considering the possible alternatives,
 770                 * returning the value we actually used in getsockopt
 771                 * is the most desirable behavior.
 772                 */
 773                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 774                break;
 775
 776        case SO_RCVBUFFORCE:
 777                if (!capable(CAP_NET_ADMIN)) {
 778                        ret = -EPERM;
 779                        break;
 780                }
 781                goto set_rcvbuf;
 782
 783        case SO_KEEPALIVE:
 784#ifdef CONFIG_INET
 785                if (sk->sk_protocol == IPPROTO_TCP &&
 786                    sk->sk_type == SOCK_STREAM)
 787                        tcp_set_keepalive(sk, valbool);
 788#endif
 789                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 790                break;
 791
 792        case SO_OOBINLINE:
 793                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 794                break;
 795
 796        case SO_NO_CHECK:
 797                sk->sk_no_check_tx = valbool;
 798                break;
 799
 800        case SO_PRIORITY:
 801                if ((val >= 0 && val <= 6) ||
 802                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 803                        sk->sk_priority = val;
 804                else
 805                        ret = -EPERM;
 806                break;
 807
 808        case SO_LINGER:
 809                if (optlen < sizeof(ling)) {
 810                        ret = -EINVAL;  /* 1003.1g */
 811                        break;
 812                }
 813                if (copy_from_user(&ling, optval, sizeof(ling))) {
 814                        ret = -EFAULT;
 815                        break;
 816                }
 817                if (!ling.l_onoff)
 818                        sock_reset_flag(sk, SOCK_LINGER);
 819                else {
 820#if (BITS_PER_LONG == 32)
 821                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 822                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 823                        else
 824#endif
 825                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 826                        sock_set_flag(sk, SOCK_LINGER);
 827                }
 828                break;
 829
 830        case SO_BSDCOMPAT:
 831                sock_warn_obsolete_bsdism("setsockopt");
 832                break;
 833
 834        case SO_PASSCRED:
 835                if (valbool)
 836                        set_bit(SOCK_PASSCRED, &sock->flags);
 837                else
 838                        clear_bit(SOCK_PASSCRED, &sock->flags);
 839                break;
 840
 841        case SO_TIMESTAMP:
 842        case SO_TIMESTAMPNS:
 843                if (valbool)  {
 844                        if (optname == SO_TIMESTAMP)
 845                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 846                        else
 847                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 848                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 849                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 850                } else {
 851                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 852                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 853                }
 854                break;
 855
 856        case SO_TIMESTAMPING:
 857                if (val & ~SOF_TIMESTAMPING_MASK) {
 858                        ret = -EINVAL;
 859                        break;
 860                }
 861
 862                if (val & SOF_TIMESTAMPING_OPT_ID &&
 863                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 864                        if (sk->sk_protocol == IPPROTO_TCP) {
 865                                if (sk->sk_state != TCP_ESTABLISHED) {
 866                                        ret = -EINVAL;
 867                                        break;
 868                                }
 869                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 870                        } else {
 871                                sk->sk_tskey = 0;
 872                        }
 873                }
 874                sk->sk_tsflags = val;
 875                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 876                        sock_enable_timestamp(sk,
 877                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 878                else
 879                        sock_disable_timestamp(sk,
 880                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 881                break;
 882
 883        case SO_RCVLOWAT:
 884                if (val < 0)
 885                        val = INT_MAX;
 886                sk->sk_rcvlowat = val ? : 1;
 887                break;
 888
 889        case SO_RCVTIMEO:
 890                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 891                break;
 892
 893        case SO_SNDTIMEO:
 894                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 895                break;
 896
 897        case SO_ATTACH_FILTER:
 898                ret = -EINVAL;
 899                if (optlen == sizeof(struct sock_fprog)) {
 900                        struct sock_fprog fprog;
 901
 902                        ret = -EFAULT;
 903                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 904                                break;
 905
 906                        ret = sk_attach_filter(&fprog, sk);
 907                }
 908                break;
 909
 910        case SO_ATTACH_BPF:
 911                ret = -EINVAL;
 912                if (optlen == sizeof(u32)) {
 913                        u32 ufd;
 914
 915                        ret = -EFAULT;
 916                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 917                                break;
 918
 919                        ret = sk_attach_bpf(ufd, sk);
 920                }
 921                break;
 922
 923        case SO_DETACH_FILTER:
 924                ret = sk_detach_filter(sk);
 925                break;
 926
 927        case SO_LOCK_FILTER:
 928                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 929                        ret = -EPERM;
 930                else
 931                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 932                break;
 933
 934        case SO_PASSSEC:
 935                if (valbool)
 936                        set_bit(SOCK_PASSSEC, &sock->flags);
 937                else
 938                        clear_bit(SOCK_PASSSEC, &sock->flags);
 939                break;
 940        case SO_MARK:
 941                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 942                        ret = -EPERM;
 943                else
 944                        sk->sk_mark = val;
 945                break;
 946
 947        case SO_RXQ_OVFL:
 948                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 949                break;
 950
 951        case SO_WIFI_STATUS:
 952                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 953                break;
 954
 955        case SO_PEEK_OFF:
 956                if (sock->ops->set_peek_off)
 957                        ret = sock->ops->set_peek_off(sk, val);
 958                else
 959                        ret = -EOPNOTSUPP;
 960                break;
 961
 962        case SO_NOFCS:
 963                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 964                break;
 965
 966        case SO_SELECT_ERR_QUEUE:
 967                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 968                break;
 969
 970#ifdef CONFIG_NET_RX_BUSY_POLL
 971        case SO_BUSY_POLL:
 972                /* allow unprivileged users to decrease the value */
 973                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 974                        ret = -EPERM;
 975                else {
 976                        if (val < 0)
 977                                ret = -EINVAL;
 978                        else
 979                                sk->sk_ll_usec = val;
 980                }
 981                break;
 982#endif
 983
 984        case SO_MAX_PACING_RATE:
 985                sk->sk_max_pacing_rate = val;
 986                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 987                                         sk->sk_max_pacing_rate);
 988                break;
 989
 990        default:
 991                ret = -ENOPROTOOPT;
 992                break;
 993        }
 994        release_sock(sk);
 995        return ret;
 996}
 997EXPORT_SYMBOL(sock_setsockopt);
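/*
 * A hedged user-space illustration of the SO_RCVBUF/SO_SNDBUF behaviour
 * described above: the kernel doubles the requested value to cover
 * struct sk_buff overhead (subject to the sysctl rmem_max/wmem_max caps),
 * and getsockopt() then reports the doubled value:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);	// val is now ~131072
 */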
 998
 999
1000static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1001                          struct ucred *ucred)
1002{
1003        ucred->pid = pid_vnr(pid);
1004        ucred->uid = ucred->gid = -1;
1005        if (cred) {
1006                struct user_namespace *current_ns = current_user_ns();
1007
1008                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1009                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1010        }
1011}
1012
1013int sock_getsockopt(struct socket *sock, int level, int optname,
1014                    char __user *optval, int __user *optlen)
1015{
1016        struct sock *sk = sock->sk;
1017
1018        union {
1019                int val;
1020                struct linger ling;
1021                struct timeval tm;
1022        } v;
1023
1024        int lv = sizeof(int);
1025        int len;
1026
1027        if (get_user(len, optlen))
1028                return -EFAULT;
1029        if (len < 0)
1030                return -EINVAL;
1031
1032        memset(&v, 0, sizeof(v));
1033
1034        switch (optname) {
1035        case SO_DEBUG:
1036                v.val = sock_flag(sk, SOCK_DBG);
1037                break;
1038
1039        case SO_DONTROUTE:
1040                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1041                break;
1042
1043        case SO_BROADCAST:
1044                v.val = sock_flag(sk, SOCK_BROADCAST);
1045                break;
1046
1047        case SO_SNDBUF:
1048                v.val = sk->sk_sndbuf;
1049                break;
1050
1051        case SO_RCVBUF:
1052                v.val = sk->sk_rcvbuf;
1053                break;
1054
1055        case SO_REUSEADDR:
1056                v.val = sk->sk_reuse;
1057                break;
1058
1059        case SO_REUSEPORT:
1060                v.val = sk->sk_reuseport;
1061                break;
1062
1063        case SO_KEEPALIVE:
1064                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1065                break;
1066
1067        case SO_TYPE:
1068                v.val = sk->sk_type;
1069                break;
1070
1071        case SO_PROTOCOL:
1072                v.val = sk->sk_protocol;
1073                break;
1074
1075        case SO_DOMAIN:
1076                v.val = sk->sk_family;
1077                break;
1078
1079        case SO_ERROR:
1080                v.val = -sock_error(sk);
1081                if (v.val == 0)
1082                        v.val = xchg(&sk->sk_err_soft, 0);
1083                break;
1084
1085        case SO_OOBINLINE:
1086                v.val = sock_flag(sk, SOCK_URGINLINE);
1087                break;
1088
1089        case SO_NO_CHECK:
1090                v.val = sk->sk_no_check_tx;
1091                break;
1092
1093        case SO_PRIORITY:
1094                v.val = sk->sk_priority;
1095                break;
1096
1097        case SO_LINGER:
1098                lv              = sizeof(v.ling);
1099                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1100                v.ling.l_linger = sk->sk_lingertime / HZ;
1101                break;
1102
1103        case SO_BSDCOMPAT:
1104                sock_warn_obsolete_bsdism("getsockopt");
1105                break;
1106
1107        case SO_TIMESTAMP:
1108                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1109                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1110                break;
1111
1112        case SO_TIMESTAMPNS:
1113                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1114                break;
1115
1116        case SO_TIMESTAMPING:
1117                v.val = sk->sk_tsflags;
1118                break;
1119
1120        case SO_RCVTIMEO:
1121                lv = sizeof(struct timeval);
1122                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1123                        v.tm.tv_sec = 0;
1124                        v.tm.tv_usec = 0;
1125                } else {
1126                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1127                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1128                }
1129                break;
1130
1131        case SO_SNDTIMEO:
1132                lv = sizeof(struct timeval);
1133                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1134                        v.tm.tv_sec = 0;
1135                        v.tm.tv_usec = 0;
1136                } else {
1137                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1138                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1139                }
1140                break;
1141
1142        case SO_RCVLOWAT:
1143                v.val = sk->sk_rcvlowat;
1144                break;
1145
1146        case SO_SNDLOWAT:
1147                v.val = 1;
1148                break;
1149
1150        case SO_PASSCRED:
1151                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1152                break;
1153
1154        case SO_PEERCRED:
1155        {
1156                struct ucred peercred;
1157                if (len > sizeof(peercred))
1158                        len = sizeof(peercred);
1159                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1160                if (copy_to_user(optval, &peercred, len))
1161                        return -EFAULT;
1162                goto lenout;
1163        }
1164
1165        case SO_PEERNAME:
1166        {
1167                char address[128];
1168
1169                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1170                        return -ENOTCONN;
1171                if (lv < len)
1172                        return -EINVAL;
1173                if (copy_to_user(optval, address, len))
1174                        return -EFAULT;
1175                goto lenout;
1176        }
1177
1178        /* Dubious BSD thing... Probably nobody even uses it, but
1179         * the UNIX standard wants it for whatever reason... -DaveM
1180         */
1181        case SO_ACCEPTCONN:
1182                v.val = sk->sk_state == TCP_LISTEN;
1183                break;
1184
1185        case SO_PASSSEC:
1186                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1187                break;
1188
1189        case SO_PEERSEC:
1190                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1191
1192        case SO_MARK:
1193                v.val = sk->sk_mark;
1194                break;
1195
1196        case SO_RXQ_OVFL:
1197                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1198                break;
1199
1200        case SO_WIFI_STATUS:
1201                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1202                break;
1203
1204        case SO_PEEK_OFF:
1205                if (!sock->ops->set_peek_off)
1206                        return -EOPNOTSUPP;
1207
1208                v.val = sk->sk_peek_off;
1209                break;
1210        case SO_NOFCS:
1211                v.val = sock_flag(sk, SOCK_NOFCS);
1212                break;
1213
1214        case SO_BINDTODEVICE:
1215                return sock_getbindtodevice(sk, optval, optlen, len);
1216
1217        case SO_GET_FILTER:
1218                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1219                if (len < 0)
1220                        return len;
1221
1222                goto lenout;
1223
1224        case SO_LOCK_FILTER:
1225                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1226                break;
1227
1228        case SO_BPF_EXTENSIONS:
1229                v.val = bpf_tell_extensions();
1230                break;
1231
1232        case SO_SELECT_ERR_QUEUE:
1233                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1234                break;
1235
1236#ifdef CONFIG_NET_RX_BUSY_POLL
1237        case SO_BUSY_POLL:
1238                v.val = sk->sk_ll_usec;
1239                break;
1240#endif
1241
1242        case SO_MAX_PACING_RATE:
1243                v.val = sk->sk_max_pacing_rate;
1244                break;
1245
1246        case SO_INCOMING_CPU:
1247                v.val = sk->sk_incoming_cpu;
1248                break;
1249
1250        default:
 1251                /* We implement SO_SNDLOWAT etc. to not be settable
1252                 * (1003.1g 7).
1253                 */
1254                return -ENOPROTOOPT;
1255        }
1256
1257        if (len > lv)
1258                len = lv;
1259        if (copy_to_user(optval, &v, len))
1260                return -EFAULT;
1261lenout:
1262        if (put_user(len, optlen))
1263                return -EFAULT;
1264        return 0;
1265}
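/*
 * A hedged user-space sketch of the common SO_ERROR pattern handled above,
 * e.g. after a non-blocking connect() reports writability: the pending
 * error is fetched and cleared in a single call:
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 */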
1266
1267/*
1268 * Initialize an sk_lock.
1269 *
1270 * (We also register the sk_lock with the lock validator.)
1271 */
1272static inline void sock_lock_init(struct sock *sk)
1273{
1274        sock_lock_init_class_and_name(sk,
1275                        af_family_slock_key_strings[sk->sk_family],
1276                        af_family_slock_keys + sk->sk_family,
1277                        af_family_key_strings[sk->sk_family],
1278                        af_family_keys + sk->sk_family);
1279}
1280
1281/*
1282 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1283 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 1284 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1285 */
1286static void sock_copy(struct sock *nsk, const struct sock *osk)
1287{
1288#ifdef CONFIG_SECURITY_NETWORK
1289        void *sptr = nsk->sk_security;
1290#endif
1291        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1292
1293        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1294               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1295
1296#ifdef CONFIG_SECURITY_NETWORK
1297        nsk->sk_security = sptr;
1298        security_sk_clone(osk, nsk);
1299#endif
1300}
1301
1302void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1303{
1304        unsigned long nulls1, nulls2;
1305
1306        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1307        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1308        if (nulls1 > nulls2)
1309                swap(nulls1, nulls2);
1310
1311        if (nulls1 != 0)
1312                memset((char *)sk, 0, nulls1);
1313        memset((char *)sk + nulls1 + sizeof(void *), 0,
1314               nulls2 - nulls1 - sizeof(void *));
1315        memset((char *)sk + nulls2 + sizeof(void *), 0,
1316               size - nulls2 - sizeof(void *));
1317}
1318EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1319
1320static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1321                int family)
1322{
1323        struct sock *sk;
1324        struct kmem_cache *slab;
1325
1326        slab = prot->slab;
1327        if (slab != NULL) {
1328                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1329                if (!sk)
1330                        return sk;
1331                if (priority & __GFP_ZERO) {
1332                        if (prot->clear_sk)
1333                                prot->clear_sk(sk, prot->obj_size);
1334                        else
1335                                sk_prot_clear_nulls(sk, prot->obj_size);
1336                }
1337        } else
1338                sk = kmalloc(prot->obj_size, priority);
1339
1340        if (sk != NULL) {
1341                kmemcheck_annotate_bitfield(sk, flags);
1342
1343                if (security_sk_alloc(sk, family, priority))
1344                        goto out_free;
1345
1346                if (!try_module_get(prot->owner))
1347                        goto out_free_sec;
1348                sk_tx_queue_clear(sk);
1349        }
1350
1351        return sk;
1352
1353out_free_sec:
1354        security_sk_free(sk);
1355out_free:
1356        if (slab != NULL)
1357                kmem_cache_free(slab, sk);
1358        else
1359                kfree(sk);
1360        return NULL;
1361}
1362
1363static void sk_prot_free(struct proto *prot, struct sock *sk)
1364{
1365        struct kmem_cache *slab;
1366        struct module *owner;
1367
1368        owner = prot->owner;
1369        slab = prot->slab;
1370
1371        security_sk_free(sk);
1372        if (slab != NULL)
1373                kmem_cache_free(slab, sk);
1374        else
1375                kfree(sk);
1376        module_put(owner);
1377}
1378
1379#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1380void sock_update_netprioidx(struct sock *sk)
1381{
1382        if (in_interrupt())
1383                return;
1384
1385        sk->sk_cgrp_prioidx = task_netprioidx(current);
1386}
1387EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1388#endif
1389
1390/**
1391 *      sk_alloc - All socket objects are allocated here
1392 *      @net: the applicable net namespace
1393 *      @family: protocol family
1394 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1395 *      @prot: struct proto associated with this new sock instance
1396 */
1397struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1398                      struct proto *prot)
1399{
1400        struct sock *sk;
1401
1402        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1403        if (sk) {
1404                sk->sk_family = family;
1405                /*
1406                 * See comment in struct sock definition to understand
1407                 * why we need sk_prot_creator -acme
1408                 */
1409                sk->sk_prot = sk->sk_prot_creator = prot;
1410                sock_lock_init(sk);
1411                sock_net_set(sk, get_net(net));
1412                atomic_set(&sk->sk_wmem_alloc, 1);
1413
1414                sock_update_classid(sk);
1415                sock_update_netprioidx(sk);
1416        }
1417
1418        return sk;
1419}
1420EXPORT_SYMBOL(sk_alloc);
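/*
 * A hedged sketch of how an address family's ->create() handler typically
 * uses sk_alloc(); PF_EXAMPLE and example_proto below are purely
 * illustrative names, not defined in this file:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */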
1421
1422static void __sk_free(struct sock *sk)
1423{
1424        struct sk_filter *filter;
1425
1426        if (sk->sk_destruct)
1427                sk->sk_destruct(sk);
1428
1429        filter = rcu_dereference_check(sk->sk_filter,
1430                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1431        if (filter) {
1432                sk_filter_uncharge(sk, filter);
1433                RCU_INIT_POINTER(sk->sk_filter, NULL);
1434        }
1435
1436        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1437
1438        if (atomic_read(&sk->sk_omem_alloc))
1439                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1440                         __func__, atomic_read(&sk->sk_omem_alloc));
1441
1442        if (sk->sk_peer_cred)
1443                put_cred(sk->sk_peer_cred);
1444        put_pid(sk->sk_peer_pid);
1445        put_net(sock_net(sk));
1446        sk_prot_free(sk->sk_prot_creator, sk);
1447}
1448
1449void sk_free(struct sock *sk)
1450{
1451        /*
 1452         * We subtract one from sk_wmem_alloc so we can tell whether
 1453         * some packets are still in some tx queue.
 1454         * If the count is not yet zero, sock_wfree() will call __sk_free(sk) later.
1455         */
1456        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1457                __sk_free(sk);
1458}
1459EXPORT_SYMBOL(sk_free);
1460
1461/*
 1462 * The last sock_put should drop the reference to sk->sk_net. It has already
 1463 * been dropped in sk_change_net, and taking a reference to a namespace that
 1464 * is being stopped is not an option.
 1465 * Instead, take a reference to the socket to remove it from the hash while
 1466 * still _alive_, and after that destroy it in the context of init_net.
1467 */
1468void sk_release_kernel(struct sock *sk)
1469{
1470        if (sk == NULL || sk->sk_socket == NULL)
1471                return;
1472
1473        sock_hold(sk);
1474        sock_release(sk->sk_socket);
1475        sock_net_set(sk, get_net(&init_net));
1476        sock_put(sk);
1477}
1478EXPORT_SYMBOL(sk_release_kernel);
1479
1480static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1481{
1482        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1483                sock_update_memcg(newsk);
1484}
1485
1486/**
1487 *      sk_clone_lock - clone a socket, and lock its clone
1488 *      @sk: the socket to clone
1489 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1490 *
1491 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1492 */
1493struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1494{
1495        struct sock *newsk;
1496        bool is_charged = true;
1497
1498        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1499        if (newsk != NULL) {
1500                struct sk_filter *filter;
1501
1502                sock_copy(newsk, sk);
1503
1504                /* SANITY */
1505                get_net(sock_net(newsk));
1506                sk_node_init(&newsk->sk_node);
1507                sock_lock_init(newsk);
1508                bh_lock_sock(newsk);
1509                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1510                newsk->sk_backlog.len = 0;
1511
1512                atomic_set(&newsk->sk_rmem_alloc, 0);
1513                /*
1514                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1515                 */
1516                atomic_set(&newsk->sk_wmem_alloc, 1);
1517                atomic_set(&newsk->sk_omem_alloc, 0);
1518                skb_queue_head_init(&newsk->sk_receive_queue);
1519                skb_queue_head_init(&newsk->sk_write_queue);
1520
1521                spin_lock_init(&newsk->sk_dst_lock);
1522                rwlock_init(&newsk->sk_callback_lock);
1523                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1524                                af_callback_keys + newsk->sk_family,
1525                                af_family_clock_key_strings[newsk->sk_family]);
1526
1527                newsk->sk_dst_cache     = NULL;
1528                newsk->sk_wmem_queued   = 0;
1529                newsk->sk_forward_alloc = 0;
1530                newsk->sk_send_head     = NULL;
1531                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1532
1533                sock_reset_flag(newsk, SOCK_DONE);
1534                skb_queue_head_init(&newsk->sk_error_queue);
1535
1536                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1537                if (filter != NULL)
1538                        /* though it's an empty new sock, the charging may fail
1539                         * if sysctl_optmem_max was changed between the creation of
1540                         * the original socket and the clone
1541                         */
1542                        is_charged = sk_filter_charge(newsk, filter);
1543
1544                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
1545                        /* It is still a raw copy of the parent, so invalidate
1546                         * the destructor and do a plain sk_free() */
1547                        newsk->sk_destruct = NULL;
1548                        bh_unlock_sock(newsk);
1549                        sk_free(newsk);
1550                        newsk = NULL;
1551                        goto out;
1552                }
1553
1554                newsk->sk_err      = 0;
1555                newsk->sk_priority = 0;
1556                newsk->sk_incoming_cpu = raw_smp_processor_id();
1557                atomic64_set(&newsk->sk_cookie, 0);
1558                /*
1559                 * Before updating sk_refcnt, we must commit prior changes to memory
1560                 * (Documentation/RCU/rculist_nulls.txt for details)
1561                 */
1562                smp_wmb();
1563                atomic_set(&newsk->sk_refcnt, 2);
1564
1565                /*
1566                 * Increment the counter in the same struct proto as the master
1567                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1568                 * is the same as sk->sk_prot->socks, as this field was copied
1569                 * with memcpy).
1570                 *
1571                 * This _changes_ the previous behaviour, where
1572                 * tcp_create_openreq_child always was incrementing the
1573                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1574                 * to be taken into account in all callers. -acme
1575                 */
1576                sk_refcnt_debug_inc(newsk);
1577                sk_set_socket(newsk, NULL);
1578                newsk->sk_wq = NULL;
1579
1580                sk_update_clone(sk, newsk);
1581
1582                if (newsk->sk_prot->sockets_allocated)
1583                        sk_sockets_allocated_inc(newsk);
1584
1585                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1586                        net_enable_timestamp();
1587        }
1588out:
1589        return newsk;
1590}
1591EXPORT_SYMBOL_GPL(sk_clone_lock);
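
/*
 * A minimal usage sketch, assuming a hypothetical connection-accept path
 * (example_accept_clone is illustrative): sk_clone_lock() returns the child
 * with bh_lock_sock() held, so the caller finishes protocol-specific setup
 * and then unlocks it.
 */
static struct sock *example_accept_clone(const struct sock *listener)
{
	struct sock *child = sk_clone_lock(listener, GFP_ATOMIC);

	if (!child)
		return NULL;

	/* protocol-private initialisation of the child would go here */

	bh_unlock_sock(child);
	return child;
}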
1592
1593void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1594{
1595        __sk_dst_set(sk, dst);
1596        sk->sk_route_caps = dst->dev->features;
1597        if (sk->sk_route_caps & NETIF_F_GSO)
1598                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1599        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1600        if (sk_can_gso(sk)) {
1601                if (dst->header_len) {
1602                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1603                } else {
1604                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1605                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1606                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1607                }
1608        }
1609}
1610EXPORT_SYMBOL_GPL(sk_setup_caps);
1611
1612/*
1613 *      Simple resource managers for sockets.
1614 */
1615
1616
1617/*
1618 * Write buffer destructor automatically called from kfree_skb.
1619 */
1620void sock_wfree(struct sk_buff *skb)
1621{
1622        struct sock *sk = skb->sk;
1623        unsigned int len = skb->truesize;
1624
1625        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1626                /*
1627                 * Keep a reference on sk_wmem_alloc, this will be released
1628                 * after sk_write_space() call
1629                 */
1630                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1631                sk->sk_write_space(sk);
1632                len = 1;
1633        }
1634        /*
1635         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1636         * could not do because of in-flight packets
1637         */
1638        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1639                __sk_free(sk);
1640}
1641EXPORT_SYMBOL(sock_wfree);
1642
1643void skb_orphan_partial(struct sk_buff *skb)
1644{
1645        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1646         * so we do not completely orphan the skb, but transfer all
1647         * accounted bytes but one, to avoid unexpected reorders.
1648         */
1649        if (skb->destructor == sock_wfree
1650#ifdef CONFIG_INET
1651            || skb->destructor == tcp_wfree
1652#endif
1653                ) {
1654                atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1655                skb->truesize = 1;
1656        } else {
1657                skb_orphan(skb);
1658        }
1659}
1660EXPORT_SYMBOL(skb_orphan_partial);
1661
1662/*
1663 * Read buffer destructor automatically called from kfree_skb.
1664 */
1665void sock_rfree(struct sk_buff *skb)
1666{
1667        struct sock *sk = skb->sk;
1668        unsigned int len = skb->truesize;
1669
1670        atomic_sub(len, &sk->sk_rmem_alloc);
1671        sk_mem_uncharge(sk, len);
1672}
1673EXPORT_SYMBOL(sock_rfree);
1674
1675/*
1676 * Buffer destructor for skbs that are not used directly in read or write
1677 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1678 */
1679void sock_efree(struct sk_buff *skb)
1680{
1681        sock_put(skb->sk);
1682}
1683EXPORT_SYMBOL(sock_efree);
1684
1685kuid_t sock_i_uid(struct sock *sk)
1686{
1687        kuid_t uid;
1688
1689        read_lock_bh(&sk->sk_callback_lock);
1690        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1691        read_unlock_bh(&sk->sk_callback_lock);
1692        return uid;
1693}
1694EXPORT_SYMBOL(sock_i_uid);
1695
1696unsigned long sock_i_ino(struct sock *sk)
1697{
1698        unsigned long ino;
1699
1700        read_lock_bh(&sk->sk_callback_lock);
1701        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1702        read_unlock_bh(&sk->sk_callback_lock);
1703        return ino;
1704}
1705EXPORT_SYMBOL(sock_i_ino);
1706
1707/*
1708 * Allocate a skb from the socket's send buffer.
1709 */
1710struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1711                             gfp_t priority)
1712{
1713        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1714                struct sk_buff *skb = alloc_skb(size, priority);
1715                if (skb) {
1716                        skb_set_owner_w(skb, sk);
1717                        return skb;
1718                }
1719        }
1720        return NULL;
1721}
1722EXPORT_SYMBOL(sock_wmalloc);
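
/*
 * A minimal usage sketch (example_alloc_ctl_skb is hypothetical): allocate a
 * small control skb charged against the send buffer, forcing the allocation
 * so the sk_sndbuf check alone cannot make it fail.
 */
static struct sk_buff *example_alloc_ctl_skb(struct sock *sk, unsigned long len)
{
	/* force = 1 bypasses the sndbuf check; GFP_ATOMIC because this
	 * sketch assumes it may be called from softirq context
	 */
	return sock_wmalloc(sk, len, 1, GFP_ATOMIC);
}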
1723
1724/*
1725 * Allocate a memory block from the socket's option memory buffer.
1726 */
1727void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1728{
1729        if ((unsigned int)size <= sysctl_optmem_max &&
1730            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1731                void *mem;
1732                /* First do the add, to avoid the race if kmalloc
1733                 * might sleep.
1734                 */
1735                atomic_add(size, &sk->sk_omem_alloc);
1736                mem = kmalloc(size, priority);
1737                if (mem)
1738                        return mem;
1739                atomic_sub(size, &sk->sk_omem_alloc);
1740        }
1741        return NULL;
1742}
1743EXPORT_SYMBOL(sock_kmalloc);
1744
1745/* Free an option memory block. Note that we actually want the inline
1746 * here, as it allows gcc to detect the nullify and fold away the
1747 * condition entirely.
1748 */
1749static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1750                                  const bool nullify)
1751{
1752        if (WARN_ON_ONCE(!mem))
1753                return;
1754        if (nullify)
1755                kzfree(mem);
1756        else
1757                kfree(mem);
1758        atomic_sub(size, &sk->sk_omem_alloc);
1759}
1760
1761void sock_kfree_s(struct sock *sk, void *mem, int size)
1762{
1763        __sock_kfree_s(sk, mem, size, false);
1764}
1765EXPORT_SYMBOL(sock_kfree_s);
1766
1767void sock_kzfree_s(struct sock *sk, void *mem, int size)
1768{
1769        __sock_kfree_s(sk, mem, size, true);
1770}
1771EXPORT_SYMBOL(sock_kzfree_s);
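
/*
 * A minimal usage sketch (example_copy_opt is hypothetical): option memory
 * is charged to sk_omem_alloc by sock_kmalloc() and must be released with
 * sock_kfree_s()/sock_kzfree_s() using the same size.
 */
static int example_copy_opt(struct sock *sk, char __user *optval, int optlen)
{
	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	if (copy_from_user(buf, optval, optlen)) {
		sock_kfree_s(sk, buf, optlen);
		return -EFAULT;
	}

	/* ... use buf ...; sock_kzfree_s() would be used for key material */
	sock_kfree_s(sk, buf, optlen);
	return 0;
}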
1772
1773/* This is almost wait_for_tcp_memory minus release_sock/lock_sock.
1774 * I think these locks should be removed for datagram sockets.
1775 */
1776static long sock_wait_for_wmem(struct sock *sk, long timeo)
1777{
1778        DEFINE_WAIT(wait);
1779
1780        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1781        for (;;) {
1782                if (!timeo)
1783                        break;
1784                if (signal_pending(current))
1785                        break;
1786                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1787                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1788                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1789                        break;
1790                if (sk->sk_shutdown & SEND_SHUTDOWN)
1791                        break;
1792                if (sk->sk_err)
1793                        break;
1794                timeo = schedule_timeout(timeo);
1795        }
1796        finish_wait(sk_sleep(sk), &wait);
1797        return timeo;
1798}
1799
1800
1801/*
1802 *      Generic send/receive buffer handlers
1803 */
1804
1805struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1806                                     unsigned long data_len, int noblock,
1807                                     int *errcode, int max_page_order)
1808{
1809        struct sk_buff *skb;
1810        long timeo;
1811        int err;
1812
1813        timeo = sock_sndtimeo(sk, noblock);
1814        for (;;) {
1815                err = sock_error(sk);
1816                if (err != 0)
1817                        goto failure;
1818
1819                err = -EPIPE;
1820                if (sk->sk_shutdown & SEND_SHUTDOWN)
1821                        goto failure;
1822
1823                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1824                        break;
1825
1826                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1827                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1828                err = -EAGAIN;
1829                if (!timeo)
1830                        goto failure;
1831                if (signal_pending(current))
1832                        goto interrupted;
1833                timeo = sock_wait_for_wmem(sk, timeo);
1834        }
1835        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1836                                   errcode, sk->sk_allocation);
1837        if (skb)
1838                skb_set_owner_w(skb, sk);
1839        return skb;
1840
1841interrupted:
1842        err = sock_intr_errno(timeo);
1843failure:
1844        *errcode = err;
1845        return NULL;
1846}
1847EXPORT_SYMBOL(sock_alloc_send_pskb);
1848
1849struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1850                                    int noblock, int *errcode)
1851{
1852        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1853}
1854EXPORT_SYMBOL(sock_alloc_send_skb);
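
/*
 * A minimal usage sketch (example_sendmsg_alloc is hypothetical): a datagram
 * sendmsg path blocks for send-buffer space unless MSG_DONTWAIT was given,
 * reserving some headroom for lower-layer headers.
 */
static struct sk_buff *example_sendmsg_alloc(struct sock *sk, size_t len,
					     int flags, int *err)
{
	return sock_alloc_send_skb(sk, len + MAX_HEADER,
				   flags & MSG_DONTWAIT, err);
}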
1855
1856/* On 32bit arches, an skb frag is limited to 2^15 */
1857#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1858
1859/**
1860 * skb_page_frag_refill - check that a page_frag contains enough room
1861 * @sz: minimum size of the fragment we want to get
1862 * @pfrag: pointer to page_frag
1863 * @gfp: priority for memory allocation
1864 *
1865 * Note: While this allocator tries to use high order pages, there is
1866 * no guarantee that allocations succeed. Therefore, @sz MUST be
1867 * less than or equal to PAGE_SIZE.
1868 */
1869bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1870{
1871        if (pfrag->page) {
1872                if (atomic_read(&pfrag->page->_count) == 1) {
1873                        pfrag->offset = 0;
1874                        return true;
1875                }
1876                if (pfrag->offset + sz <= pfrag->size)
1877                        return true;
1878                put_page(pfrag->page);
1879        }
1880
1881        pfrag->offset = 0;
1882        if (SKB_FRAG_PAGE_ORDER) {
1883                pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
1884                                          __GFP_NOWARN | __GFP_NORETRY,
1885                                          SKB_FRAG_PAGE_ORDER);
1886                if (likely(pfrag->page)) {
1887                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1888                        return true;
1889                }
1890        }
1891        pfrag->page = alloc_page(gfp);
1892        if (likely(pfrag->page)) {
1893                pfrag->size = PAGE_SIZE;
1894                return true;
1895        }
1896        return false;
1897}
1898EXPORT_SYMBOL(skb_page_frag_refill);
1899
1900bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1901{
1902        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1903                return true;
1904
1905        sk_enter_memory_pressure(sk);
1906        sk_stream_moderate_sndbuf(sk);
1907        return false;
1908}
1909EXPORT_SYMBOL(sk_page_frag_refill);
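
/*
 * A minimal usage sketch (example_append_frag is hypothetical): refill the
 * per-socket (or per-task) page_frag and copy user data into it, as a
 * coalescing sendmsg path might do.
 */
static int example_append_frag(struct sock *sk, struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}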
1910
1911static void __lock_sock(struct sock *sk)
1912        __releases(&sk->sk_lock.slock)
1913        __acquires(&sk->sk_lock.slock)
1914{
1915        DEFINE_WAIT(wait);
1916
1917        for (;;) {
1918                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1919                                        TASK_UNINTERRUPTIBLE);
1920                spin_unlock_bh(&sk->sk_lock.slock);
1921                schedule();
1922                spin_lock_bh(&sk->sk_lock.slock);
1923                if (!sock_owned_by_user(sk))
1924                        break;
1925        }
1926        finish_wait(&sk->sk_lock.wq, &wait);
1927}
1928
1929static void __release_sock(struct sock *sk)
1930        __releases(&sk->sk_lock.slock)
1931        __acquires(&sk->sk_lock.slock)
1932{
1933        struct sk_buff *skb = sk->sk_backlog.head;
1934
1935        do {
1936                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1937                bh_unlock_sock(sk);
1938
1939                do {
1940                        struct sk_buff *next = skb->next;
1941
1942                        prefetch(next);
1943                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1944                        skb->next = NULL;
1945                        sk_backlog_rcv(sk, skb);
1946
1947                        /*
1948                         * We are in process context here with softirqs
1949                         * disabled, use cond_resched_softirq() to preempt.
1950                         * This is safe to do because we've taken the backlog
1951                         * queue private:
1952                         */
1953                        cond_resched_softirq();
1954
1955                        skb = next;
1956                } while (skb != NULL);
1957
1958                bh_lock_sock(sk);
1959        } while ((skb = sk->sk_backlog.head) != NULL);
1960
1961        /*
1962         * Doing the zeroing here guarantees we cannot loop forever
1963         * while a wild producer attempts to flood us.
1964         */
1965        sk->sk_backlog.len = 0;
1966}
1967
1968/**
1969 * sk_wait_data - wait for data to arrive at sk_receive_queue
1970 * @sk:    sock to wait on
1971 * @timeo: for how long
1972 *
1973 * Now the socket state, including sk->sk_err, is changed only under the lock,
1974 * hence we may omit checks after joining the wait queue.
1975 * We check the receive queue before schedule() only as an optimization;
1976 * it is very likely that release_sock() added new data.
1977 */
1978int sk_wait_data(struct sock *sk, long *timeo)
1979{
1980        int rc;
1981        DEFINE_WAIT(wait);
1982
1983        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1984        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1985        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1986        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1987        finish_wait(sk_sleep(sk), &wait);
1988        return rc;
1989}
1990EXPORT_SYMBOL(sk_wait_data);
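
/*
 * A minimal usage sketch (example_wait_for_data is hypothetical): a recvmsg
 * implementation, holding lock_sock(), waits for data or a timeout;
 * sk_wait_data() drops and re-takes the socket lock around the sleep.
 */
static int example_wait_for_data(struct sock *sk, int nonblock)
{
	long timeo = sock_rcvtimeo(sk, nonblock);

	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (!timeo)
			return -EAGAIN;
		if (signal_pending(current))
			return sock_intr_errno(timeo);
		sk_wait_data(sk, &timeo);
	}
	return 0;
}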
1991
1992/**
1993 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1994 *      @sk: socket
1995 *      @size: memory size to allocate
1996 *      @kind: allocation type
1997 *
1998 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1999 *      rmem allocation. This function assumes that protocols which have
2000 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2001 */
2002int __sk_mem_schedule(struct sock *sk, int size, int kind)
2003{
2004        struct proto *prot = sk->sk_prot;
2005        int amt = sk_mem_pages(size);
2006        long allocated;
2007        int parent_status = UNDER_LIMIT;
2008
2009        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2010
2011        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2012
2013        /* Under limit. */
2014        if (parent_status == UNDER_LIMIT &&
2015                        allocated <= sk_prot_mem_limits(sk, 0)) {
2016                sk_leave_memory_pressure(sk);
2017                return 1;
2018        }
2019
2020        /* Under pressure. (we or our parents) */
2021        if ((parent_status > SOFT_LIMIT) ||
2022                        allocated > sk_prot_mem_limits(sk, 1))
2023                sk_enter_memory_pressure(sk);
2024
2025        /* Over hard limit (we or our parents) */
2026        if ((parent_status == OVER_LIMIT) ||
2027                        (allocated > sk_prot_mem_limits(sk, 2)))
2028                goto suppress_allocation;
2029
2030        /* guarantee minimum buffer size under pressure */
2031        if (kind == SK_MEM_RECV) {
2032                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2033                        return 1;
2034
2035        } else { /* SK_MEM_SEND */
2036                if (sk->sk_type == SOCK_STREAM) {
2037                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2038                                return 1;
2039                } else if (atomic_read(&sk->sk_wmem_alloc) <
2040                           prot->sysctl_wmem[0])
2041                                return 1;
2042        }
2043
2044        if (sk_has_memory_pressure(sk)) {
2045                int alloc;
2046
2047                if (!sk_under_memory_pressure(sk))
2048                        return 1;
2049                alloc = sk_sockets_allocated_read_positive(sk);
2050                if (sk_prot_mem_limits(sk, 2) > alloc *
2051                    sk_mem_pages(sk->sk_wmem_queued +
2052                                 atomic_read(&sk->sk_rmem_alloc) +
2053                                 sk->sk_forward_alloc))
2054                        return 1;
2055        }
2056
2057suppress_allocation:
2058
2059        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2060                sk_stream_moderate_sndbuf(sk);
2061
2062                /* Fail only if socket is _under_ its sndbuf.
2063                 * In this case we cannot block, so we have to fail.
2064                 */
2065                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2066                        return 1;
2067        }
2068
2069        trace_sock_exceed_buf_limit(sk, prot, allocated);
2070
2071        /* Alas. Undo changes. */
2072        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2073
2074        sk_memory_allocated_sub(sk, amt);
2075
2076        return 0;
2077}
2078EXPORT_SYMBOL(__sk_mem_schedule);
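
/*
 * A minimal usage sketch (example_charge_skb is hypothetical): protocols
 * normally reach __sk_mem_schedule() through the sk_wmem_schedule() /
 * sk_rmem_schedule() wrappers, which consume sk_forward_alloc first.
 */
static bool example_charge_skb(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return false;

	/* account the skb against the forward allocation just granted */
	sk_mem_charge(sk, skb->truesize);
	return true;
}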
2079
2080/**
2081 *      __sk_mem_reclaim - reclaim memory_allocated
2082 *      @sk: socket
2083 */
2084void __sk_mem_reclaim(struct sock *sk)
2085{
2086        sk_memory_allocated_sub(sk,
2087                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2088        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2089
2090        if (sk_under_memory_pressure(sk) &&
2091            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2092                sk_leave_memory_pressure(sk);
2093}
2094EXPORT_SYMBOL(__sk_mem_reclaim);
2095
2096
2097/*
2098 * Set of default routines for initialising struct proto_ops when
2099 * the protocol does not support a particular function. In certain
2100 * cases where it makes no sense for a protocol to have a "do nothing"
2101 * function, some default processing is provided.
2102 */
2103
2104int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2105{
2106        return -EOPNOTSUPP;
2107}
2108EXPORT_SYMBOL(sock_no_bind);
2109
2110int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2111                    int len, int flags)
2112{
2113        return -EOPNOTSUPP;
2114}
2115EXPORT_SYMBOL(sock_no_connect);
2116
2117int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2118{
2119        return -EOPNOTSUPP;
2120}
2121EXPORT_SYMBOL(sock_no_socketpair);
2122
2123int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2124{
2125        return -EOPNOTSUPP;
2126}
2127EXPORT_SYMBOL(sock_no_accept);
2128
2129int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2130                    int *len, int peer)
2131{
2132        return -EOPNOTSUPP;
2133}
2134EXPORT_SYMBOL(sock_no_getname);
2135
2136unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2137{
2138        return 0;
2139}
2140EXPORT_SYMBOL(sock_no_poll);
2141
2142int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2143{
2144        return -EOPNOTSUPP;
2145}
2146EXPORT_SYMBOL(sock_no_ioctl);
2147
2148int sock_no_listen(struct socket *sock, int backlog)
2149{
2150        return -EOPNOTSUPP;
2151}
2152EXPORT_SYMBOL(sock_no_listen);
2153
2154int sock_no_shutdown(struct socket *sock, int how)
2155{
2156        return -EOPNOTSUPP;
2157}
2158EXPORT_SYMBOL(sock_no_shutdown);
2159
2160int sock_no_setsockopt(struct socket *sock, int level, int optname,
2161                    char __user *optval, unsigned int optlen)
2162{
2163        return -EOPNOTSUPP;
2164}
2165EXPORT_SYMBOL(sock_no_setsockopt);
2166
2167int sock_no_getsockopt(struct socket *sock, int level, int optname,
2168                    char __user *optval, int __user *optlen)
2169{
2170        return -EOPNOTSUPP;
2171}
2172EXPORT_SYMBOL(sock_no_getsockopt);
2173
2174int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2175{
2176        return -EOPNOTSUPP;
2177}
2178EXPORT_SYMBOL(sock_no_sendmsg);
2179
2180int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2181                    int flags)
2182{
2183        return -EOPNOTSUPP;
2184}
2185EXPORT_SYMBOL(sock_no_recvmsg);
2186
2187int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2188{
2189        /* Mirror missing mmap method error code */
2190        return -ENODEV;
2191}
2192EXPORT_SYMBOL(sock_no_mmap);
2193
2194ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2195{
2196        ssize_t res;
2197        struct msghdr msg = {.msg_flags = flags};
2198        struct kvec iov;
2199        char *kaddr = kmap(page);
2200        iov.iov_base = kaddr + offset;
2201        iov.iov_len = size;
2202        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2203        kunmap(page);
2204        return res;
2205}
2206EXPORT_SYMBOL(sock_no_sendpage);
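
/*
 * An illustrative sketch (example_dgram_ops is hypothetical): a
 * connection-less family can plug the sock_no_*() stubs into its proto_ops
 * for the operations it does not implement; .release, .bind, .sendmsg,
 * .recvmsg and friends would point at real handlers.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.listen		= sock_no_listen,
	.accept		= sock_no_accept,
	.socketpair	= sock_no_socketpair,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};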
2207
2208/*
2209 *      Default Socket Callbacks
2210 */
2211
2212static void sock_def_wakeup(struct sock *sk)
2213{
2214        struct socket_wq *wq;
2215
2216        rcu_read_lock();
2217        wq = rcu_dereference(sk->sk_wq);
2218        if (wq_has_sleeper(wq))
2219                wake_up_interruptible_all(&wq->wait);
2220        rcu_read_unlock();
2221}
2222
2223static void sock_def_error_report(struct sock *sk)
2224{
2225        struct socket_wq *wq;
2226
2227        rcu_read_lock();
2228        wq = rcu_dereference(sk->sk_wq);
2229        if (wq_has_sleeper(wq))
2230                wake_up_interruptible_poll(&wq->wait, POLLERR);
2231        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2232        rcu_read_unlock();
2233}
2234
2235static void sock_def_readable(struct sock *sk)
2236{
2237        struct socket_wq *wq;
2238
2239        rcu_read_lock();
2240        wq = rcu_dereference(sk->sk_wq);
2241        if (wq_has_sleeper(wq))
2242                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2243                                                POLLRDNORM | POLLRDBAND);
2244        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2245        rcu_read_unlock();
2246}
2247
2248static void sock_def_write_space(struct sock *sk)
2249{
2250        struct socket_wq *wq;
2251
2252        rcu_read_lock();
2253
2254        /* Do not wake up a writer until he can make "significant"
2255         * progress.  --DaveM
2256         */
2257        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2258                wq = rcu_dereference(sk->sk_wq);
2259                if (wq_has_sleeper(wq))
2260                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2261                                                POLLWRNORM | POLLWRBAND);
2262
2263                /* Should agree with poll, otherwise some programs break */
2264                if (sock_writeable(sk))
2265                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2266        }
2267
2268        rcu_read_unlock();
2269}
2270
2271static void sock_def_destruct(struct sock *sk)
2272{
2273        kfree(sk->sk_protinfo);
2274}
2275
2276void sk_send_sigurg(struct sock *sk)
2277{
2278        if (sk->sk_socket && sk->sk_socket->file)
2279                if (send_sigurg(&sk->sk_socket->file->f_owner))
2280                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2281}
2282EXPORT_SYMBOL(sk_send_sigurg);
2283
2284void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2285                    unsigned long expires)
2286{
2287        if (!mod_timer(timer, expires))
2288                sock_hold(sk);
2289}
2290EXPORT_SYMBOL(sk_reset_timer);
2291
2292void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2293{
2294        if (del_timer(timer))
2295                __sock_put(sk);
2296}
2297EXPORT_SYMBOL(sk_stop_timer);
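
/*
 * A minimal usage sketch (example_timer_handler is hypothetical): the timer
 * would be set up with setup_timer(&sk->sk_timer, example_timer_handler,
 * (unsigned long)sk) and armed via sk_reset_timer(), which takes a socket
 * reference; the handler (or sk_stop_timer()) must drop it again.
 */
static void example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* protocol timeout processing would go here */
	bh_unlock_sock(sk);

	sock_put(sk);	/* release the reference taken by sk_reset_timer() */
}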
2298
2299void sock_init_data(struct socket *sock, struct sock *sk)
2300{
2301        skb_queue_head_init(&sk->sk_receive_queue);
2302        skb_queue_head_init(&sk->sk_write_queue);
2303        skb_queue_head_init(&sk->sk_error_queue);
2304
2305        sk->sk_send_head        =       NULL;
2306
2307        init_timer(&sk->sk_timer);
2308
2309        sk->sk_allocation       =       GFP_KERNEL;
2310        sk->sk_rcvbuf           =       sysctl_rmem_default;
2311        sk->sk_sndbuf           =       sysctl_wmem_default;
2312        sk->sk_state            =       TCP_CLOSE;
2313        sk_set_socket(sk, sock);
2314
2315        sock_set_flag(sk, SOCK_ZAPPED);
2316
2317        if (sock) {
2318                sk->sk_type     =       sock->type;
2319                sk->sk_wq       =       sock->wq;
2320                sock->sk        =       sk;
2321        } else
2322                sk->sk_wq       =       NULL;
2323
2324        spin_lock_init(&sk->sk_dst_lock);
2325        rwlock_init(&sk->sk_callback_lock);
2326        lockdep_set_class_and_name(&sk->sk_callback_lock,
2327                        af_callback_keys + sk->sk_family,
2328                        af_family_clock_key_strings[sk->sk_family]);
2329
2330        sk->sk_state_change     =       sock_def_wakeup;
2331        sk->sk_data_ready       =       sock_def_readable;
2332        sk->sk_write_space      =       sock_def_write_space;
2333        sk->sk_error_report     =       sock_def_error_report;
2334        sk->sk_destruct         =       sock_def_destruct;
2335
2336        sk->sk_frag.page        =       NULL;
2337        sk->sk_frag.offset      =       0;
2338        sk->sk_peek_off         =       -1;
2339
2340        sk->sk_peer_pid         =       NULL;
2341        sk->sk_peer_cred        =       NULL;
2342        sk->sk_write_pending    =       0;
2343        sk->sk_rcvlowat         =       1;
2344        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2345        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2346
2347        sk->sk_stamp = ktime_set(-1L, 0);
2348
2349#ifdef CONFIG_NET_RX_BUSY_POLL
2350        sk->sk_napi_id          =       0;
2351        sk->sk_ll_usec          =       sysctl_net_busy_read;
2352#endif
2353
2354        sk->sk_max_pacing_rate = ~0U;
2355        sk->sk_pacing_rate = ~0U;
2356        /*
2357         * Before updating sk_refcnt, we must commit prior changes to memory
2358         * (Documentation/RCU/rculist_nulls.txt for details)
2359         */
2360        smp_wmb();
2361        atomic_set(&sk->sk_refcnt, 1);
2362        atomic_set(&sk->sk_drops, 0);
2363}
2364EXPORT_SYMBOL(sock_init_data);
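
/*
 * A minimal usage sketch (example_create is hypothetical): an address
 * family's create handler allocates the sock and lets sock_init_data()
 * install the default queues, callbacks and buffer limits before doing
 * protocol-specific setup.
 */
static int example_create(struct net *net, struct socket *sock,
			  struct proto *prot, int protocol)
{
	struct sock *sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, prot);

	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	sk->sk_protocol = protocol;
	return 0;
}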
2365
2366void lock_sock_nested(struct sock *sk, int subclass)
2367{
2368        might_sleep();
2369        spin_lock_bh(&sk->sk_lock.slock);
2370        if (sk->sk_lock.owned)
2371                __lock_sock(sk);
2372        sk->sk_lock.owned = 1;
2373        spin_unlock(&sk->sk_lock.slock);
2374        /*
2375         * The sk_lock has mutex_lock() semantics here:
2376         */
2377        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2378        local_bh_enable();
2379}
2380EXPORT_SYMBOL(lock_sock_nested);
2381
2382void release_sock(struct sock *sk)
2383{
2384        /*
2385         * The sk_lock has mutex_unlock() semantics:
2386         */
2387        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2388
2389        spin_lock_bh(&sk->sk_lock.slock);
2390        if (sk->sk_backlog.tail)
2391                __release_sock(sk);
2392
2393        /* Warning : release_cb() might need to release sk ownership,
2394         * ie call sock_release_ownership(sk) before us.
2395         */
2396        if (sk->sk_prot->release_cb)
2397                sk->sk_prot->release_cb(sk);
2398
2399        sock_release_ownership(sk);
2400        if (waitqueue_active(&sk->sk_lock.wq))
2401                wake_up(&sk->sk_lock.wq);
2402        spin_unlock_bh(&sk->sk_lock.slock);
2403}
2404EXPORT_SYMBOL(release_sock);
2405
2406/**
2407 * lock_sock_fast - fast version of lock_sock
2408 * @sk: socket
2409 *
2410 * This version should be used for very small sections, where the process won't block.
2411 * Returns false if the fast path is taken:
2412 *   sk_lock.slock locked, owned = 0, BH disabled
2413 * Returns true if the slow path is taken:
2414 *   sk_lock.slock unlocked, owned = 1, BH enabled
2415 */
2416bool lock_sock_fast(struct sock *sk)
2417{
2418        might_sleep();
2419        spin_lock_bh(&sk->sk_lock.slock);
2420
2421        if (!sk->sk_lock.owned)
2422                /*
2423                 * Note : We must disable BH
2424                 */
2425                return false;
2426
2427        __lock_sock(sk);
2428        sk->sk_lock.owned = 1;
2429        spin_unlock(&sk->sk_lock.slock);
2430        /*
2431         * The sk_lock has mutex_lock() semantics here:
2432         */
2433        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2434        local_bh_enable();
2435        return true;
2436}
2437EXPORT_SYMBOL(lock_sock_fast);
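
/*
 * A minimal usage sketch (example_purge_queue is hypothetical): pair
 * lock_sock_fast() with unlock_sock_fast(), passing back the "slow" result
 * so the matching unlock path is taken.
 */
static void example_purge_queue(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	skb_queue_purge(&sk->sk_receive_queue);
	unlock_sock_fast(sk, slow);
}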
2438
2439int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2440{
2441        struct timeval tv;
2442        if (!sock_flag(sk, SOCK_TIMESTAMP))
2443                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2444        tv = ktime_to_timeval(sk->sk_stamp);
2445        if (tv.tv_sec == -1)
2446                return -ENOENT;
2447        if (tv.tv_sec == 0) {
2448                sk->sk_stamp = ktime_get_real();
2449                tv = ktime_to_timeval(sk->sk_stamp);
2450        }
2451        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2452}
2453EXPORT_SYMBOL(sock_get_timestamp);
2454
2455int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2456{
2457        struct timespec ts;
2458        if (!sock_flag(sk, SOCK_TIMESTAMP))
2459                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2460        ts = ktime_to_timespec(sk->sk_stamp);
2461        if (ts.tv_sec == -1)
2462                return -ENOENT;
2463        if (ts.tv_sec == 0) {
2464                sk->sk_stamp = ktime_get_real();
2465                ts = ktime_to_timespec(sk->sk_stamp);
2466        }
2467        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2468}
2469EXPORT_SYMBOL(sock_get_timestampns);
2470
2471void sock_enable_timestamp(struct sock *sk, int flag)
2472{
2473        if (!sock_flag(sk, flag)) {
2474                unsigned long previous_flags = sk->sk_flags;
2475
2476                sock_set_flag(sk, flag);
2477                /*
2478                 * we just set one of the two flags which require net
2479                 * time stamping, but time stamping might have been on
2480                 * already because of the other one
2481                 */
2482                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2483                        net_enable_timestamp();
2484        }
2485}
2486
2487int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2488                       int level, int type)
2489{
2490        struct sock_exterr_skb *serr;
2491        struct sk_buff *skb;
2492        int copied, err;
2493
2494        err = -EAGAIN;
2495        skb = sock_dequeue_err_skb(sk);
2496        if (skb == NULL)
2497                goto out;
2498
2499        copied = skb->len;
2500        if (copied > len) {
2501                msg->msg_flags |= MSG_TRUNC;
2502                copied = len;
2503        }
2504        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2505        if (err)
2506                goto out_free_skb;
2507
2508        sock_recv_timestamp(msg, sk, skb);
2509
2510        serr = SKB_EXT_ERR(skb);
2511        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2512
2513        msg->msg_flags |= MSG_ERRQUEUE;
2514        err = copied;
2515
2516out_free_skb:
2517        kfree_skb(skb);
2518out:
2519        return err;
2520}
2521EXPORT_SYMBOL(sock_recv_errqueue);
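
/*
 * A minimal usage sketch (example_recvmsg_err is hypothetical): a protocol's
 * recvmsg can dispatch MSG_ERRQUEUE requests to sock_recv_errqueue() before
 * touching the normal receive queue, passing the cmsg level/type it uses
 * for its error reports.
 */
static int example_recvmsg_err(struct sock *sk, struct msghdr *msg, int len,
			       int flags, int level, int type)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, level, type);

	/* the normal receive path would follow here */
	return -EAGAIN;
}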
2522
2523/*
2524 *      Get a socket option on a socket.
2525 *
2526 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2527 *      asynchronous errors should be reported by getsockopt. We assume
2528 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2529 */
2530int sock_common_getsockopt(struct socket *sock, int level, int optname,
2531                           char __user *optval, int __user *optlen)
2532{
2533        struct sock *sk = sock->sk;
2534
2535        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2536}
2537EXPORT_SYMBOL(sock_common_getsockopt);
2538
2539#ifdef CONFIG_COMPAT
2540int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2541                                  char __user *optval, int __user *optlen)
2542{
2543        struct sock *sk = sock->sk;
2544
2545        if (sk->sk_prot->compat_getsockopt != NULL)
2546                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2547                                                      optval, optlen);
2548        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2549}
2550EXPORT_SYMBOL(compat_sock_common_getsockopt);
2551#endif
2552
2553int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2554                        int flags)
2555{
2556        struct sock *sk = sock->sk;
2557        int addr_len = 0;
2558        int err;
2559
2560        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2561                                   flags & ~MSG_DONTWAIT, &addr_len);
2562        if (err >= 0)
2563                msg->msg_namelen = addr_len;
2564        return err;
2565}
2566EXPORT_SYMBOL(sock_common_recvmsg);
2567
2568/*
2569 *      Set socket options on an inet socket.
2570 */
2571int sock_common_setsockopt(struct socket *sock, int level, int optname,
2572                           char __user *optval, unsigned int optlen)
2573{
2574        struct sock *sk = sock->sk;
2575
2576        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2577}
2578EXPORT_SYMBOL(sock_common_setsockopt);
2579
2580#ifdef CONFIG_COMPAT
2581int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2582                                  char __user *optval, unsigned int optlen)
2583{
2584        struct sock *sk = sock->sk;
2585
2586        if (sk->sk_prot->compat_setsockopt != NULL)
2587                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2588                                                      optval, optlen);
2589        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2590}
2591EXPORT_SYMBOL(compat_sock_common_setsockopt);
2592#endif
2593
2594void sk_common_release(struct sock *sk)
2595{
2596        if (sk->sk_prot->destroy)
2597                sk->sk_prot->destroy(sk);
2598
2599        /*
2600         * Observation: when sk_common_release is called, processes have
2601         * no access to the socket, but the network still does.
2602         * Step one, detach it from networking:
2603         *
2604         * A. Remove from hash tables.
2605         */
2606
2607        sk->sk_prot->unhash(sk);
2608
2609        /*
2610         * At this point the socket cannot receive new packets, but it is possible
2611         * that some packets are in flight because some CPU runs the receiver and
2612         * did the hash table lookup before we unhashed the socket. They will reach
2613         * the receive queue and be purged by the socket destructor.
2614         *
2615         * Also, we still have packets pending on the receive queue and probably
2616         * our own packets waiting in device queues. sock_destroy will drain the
2617         * receive queue, but transmitted packets will delay socket destruction
2618         * until the last reference is released.
2619         */
2620
2621        sock_orphan(sk);
2622
2623        xfrm_sk_free_policy(sk);
2624
2625        sk_refcnt_debug_release(sk);
2626
2627        if (sk->sk_frag.page) {
2628                put_page(sk->sk_frag.page);
2629                sk->sk_frag.page = NULL;
2630        }
2631
2632        sock_put(sk);
2633}
2634EXPORT_SYMBOL(sk_common_release);
2635
2636#ifdef CONFIG_PROC_FS
2637#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2638struct prot_inuse {
2639        int val[PROTO_INUSE_NR];
2640};
2641
2642static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2643
2644#ifdef CONFIG_NET_NS
2645void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2646{
2647        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2648}
2649EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2650
2651int sock_prot_inuse_get(struct net *net, struct proto *prot)
2652{
2653        int cpu, idx = prot->inuse_idx;
2654        int res = 0;
2655
2656        for_each_possible_cpu(cpu)
2657                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2658
2659        return res >= 0 ? res : 0;
2660}
2661EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2662
2663static int __net_init sock_inuse_init_net(struct net *net)
2664{
2665        net->core.inuse = alloc_percpu(struct prot_inuse);
2666        return net->core.inuse ? 0 : -ENOMEM;
2667}
2668
2669static void __net_exit sock_inuse_exit_net(struct net *net)
2670{
2671        free_percpu(net->core.inuse);
2672}
2673
2674static struct pernet_operations net_inuse_ops = {
2675        .init = sock_inuse_init_net,
2676        .exit = sock_inuse_exit_net,
2677};
2678
2679static __init int net_inuse_init(void)
2680{
2681        if (register_pernet_subsys(&net_inuse_ops))
2682                panic("Cannot initialize net inuse counters");
2683
2684        return 0;
2685}
2686
2687core_initcall(net_inuse_init);
2688#else
2689static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2690
2691void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2692{
2693        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2694}
2695EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2696
2697int sock_prot_inuse_get(struct net *net, struct proto *prot)
2698{
2699        int cpu, idx = prot->inuse_idx;
2700        int res = 0;
2701
2702        for_each_possible_cpu(cpu)
2703                res += per_cpu(prot_inuse, cpu).val[idx];
2704
2705        return res >= 0 ? res : 0;
2706}
2707EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2708#endif
2709
2710static void assign_proto_idx(struct proto *prot)
2711{
2712        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2713
2714        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2715                pr_err("PROTO_INUSE_NR exhausted\n");
2716                return;
2717        }
2718
2719        set_bit(prot->inuse_idx, proto_inuse_idx);
2720}
2721
2722static void release_proto_idx(struct proto *prot)
2723{
2724        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2725                clear_bit(prot->inuse_idx, proto_inuse_idx);
2726}
2727#else
2728static inline void assign_proto_idx(struct proto *prot)
2729{
2730}
2731
2732static inline void release_proto_idx(struct proto *prot)
2733{
2734}
2735#endif
2736
2737static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2738{
2739        if (!rsk_prot)
2740                return;
2741        kfree(rsk_prot->slab_name);
2742        rsk_prot->slab_name = NULL;
2743        if (rsk_prot->slab) {
2744                kmem_cache_destroy(rsk_prot->slab);
2745                rsk_prot->slab = NULL;
2746        }
2747}
2748
2749static int req_prot_init(const struct proto *prot)
2750{
2751        struct request_sock_ops *rsk_prot = prot->rsk_prot;
2752
2753        if (!rsk_prot)
2754                return 0;
2755
2756        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2757                                        prot->name);
2758        if (!rsk_prot->slab_name)
2759                return -ENOMEM;
2760
2761        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2762                                           rsk_prot->obj_size, 0,
2763                                           0, NULL);
2764
2765        if (!rsk_prot->slab) {
2766                pr_crit("%s: Can't create request sock SLAB cache!\n",
2767                        prot->name);
2768                return -ENOMEM;
2769        }
2770        return 0;
2771}
2772
2773int proto_register(struct proto *prot, int alloc_slab)
2774{
2775        if (alloc_slab) {
2776                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2777                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2778                                        NULL);
2779
2780                if (prot->slab == NULL) {
2781                        pr_crit("%s: Can't create sock SLAB cache!\n",
2782                                prot->name);
2783                        goto out;
2784                }
2785
2786                if (req_prot_init(prot))
2787                        goto out_free_request_sock_slab;
2788
2789                if (prot->twsk_prot != NULL) {
2790                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2791
2792                        if (prot->twsk_prot->twsk_slab_name == NULL)
2793                                goto out_free_request_sock_slab;
2794
2795                        prot->twsk_prot->twsk_slab =
2796                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2797                                                  prot->twsk_prot->twsk_obj_size,
2798                                                  0,
2799                                                  prot->slab_flags,
2800                                                  NULL);
2801                        if (prot->twsk_prot->twsk_slab == NULL)
2802                                goto out_free_timewait_sock_slab_name;
2803                }
2804        }
2805
2806        mutex_lock(&proto_list_mutex);
2807        list_add(&prot->node, &proto_list);
2808        assign_proto_idx(prot);
2809        mutex_unlock(&proto_list_mutex);
2810        return 0;
2811
2812out_free_timewait_sock_slab_name:
2813        kfree(prot->twsk_prot->twsk_slab_name);
2814out_free_request_sock_slab:
2815        req_prot_cleanup(prot->rsk_prot);
2816
2817        kmem_cache_destroy(prot->slab);
2818        prot->slab = NULL;
2819out:
2820        return -ENOBUFS;
2821}
2822EXPORT_SYMBOL(proto_register);
2823
2824void proto_unregister(struct proto *prot)
2825{
2826        mutex_lock(&proto_list_mutex);
2827        release_proto_idx(prot);
2828        list_del(&prot->node);
2829        mutex_unlock(&proto_list_mutex);
2830
2831        if (prot->slab != NULL) {
2832                kmem_cache_destroy(prot->slab);
2833                prot->slab = NULL;
2834        }
2835
2836        req_prot_cleanup(prot->rsk_prot);
2837
2838        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2839                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2840                kfree(prot->twsk_prot->twsk_slab_name);
2841                prot->twsk_prot->twsk_slab = NULL;
2842        }
2843}
2844EXPORT_SYMBOL(proto_unregister);
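
/*
 * A minimal usage sketch (example_proto and the init/exit functions are
 * hypothetical): a protocol module registers its struct proto, with a
 * dedicated slab cache, at init time and unregisters it on exit.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}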
2845
2846#ifdef CONFIG_PROC_FS
2847static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2848        __acquires(proto_list_mutex)
2849{
2850        mutex_lock(&proto_list_mutex);
2851        return seq_list_start_head(&proto_list, *pos);
2852}
2853
2854static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2855{
2856        return seq_list_next(v, &proto_list, pos);
2857}
2858
2859static void proto_seq_stop(struct seq_file *seq, void *v)
2860        __releases(proto_list_mutex)
2861{
2862        mutex_unlock(&proto_list_mutex);
2863}
2864
2865static char proto_method_implemented(const void *method)
2866{
2867        return method == NULL ? 'n' : 'y';
2868}
2869static long sock_prot_memory_allocated(struct proto *proto)
2870{
2871        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2872}
2873
2874static char *sock_prot_memory_pressure(struct proto *proto)
2875{
2876        return proto->memory_pressure != NULL ?
2877        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2878}
2879
2880static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2881{
2882
2883        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2884                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2885                   proto->name,
2886                   proto->obj_size,
2887                   sock_prot_inuse_get(seq_file_net(seq), proto),
2888                   sock_prot_memory_allocated(proto),
2889                   sock_prot_memory_pressure(proto),
2890                   proto->max_header,
2891                   proto->slab == NULL ? "no" : "yes",
2892                   module_name(proto->owner),
2893                   proto_method_implemented(proto->close),
2894                   proto_method_implemented(proto->connect),
2895                   proto_method_implemented(proto->disconnect),
2896                   proto_method_implemented(proto->accept),
2897                   proto_method_implemented(proto->ioctl),
2898                   proto_method_implemented(proto->init),
2899                   proto_method_implemented(proto->destroy),
2900                   proto_method_implemented(proto->shutdown),
2901                   proto_method_implemented(proto->setsockopt),
2902                   proto_method_implemented(proto->getsockopt),
2903                   proto_method_implemented(proto->sendmsg),
2904                   proto_method_implemented(proto->recvmsg),
2905                   proto_method_implemented(proto->sendpage),
2906                   proto_method_implemented(proto->bind),
2907                   proto_method_implemented(proto->backlog_rcv),
2908                   proto_method_implemented(proto->hash),
2909                   proto_method_implemented(proto->unhash),
2910                   proto_method_implemented(proto->get_port),
2911                   proto_method_implemented(proto->enter_memory_pressure));
2912}
2913
2914static int proto_seq_show(struct seq_file *seq, void *v)
2915{
2916        if (v == &proto_list)
2917                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2918                           "protocol",
2919                           "size",
2920                           "sockets",
2921                           "memory",
2922                           "press",
2923                           "maxhdr",
2924                           "slab",
2925                           "module",
2926                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2927        else
2928                proto_seq_printf(seq, list_entry(v, struct proto, node));
2929        return 0;
2930}
2931
2932static const struct seq_operations proto_seq_ops = {
2933        .start  = proto_seq_start,
2934        .next   = proto_seq_next,
2935        .stop   = proto_seq_stop,
2936        .show   = proto_seq_show,
2937};
2938
2939static int proto_seq_open(struct inode *inode, struct file *file)
2940{
2941        return seq_open_net(inode, file, &proto_seq_ops,
2942                            sizeof(struct seq_net_private));
2943}
2944
2945static const struct file_operations proto_seq_fops = {
2946        .owner          = THIS_MODULE,
2947        .open           = proto_seq_open,
2948        .read           = seq_read,
2949        .llseek         = seq_lseek,
2950        .release        = seq_release_net,
2951};
2952
2953static __net_init int proto_init_net(struct net *net)
2954{
2955        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2956                return -ENOMEM;
2957
2958        return 0;
2959}
2960
2961static __net_exit void proto_exit_net(struct net *net)
2962{
2963        remove_proc_entry("protocols", net->proc_net);
2964}
2965
2966
2967static __net_initdata struct pernet_operations proto_net_ops = {
2968        .init = proto_init_net,
2969        .exit = proto_exit_net,
2970};
2971
2972static int __init proto_init(void)
2973{
2974        return register_pernet_subsys(&proto_net_ops);
2975}
2976
2977subsys_initcall(proto_init);
2978
2979#endif /* PROC_FS */
2980