linux/net/core/sock.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly,
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
        struct proto *proto;
        int ret = 0;

        mutex_lock(&proto_list_mutex);
        list_for_each_entry(proto, &proto_list, node) {
                if (proto->init_cgroup) {
                        ret = proto->init_cgroup(memcg, ss);
                        if (ret)
                                goto out;
                }
        }

        mutex_unlock(&proto_list_mutex);
        return ret;
out:
        list_for_each_entry_continue_reverse(proto, &proto_list, node)
                if (proto->destroy_cgroup)
                        proto->destroy_cgroup(memcg);
        mutex_unlock(&proto_list_mutex);
        return ret;
}

void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
{
        struct proto *proto;

        mutex_lock(&proto_list_mutex);
        list_for_each_entry_reverse(proto, &proto_list, node)
                if (proto->destroy_cgroup)
                        proto->destroy_cgroup(memcg);
        mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

#if defined(CONFIG_MEMCG_KMEM)
struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);
#endif

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
        sock_set_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation |= __GFP_MEMALLOC;
        static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
        sock_reset_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation &= ~__GFP_MEMALLOC;
        static_key_slow_dec(&memalloc_socks);

        /*
         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
         * it has rmem allocations there is a risk that the user of the
         * socket cannot make forward progress due to exceeding the rmem
         * limits. By rights, sk_clear_memalloc() should only be called
         * on sockets being torn down but warn and reset the accounting if
         * that assumption breaks.
         */
        if (WARN_ON(sk->sk_forward_alloc))
                sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
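
/* Illustrative sketch (not part of this file): a protocol servicing
 * swap-over-network traffic would typically flag its kernel socket right
 * after creating it and clear the flag again on teardown.  The swapdev_*
 * names below are hypothetical:
 *
 *      static int swapdev_open(struct swapdev *dev)
 *      {
 *              int err = sock_create_kern(AF_INET, SOCK_STREAM,
 *                                         IPPROTO_TCP, &dev->sock);
 *              if (err)
 *                      return err;
 *              sk_set_memalloc(dev->sock->sk);   // may dip into reserves
 *              return 0;
 *      }
 *
 *      static void swapdev_close(struct swapdev *dev)
 *      {
 *              sk_clear_memalloc(dev->sock->sk); // clear before teardown
 *              sock_release(dev->sock);
 *      }
 */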

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        int ret;
        unsigned long pflags = current->flags;

        /* these should have been dropped before queueing */
        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

        current->flags |= PF_MEMALLOC;
        ret = sk->sk_backlog_rcv(sk, skb);
        tsk_restore_flags(current, pflags, PF_MEMALLOC);

        return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;
        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                *timeo_p = 0;
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
                                __func__, current->comm, task_pid_nr(current));
                }
                return 0;
        }
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
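
/* Worked example (illustrative) of the conversion above: with HZ == 1000,
 * a user timeval of { .tv_sec = 2, .tv_usec = 500000 } becomes
 *
 *      2*HZ + (500000 + (1000000/HZ - 1)) / (1000000/HZ)
 *      = 2000 + 500999/1000 = 2000 + 500 = 2500 jiffies
 *      (2.5 s, with the microseconds rounded up to the next tick)
 *
 * A zero timeval, and any tv_sec too large for the range check above,
 * both leave *timeo_p at MAX_SCHEDULE_TIMEOUT, i.e. "block forever".
 */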

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
                        warncomm, name);
                warned++;
        }
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
        if (sk->sk_flags & flags) {
                sk->sk_flags &= ~flags;
                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
                        net_disable_timestamp();
        }
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err;
        int skb_len;
        unsigned long flags;
        struct sk_buff_head *list = &sk->sk_receive_queue;

        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
                atomic_inc(&sk->sk_drops);
                trace_sock_rcvqueue_full(sk, skb);
                return -ENOMEM;
        }

        err = sk_filter(sk, skb);
        if (err)
                return err;

        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
                atomic_inc(&sk->sk_drops);
                return -ENOBUFS;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue.  Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        /* We escape from the RCU-protected region here, so make sure
         * we don't leak a non-refcounted dst.
         */
        skb_dst_force(skb);

        spin_lock_irqsave(&list->lock, flags);
        skb->dropcount = atomic_read(&sk->sk_drops);
        __skb_queue_tail(list, skb);
        spin_unlock_irqrestore(&list->lock, flags);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
        return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
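
/* Illustrative caller (hypothetical protocol code, not part of this
 * file): a datagram protocol typically hands a received skb over like
 * this, freeing it itself only when queueing fails:
 *
 *      static int myproto_rcv(struct sock *sk, struct sk_buff *skb)
 *      {
 *              if (sock_queue_rcv_skb(sk, skb) < 0) {
 *                      kfree_skb(skb);
 *                      return NET_RX_DROP;
 *              }
 *              return NET_RX_SUCCESS;
 *      }
 */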

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb))
                goto discard_and_relse;

        skb->dev = NULL;

        if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }
        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
                bh_unlock_sock(sk);
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }

        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
        sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_tx_queue_clear(sk);
                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);
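
/* Illustrative use (sketch): an output path revalidates its cached route
 * before each transmit and re-routes when the cache has gone stale.
 * myproto_route_output() is a hypothetical lookup helper:
 *
 *      struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *      if (!dst) {
 *              dst = myproto_route_output(sk); // fresh route lookup
 *              if (IS_ERR(dst))
 *                      return PTR_ERR(dst);
 *              sk_dst_set(sk, dst);            // cache it again
 *      }
 */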

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
                                int optlen)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index;

        /* Sorry... */
        ret = -EPERM;
        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                goto out;

        ret = -EINVAL;
        if (optlen < 0)
                goto out;

        /* Bind this socket to a particular device like "eth0",
         * as specified in the passed interface name. If the
         * name is "" or the option length is zero the socket
         * is not bound.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        memset(devname, 0, sizeof(devname));

        ret = -EFAULT;
        if (copy_from_user(devname, optval, optlen))
                goto out;

        index = 0;
        if (devname[0] != '\0') {
                struct net_device *dev;

                rcu_read_lock();
                dev = dev_get_by_name_rcu(net, devname);
                if (dev)
                        index = dev->ifindex;
                rcu_read_unlock();
                ret = -ENODEV;
                if (!dev)
                        goto out;
        }

        lock_sock(sk);
        sk->sk_bound_dev_if = index;
        sk_dst_reset(sk);
        release_sock(sk);

        ret = 0;

out:
#endif

        return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
                                int __user *optlen, int len)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];

        if (sk->sk_bound_dev_if == 0) {
                len = 0;
                goto zero;
        }

        ret = -EINVAL;
        if (len < IFNAMSIZ)
                goto out;

        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
        if (ret)
                goto out;

        len = strlen(devname) + 1;

        ret = -EFAULT;
        if (copy_to_user(optval, devname, len))
                goto out;

zero:
        ret = -EFAULT;
        if (put_user(len, optlen))
                goto out;

        ret = 0;

out:
#endif

        return ret;
}
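
/* From user space the option pair above looks like this (illustrative):
 *
 *      char ifname[IFNAMSIZ] = "eth0";
 *      socklen_t len = sizeof(ifname); // must be >= IFNAMSIZ for get
 *
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname));
 *      getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, &len);
 *
 * Binding requires CAP_NET_RAW, an empty name unbinds the socket, and
 * the returned len covers the name plus its trailing NUL.
 */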

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
        if (valbool)
                sock_set_flag(sk, bit);
        else
                sock_reset_flag(sk, bit);
}

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val ? 1 : 0;

        lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                sk->sk_reuseport = valbool;
                break;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                ret = -ENOPROTOOPT;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't return an error on this; BSD doesn't, and if
                 * you think about it this is right. Otherwise apps
                 * would have to play 'guess the biggest size' games.
                 * RCVBUF/SNDBUF are treated as hints in BSD.
                 */
                val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't return an error on this; BSD doesn't, and if
                 * you think about it this is right. Otherwise apps
                 * would have to play 'guess the biggest size' games.
                 * RCVBUF/SNDBUF are treated as hints in BSD.
                 */
                val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                /*
                 * We double it on the way in to account for
                 * "struct sk_buff" etc. overhead.   Applications
                 * assume that the SO_RCVBUF setting they make will
                 * allow that much actual data to be received on that
                 * socket.
                 *
                 * Applications are unaware that "struct sk_buff" and
                 * other overheads allocate from the receive buffer
                 * during socket buffer allocation.
                 *
                 * And after considering the possible alternatives,
                 * returning the value we actually used in getsockopt
                 * is the most desirable behavior.
                 */
                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
                break;

        case SO_RCVBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_rcvbuf;

        case SO_KEEPALIVE:
#ifdef CONFIG_INET
                if (sk->sk_protocol == IPPROTO_TCP &&
                    sk->sk_type == SOCK_STREAM)
                        tcp_set_keepalive(sk, valbool);
#endif
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check = valbool;
                break;

        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) ||
                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        sk->sk_priority = val;
                else
                        ret = -EPERM;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;  /* 1003.1g */
                        break;
                }
                if (copy_from_user(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff)
                        sock_reset_flag(sk, SOCK_LINGER);
                else {
#if (BITS_PER_LONG == 32)
                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                        else
#endif
                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("setsockopt");
                break;

        case SO_PASSCRED:
                if (valbool)
                        set_bit(SOCK_PASSCRED, &sock->flags);
                else
                        clear_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_TIMESTAMP:
        case SO_TIMESTAMPNS:
                if (valbool)  {
                        if (optname == SO_TIMESTAMP)
                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                        else
                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
                        sock_set_flag(sk, SOCK_RCVTSTAMP);
                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
                } else {
                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                }
                break;

        case SO_TIMESTAMPING:
                if (val & ~SOF_TIMESTAMPING_MASK) {
                        ret = -EINVAL;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                        sock_enable_timestamp(sk,
                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
                else
                        sock_disable_timestamp(sk,
                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
                                  val & SOF_TIMESTAMPING_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
                break;

        case SO_RCVLOWAT:
                if (val < 0)
                        val = INT_MAX;
                sk->sk_rcvlowat = val ? : 1;
                break;

        case SO_RCVTIMEO:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                break;

        case SO_SNDTIMEO:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                break;

        case SO_ATTACH_FILTER:
                ret = -EINVAL;
                if (optlen == sizeof(struct sock_fprog)) {
                        struct sock_fprog fprog;

                        ret = -EFAULT;
                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                break;

                        ret = sk_attach_filter(&fprog, sk);
                }
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_PASSSEC:
                if (valbool)
                        set_bit(SOCK_PASSSEC, &sock->flags);
                else
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
        case SO_MARK:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        ret = -EPERM;
                else
                        sk->sk_mark = val;
                break;

                /* We implement SO_SNDLOWAT etc. as not being
                 * settable (1003.1g 5.3) */
        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_PEEK_OFF:
                if (sock->ops->set_peek_off)
                        sock->ops->set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                break;

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                /* allow unprivileged users to decrease the value */
                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else {
                        if (val < 0)
                                ret = -EINVAL;
                        else
                                sk->sk_ll_usec = val;
                }
                break;
#endif
        default:
                ret = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);
        return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
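
/* Illustrative user-space view of the SO_RCVBUF doubling described in
 * the comment above (assuming 65536 <= sysctl_rmem_max):
 *
 *      int val = 65536, out;
 *      socklen_t len = sizeof(out);
 *
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *      // out == 131072: the kernel stored val * 2 to cover struct
 *      // sk_buff overhead and reports the value actually in effect.
 */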


void cred_to_ucred(struct pid *pid, const struct cred *cred,
                   struct ucred *ucred)
{
        ucred->pid = pid_vnr(pid);
        ucred->uid = ucred->gid = -1;
        if (cred) {
                struct user_namespace *current_ns = current_user_ns();

                ucred->uid = from_kuid_munged(current_ns, cred->euid);
                ucred->gid = from_kgid_munged(current_ns, cred->egid);
        }
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_REUSEPORT:
                v.val = sk->sk_reuseport;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv              = sizeof(v.ling);
                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("getsockopt");
                break;

        case SO_TIMESTAMP:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPING:
                v.val = 0;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
                break;

        case SO_RCVTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_SNDTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_PEERCRED:
        {
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                if (copy_to_user(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                char address[128];

                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                        return -ENOTCONN;
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;

        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                if (!sock->ops->set_peek_off)
                        return -EOPNOTSUPP;

                v.val = sk->sk_peek_off;
                break;
        case SO_NOFCS:
                v.val = sock_flag(sk, SOCK_NOFCS);
                break;

        case SO_BINDTODEVICE:
                return sock_getbindtodevice(sk, optval, optlen, len);

        case SO_GET_FILTER:
                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
                if (len < 0)
                        return len;

                goto lenout;

        case SO_LOCK_FILTER:
                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
                break;

        case SO_SELECT_ERR_QUEUE:
                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                v.val = sk->sk_ll_usec;
                break;
#endif

        default:
                return -ENOPROTOOPT;
        }

        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
        sock_lock_init_class_and_name(sk,
                        af_family_slock_key_strings[sk->sk_family],
                        af_family_slock_keys + sk->sk_family,
                        af_family_key_strings[sk->sk_family],
                        af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
        void *sptr = nsk->sk_security;
#endif
        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
        nsk->sk_security = sptr;
        security_sk_clone(osk, nsk);
#endif
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
        unsigned long nulls1, nulls2;

        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
        if (nulls1 > nulls2)
                swap(nulls1, nulls2);

        if (nulls1 != 0)
                memset((char *)sk, 0, nulls1);
        memset((char *)sk + nulls1 + sizeof(void *), 0,
               nulls2 - nulls1 - sizeof(void *));
        memset((char *)sk + nulls2 + sizeof(void *), 0,
               size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct sock *sk;
        struct kmem_cache *slab;

        slab = prot->slab;
        if (slab != NULL) {
                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
                if (!sk)
                        return sk;
                if (priority & __GFP_ZERO) {
                        if (prot->clear_sk)
                                prot->clear_sk(sk, prot->obj_size);
                        else
                                sk_prot_clear_nulls(sk, prot->obj_size);
                }
        } else
                sk = kmalloc(prot->obj_size, priority);

        if (sk != NULL) {
                kmemcheck_annotate_bitfield(sk, flags);

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free_sec;
                sk_tx_queue_clear(sk);
        }

        return sk;

out_free_sec:
        security_sk_free(sk);
out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        struct kmem_cache *slab;
        struct module *owner;

        owner = prot->owner;
        slab = prot->slab;

        security_sk_free(sk);
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
void sock_update_classid(struct sock *sk)
{
        u32 classid;

        classid = task_cls_classid(current);
        if (classid != sk->sk_classid)
                sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);
#endif

#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
void sock_update_netprioidx(struct sock *sk)
{
        if (in_interrupt())
                return;

        sk->sk_cgrp_prioidx = task_netprioidx(current);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *      sk_alloc - All socket objects are allocated here
 *      @net: the applicable net namespace
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
        struct sock *sk;

        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
        if (sk) {
                sk->sk_family = family;
                /*
                 * See comment in struct sock definition to understand
                 * why we need sk_prot_creator -acme
                 */
                sk->sk_prot = sk->sk_prot_creator = prot;
                sock_lock_init(sk);
                sock_net_set(sk, get_net(net));
                atomic_set(&sk->sk_wmem_alloc, 1);

                sock_update_classid(sk);
                sock_update_netprioidx(sk);
        }

        return sk;
}
EXPORT_SYMBOL(sk_alloc);

static void __sk_free(struct sock *sk)
{
        struct sk_filter *filter;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = rcu_dereference_check(sk->sk_filter,
                                       atomic_read(&sk->sk_wmem_alloc) == 0);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                RCU_INIT_POINTER(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

        if (atomic_read(&sk->sk_omem_alloc))
                pr_debug("%s: optmem leakage (%d bytes) detected\n",
                         __func__, atomic_read(&sk->sk_omem_alloc));

        if (sk->sk_peer_cred)
                put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);
        put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
        /*
         * We subtract one from sk_wmem_alloc and can tell whether
         * some packets are still in some tx queue.
         * If not zero, sock_wfree() will call __sk_free(sk) later.
         */
        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
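
/* Illustrative lifecycle (hypothetical error path in a protocol's create
 * routine): a freshly allocated sock that never had packets in flight is
 * released with a single sk_free(), which drops the one sk_wmem_alloc
 * reference set up by sk_alloc().  PF_MYPROTO and myproto_* are made up:
 *
 *      sk = sk_alloc(net, PF_MYPROTO, GFP_KERNEL, &myproto_proto);
 *      if (!sk)
 *              return -ENOBUFS;
 *      if (myproto_init(sk) < 0) {
 *              sk_free(sk);            // no packets in flight yet
 *              return -ENOMEM;
 *      }
 */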

/*
 * The last sock_put should drop the reference to sk->sk_net. It has
 * already been dropped in sk_change_net. Taking a reference on the
 * stopping namespace is not an option.
 * Take a reference on the socket to remove it from the hash while it is
 * still _alive_, and after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
        if (sk == NULL || sk->sk_socket == NULL)
                return;

        sock_hold(sk);
        sock_release(sk->sk_socket);
        release_net(sock_net(sk));
        sock_net_set(sk, get_net(&init_net));
        sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
                sock_update_memcg(newsk);
}

/**
 *      sk_clone_lock - clone a socket, and lock its clone
 *      @sk: the socket to clone
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                sock_copy(newsk, sk);

                /* SANITY */
                get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_backlog.len = 0;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                /*
                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
                 */
                atomic_set(&newsk->sk_wmem_alloc, 1);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                spin_lock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = rcu_dereference_protected(newsk->sk_filter, 1);
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so
                         * invalidate the destructor and do a plain sk_free() */
                        newsk->sk_destruct = NULL;
                        bh_unlock_sock(newsk);
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                /*
                 * Before updating sk_refcnt, we must commit prior changes to memory
                 * (Documentation/RCU/rculist_nulls.txt for details)
                 */
                smp_wmb();
                atomic_set(&newsk->sk_refcnt, 2);

1501                /*
1502                 * Increment the counter in the same struct proto as the master
1503                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1504                 * is the same as sk->sk_prot->socks, as this field was copied
1505                 * with memcpy).
1506                 *
1507                 * This _changes_ the previous behaviour, where
1508                 * tcp_create_openreq_child was always incrementing the
1509                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1510                 * to be taken into account by all callers. -acme
1511                 */
1512                sk_refcnt_debug_inc(newsk);
1513                sk_set_socket(newsk, NULL);
1514                newsk->sk_wq = NULL;
1515
1516                sk_update_clone(sk, newsk);
1517
1518                if (newsk->sk_prot->sockets_allocated)
1519                        sk_sockets_allocated_inc(newsk);
1520
1521                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1522                        net_enable_timestamp();
1523        }
1524out:
1525        return newsk;
1526}
1527EXPORT_SYMBOL_GPL(sk_clone_lock);
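/*
 * Caller-side sketch (illustrative only): the clone is returned with its
 * bh lock held, so a hypothetical accept-style path must drop that lock
 * on every exit, as the kernel-doc above requires:
 *
 *	struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);
 *
 *	if (child) {
 *		... protocol specific setup of child ...
 *		bh_unlock_sock(child);
 *	}
 */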
1528
1529void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1530{
1531        __sk_dst_set(sk, dst);
1532        sk->sk_route_caps = dst->dev->features;
1533        if (sk->sk_route_caps & NETIF_F_GSO)
1534                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1535        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1536        if (sk_can_gso(sk)) {
1537                if (dst->header_len) {
1538                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1539                } else {
1540                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1541                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1542                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1543                }
1544        }
1545}
1546EXPORT_SYMBOL_GPL(sk_setup_caps);
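/*
 * Call-site sketch (illustrative only): callers typically bind a freshly
 * looked-up route to the socket, deriving GSO/checksum capabilities from
 * the output device, roughly:
 *
 *	rt = ip_route_output_flow(net, fl4, sk);	// route lookup, elided
 *	if (!IS_ERR(rt))
 *		sk_setup_caps(sk, &rt->dst);
 */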
1547
1548/*
1549 *      Simple resource managers for sockets.
1550 */
1551
1552
1553/*
1554 * Write buffer destructor automatically called from kfree_skb.
1555 */
1556void sock_wfree(struct sk_buff *skb)
1557{
1558        struct sock *sk = skb->sk;
1559        unsigned int len = skb->truesize;
1560
1561        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1562                /*
1563                 * Keep a reference on sk_wmem_alloc; it will be released
1564                 * after the sk_write_space() call
1565                 */
1566                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1567                sk->sk_write_space(sk);
1568                len = 1;
1569        }
1570        /*
1571         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1572         * could not do because of in-flight packets
1573         */
1574        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1575                __sk_free(sk);
1576}
1577EXPORT_SYMBOL(sock_wfree);
1578
1579void skb_orphan_partial(struct sk_buff *skb)
1580{
1581        /* The TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1582         * so we do not completely orphan the skb; instead we transfer
1583         * all accounted bytes but one, to avoid unexpected reorders.
1584         */
1585        if (skb->destructor == sock_wfree
1586#ifdef CONFIG_INET
1587            || skb->destructor == tcp_wfree
1588#endif
1589                ) {
1590                atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1591                skb->truesize = 1;
1592        } else {
1593                skb_orphan(skb);
1594        }
1595}
1596EXPORT_SYMBOL(skb_orphan_partial);
1597
1598/*
1599 * Read buffer destructor automatically called from kfree_skb.
1600 */
1601void sock_rfree(struct sk_buff *skb)
1602{
1603        struct sock *sk = skb->sk;
1604        unsigned int len = skb->truesize;
1605
1606        atomic_sub(len, &sk->sk_rmem_alloc);
1607        sk_mem_uncharge(sk, len);
1608}
1609EXPORT_SYMBOL(sock_rfree);
1610
1611void sock_edemux(struct sk_buff *skb)
1612{
1613        struct sock *sk = skb->sk;
1614
1615#ifdef CONFIG_INET
1616        if (sk->sk_state == TCP_TIME_WAIT)
1617                inet_twsk_put(inet_twsk(sk));
1618        else
1619#endif
1620                sock_put(sk);
1621}
1622EXPORT_SYMBOL(sock_edemux);
1623
1624kuid_t sock_i_uid(struct sock *sk)
1625{
1626        kuid_t uid;
1627
1628        read_lock_bh(&sk->sk_callback_lock);
1629        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1630        read_unlock_bh(&sk->sk_callback_lock);
1631        return uid;
1632}
1633EXPORT_SYMBOL(sock_i_uid);
1634
1635unsigned long sock_i_ino(struct sock *sk)
1636{
1637        unsigned long ino;
1638
1639        read_lock_bh(&sk->sk_callback_lock);
1640        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1641        read_unlock_bh(&sk->sk_callback_lock);
1642        return ino;
1643}
1644EXPORT_SYMBOL(sock_i_ino);
1645
1646/*
1647 * Allocate a skb from the socket's send buffer.
1648 */
1649struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1650                             gfp_t priority)
1651{
1652        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1653                struct sk_buff *skb = alloc_skb(size, priority);
1654                if (skb) {
1655                        skb_set_owner_w(skb, sk);
1656                        return skb;
1657                }
1658        }
1659        return NULL;
1660}
1661EXPORT_SYMBOL(sock_wmalloc);
1662
1663/*
1664 * Allocate a skb from the socket's receive buffer.
1665 */
1666struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1667                             gfp_t priority)
1668{
1669        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1670                struct sk_buff *skb = alloc_skb(size, priority);
1671                if (skb) {
1672                        skb_set_owner_r(skb, sk);
1673                        return skb;
1674                }
1675        }
1676        return NULL;
1677}
1678
1679/*
1680 * Allocate a memory block from the socket's option memory buffer.
1681 */
1682void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1683{
1684        if ((unsigned int)size <= sysctl_optmem_max &&
1685            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1686                void *mem;
1687                /* Do the add first, to avoid a race in case kmalloc
1688                 * sleeps.
1689                 */
1690                atomic_add(size, &sk->sk_omem_alloc);
1691                mem = kmalloc(size, priority);
1692                if (mem)
1693                        return mem;
1694                atomic_sub(size, &sk->sk_omem_alloc);
1695        }
1696        return NULL;
1697}
1698EXPORT_SYMBOL(sock_kmalloc);
1699
1700/*
1701 * Free an option memory block.
1702 */
1703void sock_kfree_s(struct sock *sk, void *mem, int size)
1704{
1705        kfree(mem);
1706        atomic_sub(size, &sk->sk_omem_alloc);
1707}
1708EXPORT_SYMBOL(sock_kfree_s);
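/*
 * Pairing sketch (illustrative only): option memory must be released with
 * the same size it was charged with, because sk_omem_alloc is adjusted
 * symmetrically by the two helpers:
 *
 *	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, optlen);
 */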
1709
1710/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1711   I think these locks should be removed for datagram sockets.
1712 */
1713static long sock_wait_for_wmem(struct sock *sk, long timeo)
1714{
1715        DEFINE_WAIT(wait);
1716
1717        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1718        for (;;) {
1719                if (!timeo)
1720                        break;
1721                if (signal_pending(current))
1722                        break;
1723                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1724                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1725                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1726                        break;
1727                if (sk->sk_shutdown & SEND_SHUTDOWN)
1728                        break;
1729                if (sk->sk_err)
1730                        break;
1731                timeo = schedule_timeout(timeo);
1732        }
1733        finish_wait(sk_sleep(sk), &wait);
1734        return timeo;
1735}
1736
1737
1738/*
1739 *      Generic send/receive buffer handlers
1740 */
1741
1742struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1743                                     unsigned long data_len, int noblock,
1744                                     int *errcode, int max_page_order)
1745{
1746        struct sk_buff *skb = NULL;
1747        unsigned long chunk;
1748        gfp_t gfp_mask;
1749        long timeo;
1750        int err;
1751        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1752        struct page *page;
1753        int i;
1754
1755        err = -EMSGSIZE;
1756        if (npages > MAX_SKB_FRAGS)
1757                goto failure;
1758
1759        timeo = sock_sndtimeo(sk, noblock);
1760        while (!skb) {
1761                err = sock_error(sk);
1762                if (err != 0)
1763                        goto failure;
1764
1765                err = -EPIPE;
1766                if (sk->sk_shutdown & SEND_SHUTDOWN)
1767                        goto failure;
1768
1769                if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1770                        set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1771                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1772                        err = -EAGAIN;
1773                        if (!timeo)
1774                                goto failure;
1775                        if (signal_pending(current))
1776                                goto interrupted;
1777                        timeo = sock_wait_for_wmem(sk, timeo);
1778                        continue;
1779                }
1780
1781                err = -ENOBUFS;
1782                gfp_mask = sk->sk_allocation;
1783                if (gfp_mask & __GFP_WAIT)
1784                        gfp_mask |= __GFP_REPEAT;
1785
1786                skb = alloc_skb(header_len, gfp_mask);
1787                if (!skb)
1788                        goto failure;
1789
1790                skb->truesize += data_len;
1791
1792                for (i = 0; npages > 0; i++) {
1793                        int order = max_page_order;
1794
1795                        while (order) {
1796                                if (npages >= 1 << order) {
1797                                        page = alloc_pages(sk->sk_allocation |
1798                                                           __GFP_COMP | __GFP_NOWARN,
1799                                                           order);
1800                                        if (page)
1801                                                goto fill_page;
1802                                }
1803                                order--;
1804                        }
1805                        page = alloc_page(sk->sk_allocation);
1806                        if (!page)
1807                                goto failure;
1808fill_page:
1809                        chunk = min_t(unsigned long, data_len,
1810                                      PAGE_SIZE << order);
1811                        skb_fill_page_desc(skb, i, page, 0, chunk);
1812                        data_len -= chunk;
1813                        npages -= 1 << order;
1814                }
1815        }
1816
1817        skb_set_owner_w(skb, sk);
1818        return skb;
1819
1820interrupted:
1821        err = sock_intr_errno(timeo);
1822failure:
1823        kfree_skb(skb);
1824        *errcode = err;
1825        return NULL;
1826}
1827EXPORT_SYMBOL(sock_alloc_send_pskb);
1828
1829struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1830                                    int noblock, int *errcode)
1831{
1832        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1833}
1834EXPORT_SYMBOL(sock_alloc_send_skb);
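/*
 * Usage sketch (illustrative only): a datagram sendmsg handler typically
 * lets this helper block for write space and propagates the error code it
 * stores through the errcode pointer:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;	// err holds -EAGAIN, -EPIPE, ...
 */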
1835
1836/* On 32bit arches, an skb frag is limited to 2^15 */
1837#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1838
1839bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1840{
1841        int order;
1842
1843        if (pfrag->page) {
1844                if (atomic_read(&pfrag->page->_count) == 1) {
1845                        pfrag->offset = 0;
1846                        return true;
1847                }
1848                if (pfrag->offset < pfrag->size)
1849                        return true;
1850                put_page(pfrag->page);
1851        }
1852
1853        /* We restrict high order allocations to users that can afford to wait */
1854        order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1855
1856        do {
1857                gfp_t gfp = sk->sk_allocation;
1858
1859                if (order)
1860                        gfp |= __GFP_COMP | __GFP_NOWARN;
1861                pfrag->page = alloc_pages(gfp, order);
1862                if (likely(pfrag->page)) {
1863                        pfrag->offset = 0;
1864                        pfrag->size = PAGE_SIZE << order;
1865                        return true;
1866                }
1867        } while (--order >= 0);
1868
1869        sk_enter_memory_pressure(sk);
1870        sk_stream_moderate_sndbuf(sk);
1871        return false;
1872}
1873EXPORT_SYMBOL(sk_page_frag_refill);
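/*
 * Refill pattern (illustrative only): callers reserve room in the page
 * fragment returned by sk_page_frag(), copy into it and then advance the
 * offset, roughly what a stream sendmsg path does:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	... copy bytes at page_address(pfrag->page) + pfrag->offset ...
 *	pfrag->offset += copy;
 */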
1874
1875static void __lock_sock(struct sock *sk)
1876        __releases(&sk->sk_lock.slock)
1877        __acquires(&sk->sk_lock.slock)
1878{
1879        DEFINE_WAIT(wait);
1880
1881        for (;;) {
1882                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1883                                        TASK_UNINTERRUPTIBLE);
1884                spin_unlock_bh(&sk->sk_lock.slock);
1885                schedule();
1886                spin_lock_bh(&sk->sk_lock.slock);
1887                if (!sock_owned_by_user(sk))
1888                        break;
1889        }
1890        finish_wait(&sk->sk_lock.wq, &wait);
1891}
1892
1893static void __release_sock(struct sock *sk)
1894        __releases(&sk->sk_lock.slock)
1895        __acquires(&sk->sk_lock.slock)
1896{
1897        struct sk_buff *skb = sk->sk_backlog.head;
1898
1899        do {
1900                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1901                bh_unlock_sock(sk);
1902
1903                do {
1904                        struct sk_buff *next = skb->next;
1905
1906                        prefetch(next);
1907                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1908                        skb->next = NULL;
1909                        sk_backlog_rcv(sk, skb);
1910
1911                        /*
1912                         * We are in process context here with softirqs
1913                         * disabled, use cond_resched_softirq() to preempt.
1914                         * This is safe to do because we've taken the backlog
1915                         * queue private:
1916                         */
1917                        cond_resched_softirq();
1918
1919                        skb = next;
1920                } while (skb != NULL);
1921
1922                bh_lock_sock(sk);
1923        } while ((skb = sk->sk_backlog.head) != NULL);
1924
1925        /*
1926         * Doing the zeroing here guarantees we cannot loop forever
1927         * while a wild producer attempts to flood us.
1928         */
1929        sk->sk_backlog.len = 0;
1930}
1931
1932/**
1933 * sk_wait_data - wait for data to arrive at sk_receive_queue
1934 * @sk:    sock to wait on
1935 * @timeo: for how long
1936 *
1937 * Now socket state, including sk->sk_err, is changed only under the lock,
1938 * hence we may omit checks after joining the wait queue.
1939 * We check the receive queue before schedule() only as an optimization;
1940 * it is very likely that release_sock() added new data.
1941 */
1942int sk_wait_data(struct sock *sk, long *timeo)
1943{
1944        int rc;
1945        DEFINE_WAIT(wait);
1946
1947        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1948        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1949        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1950        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1951        finish_wait(sk_sleep(sk), &wait);
1952        return rc;
1953}
1954EXPORT_SYMBOL(sk_wait_data);
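/*
 * Wait-loop sketch (illustrative only): a blocking receive path calls this
 * with the socket lock held and re-checks the queue after every wakeup:
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */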
1955
1956/**
1957 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1958 *      @sk: socket
1959 *      @size: memory size to allocate
1960 *      @kind: allocation type
1961 *
1962 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1963 *      rmem allocation. This function assumes that protocols which have
1964 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1965 */
1966int __sk_mem_schedule(struct sock *sk, int size, int kind)
1967{
1968        struct proto *prot = sk->sk_prot;
1969        int amt = sk_mem_pages(size);
1970        long allocated;
1971        int parent_status = UNDER_LIMIT;
1972
1973        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1974
1975        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1976
1977        /* Under limit. */
1978        if (parent_status == UNDER_LIMIT &&
1979                        allocated <= sk_prot_mem_limits(sk, 0)) {
1980                sk_leave_memory_pressure(sk);
1981                return 1;
1982        }
1983
1984        /* Under pressure. (we or our parents) */
1985        if ((parent_status > SOFT_LIMIT) ||
1986                        allocated > sk_prot_mem_limits(sk, 1))
1987                sk_enter_memory_pressure(sk);
1988
1989        /* Over hard limit (we or our parents) */
1990        if ((parent_status == OVER_LIMIT) ||
1991                        (allocated > sk_prot_mem_limits(sk, 2)))
1992                goto suppress_allocation;
1993
1994        /* guarantee minimum buffer size under pressure */
1995        if (kind == SK_MEM_RECV) {
1996                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1997                        return 1;
1998
1999        } else { /* SK_MEM_SEND */
2000                if (sk->sk_type == SOCK_STREAM) {
2001                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2002                                return 1;
2003                } else if (atomic_read(&sk->sk_wmem_alloc) <
2004                           prot->sysctl_wmem[0])
2005                                return 1;
2006        }
2007
2008        if (sk_has_memory_pressure(sk)) {
2009                int alloc;
2010
2011                if (!sk_under_memory_pressure(sk))
2012                        return 1;
2013                alloc = sk_sockets_allocated_read_positive(sk);
2014                if (sk_prot_mem_limits(sk, 2) > alloc *
2015                    sk_mem_pages(sk->sk_wmem_queued +
2016                                 atomic_read(&sk->sk_rmem_alloc) +
2017                                 sk->sk_forward_alloc))
2018                        return 1;
2019        }
2020
2021suppress_allocation:
2022
2023        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2024                sk_stream_moderate_sndbuf(sk);
2025
2026                /* Fail only if socket is _under_ its sndbuf.
2027                 * In this case we cannot block, so we have to fail.
2028                 */
2029                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2030                        return 1;
2031        }
2032
2033        trace_sock_exceed_buf_limit(sk, prot, allocated);
2034
2035        /* Alas. Undo changes. */
2036        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2037
2038        sk_memory_allocated_sub(sk, amt);
2039
2040        return 0;
2041}
2042EXPORT_SYMBOL(__sk_mem_schedule);
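/*
 * Charging sketch (illustrative only): protocols normally go through the
 * sk_wmem_schedule()/sk_rmem_schedule() wrappers in include/net/sock.h,
 * which only enter this slow path when sk_forward_alloc is too small:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		goto drop;	// over the per-protocol memory limits
 */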
2043
2044/**
2045 *      __sk_mem_reclaim - reclaim memory_allocated
2046 *      @sk: socket
2047 */
2048void __sk_mem_reclaim(struct sock *sk)
2049{
2050        sk_memory_allocated_sub(sk,
2051                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2052        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2053
2054        if (sk_under_memory_pressure(sk) &&
2055            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2056                sk_leave_memory_pressure(sk);
2057}
2058EXPORT_SYMBOL(__sk_mem_reclaim);
2059
2060
2061/*
2062 * Set of default routines for initialising struct proto_ops when
2063 * the protocol does not support a particular function. In certain
2064 * cases where it makes no sense for a protocol to have a "do nothing"
2065 * function, some default processing is provided.
2066 */
2067
2068int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2069{
2070        return -EOPNOTSUPP;
2071}
2072EXPORT_SYMBOL(sock_no_bind);
2073
2074int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2075                    int len, int flags)
2076{
2077        return -EOPNOTSUPP;
2078}
2079EXPORT_SYMBOL(sock_no_connect);
2080
2081int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2082{
2083        return -EOPNOTSUPP;
2084}
2085EXPORT_SYMBOL(sock_no_socketpair);
2086
2087int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2088{
2089        return -EOPNOTSUPP;
2090}
2091EXPORT_SYMBOL(sock_no_accept);
2092
2093int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2094                    int *len, int peer)
2095{
2096        return -EOPNOTSUPP;
2097}
2098EXPORT_SYMBOL(sock_no_getname);
2099
2100unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2101{
2102        return 0;
2103}
2104EXPORT_SYMBOL(sock_no_poll);
2105
2106int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2107{
2108        return -EOPNOTSUPP;
2109}
2110EXPORT_SYMBOL(sock_no_ioctl);
2111
2112int sock_no_listen(struct socket *sock, int backlog)
2113{
2114        return -EOPNOTSUPP;
2115}
2116EXPORT_SYMBOL(sock_no_listen);
2117
2118int sock_no_shutdown(struct socket *sock, int how)
2119{
2120        return -EOPNOTSUPP;
2121}
2122EXPORT_SYMBOL(sock_no_shutdown);
2123
2124int sock_no_setsockopt(struct socket *sock, int level, int optname,
2125                    char __user *optval, unsigned int optlen)
2126{
2127        return -EOPNOTSUPP;
2128}
2129EXPORT_SYMBOL(sock_no_setsockopt);
2130
2131int sock_no_getsockopt(struct socket *sock, int level, int optname,
2132                    char __user *optval, int __user *optlen)
2133{
2134        return -EOPNOTSUPP;
2135}
2136EXPORT_SYMBOL(sock_no_getsockopt);
2137
2138int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2139                    size_t len)
2140{
2141        return -EOPNOTSUPP;
2142}
2143EXPORT_SYMBOL(sock_no_sendmsg);
2144
2145int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2146                    size_t len, int flags)
2147{
2148        return -EOPNOTSUPP;
2149}
2150EXPORT_SYMBOL(sock_no_recvmsg);
2151
2152int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2153{
2154        /* Mirror missing mmap method error code */
2155        return -ENODEV;
2156}
2157EXPORT_SYMBOL(sock_no_mmap);
2158
2159ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2160{
2161        ssize_t res;
2162        struct msghdr msg = {.msg_flags = flags};
2163        struct kvec iov;
2164        char *kaddr = kmap(page);
2165        iov.iov_base = kaddr + offset;
2166        iov.iov_len = size;
2167        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2168        kunmap(page);
2169        return res;
2170}
2171EXPORT_SYMBOL(sock_no_sendpage);
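/*
 * Stub-table sketch (illustrative only; PF_EXAMPLE and example_ops are
 * hypothetical names): an address family that supports only a subset of
 * operations wires the remainder to these defaults:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,	// hypothetical family
 *		.bind		= sock_no_bind,
 *		.accept		= sock_no_accept,
 *		.ioctl		= sock_no_ioctl,
 *		.mmap		= sock_no_mmap,
 *		// remaining handlers elided
 *	};
 */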
2172
2173/*
2174 *      Default Socket Callbacks
2175 */
2176
2177static void sock_def_wakeup(struct sock *sk)
2178{
2179        struct socket_wq *wq;
2180
2181        rcu_read_lock();
2182        wq = rcu_dereference(sk->sk_wq);
2183        if (wq_has_sleeper(wq))
2184                wake_up_interruptible_all(&wq->wait);
2185        rcu_read_unlock();
2186}
2187
2188static void sock_def_error_report(struct sock *sk)
2189{
2190        struct socket_wq *wq;
2191
2192        rcu_read_lock();
2193        wq = rcu_dereference(sk->sk_wq);
2194        if (wq_has_sleeper(wq))
2195                wake_up_interruptible_poll(&wq->wait, POLLERR);
2196        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2197        rcu_read_unlock();
2198}
2199
2200static void sock_def_readable(struct sock *sk, int len)
2201{
2202        struct socket_wq *wq;
2203
2204        rcu_read_lock();
2205        wq = rcu_dereference(sk->sk_wq);
2206        if (wq_has_sleeper(wq))
2207                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2208                                                POLLRDNORM | POLLRDBAND);
2209        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2210        rcu_read_unlock();
2211}
2212
2213static void sock_def_write_space(struct sock *sk)
2214{
2215        struct socket_wq *wq;
2216
2217        rcu_read_lock();
2218
2219        /* Do not wake up a writer until he can make "significant"
2220         * progress.  --DaveM
2221         */
2222        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2223                wq = rcu_dereference(sk->sk_wq);
2224                if (wq_has_sleeper(wq))
2225                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2226                                                POLLWRNORM | POLLWRBAND);
2227
2228                /* Should agree with poll, otherwise some programs break */
2229                if (sock_writeable(sk))
2230                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2231        }
2232
2233        rcu_read_unlock();
2234}
2235
2236static void sock_def_destruct(struct sock *sk)
2237{
2238        kfree(sk->sk_protinfo);
2239}
2240
2241void sk_send_sigurg(struct sock *sk)
2242{
2243        if (sk->sk_socket && sk->sk_socket->file)
2244                if (send_sigurg(&sk->sk_socket->file->f_owner))
2245                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2246}
2247EXPORT_SYMBOL(sk_send_sigurg);
2248
2249void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2250                    unsigned long expires)
2251{
2252        if (!mod_timer(timer, expires))
2253                sock_hold(sk);
2254}
2255EXPORT_SYMBOL(sk_reset_timer);
2256
2257void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2258{
2259        if (del_timer(timer))
2260                __sock_put(sk);
2261}
2262EXPORT_SYMBOL(sk_stop_timer);
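/*
 * Refcount pairing sketch (illustrative only): arming takes a socket
 * reference when the timer was not already pending, and stopping drops
 * one when a pending timer is actually deleted; a handler that fires
 * must drop the reference itself:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);	// sock_hold()
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);			// __sock_put()
 */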
2263
2264void sock_init_data(struct socket *sock, struct sock *sk)
2265{
2266        skb_queue_head_init(&sk->sk_receive_queue);
2267        skb_queue_head_init(&sk->sk_write_queue);
2268        skb_queue_head_init(&sk->sk_error_queue);
2269#ifdef CONFIG_NET_DMA
2270        skb_queue_head_init(&sk->sk_async_wait_queue);
2271#endif
2272
2273        sk->sk_send_head        =       NULL;
2274
2275        init_timer(&sk->sk_timer);
2276
2277        sk->sk_allocation       =       GFP_KERNEL;
2278        sk->sk_rcvbuf           =       sysctl_rmem_default;
2279        sk->sk_sndbuf           =       sysctl_wmem_default;
2280        sk->sk_state            =       TCP_CLOSE;
2281        sk_set_socket(sk, sock);
2282
2283        sock_set_flag(sk, SOCK_ZAPPED);
2284
2285        if (sock) {
2286                sk->sk_type     =       sock->type;
2287                sk->sk_wq       =       sock->wq;
2288                sock->sk        =       sk;
2289        } else
2290                sk->sk_wq       =       NULL;
2291
2292        spin_lock_init(&sk->sk_dst_lock);
2293        rwlock_init(&sk->sk_callback_lock);
2294        lockdep_set_class_and_name(&sk->sk_callback_lock,
2295                        af_callback_keys + sk->sk_family,
2296                        af_family_clock_key_strings[sk->sk_family]);
2297
2298        sk->sk_state_change     =       sock_def_wakeup;
2299        sk->sk_data_ready       =       sock_def_readable;
2300        sk->sk_write_space      =       sock_def_write_space;
2301        sk->sk_error_report     =       sock_def_error_report;
2302        sk->sk_destruct         =       sock_def_destruct;
2303
2304        sk->sk_frag.page        =       NULL;
2305        sk->sk_frag.offset      =       0;
2306        sk->sk_peek_off         =       -1;
2307
2308        sk->sk_peer_pid         =       NULL;
2309        sk->sk_peer_cred        =       NULL;
2310        sk->sk_write_pending    =       0;
2311        sk->sk_rcvlowat         =       1;
2312        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2313        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2314
2315        sk->sk_stamp = ktime_set(-1L, 0);
2316
2317#ifdef CONFIG_NET_RX_BUSY_POLL
2318        sk->sk_napi_id          =       0;
2319        sk->sk_ll_usec          =       sysctl_net_busy_read;
2320#endif
2321
2322        sk->sk_pacing_rate = ~0U;
2323        /*
2324         * Before updating sk_refcnt, we must commit prior changes to memory
2325         * (Documentation/RCU/rculist_nulls.txt for details)
2326         */
2327        smp_wmb();
2328        atomic_set(&sk->sk_refcnt, 1);
2329        atomic_set(&sk->sk_drops, 0);
2330}
2331EXPORT_SYMBOL(sock_init_data);
2332
2333void lock_sock_nested(struct sock *sk, int subclass)
2334{
2335        might_sleep();
2336        spin_lock_bh(&sk->sk_lock.slock);
2337        if (sk->sk_lock.owned)
2338                __lock_sock(sk);
2339        sk->sk_lock.owned = 1;
2340        spin_unlock(&sk->sk_lock.slock);
2341        /*
2342         * The sk_lock has mutex_lock() semantics here:
2343         */
2344        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2345        local_bh_enable();
2346}
2347EXPORT_SYMBOL(lock_sock_nested);
2348
2349void release_sock(struct sock *sk)
2350{
2351        /*
2352         * The sk_lock has mutex_unlock() semantics:
2353         */
2354        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2355
2356        spin_lock_bh(&sk->sk_lock.slock);
2357        if (sk->sk_backlog.tail)
2358                __release_sock(sk);
2359
2360        if (sk->sk_prot->release_cb)
2361                sk->sk_prot->release_cb(sk);
2362
2363        sk->sk_lock.owned = 0;
2364        if (waitqueue_active(&sk->sk_lock.wq))
2365                wake_up(&sk->sk_lock.wq);
2366        spin_unlock_bh(&sk->sk_lock.slock);
2367}
2368EXPORT_SYMBOL(release_sock);
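/*
 * Process-context pattern (illustrative only): lock_sock()/release_sock()
 * bracket code that needs a stable socket; packets arriving meanwhile are
 * queued on the backlog, which __release_sock() above replays:
 *
 *	lock_sock(sk);
 *	... modify socket state ...
 *	release_sock(sk);	// also drains sk_backlog
 */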
2369
2370/**
2371 * lock_sock_fast - fast version of lock_sock
2372 * @sk: socket
2373 *
2374 * This version should be used for very small sections, where the process won't block.
2375 * return false if fast path is taken
2376 *   sk_lock.slock locked, owned = 0, BH disabled
2377 * return true if slow path is taken
2378 *   sk_lock.slock unlocked, owned = 1, BH enabled
2379 */
2380bool lock_sock_fast(struct sock *sk)
2381{
2382        might_sleep();
2383        spin_lock_bh(&sk->sk_lock.slock);
2384
2385        if (!sk->sk_lock.owned)
2386                /*
2387                 * Note : We must disable BH
2388                 */
2389                return false;
2390
2391        __lock_sock(sk);
2392        sk->sk_lock.owned = 1;
2393        spin_unlock(&sk->sk_lock.slock);
2394        /*
2395         * The sk_lock has mutex_lock() semantics here:
2396         */
2397        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2398        local_bh_enable();
2399        return true;
2400}
2401EXPORT_SYMBOL(lock_sock_fast);
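/*
 * Pairing sketch (illustrative only): the return value selects the
 * matching unlock, see unlock_sock_fast() in include/net/sock.h:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */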
2402
2403int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2404{
2405        struct timeval tv;
2406        if (!sock_flag(sk, SOCK_TIMESTAMP))
2407                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2408        tv = ktime_to_timeval(sk->sk_stamp);
2409        if (tv.tv_sec == -1)
2410                return -ENOENT;
2411        if (tv.tv_sec == 0) {
2412                sk->sk_stamp = ktime_get_real();
2413                tv = ktime_to_timeval(sk->sk_stamp);
2414        }
2415        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2416}
2417EXPORT_SYMBOL(sock_get_timestamp);
2418
2419int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2420{
2421        struct timespec ts;
2422        if (!sock_flag(sk, SOCK_TIMESTAMP))
2423                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2424        ts = ktime_to_timespec(sk->sk_stamp);
2425        if (ts.tv_sec == -1)
2426                return -ENOENT;
2427        if (ts.tv_sec == 0) {
2428                sk->sk_stamp = ktime_get_real();
2429                ts = ktime_to_timespec(sk->sk_stamp);
2430        }
2431        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2432}
2433EXPORT_SYMBOL(sock_get_timestampns);
2434
2435void sock_enable_timestamp(struct sock *sk, int flag)
2436{
2437        if (!sock_flag(sk, flag)) {
2438                unsigned long previous_flags = sk->sk_flags;
2439
2440                sock_set_flag(sk, flag);
2441                /*
2442                 * we just set one of the two flags which require net
2443                 * time stamping, but time stamping might have been on
2444                 * already because of the other one
2445                 */
2446                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2447                        net_enable_timestamp();
2448        }
2449}
2450
2451int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2452                       int level, int type)
2453{
2454        struct sock_exterr_skb *serr;
2455        struct sk_buff *skb, *skb2;
2456        int copied, err;
2457
2458        err = -EAGAIN;
2459        skb = skb_dequeue(&sk->sk_error_queue);
2460        if (skb == NULL)
2461                goto out;
2462
2463        copied = skb->len;
2464        if (copied > len) {
2465                msg->msg_flags |= MSG_TRUNC;
2466                copied = len;
2467        }
2468        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2469        if (err)
2470                goto out_free_skb;
2471
2472        sock_recv_timestamp(msg, sk, skb);
2473
2474        serr = SKB_EXT_ERR(skb);
2475        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2476
2477        msg->msg_flags |= MSG_ERRQUEUE;
2478        err = copied;
2479
2480        /* Reset and regenerate socket error */
2481        spin_lock_bh(&sk->sk_error_queue.lock);
2482        sk->sk_err = 0;
2483        if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2484                sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2485                spin_unlock_bh(&sk->sk_error_queue.lock);
2486                sk->sk_error_report(sk);
2487        } else
2488                spin_unlock_bh(&sk->sk_error_queue.lock);
2489
2490out_free_skb:
2491        kfree_skb(skb);
2492out:
2493        return err;
2494}
2495EXPORT_SYMBOL(sock_recv_errqueue);
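/*
 * Usage sketch (illustrative only): a protocol recvmsg handler services
 * MSG_ERRQUEUE with this helper, passing the cmsg level/type it wants the
 * extended error reported under, for instance:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_PACKET, PACKET_TX_TIMESTAMP);
 */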
2496
2497/*
2498 *      Get a socket option on a socket.
2499 *
2500 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2501 *      asynchronous errors should be reported by getsockopt. We assume
2502 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2503 */
2504int sock_common_getsockopt(struct socket *sock, int level, int optname,
2505                           char __user *optval, int __user *optlen)
2506{
2507        struct sock *sk = sock->sk;
2508
2509        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2510}
2511EXPORT_SYMBOL(sock_common_getsockopt);
2512
2513#ifdef CONFIG_COMPAT
2514int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2515                                  char __user *optval, int __user *optlen)
2516{
2517        struct sock *sk = sock->sk;
2518
2519        if (sk->sk_prot->compat_getsockopt != NULL)
2520                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2521                                                      optval, optlen);
2522        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2523}
2524EXPORT_SYMBOL(compat_sock_common_getsockopt);
2525#endif
2526
2527int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2528                        struct msghdr *msg, size_t size, int flags)
2529{
2530        struct sock *sk = sock->sk;
2531        int addr_len = 0;
2532        int err;
2533
2534        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2535                                   flags & ~MSG_DONTWAIT, &addr_len);
2536        if (err >= 0)
2537                msg->msg_namelen = addr_len;
2538        return err;
2539}
2540EXPORT_SYMBOL(sock_common_recvmsg);
2541
2542/*
2543 *      Set socket options on an inet socket.
2544 */
2545int sock_common_setsockopt(struct socket *sock, int level, int optname,
2546                           char __user *optval, unsigned int optlen)
2547{
2548        struct sock *sk = sock->sk;
2549
2550        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2551}
2552EXPORT_SYMBOL(sock_common_setsockopt);
2553
2554#ifdef CONFIG_COMPAT
2555int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2556                                  char __user *optval, unsigned int optlen)
2557{
2558        struct sock *sk = sock->sk;
2559
2560        if (sk->sk_prot->compat_setsockopt != NULL)
2561                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2562                                                      optval, optlen);
2563        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2564}
2565EXPORT_SYMBOL(compat_sock_common_setsockopt);
2566#endif
2567
2568void sk_common_release(struct sock *sk)
2569{
2570        if (sk->sk_prot->destroy)
2571                sk->sk_prot->destroy(sk);
2572
2573        /*
2574         * Observation: when sk_common_release is called, processes have
2575         * no access to the socket, but the network stack still does.
2576         * Step one, detach it from networking:
2577         *
2578         * A. Remove from hash tables.
2579         */
2580
2581        sk->sk_prot->unhash(sk);
2582
2583        /*
2584         * At this point the socket cannot receive new packets, but it is possible
2585         * that some packets are in flight because some CPU runs the receiver and
2586         * did a hash table lookup before we unhashed the socket. They will reach
2587         * the receive queue and will be purged by the socket destructor.
2588         *
2589         * Also, we still have packets pending on the receive queue and probably
2590         * our own packets waiting in device queues. sock_destroy will drain the
2591         * receive queue, but transmitted packets will delay socket destruction
2592         * until the last reference is released.
2593         */
2594
2595        sock_orphan(sk);
2596
2597        xfrm_sk_free_policy(sk);
2598
2599        sk_refcnt_debug_release(sk);
2600
2601        if (sk->sk_frag.page) {
2602                put_page(sk->sk_frag.page);
2603                sk->sk_frag.page = NULL;
2604        }
2605
2606        sock_put(sk);
2607}
2608EXPORT_SYMBOL(sk_common_release);
2609
2610#ifdef CONFIG_PROC_FS
2611#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2612struct prot_inuse {
2613        int val[PROTO_INUSE_NR];
2614};
2615
2616static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2617
2618#ifdef CONFIG_NET_NS
2619void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2620{
2621        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2622}
2623EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2624
2625int sock_prot_inuse_get(struct net *net, struct proto *prot)
2626{
2627        int cpu, idx = prot->inuse_idx;
2628        int res = 0;
2629
2630        for_each_possible_cpu(cpu)
2631                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2632
2633        return res >= 0 ? res : 0;
2634}
2635EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2636
2637static int __net_init sock_inuse_init_net(struct net *net)
2638{
2639        net->core.inuse = alloc_percpu(struct prot_inuse);
2640        return net->core.inuse ? 0 : -ENOMEM;
2641}
2642
2643static void __net_exit sock_inuse_exit_net(struct net *net)
2644{
2645        free_percpu(net->core.inuse);
2646}
2647
2648static struct pernet_operations net_inuse_ops = {
2649        .init = sock_inuse_init_net,
2650        .exit = sock_inuse_exit_net,
2651};
2652
2653static __init int net_inuse_init(void)
2654{
2655        if (register_pernet_subsys(&net_inuse_ops))
2656                panic("Cannot initialize net inuse counters");
2657
2658        return 0;
2659}
2660
2661core_initcall(net_inuse_init);
2662#else
2663static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2664
2665void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2666{
2667        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2668}
2669EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2670
2671int sock_prot_inuse_get(struct net *net, struct proto *prot)
2672{
2673        int cpu, idx = prot->inuse_idx;
2674        int res = 0;
2675
2676        for_each_possible_cpu(cpu)
2677                res += per_cpu(prot_inuse, cpu).val[idx];
2678
2679        return res >= 0 ? res : 0;
2680}
2681EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2682#endif
2683
2684static void assign_proto_idx(struct proto *prot)
2685{
2686        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2687
2688        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2689                pr_err("PROTO_INUSE_NR exhausted\n");
2690                return;
2691        }
2692
2693        set_bit(prot->inuse_idx, proto_inuse_idx);
2694}
2695
2696static void release_proto_idx(struct proto *prot)
2697{
2698        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2699                clear_bit(prot->inuse_idx, proto_inuse_idx);
2700}
2701#else
2702static inline void assign_proto_idx(struct proto *prot)
2703{
2704}
2705
2706static inline void release_proto_idx(struct proto *prot)
2707{
2708}
2709#endif
2710
2711int proto_register(struct proto *prot, int alloc_slab)
2712{
2713        if (alloc_slab) {
2714                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2715                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2716                                        NULL);
2717
2718                if (prot->slab == NULL) {
2719                        pr_crit("%s: Can't create sock SLAB cache!\n",
2720                                prot->name);
2721                        goto out;
2722                }
2723
2724                if (prot->rsk_prot != NULL) {
2725                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2726                        if (prot->rsk_prot->slab_name == NULL)
2727                                goto out_free_sock_slab;
2728
2729                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2730                                                                 prot->rsk_prot->obj_size, 0,
2731                                                                 SLAB_HWCACHE_ALIGN, NULL);
2732
2733                        if (prot->rsk_prot->slab == NULL) {
2734                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2735                                        prot->name);
2736                                goto out_free_request_sock_slab_name;
2737                        }
2738                }
2739
2740                if (prot->twsk_prot != NULL) {
2741                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2742
2743                        if (prot->twsk_prot->twsk_slab_name == NULL)
2744                                goto out_free_request_sock_slab;
2745
2746                        prot->twsk_prot->twsk_slab =
2747                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2748                                                  prot->twsk_prot->twsk_obj_size,
2749                                                  0,
2750                                                  SLAB_HWCACHE_ALIGN |
2751                                                        prot->slab_flags,
2752                                                  NULL);
2753                        if (prot->twsk_prot->twsk_slab == NULL)
2754                                goto out_free_timewait_sock_slab_name;
2755                }
2756        }
2757
2758        mutex_lock(&proto_list_mutex);
2759        list_add(&prot->node, &proto_list);
2760        assign_proto_idx(prot);
2761        mutex_unlock(&proto_list_mutex);
2762        return 0;
2763
2764out_free_timewait_sock_slab_name:
2765        kfree(prot->twsk_prot->twsk_slab_name);
2766out_free_request_sock_slab:
2767        if (prot->rsk_prot && prot->rsk_prot->slab) {
2768                kmem_cache_destroy(prot->rsk_prot->slab);
2769                prot->rsk_prot->slab = NULL;
2770        }
2771out_free_request_sock_slab_name:
2772        if (prot->rsk_prot)
2773                kfree(prot->rsk_prot->slab_name);
2774out_free_sock_slab:
2775        kmem_cache_destroy(prot->slab);
2776        prot->slab = NULL;
2777out:
2778        return -ENOBUFS;
2779}
2780EXPORT_SYMBOL(proto_register);
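/*
 * Registration sketch (illustrative only; example_prot and example_sock
 * are hypothetical names): a protocol module registers its struct proto
 * at init time, asking for a backing slab, and unregisters it on exit:
 *
 *	static struct proto example_prot = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_prot, 1);	// 1 => allocate slab
 *	...
 *	proto_unregister(&example_prot);
 */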
2781
2782void proto_unregister(struct proto *prot)
2783{
2784        mutex_lock(&proto_list_mutex);
2785        release_proto_idx(prot);
2786        list_del(&prot->node);
2787        mutex_unlock(&proto_list_mutex);
2788
2789        if (prot->slab != NULL) {
2790                kmem_cache_destroy(prot->slab);
2791                prot->slab = NULL;
2792        }
2793
2794        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2795                kmem_cache_destroy(prot->rsk_prot->slab);
2796                kfree(prot->rsk_prot->slab_name);
2797                prot->rsk_prot->slab = NULL;
2798        }
2799
2800        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2801                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2802                kfree(prot->twsk_prot->twsk_slab_name);
2803                prot->twsk_prot->twsk_slab = NULL;
2804        }
2805}
2806EXPORT_SYMBOL(proto_unregister);
2807
2808#ifdef CONFIG_PROC_FS
2809static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2810        __acquires(proto_list_mutex)
2811{
2812        mutex_lock(&proto_list_mutex);
2813        return seq_list_start_head(&proto_list, *pos);
2814}
2815
2816static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2817{
2818        return seq_list_next(v, &proto_list, pos);
2819}
2820
2821static void proto_seq_stop(struct seq_file *seq, void *v)
2822        __releases(proto_list_mutex)
2823{
2824        mutex_unlock(&proto_list_mutex);
2825}
2826
2827static char proto_method_implemented(const void *method)
2828{
2829        return method == NULL ? 'n' : 'y';
2830}
2831static long sock_prot_memory_allocated(struct proto *proto)
2832{
2833        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2834}
2835
2836static char *sock_prot_memory_pressure(struct proto *proto)
2837{
2838        return proto->memory_pressure != NULL ?
2839        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2840}
2841
2842static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2843{
2844
2845        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2846                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2847                   proto->name,
2848                   proto->obj_size,
2849                   sock_prot_inuse_get(seq_file_net(seq), proto),
2850                   sock_prot_memory_allocated(proto),
2851                   sock_prot_memory_pressure(proto),
2852                   proto->max_header,
2853                   proto->slab == NULL ? "no" : "yes",
2854                   module_name(proto->owner),
2855                   proto_method_implemented(proto->close),
2856                   proto_method_implemented(proto->connect),
2857                   proto_method_implemented(proto->disconnect),
2858                   proto_method_implemented(proto->accept),
2859                   proto_method_implemented(proto->ioctl),
2860                   proto_method_implemented(proto->init),
2861                   proto_method_implemented(proto->destroy),
2862                   proto_method_implemented(proto->shutdown),
2863                   proto_method_implemented(proto->setsockopt),
2864                   proto_method_implemented(proto->getsockopt),
2865                   proto_method_implemented(proto->sendmsg),
2866                   proto_method_implemented(proto->recvmsg),
2867                   proto_method_implemented(proto->sendpage),
2868                   proto_method_implemented(proto->bind),
2869                   proto_method_implemented(proto->backlog_rcv),
2870                   proto_method_implemented(proto->hash),
2871                   proto_method_implemented(proto->unhash),
2872                   proto_method_implemented(proto->get_port),
2873                   proto_method_implemented(proto->enter_memory_pressure));
2874}
2875
2876static int proto_seq_show(struct seq_file *seq, void *v)
2877{
2878        if (v == &proto_list)
2879                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2880                           "protocol",
2881                           "size",
2882                           "sockets",
2883                           "memory",
2884                           "press",
2885                           "maxhdr",
2886                           "slab",
2887                           "module",
2888                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2889        else
2890                proto_seq_printf(seq, list_entry(v, struct proto, node));
2891        return 0;
2892}
2893
2894static const struct seq_operations proto_seq_ops = {
2895        .start  = proto_seq_start,
2896        .next   = proto_seq_next,
2897        .stop   = proto_seq_stop,
2898        .show   = proto_seq_show,
2899};
2900
2901static int proto_seq_open(struct inode *inode, struct file *file)
2902{
2903        return seq_open_net(inode, file, &proto_seq_ops,
2904                            sizeof(struct seq_net_private));
2905}
2906
2907static const struct file_operations proto_seq_fops = {
2908        .owner          = THIS_MODULE,
2909        .open           = proto_seq_open,
2910        .read           = seq_read,
2911        .llseek         = seq_lseek,
2912        .release        = seq_release_net,
2913};
2914
2915static __net_init int proto_init_net(struct net *net)
2916{
2917        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2918                return -ENOMEM;
2919
2920        return 0;
2921}
2922
2923static __net_exit void proto_exit_net(struct net *net)
2924{
2925        remove_proc_entry("protocols", net->proc_net);
2926}
2927
2928
2929static __net_initdata struct pernet_operations proto_net_ops = {
2930        .init = proto_init_net,
2931        .exit = proto_exit_net,
2932};
2933
2934static int __init proto_init(void)
2935{
2936        return register_pernet_subsys(&proto_net_ops);
2937}
2938
2939subsys_initcall(proto_init);
2940
2941#endif /* PROC_FS */
2942