LXR linux/net/core/sock.c

   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/timer.h>
 106#include <linux/string.h>
 107#include <linux/sockios.h>
 108#include <linux/net.h>
 109#include <linux/mm.h>
 110#include <linux/slab.h>
 111#include <linux/interrupt.h>
 112#include <linux/poll.h>
 113#include <linux/tcp.h>
 114#include <linux/init.h>
 115#include <linux/highmem.h>
 116#include <linux/user_namespace.h>
 117#include <linux/static_key.h>
 118#include <linux/memcontrol.h>
 119#include <linux/prefetch.h>
 120
 121#include <asm/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134
 135#include <linux/filter.h>
 136
 137#include <trace/events/sock.h>
 138
 139#ifdef CONFIG_INET
 140#include <net/tcp.h>
 141#endif
 142
 143#include <net/busy_poll.h>
 144
 145static DEFINE_MUTEX(proto_list_mutex);
 146static LIST_HEAD(proto_list);
 147
 148#ifdef CONFIG_MEMCG_KMEM
 149int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 150{
 151        struct proto *proto;
 152        int ret = 0;
 153
 154        mutex_lock(&proto_list_mutex);
 155        list_for_each_entry(proto, &proto_list, node) {
 156                if (proto->init_cgroup) {
 157                        ret = proto->init_cgroup(memcg, ss);
 158                        if (ret)
 159                                goto out;
 160                }
 161        }
 162
 163        mutex_unlock(&proto_list_mutex);
 164        return ret;
 165out:
 166        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 167                if (proto->destroy_cgroup)
 168                        proto->destroy_cgroup(memcg);
 169        mutex_unlock(&proto_list_mutex);
 170        return ret;
 171}
 172
 173void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 174{
 175        struct proto *proto;
 176
 177        mutex_lock(&proto_list_mutex);
 178        list_for_each_entry_reverse(proto, &proto_list, node)
 179                if (proto->destroy_cgroup)
 180                        proto->destroy_cgroup(memcg);
 181        mutex_unlock(&proto_list_mutex);
 182}
 183#endif
 184
 185/*
 186 * Each address family might have different locking rules, so we have
 187 * one slock key per address family:
 188 */
 189static struct lock_class_key af_family_keys[AF_MAX];
 190static struct lock_class_key af_family_slock_keys[AF_MAX];
 191
 192#if defined(CONFIG_MEMCG_KMEM)
 193struct static_key memcg_socket_limit_enabled;
 194EXPORT_SYMBOL(memcg_socket_limit_enabled);
 195#endif
 196
 197/*
 198 * Make lock validator output more readable. (we pre-construct these
 199 * strings build-time, so that runtime initialization of socket
 200 * locks is fast):
 201 */
 202static const char *const af_family_key_strings[AF_MAX+1] = {
 203  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 204  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 205  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 206  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 207  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 208  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 209  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 210  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 211  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 212  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 213  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 214  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 215  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 216  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 217};
 218static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 219  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 220  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 221  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 222  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 223  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 224  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 225  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 226  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 227  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 228  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 229  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 230  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 231  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 232  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
 233};
 234static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 235  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 236  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 237  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 238  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 239  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 240  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 241  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 242  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 243  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 244  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 245  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 246  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 247  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 248  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 249};
 250
 251/*
 252 * sk_callback_lock locking rules are per-address-family,
 253 * so split the lock classes by using a per-AF key:
 254 */
 255static struct lock_class_key af_callback_keys[AF_MAX];
 256
 257/* Take into consideration the size of the struct sk_buff overhead in the
 258 * determination of these values, since that is non-constant across
 259 * platforms.  This makes socket queueing behavior and performance
 260 * not depend upon such differences.
 261 */
 262#define _SK_MEM_PACKETS         256
 263#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 264#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 265#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 266
 267/* Run time adjustable parameters. */
 268__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 269EXPORT_SYMBOL(sysctl_wmem_max);
 270__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 271EXPORT_SYMBOL(sysctl_rmem_max);
 272__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 273__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 274
 275/* Maximal space eaten by iovec or ancillary data plus some space */
 276int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 277EXPORT_SYMBOL(sysctl_optmem_max);
 278
 279struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 280EXPORT_SYMBOL_GPL(memalloc_socks);
 281
 282/**
 283 * sk_set_memalloc - sets %SOCK_MEMALLOC
 284 * @sk: socket to set it on
 285 *
 286 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 287 * It's the responsibility of the admin to adjust min_free_kbytes
 288 * to meet the requirements
 289 */
 290void sk_set_memalloc(struct sock *sk)
 291{
 292        sock_set_flag(sk, SOCK_MEMALLOC);
 293        sk->sk_allocation |= __GFP_MEMALLOC;
 294        static_key_slow_inc(&memalloc_socks);
 295}
 296EXPORT_SYMBOL_GPL(sk_set_memalloc);
 297
 298void sk_clear_memalloc(struct sock *sk)
 299{
 300        sock_reset_flag(sk, SOCK_MEMALLOC);
 301        sk->sk_allocation &= ~__GFP_MEMALLOC;
 302        static_key_slow_dec(&memalloc_socks);
 303
 304        /*
 305         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 306         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 307         * it has rmem allocations there is a risk that the user of the
 308         * socket cannot make forward progress due to exceeding the rmem
 309         * limits. By rights, sk_clear_memalloc() should only be called
 310         * on sockets being torn down but warn and reset the accounting if
 311         * that assumption breaks.
 312         */
 313        if (WARN_ON(sk->sk_forward_alloc))
 314                sk_mem_reclaim(sk);
 315}
 316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 317
 318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 319{
 320        int ret;
 321        unsigned long pflags = current->flags;
 322
 323        /* these should have been dropped before queueing */
 324        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 325
 326        current->flags |= PF_MEMALLOC;
 327        ret = sk->sk_backlog_rcv(sk, skb);
 328        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 329
 330        return ret;
 331}
 332EXPORT_SYMBOL(__sk_backlog_rcv);
 333
 334static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 335{
 336        struct timeval tv;
 337
 338        if (optlen < sizeof(tv))
 339                return -EINVAL;
 340        if (copy_from_user(&tv, optval, sizeof(tv)))
 341                return -EFAULT;
 342        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 343                return -EDOM;
 344
 345        if (tv.tv_sec < 0) {
 346                static int warned __read_mostly;
 347
 348                *timeo_p = 0;
 349                if (warned < 10 && net_ratelimit()) {
 350                        warned++;
 351                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 352                                __func__, current->comm, task_pid_nr(current));
 353                }
 354                return 0;
 355        }
 356        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 357        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 358                return 0;
 359        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 360                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 361        return 0;
 362}
 363
 364static void sock_warn_obsolete_bsdism(const char *name)
 365{
 366        static int warned;
 367        static char warncomm[TASK_COMM_LEN];
 368        if (strcmp(warncomm, current->comm) && warned < 5) {
 369                strcpy(warncomm,  current->comm);
 370                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 371                        warncomm, name);
 372                warned++;
 373        }
 374}
 375
 376#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 377
 378static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 379{
 380        if (sk->sk_flags & flags) {
 381                sk->sk_flags &= ~flags;
 382                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 383                        net_disable_timestamp();
 384        }
 385}
 386
 387
 388int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 389{
 390        int err;
 391        int skb_len;
 392        unsigned long flags;
 393        struct sk_buff_head *list = &sk->sk_receive_queue;
 394
 395        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 396                atomic_inc(&sk->sk_drops);
 397                trace_sock_rcvqueue_full(sk, skb);
 398                return -ENOMEM;
 399        }
 400
 401        err = sk_filter(sk, skb);
 402        if (err)
 403                return err;
 404
 405        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 406                atomic_inc(&sk->sk_drops);
 407                return -ENOBUFS;
 408        }
 409
 410        skb->dev = NULL;
 411        skb_set_owner_r(skb, sk);
 412
 413        /* Cache the SKB length before we tack it onto the receive
 414         * queue.  Once it is added it no longer belongs to us and
 415         * may be freed by other threads of control pulling packets
 416         * from the queue.
 417         */
 418        skb_len = skb->len;
 419
 420        /* we escape from rcu protected region, make sure we dont leak
 421         * a norefcounted dst
 422         */
 423        skb_dst_force(skb);
 424
 425        spin_lock_irqsave(&list->lock, flags);
 426        skb->dropcount = atomic_read(&sk->sk_drops);
 427        __skb_queue_tail(list, skb);
 428        spin_unlock_irqrestore(&list->lock, flags);
 429
 430        if (!sock_flag(sk, SOCK_DEAD))
 431                sk->sk_data_ready(sk, skb_len);
 432        return 0;
 433}
 434EXPORT_SYMBOL(sock_queue_rcv_skb);
 435
 436int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 437{
 438        int rc = NET_RX_SUCCESS;
 439
 440        if (sk_filter(sk, skb))
 441                goto discard_and_relse;
 442
 443        skb->dev = NULL;
 444
 445        if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 446                atomic_inc(&sk->sk_drops);
 447                goto discard_and_relse;
 448        }
 449        if (nested)
 450                bh_lock_sock_nested(sk);
 451        else
 452                bh_lock_sock(sk);
 453        if (!sock_owned_by_user(sk)) {
 454                /*
 455                 * trylock + unlock semantics:
 456                 */
 457                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 458
 459                rc = sk_backlog_rcv(sk, skb);
 460
 461                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 462        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 463                bh_unlock_sock(sk);
 464                atomic_inc(&sk->sk_drops);
 465                goto discard_and_relse;
 466        }
 467
 468        bh_unlock_sock(sk);
 469out:
 470        sock_put(sk);
 471        return rc;
 472discard_and_relse:
 473        kfree_skb(skb);
 474        goto out;
 475}
 476EXPORT_SYMBOL(sk_receive_skb);
 477
 478struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 479{
 480        struct dst_entry *dst = __sk_dst_get(sk);
 481
 482        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 483                sk_tx_queue_clear(sk);
 484                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 485                dst_release(dst);
 486                return NULL;
 487        }
 488
 489        return dst;
 490}
 491EXPORT_SYMBOL(__sk_dst_check);
 492
 493struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 494{
 495        struct dst_entry *dst = sk_dst_get(sk);
 496
 497        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 498                sk_dst_reset(sk);
 499                dst_release(dst);
 500                return NULL;
 501        }
 502
 503        return dst;
 504}
 505EXPORT_SYMBOL(sk_dst_check);
 506
 507static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 508                                int optlen)
 509{
 510        int ret = -ENOPROTOOPT;
 511#ifdef CONFIG_NETDEVICES
 512        struct net *net = sock_net(sk);
 513        char devname[IFNAMSIZ];
 514        int index;
 515
 516        /* Sorry... */
 517        ret = -EPERM;
 518        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 519                goto out;
 520
 521        ret = -EINVAL;
 522        if (optlen < 0)
 523                goto out;
 524
 525        /* Bind this socket to a particular device like "eth0",
 526         * as specified in the passed interface name. If the
 527         * name is "" or the option length is zero the socket
 528         * is not bound.
 529         */
 530        if (optlen > IFNAMSIZ - 1)
 531                optlen = IFNAMSIZ - 1;
 532        memset(devname, 0, sizeof(devname));
 533
 534        ret = -EFAULT;
 535        if (copy_from_user(devname, optval, optlen))
 536                goto out;
 537
 538        index = 0;
 539        if (devname[0] != '\0') {
 540                struct net_device *dev;
 541
 542                rcu_read_lock();
 543                dev = dev_get_by_name_rcu(net, devname);
 544                if (dev)
 545                        index = dev->ifindex;
 546                rcu_read_unlock();
 547                ret = -ENODEV;
 548                if (!dev)
 549                        goto out;
 550        }
 551
 552        lock_sock(sk);
 553        sk->sk_bound_dev_if = index;
 554        sk_dst_reset(sk);
 555        release_sock(sk);
 556
 557        ret = 0;
 558
 559out:
 560#endif
 561
 562        return ret;
 563}
 564
 565static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 566                                int __user *optlen, int len)
 567{
 568        int ret = -ENOPROTOOPT;
 569#ifdef CONFIG_NETDEVICES
 570        struct net *net = sock_net(sk);
 571        char devname[IFNAMSIZ];
 572
 573        if (sk->sk_bound_dev_if == 0) {
 574                len = 0;
 575                goto zero;
 576        }
 577
 578        ret = -EINVAL;
 579        if (len < IFNAMSIZ)
 580                goto out;
 581
 582        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 583        if (ret)
 584                goto out;
 585
 586        len = strlen(devname) + 1;
 587
 588        ret = -EFAULT;
 589        if (copy_to_user(optval, devname, len))
 590                goto out;
 591
 592zero:
 593        ret = -EFAULT;
 594        if (put_user(len, optlen))
 595                goto out;
 596
 597        ret = 0;
 598
 599out:
 600#endif
 601
 602        return ret;
 603}
 604
 605static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 606{
 607        if (valbool)
 608                sock_set_flag(sk, bit);
 609        else
 610                sock_reset_flag(sk, bit);
 611}
 612
 613/*
 614 *      This is meant for all protocols to use and covers goings on
 615 *      at the socket level. Everything here is generic.
 616 */
 617
 618int sock_setsockopt(struct socket *sock, int level, int optname,
 619                    char __user *optval, unsigned int optlen)
 620{
 621        struct sock *sk = sock->sk;
 622        int val;
 623        int valbool;
 624        struct linger ling;
 625        int ret = 0;
 626
 627        /*
 628         *      Options without arguments
 629         */
 630
 631        if (optname == SO_BINDTODEVICE)
 632                return sock_setbindtodevice(sk, optval, optlen);
 633
 634        if (optlen < sizeof(int))
 635                return -EINVAL;
 636
 637        if (get_user(val, (int __user *)optval))
 638                return -EFAULT;
 639
 640        valbool = val ? 1 : 0;
 641
 642        lock_sock(sk);
 643
 644        switch (optname) {
 645        case SO_DEBUG:
 646                if (val && !capable(CAP_NET_ADMIN))
 647                        ret = -EACCES;
 648                else
 649                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 650                break;
 651        case SO_REUSEADDR:
 652                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 653                break;
 654        case SO_REUSEPORT:
 655                sk->sk_reuseport = valbool;
 656                break;
 657        case SO_TYPE:
 658        case SO_PROTOCOL:
 659        case SO_DOMAIN:
 660        case SO_ERROR:
 661                ret = -ENOPROTOOPT;
 662                break;
 663        case SO_DONTROUTE:
 664                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 665                break;
 666        case SO_BROADCAST:
 667                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 668                break;
 669        case SO_SNDBUF:
 670                /* Don't error on this BSD doesn't and if you think
 671                 * about it this is right. Otherwise apps have to
 672                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 673                 * are treated in BSD as hints
 674                 */
 675                val = min_t(u32, val, sysctl_wmem_max);
 676set_sndbuf:
 677                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 678                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 679                /* Wake up sending tasks if we upped the value. */
 680                sk->sk_write_space(sk);
 681                break;
 682
 683        case SO_SNDBUFFORCE:
 684                if (!capable(CAP_NET_ADMIN)) {
 685                        ret = -EPERM;
 686                        break;
 687                }
 688                goto set_sndbuf;
 689
 690        case SO_RCVBUF:
 691                /* Don't error on this BSD doesn't and if you think
 692                 * about it this is right. Otherwise apps have to
 693                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 694                 * are treated in BSD as hints
 695                 */
 696                val = min_t(u32, val, sysctl_rmem_max);
 697set_rcvbuf:
 698                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 699                /*
 700                 * We double it on the way in to account for
 701                 * "struct sk_buff" etc. overhead.   Applications
 702                 * assume that the SO_RCVBUF setting they make will
 703                 * allow that much actual data to be received on that
 704                 * socket.
 705                 *
 706                 * Applications are unaware that "struct sk_buff" and
 707                 * other overheads allocate from the receive buffer
 708                 * during socket buffer allocation.
 709                 *
 710                 * And after considering the possible alternatives,
 711                 * returning the value we actually used in getsockopt
 712                 * is the most desirable behavior.
 713                 */
 714                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 715                break;
 716
 717        case SO_RCVBUFFORCE:
 718                if (!capable(CAP_NET_ADMIN)) {
 719                        ret = -EPERM;
 720                        break;
 721                }
 722                goto set_rcvbuf;
 723
 724        case SO_KEEPALIVE:
 725#ifdef CONFIG_INET
 726                if (sk->sk_protocol == IPPROTO_TCP &&
 727                    sk->sk_type == SOCK_STREAM)
 728                        tcp_set_keepalive(sk, valbool);
 729#endif
 730                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 731                break;
 732
 733        case SO_OOBINLINE:
 734                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 735                break;
 736
 737        case SO_NO_CHECK:
 738                sk->sk_no_check = valbool;
 739                break;
 740
 741        case SO_PRIORITY:
 742                if ((val >= 0 && val <= 6) ||
 743                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 744                        sk->sk_priority = val;
 745                else
 746                        ret = -EPERM;
 747                break;
 748
 749        case SO_LINGER:
 750                if (optlen < sizeof(ling)) {
 751                        ret = -EINVAL;  /* 1003.1g */
 752                        break;
 753                }
 754                if (copy_from_user(&ling, optval, sizeof(ling))) {
 755                        ret = -EFAULT;
 756                        break;
 757                }
 758                if (!ling.l_onoff)
 759                        sock_reset_flag(sk, SOCK_LINGER);
 760                else {
 761#if (BITS_PER_LONG == 32)
 762                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 763                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 764                        else
 765#endif
 766                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 767                        sock_set_flag(sk, SOCK_LINGER);
 768                }
 769                break;
 770
 771        case SO_BSDCOMPAT:
 772                sock_warn_obsolete_bsdism("setsockopt");
 773                break;
 774
 775        case SO_PASSCRED:
 776                if (valbool)
 777                        set_bit(SOCK_PASSCRED, &sock->flags);
 778                else
 779                        clear_bit(SOCK_PASSCRED, &sock->flags);
 780                break;
 781
 782        case SO_TIMESTAMP:
 783        case SO_TIMESTAMPNS:
 784                if (valbool)  {
 785                        if (optname == SO_TIMESTAMP)
 786                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 787                        else
 788                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 789                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 790                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 791                } else {
 792                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 793                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 794                }
 795                break;
 796
 797        case SO_TIMESTAMPING:
 798                if (val & ~SOF_TIMESTAMPING_MASK) {
 799                        ret = -EINVAL;
 800                        break;
 801                }
 802                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 803                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
 804                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 805                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
 806                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 807                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
 808                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 809                        sock_enable_timestamp(sk,
 810                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 811                else
 812                        sock_disable_timestamp(sk,
 813                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 814                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 815                                  val & SOF_TIMESTAMPING_SOFTWARE);
 816                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 817                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
 818                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 819                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
 820                break;
 821
 822        case SO_RCVLOWAT:
 823                if (val < 0)
 824                        val = INT_MAX;
 825                sk->sk_rcvlowat = val ? : 1;
 826                break;
 827
 828        case SO_RCVTIMEO:
 829                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 830                break;
 831
 832        case SO_SNDTIMEO:
 833                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 834                break;
 835
 836        case SO_ATTACH_FILTER:
 837                ret = -EINVAL;
 838                if (optlen == sizeof(struct sock_fprog)) {
 839                        struct sock_fprog fprog;
 840
 841                        ret = -EFAULT;
 842                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 843                                break;
 844
 845                        ret = sk_attach_filter(&fprog, sk);
 846                }
 847                break;
 848
 849        case SO_DETACH_FILTER:
 850                ret = sk_detach_filter(sk);
 851                break;
 852
 853        case SO_LOCK_FILTER:
 854                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 855                        ret = -EPERM;
 856                else
 857                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 858                break;
 859
 860        case SO_PASSSEC:
 861                if (valbool)
 862                        set_bit(SOCK_PASSSEC, &sock->flags);
 863                else
 864                        clear_bit(SOCK_PASSSEC, &sock->flags);
 865                break;
 866        case SO_MARK:
 867                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 868                        ret = -EPERM;
 869                else
 870                        sk->sk_mark = val;
 871                break;
 872
 873                /* We implement the SO_SNDLOWAT etc to
 874                   not be settable (1003.1g 5.3) */
 875        case SO_RXQ_OVFL:
 876                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 877                break;
 878
 879        case SO_WIFI_STATUS:
 880                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 881                break;
 882
 883        case SO_PEEK_OFF:
 884                if (sock->ops->set_peek_off)
 885                        ret = sock->ops->set_peek_off(sk, val);
 886                else
 887                        ret = -EOPNOTSUPP;
 888                break;
 889
 890        case SO_NOFCS:
 891                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 892                break;
 893
 894        case SO_SELECT_ERR_QUEUE:
 895                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 896                break;
 897
 898#ifdef CONFIG_NET_RX_BUSY_POLL
 899        case SO_BUSY_POLL:
 900                /* allow unprivileged users to decrease the value */
 901                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 902                        ret = -EPERM;
 903                else {
 904                        if (val < 0)
 905                                ret = -EINVAL;
 906                        else
 907                                sk->sk_ll_usec = val;
 908                }
 909                break;
 910#endif
 911
 912        case SO_MAX_PACING_RATE:
 913                sk->sk_max_pacing_rate = val;
 914                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 915                                         sk->sk_max_pacing_rate);
 916                break;
 917
 918        default:
 919                ret = -ENOPROTOOPT;
 920                break;
 921        }
 922        release_sock(sk);
 923        return ret;
 924}
 925EXPORT_SYMBOL(sock_setsockopt);
 926
 927
 928void cred_to_ucred(struct pid *pid, const struct cred *cred,
 929                   struct ucred *ucred)
 930{
 931        ucred->pid = pid_vnr(pid);
 932        ucred->uid = ucred->gid = -1;
 933        if (cred) {
 934                struct user_namespace *current_ns = current_user_ns();
 935
 936                ucred->uid = from_kuid_munged(current_ns, cred->euid);
 937                ucred->gid = from_kgid_munged(current_ns, cred->egid);
 938        }
 939}
 940EXPORT_SYMBOL_GPL(cred_to_ucred);
 941
 942int sock_getsockopt(struct socket *sock, int level, int optname,
 943                    char __user *optval, int __user *optlen)
 944{
 945        struct sock *sk = sock->sk;
 946
 947        union {
 948                int val;
 949                struct linger ling;
 950                struct timeval tm;
 951        } v;
 952
 953        int lv = sizeof(int);
 954        int len;
 955
 956        if (get_user(len, optlen))
 957                return -EFAULT;
 958        if (len < 0)
 959                return -EINVAL;
 960
 961        memset(&v, 0, sizeof(v));
 962
 963        switch (optname) {
 964        case SO_DEBUG:
 965                v.val = sock_flag(sk, SOCK_DBG);
 966                break;
 967
 968        case SO_DONTROUTE:
 969                v.val = sock_flag(sk, SOCK_LOCALROUTE);
 970                break;
 971
 972        case SO_BROADCAST:
 973                v.val = sock_flag(sk, SOCK_BROADCAST);
 974                break;
 975
 976        case SO_SNDBUF:
 977                v.val = sk->sk_sndbuf;
 978                break;
 979
 980        case SO_RCVBUF:
 981                v.val = sk->sk_rcvbuf;
 982                break;
 983
 984        case SO_REUSEADDR:
 985                v.val = sk->sk_reuse;
 986                break;
 987
 988        case SO_REUSEPORT:
 989                v.val = sk->sk_reuseport;
 990                break;
 991
 992        case SO_KEEPALIVE:
 993                v.val = sock_flag(sk, SOCK_KEEPOPEN);
 994                break;
 995
 996        case SO_TYPE:
 997                v.val = sk->sk_type;
 998                break;
 999
1000        case SO_PROTOCOL:

1001                v.val = sk->sk_protocol;
1002                break;
1003
1004        case SO_DOMAIN:
1005                v.val = sk->sk_family;
1006                break;
1007
1008        case SO_ERROR:
1009                v.val = -sock_error(sk);
1010                if (v.val == 0)
1011                        v.val = xchg(&sk->sk_err_soft, 0);
1012                break;
1013
1014        case SO_OOBINLINE:
1015                v.val = sock_flag(sk, SOCK_URGINLINE);
1016                break;
1017
1018        case SO_NO_CHECK:
1019                v.val = sk->sk_no_check;
1020                break;
1021
1022        case SO_PRIORITY:
1023                v.val = sk->sk_priority;
1024                break;
1025
1026        case SO_LINGER:
1027                lv              = sizeof(v.ling);
1028                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1029                v.ling.l_linger = sk->sk_lingertime / HZ;
1030                break;
1031
1032        case SO_BSDCOMPAT:
1033                sock_warn_obsolete_bsdism("getsockopt");
1034                break;
1035
1036        case SO_TIMESTAMP:
1037                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1038                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1039                break;
1040
1041        case SO_TIMESTAMPNS:
1042                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1043                break;
1044
1045        case SO_TIMESTAMPING:
1046                v.val = 0;
1047                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1048                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1049                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1050                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1051                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1052                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1053                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1054                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1055                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1056                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
1057                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1058                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1059                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1060                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1061                break;
1062
1063        case SO_RCVTIMEO:
1064                lv = sizeof(struct timeval);
1065                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1066                        v.tm.tv_sec = 0;
1067                        v.tm.tv_usec = 0;
1068                } else {
1069                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1070                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1071                }
1072                break;
1073
1074        case SO_SNDTIMEO:
1075                lv = sizeof(struct timeval);
1076                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1077                        v.tm.tv_sec = 0;
1078                        v.tm.tv_usec = 0;
1079                } else {
1080                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1081                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1082                }
1083                break;
1084
1085        case SO_RCVLOWAT:
1086                v.val = sk->sk_rcvlowat;
1087                break;
1088
1089        case SO_SNDLOWAT:
1090                v.val = 1;
1091                break;
1092
1093        case SO_PASSCRED:
1094                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1095                break;
1096
1097        case SO_PEERCRED:
1098        {
1099                struct ucred peercred;
1100                if (len > sizeof(peercred))
1101                        len = sizeof(peercred);
1102                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1103                if (copy_to_user(optval, &peercred, len))
1104                        return -EFAULT;
1105                goto lenout;
1106        }
1107
1108        case SO_PEERNAME:
1109        {
1110                char address[128];
1111
1112                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1113                        return -ENOTCONN;
1114                if (lv < len)
1115                        return -EINVAL;
1116                if (copy_to_user(optval, address, len))
1117                        return -EFAULT;
1118                goto lenout;
1119        }
1120
1121        /* Dubious BSD thing... Probably nobody even uses it, but
1122         * the UNIX standard wants it for whatever reason... -DaveM
1123         */
1124        case SO_ACCEPTCONN:
1125                v.val = sk->sk_state == TCP_LISTEN;
1126                break;
1127
1128        case SO_PASSSEC:
1129                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1130                break;
1131
1132        case SO_PEERSEC:
1133                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1134
1135        case SO_MARK:
1136                v.val = sk->sk_mark;
1137                break;
1138
1139        case SO_RXQ_OVFL:
1140                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1141                break;
1142
1143        case SO_WIFI_STATUS:
1144                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1145                break;
1146
1147        case SO_PEEK_OFF:
1148                if (!sock->ops->set_peek_off)
1149                        return -EOPNOTSUPP;
1150
1151                v.val = sk->sk_peek_off;
1152                break;
1153        case SO_NOFCS:
1154                v.val = sock_flag(sk, SOCK_NOFCS);
1155                break;
1156
1157        case SO_BINDTODEVICE:
1158                return sock_getbindtodevice(sk, optval, optlen, len);
1159
1160        case SO_GET_FILTER:
1161                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1162                if (len < 0)
1163                        return len;
1164
1165                goto lenout;
1166
1167        case SO_LOCK_FILTER:
1168                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1169                break;
1170
1171        case SO_SELECT_ERR_QUEUE:
1172                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1173                break;
1174
1175#ifdef CONFIG_NET_RX_BUSY_POLL
1176        case SO_BUSY_POLL:
1177                v.val = sk->sk_ll_usec;
1178                break;
1179#endif
1180
1181        case SO_MAX_PACING_RATE:
1182                v.val = sk->sk_max_pacing_rate;
1183                break;
1184
1185        default:
1186                return -ENOPROTOOPT;
1187        }
1188
1189        if (len > lv)
1190                len = lv;
1191        if (copy_to_user(optval, &v, len))
1192                return -EFAULT;
1193lenout:
1194        if (put_user(len, optlen))
1195                return -EFAULT;
1196        return 0;
1197}
1198
1199/*
1200 * Initialize an sk_lock.
1201 *
1202 * (We also register the sk_lock with the lock validator.)
1203 */
1204static inline void sock_lock_init(struct sock *sk)
1205{
1206        sock_lock_init_class_and_name(sk,
1207                        af_family_slock_key_strings[sk->sk_family],
1208                        af_family_slock_keys + sk->sk_family,
1209                        af_family_key_strings[sk->sk_family],
1210                        af_family_keys + sk->sk_family);
1211}
1212
1213/*
1214 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1215 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1216 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1217 */
1218static void sock_copy(struct sock *nsk, const struct sock *osk)
1219{
1220#ifdef CONFIG_SECURITY_NETWORK
1221        void *sptr = nsk->sk_security;
1222#endif
1223        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1224
1225        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1226               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1227
1228#ifdef CONFIG_SECURITY_NETWORK
1229        nsk->sk_security = sptr;
1230        security_sk_clone(osk, nsk);
1231#endif
1232}
1233
1234void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1235{
1236        unsigned long nulls1, nulls2;
1237
1238        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1239        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1240        if (nulls1 > nulls2)
1241                swap(nulls1, nulls2);
1242
1243        if (nulls1 != 0)
1244                memset((char *)sk, 0, nulls1);
1245        memset((char *)sk + nulls1 + sizeof(void *), 0,
1246               nulls2 - nulls1 - sizeof(void *));
1247        memset((char *)sk + nulls2 + sizeof(void *), 0,
1248               size - nulls2 - sizeof(void *));
1249}
1250EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1251
1252static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1253                int family)
1254{
1255        struct sock *sk;
1256        struct kmem_cache *slab;
1257
1258        slab = prot->slab;
1259        if (slab != NULL) {
1260                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1261                if (!sk)
1262                        return sk;
1263                if (priority & __GFP_ZERO) {
1264                        if (prot->clear_sk)
1265                                prot->clear_sk(sk, prot->obj_size);
1266                        else
1267                                sk_prot_clear_nulls(sk, prot->obj_size);
1268                }
1269        } else
1270                sk = kmalloc(prot->obj_size, priority);
1271
1272        if (sk != NULL) {
1273                kmemcheck_annotate_bitfield(sk, flags);
1274
1275                if (security_sk_alloc(sk, family, priority))
1276                        goto out_free;
1277
1278                if (!try_module_get(prot->owner))
1279                        goto out_free_sec;
1280                sk_tx_queue_clear(sk);
1281        }
1282
1283        return sk;
1284
1285out_free_sec:
1286        security_sk_free(sk);
1287out_free:
1288        if (slab != NULL)
1289                kmem_cache_free(slab, sk);
1290        else
1291                kfree(sk);
1292        return NULL;
1293}
1294
1295static void sk_prot_free(struct proto *prot, struct sock *sk)
1296{
1297        struct kmem_cache *slab;
1298        struct module *owner;
1299
1300        owner = prot->owner;
1301        slab = prot->slab;
1302
1303        security_sk_free(sk);
1304        if (slab != NULL)
1305                kmem_cache_free(slab, sk);
1306        else
1307                kfree(sk);
1308        module_put(owner);
1309}
1310
1311#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1312void sock_update_classid(struct sock *sk)
1313{
1314        u32 classid;
1315
1316        classid = task_cls_classid(current);
1317        if (classid != sk->sk_classid)
1318                sk->sk_classid = classid;
1319}
1320EXPORT_SYMBOL(sock_update_classid);
1321#endif
1322
1323#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1324void sock_update_netprioidx(struct sock *sk)
1325{
1326        if (in_interrupt())
1327                return;
1328
1329        sk->sk_cgrp_prioidx = task_netprioidx(current);
1330}
1331EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1332#endif
1333
1334/**
1335 *      sk_alloc - All socket objects are allocated here
1336 *      @net: the applicable net namespace
1337 *      @family: protocol family
1338 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1339 *      @prot: struct proto associated with this new sock instance
1340 */
1341struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1342                      struct proto *prot)
1343{
1344        struct sock *sk;
1345
1346        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1347        if (sk) {
1348                sk->sk_family = family;
1349                /*
1350                 * See comment in struct sock definition to understand
1351                 * why we need sk_prot_creator -acme
1352                 */
1353                sk->sk_prot = sk->sk_prot_creator = prot;
1354                sock_lock_init(sk);
1355                sock_net_set(sk, get_net(net));
1356                atomic_set(&sk->sk_wmem_alloc, 1);
1357
1358                sock_update_classid(sk);
1359                sock_update_netprioidx(sk);
1360        }
1361
1362        return sk;
1363}
1364EXPORT_SYMBOL(sk_alloc);
1365
1366static void __sk_free(struct sock *sk)
1367{
1368        struct sk_filter *filter;
1369
1370        if (sk->sk_destruct)
1371                sk->sk_destruct(sk);
1372
1373        filter = rcu_dereference_check(sk->sk_filter,
1374                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1375        if (filter) {
1376                sk_filter_uncharge(sk, filter);
1377                RCU_INIT_POINTER(sk->sk_filter, NULL);
1378        }
1379
1380        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1381
1382        if (atomic_read(&sk->sk_omem_alloc))
1383                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1384                         __func__, atomic_read(&sk->sk_omem_alloc));
1385
1386        if (sk->sk_peer_cred)
1387                put_cred(sk->sk_peer_cred);
1388        put_pid(sk->sk_peer_pid);
1389        put_net(sock_net(sk));
1390        sk_prot_free(sk->sk_prot_creator, sk);
1391}
1392
1393void sk_free(struct sock *sk)
1394{
1395        /*
1396         * We subtract one from sk_wmem_alloc and can know if
1397         * some packets are still in some tx queue.
1398         * If not null, sock_wfree() will call __sk_free(sk) later
1399         */
1400        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1401                __sk_free(sk);
1402}
1403EXPORT_SYMBOL(sk_free);
1404
1405/*
1406 * Last sock_put should drop reference to sk->sk_net. It has already
1407 * been dropped in sk_change_net. Taking reference to stopping namespace
1408 * is not an option.
1409 * Take reference to a socket to remove it from hash _alive_ and after that
1410 * destroy it in the context of init_net.
1411 */
1412void sk_release_kernel(struct sock *sk)
1413{
1414        if (sk == NULL || sk->sk_socket == NULL)
1415                return;
1416
1417        sock_hold(sk);
1418        sock_release(sk->sk_socket);
1419        release_net(sock_net(sk));
1420        sock_net_set(sk, get_net(&init_net));
1421        sock_put(sk);
1422}
1423EXPORT_SYMBOL(sk_release_kernel);
1424
1425static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1426{
1427        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1428                sock_update_memcg(newsk);
1429}
1430
1431/**
1432 *      sk_clone_lock - clone a socket, and lock its clone
1433 *      @sk: the socket to clone
1434 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1435 *
1436 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1437 */
1438struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1439{
1440        struct sock *newsk;
1441
1442        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1443        if (newsk != NULL) {
1444                struct sk_filter *filter;
1445
1446                sock_copy(newsk, sk);
1447
1448                /* SANITY */
1449                get_net(sock_net(newsk));
1450                sk_node_init(&newsk->sk_node);
1451                sock_lock_init(newsk);
1452                bh_lock_sock(newsk);
1453                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1454                newsk->sk_backlog.len = 0;
1455
1456                atomic_set(&newsk->sk_rmem_alloc, 0);
1457                /*
1458                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1459                 */
1460                atomic_set(&newsk->sk_wmem_alloc, 1);
1461                atomic_set(&newsk->sk_omem_alloc, 0);
1462                skb_queue_head_init(&newsk->sk_receive_queue);
1463                skb_queue_head_init(&newsk->sk_write_queue);
1464#ifdef CONFIG_NET_DMA
1465                skb_queue_head_init(&newsk->sk_async_wait_queue);
1466#endif
1467
1468                spin_lock_init(&newsk->sk_dst_lock);
1469                rwlock_init(&newsk->sk_callback_lock);
1470                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1471                                af_callback_keys + newsk->sk_family,
1472                                af_family_clock_key_strings[newsk->sk_family]);
1473
1474                newsk->sk_dst_cache     = NULL;
1475                newsk->sk_wmem_queued   = 0;
1476                newsk->sk_forward_alloc = 0;
1477                newsk->sk_send_head     = NULL;
1478                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1479
1480                sock_reset_flag(newsk, SOCK_DONE);
1481                skb_queue_head_init(&newsk->sk_error_queue);
1482
1483                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1484                if (filter != NULL)
1485                        sk_filter_charge(newsk, filter);
1486
1487                if (unlikely(xfrm_sk_clone_policy(newsk))) {
1488                        /* It is still raw copy of parent, so invalidate
1489                         * destructor and make plain sk_free() */
1490                        newsk->sk_destruct = NULL;
1491                        bh_unlock_sock(newsk);
1492                        sk_free(newsk);
1493                        newsk = NULL;
1494                        goto out;
1495                }
1496
1497                newsk->sk_err      = 0;
1498                newsk->sk_priority = 0;
1499                /*
1500                 * Before updating sk_refcnt, we must commit prior changes to memory
1501                 * (Documentation/RCU/rculist_nulls.txt for details)
1502                 */
1503                smp_wmb();
1504                atomic_set(&newsk->sk_refcnt, 2);
1505
1506                /*
1507                 * Increment the counter in the same struct proto as the master
1508                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1509                 * is the same as sk->sk_prot->socks, as this field was copied
1510                 * with memcpy).
1511                 *
1512                 * This _changes_ the previous behaviour, where
1513                 * tcp_create_openreq_child always was incrementing the
1514                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1515                 * to be taken into account in all callers. -acme
1516                 */
1517                sk_refcnt_debug_inc(newsk);
1518                sk_set_socket(newsk, NULL);
1519                newsk->sk_wq = NULL;
1520
1521                sk_update_clone(sk, newsk);
1522
1523                if (newsk->sk_prot->sockets_allocated)
1524                        sk_sockets_allocated_inc(newsk);
1525
1526                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1527                        net_enable_timestamp();
1528        }
1529out:
1530        return newsk;
1531}
1532EXPORT_SYMBOL_GPL(sk_clone_lock);
1533
1534void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1535{
1536        __sk_dst_set(sk, dst);
1537        sk->sk_route_caps = dst->dev->features;
1538        if (sk->sk_route_caps & NETIF_F_GSO)
1539                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1540        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1541        if (sk_can_gso(sk)) {
1542                if (dst->header_len) {
1543                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1544                } else {
1545                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1546                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1547                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1548                }
1549        }
1550}
1551EXPORT_SYMBOL_GPL(sk_setup_caps);
1552
1553/*
1554 *      Simple resource managers for sockets.
1555 */
1556
1557
1558/*
1559 * Write buffer destructor automatically called from kfree_skb.
1560 */
1561void sock_wfree(struct sk_buff *skb)
1562{
1563        struct sock *sk = skb->sk;
1564        unsigned int len = skb->truesize;
1565
1566        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1567                /*
1568                 * Keep a reference on sk_wmem_alloc, this will be released
1569                 * after sk_write_space() call
1570                 */
1571                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1572                sk->sk_write_space(sk);
1573                len = 1;
1574        }
1575        /*
1576         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1577         * could not do because of in-flight packets
1578         */
1579        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1580                __sk_free(sk);
1581}
1582EXPORT_SYMBOL(sock_wfree);
1583
1584void skb_orphan_partial(struct sk_buff *skb)
1585{
1586        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1587         * so we do not completely orphan skb, but transfert all
1588         * accounted bytes but one, to avoid unexpected reorders.
1589         */
1590        if (skb->destructor == sock_wfree
1591#ifdef CONFIG_INET
1592            || skb->destructor == tcp_wfree
1593#endif
1594                ) {
1595                atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1596                skb->truesize = 1;
1597        } else {
1598                skb_orphan(skb);
1599        }
1600}
1601EXPORT_SYMBOL(skb_orphan_partial);
1602
1603/*
1604 * Read buffer destructor automatically called from kfree_skb.
1605 */
1606void sock_rfree(struct sk_buff *skb)
1607{
1608        struct sock *sk = skb->sk;
1609        unsigned int len = skb->truesize;
1610
1611        atomic_sub(len, &sk->sk_rmem_alloc);
1612        sk_mem_uncharge(sk, len);
1613}
1614EXPORT_SYMBOL(sock_rfree);
1615
1616void sock_edemux(struct sk_buff *skb)
1617{
1618        struct sock *sk = skb->sk;
1619
1620#ifdef CONFIG_INET
1621        if (sk->sk_state == TCP_TIME_WAIT)
1622                inet_twsk_put(inet_twsk(sk));
1623        else
1624#endif
1625                sock_put(sk);
1626}
1627EXPORT_SYMBOL(sock_edemux);
1628
1629kuid_t sock_i_uid(struct sock *sk)
1630{
1631        kuid_t uid;
1632
1633        read_lock_bh(&sk->sk_callback_lock);
1634        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1635        read_unlock_bh(&sk->sk_callback_lock);
1636        return uid;
1637}
1638EXPORT_SYMBOL(sock_i_uid);
1639
1640unsigned long sock_i_ino(struct sock *sk)
1641{
1642        unsigned long ino;
1643
1644        read_lock_bh(&sk->sk_callback_lock);
1645        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1646        read_unlock_bh(&sk->sk_callback_lock);
1647        return ino;
1648}
1649EXPORT_SYMBOL(sock_i_ino);
1650
1651/*
1652 * Allocate a skb from the socket's send buffer.
1653 */
1654struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1655                             gfp_t priority)
1656{
1657        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1658                struct sk_buff *skb = alloc_skb(size, priority);
1659                if (skb) {
1660                        skb_set_owner_w(skb, sk);
1661                        return skb;
1662                }
1663        }
1664        return NULL;
1665}
1666EXPORT_SYMBOL(sock_wmalloc);
1667
1668/*
1669 * Allocate a skb from the socket's receive buffer.
1670 */
1671struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1672                             gfp_t priority)
1673{
1674        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1675                struct sk_buff *skb = alloc_skb(size, priority);
1676                if (skb) {
1677                        skb_set_owner_r(skb, sk);
1678                        return skb;
1679                }
1680        }
1681        return NULL;
1682}
1683
1684/*
1685 * Allocate a memory block from the socket's option memory buffer.
1686 */
1687void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1688{
1689        if ((unsigned int)size <= sysctl_optmem_max &&
1690            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1691                void *mem;
1692                /* First do the add, to avoid the race if kmalloc
1693                 * might sleep.
1694                 */
1695                atomic_add(size, &sk->sk_omem_alloc);
1696                mem = kmalloc(size, priority);
1697                if (mem)
1698                        return mem;
1699                atomic_sub(size, &sk->sk_omem_alloc);
1700        }
1701        return NULL;
1702}
1703EXPORT_SYMBOL(sock_kmalloc);
1704
1705/*
1706 * Free an option memory block.
1707 */
1708void sock_kfree_s(struct sock *sk, void *mem, int size)
1709{
1710        kfree(mem);
1711        atomic_sub(size, &sk->sk_omem_alloc);
1712}
1713EXPORT_SYMBOL(sock_kfree_s);
1714
1715/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1716   I think, these locks should be removed for datagram sockets.
1717 */
1718static long sock_wait_for_wmem(struct sock *sk, long timeo)
1719{
1720        DEFINE_WAIT(wait);
1721
1722        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1723        for (;;) {
1724                if (!timeo)
1725                        break;
1726                if (signal_pending(current))
1727                        break;
1728                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1729                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1730                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1731                        break;
1732                if (sk->sk_shutdown & SEND_SHUTDOWN)
1733                        break;
1734                if (sk->sk_err)
1735                        break;
1736                timeo = schedule_timeout(timeo);
1737        }
1738        finish_wait(sk_sleep(sk), &wait);
1739        return timeo;
1740}
1741
1742
1743/*
1744 *      Generic send/receive buffer handlers
1745 */
1746
1747struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1748                                     unsigned long data_len, int noblock,
1749                                     int *errcode, int max_page_order)
1750{
1751        struct sk_buff *skb = NULL;
1752        unsigned long chunk;
1753        gfp_t gfp_mask;
1754        long timeo;
1755        int err;
1756        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1757        struct page *page;
1758        int i;
1759
1760        err = -EMSGSIZE;
1761        if (npages > MAX_SKB_FRAGS)
1762                goto failure;
1763
1764        timeo = sock_sndtimeo(sk, noblock);
1765        while (!skb) {
1766                err = sock_error(sk);
1767                if (err != 0)
1768                        goto failure;
1769
1770                err = -EPIPE;
1771                if (sk->sk_shutdown & SEND_SHUTDOWN)
1772                        goto failure;
1773
1774                if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1775                        set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1776                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1777                        err = -EAGAIN;
1778                        if (!timeo)
1779                                goto failure;
1780                        if (signal_pending(current))
1781                                goto interrupted;
1782                        timeo = sock_wait_for_wmem(sk, timeo);
1783                        continue;
1784                }
1785
1786                err = -ENOBUFS;
1787                gfp_mask = sk->sk_allocation;
1788                if (gfp_mask & __GFP_WAIT)
1789                        gfp_mask |= __GFP_REPEAT;
1790
1791                skb = alloc_skb(header_len, gfp_mask);
1792                if (!skb)
1793                        goto failure;
1794
1795                skb->truesize += data_len;
1796
1797                for (i = 0; npages > 0; i++) {
1798                        int order = max_page_order;
1799
1800                        while (order) {
1801                                if (npages >= 1 << order) {
1802                                        page = alloc_pages(sk->sk_allocation |
1803                                                           __GFP_COMP | __GFP_NOWARN,
1804                                                           order);
1805                                        if (page)
1806                                                goto fill_page;
1807                                }
1808                                order--;
1809                        }
1810                        page = alloc_page(sk->sk_allocation);
1811                        if (!page)
1812                                goto failure;
1813fill_page:
1814                        chunk = min_t(unsigned long, data_len,
1815                                      PAGE_SIZE << order);
1816                        skb_fill_page_desc(skb, i, page, 0, chunk);
1817                        data_len -= chunk;
1818                        npages -= 1 << order;
1819                }
1820        }
1821
1822        skb_set_owner_w(skb, sk);
1823        return skb;
1824
1825interrupted:
1826        err = sock_intr_errno(timeo);
1827failure:
1828        kfree_skb(skb);
1829        *errcode = err;
1830        return NULL;
1831}
1832EXPORT_SYMBOL(sock_alloc_send_pskb);
1833
/*
 * Convenience wrapper around sock_alloc_send_pskb() for callers that
 * want a purely linear skb: @size is used as the header length, no
 * paged data is requested and the maximum page order is 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long paged_len = 0;
	const int page_order = 0;

	return sock_alloc_send_pskb(sk, size, paged_len, noblock, errcode,
				    page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
1840
1841/* On 32bit arches, an skb frag is limited to 2^15 */
1842#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1843
1844/**
1845 * skb_page_frag_refill - check that a page_frag contains enough room
1846 * @sz: minimum size of the fragment we want to get
1847 * @pfrag: pointer to page_frag
1848 * @prio: priority for memory allocation
1849 *
1850 * Note: While this allocator tries to use high order pages, there is
1851 * no guarantee that allocations succeed. Therefore, @sz MUST be
1852 * less or equal than PAGE_SIZE.
1853 */
1854bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
1855{
1856        int order;
1857
1858        if (pfrag->page) {
1859                if (atomic_read(&pfrag->page->_count) == 1) {
1860                        pfrag->offset = 0;
1861                        return true;
1862                }
1863                if (pfrag->offset + sz <= pfrag->size)
1864                        return true;
1865                put_page(pfrag->page);
1866        }
1867
1868        /* We restrict high order allocations to users that can afford to wait */
1869        order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1870
1871        do {
1872                gfp_t gfp = prio;
1873
1874                if (order)
1875                        gfp |= __GFP_COMP | __GFP_NOWARN;
1876                pfrag->page = alloc_pages(gfp, order);
1877                if (likely(pfrag->page)) {
1878                        pfrag->offset = 0;
1879                        pfrag->size = PAGE_SIZE << order;
1880                        return true;
1881                }
1882        } while (--order >= 0);
1883
1884        return false;
1885}
1886EXPORT_SYMBOL(skb_page_frag_refill);
1887
1888bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1889{
1890        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1891                return true;
1892
1893        sk_enter_memory_pressure(sk);
1894        sk_stream_moderate_sndbuf(sk);
1895        return false;
1896}
1897EXPORT_SYMBOL(sk_page_frag_refill);
1898
1899static void __lock_sock(struct sock *sk)
1900        __releases(&sk->sk_lock.slock)
1901        __acquires(&sk->sk_lock.slock)
1902{
1903        DEFINE_WAIT(wait);
1904
1905        for (;;) {
1906                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1907                                        TASK_UNINTERRUPTIBLE);
1908                spin_unlock_bh(&sk->sk_lock.slock);
1909                schedule();
1910                spin_lock_bh(&sk->sk_lock.slock);
1911                if (!sock_owned_by_user(sk))
1912                        break;
1913        }
1914        finish_wait(&sk->sk_lock.wq, &wait);
1915}
1916
1917static void __release_sock(struct sock *sk)
1918        __releases(&sk->sk_lock.slock)
1919        __acquires(&sk->sk_lock.slock)
1920{
1921        struct sk_buff *skb = sk->sk_backlog.head;
1922
1923        do {
1924                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1925                bh_unlock_sock(sk);
1926
1927                do {
1928                        struct sk_buff *next = skb->next;
1929
1930                        prefetch(next);
1931                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1932                        skb->next = NULL;
1933                        sk_backlog_rcv(sk, skb);
1934
1935                        /*
1936                         * We are in process context here with softirqs
1937                         * disabled, use cond_resched_softirq() to preempt.
1938                         * This is safe to do because we've taken the backlog
1939                         * queue private:
1940                         */
1941                        cond_resched_softirq();
1942
1943                        skb = next;
1944                } while (skb != NULL);
1945
1946                bh_lock_sock(sk);
1947        } while ((skb = sk->sk_backlog.head) != NULL);
1948
1949        /*
1950         * Doing the zeroing here guarantee we can not loop forever
1951         * while a wild producer attempts to flood us.
1952         */
1953        sk->sk_backlog.len = 0;
1954}
1955
1956/**
1957 * sk_wait_data - wait for data to arrive at sk_receive_queue
1958 * @sk:    sock to wait on
1959 * @timeo: for how long
1960 *
1961 * Now socket state including sk->sk_err is changed only under lock,
1962 * hence we may omit checks after joining wait queue.
1963 * We check receive queue before schedule() only as optimization;
1964 * it is very likely that release_sock() added new data.
1965 */
1966int sk_wait_data(struct sock *sk, long *timeo)
1967{
1968        int rc;
1969        DEFINE_WAIT(wait);
1970
1971        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1972        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1973        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1974        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1975        finish_wait(sk_sleep(sk), &wait);
1976        return rc;
1977}
1978EXPORT_SYMBOL(sk_wait_data);
1979
1980/**
1981 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1982 *      @sk: socket
1983 *      @size: memory size to allocate
1984 *      @kind: allocation type
1985 *
1986 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1987 *      rmem allocation. This function assumes that protocols which have
1988 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1989 */
1990int __sk_mem_schedule(struct sock *sk, int size, int kind)
1991{
1992        struct proto *prot = sk->sk_prot;
1993        int amt = sk_mem_pages(size);
1994        long allocated;
1995        int parent_status = UNDER_LIMIT;
1996
1997        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1998
1999        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2000

2001        /* Under limit. */
2002        if (parent_status == UNDER_LIMIT &&
2003                        allocated <= sk_prot_mem_limits(sk, 0)) {
2004                sk_leave_memory_pressure(sk);
2005                return 1;
2006        }
2007
2008        /* Under pressure. (we or our parents) */
2009        if ((parent_status > SOFT_LIMIT) ||
2010                        allocated > sk_prot_mem_limits(sk, 1))
2011                sk_enter_memory_pressure(sk);
2012
2013        /* Over hard limit (we or our parents) */
2014        if ((parent_status == OVER_LIMIT) ||
2015                        (allocated > sk_prot_mem_limits(sk, 2)))
2016                goto suppress_allocation;
2017
2018        /* guarantee minimum buffer size under pressure */
2019        if (kind == SK_MEM_RECV) {
2020                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2021                        return 1;
2022
2023        } else { /* SK_MEM_SEND */
2024                if (sk->sk_type == SOCK_STREAM) {
2025                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2026                                return 1;
2027                } else if (atomic_read(&sk->sk_wmem_alloc) <
2028                           prot->sysctl_wmem[0])
2029                                return 1;
2030        }
2031
2032        if (sk_has_memory_pressure(sk)) {
2033                int alloc;
2034
2035                if (!sk_under_memory_pressure(sk))
2036                        return 1;
2037                alloc = sk_sockets_allocated_read_positive(sk);
2038                if (sk_prot_mem_limits(sk, 2) > alloc *
2039                    sk_mem_pages(sk->sk_wmem_queued +
2040                                 atomic_read(&sk->sk_rmem_alloc) +
2041                                 sk->sk_forward_alloc))
2042                        return 1;
2043        }
2044
2045suppress_allocation:
2046
2047        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2048                sk_stream_moderate_sndbuf(sk);
2049
2050                /* Fail only if socket is _under_ its sndbuf.
2051                 * In this case we cannot block, so that we have to fail.
2052                 */
2053                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2054                        return 1;
2055        }
2056
2057        trace_sock_exceed_buf_limit(sk, prot, allocated);
2058
2059        /* Alas. Undo changes. */
2060        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2061
2062        sk_memory_allocated_sub(sk, amt);
2063
2064        return 0;
2065}
2066EXPORT_SYMBOL(__sk_mem_schedule);
2067
2068/**
2069 *      __sk_reclaim - reclaim memory_allocated
2070 *      @sk: socket
2071 */
2072void __sk_mem_reclaim(struct sock *sk)
2073{
2074        sk_memory_allocated_sub(sk,
2075                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2076        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2077
2078        if (sk_under_memory_pressure(sk) &&
2079            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2080                sk_leave_memory_pressure(sk);
2081}
2082EXPORT_SYMBOL(__sk_mem_reclaim);
2083
2084
2085/*
2086 * Set of default routines for initialising struct proto_ops when
2087 * the protocol does not support a particular function. In certain
2088 * cases where it makes no sense for a protocol to have a "do nothing"
2089 * function, some default processing is provided.
2090 */
2091
2092int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2093{
2094        return -EOPNOTSUPP;
2095}
2096EXPORT_SYMBOL(sock_no_bind);
2097
2098int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2099                    int len, int flags)
2100{
2101        return -EOPNOTSUPP;
2102}
2103EXPORT_SYMBOL(sock_no_connect);
2104
2105int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2106{
2107        return -EOPNOTSUPP;
2108}
2109EXPORT_SYMBOL(sock_no_socketpair);
2110
2111int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2112{
2113        return -EOPNOTSUPP;
2114}
2115EXPORT_SYMBOL(sock_no_accept);
2116
2117int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2118                    int *len, int peer)
2119{
2120        return -EOPNOTSUPP;
2121}
2122EXPORT_SYMBOL(sock_no_getname);
2123
2124unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2125{
2126        return 0;
2127}
2128EXPORT_SYMBOL(sock_no_poll);
2129
2130int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2131{
2132        return -EOPNOTSUPP;
2133}
2134EXPORT_SYMBOL(sock_no_ioctl);
2135
2136int sock_no_listen(struct socket *sock, int backlog)
2137{
2138        return -EOPNOTSUPP;
2139}
2140EXPORT_SYMBOL(sock_no_listen);
2141
2142int sock_no_shutdown(struct socket *sock, int how)
2143{
2144        return -EOPNOTSUPP;
2145}
2146EXPORT_SYMBOL(sock_no_shutdown);
2147
2148int sock_no_setsockopt(struct socket *sock, int level, int optname,
2149                    char __user *optval, unsigned int optlen)
2150{
2151        return -EOPNOTSUPP;
2152}
2153EXPORT_SYMBOL(sock_no_setsockopt);
2154
2155int sock_no_getsockopt(struct socket *sock, int level, int optname,
2156                    char __user *optval, int __user *optlen)
2157{
2158        return -EOPNOTSUPP;
2159}
2160EXPORT_SYMBOL(sock_no_getsockopt);
2161
2162int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2163                    size_t len)
2164{
2165        return -EOPNOTSUPP;
2166}
2167EXPORT_SYMBOL(sock_no_sendmsg);
2168
2169int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2170                    size_t len, int flags)
2171{
2172        return -EOPNOTSUPP;
2173}
2174EXPORT_SYMBOL(sock_no_recvmsg);
2175
2176int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2177{
2178        /* Mirror missing mmap method error code */
2179        return -ENODEV;
2180}
2181EXPORT_SYMBOL(sock_no_mmap);
2182
2183ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2184{
2185        ssize_t res;
2186        struct msghdr msg = {.msg_flags = flags};
2187        struct kvec iov;
2188        char *kaddr = kmap(page);
2189        iov.iov_base = kaddr + offset;
2190        iov.iov_len = size;
2191        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2192        kunmap(page);
2193        return res;
2194}
2195EXPORT_SYMBOL(sock_no_sendpage);
2196
2197/*
2198 *      Default Socket Callbacks
2199 */
2200
2201static void sock_def_wakeup(struct sock *sk)
2202{
2203        struct socket_wq *wq;
2204
2205        rcu_read_lock();
2206        wq = rcu_dereference(sk->sk_wq);
2207        if (wq_has_sleeper(wq))
2208                wake_up_interruptible_all(&wq->wait);
2209        rcu_read_unlock();
2210}
2211
2212static void sock_def_error_report(struct sock *sk)
2213{
2214        struct socket_wq *wq;
2215
2216        rcu_read_lock();
2217        wq = rcu_dereference(sk->sk_wq);
2218        if (wq_has_sleeper(wq))
2219                wake_up_interruptible_poll(&wq->wait, POLLERR);
2220        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2221        rcu_read_unlock();
2222}
2223
2224static void sock_def_readable(struct sock *sk, int len)
2225{
2226        struct socket_wq *wq;
2227
2228        rcu_read_lock();
2229        wq = rcu_dereference(sk->sk_wq);
2230        if (wq_has_sleeper(wq))
2231                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2232                                                POLLRDNORM | POLLRDBAND);
2233        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2234        rcu_read_unlock();
2235}
2236
2237static void sock_def_write_space(struct sock *sk)
2238{
2239        struct socket_wq *wq;
2240
2241        rcu_read_lock();
2242
2243        /* Do not wake up a writer until he can make "significant"
2244         * progress.  --DaveM
2245         */
2246        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2247                wq = rcu_dereference(sk->sk_wq);
2248                if (wq_has_sleeper(wq))
2249                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2250                                                POLLWRNORM | POLLWRBAND);
2251
2252                /* Should agree with poll, otherwise some programs break */
2253                if (sock_writeable(sk))
2254                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2255        }
2256
2257        rcu_read_unlock();
2258}
2259
2260static void sock_def_destruct(struct sock *sk)
2261{
2262        kfree(sk->sk_protinfo);
2263}
2264
2265void sk_send_sigurg(struct sock *sk)
2266{
2267        if (sk->sk_socket && sk->sk_socket->file)
2268                if (send_sigurg(&sk->sk_socket->file->f_owner))
2269                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2270}
2271EXPORT_SYMBOL(sk_send_sigurg);
2272
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken when
 * mod_timer() returns 0.
 * NOTE(review): mod_timer() presumably returns 0 when the timer was not
 * already pending, i.e. the reference is held on behalf of a newly armed
 * timer - confirm against the timer API.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int mod_ret = mod_timer(timer, expires);

	if (!mod_ret)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2280
/*
 * Cancel @timer.  When del_timer() returns non-zero, the reference on
 * @sk is dropped with __sock_put() (counterpart of sk_reset_timer()).
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int del_ret = del_timer(timer);

	if (del_ret)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2287
2288void sock_init_data(struct socket *sock, struct sock *sk)
2289{
2290        skb_queue_head_init(&sk->sk_receive_queue);
2291        skb_queue_head_init(&sk->sk_write_queue);
2292        skb_queue_head_init(&sk->sk_error_queue);
2293#ifdef CONFIG_NET_DMA
2294        skb_queue_head_init(&sk->sk_async_wait_queue);
2295#endif
2296
2297        sk->sk_send_head        =       NULL;
2298
2299        init_timer(&sk->sk_timer);
2300
2301        sk->sk_allocation       =       GFP_KERNEL;
2302        sk->sk_rcvbuf           =       sysctl_rmem_default;
2303        sk->sk_sndbuf           =       sysctl_wmem_default;
2304        sk->sk_state            =       TCP_CLOSE;
2305        sk_set_socket(sk, sock);
2306
2307        sock_set_flag(sk, SOCK_ZAPPED);
2308
2309        if (sock) {
2310                sk->sk_type     =       sock->type;
2311                sk->sk_wq       =       sock->wq;
2312                sock->sk        =       sk;
2313        } else
2314                sk->sk_wq       =       NULL;
2315
2316        spin_lock_init(&sk->sk_dst_lock);
2317        rwlock_init(&sk->sk_callback_lock);
2318        lockdep_set_class_and_name(&sk->sk_callback_lock,
2319                        af_callback_keys + sk->sk_family,
2320                        af_family_clock_key_strings[sk->sk_family]);
2321
2322        sk->sk_state_change     =       sock_def_wakeup;
2323        sk->sk_data_ready       =       sock_def_readable;
2324        sk->sk_write_space      =       sock_def_write_space;
2325        sk->sk_error_report     =       sock_def_error_report;
2326        sk->sk_destruct         =       sock_def_destruct;
2327
2328        sk->sk_frag.page        =       NULL;
2329        sk->sk_frag.offset      =       0;
2330        sk->sk_peek_off         =       -1;
2331
2332        sk->sk_peer_pid         =       NULL;
2333        sk->sk_peer_cred        =       NULL;
2334        sk->sk_write_pending    =       0;
2335        sk->sk_rcvlowat         =       1;
2336        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2337        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2338
2339        sk->sk_stamp = ktime_set(-1L, 0);
2340
2341#ifdef CONFIG_NET_RX_BUSY_POLL
2342        sk->sk_napi_id          =       0;
2343        sk->sk_ll_usec          =       sysctl_net_busy_read;
2344#endif
2345
2346        sk->sk_max_pacing_rate = ~0U;
2347        sk->sk_pacing_rate = ~0U;
2348        /*
2349         * Before updating sk_refcnt, we must commit prior changes to memory
2350         * (Documentation/RCU/rculist_nulls.txt for details)
2351         */
2352        smp_wmb();
2353        atomic_set(&sk->sk_refcnt, 1);
2354        atomic_set(&sk->sk_drops, 0);
2355}
2356EXPORT_SYMBOL(sock_init_data);
2357
2358void lock_sock_nested(struct sock *sk, int subclass)
2359{
2360        might_sleep();
2361        spin_lock_bh(&sk->sk_lock.slock);
2362        if (sk->sk_lock.owned)
2363                __lock_sock(sk);
2364        sk->sk_lock.owned = 1;
2365        spin_unlock(&sk->sk_lock.slock);
2366        /*
2367         * The sk_lock has mutex_lock() semantics here:
2368         */
2369        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2370        local_bh_enable();
2371}
2372EXPORT_SYMBOL(lock_sock_nested);
2373
2374void release_sock(struct sock *sk)
2375{
2376        /*
2377         * The sk_lock has mutex_unlock() semantics:
2378         */
2379        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2380
2381        spin_lock_bh(&sk->sk_lock.slock);
2382        if (sk->sk_backlog.tail)
2383                __release_sock(sk);
2384
2385        if (sk->sk_prot->release_cb)
2386                sk->sk_prot->release_cb(sk);
2387
2388        sk->sk_lock.owned = 0;
2389        if (waitqueue_active(&sk->sk_lock.wq))
2390                wake_up(&sk->sk_lock.wq);
2391        spin_unlock_bh(&sk->sk_lock.slock);
2392}
2393EXPORT_SYMBOL(release_sock);
2394
2395/**
2396 * lock_sock_fast - fast version of lock_sock
2397 * @sk: socket
2398 *
2399 * This version should be used for very small section, where process wont block
2400 * return false if fast path is taken
2401 *   sk_lock.slock locked, owned = 0, BH disabled
2402 * return true if slow path is taken
2403 *   sk_lock.slock unlocked, owned = 1, BH enabled
2404 */
2405bool lock_sock_fast(struct sock *sk)
2406{
2407        might_sleep();
2408        spin_lock_bh(&sk->sk_lock.slock);
2409
2410        if (!sk->sk_lock.owned)
2411                /*
2412                 * Note : We must disable BH
2413                 */
2414                return false;
2415
2416        __lock_sock(sk);
2417        sk->sk_lock.owned = 1;
2418        spin_unlock(&sk->sk_lock.slock);
2419        /*
2420         * The sk_lock has mutex_lock() semantics here:
2421         */
2422        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2423        local_bh_enable();
2424        return true;
2425}
2426EXPORT_SYMBOL(lock_sock_fast);
2427
2428int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2429{
2430        struct timeval tv;
2431        if (!sock_flag(sk, SOCK_TIMESTAMP))
2432                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2433        tv = ktime_to_timeval(sk->sk_stamp);
2434        if (tv.tv_sec == -1)
2435                return -ENOENT;
2436        if (tv.tv_sec == 0) {
2437                sk->sk_stamp = ktime_get_real();
2438                tv = ktime_to_timeval(sk->sk_stamp);
2439        }
2440        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2441}
2442EXPORT_SYMBOL(sock_get_timestamp);
2443
2444int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2445{
2446        struct timespec ts;
2447        if (!sock_flag(sk, SOCK_TIMESTAMP))
2448                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2449        ts = ktime_to_timespec(sk->sk_stamp);
2450        if (ts.tv_sec == -1)
2451                return -ENOENT;
2452        if (ts.tv_sec == 0) {
2453                sk->sk_stamp = ktime_get_real();
2454                ts = ktime_to_timespec(sk->sk_stamp);
2455        }
2456        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2457}
2458EXPORT_SYMBOL(sock_get_timestampns);
2459
2460void sock_enable_timestamp(struct sock *sk, int flag)
2461{
2462        if (!sock_flag(sk, flag)) {
2463                unsigned long previous_flags = sk->sk_flags;
2464
2465                sock_set_flag(sk, flag);
2466                /*
2467                 * we just set one of the two flags which require net
2468                 * time stamping, but time stamping might have been on
2469                 * already because of the other one
2470                 */
2471                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2472                        net_enable_timestamp();
2473        }
2474}
2475
2476int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2477                       int level, int type)
2478{
2479        struct sock_exterr_skb *serr;
2480        struct sk_buff *skb, *skb2;
2481        int copied, err;
2482
2483        err = -EAGAIN;
2484        skb = skb_dequeue(&sk->sk_error_queue);
2485        if (skb == NULL)
2486                goto out;
2487
2488        copied = skb->len;
2489        if (copied > len) {
2490                msg->msg_flags |= MSG_TRUNC;
2491                copied = len;
2492        }
2493        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2494        if (err)
2495                goto out_free_skb;
2496
2497        sock_recv_timestamp(msg, sk, skb);
2498
2499        serr = SKB_EXT_ERR(skb);
2500        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2501
2502        msg->msg_flags |= MSG_ERRQUEUE;
2503        err = copied;
2504
2505        /* Reset and regenerate socket error */
2506        spin_lock_bh(&sk->sk_error_queue.lock);
2507        sk->sk_err = 0;
2508        if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2509                sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2510                spin_unlock_bh(&sk->sk_error_queue.lock);
2511                sk->sk_error_report(sk);
2512        } else
2513                spin_unlock_bh(&sk->sk_error_queue.lock);
2514
2515out_free_skb:
2516        kfree_skb(skb);
2517out:
2518        return err;
2519}
2520EXPORT_SYMBOL(sock_recv_errqueue);
2521
2522/*
2523 *      Get a socket option on an socket.
2524 *
2525 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2526 *      asynchronous errors should be reported by getsockopt. We assume
2527 *      this means if you specify SO_ERROR (otherwise whats the point of it).
2528 */
2529int sock_common_getsockopt(struct socket *sock, int level, int optname,
2530                           char __user *optval, int __user *optlen)
2531{
2532        struct sock *sk = sock->sk;
2533
2534        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2535}
2536EXPORT_SYMBOL(sock_common_getsockopt);
2537
2538#ifdef CONFIG_COMPAT
2539int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2540                                  char __user *optval, int __user *optlen)
2541{
2542        struct sock *sk = sock->sk;
2543
2544        if (sk->sk_prot->compat_getsockopt != NULL)
2545                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2546                                                      optval, optlen);
2547        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2548}
2549EXPORT_SYMBOL(compat_sock_common_getsockopt);
2550#endif
2551
2552int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2553                        struct msghdr *msg, size_t size, int flags)
2554{
2555        struct sock *sk = sock->sk;
2556        int addr_len = 0;
2557        int err;
2558
2559        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2560                                   flags & ~MSG_DONTWAIT, &addr_len);
2561        if (err >= 0)
2562                msg->msg_namelen = addr_len;
2563        return err;
2564}
2565EXPORT_SYMBOL(sock_common_recvmsg);
2566
2567/*
2568 *      Set socket options on an inet socket.
2569 */
2570int sock_common_setsockopt(struct socket *sock, int level, int optname,
2571                           char __user *optval, unsigned int optlen)
2572{
2573        struct sock *sk = sock->sk;
2574
2575        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2576}
2577EXPORT_SYMBOL(sock_common_setsockopt);
2578
2579#ifdef CONFIG_COMPAT
2580int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2581                                  char __user *optval, unsigned int optlen)
2582{
2583        struct sock *sk = sock->sk;
2584
2585        if (sk->sk_prot->compat_setsockopt != NULL)
2586                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2587                                                      optval, optlen);
2588        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2589}
2590EXPORT_SYMBOL(compat_sock_common_setsockopt);
2591#endif
2592
2593void sk_common_release(struct sock *sk)
2594{
2595        if (sk->sk_prot->destroy)
2596                sk->sk_prot->destroy(sk);
2597
2598        /*
2599         * Observation: when sock_common_release is called, processes have
2600         * no access to socket. But net still has.
2601         * Step one, detach it from networking:
2602         *
2603         * A. Remove from hash tables.
2604         */
2605
2606        sk->sk_prot->unhash(sk);
2607
2608        /*
2609         * In this point socket cannot receive new packets, but it is possible
2610         * that some packets are in flight because some CPU runs receiver and
2611         * did hash table lookup before we unhashed socket. They will achieve
2612         * receive queue and will be purged by socket destructor.
2613         *
2614         * Also we still have packets pending on receive queue and probably,
2615         * our own packets waiting in device queues. sock_destroy will drain
2616         * receive queue, but transmitted packets will delay socket destruction
2617         * until the last reference will be released.
2618         */
2619
2620        sock_orphan(sk);
2621
2622        xfrm_sk_free_policy(sk);
2623
2624        sk_refcnt_debug_release(sk);
2625
2626        if (sk->sk_frag.page) {
2627                put_page(sk->sk_frag.page);
2628                sk->sk_frag.page = NULL;
2629        }
2630
2631        sock_put(sk);
2632}
2633EXPORT_SYMBOL(sk_common_release);
2634
2635#ifdef CONFIG_PROC_FS
/* Number of slots in each in-use counter array and in the index bitmap. */
#define PROTO_INUSE_NR  64      /* should be enough for the first time */

/* One in-use counter per protocol slot, indexed by proto->inuse_idx. */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of slot indices handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2642
2643#ifdef CONFIG_NET_NS
2644void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2645{
2646        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2647}
2648EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2649
2650int sock_prot_inuse_get(struct net *net, struct proto *prot)
2651{
2652        int cpu, idx = prot->inuse_idx;
2653        int res = 0;
2654
2655        for_each_possible_cpu(cpu)
2656                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2657
2658        return res >= 0 ? res : 0;
2659}
2660EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2661
2662static int __net_init sock_inuse_init_net(struct net *net)
2663{
2664        net->core.inuse = alloc_percpu(struct prot_inuse);
2665        return net->core.inuse ? 0 : -ENOMEM;
2666}
2667
/* Per-namespace exit: free the per-cpu in-use counters of @net. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.inuse);
}
2672
/* Per-namespace hooks that allocate and free the in-use counters. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
2677
2678static __init int net_inuse_init(void)
2679{
2680        if (register_pernet_subsys(&net_inuse_ops))
2681                panic("Cannot initialize net inuse counters");
2682
2683        return 0;
2684}
2685
2686core_initcall(net_inuse_init);
2687#else
2688static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2689
2690void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2691{
2692        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2693}
2694EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2695
2696int sock_prot_inuse_get(struct net *net, struct proto *prot)
2697{
2698        int cpu, idx = prot->inuse_idx;
2699        int res = 0;
2700
2701        for_each_possible_cpu(cpu)
2702                res += per_cpu(prot_inuse, cpu).val[idx];
2703
2704        return res >= 0 ? res : 0;
2705}
2706EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2707#endif
2708
2709static void assign_proto_idx(struct proto *prot)
2710{
2711        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2712
2713        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2714                pr_err("PROTO_INUSE_NR exhausted\n");
2715                return;
2716        }
2717
2718        set_bit(prot->inuse_idx, proto_inuse_idx);
2719}
2720
2721static void release_proto_idx(struct proto *prot)
2722{
2723        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2724                clear_bit(prot->inuse_idx, proto_inuse_idx);
2725}
2726#else
/* No-op stub used when CONFIG_PROC_FS is not set. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2730
/* No-op stub used when CONFIG_PROC_FS is not set. */
static inline void release_proto_idx(struct proto *prot)
{
}
2734#endif
2735
2736int proto_register(struct proto *prot, int alloc_slab)
2737{
2738        if (alloc_slab) {
2739                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2740                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2741                                        NULL);
2742
2743                if (prot->slab == NULL) {
2744                        pr_crit("%s: Can't create sock SLAB cache!\n",
2745                                prot->name);
2746                        goto out;
2747                }
2748
2749                if (prot->rsk_prot != NULL) {
2750                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2751                        if (prot->rsk_prot->slab_name == NULL)
2752                                goto out_free_sock_slab;
2753
2754                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2755                                                                 prot->rsk_prot->obj_size, 0,
2756                                                                 SLAB_HWCACHE_ALIGN, NULL);
2757
2758                        if (prot->rsk_prot->slab == NULL) {
2759                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2760                                        prot->name);
2761                                goto out_free_request_sock_slab_name;
2762                        }
2763                }
2764
2765                if (prot->twsk_prot != NULL) {
2766                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2767
2768                        if (prot->twsk_prot->twsk_slab_name == NULL)
2769                                goto out_free_request_sock_slab;
2770
2771                        prot->twsk_prot->twsk_slab =
2772                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2773                                                  prot->twsk_prot->twsk_obj_size,
2774                                                  0,
2775                                                  SLAB_HWCACHE_ALIGN |
2776                                                        prot->slab_flags,
2777                                                  NULL);
2778                        if (prot->twsk_prot->twsk_slab == NULL)
2779                                goto out_free_timewait_sock_slab_name;
2780                }
2781        }
2782
2783        mutex_lock(&proto_list_mutex);
2784        list_add(&prot->node, &proto_list);
2785        assign_proto_idx(prot);
2786        mutex_unlock(&proto_list_mutex);
2787        return 0;
2788
2789out_free_timewait_sock_slab_name:
2790        kfree(prot->twsk_prot->twsk_slab_name);
2791out_free_request_sock_slab:
2792        if (prot->rsk_prot && prot->rsk_prot->slab) {
2793                kmem_cache_destroy(prot->rsk_prot->slab);
2794                prot->rsk_prot->slab = NULL;
2795        }
2796out_free_request_sock_slab_name:
2797        if (prot->rsk_prot)
2798                kfree(prot->rsk_prot->slab_name);
2799out_free_sock_slab:
2800        kmem_cache_destroy(prot->slab);
2801        prot->slab = NULL;
2802out:
2803        return -ENOBUFS;
2804}
2805EXPORT_SYMBOL(proto_register);
2806
2807void proto_unregister(struct proto *prot)
2808{
2809        mutex_lock(&proto_list_mutex);
2810        release_proto_idx(prot);
2811        list_del(&prot->node);
2812        mutex_unlock(&proto_list_mutex);
2813
2814        if (prot->slab != NULL) {
2815                kmem_cache_destroy(prot->slab);
2816                prot->slab = NULL;
2817        }
2818
2819        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2820                kmem_cache_destroy(prot->rsk_prot->slab);
2821                kfree(prot->rsk_prot->slab_name);
2822                prot->rsk_prot->slab = NULL;
2823        }
2824
2825        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2826                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2827                kfree(prot->twsk_prot->twsk_slab_name);
2828                prot->twsk_prot->twsk_slab = NULL;
2829        }
2830}
2831EXPORT_SYMBOL(proto_unregister);
2832
2833#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *pos in proto_list, with the list head itself as the first element.
 * The mutex is released in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}
2840
/* seq_file ->next: advance from @v to the following entry of proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}
2845
/* seq_file ->stop: release the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}
2851
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
        if (method != NULL)
                return 'y';

        return 'n';
}
2856static long sock_prot_memory_allocated(struct proto *proto)
2857{
2858        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2859}
2860
2861static char *sock_prot_memory_pressure(struct proto *proto)
2862{
2863        return proto->memory_pressure != NULL ?
2864        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2865}
2866
2867static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2868{
2869
2870        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2871                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2872                   proto->name,
2873                   proto->obj_size,
2874                   sock_prot_inuse_get(seq_file_net(seq), proto),
2875                   sock_prot_memory_allocated(proto),
2876                   sock_prot_memory_pressure(proto),
2877                   proto->max_header,
2878                   proto->slab == NULL ? "no" : "yes",
2879                   module_name(proto->owner),
2880                   proto_method_implemented(proto->close),
2881                   proto_method_implemented(proto->connect),
2882                   proto_method_implemented(proto->disconnect),
2883                   proto_method_implemented(proto->accept),
2884                   proto_method_implemented(proto->ioctl),
2885                   proto_method_implemented(proto->init),
2886                   proto_method_implemented(proto->destroy),
2887                   proto_method_implemented(proto->shutdown),
2888                   proto_method_implemented(proto->setsockopt),
2889                   proto_method_implemented(proto->getsockopt),
2890                   proto_method_implemented(proto->sendmsg),
2891                   proto_method_implemented(proto->recvmsg),
2892                   proto_method_implemented(proto->sendpage),
2893                   proto_method_implemented(proto->bind),
2894                   proto_method_implemented(proto->backlog_rcv),
2895                   proto_method_implemented(proto->hash),
2896                   proto_method_implemented(proto->unhash),
2897                   proto_method_implemented(proto->get_port),
2898                   proto_method_implemented(proto->enter_memory_pressure));
2899}
2900
2901static int proto_seq_show(struct seq_file *seq, void *v)
2902{
2903        if (v == &proto_list)
2904                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2905                           "protocol",
2906                           "size",
2907                           "sockets",
2908                           "memory",
2909                           "press",
2910                           "maxhdr",
2911                           "slab",
2912                           "module",
2913                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2914        else
2915                proto_seq_printf(seq, list_entry(v, struct proto, node));
2916        return 0;
2917}
2918
/* Iterator callbacks used to walk proto_list for the "protocols" file. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
2925
/*
 * ->open for the "protocols" file: open a seq_file driven by proto_seq_ops
 * via seq_open_net(), with a struct seq_net_private sized private area.
 */
static int proto_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &proto_seq_ops,
                            sizeof(struct seq_net_private));
}
2931
/* File operations of the "protocols" proc entry. */
static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
2939
2940static __net_init int proto_init_net(struct net *net)
2941{
2942        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2943                return -ENOMEM;
2944
2945        return 0;
2946}
2947
/* Per-namespace exit: remove the "protocols" entry from proc_net. */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}
2952
2953
/* Per-namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
2958
/*
 * Register proto_net_ops as a pernet subsystem at subsys_initcall time and
 * return the result of the registration.
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
2965
2966#endif /* PROC_FS */
2967