linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#include <linux/capability.h>
  93#include <linux/errno.h>
  94#include <linux/types.h>
  95#include <linux/socket.h>
  96#include <linux/in.h>
  97#include <linux/kernel.h>
  98#include <linux/module.h>
  99#include <linux/proc_fs.h>
 100#include <linux/seq_file.h>
 101#include <linux/sched.h>
 102#include <linux/timer.h>
 103#include <linux/string.h>
 104#include <linux/sockios.h>
 105#include <linux/net.h>
 106#include <linux/mm.h>
 107#include <linux/slab.h>
 108#include <linux/interrupt.h>
 109#include <linux/poll.h>
 110#include <linux/tcp.h>
 111#include <linux/init.h>
 112#include <linux/highmem.h>
 113#include <linux/user_namespace.h>
 114#include <linux/jump_label.h>
 115#include <linux/memcontrol.h>
 116
 117#include <asm/uaccess.h>
 118#include <asm/system.h>
 119
 120#include <linux/netdevice.h>
 121#include <net/protocol.h>
 122#include <linux/skbuff.h>
 123#include <net/net_namespace.h>
 124#include <net/request_sock.h>
 125#include <net/sock.h>
 126#include <linux/net_tstamp.h>
 127#include <net/xfrm.h>
 128#include <linux/ipsec.h>
 129#include <net/cls_cgroup.h>
 130#include <net/netprio_cgroup.h>
 131
 132#include <linux/filter.h>
 133
 134#include <trace/events/sock.h>
 135
 136#ifdef CONFIG_INET
 137#include <net/tcp.h>
 138#endif
 139
 140static DEFINE_MUTEX(proto_list_mutex);
 141static LIST_HEAD(proto_list);
 142
 143#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 144int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
 145{
 146        struct proto *proto;
 147        int ret = 0;
 148
 149        mutex_lock(&proto_list_mutex);
 150        list_for_each_entry(proto, &proto_list, node) {
 151                if (proto->init_cgroup) {
 152                        ret = proto->init_cgroup(cgrp, ss);
 153                        if (ret)
 154                                goto out;
 155                }
 156        }
 157
 158        mutex_unlock(&proto_list_mutex);
 159        return ret;
 160out:
 161        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 162                if (proto->destroy_cgroup)
 163                        proto->destroy_cgroup(cgrp, ss);
 164        mutex_unlock(&proto_list_mutex);
 165        return ret;
 166}
 167
 168void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
 169{
 170        struct proto *proto;
 171
 172        mutex_lock(&proto_list_mutex);
 173        list_for_each_entry_reverse(proto, &proto_list, node)
 174                if (proto->destroy_cgroup)
 175                        proto->destroy_cgroup(cgrp, ss);
 176        mutex_unlock(&proto_list_mutex);
 177}
 178#endif
 179
 180/*
 181 * Each address family might have different locking rules, so we have
 182 * one slock key per address family:
 183 */
 184static struct lock_class_key af_family_keys[AF_MAX];
 185static struct lock_class_key af_family_slock_keys[AF_MAX];
 186
 187struct jump_label_key memcg_socket_limit_enabled;
 188EXPORT_SYMBOL(memcg_socket_limit_enabled);
 189
 190/*
  191 * Make lock validator output more readable. (We pre-construct these
  192 * strings at build time so that runtime initialization of socket
  193 * locks is fast):
 194 */
 195static const char *const af_family_key_strings[AF_MAX+1] = {
 196  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 197  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 198  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 199  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 200  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 201  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 202  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 203  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 204  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 205  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  206  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
 207  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 208  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 209  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 210};
 211static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 212  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 213  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 214  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 215  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 216  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 217  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 218  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 219  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 220  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 221  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 222  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 223  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 224  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 225  "slock-AF_NFC"   , "slock-AF_MAX"
 226};
 227static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 228  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 229  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 230  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 231  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 232  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 233  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 234  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 235  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 236  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 237  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 238  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 239  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 240  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 241  "clock-AF_NFC"   , "clock-AF_MAX"
 242};
 243
 244/*
 245 * sk_callback_lock locking rules are per-address-family,
 246 * so split the lock classes by using a per-AF key:
 247 */
 248static struct lock_class_key af_callback_keys[AF_MAX];
 249
 250/* Take into consideration the size of the struct sk_buff overhead in the
 251 * determination of these values, since that is non-constant across
 252 * platforms.  This makes socket queueing behavior and performance
 253 * not depend upon such differences.
 254 */
 255#define _SK_MEM_PACKETS         256
 256#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 257#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 258#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
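/*
 * Worked example (illustrative only; SKB_TRUESIZE() depends on the build's
 * struct sk_buff and skb_shared_info sizes): the limits below budget for
 * 256 packets of 256 bytes of payload each *including* their per-skb
 * overhead, i.e.
 *
 *	SK_WMEM_MAX = SKB_TRUESIZE(256) * 256
 *
 * so the default send/receive limits come out several times larger than
 * the 64 KB that 256 * 256 bytes of raw payload alone would suggest.
 */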
 259
 260/* Run time adjustable parameters. */
 261__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 262__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 263__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 264__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 265
 266/* Maximal space eaten by iovec or ancillary data plus some space */
 267int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 268EXPORT_SYMBOL(sysctl_optmem_max);
 269
 270#if defined(CONFIG_CGROUPS)
 271#if !defined(CONFIG_NET_CLS_CGROUP)
 272int net_cls_subsys_id = -1;
 273EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 274#endif
 275#if !defined(CONFIG_NETPRIO_CGROUP)
 276int net_prio_subsys_id = -1;
 277EXPORT_SYMBOL_GPL(net_prio_subsys_id);
 278#endif
 279#endif
 280
 281static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 282{
 283        struct timeval tv;
 284
 285        if (optlen < sizeof(tv))
 286                return -EINVAL;
 287        if (copy_from_user(&tv, optval, sizeof(tv)))
 288                return -EFAULT;
 289        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 290                return -EDOM;
 291
 292        if (tv.tv_sec < 0) {
 293                static int warned __read_mostly;
 294
 295                *timeo_p = 0;
 296                if (warned < 10 && net_ratelimit()) {
 297                        warned++;
 298                        printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 299                               "tries to set negative timeout\n",
 300                                current->comm, task_pid_nr(current));
 301                }
 302                return 0;
 303        }
 304        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 305        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 306                return 0;
 307        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 308                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 309        return 0;
 310}
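/*
 * Hypothetical userspace sketch (an assumption for illustration, not part
 * of this file): SO_RCVTIMEO/SO_SNDTIMEO hand a struct timeval to the
 * helper above.  A zero timeval means "block forever", and a tv_usec
 * outside [0, USEC_PER_SEC) is rejected with EDOM.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	static int set_rcv_timeout(int fd, long sec, long usec)
 *	{
 *		struct timeval tv = { .tv_sec = sec, .tv_usec = usec };
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
 *				  &tv, sizeof(tv));
 *	}
 */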
 311
 312static void sock_warn_obsolete_bsdism(const char *name)
 313{
 314        static int warned;
 315        static char warncomm[TASK_COMM_LEN];
 316        if (strcmp(warncomm, current->comm) && warned < 5) {
 317                strcpy(warncomm,  current->comm);
 318                printk(KERN_WARNING "process `%s' is using obsolete "
 319                       "%s SO_BSDCOMPAT\n", warncomm, name);
 320                warned++;
 321        }
 322}
 323
 324#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 325
 326static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 327{
 328        if (sk->sk_flags & flags) {
 329                sk->sk_flags &= ~flags;
 330                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 331                        net_disable_timestamp();
 332        }
 333}
 334
 335
 336int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 337{
 338        int err;
 339        int skb_len;
 340        unsigned long flags;
 341        struct sk_buff_head *list = &sk->sk_receive_queue;
 342
 343        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 344                atomic_inc(&sk->sk_drops);
 345                trace_sock_rcvqueue_full(sk, skb);
 346                return -ENOMEM;
 347        }
 348
 349        err = sk_filter(sk, skb);
 350        if (err)
 351                return err;
 352
 353        if (!sk_rmem_schedule(sk, skb->truesize)) {
 354                atomic_inc(&sk->sk_drops);
 355                return -ENOBUFS;
 356        }
 357
 358        skb->dev = NULL;
 359        skb_set_owner_r(skb, sk);
 360
 361        /* Cache the SKB length before we tack it onto the receive
 362         * queue.  Once it is added it no longer belongs to us and
 363         * may be freed by other threads of control pulling packets
 364         * from the queue.
 365         */
 366        skb_len = skb->len;
 367
  368        /* we escape from the RCU-protected region; make sure we don't
  369         * leak a non-refcounted dst
  370         */
 371        skb_dst_force(skb);
 372
 373        spin_lock_irqsave(&list->lock, flags);
 374        skb->dropcount = atomic_read(&sk->sk_drops);
 375        __skb_queue_tail(list, skb);
 376        spin_unlock_irqrestore(&list->lock, flags);
 377
 378        if (!sock_flag(sk, SOCK_DEAD))
 379                sk->sk_data_ready(sk, skb_len);
 380        return 0;
 381}
 382EXPORT_SYMBOL(sock_queue_rcv_skb);
 383
 384int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 385{
 386        int rc = NET_RX_SUCCESS;
 387
 388        if (sk_filter(sk, skb))
 389                goto discard_and_relse;
 390
 391        skb->dev = NULL;
 392
 393        if (sk_rcvqueues_full(sk, skb)) {
 394                atomic_inc(&sk->sk_drops);
 395                goto discard_and_relse;
 396        }
 397        if (nested)
 398                bh_lock_sock_nested(sk);
 399        else
 400                bh_lock_sock(sk);
 401        if (!sock_owned_by_user(sk)) {
 402                /*
 403                 * trylock + unlock semantics:
 404                 */
 405                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 406
 407                rc = sk_backlog_rcv(sk, skb);
 408
 409                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 410        } else if (sk_add_backlog(sk, skb)) {
 411                bh_unlock_sock(sk);
 412                atomic_inc(&sk->sk_drops);
 413                goto discard_and_relse;
 414        }
 415
 416        bh_unlock_sock(sk);
 417out:
 418        sock_put(sk);
 419        return rc;
 420discard_and_relse:
 421        kfree_skb(skb);
 422        goto out;
 423}
 424EXPORT_SYMBOL(sk_receive_skb);
 425
 426void sk_reset_txq(struct sock *sk)
 427{
 428        sk_tx_queue_clear(sk);
 429}
 430EXPORT_SYMBOL(sk_reset_txq);
 431
 432struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 433{
 434        struct dst_entry *dst = __sk_dst_get(sk);
 435
 436        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 437                sk_tx_queue_clear(sk);
 438                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 439                dst_release(dst);
 440                return NULL;
 441        }
 442
 443        return dst;
 444}
 445EXPORT_SYMBOL(__sk_dst_check);
 446
 447struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 448{
 449        struct dst_entry *dst = sk_dst_get(sk);
 450
 451        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 452                sk_dst_reset(sk);
 453                dst_release(dst);
 454                return NULL;
 455        }
 456
 457        return dst;
 458}
 459EXPORT_SYMBOL(sk_dst_check);
 460
 461static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 462{
 463        int ret = -ENOPROTOOPT;
 464#ifdef CONFIG_NETDEVICES
 465        struct net *net = sock_net(sk);
 466        char devname[IFNAMSIZ];
 467        int index;
 468
 469        /* Sorry... */
 470        ret = -EPERM;
 471        if (!capable(CAP_NET_RAW))
 472                goto out;
 473
 474        ret = -EINVAL;
 475        if (optlen < 0)
 476                goto out;
 477
 478        /* Bind this socket to a particular device like "eth0",
 479         * as specified in the passed interface name. If the
 480         * name is "" or the option length is zero the socket
 481         * is not bound.
 482         */
 483        if (optlen > IFNAMSIZ - 1)
 484                optlen = IFNAMSIZ - 1;
 485        memset(devname, 0, sizeof(devname));
 486
 487        ret = -EFAULT;
 488        if (copy_from_user(devname, optval, optlen))
 489                goto out;
 490
 491        index = 0;
 492        if (devname[0] != '\0') {
 493                struct net_device *dev;
 494
 495                rcu_read_lock();
 496                dev = dev_get_by_name_rcu(net, devname);
 497                if (dev)
 498                        index = dev->ifindex;
 499                rcu_read_unlock();
 500                ret = -ENODEV;
 501                if (!dev)
 502                        goto out;
 503        }
 504
 505        lock_sock(sk);
 506        sk->sk_bound_dev_if = index;
 507        sk_dst_reset(sk);
 508        release_sock(sk);
 509
 510        ret = 0;
 511
 512out:
 513#endif
 514
 515        return ret;
 516}
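/*
 * Hypothetical userspace sketch (illustration only, not kernel code):
 * binding a socket to a device such as "eth0" as described above.  An
 * empty name (or a zero option length) removes the binding; CAP_NET_RAW
 * is required.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int bind_to_dev(int fd, const char *ifname)
 *	{
 *		return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *				  ifname, strlen(ifname) + 1);
 *	}
 */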
 517
 518static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 519{
 520        if (valbool)
 521                sock_set_flag(sk, bit);
 522        else
 523                sock_reset_flag(sk, bit);
 524}
 525
 526/*
 527 *      This is meant for all protocols to use and covers goings on
 528 *      at the socket level. Everything here is generic.
 529 */
 530
 531int sock_setsockopt(struct socket *sock, int level, int optname,
 532                    char __user *optval, unsigned int optlen)
 533{
 534        struct sock *sk = sock->sk;
 535        int val;
 536        int valbool;
 537        struct linger ling;
 538        int ret = 0;
 539
 540        /*
  541         *      Options that do not take a plain int argument are handled first
 542         */
 543
 544        if (optname == SO_BINDTODEVICE)
 545                return sock_bindtodevice(sk, optval, optlen);
 546
 547        if (optlen < sizeof(int))
 548                return -EINVAL;
 549
 550        if (get_user(val, (int __user *)optval))
 551                return -EFAULT;
 552
 553        valbool = val ? 1 : 0;
 554
 555        lock_sock(sk);
 556
 557        switch (optname) {
 558        case SO_DEBUG:
 559                if (val && !capable(CAP_NET_ADMIN))
 560                        ret = -EACCES;
 561                else
 562                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 563                break;
 564        case SO_REUSEADDR:
 565                sk->sk_reuse = valbool;
 566                break;
 567        case SO_TYPE:
 568        case SO_PROTOCOL:
 569        case SO_DOMAIN:
 570        case SO_ERROR:
 571                ret = -ENOPROTOOPT;
 572                break;
 573        case SO_DONTROUTE:
 574                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 575                break;
 576        case SO_BROADCAST:
 577                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 578                break;
 579        case SO_SNDBUF:
  580                /* Don't return an error on this; BSD doesn't, and if you
  581                   think about it this is right. Otherwise apps have to
  582                   play 'guess the biggest size' games. RCVBUF/SNDBUF
  583                   are treated in BSD as hints */
 584
 585                if (val > sysctl_wmem_max)
 586                        val = sysctl_wmem_max;
 587set_sndbuf:
 588                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 589                if ((val * 2) < SOCK_MIN_SNDBUF)
 590                        sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 591                else
 592                        sk->sk_sndbuf = val * 2;
 593
 594                /*
 595                 *      Wake up sending tasks if we
 596                 *      upped the value.
 597                 */
 598                sk->sk_write_space(sk);
 599                break;
 600
 601        case SO_SNDBUFFORCE:
 602                if (!capable(CAP_NET_ADMIN)) {
 603                        ret = -EPERM;
 604                        break;
 605                }
 606                goto set_sndbuf;
 607
 608        case SO_RCVBUF:
  609                /* Don't return an error on this; BSD doesn't, and if you
  610                   think about it this is right. Otherwise apps have to
  611                   play 'guess the biggest size' games. RCVBUF/SNDBUF
  612                   are treated in BSD as hints */
 613
 614                if (val > sysctl_rmem_max)
 615                        val = sysctl_rmem_max;
 616set_rcvbuf:
 617                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 618                /*
 619                 * We double it on the way in to account for
 620                 * "struct sk_buff" etc. overhead.   Applications
 621                 * assume that the SO_RCVBUF setting they make will
 622                 * allow that much actual data to be received on that
 623                 * socket.
 624                 *
 625                 * Applications are unaware that "struct sk_buff" and
 626                 * other overheads allocate from the receive buffer
 627                 * during socket buffer allocation.
 628                 *
 629                 * And after considering the possible alternatives,
 630                 * returning the value we actually used in getsockopt
 631                 * is the most desirable behavior.
 632                 */
 633                if ((val * 2) < SOCK_MIN_RCVBUF)
 634                        sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 635                else
 636                        sk->sk_rcvbuf = val * 2;
 637                break;
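        /*
         * Hypothetical userspace sketch (illustration only, not kernel
         * code): the doubling above is visible from user space -- the
         * value read back with getsockopt() is roughly twice the requested
         * one, capped by /proc/sys/net/core/rmem_max.
         *
         *	#include <stdio.h>
         *	#include <sys/socket.h>
         *
         *	int main(void)
         *	{
         *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
         *		int req = 65536, eff;
         *		socklen_t len = sizeof(eff);
         *
         *		setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
         *		getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
         *		printf("requested %d, effective %d\n", req, eff);
         *		return 0;
         *	}
         */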
 638
 639        case SO_RCVBUFFORCE:
 640                if (!capable(CAP_NET_ADMIN)) {
 641                        ret = -EPERM;
 642                        break;
 643                }
 644                goto set_rcvbuf;
 645
 646        case SO_KEEPALIVE:
 647#ifdef CONFIG_INET
 648                if (sk->sk_protocol == IPPROTO_TCP)
 649                        tcp_set_keepalive(sk, valbool);
 650#endif
 651                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 652                break;
 653
 654        case SO_OOBINLINE:
 655                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 656                break;
 657
 658        case SO_NO_CHECK:
 659                sk->sk_no_check = valbool;
 660                break;
 661
 662        case SO_PRIORITY:
 663                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 664                        sk->sk_priority = val;
 665                else
 666                        ret = -EPERM;
 667                break;
 668
 669        case SO_LINGER:
 670                if (optlen < sizeof(ling)) {
 671                        ret = -EINVAL;  /* 1003.1g */
 672                        break;
 673                }
 674                if (copy_from_user(&ling, optval, sizeof(ling))) {
 675                        ret = -EFAULT;
 676                        break;
 677                }
 678                if (!ling.l_onoff)
 679                        sock_reset_flag(sk, SOCK_LINGER);
 680                else {
 681#if (BITS_PER_LONG == 32)
 682                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 683                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 684                        else
 685#endif
 686                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 687                        sock_set_flag(sk, SOCK_LINGER);
 688                }
 689                break;
 690
 691        case SO_BSDCOMPAT:
 692                sock_warn_obsolete_bsdism("setsockopt");
 693                break;
 694
 695        case SO_PASSCRED:
 696                if (valbool)
 697                        set_bit(SOCK_PASSCRED, &sock->flags);
 698                else
 699                        clear_bit(SOCK_PASSCRED, &sock->flags);
 700                break;
 701
 702        case SO_TIMESTAMP:
 703        case SO_TIMESTAMPNS:
 704                if (valbool)  {
 705                        if (optname == SO_TIMESTAMP)
 706                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 707                        else
 708                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 709                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 710                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 711                } else {
 712                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 713                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 714                }
 715                break;
 716
 717        case SO_TIMESTAMPING:
 718                if (val & ~SOF_TIMESTAMPING_MASK) {
 719                        ret = -EINVAL;
 720                        break;
 721                }
 722                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 723                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
 724                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 725                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
 726                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 727                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
 728                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 729                        sock_enable_timestamp(sk,
 730                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 731                else
 732                        sock_disable_timestamp(sk,
 733                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 734                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 735                                  val & SOF_TIMESTAMPING_SOFTWARE);
 736                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 737                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
 738                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 739                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
 740                break;
 741
 742        case SO_RCVLOWAT:
 743                if (val < 0)
 744                        val = INT_MAX;
 745                sk->sk_rcvlowat = val ? : 1;
 746                break;
 747
 748        case SO_RCVTIMEO:
 749                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 750                break;
 751
 752        case SO_SNDTIMEO:
 753                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 754                break;
 755
 756        case SO_ATTACH_FILTER:
 757                ret = -EINVAL;
 758                if (optlen == sizeof(struct sock_fprog)) {
 759                        struct sock_fprog fprog;
 760
 761                        ret = -EFAULT;
 762                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 763                                break;
 764
 765                        ret = sk_attach_filter(&fprog, sk);
 766                }
 767                break;
 768
 769        case SO_DETACH_FILTER:
 770                ret = sk_detach_filter(sk);
 771                break;
 772
 773        case SO_PASSSEC:
 774                if (valbool)
 775                        set_bit(SOCK_PASSSEC, &sock->flags);
 776                else
 777                        clear_bit(SOCK_PASSSEC, &sock->flags);
 778                break;
 779        case SO_MARK:
 780                if (!capable(CAP_NET_ADMIN))
 781                        ret = -EPERM;
 782                else
 783                        sk->sk_mark = val;
 784                break;
 785
  786                /* We implement SO_SNDLOWAT etc. as not settable
  787                   (1003.1g 5.3); such options fall through to the default case below */
 788        case SO_RXQ_OVFL:
 789                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 790                break;
 791
 792        case SO_WIFI_STATUS:
 793                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 794                break;
 795
 796        default:
 797                ret = -ENOPROTOOPT;
 798                break;
 799        }
 800        release_sock(sk);
 801        return ret;
 802}
 803EXPORT_SYMBOL(sock_setsockopt);
 804
 805
 806void cred_to_ucred(struct pid *pid, const struct cred *cred,
 807                   struct ucred *ucred)
 808{
 809        ucred->pid = pid_vnr(pid);
 810        ucred->uid = ucred->gid = -1;
 811        if (cred) {
 812                struct user_namespace *current_ns = current_user_ns();
 813
 814                ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
 815                ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
 816        }
 817}
 818EXPORT_SYMBOL_GPL(cred_to_ucred);
 819
 820int sock_getsockopt(struct socket *sock, int level, int optname,
 821                    char __user *optval, int __user *optlen)
 822{
 823        struct sock *sk = sock->sk;
 824
 825        union {
 826                int val;
 827                struct linger ling;
 828                struct timeval tm;
 829        } v;
 830
 831        int lv = sizeof(int);
 832        int len;
 833
 834        if (get_user(len, optlen))
 835                return -EFAULT;
 836        if (len < 0)
 837                return -EINVAL;
 838
 839        memset(&v, 0, sizeof(v));
 840
 841        switch (optname) {
 842        case SO_DEBUG:
 843                v.val = sock_flag(sk, SOCK_DBG);
 844                break;
 845
 846        case SO_DONTROUTE:
 847                v.val = sock_flag(sk, SOCK_LOCALROUTE);
 848                break;
 849
 850        case SO_BROADCAST:
 851                v.val = !!sock_flag(sk, SOCK_BROADCAST);
 852                break;
 853
 854        case SO_SNDBUF:
 855                v.val = sk->sk_sndbuf;
 856                break;
 857
 858        case SO_RCVBUF:
 859                v.val = sk->sk_rcvbuf;
 860                break;
 861
 862        case SO_REUSEADDR:
 863                v.val = sk->sk_reuse;
 864                break;
 865
 866        case SO_KEEPALIVE:
 867                v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 868                break;
 869
 870        case SO_TYPE:
 871                v.val = sk->sk_type;
 872                break;
 873
 874        case SO_PROTOCOL:
 875                v.val = sk->sk_protocol;
 876                break;
 877
 878        case SO_DOMAIN:
 879                v.val = sk->sk_family;
 880                break;
 881
 882        case SO_ERROR:
 883                v.val = -sock_error(sk);
 884                if (v.val == 0)
 885                        v.val = xchg(&sk->sk_err_soft, 0);
 886                break;
 887
 888        case SO_OOBINLINE:
 889                v.val = !!sock_flag(sk, SOCK_URGINLINE);
 890                break;
 891
 892        case SO_NO_CHECK:
 893                v.val = sk->sk_no_check;
 894                break;
 895
 896        case SO_PRIORITY:
 897                v.val = sk->sk_priority;
 898                break;
 899
 900        case SO_LINGER:
 901                lv              = sizeof(v.ling);
 902                v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 903                v.ling.l_linger = sk->sk_lingertime / HZ;
 904                break;
 905
 906        case SO_BSDCOMPAT:
 907                sock_warn_obsolete_bsdism("getsockopt");
 908                break;
 909
 910        case SO_TIMESTAMP:
 911                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 912                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
 913                break;
 914
 915        case SO_TIMESTAMPNS:
 916                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 917                break;
 918
 919        case SO_TIMESTAMPING:
 920                v.val = 0;
 921                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 922                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 923                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 924                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 925                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 926                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 927                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 928                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 929                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 930                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
 931                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 932                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 933                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 934                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 935                break;
 936
 937        case SO_RCVTIMEO:
 938                lv = sizeof(struct timeval);
 939                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 940                        v.tm.tv_sec = 0;
 941                        v.tm.tv_usec = 0;
 942                } else {
 943                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 944                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 945                }
 946                break;
 947
 948        case SO_SNDTIMEO:
 949                lv = sizeof(struct timeval);
 950                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 951                        v.tm.tv_sec = 0;
 952                        v.tm.tv_usec = 0;
 953                } else {
 954                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 955                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 956                }
 957                break;
 958
 959        case SO_RCVLOWAT:
 960                v.val = sk->sk_rcvlowat;
 961                break;
 962
 963        case SO_SNDLOWAT:
 964                v.val = 1;
 965                break;
 966
 967        case SO_PASSCRED:
 968                v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 969                break;
 970
 971        case SO_PEERCRED:
 972        {
 973                struct ucred peercred;
 974                if (len > sizeof(peercred))
 975                        len = sizeof(peercred);
 976                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
 977                if (copy_to_user(optval, &peercred, len))
 978                        return -EFAULT;
 979                goto lenout;
 980        }
 981
 982        case SO_PEERNAME:
 983        {
 984                char address[128];
 985
 986                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 987                        return -ENOTCONN;
 988                if (lv < len)
 989                        return -EINVAL;
 990                if (copy_to_user(optval, address, len))
 991                        return -EFAULT;
 992                goto lenout;
 993        }
 994
 995        /* Dubious BSD thing... Probably nobody even uses it, but
 996         * the UNIX standard wants it for whatever reason... -DaveM
 997         */
 998        case SO_ACCEPTCONN:
 999                v.val = sk->sk_state == TCP_LISTEN;
1000                break;
1001
1002        case SO_PASSSEC:
1003                v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
1004                break;
1005
1006        case SO_PEERSEC:
1007                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1008
1009        case SO_MARK:
1010                v.val = sk->sk_mark;
1011                break;
1012
1013        case SO_RXQ_OVFL:
1014                v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
1015                break;
1016
1017        case SO_WIFI_STATUS:
1018                v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1019                break;
1020
1021        default:
1022                return -ENOPROTOOPT;
1023        }
1024
1025        if (len > lv)
1026                len = lv;
1027        if (copy_to_user(optval, &v, len))
1028                return -EFAULT;
1029lenout:
1030        if (put_user(len, optlen))
1031                return -EFAULT;
1032        return 0;
1033}
1034
1035/*
1036 * Initialize an sk_lock.
1037 *
1038 * (We also register the sk_lock with the lock validator.)
1039 */
1040static inline void sock_lock_init(struct sock *sk)
1041{
1042        sock_lock_init_class_and_name(sk,
1043                        af_family_slock_key_strings[sk->sk_family],
1044                        af_family_slock_keys + sk->sk_family,
1045                        af_family_key_strings[sk->sk_family],
1046                        af_family_keys + sk->sk_family);
1047}
1048
1049/*
 1050 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
 1051 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
 1052 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1053 */
1054static void sock_copy(struct sock *nsk, const struct sock *osk)
1055{
1056#ifdef CONFIG_SECURITY_NETWORK
1057        void *sptr = nsk->sk_security;
1058#endif
1059        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1060
1061        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1062               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1063
1064#ifdef CONFIG_SECURITY_NETWORK
1065        nsk->sk_security = sptr;
1066        security_sk_clone(osk, nsk);
1067#endif
1068}
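/*
 * The two memcpy()s above skip a protected window in the object.  A
 * hypothetical layout (illustrative only; see struct sock_common in
 * include/net/sock.h for the real markers) makes the idiom concrete:
 *
 *	struct example {
 *		int	a;			first memcpy() copies this
 *		int	dontcopy_begin[0];
 *		atomic_t refcnt;		left alone for RCU lookups
 *		int	dontcopy_end[0];
 *		int	b;			second memcpy() copies this
 *	};
 */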
1069
1070/*
 1071 * Caches using SLAB_DESTROY_BY_RCU must leave the .next pointer of nulls
 1072 * nodes unmodified. Special care is taken when initializing the object to zero.
1073 */
1074static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1075{
1076        if (offsetof(struct sock, sk_node.next) != 0)
1077                memset(sk, 0, offsetof(struct sock, sk_node.next));
1078        memset(&sk->sk_node.pprev, 0,
1079               size - offsetof(struct sock, sk_node.pprev));
1080}
1081
1082void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1083{
1084        unsigned long nulls1, nulls2;
1085
1086        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1087        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1088        if (nulls1 > nulls2)
1089                swap(nulls1, nulls2);
1090
1091        if (nulls1 != 0)
1092                memset((char *)sk, 0, nulls1);
1093        memset((char *)sk + nulls1 + sizeof(void *), 0,
1094               nulls2 - nulls1 - sizeof(void *));
1095        memset((char *)sk + nulls2 + sizeof(void *), 0,
1096               size - nulls2 - sizeof(void *));
1097}
1098EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1099
1100static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1101                int family)
1102{
1103        struct sock *sk;
1104        struct kmem_cache *slab;
1105
1106        slab = prot->slab;
1107        if (slab != NULL) {
1108                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1109                if (!sk)
1110                        return sk;
1111                if (priority & __GFP_ZERO) {
1112                        if (prot->clear_sk)
1113                                prot->clear_sk(sk, prot->obj_size);
1114                        else
1115                                sk_prot_clear_nulls(sk, prot->obj_size);
1116                }
1117        } else
1118                sk = kmalloc(prot->obj_size, priority);
1119
1120        if (sk != NULL) {
1121                kmemcheck_annotate_bitfield(sk, flags);
1122
1123                if (security_sk_alloc(sk, family, priority))
1124                        goto out_free;
1125
1126                if (!try_module_get(prot->owner))
1127                        goto out_free_sec;
1128                sk_tx_queue_clear(sk);
1129        }
1130
1131        return sk;
1132
1133out_free_sec:
1134        security_sk_free(sk);
1135out_free:
1136        if (slab != NULL)
1137                kmem_cache_free(slab, sk);
1138        else
1139                kfree(sk);
1140        return NULL;
1141}
1142
1143static void sk_prot_free(struct proto *prot, struct sock *sk)
1144{
1145        struct kmem_cache *slab;
1146        struct module *owner;
1147
1148        owner = prot->owner;
1149        slab = prot->slab;
1150
1151        security_sk_free(sk);
1152        if (slab != NULL)
1153                kmem_cache_free(slab, sk);
1154        else
1155                kfree(sk);
1156        module_put(owner);
1157}
1158
1159#ifdef CONFIG_CGROUPS
1160void sock_update_classid(struct sock *sk)
1161{
1162        u32 classid;
1163
1164        rcu_read_lock();  /* doing current task, which cannot vanish. */
1165        classid = task_cls_classid(current);
1166        rcu_read_unlock();
1167        if (classid && classid != sk->sk_classid)
1168                sk->sk_classid = classid;
1169}
1170EXPORT_SYMBOL(sock_update_classid);
1171
1172void sock_update_netprioidx(struct sock *sk)
1173{
1174        if (in_interrupt())
1175                return;
1176
1177        sk->sk_cgrp_prioidx = task_netprioidx(current);
1178}
1179EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1180#endif
1181
1182/**
1183 *      sk_alloc - All socket objects are allocated here
1184 *      @net: the applicable net namespace
1185 *      @family: protocol family
1186 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1187 *      @prot: struct proto associated with this new sock instance
1188 */
1189struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1190                      struct proto *prot)
1191{
1192        struct sock *sk;
1193
1194        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1195        if (sk) {
1196                sk->sk_family = family;
1197                /*
1198                 * See comment in struct sock definition to understand
1199                 * why we need sk_prot_creator -acme
1200                 */
1201                sk->sk_prot = sk->sk_prot_creator = prot;
1202                sock_lock_init(sk);
1203                sock_net_set(sk, get_net(net));
1204                atomic_set(&sk->sk_wmem_alloc, 1);
1205
1206                sock_update_classid(sk);
1207                sock_update_netprioidx(sk);
1208        }
1209
1210        return sk;
1211}
1212EXPORT_SYMBOL(sk_alloc);
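/*
 * Minimal sketch of typical use (an illustrative assumption, not code from
 * any real protocol -- example_create and example_proto are made up): a
 * family's ->create() handler allocates the sock with sk_alloc() and then
 * fills in the generic state.
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */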
1213
1214static void __sk_free(struct sock *sk)
1215{
1216        struct sk_filter *filter;
1217
1218        if (sk->sk_destruct)
1219                sk->sk_destruct(sk);
1220
1221        filter = rcu_dereference_check(sk->sk_filter,
1222                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1223        if (filter) {
1224                sk_filter_uncharge(sk, filter);
1225                RCU_INIT_POINTER(sk->sk_filter, NULL);
1226        }
1227
1228        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1229
1230        if (atomic_read(&sk->sk_omem_alloc))
1231                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1232                       __func__, atomic_read(&sk->sk_omem_alloc));
1233
1234        if (sk->sk_peer_cred)
1235                put_cred(sk->sk_peer_cred);
1236        put_pid(sk->sk_peer_pid);
1237        put_net(sock_net(sk));
1238        sk_prot_free(sk->sk_prot_creator, sk);
1239}
1240
1241void sk_free(struct sock *sk)
1242{
1243        /*
 1244         * We subtract one from sk_wmem_alloc to learn whether some
 1245         * packets are still in some tx queue.
 1246         * If the result is not zero, sock_wfree() will call __sk_free(sk) later
1247         */
1248        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1249                __sk_free(sk);
1250}
1251EXPORT_SYMBOL(sk_free);
1252
1253/*
 1254 * The last sock_put should drop a reference to sk->sk_net. It has already
 1255 * been dropped in sk_change_net. Taking a reference to a stopping namespace
 1256 * is not an option.
 1257 * Take a reference to the socket so it can be removed from the hash while it
 1258 * is still _alive_, and after that destroy it in the context of init_net.
1259 */
1260void sk_release_kernel(struct sock *sk)
1261{
1262        if (sk == NULL || sk->sk_socket == NULL)
1263                return;
1264
1265        sock_hold(sk);
1266        sock_release(sk->sk_socket);
1267        release_net(sock_net(sk));
1268        sock_net_set(sk, get_net(&init_net));
1269        sock_put(sk);
1270}
1271EXPORT_SYMBOL(sk_release_kernel);
1272
1273static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1274{
1275        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1276                sock_update_memcg(newsk);
1277}
1278
1279/**
1280 *      sk_clone_lock - clone a socket, and lock its clone
1281 *      @sk: the socket to clone
1282 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1283 *
1284 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1285 */
1286struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1287{
1288        struct sock *newsk;
1289
1290        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1291        if (newsk != NULL) {
1292                struct sk_filter *filter;
1293
1294                sock_copy(newsk, sk);
1295
1296                /* SANITY */
1297                get_net(sock_net(newsk));
1298                sk_node_init(&newsk->sk_node);
1299                sock_lock_init(newsk);
1300                bh_lock_sock(newsk);
1301                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1302                newsk->sk_backlog.len = 0;
1303
1304                atomic_set(&newsk->sk_rmem_alloc, 0);
1305                /*
1306                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1307                 */
1308                atomic_set(&newsk->sk_wmem_alloc, 1);
1309                atomic_set(&newsk->sk_omem_alloc, 0);
1310                skb_queue_head_init(&newsk->sk_receive_queue);
1311                skb_queue_head_init(&newsk->sk_write_queue);
1312#ifdef CONFIG_NET_DMA
1313                skb_queue_head_init(&newsk->sk_async_wait_queue);
1314#endif
1315
1316                spin_lock_init(&newsk->sk_dst_lock);
1317                rwlock_init(&newsk->sk_callback_lock);
1318                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1319                                af_callback_keys + newsk->sk_family,
1320                                af_family_clock_key_strings[newsk->sk_family]);
1321
1322                newsk->sk_dst_cache     = NULL;
1323                newsk->sk_wmem_queued   = 0;
1324                newsk->sk_forward_alloc = 0;
1325                newsk->sk_send_head     = NULL;
1326                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1327
1328                sock_reset_flag(newsk, SOCK_DONE);
1329                skb_queue_head_init(&newsk->sk_error_queue);
1330
1331                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1332                if (filter != NULL)
1333                        sk_filter_charge(newsk, filter);
1334
1335                if (unlikely(xfrm_sk_clone_policy(newsk))) {
 1336                        /* It is still a raw copy of the parent, so invalidate
 1337                         * the destructor and do a plain sk_free() */
1338                        newsk->sk_destruct = NULL;
1339                        bh_unlock_sock(newsk);
1340                        sk_free(newsk);
1341                        newsk = NULL;
1342                        goto out;
1343                }
1344
1345                newsk->sk_err      = 0;
1346                newsk->sk_priority = 0;
1347                /*
1348                 * Before updating sk_refcnt, we must commit prior changes to memory
1349                 * (Documentation/RCU/rculist_nulls.txt for details)
1350                 */
1351                smp_wmb();
1352                atomic_set(&newsk->sk_refcnt, 2);
1353
1354                /*
1355                 * Increment the counter in the same struct proto as the master
1356                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1357                 * is the same as sk->sk_prot->socks, as this field was copied
1358                 * with memcpy).
1359                 *
 1360                 * This _changes_ the previous behaviour, where
 1361                 * tcp_create_openreq_child always incremented the
 1362                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
 1363                 * to be taken into account in all callers. -acme
1364                 */
1365                sk_refcnt_debug_inc(newsk);
1366                sk_set_socket(newsk, NULL);
1367                newsk->sk_wq = NULL;
1368
1369                sk_update_clone(sk, newsk);
1370
1371                if (newsk->sk_prot->sockets_allocated)
1372                        sk_sockets_allocated_inc(newsk);
1373
1374                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1375                        net_enable_timestamp();
1376        }
1377out:
1378        return newsk;
1379}
1380EXPORT_SYMBOL_GPL(sk_clone_lock);
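/*
 * Sketch of the expected calling pattern (illustrative assumption only;
 * example_init_clone() is made up): the clone comes back bh-locked, so the
 * caller must bh_unlock_sock() it on both the success and the error path.
 *
 *	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *
 *	if (newsk != NULL) {
 *		int err = example_init_clone(newsk);
 *
 *		bh_unlock_sock(newsk);		-- unlock even if err is set
 *		if (err)
 *			... error-path teardown of newsk ...
 *	}
 */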
1381
1382void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1383{
1384        __sk_dst_set(sk, dst);
1385        sk->sk_route_caps = dst->dev->features;
1386        if (sk->sk_route_caps & NETIF_F_GSO)
1387                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1388        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1389        if (sk_can_gso(sk)) {
1390                if (dst->header_len) {
1391                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1392                } else {
1393                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1394                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1395                }
1396        }
1397}
1398EXPORT_SYMBOL_GPL(sk_setup_caps);
1399
1400void __init sk_init(void)
1401{
1402        if (totalram_pages <= 4096) {
1403                sysctl_wmem_max = 32767;
1404                sysctl_rmem_max = 32767;
1405                sysctl_wmem_default = 32767;
1406                sysctl_rmem_default = 32767;
1407        } else if (totalram_pages >= 131072) {
1408                sysctl_wmem_max = 131071;
1409                sysctl_rmem_max = 131071;
1410        }
1411}
1412
1413/*
1414 *      Simple resource managers for sockets.
1415 */
1416
1417
1418/*
1419 * Write buffer destructor automatically called from kfree_skb.
1420 */
1421void sock_wfree(struct sk_buff *skb)
1422{
1423        struct sock *sk = skb->sk;
1424        unsigned int len = skb->truesize;
1425
1426        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1427                /*
 1428                 * Keep a reference on sk_wmem_alloc; it will be released
 1429                 * after the sk_write_space() call
1430                 */
1431                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1432                sk->sk_write_space(sk);
1433                len = 1;
1434        }
1435        /*
1436         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1437         * could not do because of in-flight packets
1438         */
1439        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1440                __sk_free(sk);
1441}
1442EXPORT_SYMBOL(sock_wfree);
1443
1444/*
1445 * Read buffer destructor automatically called from kfree_skb.
1446 */
1447void sock_rfree(struct sk_buff *skb)
1448{
1449        struct sock *sk = skb->sk;
1450        unsigned int len = skb->truesize;
1451
1452        atomic_sub(len, &sk->sk_rmem_alloc);
1453        sk_mem_uncharge(sk, len);
1454}
1455EXPORT_SYMBOL(sock_rfree);
1456
1457
1458int sock_i_uid(struct sock *sk)
1459{
1460        int uid;
1461
1462        read_lock_bh(&sk->sk_callback_lock);
1463        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1464        read_unlock_bh(&sk->sk_callback_lock);
1465        return uid;
1466}
1467EXPORT_SYMBOL(sock_i_uid);
1468
1469unsigned long sock_i_ino(struct sock *sk)
1470{
1471        unsigned long ino;
1472
1473        read_lock_bh(&sk->sk_callback_lock);
1474        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1475        read_unlock_bh(&sk->sk_callback_lock);
1476        return ino;
1477}
1478EXPORT_SYMBOL(sock_i_ino);
1479
1480/*
1481 * Allocate a skb from the socket's send buffer.
1482 */
1483struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1484                             gfp_t priority)
1485{
1486        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1487                struct sk_buff *skb = alloc_skb(size, priority);
1488                if (skb) {
1489                        skb_set_owner_w(skb, sk);
1490                        return skb;
1491                }
1492        }
1493        return NULL;
1494}
1495EXPORT_SYMBOL(sock_wmalloc);
1496
1497/*
1498 * Allocate a skb from the socket's receive buffer.
1499 */
1500struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1501                             gfp_t priority)
1502{
1503        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1504                struct sk_buff *skb = alloc_skb(size, priority);
1505                if (skb) {
1506                        skb_set_owner_r(skb, sk);
1507                        return skb;
1508                }
1509        }
1510        return NULL;
1511}
1512
1513/*
1514 * Allocate a memory block from the socket's option memory buffer.
1515 */
1516void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1517{
1518        if ((unsigned)size <= sysctl_optmem_max &&
1519            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1520                void *mem;
1521                /* First do the add, to avoid the race if kmalloc
1522                 * might sleep.
1523                 */
1524                atomic_add(size, &sk->sk_omem_alloc);
1525                mem = kmalloc(size, priority);
1526                if (mem)
1527                        return mem;
1528                atomic_sub(size, &sk->sk_omem_alloc);
1529        }
1530        return NULL;
1531}
1532EXPORT_SYMBOL(sock_kmalloc);
1533
1534/*
1535 * Free an option memory block.
1536 */
1537void sock_kfree_s(struct sock *sk, void *mem, int size)
1538{
1539        kfree(mem);
1540        atomic_sub(size, &sk->sk_omem_alloc);
1541}
1542EXPORT_SYMBOL(sock_kfree_s);
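
/*
 * Illustrative sketch, not part of this file: a hypothetical setsockopt
 * handler that parks an option blob in the socket's option memory with
 * sock_kmalloc() and balances it with sock_kfree_s().  The option layout,
 * the size limit and the handler name are made up for the example.
 */
#if 0
static int example_set_opt_blob(struct sock *sk, char __user *optval,
                                unsigned int optlen)
{
        void *blob;

        if (optlen > 128)               /* arbitrary example limit */
                return -EINVAL;

        blob = sock_kmalloc(sk, optlen, GFP_KERNEL);
        if (!blob)
                return -ENOBUFS;

        if (copy_from_user(blob, optval, optlen)) {
                sock_kfree_s(sk, blob, optlen);
                return -EFAULT;
        }

        /* ... install blob on the socket; a previously installed blob must
         * be released with sock_kfree_s() so sk_omem_alloc stays balanced ...
         */
        sock_kfree_s(sk, blob, optlen); /* example only: release immediately */
        return 0;
}
#endif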
1543
1544/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1545   I think these locks should be removed for datagram sockets.
1546 */
1547static long sock_wait_for_wmem(struct sock *sk, long timeo)
1548{
1549        DEFINE_WAIT(wait);
1550
1551        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1552        for (;;) {
1553                if (!timeo)
1554                        break;
1555                if (signal_pending(current))
1556                        break;
1557                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1558                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1559                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1560                        break;
1561                if (sk->sk_shutdown & SEND_SHUTDOWN)
1562                        break;
1563                if (sk->sk_err)
1564                        break;
1565                timeo = schedule_timeout(timeo);
1566        }
1567        finish_wait(sk_sleep(sk), &wait);
1568        return timeo;
1569}
1570
1571
1572/*
1573 *      Generic send/receive buffer handlers
1574 */
1575
1576struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1577                                     unsigned long data_len, int noblock,
1578                                     int *errcode)
1579{
1580        struct sk_buff *skb;
1581        gfp_t gfp_mask;
1582        long timeo;
1583        int err;
1584
1585        gfp_mask = sk->sk_allocation;
1586        if (gfp_mask & __GFP_WAIT)
1587                gfp_mask |= __GFP_REPEAT;
1588
1589        timeo = sock_sndtimeo(sk, noblock);
1590        while (1) {
1591                err = sock_error(sk);
1592                if (err != 0)
1593                        goto failure;
1594
1595                err = -EPIPE;
1596                if (sk->sk_shutdown & SEND_SHUTDOWN)
1597                        goto failure;
1598
1599                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1600                        skb = alloc_skb(header_len, gfp_mask);
1601                        if (skb) {
1602                                int npages;
1603                                int i;
1604
1605                                /* No pages, we're done... */
1606                                if (!data_len)
1607                                        break;
1608
1609                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1610                                skb->truesize += data_len;
1611                                skb_shinfo(skb)->nr_frags = npages;
1612                                for (i = 0; i < npages; i++) {
1613                                        struct page *page;
1614
1615                                        page = alloc_pages(sk->sk_allocation, 0);
1616                                        if (!page) {
1617                                                err = -ENOBUFS;
1618                                                skb_shinfo(skb)->nr_frags = i;
1619                                                kfree_skb(skb);
1620                                                goto failure;
1621                                        }
1622
1623                                        __skb_fill_page_desc(skb, i,
1624                                                        page, 0,
1625                                                        (data_len >= PAGE_SIZE ?
1626                                                         PAGE_SIZE :
1627                                                         data_len));
1628                                        data_len -= PAGE_SIZE;
1629                                }
1630
1631                                /* Full success... */
1632                                break;
1633                        }
1634                        err = -ENOBUFS;
1635                        goto failure;
1636                }
1637                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1638                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1639                err = -EAGAIN;
1640                if (!timeo)
1641                        goto failure;
1642                if (signal_pending(current))
1643                        goto interrupted;
1644                timeo = sock_wait_for_wmem(sk, timeo);
1645        }
1646
1647        skb_set_owner_w(skb, sk);
1648        return skb;
1649
1650interrupted:
1651        err = sock_intr_errno(timeo);
1652failure:
1653        *errcode = err;
1654        return NULL;
1655}
1656EXPORT_SYMBOL(sock_alloc_send_pskb);
1657
1658struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1659                                    int noblock, int *errcode)
1660{
1661        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1662}
1663EXPORT_SYMBOL(sock_alloc_send_skb);
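
/*
 * Illustrative sketch, not part of this file: the usual calling pattern for
 * sock_alloc_send_skb() in a datagram-style sendmsg().  The 16-byte header
 * reserve and the "transmit" step are placeholders; the allocation, copy and
 * error handling are the point.
 */
#if 0
static int example_dgram_sendmsg(struct sock *sk, struct msghdr *msg,
                                 size_t len)
{
        struct sk_buff *skb;
        int err;

        skb = sock_alloc_send_skb(sk, len + 16,
                                  msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb)
                return err;     /* -EAGAIN, -EPIPE, pending sk_err, ... */

        skb_reserve(skb, 16);
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (err) {
                kfree_skb(skb); /* sock_wfree() uncharges sk_wmem_alloc */
                return err;
        }

        /* ... hand the skb to the transmit path here ... */
        kfree_skb(skb);         /* placeholder for the real transmit step */
        return len;
}
#endif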
1664
1665static void __lock_sock(struct sock *sk)
1666        __releases(&sk->sk_lock.slock)
1667        __acquires(&sk->sk_lock.slock)
1668{
1669        DEFINE_WAIT(wait);
1670
1671        for (;;) {
1672                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1673                                        TASK_UNINTERRUPTIBLE);
1674                spin_unlock_bh(&sk->sk_lock.slock);
1675                schedule();
1676                spin_lock_bh(&sk->sk_lock.slock);
1677                if (!sock_owned_by_user(sk))
1678                        break;
1679        }
1680        finish_wait(&sk->sk_lock.wq, &wait);
1681}
1682
1683static void __release_sock(struct sock *sk)
1684        __releases(&sk->sk_lock.slock)
1685        __acquires(&sk->sk_lock.slock)
1686{
1687        struct sk_buff *skb = sk->sk_backlog.head;
1688
1689        do {
1690                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1691                bh_unlock_sock(sk);
1692
1693                do {
1694                        struct sk_buff *next = skb->next;
1695
1696                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1697                        skb->next = NULL;
1698                        sk_backlog_rcv(sk, skb);
1699
1700                        /*
1701                         * We are in process context here with softirqs
1702                         * disabled; use cond_resched_softirq() to preempt.
1703                         * This is safe to do because we've taken the backlog
1704                         * queue private:
1705                         */
1706                        cond_resched_softirq();
1707
1708                        skb = next;
1709                } while (skb != NULL);
1710
1711                bh_lock_sock(sk);
1712        } while ((skb = sk->sk_backlog.head) != NULL);
1713
1714        /*
1715         * Doing the zeroing here guarantees we cannot loop forever
1716         * while a wild producer attempts to flood us.
1717         */
1718        sk->sk_backlog.len = 0;
1719}
1720
1721/**
1722 * sk_wait_data - wait for data to arrive at sk_receive_queue
1723 * @sk:    sock to wait on
1724 * @timeo: maximum time to wait, in jiffies
1725 *
1726 * Socket state, including sk->sk_err, is changed only under the socket
1727 * lock, hence we may omit checks after joining the wait queue.
1728 * We check the receive queue before schedule() only as an optimization;
1729 * it is very likely that release_sock() added new data.
1730 */
1731int sk_wait_data(struct sock *sk, long *timeo)
1732{
1733        int rc;
1734        DEFINE_WAIT(wait);
1735
1736        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1737        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1738        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1739        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1740        finish_wait(sk_sleep(sk), &wait);
1741        return rc;
1742}
1743EXPORT_SYMBOL(sk_wait_data);
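
/*
 * Illustrative sketch, not part of this file: how a receive path typically
 * drives sk_wait_data().  The caller holds the socket lock; sk_wait_data()
 * drops and retakes it while sleeping.  Error handling is trimmed to the
 * minimum and the helper name is made up.
 */
#if 0
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock,
                                            int *err)
{
        long timeo = sock_rcvtimeo(sk, noblock);
        struct sk_buff *skb;

        while (!(skb = skb_dequeue(&sk->sk_receive_queue))) {
                *err = sock_error(sk);
                if (*err)
                        return NULL;
                *err = -EAGAIN;
                if (!timeo)
                        return NULL;
                *err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        return NULL;
                sk_wait_data(sk, &timeo);
        }
        return skb;
}
#endif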
1744
1745/**
1746 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1747 *      @sk: socket
1748 *      @size: memory size to allocate
1749 *      @kind: allocation type
1750 *
1751 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1752 *      rmem allocation. This function assumes that protocols which have
1753 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1754 */
1755int __sk_mem_schedule(struct sock *sk, int size, int kind)
1756{
1757        struct proto *prot = sk->sk_prot;
1758        int amt = sk_mem_pages(size);
1759        long allocated;
1760        int parent_status = UNDER_LIMIT;
1761
1762        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1763
1764        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1765
1766        /* Under limit. */
1767        if (parent_status == UNDER_LIMIT &&
1768                        allocated <= sk_prot_mem_limits(sk, 0)) {
1769                sk_leave_memory_pressure(sk);
1770                return 1;
1771        }
1772
1773        /* Under pressure. (we or our parents) */
1774        if ((parent_status > SOFT_LIMIT) ||
1775                        allocated > sk_prot_mem_limits(sk, 1))
1776                sk_enter_memory_pressure(sk);
1777
1778        /* Over hard limit (we or our parents) */
1779        if ((parent_status == OVER_LIMIT) ||
1780                        (allocated > sk_prot_mem_limits(sk, 2)))
1781                goto suppress_allocation;
1782
1783        /* guarantee minimum buffer size under pressure */
1784        if (kind == SK_MEM_RECV) {
1785                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1786                        return 1;
1787
1788        } else { /* SK_MEM_SEND */
1789                if (sk->sk_type == SOCK_STREAM) {
1790                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1791                                return 1;
1792                } else if (atomic_read(&sk->sk_wmem_alloc) <
1793                           prot->sysctl_wmem[0])
1794                                return 1;
1795        }
1796
1797        if (sk_has_memory_pressure(sk)) {
1798                int alloc;
1799
1800                if (!sk_under_memory_pressure(sk))
1801                        return 1;
1802                alloc = sk_sockets_allocated_read_positive(sk);
1803                if (sk_prot_mem_limits(sk, 2) > alloc *
1804                    sk_mem_pages(sk->sk_wmem_queued +
1805                                 atomic_read(&sk->sk_rmem_alloc) +
1806                                 sk->sk_forward_alloc))
1807                        return 1;
1808        }
1809
1810suppress_allocation:
1811
1812        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1813                sk_stream_moderate_sndbuf(sk);
1814
1815                /* Fail only if socket is _under_ its sndbuf.
1816                 * In this case we cannot block, so we have to fail.
1817                 */
1818                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1819                        return 1;
1820        }
1821
1822        trace_sock_exceed_buf_limit(sk, prot, allocated);
1823
1824        /* Alas. Undo changes. */
1825        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1826
1827        sk_memory_allocated_sub(sk, amt);
1828
1829        return 0;
1830}
1831EXPORT_SYMBOL(__sk_mem_schedule);
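
/*
 * Illustrative sketch, not part of this file: callers normally reach
 * __sk_mem_schedule() through sk_wmem_schedule()/sk_rmem_schedule() style
 * wrappers that only fall back to it once the per-socket forward_alloc
 * quota is used up.  This is a simplified rendering of that pattern, not a
 * copy of the real inline helpers in include/net/sock.h.
 */
#if 0
static int example_rmem_schedule(struct sock *sk, int size)
{
        if (!sk->sk_prot->memory_allocated)     /* protocol is not accounted */
                return 1;
        return size <= sk->sk_forward_alloc ||
               __sk_mem_schedule(sk, size, SK_MEM_RECV);
}
#endif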
1832
1833/**
1834 *      __sk_mem_reclaim - reclaim memory_allocated
1835 *      @sk: socket
1836 */
1837void __sk_mem_reclaim(struct sock *sk)
1838{
1839        sk_memory_allocated_sub(sk,
1840                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1841        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1842
1843        if (sk_under_memory_pressure(sk) &&
1844            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1845                sk_leave_memory_pressure(sk);
1846}
1847EXPORT_SYMBOL(__sk_mem_reclaim);
1848
1849
1850/*
1851 * Set of default routines for initialising struct proto_ops when
1852 * the protocol does not support a particular function. In certain
1853 * cases where it makes no sense for a protocol to have a "do nothing"
1854 * function, some default processing is provided.
1855 */
1856
1857int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1858{
1859        return -EOPNOTSUPP;
1860}
1861EXPORT_SYMBOL(sock_no_bind);
1862
1863int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1864                    int len, int flags)
1865{
1866        return -EOPNOTSUPP;
1867}
1868EXPORT_SYMBOL(sock_no_connect);
1869
1870int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1871{
1872        return -EOPNOTSUPP;
1873}
1874EXPORT_SYMBOL(sock_no_socketpair);
1875
1876int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1877{
1878        return -EOPNOTSUPP;
1879}
1880EXPORT_SYMBOL(sock_no_accept);
1881
1882int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1883                    int *len, int peer)
1884{
1885        return -EOPNOTSUPP;
1886}
1887EXPORT_SYMBOL(sock_no_getname);
1888
1889unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1890{
1891        return 0;
1892}
1893EXPORT_SYMBOL(sock_no_poll);
1894
1895int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1896{
1897        return -EOPNOTSUPP;
1898}
1899EXPORT_SYMBOL(sock_no_ioctl);
1900
1901int sock_no_listen(struct socket *sock, int backlog)
1902{
1903        return -EOPNOTSUPP;
1904}
1905EXPORT_SYMBOL(sock_no_listen);
1906
1907int sock_no_shutdown(struct socket *sock, int how)
1908{
1909        return -EOPNOTSUPP;
1910}
1911EXPORT_SYMBOL(sock_no_shutdown);
1912
1913int sock_no_setsockopt(struct socket *sock, int level, int optname,
1914                    char __user *optval, unsigned int optlen)
1915{
1916        return -EOPNOTSUPP;
1917}
1918EXPORT_SYMBOL(sock_no_setsockopt);
1919
1920int sock_no_getsockopt(struct socket *sock, int level, int optname,
1921                    char __user *optval, int __user *optlen)
1922{
1923        return -EOPNOTSUPP;
1924}
1925EXPORT_SYMBOL(sock_no_getsockopt);
1926
1927int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1928                    size_t len)
1929{
1930        return -EOPNOTSUPP;
1931}
1932EXPORT_SYMBOL(sock_no_sendmsg);
1933
1934int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1935                    size_t len, int flags)
1936{
1937        return -EOPNOTSUPP;
1938}
1939EXPORT_SYMBOL(sock_no_recvmsg);
1940
1941int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1942{
1943        /* Mirror missing mmap method error code */
1944        return -ENODEV;
1945}
1946EXPORT_SYMBOL(sock_no_mmap);
1947
1948ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1949{
1950        ssize_t res;
1951        struct msghdr msg = {.msg_flags = flags};
1952        struct kvec iov;
1953        char *kaddr = kmap(page);
1954        iov.iov_base = kaddr + offset;
1955        iov.iov_len = size;
1956        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1957        kunmap(page);
1958        return res;
1959}
1960EXPORT_SYMBOL(sock_no_sendpage);
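
/*
 * Illustrative sketch, not part of this file: a hypothetical protocol wiring
 * the sock_no_*() stubs into its proto_ops for the operations it does not
 * support, alongside the sock_common_*() helpers defined later in this file.
 * The example_*() handlers and the family value are placeholders.
 */
#if 0
static const struct proto_ops example_dgram_ops = {
        .family         = PF_PACKET,            /* example value */
        .owner          = THIS_MODULE,
        .release        = example_release,
        .bind           = example_bind,
        .connect        = sock_no_connect,      /* connection-less */
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = example_getname,
        .poll           = datagram_poll,
        .ioctl          = example_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_common_setsockopt,
        .getsockopt     = sock_common_getsockopt,
        .sendmsg        = example_sendmsg,
        .recvmsg        = sock_common_recvmsg,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};
#endif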
1961
1962/*
1963 *      Default Socket Callbacks
1964 */
1965
1966static void sock_def_wakeup(struct sock *sk)
1967{
1968        struct socket_wq *wq;
1969
1970        rcu_read_lock();
1971        wq = rcu_dereference(sk->sk_wq);
1972        if (wq_has_sleeper(wq))
1973                wake_up_interruptible_all(&wq->wait);
1974        rcu_read_unlock();
1975}
1976
1977static void sock_def_error_report(struct sock *sk)
1978{
1979        struct socket_wq *wq;
1980
1981        rcu_read_lock();
1982        wq = rcu_dereference(sk->sk_wq);
1983        if (wq_has_sleeper(wq))
1984                wake_up_interruptible_poll(&wq->wait, POLLERR);
1985        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1986        rcu_read_unlock();
1987}
1988
1989static void sock_def_readable(struct sock *sk, int len)
1990{
1991        struct socket_wq *wq;
1992
1993        rcu_read_lock();
1994        wq = rcu_dereference(sk->sk_wq);
1995        if (wq_has_sleeper(wq))
1996                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1997                                                POLLRDNORM | POLLRDBAND);
1998        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1999        rcu_read_unlock();
2000}
2001
2002static void sock_def_write_space(struct sock *sk)
2003{
2004        struct socket_wq *wq;
2005
2006        rcu_read_lock();
2007
2008        /* Do not wake up a writer until he can make "significant"
2009         * progress.  --DaveM
2010         */
2011        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2012                wq = rcu_dereference(sk->sk_wq);
2013                if (wq_has_sleeper(wq))
2014                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2015                                                POLLWRNORM | POLLWRBAND);
2016
2017                /* Should agree with poll, otherwise some programs break */
2018                if (sock_writeable(sk))
2019                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2020        }
2021
2022        rcu_read_unlock();
2023}
2024
2025static void sock_def_destruct(struct sock *sk)
2026{
2027        kfree(sk->sk_protinfo);
2028}
2029
2030void sk_send_sigurg(struct sock *sk)
2031{
2032        if (sk->sk_socket && sk->sk_socket->file)
2033                if (send_sigurg(&sk->sk_socket->file->f_owner))
2034                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2035}
2036EXPORT_SYMBOL(sk_send_sigurg);
2037
2038void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2039                    unsigned long expires)
2040{
2041        if (!mod_timer(timer, expires))
2042                sock_hold(sk);
2043}
2044EXPORT_SYMBOL(sk_reset_timer);
2045
2046void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2047{
2048        if (timer_pending(timer) && del_timer(timer))
2049                __sock_put(sk);
2050}
2051EXPORT_SYMBOL(sk_stop_timer);
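
/*
 * Illustrative sketch, not part of this file: the usual pairing of
 * sk_reset_timer()/sk_stop_timer().  While the timer is pending it owns one
 * socket reference, dropped either by the handler (sock_put) or by
 * sk_stop_timer().  The period and the handler body are made up.
 */
#if 0
static void example_timer_handler(unsigned long data)
{
        struct sock *sk = (struct sock *)data;

        bh_lock_sock(sk);
        /* ... protocol housekeeping ... */
        bh_unlock_sock(sk);
        sock_put(sk);           /* drop the reference taken by sk_reset_timer() */
}

static void example_arm_timer(struct sock *sk)
{
        sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}

static void example_disarm_timer(struct sock *sk)
{
        sk_stop_timer(sk, &sk->sk_timer);
}
#endif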
2052
2053void sock_init_data(struct socket *sock, struct sock *sk)
2054{
2055        skb_queue_head_init(&sk->sk_receive_queue);
2056        skb_queue_head_init(&sk->sk_write_queue);
2057        skb_queue_head_init(&sk->sk_error_queue);
2058#ifdef CONFIG_NET_DMA
2059        skb_queue_head_init(&sk->sk_async_wait_queue);
2060#endif
2061
2062        sk->sk_send_head        =       NULL;
2063
2064        init_timer(&sk->sk_timer);
2065
2066        sk->sk_allocation       =       GFP_KERNEL;
2067        sk->sk_rcvbuf           =       sysctl_rmem_default;
2068        sk->sk_sndbuf           =       sysctl_wmem_default;
2069        sk->sk_state            =       TCP_CLOSE;
2070        sk_set_socket(sk, sock);
2071
2072        sock_set_flag(sk, SOCK_ZAPPED);
2073
2074        if (sock) {
2075                sk->sk_type     =       sock->type;
2076                sk->sk_wq       =       sock->wq;
2077                sock->sk        =       sk;
2078        } else
2079                sk->sk_wq       =       NULL;
2080
2081        spin_lock_init(&sk->sk_dst_lock);
2082        rwlock_init(&sk->sk_callback_lock);
2083        lockdep_set_class_and_name(&sk->sk_callback_lock,
2084                        af_callback_keys + sk->sk_family,
2085                        af_family_clock_key_strings[sk->sk_family]);
2086
2087        sk->sk_state_change     =       sock_def_wakeup;
2088        sk->sk_data_ready       =       sock_def_readable;
2089        sk->sk_write_space      =       sock_def_write_space;
2090        sk->sk_error_report     =       sock_def_error_report;
2091        sk->sk_destruct         =       sock_def_destruct;
2092
2093        sk->sk_sndmsg_page      =       NULL;
2094        sk->sk_sndmsg_off       =       0;
2095
2096        sk->sk_peer_pid         =       NULL;
2097        sk->sk_peer_cred        =       NULL;
2098        sk->sk_write_pending    =       0;
2099        sk->sk_rcvlowat         =       1;
2100        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2101        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2102
2103        sk->sk_stamp = ktime_set(-1L, 0);
2104
2105        /*
2106         * Before updating sk_refcnt, we must commit prior changes to memory
2107         * (Documentation/RCU/rculist_nulls.txt for details)
2108         */
2109        smp_wmb();
2110        atomic_set(&sk->sk_refcnt, 1);
2111        atomic_set(&sk->sk_drops, 0);
2112}
2113EXPORT_SYMBOL(sock_init_data);
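
/*
 * Illustrative sketch, not part of this file: an address family's create()
 * hook usually allocates the sock with sk_alloc() and then lets
 * sock_init_data() install the defaults above before applying its own
 * overrides.  example_proto and the family value are placeholders.
 */
#if 0
static int example_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        struct sock *sk;

        sk = sk_alloc(net, PF_PACKET /* example family */, GFP_KERNEL,
                      &example_proto);
        if (!sk)
                return -ENOBUFS;

        sock_init_data(sock, sk);
        sk->sk_protocol = protocol;     /* protocol-specific override */
        return 0;
}
#endif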
2114
2115void lock_sock_nested(struct sock *sk, int subclass)
2116{
2117        might_sleep();
2118        spin_lock_bh(&sk->sk_lock.slock);
2119        if (sk->sk_lock.owned)
2120                __lock_sock(sk);
2121        sk->sk_lock.owned = 1;
2122        spin_unlock(&sk->sk_lock.slock);
2123        /*
2124         * The sk_lock has mutex_lock() semantics here:
2125         */
2126        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2127        local_bh_enable();
2128}
2129EXPORT_SYMBOL(lock_sock_nested);
2130
2131void release_sock(struct sock *sk)
2132{
2133        /*
2134         * The sk_lock has mutex_unlock() semantics:
2135         */
2136        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2137
2138        spin_lock_bh(&sk->sk_lock.slock);
2139        if (sk->sk_backlog.tail)
2140                __release_sock(sk);
2141        sk->sk_lock.owned = 0;
2142        if (waitqueue_active(&sk->sk_lock.wq))
2143                wake_up(&sk->sk_lock.wq);
2144        spin_unlock_bh(&sk->sk_lock.slock);
2145}
2146EXPORT_SYMBOL(release_sock);
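
/*
 * Illustrative sketch, not part of this file: the standard process-context
 * pattern around socket state.  lock_sock() is the include/net/sock.h
 * wrapper ending up in lock_sock_nested(sk, 0); anything queued to the
 * backlog while we owned the lock is run by __release_sock() on the way out.
 */
#if 0
static void example_change_state(struct sock *sk)
{
        lock_sock(sk);
        /* ... modify sk->sk_state, queues, etc. under the owner lock ... */
        release_sock(sk);
}
#endif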
2147
2148/**
2149 * lock_sock_fast - fast version of lock_sock
2150 * @sk: socket
2151 *
2152 * This version should be used for very small sections, where the process won't block.
2153 * Returns false if the fast path is taken:
2154 *   sk_lock.slock locked, owned = 0, BH disabled
2155 * Returns true if the slow path is taken:
2156 *   sk_lock.slock unlocked, owned = 1, BH enabled
2157 */
2158bool lock_sock_fast(struct sock *sk)
2159{
2160        might_sleep();
2161        spin_lock_bh(&sk->sk_lock.slock);
2162
2163        if (!sk->sk_lock.owned)
2164                /*
2165                 * Note: the fast path returns with BH disabled and slock held
2166                 */
2167                return false;
2168
2169        __lock_sock(sk);
2170        sk->sk_lock.owned = 1;
2171        spin_unlock(&sk->sk_lock.slock);
2172        /*
2173         * The sk_lock has mutex_lock() semantics here:
2174         */
2175        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2176        local_bh_enable();
2177        return true;
2178}
2179EXPORT_SYMBOL(lock_sock_fast);
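
/*
 * Illustrative sketch, not part of this file: lock_sock_fast() must be paired
 * with unlock_sock_fast() and the returned "slow" flag handed back, so the
 * matching unlock (spin_unlock_bh vs release_sock) is chosen.
 */
#if 0
static void example_short_section(struct sock *sk)
{
        bool slow = lock_sock_fast(sk);

        /* ... a very small amount of work that never sleeps ... */

        unlock_sock_fast(sk, slow);
}
#endif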
2180
2181int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2182{
2183        struct timeval tv;
2184        if (!sock_flag(sk, SOCK_TIMESTAMP))
2185                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2186        tv = ktime_to_timeval(sk->sk_stamp);
2187        if (tv.tv_sec == -1)
2188                return -ENOENT;
2189        if (tv.tv_sec == 0) {
2190                sk->sk_stamp = ktime_get_real();
2191                tv = ktime_to_timeval(sk->sk_stamp);
2192        }
2193        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2194}
2195EXPORT_SYMBOL(sock_get_timestamp);
2196
2197int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2198{
2199        struct timespec ts;
2200        if (!sock_flag(sk, SOCK_TIMESTAMP))
2201                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2202        ts = ktime_to_timespec(sk->sk_stamp);
2203        if (ts.tv_sec == -1)
2204                return -ENOENT;
2205        if (ts.tv_sec == 0) {
2206                sk->sk_stamp = ktime_get_real();
2207                ts = ktime_to_timespec(sk->sk_stamp);
2208        }
2209        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2210}
2211EXPORT_SYMBOL(sock_get_timestampns);
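
/*
 * Illustrative sketch, not part of this file: sock_get_timestamp() and
 * sock_get_timestampns() back the SIOCGSTAMP/SIOCGSTAMPNS ioctls; a protocol
 * ioctl handler typically just dispatches to them.  The handler itself is
 * hypothetical.
 */
#if 0
static int example_ioctl(struct socket *sock, unsigned int cmd,
                         unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCGSTAMP:
                return sock_get_timestamp(sk, (struct timeval __user *)arg);
        case SIOCGSTAMPNS:
                return sock_get_timestampns(sk, (struct timespec __user *)arg);
        default:
                return -ENOIOCTLCMD;
        }
}
#endif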
2212
2213void sock_enable_timestamp(struct sock *sk, int flag)
2214{
2215        if (!sock_flag(sk, flag)) {
2216                unsigned long previous_flags = sk->sk_flags;
2217
2218                sock_set_flag(sk, flag);
2219                /*
2220                 * we just set one of the two flags which require net
2221                 * time stamping, but time stamping might have been on
2222                 * already because of the other one
2223                 */
2224                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2225                        net_enable_timestamp();
2226        }
2227}
2228
2229/*
2230 *      Get a socket option on a socket.
2231 *
2232 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2233 *      asynchronous errors should be reported by getsockopt. We assume
2234 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2235 */
2236int sock_common_getsockopt(struct socket *sock, int level, int optname,
2237                           char __user *optval, int __user *optlen)
2238{
2239        struct sock *sk = sock->sk;
2240
2241        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2242}
2243EXPORT_SYMBOL(sock_common_getsockopt);
2244
2245#ifdef CONFIG_COMPAT
2246int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2247                                  char __user *optval, int __user *optlen)
2248{
2249        struct sock *sk = sock->sk;
2250
2251        if (sk->sk_prot->compat_getsockopt != NULL)
2252                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2253                                                      optval, optlen);
2254        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2255}
2256EXPORT_SYMBOL(compat_sock_common_getsockopt);
2257#endif
2258
2259int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2260                        struct msghdr *msg, size_t size, int flags)
2261{
2262        struct sock *sk = sock->sk;
2263        int addr_len = 0;
2264        int err;
2265
2266        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2267                                   flags & ~MSG_DONTWAIT, &addr_len);
2268        if (err >= 0)
2269                msg->msg_namelen = addr_len;
2270        return err;
2271}
2272EXPORT_SYMBOL(sock_common_recvmsg);
2273
2274/*
2275 *      Set socket options on an inet socket.
2276 */
2277int sock_common_setsockopt(struct socket *sock, int level, int optname,
2278                           char __user *optval, unsigned int optlen)
2279{
2280        struct sock *sk = sock->sk;
2281
2282        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2283}
2284EXPORT_SYMBOL(sock_common_setsockopt);
2285
2286#ifdef CONFIG_COMPAT
2287int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2288                                  char __user *optval, unsigned int optlen)
2289{
2290        struct sock *sk = sock->sk;
2291
2292        if (sk->sk_prot->compat_setsockopt != NULL)
2293                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2294                                                      optval, optlen);
2295        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2296}
2297EXPORT_SYMBOL(compat_sock_common_setsockopt);
2298#endif
2299
2300void sk_common_release(struct sock *sk)
2301{
2302        if (sk->sk_prot->destroy)
2303                sk->sk_prot->destroy(sk);
2304
2305        /*
2306         * Observation: when sk_common_release is called, processes have
2307         * no access to the socket, but the network stack still does.
2308         * Step one, detach it from networking:
2309         *
2310         * A. Remove from hash tables.
2311         */
2312
2313        sk->sk_prot->unhash(sk);
2314
2315        /*
2316         * At this point the socket cannot receive new packets, but it is possible
2317         * that some packets are still in flight because some CPU runs the receiver
2318         * and did the hash table lookup before we unhashed the socket. They will
2319         * reach the receive queue and be purged by the socket destructor.
2320         *
2321         * Also, we still have packets pending on the receive queue and probably
2322         * our own packets waiting in device queues. sock_destroy will drain the
2323         * receive queue, but transmitted packets will delay socket destruction
2324         * until the last reference is released.
2325         */
2326
2327        sock_orphan(sk);
2328
2329        xfrm_sk_free_policy(sk);
2330
2331        sk_refcnt_debug_release(sk);
2332        sock_put(sk);
2333}
2334EXPORT_SYMBOL(sk_common_release);
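
/*
 * Illustrative sketch, not part of this file: datagram-style protocols
 * commonly implement their struct proto .close hook by deferring straight to
 * sk_common_release(), much like the UDP family does.
 */
#if 0
static void example_close(struct sock *sk, long timeout)
{
        sk_common_release(sk);
}
#endif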
2335
2336#ifdef CONFIG_PROC_FS
2337#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2338struct prot_inuse {
2339        int val[PROTO_INUSE_NR];
2340};
2341
2342static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2343
2344#ifdef CONFIG_NET_NS
2345void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2346{
2347        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2348}
2349EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2350
2351int sock_prot_inuse_get(struct net *net, struct proto *prot)
2352{
2353        int cpu, idx = prot->inuse_idx;
2354        int res = 0;
2355
2356        for_each_possible_cpu(cpu)
2357                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2358
2359        return res >= 0 ? res : 0;
2360}
2361EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2362
2363static int __net_init sock_inuse_init_net(struct net *net)
2364{
2365        net->core.inuse = alloc_percpu(struct prot_inuse);
2366        return net->core.inuse ? 0 : -ENOMEM;
2367}
2368
2369static void __net_exit sock_inuse_exit_net(struct net *net)
2370{
2371        free_percpu(net->core.inuse);
2372}
2373
2374static struct pernet_operations net_inuse_ops = {
2375        .init = sock_inuse_init_net,
2376        .exit = sock_inuse_exit_net,
2377};
2378
2379static __init int net_inuse_init(void)
2380{
2381        if (register_pernet_subsys(&net_inuse_ops))
2382                panic("Cannot initialize net inuse counters");
2383
2384        return 0;
2385}
2386
2387core_initcall(net_inuse_init);
2388#else
2389static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2390
2391void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2392{
2393        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2394}
2395EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2396
2397int sock_prot_inuse_get(struct net *net, struct proto *prot)
2398{
2399        int cpu, idx = prot->inuse_idx;
2400        int res = 0;
2401
2402        for_each_possible_cpu(cpu)
2403                res += per_cpu(prot_inuse, cpu).val[idx];
2404
2405        return res >= 0 ? res : 0;
2406}
2407EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2408#endif
2409
2410static void assign_proto_idx(struct proto *prot)
2411{
2412        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2413
2414        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2415                printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2416                return;
2417        }
2418
2419        set_bit(prot->inuse_idx, proto_inuse_idx);
2420}
2421
2422static void release_proto_idx(struct proto *prot)
2423{
2424        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2425                clear_bit(prot->inuse_idx, proto_inuse_idx);
2426}
2427#else
2428static inline void assign_proto_idx(struct proto *prot)
2429{
2430}
2431
2432static inline void release_proto_idx(struct proto *prot)
2433{
2434}
2435#endif
2436
2437int proto_register(struct proto *prot, int alloc_slab)
2438{
2439        if (alloc_slab) {
2440                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2441                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2442                                        NULL);
2443
2444                if (prot->slab == NULL) {
2445                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2446                               prot->name);
2447                        goto out;
2448                }
2449
2450                if (prot->rsk_prot != NULL) {
2451                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2452                        if (prot->rsk_prot->slab_name == NULL)
2453                                goto out_free_sock_slab;
2454
2455                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2456                                                                 prot->rsk_prot->obj_size, 0,
2457                                                                 SLAB_HWCACHE_ALIGN, NULL);
2458
2459                        if (prot->rsk_prot->slab == NULL) {
2460                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2461                                       prot->name);
2462                                goto out_free_request_sock_slab_name;
2463                        }
2464                }
2465
2466                if (prot->twsk_prot != NULL) {
2467                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2468
2469                        if (prot->twsk_prot->twsk_slab_name == NULL)
2470                                goto out_free_request_sock_slab;
2471
2472                        prot->twsk_prot->twsk_slab =
2473                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2474                                                  prot->twsk_prot->twsk_obj_size,
2475                                                  0,
2476                                                  SLAB_HWCACHE_ALIGN |
2477                                                        prot->slab_flags,
2478                                                  NULL);
2479                        if (prot->twsk_prot->twsk_slab == NULL)
2480                                goto out_free_timewait_sock_slab_name;
2481                }
2482        }
2483
2484        mutex_lock(&proto_list_mutex);
2485        list_add(&prot->node, &proto_list);
2486        assign_proto_idx(prot);
2487        mutex_unlock(&proto_list_mutex);
2488        return 0;
2489
2490out_free_timewait_sock_slab_name:
2491        kfree(prot->twsk_prot->twsk_slab_name);
2492out_free_request_sock_slab:
2493        if (prot->rsk_prot && prot->rsk_prot->slab) {
2494                kmem_cache_destroy(prot->rsk_prot->slab);
2495                prot->rsk_prot->slab = NULL;
2496        }
2497out_free_request_sock_slab_name:
2498        if (prot->rsk_prot)
2499                kfree(prot->rsk_prot->slab_name);
2500out_free_sock_slab:
2501        kmem_cache_destroy(prot->slab);
2502        prot->slab = NULL;
2503out:
2504        return -ENOBUFS;
2505}
2506EXPORT_SYMBOL(proto_register);
2507
2508void proto_unregister(struct proto *prot)
2509{
2510        mutex_lock(&proto_list_mutex);
2511        release_proto_idx(prot);
2512        list_del(&prot->node);
2513        mutex_unlock(&proto_list_mutex);
2514
2515        if (prot->slab != NULL) {
2516                kmem_cache_destroy(prot->slab);
2517                prot->slab = NULL;
2518        }
2519
2520        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2521                kmem_cache_destroy(prot->rsk_prot->slab);
2522                kfree(prot->rsk_prot->slab_name);
2523                prot->rsk_prot->slab = NULL;
2524        }
2525
2526        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2527                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2528                kfree(prot->twsk_prot->twsk_slab_name);
2529                prot->twsk_prot->twsk_slab = NULL;
2530        }
2531}
2532EXPORT_SYMBOL(proto_unregister);
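
/*
 * Illustrative sketch, not part of this file: the usual module-level pairing
 * of proto_register()/proto_unregister().  example_proto, its obj_size and
 * the "register the rest of the family" steps are placeholders.
 */
#if 0
static struct proto example_proto = {
        .name     = "EXAMPLE",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct sock),        /* normally a larger protocol sock */
};

static int __init example_init(void)
{
        int err = proto_register(&example_proto, 1);

        if (err)
                return err;
        /* ... register the address family / protosw here ... */
        return 0;
}

static void __exit example_exit(void)
{
        /* ... unregister the address family first ... */
        proto_unregister(&example_proto);
}

module_init(example_init);
module_exit(example_exit);
#endif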
2533
2534#ifdef CONFIG_PROC_FS
2535static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2536        __acquires(proto_list_mutex)
2537{
2538        mutex_lock(&proto_list_mutex);
2539        return seq_list_start_head(&proto_list, *pos);
2540}
2541
2542static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2543{
2544        return seq_list_next(v, &proto_list, pos);
2545}
2546
2547static void proto_seq_stop(struct seq_file *seq, void *v)
2548        __releases(proto_list_mutex)
2549{
2550        mutex_unlock(&proto_list_mutex);
2551}
2552
2553static char proto_method_implemented(const void *method)
2554{
2555        return method == NULL ? 'n' : 'y';
2556}
2557static long sock_prot_memory_allocated(struct proto *proto)
2558{
2559        return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
2560}
2561
2562static char *sock_prot_memory_pressure(struct proto *proto)
2563{
2564        return proto->memory_pressure != NULL ?
2565        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2566}
2567
2568static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2569{
2570
2571        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2572                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2573                   proto->name,
2574                   proto->obj_size,
2575                   sock_prot_inuse_get(seq_file_net(seq), proto),
2576                   sock_prot_memory_allocated(proto),
2577                   sock_prot_memory_pressure(proto),
2578                   proto->max_header,
2579                   proto->slab == NULL ? "no" : "yes",
2580                   module_name(proto->owner),
2581                   proto_method_implemented(proto->close),
2582                   proto_method_implemented(proto->connect),
2583                   proto_method_implemented(proto->disconnect),
2584                   proto_method_implemented(proto->accept),
2585                   proto_method_implemented(proto->ioctl),
2586                   proto_method_implemented(proto->init),
2587                   proto_method_implemented(proto->destroy),
2588                   proto_method_implemented(proto->shutdown),
2589                   proto_method_implemented(proto->setsockopt),
2590                   proto_method_implemented(proto->getsockopt),
2591                   proto_method_implemented(proto->sendmsg),
2592                   proto_method_implemented(proto->recvmsg),
2593                   proto_method_implemented(proto->sendpage),
2594                   proto_method_implemented(proto->bind),
2595                   proto_method_implemented(proto->backlog_rcv),
2596                   proto_method_implemented(proto->hash),
2597                   proto_method_implemented(proto->unhash),
2598                   proto_method_implemented(proto->get_port),
2599                   proto_method_implemented(proto->enter_memory_pressure));
2600}
2601
2602static int proto_seq_show(struct seq_file *seq, void *v)
2603{
2604        if (v == &proto_list)
2605                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2606                           "protocol",
2607                           "size",
2608                           "sockets",
2609                           "memory",
2610                           "press",
2611                           "maxhdr",
2612                           "slab",
2613                           "module",
2614                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2615        else
2616                proto_seq_printf(seq, list_entry(v, struct proto, node));
2617        return 0;
2618}
2619
2620static const struct seq_operations proto_seq_ops = {
2621        .start  = proto_seq_start,
2622        .next   = proto_seq_next,
2623        .stop   = proto_seq_stop,
2624        .show   = proto_seq_show,
2625};
2626
2627static int proto_seq_open(struct inode *inode, struct file *file)
2628{
2629        return seq_open_net(inode, file, &proto_seq_ops,
2630                            sizeof(struct seq_net_private));
2631}
2632
2633static const struct file_operations proto_seq_fops = {
2634        .owner          = THIS_MODULE,
2635        .open           = proto_seq_open,
2636        .read           = seq_read,
2637        .llseek         = seq_lseek,
2638        .release        = seq_release_net,
2639};
2640
2641static __net_init int proto_init_net(struct net *net)
2642{
2643        if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2644                return -ENOMEM;
2645
2646        return 0;
2647}
2648
2649static __net_exit void proto_exit_net(struct net *net)
2650{
2651        proc_net_remove(net, "protocols");
2652}
2653
2654
2655static __net_initdata struct pernet_operations proto_net_ops = {
2656        .init = proto_init_net,
2657        .exit = proto_exit_net,
2658};
2659
2660static int __init proto_init(void)
2661{
2662        return register_pernet_subsys(&proto_net_ops);
2663}
2664
2665subsys_initcall(proto_init);
2666
2667#endif /* PROC_FS */
2668