linux/net/unix/af_unix.c
<<
>>
Prefs
   1/*
   2 * NET4:        Implementation of BSD Unix domain sockets.
   3 *
   4 * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5 *
   6 *              This program is free software; you can redistribute it and/or
   7 *              modify it under the terms of the GNU General Public License
   8 *              as published by the Free Software Foundation; either version
   9 *              2 of the License, or (at your option) any later version.
  10 *
  11 * Fixes:
  12 *              Linus Torvalds  :       Assorted bug cures.
  13 *              Niibe Yutaka    :       async I/O support.
  14 *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15 *              Alan Cox        :       Limit size of allocated blocks.
  16 *              Alan Cox        :       Fixed the stupid socketpair bug.
  17 *              Alan Cox        :       BSD compatibility fine tuning.
  18 *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19 *              Alan Cox        :       Sorted out a proper draft version of
  20 *                                      file descriptor passing hacked up from
  21 *                                      Mike Shaver's work.
  22 *              Marty Leisner   :       Fixes to fd passing
  23 *              Nick Nevin      :       recvmsg bugfix.
  24 *              Alan Cox        :       Started proper garbage collector
  25 *              Heiko EiBfeldt  :       Missing verify_area check
  26 *              Alan Cox        :       Started POSIXisms
  27 *              Andreas Schwab  :       Replace inode by dentry for proper
  28 *                                      reference counting
  29 *              Kirk Petersen   :       Made this a module
  30 *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31 *                                      Lots of bug fixes.
   32 *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
   33 *                                      by above two patches.
   34 *           Andrea Arcangeli   :       If possible we block in connect(2)
   35 *                                      if the max backlog of the listen socket
   36 *                                      has been reached. This won't break
   37 *                                      old apps and it will avoid a huge number
   38 *                                      of hashed socks (this is for unix_gc()
   39 *                                      performance reasons).
  40 *                                      Security fix that limits the max
  41 *                                      number of socks to 2*max_files and
  42 *                                      the number of skb queueable in the
  43 *                                      dgram receiver.
  44 *              Artur Skawina   :       Hash function optimizations
  45 *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46 *            Malcolm Beattie   :       Set peercred for socketpair
  47 *           Michal Ostrowski   :       Module initialization cleanup.
  48 *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49 *                                      the core infrastructure is doing that
  50 *                                      for all net proto families now (2.5.69+)
  51 *
  52 *
  53 * Known differences from reference BSD that was tested:
  54 *
  55 *      [TO FIX]
  56 *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57 *              other the moment one end closes.
  58 *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59 *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60 *      [NOT TO FIX]
  61 *      accept() returns a path name even if the connecting socket has closed
  62 *              in the meantime (BSD loses the path and gives up).
  63 *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64 *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65 *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66 *      BSD af_unix apparently has connect forgetting to block properly.
  67 *              (need to check this with the POSIX spec in detail)
  68 *
  69 * Differences from 2.0.0-11-... (ANK)
  70 *      Bug fixes and improvements.
  71 *              - client shutdown killed server socket.
  72 *              - removed all useless cli/sti pairs.
  73 *
  74 *      Semantic changes/extensions.
  75 *              - generic control message passing.
  76 *              - SCM_CREDENTIALS control message.
  77 *              - "Abstract" (not FS based) socket bindings.
  78 *                Abstract names are sequences of bytes (not zero terminated)
  79 *                started by 0, so that this name space does not intersect
  80 *                with BSD names.
  81 */
  82
  83#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  84
  85#include <linux/module.h>
  86#include <linux/kernel.h>
  87#include <linux/signal.h>
  88#include <linux/sched.h>
  89#include <linux/errno.h>
  90#include <linux/string.h>
  91#include <linux/stat.h>
  92#include <linux/dcache.h>
  93#include <linux/namei.h>
  94#include <linux/socket.h>
  95#include <linux/un.h>
  96#include <linux/fcntl.h>
  97#include <linux/termios.h>
  98#include <linux/sockios.h>
  99#include <linux/net.h>
 100#include <linux/in.h>
 101#include <linux/fs.h>
 102#include <linux/slab.h>
 103#include <asm/uaccess.h>
 104#include <linux/skbuff.h>
 105#include <linux/netdevice.h>
 106#include <net/net_namespace.h>
 107#include <net/sock.h>
 108#include <net/tcp_states.h>
 109#include <net/af_unix.h>
 110#include <linux/proc_fs.h>
 111#include <linux/seq_file.h>
 112#include <net/scm.h>
 113#include <linux/init.h>
 114#include <linux/poll.h>
 115#include <linux/rtnetlink.h>
 116#include <linux/mount.h>
 117#include <net/checksum.h>
 118#include <linux/security.h>
 119#include <linux/freezer.h>
 120
 121struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 122EXPORT_SYMBOL_GPL(unix_socket_table);
 123DEFINE_SPINLOCK(unix_table_lock);
 124EXPORT_SYMBOL_GPL(unix_table_lock);
 125static atomic_long_t unix_nr_socks;
 126
 127
 128static struct hlist_head *unix_sockets_unbound(void *addr)
 129{
 130        unsigned long hash = (unsigned long)addr;
 131
 132        hash ^= hash >> 16;
 133        hash ^= hash >> 8;
 134        hash %= UNIX_HASH_SIZE;
 135        return &unix_socket_table[UNIX_HASH_SIZE + hash];
 136}
 137
 138#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 139
 140#ifdef CONFIG_SECURITY_NETWORK
 141static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 142{
 143        UNIXCB(skb).secid = scm->secid;
 144}
 145
 146static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 147{
 148        scm->secid = UNIXCB(skb).secid;
 149}
 150
 151static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 152{
 153        return (scm->secid == UNIXCB(skb).secid);
 154}
 155#else
 156static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 157{ }
 158
 159static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 160{ }
 161
 162static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 163{
 164        return true;
 165}
 166#endif /* CONFIG_SECURITY_NETWORK */
 167
 168/*
 169 *  SMP locking strategy:
 170 *    hash table is protected with spinlock unix_table_lock
 171 *    each socket state is protected by separate spin lock.
 172 */
 173
 174static inline unsigned int unix_hash_fold(__wsum n)
 175{
 176        unsigned int hash = (__force unsigned int)csum_fold(n);
 177
 178        hash ^= hash>>8;
 179        return hash&(UNIX_HASH_SIZE-1);
 180}
 181
 182#define unix_peer(sk) (unix_sk(sk)->peer)
 183
 184static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 185{
 186        return unix_peer(osk) == sk;
 187}
 188
 189static inline int unix_may_send(struct sock *sk, struct sock *osk)
 190{
 191        return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 192}
 193
 194static inline int unix_recvq_full(struct sock const *sk)
 195{
 196        return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 197}
 198
 199struct sock *unix_peer_get(struct sock *s)
 200{
 201        struct sock *peer;
 202
 203        unix_state_lock(s);
 204        peer = unix_peer(s);
 205        if (peer)
 206                sock_hold(peer);
 207        unix_state_unlock(s);
 208        return peer;
 209}
 210EXPORT_SYMBOL_GPL(unix_peer_get);
 211
 212static inline void unix_release_addr(struct unix_address *addr)
 213{
 214        if (atomic_dec_and_test(&addr->refcnt))
 215                kfree(addr);
 216}
 217
 218/*
 219 *      Check unix socket name:
 220 *              - should be not zero length.
 221 *              - if started by not zero, should be NULL terminated (FS object)
 222 *              - if started by zero, it is abstract name.
 223 */
 224
 225static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 226{
 227        if (len <= sizeof(short) || len > sizeof(*sunaddr))
 228                return -EINVAL;
 229        if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 230                return -EINVAL;
 231        if (sunaddr->sun_path[0]) {
 232                /*
 233                 * This may look like an off by one error but it is a bit more
 234                 * subtle. 108 is the longest valid AF_UNIX path for a binding.
 235                 * sun_path[108] doesn't as such exist.  However in kernel space
 236                 * we are guaranteed that it is a valid memory location in our
 237                 * kernel address buffer.
 238                 */
 239                ((char *)sunaddr)[len] = 0;
 240                len = strlen(sunaddr->sun_path)+1+sizeof(short);
 241                return len;
 242        }
 243
 244        *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 245        return len;
 246}
 247
 248static void __unix_remove_socket(struct sock *sk)
 249{
 250        sk_del_node_init(sk);
 251}
 252
 253static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 254{
 255        WARN_ON(!sk_unhashed(sk));
 256        sk_add_node(sk, list);
 257}
 258
 259static inline void unix_remove_socket(struct sock *sk)
 260{
 261        spin_lock(&unix_table_lock);
 262        __unix_remove_socket(sk);
 263        spin_unlock(&unix_table_lock);
 264}
 265
 266static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 267{
 268        spin_lock(&unix_table_lock);
 269        __unix_insert_socket(list, sk);
 270        spin_unlock(&unix_table_lock);
 271}
 272
 273static struct sock *__unix_find_socket_byname(struct net *net,
 274                                              struct sockaddr_un *sunname,
 275                                              int len, int type, unsigned int hash)
 276{
 277        struct sock *s;
 278
 279        sk_for_each(s, &unix_socket_table[hash ^ type]) {
 280                struct unix_sock *u = unix_sk(s);
 281
 282                if (!net_eq(sock_net(s), net))
 283                        continue;
 284
 285                if (u->addr->len == len &&
 286                    !memcmp(u->addr->name, sunname, len))
 287                        goto found;
 288        }
 289        s = NULL;
 290found:
 291        return s;
 292}
 293
 294static inline struct sock *unix_find_socket_byname(struct net *net,
 295                                                   struct sockaddr_un *sunname,
 296                                                   int len, int type,
 297                                                   unsigned int hash)
 298{
 299        struct sock *s;
 300
 301        spin_lock(&unix_table_lock);
 302        s = __unix_find_socket_byname(net, sunname, len, type, hash);
 303        if (s)
 304                sock_hold(s);
 305        spin_unlock(&unix_table_lock);
 306        return s;
 307}
 308
 309static struct sock *unix_find_socket_byinode(struct inode *i)
 310{
 311        struct sock *s;
 312
 313        spin_lock(&unix_table_lock);
 314        sk_for_each(s,
 315                    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 316                struct dentry *dentry = unix_sk(s)->path.dentry;
 317
 318                if (dentry && d_backing_inode(dentry) == i) {
 319                        sock_hold(s);
 320                        goto found;
 321                }
 322        }
 323        s = NULL;
 324found:
 325        spin_unlock(&unix_table_lock);
 326        return s;
 327}
 328
 329static inline int unix_writable(struct sock *sk)
 330{
 331        return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 332}
 333
 334static void unix_write_space(struct sock *sk)
 335{
 336        struct socket_wq *wq;
 337
 338        rcu_read_lock();
 339        if (unix_writable(sk)) {
 340                wq = rcu_dereference(sk->sk_wq);
 341                if (wq_has_sleeper(wq))
 342                        wake_up_interruptible_sync_poll(&wq->wait,
 343                                POLLOUT | POLLWRNORM | POLLWRBAND);
 344                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 345        }
 346        rcu_read_unlock();
 347}
 348
 349/* When dgram socket disconnects (or changes its peer), we clear its receive
 350 * queue of packets arrived from previous peer. First, it allows to do
 351 * flow control based only on wmem_alloc; second, sk connected to peer
 352 * may receive messages only from that peer. */
 353static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 354{
 355        if (!skb_queue_empty(&sk->sk_receive_queue)) {
 356                skb_queue_purge(&sk->sk_receive_queue);
 357                wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 358
 359                /* If one link of bidirectional dgram pipe is disconnected,
 360                 * we signal error. Messages are lost. Do not make this,
 361                 * when peer was not connected to us.
 362                 */
 363                if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 364                        other->sk_err = ECONNRESET;
 365                        other->sk_error_report(other);
 366                }
 367        }
 368}
 369
 370static void unix_sock_destructor(struct sock *sk)
 371{
 372        struct unix_sock *u = unix_sk(sk);
 373
 374        skb_queue_purge(&sk->sk_receive_queue);
 375
 376        WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 377        WARN_ON(!sk_unhashed(sk));
 378        WARN_ON(sk->sk_socket);
 379        if (!sock_flag(sk, SOCK_DEAD)) {
 380                pr_info("Attempt to release alive unix socket: %p\n", sk);
 381                return;
 382        }
 383
 384        if (u->addr)
 385                unix_release_addr(u->addr);
 386
 387        atomic_long_dec(&unix_nr_socks);
 388        local_bh_disable();
 389        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 390        local_bh_enable();
 391#ifdef UNIX_REFCNT_DEBUG
 392        pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 393                atomic_long_read(&unix_nr_socks));
 394#endif
 395}
 396
 397static void unix_release_sock(struct sock *sk, int embrion)
 398{
 399        struct unix_sock *u = unix_sk(sk);
 400        struct path path;
 401        struct sock *skpair;
 402        struct sk_buff *skb;
 403        int state;
 404
 405        unix_remove_socket(sk);
 406
 407        /* Clear state */
 408        unix_state_lock(sk);
 409        sock_orphan(sk);
 410        sk->sk_shutdown = SHUTDOWN_MASK;
 411        path         = u->path;
 412        u->path.dentry = NULL;
 413        u->path.mnt = NULL;
 414        state = sk->sk_state;
 415        sk->sk_state = TCP_CLOSE;
 416        unix_state_unlock(sk);
 417
 418        wake_up_interruptible_all(&u->peer_wait);
 419
 420        skpair = unix_peer(sk);
 421
 422        if (skpair != NULL) {
 423                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 424                        unix_state_lock(skpair);
 425                        /* No more writes */
 426                        skpair->sk_shutdown = SHUTDOWN_MASK;
 427                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 428                                skpair->sk_err = ECONNRESET;
 429                        unix_state_unlock(skpair);
 430                        skpair->sk_state_change(skpair);
 431                        sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 432                }
 433                sock_put(skpair); /* It may now die */
 434                unix_peer(sk) = NULL;
 435        }
 436
 437        /* Try to flush out this socket. Throw out buffers at least */
 438
 439        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 440                if (state == TCP_LISTEN)
 441                        unix_release_sock(skb->sk, 1);
 442                /* passed fds are erased in the kfree_skb hook        */
 443                kfree_skb(skb);
 444        }
 445
 446        if (path.dentry)
 447                path_put(&path);
 448
 449        sock_put(sk);
 450
 451        /* ---- Socket is dead now and most probably destroyed ---- */
 452
 453        /*
 454         * Fixme: BSD difference: In BSD all sockets connected to us get
 455         *        ECONNRESET and we die on the spot. In Linux we behave
 456         *        like files and pipes do and wait for the last
 457         *        dereference.
 458         *
 459         * Can't we simply set sock->err?
 460         *
 461         *        What the above comment does talk about? --ANK(980817)
 462         */
 463
 464        if (unix_tot_inflight)
 465                unix_gc();              /* Garbage collect fds */
 466}
 467
 468static void init_peercred(struct sock *sk)
 469{
 470        put_pid(sk->sk_peer_pid);
 471        if (sk->sk_peer_cred)
 472                put_cred(sk->sk_peer_cred);
 473        sk->sk_peer_pid  = get_pid(task_tgid(current));
 474        sk->sk_peer_cred = get_current_cred();
 475}
 476
 477static void copy_peercred(struct sock *sk, struct sock *peersk)
 478{
 479        put_pid(sk->sk_peer_pid);
 480        if (sk->sk_peer_cred)
 481                put_cred(sk->sk_peer_cred);
 482        sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 483        sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 484}
 485
 486static int unix_listen(struct socket *sock, int backlog)
 487{
 488        int err;
 489        struct sock *sk = sock->sk;
 490        struct unix_sock *u = unix_sk(sk);
 491        struct pid *old_pid = NULL;
 492
 493        err = -EOPNOTSUPP;
 494        if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 495                goto out;       /* Only stream/seqpacket sockets accept */
 496        err = -EINVAL;
 497        if (!u->addr)
 498                goto out;       /* No listens on an unbound socket */
 499        unix_state_lock(sk);
 500        if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 501                goto out_unlock;
 502        if (backlog > sk->sk_max_ack_backlog)
 503                wake_up_interruptible_all(&u->peer_wait);
 504        sk->sk_max_ack_backlog  = backlog;
 505        sk->sk_state            = TCP_LISTEN;
 506        /* set credentials so connect can copy them */
 507        init_peercred(sk);
 508        err = 0;
 509
 510out_unlock:
 511        unix_state_unlock(sk);
 512        put_pid(old_pid);
 513out:
 514        return err;
 515}
 516
 517static int unix_release(struct socket *);
 518static int unix_bind(struct socket *, struct sockaddr *, int);
 519static int unix_stream_connect(struct socket *, struct sockaddr *,
 520                               int addr_len, int flags);
 521static int unix_socketpair(struct socket *, struct socket *);
 522static int unix_accept(struct socket *, struct socket *, int);
 523static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 524static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 525static unsigned int unix_dgram_poll(struct file *, struct socket *,
 526                                    poll_table *);
 527static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 528static int unix_shutdown(struct socket *, int);
 529static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 530static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 531static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 532                                    size_t size, int flags);
 533static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 534                                       struct pipe_inode_info *, size_t size,
 535                                       unsigned int flags);
 536static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 537static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 538static int unix_dgram_connect(struct socket *, struct sockaddr *,
 539                              int, int);
 540static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 541static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 542                                  int);
 543
 544static int unix_set_peek_off(struct sock *sk, int val)
 545{
 546        struct unix_sock *u = unix_sk(sk);
 547
 548        if (mutex_lock_interruptible(&u->readlock))
 549                return -EINTR;
 550
 551        sk->sk_peek_off = val;
 552        mutex_unlock(&u->readlock);
 553
 554        return 0;
 555}
 556
 557
 558static const struct proto_ops unix_stream_ops = {
 559        .family =       PF_UNIX,
 560        .owner =        THIS_MODULE,
 561        .release =      unix_release,
 562        .bind =         unix_bind,
 563        .connect =      unix_stream_connect,
 564        .socketpair =   unix_socketpair,
 565        .accept =       unix_accept,
 566        .getname =      unix_getname,
 567        .poll =         unix_poll,
 568        .ioctl =        unix_ioctl,
 569        .listen =       unix_listen,
 570        .shutdown =     unix_shutdown,
 571        .setsockopt =   sock_no_setsockopt,
 572        .getsockopt =   sock_no_getsockopt,
 573        .sendmsg =      unix_stream_sendmsg,
 574        .recvmsg =      unix_stream_recvmsg,
 575        .mmap =         sock_no_mmap,
 576        .sendpage =     unix_stream_sendpage,
 577        .splice_read =  unix_stream_splice_read,
 578        .set_peek_off = unix_set_peek_off,
 579};
 580
 581static const struct proto_ops unix_dgram_ops = {
 582        .family =       PF_UNIX,
 583        .owner =        THIS_MODULE,
 584        .release =      unix_release,
 585        .bind =         unix_bind,
 586        .connect =      unix_dgram_connect,
 587        .socketpair =   unix_socketpair,
 588        .accept =       sock_no_accept,
 589        .getname =      unix_getname,
 590        .poll =         unix_dgram_poll,
 591        .ioctl =        unix_ioctl,
 592        .listen =       sock_no_listen,
 593        .shutdown =     unix_shutdown,
 594        .setsockopt =   sock_no_setsockopt,
 595        .getsockopt =   sock_no_getsockopt,
 596        .sendmsg =      unix_dgram_sendmsg,
 597        .recvmsg =      unix_dgram_recvmsg,
 598        .mmap =         sock_no_mmap,
 599        .sendpage =     sock_no_sendpage,
 600        .set_peek_off = unix_set_peek_off,
 601};
 602
 603static const struct proto_ops unix_seqpacket_ops = {
 604        .family =       PF_UNIX,
 605        .owner =        THIS_MODULE,
 606        .release =      unix_release,
 607        .bind =         unix_bind,
 608        .connect =      unix_stream_connect,
 609        .socketpair =   unix_socketpair,
 610        .accept =       unix_accept,
 611        .getname =      unix_getname,
 612        .poll =         unix_dgram_poll,
 613        .ioctl =        unix_ioctl,
 614        .listen =       unix_listen,
 615        .shutdown =     unix_shutdown,
 616        .setsockopt =   sock_no_setsockopt,
 617        .getsockopt =   sock_no_getsockopt,
 618        .sendmsg =      unix_seqpacket_sendmsg,
 619        .recvmsg =      unix_seqpacket_recvmsg,
 620        .mmap =         sock_no_mmap,
 621        .sendpage =     sock_no_sendpage,
 622        .set_peek_off = unix_set_peek_off,
 623};
 624
 625static struct proto unix_proto = {
 626        .name                   = "UNIX",
 627        .owner                  = THIS_MODULE,
 628        .obj_size               = sizeof(struct unix_sock),
 629};
 630
 631/*
 632 * AF_UNIX sockets do not interact with hardware, hence they
  633 * don't trigger interrupts - so it's safe for them to have
 634 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 635 * this special lock-class by reinitializing the spinlock key:
 636 */
 637static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 638
 639static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 640{
 641        struct sock *sk = NULL;
 642        struct unix_sock *u;
 643
 644        atomic_long_inc(&unix_nr_socks);
 645        if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 646                goto out;
 647
 648        sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 649        if (!sk)
 650                goto out;
 651
 652        sock_init_data(sock, sk);
 653        lockdep_set_class(&sk->sk_receive_queue.lock,
 654                                &af_unix_sk_receive_queue_lock_key);
 655
 656        sk->sk_write_space      = unix_write_space;
 657        sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 658        sk->sk_destruct         = unix_sock_destructor;
 659        u         = unix_sk(sk);
 660        u->path.dentry = NULL;
 661        u->path.mnt = NULL;
 662        spin_lock_init(&u->lock);
 663        atomic_long_set(&u->inflight, 0);
 664        INIT_LIST_HEAD(&u->link);
 665        mutex_init(&u->readlock); /* single task reading lock */
 666        init_waitqueue_head(&u->peer_wait);
 667        unix_insert_socket(unix_sockets_unbound(sk), sk);
 668out:
 669        if (sk == NULL)
 670                atomic_long_dec(&unix_nr_socks);
 671        else {
 672                local_bh_disable();
 673                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 674                local_bh_enable();
 675        }
 676        return sk;
 677}
 678
 679static int unix_create(struct net *net, struct socket *sock, int protocol,
 680                       int kern)
 681{
 682        if (protocol && protocol != PF_UNIX)
 683                return -EPROTONOSUPPORT;
 684
 685        sock->state = SS_UNCONNECTED;
 686
 687        switch (sock->type) {
 688        case SOCK_STREAM:
 689                sock->ops = &unix_stream_ops;
 690                break;
 691                /*
 692                 *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 693                 *      nothing uses it.
 694                 */
 695        case SOCK_RAW:
 696                sock->type = SOCK_DGRAM;
 697        case SOCK_DGRAM:
 698                sock->ops = &unix_dgram_ops;
 699                break;
 700        case SOCK_SEQPACKET:
 701                sock->ops = &unix_seqpacket_ops;
 702                break;
 703        default:
 704                return -ESOCKTNOSUPPORT;
 705        }
 706
 707        return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 708}
 709
 710static int unix_release(struct socket *sock)
 711{
 712        struct sock *sk = sock->sk;
 713
 714        if (!sk)
 715                return 0;
 716
 717        unix_release_sock(sk, 0);
 718        sock->sk = NULL;
 719
 720        return 0;
 721}
 722
 723static int unix_autobind(struct socket *sock)
 724{
 725        struct sock *sk = sock->sk;
 726        struct net *net = sock_net(sk);
 727        struct unix_sock *u = unix_sk(sk);
 728        static u32 ordernum = 1;
 729        struct unix_address *addr;
 730        int err;
 731        unsigned int retries = 0;
 732
 733        err = mutex_lock_interruptible(&u->readlock);
 734        if (err)
 735                return err;
 736
 737        err = 0;
 738        if (u->addr)
 739                goto out;
 740
 741        err = -ENOMEM;
 742        addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 743        if (!addr)
 744                goto out;
 745
 746        addr->name->sun_family = AF_UNIX;
 747        atomic_set(&addr->refcnt, 1);
 748
 749retry:
 750        addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 751        addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 752
 753        spin_lock(&unix_table_lock);
 754        ordernum = (ordernum+1)&0xFFFFF;
 755
 756        if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 757                                      addr->hash)) {
 758                spin_unlock(&unix_table_lock);
 759                /*
 760                 * __unix_find_socket_byname() may take long time if many names
 761                 * are already in use.
 762                 */
 763                cond_resched();
 764                /* Give up if all names seems to be in use. */
 765                if (retries++ == 0xFFFFF) {
 766                        err = -ENOSPC;
 767                        kfree(addr);
 768                        goto out;
 769                }
 770                goto retry;
 771        }
 772        addr->hash ^= sk->sk_type;
 773
 774        __unix_remove_socket(sk);
 775        u->addr = addr;
 776        __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 777        spin_unlock(&unix_table_lock);
 778        err = 0;
 779
 780out:    mutex_unlock(&u->readlock);
 781        return err;
 782}
 783
 /*
  * Resolve a destination address to a socket.
  *
  * A name whose sun_path starts with a non-zero byte is looked up as a
  * filesystem path (kern_path + unix_find_socket_byinode); otherwise the
  * name is looked up with unix_find_socket_byname() using @hash.
  *
  * Returns the matching socket, or NULL with *error set to a negative
  * errno: the kern_path()/inode_permission() failure, -ECONNREFUSED when
  * nothing suitable is found, or -EPROTOTYPE when the socket bound to the
  * path has a different sk_type than @type.  Callers in this file release
  * the returned socket with sock_put().
  */
 static struct sock *unix_find_other(struct net *net,
                                     struct sockaddr_un *sunname, int len,
                                     int type, unsigned int hash, int *error)
 {
         struct sock *found;
         struct inode *inode;
         struct path path;
         int err;

         if (!sunname->sun_path[0]) {
                 /* No leading path byte: look the name up by hash. */
                 found = unix_find_socket_byname(net, sunname, len, type, hash);
                 if (!found) {
                         err = -ECONNREFUSED;
                         goto fail;
                 }
                 if (unix_sk(found)->path.dentry)
                         touch_atime(&unix_sk(found)->path);
                 return found;
         }

         err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
         if (err)
                 goto fail;

         inode = d_backing_inode(path.dentry);
         err = inode_permission(inode, MAY_WRITE);
         if (err)
                 goto put_fail;

         err = -ECONNREFUSED;
         if (!S_ISSOCK(inode->i_mode))
                 goto put_fail;

         found = unix_find_socket_byinode(inode);
         if (!found)
                 goto put_fail;

         if (found->sk_type != type) {
                 /* Wrong socket type: drop the path, then the socket. */
                 path_put(&path);
                 sock_put(found);
                 err = -EPROTOTYPE;
                 goto fail;
         }

         /* Access time is only updated for a socket we will actually use. */
         touch_atime(&path);
         path_put(&path);
         return found;

 put_fail:
         path_put(&path);
 fail:
         *error = err;
         return NULL;
 }
 838
 839static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 840{
 841        struct dentry *dentry;
 842        struct path path;
 843        int err = 0;
 844        /*
 845         * Get the parent directory, calculate the hash for last
 846         * component.
 847         */
 848        dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 849        err = PTR_ERR(dentry);
 850        if (IS_ERR(dentry))
 851                return err;
 852
 853        /*
 854         * All right, let's create it.
 855         */
 856        err = security_path_mknod(&path, dentry, mode, 0);
 857        if (!err) {
 858                err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
 859                if (!err) {
 860                        res->mnt = mntget(path.mnt);
 861                        res->dentry = dget(dentry);
 862                }
 863        }
 864        done_path_create(&path, dentry);
 865        return err;
 866}
 867
 868static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 869{
 870        struct sock *sk = sock->sk;
 871        struct net *net = sock_net(sk);
 872        struct unix_sock *u = unix_sk(sk);
 873        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 874        char *sun_path = sunaddr->sun_path;
 875        int err;
 876        unsigned int hash;
 877        struct unix_address *addr;
 878        struct hlist_head *list;
 879
 880        err = -EINVAL;
 881        if (sunaddr->sun_family != AF_UNIX)
 882                goto out;
 883
 884        if (addr_len == sizeof(short)) {
 885                err = unix_autobind(sock);
 886                goto out;
 887        }
 888
 889        err = unix_mkname(sunaddr, addr_len, &hash);
 890        if (err < 0)
 891                goto out;
 892        addr_len = err;
 893
 894        err = mutex_lock_interruptible(&u->readlock);
 895        if (err)
 896                goto out;
 897
 898        err = -EINVAL;
 899        if (u->addr)
 900                goto out_up;
 901
 902        err = -ENOMEM;
 903        addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 904        if (!addr)
 905                goto out_up;
 906
 907        memcpy(addr->name, sunaddr, addr_len);
 908        addr->len = addr_len;
 909        addr->hash = hash ^ sk->sk_type;
 910        atomic_set(&addr->refcnt, 1);
 911
 912        if (sun_path[0]) {
 913                struct path path;
 914                umode_t mode = S_IFSOCK |
 915                       (SOCK_INODE(sock)->i_mode & ~current_umask());
 916                err = unix_mknod(sun_path, mode, &path);
 917                if (err) {
 918                        if (err == -EEXIST)
 919                                err = -EADDRINUSE;
 920                        unix_release_addr(addr);
 921                        goto out_up;
 922                }
 923                addr->hash = UNIX_HASH_SIZE;
 924                hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
 925                spin_lock(&unix_table_lock);
 926                u->path = path;
 927                list = &unix_socket_table[hash];
 928        } else {
 929                spin_lock(&unix_table_lock);
 930                err = -EADDRINUSE;
 931                if (__unix_find_socket_byname(net, sunaddr, addr_len,
 932                                              sk->sk_type, hash)) {
 933                        unix_release_addr(addr);
 934                        goto out_unlock;
 935                }
 936
 937                list = &unix_socket_table[addr->hash];
 938        }
 939
 940        err = 0;
 941        __unix_remove_socket(sk);
 942        u->addr = addr;
 943        __unix_insert_socket(list, sk);
 944
 945out_unlock:
 946        spin_unlock(&unix_table_lock);
 947out_up:
 948        mutex_unlock(&u->readlock);
 949out:
 950        return err;
 951}
 952
 /*
  * Take the state locks of two sockets.  When @sk2 is NULL or the same
  * socket as @sk1, only @sk1 is locked.  Otherwise the socket at the lower
  * address is locked first and the other one with the nested variant, so
  * the acquisition order does not depend on the argument order.
  */
 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 {
         struct sock *first, *second;

         if (!sk2 || unlikely(sk1 == sk2)) {
                 unix_state_lock(sk1);
                 return;
         }

         if (sk1 < sk2) {
                 first = sk1;
                 second = sk2;
         } else {
                 first = sk2;
                 second = sk1;
         }

         unix_state_lock(first);
         unix_state_lock_nested(second);
 }
 967
 /*
  * Release the state locks taken by unix_state_double_lock(): only @sk1
  * when @sk2 is NULL or the same socket, otherwise @sk1 followed by @sk2.
  */
 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 {
         unix_state_unlock(sk1);

         /* A second, distinct socket was locked as well. */
         if (sk2 && likely(sk1 != sk2))
                 unix_state_unlock(sk2);
 }
 977
 978static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 979                              int alen, int flags)
 980{
 981        struct sock *sk = sock->sk;
 982        struct net *net = sock_net(sk);
 983        struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 984        struct sock *other;
 985        unsigned int hash;
 986        int err;
 987
 988        if (addr->sa_family != AF_UNSPEC) {
 989                err = unix_mkname(sunaddr, alen, &hash);
 990                if (err < 0)
 991                        goto out;
 992                alen = err;
 993
 994                if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 995                    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 996                        goto out;
 997
 998restart:
 999                other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1000                if (!other)
1001                        goto out;
1002
1003                unix_state_double_lock(sk, other);
1004
1005                /* Apparently VFS overslept socket death. Retry. */
1006                if (sock_flag(other, SOCK_DEAD)) {
1007                        unix_state_double_unlock(sk, other);
1008                        sock_put(other);
1009                        goto restart;
1010                }
1011
1012                err = -EPERM;
1013                if (!unix_may_send(sk, other))
1014                        goto out_unlock;
1015
1016                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1017                if (err)
1018                        goto out_unlock;
1019
1020        } else {
1021                /*
1022                 *      1003.1g breaking connected state with AF_UNSPEC
1023                 */
1024                other = NULL;
1025                unix_state_double_lock(sk, other);
1026        }
1027
1028        /*
1029         * If it was connected, reconnect.
1030         */
1031        if (unix_peer(sk)) {
1032                struct sock *old_peer = unix_peer(sk);
1033                unix_peer(sk) = other;
1034                unix_state_double_unlock(sk, other);
1035
1036                if (other != old_peer)
1037                        unix_dgram_disconnected(sk, old_peer);
1038                sock_put(old_peer);
1039        } else {
1040                unix_peer(sk) = other;
1041                unix_state_double_unlock(sk, other);
1042        }
1043        return 0;
1044
1045out_unlock:
1046        unix_state_double_unlock(sk, other);
1047        sock_put(other);
1048out:
1049        return err;
1050}
1051
1052static long unix_wait_for_peer(struct sock *other, long timeo)
1053{
1054        struct unix_sock *u = unix_sk(other);
1055        int sched;
1056        DEFINE_WAIT(wait);
1057
1058        prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1059
1060        sched = !sock_flag(other, SOCK_DEAD) &&
1061                !(other->sk_shutdown & RCV_SHUTDOWN) &&
1062                unix_recvq_full(other);
1063
1064        unix_state_unlock(other);
1065
1066        if (sched)
1067                timeo = schedule_timeout(timeo);
1068
1069        finish_wait(&u->peer_wait, &wait);
1070        return timeo;
1071}
1072
/*
 * Connect a stream/seqpacket socket to a listening socket.
 *
 * A new sock (the future accepted end) and a one-byte skb charged to it
 * are allocated up front.  Once the listener has been found and both
 * state locks are held, the new sock is paired with @sock's sk and the
 * skb is queued on the listener's receive queue; unix_accept() later
 * picks the new sock up through skb->sk.
 *
 * Returns 0 on success or a negative errno: -ENOMEM, -ECONNREFUSED,
 * -EAGAIN (listener queue full and no timeout left), -EISCONN, -EINVAL,
 * the sock_intr_errno() value when a signal interrupts the wait, or an
 * error from unix_mkname(), unix_autobind(), unix_find_other() or the
 * security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk);
        struct unix_sock *newu, *listener;
        struct sock *newsk = NULL;
        struct sock *other = NULL;
        struct sk_buff *skb = NULL;
        unsigned int hash;
        int seen_state;
        int err;
        long timeo;

        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        addr_len = err;

        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
                err = unix_autobind(sock);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /*
         * Allocate everything first.  Doing it after the state is locked
         * would force all checks to be repeated anyway.
         */
        err = -ENOMEM;

        /* The sock that will represent the accepted end of the connection. */
        newsk = unix_create1(sock_net(sk), NULL, 0);
        if (newsk == NULL)
                goto out;

        /* The skb that announces the connection to the listening sock. */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /* Find the listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
        if (!other)
                goto out;

        /* Latch the state of the peer. */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* Drops the peer's state lock before sleeping. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /*
         * Latch our own state.
         *
         * This is the tricky place: our state lock has to be taken while
         * the peer's lock is still held, which could deadlock.  Connecting
         * to ourselves and simultaneous connect attempts are ruled out by
         * looking at the socket state first: other is TCP_LISTEN, and if
         * sk were TCP_LISTEN too we bail out before trying to lock it.
         *
         * The state is rechecked once the lock is held.
         */
        seen_state = sk->sk_state;

        if (seen_state == TCP_ESTABLISHED) {
                /* Socket is already connected. */
                err = -EISCONN;
                goto out_unlock;
        }
        if (seen_state != TCP_CLOSE) {
                err = -EINVAL;
                goto out_unlock;
        }
        /* TCP_CLOSE: fine, carry on with the connect. */

        unix_state_lock_nested(sk);

        if (sk->sk_state != seen_state) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open: fill in the new sock. */

        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        listener = unix_sk(other);

        /* Share the listener's address and path with the new sock. */
        if (listener->addr) {
                atomic_inc(&listener->addr->refcnt);
                newu->addr = listener->addr;
        }
        if (listener->path.dentry) {
                path_get(&listener->path);
                newu->path = listener->path;
        }

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        sk->sk_state    = TCP_ESTABLISHED;
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* Hand the announcement skb to the listening sock. */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1247
1248static int unix_socketpair(struct socket *socka, struct socket *sockb)
1249{
1250        struct sock *ska = socka->sk, *skb = sockb->sk;
1251
1252        /* Join our sockets back to back */
1253        sock_hold(ska);
1254        sock_hold(skb);
1255        unix_peer(ska) = skb;
1256        unix_peer(skb) = ska;
1257        init_peercred(ska);
1258        init_peercred(skb);
1259
1260        if (ska->sk_type != SOCK_DGRAM) {
1261                ska->sk_state = TCP_ESTABLISHED;
1262                skb->sk_state = TCP_ESTABLISHED;
1263                socka->state  = SS_CONNECTED;
1264                sockb->state  = SS_CONNECTED;
1265        }
1266        return 0;
1267}
1268
1269static void unix_sock_inherit_flags(const struct socket *old,
1270                                    struct socket *new)
1271{
1272        if (test_bit(SOCK_PASSCRED, &old->flags))
1273                set_bit(SOCK_PASSCRED, &new->flags);
1274        if (test_bit(SOCK_PASSSEC, &old->flags))
1275                set_bit(SOCK_PASSSEC, &new->flags);
1276}
1277
1278static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1279{
1280        struct sock *sk = sock->sk;
1281        struct sock *tsk;
1282        struct sk_buff *skb;
1283        int err;
1284
1285        err = -EOPNOTSUPP;
1286        if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1287                goto out;
1288
1289        err = -EINVAL;
1290        if (sk->sk_state != TCP_LISTEN)
1291                goto out;
1292
1293        /* If socket state is TCP_LISTEN it cannot change (for now...),
1294         * so that no locks are necessary.
1295         */
1296
1297        skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1298        if (!skb) {
1299                /* This means receive shutdown. */
1300                if (err == 0)
1301                        err = -EINVAL;
1302                goto out;
1303        }
1304
1305        tsk = skb->sk;
1306        skb_free_datagram(sk, skb);
1307        wake_up_interruptible(&unix_sk(sk)->peer_wait);
1308
1309        /* attach accepted sock to socket */
1310        unix_state_lock(tsk);
1311        newsock->state = SS_CONNECTED;
1312        unix_sock_inherit_flags(sock, newsock);
1313        sock_graft(tsk, newsock);
1314        unix_state_unlock(tsk);
1315        return 0;
1316
1317out:
1318        return err;
1319}
1320
1321
1322static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1323{
1324        struct sock *sk = sock->sk;
1325        struct unix_sock *u;
1326        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1327        int err = 0;
1328
1329        if (peer) {
1330                sk = unix_peer_get(sk);
1331
1332                err = -ENOTCONN;
1333                if (!sk)
1334                        goto out;
1335                err = 0;
1336        } else {
1337                sock_hold(sk);
1338        }
1339
1340        u = unix_sk(sk);
1341        unix_state_lock(sk);
1342        if (!u->addr) {
1343                sunaddr->sun_family = AF_UNIX;
1344                sunaddr->sun_path[0] = 0;
1345                *uaddr_len = sizeof(short);
1346        } else {
1347                struct unix_address *addr = u->addr;
1348
1349                *uaddr_len = addr->len;
1350                memcpy(sunaddr, addr->name, *uaddr_len);
1351        }
1352        unix_state_unlock(sk);
1353        sock_put(sk);
1354out:
1355        return err;
1356}
1357
1358static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1359{
1360        int i;
1361
1362        scm->fp = UNIXCB(skb).fp;
1363        UNIXCB(skb).fp = NULL;
1364
1365        for (i = scm->fp->count-1; i >= 0; i--)
1366                unix_notinflight(scm->fp->fp[i]);
1367}
1368
1369static void unix_destruct_scm(struct sk_buff *skb)
1370{
1371        struct scm_cookie scm;
1372        memset(&scm, 0, sizeof(scm));
1373        scm.pid  = UNIXCB(skb).pid;
1374        if (UNIXCB(skb).fp)
1375                unix_detach_fds(&scm, skb);
1376
1377        /* Alas, it calls VFS */
1378        /* So fscking what? fput() had been SMP-safe since the last Summer */
1379        scm_destroy(&scm);
1380        sock_wfree(skb);
1381}
1382
1383#define MAX_RECURSION_LEVEL 4
1384
1385static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1386{
1387        int i;
1388        unsigned char max_level = 0;
1389        int unix_sock_count = 0;
1390
1391        for (i = scm->fp->count - 1; i >= 0; i--) {
1392                struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1393
1394                if (sk) {
1395                        unix_sock_count++;
1396                        max_level = max(max_level,
1397                                        unix_sk(sk)->recursion_level);
1398                }
1399        }
1400        if (unlikely(max_level > MAX_RECURSION_LEVEL))
1401                return -ETOOMANYREFS;
1402
1403        /*
1404         * Need to duplicate file references for the sake of garbage
1405         * collection.  Otherwise a socket in the fps might become a
1406         * candidate for GC while the skb is not yet queued.
1407         */
1408        UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1409        if (!UNIXCB(skb).fp)
1410                return -ENOMEM;
1411
1412        if (unix_sock_count) {
1413                for (i = scm->fp->count - 1; i >= 0; i--)
1414                        unix_inflight(scm->fp->fp[i]);
1415        }
1416        return max_level;
1417}
1418
1419static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1420{
1421        int err = 0;
1422
1423        UNIXCB(skb).pid  = get_pid(scm->pid);
1424        UNIXCB(skb).uid = scm->creds.uid;
1425        UNIXCB(skb).gid = scm->creds.gid;
1426        UNIXCB(skb).fp = NULL;
1427        unix_get_secdata(scm, skb);
1428        if (scm->fp && send_fds)
1429                err = unix_attach_fds(scm, skb);
1430
1431        skb->destructor = unix_destruct_scm;
1432        return err;
1433}
1434
1435/*
1436 * Some apps rely on write() giving SCM_CREDENTIALS
1437 * We include credentials if source or destination socket
1438 * asserted SOCK_PASSCRED.
1439 */
1440static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1441                            const struct sock *other)
1442{
1443        if (UNIXCB(skb).pid)
1444                return;
1445        if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1446            !other->sk_socket ||
1447            test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1448                UNIXCB(skb).pid  = get_pid(task_tgid(current));
1449                current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1450        }
1451}
1452
1453/*
1454 *      Send AF_UNIX data.
1455 */
1456
1457static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1458                              size_t len)
1459{
1460        struct sock *sk = sock->sk;
1461        struct net *net = sock_net(sk);
1462        struct unix_sock *u = unix_sk(sk);
1463        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1464        struct sock *other = NULL;
1465        int namelen = 0; /* fake GCC */
1466        int err;
1467        unsigned int hash;
1468        struct sk_buff *skb;
1469        long timeo;
1470        struct scm_cookie scm;
1471        int max_level;
1472        int data_len = 0;
1473
1474        wait_for_unix_gc();
1475        err = scm_send(sock, msg, &scm, false);
1476        if (err < 0)
1477                return err;
1478
1479        err = -EOPNOTSUPP;
1480        if (msg->msg_flags&MSG_OOB)
1481                goto out;
1482
1483        if (msg->msg_namelen) {
1484                err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1485                if (err < 0)
1486                        goto out;
1487                namelen = err;
1488        } else {
1489                sunaddr = NULL;
1490                err = -ENOTCONN;
1491                other = unix_peer_get(sk);
1492                if (!other)
1493                        goto out;
1494        }
1495
1496        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1497            && (err = unix_autobind(sock)) != 0)
1498                goto out;
1499
1500        err = -EMSGSIZE;
1501        if (len > sk->sk_sndbuf - 32)
1502                goto out;
1503
1504        if (len > SKB_MAX_ALLOC) {
1505                data_len = min_t(size_t,
1506                                 len - SKB_MAX_ALLOC,
1507                                 MAX_SKB_FRAGS * PAGE_SIZE);
1508                data_len = PAGE_ALIGN(data_len);
1509
1510                BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1511        }
1512
1513        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1514                                   msg->msg_flags & MSG_DONTWAIT, &err,
1515                                   PAGE_ALLOC_COSTLY_ORDER);
1516        if (skb == NULL)
1517                goto out;
1518
1519        err = unix_scm_to_skb(&scm, skb, true);
1520        if (err < 0)
1521                goto out_free;
1522        max_level = err + 1;
1523
1524        skb_put(skb, len - data_len);
1525        skb->data_len = data_len;
1526        skb->len = len;
1527        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1528        if (err)
1529                goto out_free;
1530
1531        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1532
1533restart:
1534        if (!other) {
1535                err = -ECONNRESET;
1536                if (sunaddr == NULL)
1537                        goto out_free;
1538
1539                other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1540                                        hash, &err);
1541                if (other == NULL)
1542                        goto out_free;
1543        }
1544
1545        if (sk_filter(other, skb) < 0) {
1546                /* Toss the packet but do not return any error to the sender */
1547                err = len;
1548                goto out_free;
1549        }
1550
1551        unix_state_lock(other);
1552        err = -EPERM;
1553        if (!unix_may_send(sk, other))
1554                goto out_unlock;
1555
1556        if (sock_flag(other, SOCK_DEAD)) {
1557                /*
1558                 *      Check with 1003.1g - what should
1559                 *      datagram error
1560                 */
1561                unix_state_unlock(other);
1562                sock_put(other);
1563
1564                err = 0;
1565                unix_state_lock(sk);
1566                if (unix_peer(sk) == other) {
1567                        unix_peer(sk) = NULL;
1568                        unix_state_unlock(sk);
1569
1570                        unix_dgram_disconnected(sk, other);
1571                        sock_put(other);
1572                        err = -ECONNREFUSED;
1573                } else {
1574                        unix_state_unlock(sk);
1575                }
1576
1577                other = NULL;
1578                if (err)
1579                        goto out_free;
1580                goto restart;
1581        }
1582
1583        err = -EPIPE;
1584        if (other->sk_shutdown & RCV_SHUTDOWN)
1585                goto out_unlock;
1586
1587        if (sk->sk_type != SOCK_SEQPACKET) {
1588                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1589                if (err)
1590                        goto out_unlock;
1591        }
1592
1593        if (unix_peer(other) != sk && unix_recvq_full(other)) {
1594                if (!timeo) {
1595                        err = -EAGAIN;
1596                        goto out_unlock;
1597                }
1598
1599                timeo = unix_wait_for_peer(other, timeo);
1600
1601                err = sock_intr_errno(timeo);
1602                if (signal_pending(current))
1603                        goto out_free;
1604
1605                goto restart;
1606        }
1607
1608        if (sock_flag(other, SOCK_RCVTSTAMP))
1609                __net_timestamp(skb);
1610        maybe_add_creds(skb, sock, other);
1611        skb_queue_tail(&other->sk_receive_queue, skb);
1612        if (max_level > unix_sk(other)->recursion_level)
1613                unix_sk(other)->recursion_level = max_level;
1614        unix_state_unlock(other);
1615        other->sk_data_ready(other);
1616        sock_put(other);
1617        scm_destroy(&scm);
1618        return len;
1619
1620out_unlock:
1621        unix_state_unlock(other);
1622out_free:
1623        kfree_skb(skb);
1624out:
1625        if (other)
1626                sock_put(other);
1627        scm_destroy(&scm);
1628        return err;
1629}
1630
1631/* We use paged skbs for stream sockets, and limit occupancy to 32768
1632 * bytes, and a minimun of a full page.
1633 */
1634#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1635
1636static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1637                               size_t len)
1638{
1639        struct sock *sk = sock->sk;
1640        struct sock *other = NULL;
1641        int err, size;
1642        struct sk_buff *skb;
1643        int sent = 0;
1644        struct scm_cookie scm;
1645        bool fds_sent = false;
1646        int max_level;
1647        int data_len;
1648
1649        wait_for_unix_gc();
1650        err = scm_send(sock, msg, &scm, false);
1651        if (err < 0)
1652                return err;
1653
1654        err = -EOPNOTSUPP;
1655        if (msg->msg_flags&MSG_OOB)
1656                goto out_err;
1657
1658        if (msg->msg_namelen) {
1659                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1660                goto out_err;
1661        } else {
1662                err = -ENOTCONN;
1663                other = unix_peer(sk);
1664                if (!other)
1665                        goto out_err;
1666        }
1667
1668        if (sk->sk_shutdown & SEND_SHUTDOWN)
1669                goto pipe_err;
1670
1671        while (sent < len) {
1672                size = len - sent;
1673
1674                /* Keep two messages in the pipe so it schedules better */
1675                size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1676
1677                /* allow fallback to order-0 allocations */
1678                size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1679
1680                data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1681
1682                data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1683
1684                skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1685                                           msg->msg_flags & MSG_DONTWAIT, &err,
1686                                           get_order(UNIX_SKB_FRAGS_SZ));
1687                if (!skb)
1688                        goto out_err;
1689
1690                /* Only send the fds in the first buffer */
1691                err = unix_scm_to_skb(&scm, skb, !fds_sent);
1692                if (err < 0) {
1693                        kfree_skb(skb);
1694                        goto out_err;
1695                }
1696                max_level = err + 1;
1697                fds_sent = true;
1698
1699                skb_put(skb, size - data_len);
1700                skb->data_len = data_len;
1701                skb->len = size;
1702                err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1703                if (err) {
1704                        kfree_skb(skb);
1705                        goto out_err;
1706                }
1707
1708                unix_state_lock(other);
1709
1710                if (sock_flag(other, SOCK_DEAD) ||
1711                    (other->sk_shutdown & RCV_SHUTDOWN))
1712                        goto pipe_err_free;
1713
1714                maybe_add_creds(skb, sock, other);
1715                skb_queue_tail(&other->sk_receive_queue, skb);
1716                if (max_level > unix_sk(other)->recursion_level)
1717                        unix_sk(other)->recursion_level = max_level;
1718                unix_state_unlock(other);
1719                other->sk_data_ready(other);
1720                sent += size;
1721        }
1722
1723        scm_destroy(&scm);
1724
1725        return sent;
1726
1727pipe_err_free:
1728        unix_state_unlock(other);
1729        kfree_skb(skb);
1730pipe_err:
1731        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1732                send_sig(SIGPIPE, current, 0);
1733        err = -EPIPE;
1734out_err:
1735        scm_destroy(&scm);
1736        return sent ? : err;
1737}
1738
1739static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1740                                    int offset, size_t size, int flags)
1741{
1742        int err = 0;
1743        bool send_sigpipe = true;
1744        struct sock *other, *sk = socket->sk;
1745        struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1746
1747        if (flags & MSG_OOB)
1748                return -EOPNOTSUPP;
1749
1750        other = unix_peer(sk);
1751        if (!other || sk->sk_state != TCP_ESTABLISHED)
1752                return -ENOTCONN;
1753
1754        if (false) {
1755alloc_skb:
1756                unix_state_unlock(other);
1757                mutex_unlock(&unix_sk(other)->readlock);
1758                newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1759                                              &err, 0);
1760                if (!newskb)
1761                        return err;
1762        }
1763
1764        /* we must acquire readlock as we modify already present
1765         * skbs in the sk_receive_queue and mess with skb->len
1766         */
1767        err = mutex_lock_interruptible(&unix_sk(other)->readlock);
1768        if (err) {
1769                err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1770                send_sigpipe = false;
1771                goto err;
1772        }
1773
1774        if (sk->sk_shutdown & SEND_SHUTDOWN) {
1775                err = -EPIPE;
1776                goto err_unlock;
1777        }
1778
1779        unix_state_lock(other);
1780
1781        if (sock_flag(other, SOCK_DEAD) ||
1782            other->sk_shutdown & RCV_SHUTDOWN) {
1783                err = -EPIPE;
1784                goto err_state_unlock;
1785        }
1786
1787        skb = skb_peek_tail(&other->sk_receive_queue);
1788        if (tail && tail == skb) {
1789                skb = newskb;
1790        } else if (!skb) {
1791                if (newskb)
1792                        skb = newskb;
1793                else
1794                        goto alloc_skb;
1795        } else if (newskb) {
1796                /* this is fast path, we don't necessarily need to
1797                 * call to kfree_skb even though with newskb == NULL
1798                 * this - does no harm
1799                 */
1800                consume_skb(newskb);
1801        }
1802
1803        if (skb_append_pagefrags(skb, page, offset, size)) {
1804                tail = skb;
1805                goto alloc_skb;
1806        }
1807
1808        skb->len += size;
1809        skb->data_len += size;
1810        skb->truesize += size;
1811        atomic_add(size, &sk->sk_wmem_alloc);
1812
1813        if (newskb)
1814                __skb_queue_tail(&other->sk_receive_queue, newskb);
1815
1816        unix_state_unlock(other);
1817        mutex_unlock(&unix_sk(other)->readlock);
1818
1819        other->sk_data_ready(other);
1820
1821        return size;
1822
1823err_state_unlock:
1824        unix_state_unlock(other);
1825err_unlock:
1826        mutex_unlock(&unix_sk(other)->readlock);
1827err:
1828        kfree_skb(newskb);
1829        if (send_sigpipe && !(flags & MSG_NOSIGNAL))
1830                send_sig(SIGPIPE, current, 0);
1831        return err;
1832}
1833
1834static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1835                                  size_t len)
1836{
1837        int err;
1838        struct sock *sk = sock->sk;
1839
1840        err = sock_error(sk);
1841        if (err)
1842                return err;
1843
1844        if (sk->sk_state != TCP_ESTABLISHED)
1845                return -ENOTCONN;
1846
1847        if (msg->msg_namelen)
1848                msg->msg_namelen = 0;
1849
1850        return unix_dgram_sendmsg(sock, msg, len);
1851}
1852
1853static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
1854                                  size_t size, int flags)
1855{
1856        struct sock *sk = sock->sk;
1857
1858        if (sk->sk_state != TCP_ESTABLISHED)
1859                return -ENOTCONN;
1860
1861        return unix_dgram_recvmsg(sock, msg, size, flags);
1862}
1863
1864static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1865{
1866        struct unix_sock *u = unix_sk(sk);
1867
1868        if (u->addr) {
1869                msg->msg_namelen = u->addr->len;
1870                memcpy(msg->msg_name, u->addr->name, u->addr->len);
1871        }
1872}
1873
1874static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
1875                              size_t size, int flags)
1876{
1877        struct scm_cookie scm;
1878        struct sock *sk = sock->sk;
1879        struct unix_sock *u = unix_sk(sk);
1880        int noblock = flags & MSG_DONTWAIT;
1881        struct sk_buff *skb;
1882        int err;
1883        int peeked, skip;
1884
1885        err = -EOPNOTSUPP;
1886        if (flags&MSG_OOB)
1887                goto out;
1888
1889        err = mutex_lock_interruptible(&u->readlock);
1890        if (unlikely(err)) {
1891                /* recvmsg() in non blocking mode is supposed to return -EAGAIN
1892                 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1893                 */
1894                err = noblock ? -EAGAIN : -ERESTARTSYS;
1895                goto out;
1896        }
1897
1898        skip = sk_peek_offset(sk, flags);
1899
1900        skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1901        if (!skb) {
1902                unix_state_lock(sk);
1903                /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1904                if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1905                    (sk->sk_shutdown & RCV_SHUTDOWN))
1906                        err = 0;
1907                unix_state_unlock(sk);
1908                goto out_unlock;
1909        }
1910
1911        wake_up_interruptible_sync_poll(&u->peer_wait,
1912                                        POLLOUT | POLLWRNORM | POLLWRBAND);
1913
1914        if (msg->msg_name)
1915                unix_copy_addr(msg, skb->sk);
1916
1917        if (size > skb->len - skip)
1918                size = skb->len - skip;
1919        else if (size < skb->len - skip)
1920                msg->msg_flags |= MSG_TRUNC;
1921
1922        err = skb_copy_datagram_msg(skb, skip, msg, size);
1923        if (err)
1924                goto out_free;
1925
1926        if (sock_flag(sk, SOCK_RCVTSTAMP))
1927                __sock_recv_timestamp(msg, sk, skb);
1928
1929        memset(&scm, 0, sizeof(scm));
1930
1931        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1932        unix_set_secdata(&scm, skb);
1933
1934        if (!(flags & MSG_PEEK)) {
1935                if (UNIXCB(skb).fp)
1936                        unix_detach_fds(&scm, skb);
1937
1938                sk_peek_offset_bwd(sk, skb->len);
1939        } else {
1940                /* It is questionable: on PEEK we could:
1941                   - do not return fds - good, but too simple 8)
1942                   - return fds, and do not return them on read (old strategy,
1943                     apparently wrong)
1944                   - clone fds (I chose it for now, it is the most universal
1945                     solution)
1946
1947                   POSIX 1003.1g does not actually define this clearly
1948                   at all. POSIX 1003.1g doesn't define a lot of things
1949                   clearly however!
1950
1951                */
1952
1953                sk_peek_offset_fwd(sk, size);
1954
1955                if (UNIXCB(skb).fp)
1956                        scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1957        }
1958        err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1959
1960        scm_recv(sock, msg, &scm, flags);
1961
1962out_free:
1963        skb_free_datagram(sk, skb);
1964out_unlock:
1965        mutex_unlock(&u->readlock);
1966out:
1967        return err;
1968}
1969
1970/*
1971 *      Sleep until more data has arrived. But check for races..
1972 */
1973static long unix_stream_data_wait(struct sock *sk, long timeo,
1974                                  struct sk_buff *last, unsigned int last_len)
1975{
1976        struct sk_buff *tail;
1977        DEFINE_WAIT(wait);
1978
1979        unix_state_lock(sk);
1980
1981        for (;;) {
1982                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1983
1984                tail = skb_peek_tail(&sk->sk_receive_queue);
1985                if (tail != last ||
1986                    (tail && tail->len != last_len) ||
1987                    sk->sk_err ||
1988                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1989                    signal_pending(current) ||
1990                    !timeo)
1991                        break;
1992
1993                set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1994                unix_state_unlock(sk);
1995                timeo = freezable_schedule_timeout(timeo);
1996                unix_state_lock(sk);
1997
1998                if (sock_flag(sk, SOCK_DEAD))
1999                        break;
2000
2001                clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2002        }
2003
2004        finish_wait(sk_sleep(sk), &wait);
2005        unix_state_unlock(sk);
2006        return timeo;
2007}
2008
2009static unsigned int unix_skb_len(const struct sk_buff *skb)
2010{
2011        return skb->len - UNIXCB(skb).consumed;
2012}
2013
2014struct unix_stream_read_state {
2015        int (*recv_actor)(struct sk_buff *, int, int,
2016                          struct unix_stream_read_state *);
2017        struct socket *socket;
2018        struct msghdr *msg;
2019        struct pipe_inode_info *pipe;
2020        size_t size;
2021        int flags;
2022        unsigned int splice_flags;
2023};
2024
2025static int unix_stream_read_generic(struct unix_stream_read_state *state)
2026{
2027        struct scm_cookie scm;
2028        struct socket *sock = state->socket;
2029        struct sock *sk = sock->sk;
2030        struct unix_sock *u = unix_sk(sk);
2031        int copied = 0;
2032        int flags = state->flags;
2033        int noblock = flags & MSG_DONTWAIT;
2034        bool check_creds = false;
2035        int target;
2036        int err = 0;
2037        long timeo;
2038        int skip;
2039        size_t size = state->size;
2040        unsigned int last_len;
2041
2042        err = -EINVAL;
2043        if (sk->sk_state != TCP_ESTABLISHED)
2044                goto out;
2045
2046        err = -EOPNOTSUPP;
2047        if (flags & MSG_OOB)
2048                goto out;
2049
2050        target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2051        timeo = sock_rcvtimeo(sk, noblock);
2052
2053        memset(&scm, 0, sizeof(scm));
2054
2055        /* Lock the socket to prevent queue disordering
2056         * while sleeps in memcpy_tomsg
2057         */
2058        err = mutex_lock_interruptible(&u->readlock);
2059        if (unlikely(err)) {
2060                /* recvmsg() in non blocking mode is supposed to return -EAGAIN
2061                 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
2062                 */
2063                err = noblock ? -EAGAIN : -ERESTARTSYS;
2064                goto out;
2065        }
2066
2067        if (flags & MSG_PEEK)
2068                skip = sk_peek_offset(sk, flags);
2069        else
2070                skip = 0;
2071
2072        do {
2073                int chunk;
2074                struct sk_buff *skb, *last;
2075
2076                unix_state_lock(sk);
2077                if (sock_flag(sk, SOCK_DEAD)) {
2078                        err = -ECONNRESET;
2079                        goto unlock;
2080                }
2081                last = skb = skb_peek(&sk->sk_receive_queue);
2082                last_len = last ? last->len : 0;
2083again:
2084                if (skb == NULL) {
2085                        unix_sk(sk)->recursion_level = 0;
2086                        if (copied >= target)
2087                                goto unlock;
2088
2089                        /*
2090                         *      POSIX 1003.1g mandates this order.
2091                         */
2092
2093                        err = sock_error(sk);
2094                        if (err)
2095                                goto unlock;
2096                        if (sk->sk_shutdown & RCV_SHUTDOWN)
2097                                goto unlock;
2098
2099                        unix_state_unlock(sk);
2100                        err = -EAGAIN;
2101                        if (!timeo)
2102                                break;
2103                        mutex_unlock(&u->readlock);
2104
2105                        timeo = unix_stream_data_wait(sk, timeo, last,
2106                                                      last_len);
2107
2108                        if (signal_pending(current) ||
2109                            mutex_lock_interruptible(&u->readlock)) {
2110                                err = sock_intr_errno(timeo);
2111                                goto out;
2112                        }
2113
2114                        continue;
2115unlock:
2116                        unix_state_unlock(sk);
2117                        break;
2118                }
2119
2120                while (skip >= unix_skb_len(skb)) {
2121                        skip -= unix_skb_len(skb);
2122                        last = skb;
2123                        last_len = skb->len;
2124                        skb = skb_peek_next(skb, &sk->sk_receive_queue);
2125                        if (!skb)
2126                                goto again;
2127                }
2128
2129                unix_state_unlock(sk);
2130
2131                if (check_creds) {
2132                        /* Never glue messages from different writers */
2133                        if ((UNIXCB(skb).pid  != scm.pid) ||
2134                            !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
2135                            !gid_eq(UNIXCB(skb).gid, scm.creds.gid) ||
2136                            !unix_secdata_eq(&scm, skb))
2137                                break;
2138                } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2139                        /* Copy credentials */
2140                        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2141                        unix_set_secdata(&scm, skb);
2142                        check_creds = true;
2143                }
2144
2145                /* Copy address just once */
2146                if (state->msg && state->msg->msg_name) {
2147                        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2148                                         state->msg->msg_name);
2149                        unix_copy_addr(state->msg, skb->sk);
2150                        sunaddr = NULL;
2151                }
2152
2153                chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2154                chunk = state->recv_actor(skb, skip, chunk, state);
2155                if (chunk < 0) {
2156                        if (copied == 0)
2157                                copied = -EFAULT;
2158                        break;
2159                }
2160                copied += chunk;
2161                size -= chunk;
2162
2163                /* Mark read part of skb as used */
2164                if (!(flags & MSG_PEEK)) {
2165                        UNIXCB(skb).consumed += chunk;
2166
2167                        sk_peek_offset_bwd(sk, chunk);
2168
2169                        if (UNIXCB(skb).fp)
2170                                unix_detach_fds(&scm, skb);
2171
2172                        if (unix_skb_len(skb))
2173                                break;
2174
2175                        skb_unlink(skb, &sk->sk_receive_queue);
2176                        consume_skb(skb);
2177
2178                        if (scm.fp)
2179                                break;
2180                } else {
2181                        /* It is questionable, see note in unix_dgram_recvmsg.
2182                         */
2183                        if (UNIXCB(skb).fp)
2184                                scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2185
2186                        sk_peek_offset_fwd(sk, chunk);
2187
2188                        if (UNIXCB(skb).fp)
2189                                break;
2190
2191                        skip = 0;
2192                        last = skb;
2193                        last_len = skb->len;
2194                        unix_state_lock(sk);
2195                        skb = skb_peek_next(skb, &sk->sk_receive_queue);
2196                        if (skb)
2197                                goto again;
2198                        unix_state_unlock(sk);
2199                        break;
2200                }
2201        } while (size);
2202
2203        mutex_unlock(&u->readlock);
2204        if (state->msg)
2205                scm_recv(sock, state->msg, &scm, flags);
2206        else
2207                scm_destroy(&scm);
2208out:
2209        return copied ? : err;
2210}
2211
2212static int unix_stream_read_actor(struct sk_buff *skb,
2213                                  int skip, int chunk,
2214                                  struct unix_stream_read_state *state)
2215{
2216        int ret;
2217
2218        ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2219                                    state->msg, chunk);
2220        return ret ?: chunk;
2221}
2222
2223static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2224                               size_t size, int flags)
2225{
2226        struct unix_stream_read_state state = {
2227                .recv_actor = unix_stream_read_actor,
2228                .socket = sock,
2229                .msg = msg,
2230                .size = size,
2231                .flags = flags
2232        };
2233
2234        return unix_stream_read_generic(&state);
2235}
2236
2237static ssize_t skb_unix_socket_splice(struct sock *sk,
2238                                      struct pipe_inode_info *pipe,
2239                                      struct splice_pipe_desc *spd)
2240{
2241        int ret;
2242        struct unix_sock *u = unix_sk(sk);
2243
2244        mutex_unlock(&u->readlock);
2245        ret = splice_to_pipe(pipe, spd);
2246        mutex_lock(&u->readlock);
2247
2248        return ret;
2249}
2250
2251static int unix_stream_splice_actor(struct sk_buff *skb,
2252                                    int skip, int chunk,
2253                                    struct unix_stream_read_state *state)
2254{
2255        return skb_splice_bits(skb, state->socket->sk,
2256                               UNIXCB(skb).consumed + skip,
2257                               state->pipe, chunk, state->splice_flags,
2258                               skb_unix_socket_splice);
2259}
2260
2261static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2262                                       struct pipe_inode_info *pipe,
2263                                       size_t size, unsigned int flags)
2264{
2265        struct unix_stream_read_state state = {
2266                .recv_actor = unix_stream_splice_actor,
2267                .socket = sock,
2268                .pipe = pipe,
2269                .size = size,
2270                .splice_flags = flags,
2271        };
2272
2273        if (unlikely(*ppos))
2274                return -ESPIPE;
2275
2276        if (sock->file->f_flags & O_NONBLOCK ||
2277            flags & SPLICE_F_NONBLOCK)
2278                state.flags = MSG_DONTWAIT;
2279
2280        return unix_stream_read_generic(&state);
2281}
2282
2283static int unix_shutdown(struct socket *sock, int mode)
2284{
2285        struct sock *sk = sock->sk;
2286        struct sock *other;
2287
2288        if (mode < SHUT_RD || mode > SHUT_RDWR)
2289                return -EINVAL;
2290        /* This maps:
2291         * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2292         * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2293         * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2294         */
2295        ++mode;
2296
2297        unix_state_lock(sk);
2298        sk->sk_shutdown |= mode;
2299        other = unix_peer(sk);
2300        if (other)
2301                sock_hold(other);
2302        unix_state_unlock(sk);
2303        sk->sk_state_change(sk);
2304
2305        if (other &&
2306                (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2307
2308                int peer_mode = 0;
2309
2310                if (mode&RCV_SHUTDOWN)
2311                        peer_mode |= SEND_SHUTDOWN;
2312                if (mode&SEND_SHUTDOWN)
2313                        peer_mode |= RCV_SHUTDOWN;
2314                unix_state_lock(other);
2315                other->sk_shutdown |= peer_mode;
2316                unix_state_unlock(other);
2317                other->sk_state_change(other);
2318                if (peer_mode == SHUTDOWN_MASK)
2319                        sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2320                else if (peer_mode & RCV_SHUTDOWN)
2321                        sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2322        }
2323        if (other)
2324                sock_put(other);
2325
2326        return 0;
2327}
2328
2329long unix_inq_len(struct sock *sk)
2330{
2331        struct sk_buff *skb;
2332        long amount = 0;
2333
2334        if (sk->sk_state == TCP_LISTEN)
2335                return -EINVAL;
2336
2337        spin_lock(&sk->sk_receive_queue.lock);
2338        if (sk->sk_type == SOCK_STREAM ||
2339            sk->sk_type == SOCK_SEQPACKET) {
2340                skb_queue_walk(&sk->sk_receive_queue, skb)
2341                        amount += unix_skb_len(skb);
2342        } else {
2343                skb = skb_peek(&sk->sk_receive_queue);
2344                if (skb)
2345                        amount = skb->len;
2346        }
2347        spin_unlock(&sk->sk_receive_queue.lock);
2348
2349        return amount;
2350}
2351EXPORT_SYMBOL_GPL(unix_inq_len);
2352
/* Send-side memory currently charged to @sk, per sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2358
2359static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2360{
2361        struct sock *sk = sock->sk;
2362        long amount = 0;
2363        int err;
2364
2365        switch (cmd) {
2366        case SIOCOUTQ:
2367                amount = unix_outq_len(sk);
2368                err = put_user(amount, (int __user *)arg);
2369                break;
2370        case SIOCINQ:
2371                amount = unix_inq_len(sk);
2372                if (amount < 0)
2373                        err = amount;
2374                else
2375                        err = put_user(amount, (int __user *)arg);
2376                break;
2377        default:
2378                err = -ENOIOCTLCMD;
2379                break;
2380        }
2381        return err;
2382}
2383
2384static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2385{
2386        struct sock *sk = sock->sk;
2387        unsigned int mask;
2388
2389        sock_poll_wait(file, sk_sleep(sk), wait);
2390        mask = 0;
2391
2392        /* exceptional events? */
2393        if (sk->sk_err)
2394                mask |= POLLERR;
2395        if (sk->sk_shutdown == SHUTDOWN_MASK)
2396                mask |= POLLHUP;
2397        if (sk->sk_shutdown & RCV_SHUTDOWN)
2398                mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2399
2400        /* readable? */
2401        if (!skb_queue_empty(&sk->sk_receive_queue))
2402                mask |= POLLIN | POLLRDNORM;
2403
2404        /* Connection-based need to check for termination and startup */
2405        if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2406            sk->sk_state == TCP_CLOSE)
2407                mask |= POLLHUP;
2408
2409        /*
2410         * we set writable also when the other side has shut down the
2411         * connection. This prevents stuck sockets.
2412         */
2413        if (unix_writable(sk))
2414                mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2415
2416        return mask;
2417}
2418
2419static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2420                                    poll_table *wait)
2421{
2422        struct sock *sk = sock->sk, *other;
2423        unsigned int mask, writable;
2424
2425        sock_poll_wait(file, sk_sleep(sk), wait);
2426        mask = 0;
2427
2428        /* exceptional events? */
2429        if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2430                mask |= POLLERR |
2431                        (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2432
2433        if (sk->sk_shutdown & RCV_SHUTDOWN)
2434                mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2435        if (sk->sk_shutdown == SHUTDOWN_MASK)
2436                mask |= POLLHUP;
2437
2438        /* readable? */
2439        if (!skb_queue_empty(&sk->sk_receive_queue))
2440                mask |= POLLIN | POLLRDNORM;
2441
2442        /* Connection-based need to check for termination and startup */
2443        if (sk->sk_type == SOCK_SEQPACKET) {
2444                if (sk->sk_state == TCP_CLOSE)
2445                        mask |= POLLHUP;
2446                /* connection hasn't started yet? */
2447                if (sk->sk_state == TCP_SYN_SENT)
2448                        return mask;
2449        }
2450
2451        /* No write status requested, avoid expensive OUT tests. */
2452        if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2453                return mask;
2454
2455        writable = unix_writable(sk);
2456        other = unix_peer_get(sk);
2457        if (other) {
2458                if (unix_peer(other) != sk) {
2459                        sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2460                        if (unix_recvq_full(other))
2461                                writable = 0;
2462                }
2463                sock_put(other);
2464        }
2465
2466        if (writable)
2467                mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2468        else
2469                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2470
2471        return mask;
2472}
2473
2474#ifdef CONFIG_PROC_FS
2475
/*
 * The seq_file position packs two values into one long: the hash bucket
 * index in the bits above BUCKET_SPACE and the 1-based offset of the
 * socket inside that bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket index encoded in position x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket encoded in position x. */
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
/* Build a position from bucket b and offset o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2481
2482static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2483{
2484        unsigned long offset = get_offset(*pos);
2485        unsigned long bucket = get_bucket(*pos);
2486        struct sock *sk;
2487        unsigned long count = 0;
2488
2489        for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2490                if (sock_net(sk) != seq_file_net(seq))
2491                        continue;
2492                if (++count == offset)
2493                        break;
2494        }
2495
2496        return sk;
2497}
2498
2499static struct sock *unix_next_socket(struct seq_file *seq,
2500                                     struct sock *sk,
2501                                     loff_t *pos)
2502{
2503        unsigned long bucket;
2504
2505        while (sk > (struct sock *)SEQ_START_TOKEN) {
2506                sk = sk_next(sk);
2507                if (!sk)
2508                        goto next_bucket;
2509                if (sock_net(sk) == seq_file_net(seq))
2510                        return sk;
2511        }
2512
2513        do {
2514                sk = unix_from_bucket(seq, pos);
2515                if (sk)
2516                        return sk;
2517
2518next_bucket:
2519                bucket = get_bucket(*pos) + 1;
2520                *pos = set_bucket_offset(bucket, 1);
2521        } while (bucket < ARRAY_SIZE(unix_socket_table));
2522
2523        return NULL;
2524}
2525
2526static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2527        __acquires(unix_table_lock)
2528{
2529        spin_lock(&unix_table_lock);
2530
2531        if (!*pos)
2532                return SEQ_START_TOKEN;
2533
2534        if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2535                return NULL;
2536
2537        return unix_next_socket(seq, NULL, pos);
2538}
2539
2540static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2541{
2542        ++*pos;
2543        return unix_next_socket(seq, v, pos);
2544}
2545
2546static void unix_seq_stop(struct seq_file *seq, void *v)
2547        __releases(unix_table_lock)
2548{
2549        spin_unlock(&unix_table_lock);
2550}
2551
2552static int unix_seq_show(struct seq_file *seq, void *v)
2553{
2554
2555        if (v == SEQ_START_TOKEN)
2556                seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2557                         "Inode Path\n");
2558        else {
2559                struct sock *s = v;
2560                struct unix_sock *u = unix_sk(s);
2561                unix_state_lock(s);
2562
2563                seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2564                        s,
2565                        atomic_read(&s->sk_refcnt),
2566                        0,
2567                        s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2568                        s->sk_type,
2569                        s->sk_socket ?
2570                        (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2571                        (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2572                        sock_i_ino(s));
2573
2574                if (u->addr) {
2575                        int i, len;
2576                        seq_putc(seq, ' ');
2577
2578                        i = 0;
2579                        len = u->addr->len - sizeof(short);
2580                        if (!UNIX_ABSTRACT(s))
2581                                len--;
2582                        else {
2583                                seq_putc(seq, '@');
2584                                i++;
2585                        }
2586                        for ( ; i < len; i++)
2587                                seq_putc(seq, u->addr->name->sun_path[i]);
2588                }
2589                unix_state_unlock(s);
2590                seq_putc(seq, '\n');
2591        }
2592
2593        return 0;
2594}
2595
2596static const struct seq_operations unix_seq_ops = {
2597        .start  = unix_seq_start,
2598        .next   = unix_seq_next,
2599        .stop   = unix_seq_stop,
2600        .show   = unix_seq_show,
2601};
2602
2603static int unix_seq_open(struct inode *inode, struct file *file)
2604{
2605        return seq_open_net(inode, file, &unix_seq_ops,
2606                            sizeof(struct seq_net_private));
2607}
2608
2609static const struct file_operations unix_seq_fops = {
2610        .owner          = THIS_MODULE,
2611        .open           = unix_seq_open,
2612        .read           = seq_read,
2613        .llseek         = seq_lseek,
2614        .release        = seq_release_net,
2615};
2616
2617#endif
2618
2619static const struct net_proto_family unix_family_ops = {
2620        .family = PF_UNIX,
2621        .create = unix_create,
2622        .owner  = THIS_MODULE,
2623};
2624
2625
2626static int __net_init unix_net_init(struct net *net)
2627{
2628        int error = -ENOMEM;
2629
2630        net->unx.sysctl_max_dgram_qlen = 10;
2631        if (unix_sysctl_register(net))
2632                goto out;
2633
2634#ifdef CONFIG_PROC_FS
2635        if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2636                unix_sysctl_unregister(net);
2637                goto out;
2638        }
2639#endif
2640        error = 0;
2641out:
2642        return error;
2643}
2644
2645static void __net_exit unix_net_exit(struct net *net)
2646{
2647        unix_sysctl_unregister(net);
2648        remove_proc_entry("unix", net->proc_net);
2649}
2650
2651static struct pernet_operations unix_net_ops = {
2652        .init = unix_net_init,
2653        .exit = unix_net_exit,
2654};
2655
2656static int __init af_unix_init(void)
2657{
2658        int rc = -1;
2659
2660        BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2661
2662        rc = proto_register(&unix_proto, 1);
2663        if (rc != 0) {
2664                pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2665                goto out;
2666        }
2667
2668        sock_register(&unix_family_ops);
2669        register_pernet_subsys(&unix_net_ops);
2670out:
2671        return rc;
2672}
2673
2674static void __exit af_unix_exit(void)
2675{
2676        sock_unregister(PF_UNIX);
2677        proto_unregister(&unix_proto);
2678        unregister_pernet_subsys(&unix_net_ops);
2679}
2680
2681/* Earlier than device_initcall() so that other drivers invoking
2682   request_module() don't end up in a loop when modprobe tries
2683   to use a UNIX socket. But later than subsys_initcall() because
2684   we depend on stuff initialised there */
2685fs_initcall(af_unix_init);
2686module_exit(af_unix_exit);
2687
2688MODULE_LICENSE("GPL");
2689MODULE_ALIAS_NETPROTO(PF_UNIX);
2690