linux/net/unix/af_unix.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * NET4:        Implementation of BSD Unix domain sockets.
   4 *
   5 * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6 *
   7 * Fixes:
   8 *              Linus Torvalds  :       Assorted bug cures.
   9 *              Niibe Yutaka    :       async I/O support.
  10 *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11 *              Alan Cox        :       Limit size of allocated blocks.
  12 *              Alan Cox        :       Fixed the stupid socketpair bug.
  13 *              Alan Cox        :       BSD compatibility fine tuning.
  14 *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15 *              Alan Cox        :       Sorted out a proper draft version of
  16 *                                      file descriptor passing hacked up from
  17 *                                      Mike Shaver's work.
  18 *              Marty Leisner   :       Fixes to fd passing
  19 *              Nick Nevin      :       recvmsg bugfix.
  20 *              Alan Cox        :       Started proper garbage collector
  21 *              Heiko EiBfeldt  :       Missing verify_area check
  22 *              Alan Cox        :       Started POSIXisms
  23 *              Andreas Schwab  :       Replace inode by dentry for proper
  24 *                                      reference counting
  25 *              Kirk Petersen   :       Made this a module
  26 *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27 *                                      Lots of bug fixes.
  28 *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  29 *                                      by above two patches.
  30 *           Andrea Arcangeli   :       If possible we block in connect(2)
  31 *                                      if the max backlog of the listen socket
  32 *                                      has been reached. This won't break
  33 *                                      old apps and it will avoid a huge number
  34 *                                      of socks hashed (this is for unix_gc()
  35 *                                      performance reasons).
  36 *                                      Security fix that limits the max
  37 *                                      number of socks to 2*max_files and
  38 *                                      the number of skb queueable in the
  39 *                                      dgram receiver.
  40 *              Artur Skawina   :       Hash function optimizations
  41 *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42 *            Malcolm Beattie   :       Set peercred for socketpair
  43 *           Michal Ostrowski   :       Module initialization cleanup.
  44 *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45 *                                      the core infrastructure is doing that
  46 *                                      for all net proto families now (2.5.69+)
  47 *
  48 * Known differences from reference BSD that was tested:
  49 *
  50 *      [TO FIX]
  51 *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52 *              other the moment one end closes.
  53 *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54 *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55 *      [NOT TO FIX]
  56 *      accept() returns a path name even if the connecting socket has closed
  57 *              in the meantime (BSD loses the path and gives up).
  58 *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59 *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60 *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61 *      BSD af_unix apparently has connect forgetting to block properly.
  62 *              (need to check this with the POSIX spec in detail)
  63 *
  64 * Differences from 2.0.0-11-... (ANK)
  65 *      Bug fixes and improvements.
  66 *              - client shutdown killed server socket.
  67 *              - removed all useless cli/sti pairs.
  68 *
  69 *      Semantic changes/extensions.
  70 *              - generic control message passing.
  71 *              - SCM_CREDENTIALS control message.
  72 *              - "Abstract" (not FS based) socket bindings.
  73 *                Abstract names are sequences of bytes (not zero terminated)
  74 *                started by 0, so that this name space does not intersect
  75 *                with BSD names.
  76 */
  77
  78#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80#include <linux/module.h>
  81#include <linux/kernel.h>
  82#include <linux/signal.h>
  83#include <linux/sched/signal.h>
  84#include <linux/errno.h>
  85#include <linux/string.h>
  86#include <linux/stat.h>
  87#include <linux/dcache.h>
  88#include <linux/namei.h>
  89#include <linux/socket.h>
  90#include <linux/un.h>
  91#include <linux/fcntl.h>
  92#include <linux/termios.h>
  93#include <linux/sockios.h>
  94#include <linux/net.h>
  95#include <linux/in.h>
  96#include <linux/fs.h>
  97#include <linux/slab.h>
  98#include <linux/uaccess.h>
  99#include <linux/skbuff.h>
 100#include <linux/netdevice.h>
 101#include <net/net_namespace.h>
 102#include <net/sock.h>
 103#include <net/tcp_states.h>
 104#include <net/af_unix.h>
 105#include <linux/proc_fs.h>
 106#include <linux/seq_file.h>
 107#include <net/scm.h>
 108#include <linux/init.h>
 109#include <linux/poll.h>
 110#include <linux/rtnetlink.h>
 111#include <linux/mount.h>
 112#include <net/checksum.h>
 113#include <linux/security.h>
 114#include <linux/freezer.h>
 115#include <linux/file.h>
 116#include <linux/btf_ids.h>
 117
 118#include "scm.h"
 119
 120struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 121EXPORT_SYMBOL_GPL(unix_socket_table);
 122DEFINE_SPINLOCK(unix_table_lock);
 123EXPORT_SYMBOL_GPL(unix_table_lock);
 124static atomic_long_t unix_nr_socks;
 125
 126
 127static struct hlist_head *unix_sockets_unbound(void *addr)
 128{
 129        unsigned long hash = (unsigned long)addr;
 130
 131        hash ^= hash >> 16;
 132        hash ^= hash >> 8;
 133        hash %= UNIX_HASH_SIZE;
 134        return &unix_socket_table[UNIX_HASH_SIZE + hash];
 135}
 136
 137#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 138
 139#ifdef CONFIG_SECURITY_NETWORK
 140static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 141{
 142        UNIXCB(skb).secid = scm->secid;
 143}
 144
 145static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 146{
 147        scm->secid = UNIXCB(skb).secid;
 148}
 149
 150static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 151{
 152        return (scm->secid == UNIXCB(skb).secid);
 153}
 154#else
 155static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 156{ }
 157
 158static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 159{ }
 160
 161static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 162{
 163        return true;
 164}
 165#endif /* CONFIG_SECURITY_NETWORK */
 166
 167/*
 168 *  SMP locking strategy:
 169 *    hash table is protected with spinlock unix_table_lock
 170 *    each socket state is protected by separate spin lock.
 171 */
 172
 173static inline unsigned int unix_hash_fold(__wsum n)
 174{
 175        unsigned int hash = (__force unsigned int)csum_fold(n);
 176
 177        hash ^= hash>>8;
 178        return hash&(UNIX_HASH_SIZE-1);
 179}
 180
 181#define unix_peer(sk) (unix_sk(sk)->peer)
 182
 183static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 184{
 185        return unix_peer(osk) == sk;
 186}
 187
 188static inline int unix_may_send(struct sock *sk, struct sock *osk)
 189{
 190        return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 191}
 192
 193static inline int unix_recvq_full(const struct sock *sk)
 194{
 195        return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 196}
 197
 198static inline int unix_recvq_full_lockless(const struct sock *sk)
 199{
 200        return skb_queue_len_lockless(&sk->sk_receive_queue) >
 201                READ_ONCE(sk->sk_max_ack_backlog);
 202}
 203
 204struct sock *unix_peer_get(struct sock *s)
 205{
 206        struct sock *peer;
 207
 208        unix_state_lock(s);
 209        peer = unix_peer(s);
 210        if (peer)
 211                sock_hold(peer);
 212        unix_state_unlock(s);
 213        return peer;
 214}
 215EXPORT_SYMBOL_GPL(unix_peer_get);
 216
 217static inline void unix_release_addr(struct unix_address *addr)
 218{
 219        if (refcount_dec_and_test(&addr->refcnt))
 220                kfree(addr);
 221}
 222
 223/*
 224 *      Check unix socket name:
 225 *              - should be not zero length.
 226 *              - if started by not zero, should be NULL terminated (FS object)
 227 *              - if started by zero, it is abstract name.
 228 */
 229
 230static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 231{
 232        *hashp = 0;
 233
 234        if (len <= sizeof(short) || len > sizeof(*sunaddr))
 235                return -EINVAL;
 236        if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 237                return -EINVAL;
 238        if (sunaddr->sun_path[0]) {
 239                /*
 240                 * This may look like an off by one error but it is a bit more
 241                 * subtle. 108 is the longest valid AF_UNIX path for a binding.
 242                 * sun_path[108] doesn't as such exist.  However in kernel space
 243                 * we are guaranteed that it is a valid memory location in our
 244                 * kernel address buffer.
 245                 */
 246                ((char *)sunaddr)[len] = 0;
 247                len = strlen(sunaddr->sun_path)+1+sizeof(short);
 248                return len;
 249        }
 250
 251        *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 252        return len;
 253}
 254
 255static void __unix_remove_socket(struct sock *sk)
 256{
 257        sk_del_node_init(sk);
 258}
 259
 260static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 261{
 262        WARN_ON(!sk_unhashed(sk));
 263        sk_add_node(sk, list);
 264}
 265
 266static void __unix_set_addr(struct sock *sk, struct unix_address *addr,
 267                            unsigned hash)
 268{
 269        __unix_remove_socket(sk);
 270        smp_store_release(&unix_sk(sk)->addr, addr);
 271        __unix_insert_socket(&unix_socket_table[hash], sk);
 272}
 273
 274static inline void unix_remove_socket(struct sock *sk)
 275{
 276        spin_lock(&unix_table_lock);
 277        __unix_remove_socket(sk);
 278        spin_unlock(&unix_table_lock);
 279}
 280
 281static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 282{
 283        spin_lock(&unix_table_lock);
 284        __unix_insert_socket(list, sk);
 285        spin_unlock(&unix_table_lock);
 286}
 287
 288static struct sock *__unix_find_socket_byname(struct net *net,
 289                                              struct sockaddr_un *sunname,
 290                                              int len, unsigned int hash)
 291{
 292        struct sock *s;
 293
 294        sk_for_each(s, &unix_socket_table[hash]) {
 295                struct unix_sock *u = unix_sk(s);
 296
 297                if (!net_eq(sock_net(s), net))
 298                        continue;
 299
 300                if (u->addr->len == len &&
 301                    !memcmp(u->addr->name, sunname, len))
 302                        return s;
 303        }
 304        return NULL;
 305}
 306
 307static inline struct sock *unix_find_socket_byname(struct net *net,
 308                                                   struct sockaddr_un *sunname,
 309                                                   int len, unsigned int hash)
 310{
 311        struct sock *s;
 312
 313        spin_lock(&unix_table_lock);
 314        s = __unix_find_socket_byname(net, sunname, len, hash);
 315        if (s)
 316                sock_hold(s);
 317        spin_unlock(&unix_table_lock);
 318        return s;
 319}
 320
 321static struct sock *unix_find_socket_byinode(struct inode *i)
 322{
 323        struct sock *s;
 324
 325        spin_lock(&unix_table_lock);
 326        sk_for_each(s,
 327                    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 328                struct dentry *dentry = unix_sk(s)->path.dentry;
 329
 330                if (dentry && d_backing_inode(dentry) == i) {
 331                        sock_hold(s);
 332                        goto found;
 333                }
 334        }
 335        s = NULL;
 336found:
 337        spin_unlock(&unix_table_lock);
 338        return s;
 339}
 340
 341/* Support code for asymmetrically connected dgram sockets
 342 *
 343 * If a datagram socket is connected to a socket not itself connected
 344 * to the first socket (eg, /dev/log), clients may only enqueue more
 345 * messages if the present receive queue of the server socket is not
 346 * "too large". This means there's a second writeability condition
 347 * poll and sendmsg need to test. The dgram recv code will do a wake
 348 * up on the peer_wait wait queue of a socket upon reception of a
 349 * datagram which needs to be propagated to sleeping would-be writers
 350 * since these might not have sent anything so far. This can't be
 351 * accomplished via poll_wait because the lifetime of the server
 352 * socket might be less than that of its clients if these break their
 353 * association with it or if the server socket is closed while clients
 354 * are still connected to it and there's no way to inform "a polling
 355 * implementation" that it should let go of a certain wait queue
 356 *
 357 * In order to propagate a wake up, a wait_queue_entry_t of the client
 358 * socket is enqueued on the peer_wait queue of the server socket
 359 * whose wake function does a wake_up on the ordinary client socket
 360 * wait queue. This connection is established whenever a write (or
 361 * poll for write) hit the flow control condition and broken when the
 362 * association to the server socket is dissolved or after a wake up
 363 * was relayed.
 364 */
 365
 366static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 367                                      void *key)
 368{
 369        struct unix_sock *u;
 370        wait_queue_head_t *u_sleep;
 371
 372        u = container_of(q, struct unix_sock, peer_wake);
 373
 374        __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 375                            q);
 376        u->peer_wake.private = NULL;
 377
 378        /* relaying can only happen while the wq still exists */
 379        u_sleep = sk_sleep(&u->sk);
 380        if (u_sleep)
 381                wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 382
 383        return 0;
 384}
 385
 386static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 387{
 388        struct unix_sock *u, *u_other;
 389        int rc;
 390
 391        u = unix_sk(sk);
 392        u_other = unix_sk(other);
 393        rc = 0;
 394        spin_lock(&u_other->peer_wait.lock);
 395
 396        if (!u->peer_wake.private) {
 397                u->peer_wake.private = other;
 398                __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 399
 400                rc = 1;
 401        }
 402
 403        spin_unlock(&u_other->peer_wait.lock);
 404        return rc;
 405}
 406
 407static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 408                                            struct sock *other)
 409{
 410        struct unix_sock *u, *u_other;
 411
 412        u = unix_sk(sk);
 413        u_other = unix_sk(other);
 414        spin_lock(&u_other->peer_wait.lock);
 415
 416        if (u->peer_wake.private == other) {
 417                __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 418                u->peer_wake.private = NULL;
 419        }
 420
 421        spin_unlock(&u_other->peer_wait.lock);
 422}
 423
 424static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 425                                                   struct sock *other)
 426{
 427        unix_dgram_peer_wake_disconnect(sk, other);
 428        wake_up_interruptible_poll(sk_sleep(sk),
 429                                   EPOLLOUT |
 430                                   EPOLLWRNORM |
 431                                   EPOLLWRBAND);
 432}
 433
 434/* preconditions:
 435 *      - unix_peer(sk) == other
 436 *      - association is stable
 437 */
 438static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 439{
 440        int connected;
 441
 442        connected = unix_dgram_peer_wake_connect(sk, other);
 443
 444        /* If other is SOCK_DEAD, we want to make sure we signal
 445         * POLLOUT, such that a subsequent write() can get a
 446         * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 447         * to other and its full, we will hang waiting for POLLOUT.
 448         */
 449        if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
 450                return 1;
 451
 452        if (connected)
 453                unix_dgram_peer_wake_disconnect(sk, other);
 454
 455        return 0;
 456}
 457
 458static int unix_writable(const struct sock *sk)
 459{
 460        return sk->sk_state != TCP_LISTEN &&
 461               (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 462}
 463
 464static void unix_write_space(struct sock *sk)
 465{
 466        struct socket_wq *wq;
 467
 468        rcu_read_lock();
 469        if (unix_writable(sk)) {
 470                wq = rcu_dereference(sk->sk_wq);
 471                if (skwq_has_sleeper(wq))
 472                        wake_up_interruptible_sync_poll(&wq->wait,
 473                                EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 474                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 475        }
 476        rcu_read_unlock();
 477}
 478
 479/* When dgram socket disconnects (or changes its peer), we clear its receive
 480 * queue of packets arrived from previous peer. First, it allows to do
 481 * flow control based only on wmem_alloc; second, sk connected to peer
 482 * may receive messages only from that peer. */
 483static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 484{
 485        if (!skb_queue_empty(&sk->sk_receive_queue)) {
 486                skb_queue_purge(&sk->sk_receive_queue);
 487                wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 488
 489                /* If one link of bidirectional dgram pipe is disconnected,
 490                 * we signal error. Messages are lost. Do not make this,
 491                 * when peer was not connected to us.
 492                 */
 493                if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 494                        other->sk_err = ECONNRESET;
 495                        sk_error_report(other);
 496                }
 497        }
 498        other->sk_state = TCP_CLOSE;
 499}
 500
 501static void unix_sock_destructor(struct sock *sk)
 502{
 503        struct unix_sock *u = unix_sk(sk);
 504
 505        skb_queue_purge(&sk->sk_receive_queue);
 506
 507#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 508        if (u->oob_skb) {
 509                kfree_skb(u->oob_skb);
 510                u->oob_skb = NULL;
 511        }
 512#endif
 513        WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 514        WARN_ON(!sk_unhashed(sk));
 515        WARN_ON(sk->sk_socket);
 516        if (!sock_flag(sk, SOCK_DEAD)) {
 517                pr_info("Attempt to release alive unix socket: %p\n", sk);
 518                return;
 519        }
 520
 521        if (u->addr)
 522                unix_release_addr(u->addr);
 523
 524        atomic_long_dec(&unix_nr_socks);
 525        local_bh_disable();
 526        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 527        local_bh_enable();
 528#ifdef UNIX_REFCNT_DEBUG
 529        pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 530                atomic_long_read(&unix_nr_socks));
 531#endif
 532}
 533
 534static void unix_release_sock(struct sock *sk, int embrion)
 535{
 536        struct unix_sock *u = unix_sk(sk);
 537        struct path path;
 538        struct sock *skpair;
 539        struct sk_buff *skb;
 540        int state;
 541
 542        unix_remove_socket(sk);
 543
 544        /* Clear state */
 545        unix_state_lock(sk);
 546        sock_orphan(sk);
 547        sk->sk_shutdown = SHUTDOWN_MASK;
 548        path         = u->path;
 549        u->path.dentry = NULL;
 550        u->path.mnt = NULL;
 551        state = sk->sk_state;
 552        sk->sk_state = TCP_CLOSE;
 553
 554        skpair = unix_peer(sk);
 555        unix_peer(sk) = NULL;
 556
 557        unix_state_unlock(sk);
 558
 559        wake_up_interruptible_all(&u->peer_wait);
 560
 561        if (skpair != NULL) {
 562                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 563                        unix_state_lock(skpair);
 564                        /* No more writes */
 565                        skpair->sk_shutdown = SHUTDOWN_MASK;
 566                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 567                                skpair->sk_err = ECONNRESET;
 568                        unix_state_unlock(skpair);
 569                        skpair->sk_state_change(skpair);
 570                        sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 571                }
 572
 573                unix_dgram_peer_wake_disconnect(sk, skpair);
 574                sock_put(skpair); /* It may now die */
 575        }
 576
 577        /* Try to flush out this socket. Throw out buffers at least */
 578
 579        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 580                if (state == TCP_LISTEN)
 581                        unix_release_sock(skb->sk, 1);
 582                /* passed fds are erased in the kfree_skb hook        */
 583                UNIXCB(skb).consumed = skb->len;
 584                kfree_skb(skb);
 585        }
 586
 587        if (path.dentry)
 588                path_put(&path);
 589
 590        sock_put(sk);
 591
 592        /* ---- Socket is dead now and most probably destroyed ---- */
 593
 594        /*
 595         * Fixme: BSD difference: In BSD all sockets connected to us get
 596         *        ECONNRESET and we die on the spot. In Linux we behave
 597         *        like files and pipes do and wait for the last
 598         *        dereference.
 599         *
 600         * Can't we simply set sock->err?
 601         *
 602         *        What the above comment does talk about? --ANK(980817)
 603         */
 604
 605        if (unix_tot_inflight)
 606                unix_gc();              /* Garbage collect fds */
 607}
 608
 609static void init_peercred(struct sock *sk)
 610{
 611        const struct cred *old_cred;
 612        struct pid *old_pid;
 613
 614        spin_lock(&sk->sk_peer_lock);
 615        old_pid = sk->sk_peer_pid;
 616        old_cred = sk->sk_peer_cred;
 617        sk->sk_peer_pid  = get_pid(task_tgid(current));
 618        sk->sk_peer_cred = get_current_cred();
 619        spin_unlock(&sk->sk_peer_lock);
 620
 621        put_pid(old_pid);
 622        put_cred(old_cred);
 623}
 624
 625static void copy_peercred(struct sock *sk, struct sock *peersk)
 626{
 627        const struct cred *old_cred;
 628        struct pid *old_pid;
 629
 630        if (sk < peersk) {
 631                spin_lock(&sk->sk_peer_lock);
 632                spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 633        } else {
 634                spin_lock(&peersk->sk_peer_lock);
 635                spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 636        }
 637        old_pid = sk->sk_peer_pid;
 638        old_cred = sk->sk_peer_cred;
 639        sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 640        sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 641
 642        spin_unlock(&sk->sk_peer_lock);
 643        spin_unlock(&peersk->sk_peer_lock);
 644
 645        put_pid(old_pid);
 646        put_cred(old_cred);
 647}
 648
 649static int unix_listen(struct socket *sock, int backlog)
 650{
 651        int err;
 652        struct sock *sk = sock->sk;
 653        struct unix_sock *u = unix_sk(sk);
 654
 655        err = -EOPNOTSUPP;
 656        if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 657                goto out;       /* Only stream/seqpacket sockets accept */
 658        err = -EINVAL;
 659        if (!u->addr)
 660                goto out;       /* No listens on an unbound socket */
 661        unix_state_lock(sk);
 662        if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 663                goto out_unlock;
 664        if (backlog > sk->sk_max_ack_backlog)
 665                wake_up_interruptible_all(&u->peer_wait);
 666        sk->sk_max_ack_backlog  = backlog;
 667        sk->sk_state            = TCP_LISTEN;
 668        /* set credentials so connect can copy them */
 669        init_peercred(sk);
 670        err = 0;
 671
 672out_unlock:
 673        unix_state_unlock(sk);
 674out:
 675        return err;
 676}
 677
 678static int unix_release(struct socket *);
 679static int unix_bind(struct socket *, struct sockaddr *, int);
 680static int unix_stream_connect(struct socket *, struct sockaddr *,
 681                               int addr_len, int flags);
 682static int unix_socketpair(struct socket *, struct socket *);
 683static int unix_accept(struct socket *, struct socket *, int, bool);
 684static int unix_getname(struct socket *, struct sockaddr *, int);
 685static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 686static __poll_t unix_dgram_poll(struct file *, struct socket *,
 687                                    poll_table *);
 688static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 689#ifdef CONFIG_COMPAT
 690static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 691#endif
 692static int unix_shutdown(struct socket *, int);
 693static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 694static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 695static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 696                                    size_t size, int flags);
 697static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 698                                       struct pipe_inode_info *, size_t size,
 699                                       unsigned int flags);
 700static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 701static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 702static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 703                          sk_read_actor_t recv_actor);
 704static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
 705                                 sk_read_actor_t recv_actor);
 706static int unix_dgram_connect(struct socket *, struct sockaddr *,
 707                              int, int);
 708static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 709static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 710                                  int);
 711
 712static int unix_set_peek_off(struct sock *sk, int val)
 713{
 714        struct unix_sock *u = unix_sk(sk);
 715
 716        if (mutex_lock_interruptible(&u->iolock))
 717                return -EINTR;
 718
 719        sk->sk_peek_off = val;
 720        mutex_unlock(&u->iolock);
 721
 722        return 0;
 723}
 724
 725#ifdef CONFIG_PROC_FS
 726static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 727{
 728        struct sock *sk = sock->sk;
 729        struct unix_sock *u;
 730
 731        if (sk) {
 732                u = unix_sk(sock->sk);
 733                seq_printf(m, "scm_fds: %u\n",
 734                           atomic_read(&u->scm_stat.nr_fds));
 735        }
 736}
 737#else
 738#define unix_show_fdinfo NULL
 739#endif
 740
 741static const struct proto_ops unix_stream_ops = {
 742        .family =       PF_UNIX,
 743        .owner =        THIS_MODULE,
 744        .release =      unix_release,
 745        .bind =         unix_bind,
 746        .connect =      unix_stream_connect,
 747        .socketpair =   unix_socketpair,
 748        .accept =       unix_accept,
 749        .getname =      unix_getname,
 750        .poll =         unix_poll,
 751        .ioctl =        unix_ioctl,
 752#ifdef CONFIG_COMPAT
 753        .compat_ioctl = unix_compat_ioctl,
 754#endif
 755        .listen =       unix_listen,
 756        .shutdown =     unix_shutdown,
 757        .sendmsg =      unix_stream_sendmsg,
 758        .recvmsg =      unix_stream_recvmsg,
 759        .read_sock =    unix_stream_read_sock,
 760        .mmap =         sock_no_mmap,
 761        .sendpage =     unix_stream_sendpage,
 762        .splice_read =  unix_stream_splice_read,
 763        .set_peek_off = unix_set_peek_off,
 764        .show_fdinfo =  unix_show_fdinfo,
 765};
 766
 767static const struct proto_ops unix_dgram_ops = {
 768        .family =       PF_UNIX,
 769        .owner =        THIS_MODULE,
 770        .release =      unix_release,
 771        .bind =         unix_bind,
 772        .connect =      unix_dgram_connect,
 773        .socketpair =   unix_socketpair,
 774        .accept =       sock_no_accept,
 775        .getname =      unix_getname,
 776        .poll =         unix_dgram_poll,
 777        .ioctl =        unix_ioctl,
 778#ifdef CONFIG_COMPAT
 779        .compat_ioctl = unix_compat_ioctl,
 780#endif
 781        .listen =       sock_no_listen,
 782        .shutdown =     unix_shutdown,
 783        .sendmsg =      unix_dgram_sendmsg,
 784        .read_sock =    unix_read_sock,
 785        .recvmsg =      unix_dgram_recvmsg,
 786        .mmap =         sock_no_mmap,
 787        .sendpage =     sock_no_sendpage,
 788        .set_peek_off = unix_set_peek_off,
 789        .show_fdinfo =  unix_show_fdinfo,
 790};
 791
 792static const struct proto_ops unix_seqpacket_ops = {
 793        .family =       PF_UNIX,
 794        .owner =        THIS_MODULE,
 795        .release =      unix_release,
 796        .bind =         unix_bind,
 797        .connect =      unix_stream_connect,
 798        .socketpair =   unix_socketpair,
 799        .accept =       unix_accept,
 800        .getname =      unix_getname,
 801        .poll =         unix_dgram_poll,
 802        .ioctl =        unix_ioctl,
 803#ifdef CONFIG_COMPAT
 804        .compat_ioctl = unix_compat_ioctl,
 805#endif
 806        .listen =       unix_listen,
 807        .shutdown =     unix_shutdown,
 808        .sendmsg =      unix_seqpacket_sendmsg,
 809        .recvmsg =      unix_seqpacket_recvmsg,
 810        .mmap =         sock_no_mmap,
 811        .sendpage =     sock_no_sendpage,
 812        .set_peek_off = unix_set_peek_off,
 813        .show_fdinfo =  unix_show_fdinfo,
 814};
 815
 816static void unix_close(struct sock *sk, long timeout)
 817{
 818        /* Nothing to do here, unix socket does not need a ->close().
 819         * This is merely for sockmap.
 820         */
 821}
 822
 823static void unix_unhash(struct sock *sk)
 824{
 825        /* Nothing to do here, unix socket does not need a ->unhash().
 826         * This is merely for sockmap.
 827         */
 828}
 829
 830struct proto unix_dgram_proto = {
 831        .name                   = "UNIX",
 832        .owner                  = THIS_MODULE,
 833        .obj_size               = sizeof(struct unix_sock),
 834        .close                  = unix_close,
 835#ifdef CONFIG_BPF_SYSCALL
 836        .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 837#endif
 838};
 839
 840struct proto unix_stream_proto = {
 841        .name                   = "UNIX-STREAM",
 842        .owner                  = THIS_MODULE,
 843        .obj_size               = sizeof(struct unix_sock),
 844        .close                  = unix_close,
 845        .unhash                 = unix_unhash,
 846#ifdef CONFIG_BPF_SYSCALL
 847        .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 848#endif
 849};
 850
 851static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 852{
 853        struct unix_sock *u;
 854        struct sock *sk;
 855        int err;
 856
 857        atomic_long_inc(&unix_nr_socks);
 858        if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 859                err = -ENFILE;
 860                goto err;
 861        }
 862
 863        if (type == SOCK_STREAM)
 864                sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 865        else /*dgram and  seqpacket */
 866                sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 867
 868        if (!sk) {
 869                err = -ENOMEM;
 870                goto err;
 871        }
 872
 873        sock_init_data(sock, sk);
 874
 875        sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 876        sk->sk_write_space      = unix_write_space;
 877        sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 878        sk->sk_destruct         = unix_sock_destructor;
 879        u         = unix_sk(sk);
 880        u->path.dentry = NULL;
 881        u->path.mnt = NULL;
 882        spin_lock_init(&u->lock);
 883        atomic_long_set(&u->inflight, 0);
 884        INIT_LIST_HEAD(&u->link);
 885        mutex_init(&u->iolock); /* single task reading lock */
 886        mutex_init(&u->bindlock); /* single task binding lock */
 887        init_waitqueue_head(&u->peer_wait);
 888        init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 889        memset(&u->scm_stat, 0, sizeof(struct scm_stat));
 890        unix_insert_socket(unix_sockets_unbound(sk), sk);
 891
 892        local_bh_disable();
 893        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 894        local_bh_enable();
 895
 896        return sk;
 897
 898err:
 899        atomic_long_dec(&unix_nr_socks);
 900        return ERR_PTR(err);
 901}
 902
 903static int unix_create(struct net *net, struct socket *sock, int protocol,
 904                       int kern)
 905{
 906        struct sock *sk;
 907
 908        if (protocol && protocol != PF_UNIX)
 909                return -EPROTONOSUPPORT;
 910
 911        sock->state = SS_UNCONNECTED;
 912
 913        switch (sock->type) {
 914        case SOCK_STREAM:
 915                sock->ops = &unix_stream_ops;
 916                break;
 917                /*
 918                 *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 919                 *      nothing uses it.
 920                 */
 921        case SOCK_RAW:
 922                sock->type = SOCK_DGRAM;
 923                fallthrough;
 924        case SOCK_DGRAM:
 925                sock->ops = &unix_dgram_ops;
 926                break;
 927        case SOCK_SEQPACKET:
 928                sock->ops = &unix_seqpacket_ops;
 929                break;
 930        default:
 931                return -ESOCKTNOSUPPORT;
 932        }
 933
 934        sk = unix_create1(net, sock, kern, sock->type);
 935        if (IS_ERR(sk))
 936                return PTR_ERR(sk);
 937
 938        return 0;
 939}
 940
 941static int unix_release(struct socket *sock)
 942{
 943        struct sock *sk = sock->sk;
 944
 945        if (!sk)
 946                return 0;
 947
 948        sk->sk_prot->close(sk, 0);
 949        unix_release_sock(sk, 0);
 950        sock->sk = NULL;
 951
 952        return 0;
 953}
 954
 955static int unix_autobind(struct socket *sock)
 956{
 957        struct sock *sk = sock->sk;
 958        struct net *net = sock_net(sk);
 959        struct unix_sock *u = unix_sk(sk);
 960        static u32 ordernum = 1;
 961        struct unix_address *addr;
 962        int err;
 963        unsigned int retries = 0;
 964
 965        err = mutex_lock_interruptible(&u->bindlock);
 966        if (err)
 967                return err;
 968
 969        if (u->addr)
 970                goto out;
 971
 972        err = -ENOMEM;
 973        addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 974        if (!addr)
 975                goto out;
 976
 977        addr->name->sun_family = AF_UNIX;
 978        refcount_set(&addr->refcnt, 1);
 979
 980retry:
 981        addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 982        addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 983        addr->hash ^= sk->sk_type;
 984
 985        spin_lock(&unix_table_lock);
 986        ordernum = (ordernum+1)&0xFFFFF;
 987
 988        if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) {
 989                spin_unlock(&unix_table_lock);
 990                /*
 991                 * __unix_find_socket_byname() may take long time if many names
 992                 * are already in use.
 993                 */
 994                cond_resched();
 995                /* Give up if all names seems to be in use. */
 996                if (retries++ == 0xFFFFF) {
 997                        err = -ENOSPC;
 998                        kfree(addr);
 999                        goto out;
1000                }
1001                goto retry;
1002        }
1003
1004        __unix_set_addr(sk, addr, addr->hash);
1005        spin_unlock(&unix_table_lock);
1006        err = 0;
1007
1008out:    mutex_unlock(&u->bindlock);
1009        return err;
1010}
1011
/*
 * Look up the socket bound to @sunname.
 *
 * A name whose first path byte is non-zero is resolved through the
 * filesystem: the caller needs write permission on the path, the inode must
 * be a socket, and the socket found by inode must be of type @type.  Any
 * other name is looked up in the name hash with @type ^ @hash.
 *
 * Returns the socket found (the failure paths drop a reference with
 * sock_put(), so the lookup helpers presumably return it referenced), or
 * NULL with the reason stored in *@error.
 */
static struct sock *unix_find_other(struct net *net,
                                    struct sockaddr_un *sunname, int len,
                                    int type, unsigned int hash, int *error)
{
        struct inode *inode;
        struct path path;
        struct sock *u;
        int err = 0;

        if (!sunname->sun_path[0]) {
                /* Name without a leading path byte: hash lookup only. */
                err = -ECONNREFUSED;
                u = unix_find_socket_byname(net, sunname, len, type ^ hash);
                if (!u)
                        goto fail;

                if (unix_sk(u)->path.dentry)
                        touch_atime(&unix_sk(u)->path);
                return u;
        }

        /* Pathname socket: resolve the path and find the socket by inode. */
        err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
        if (err)
                goto fail;

        inode = d_backing_inode(path.dentry);
        err = path_permission(&path, MAY_WRITE);
        if (err)
                goto put_fail;

        err = -ECONNREFUSED;
        if (!S_ISSOCK(inode->i_mode))
                goto put_fail;

        u = unix_find_socket_byinode(inode);
        if (!u)
                goto put_fail;

        if (u->sk_type != type) {
                /* Wrong socket type: no atime update, just drop everything. */
                path_put(&path);
                sock_put(u);
                err = -EPROTOTYPE;
                goto fail;
        }

        touch_atime(&path);
        path_put(&path);
        return u;

put_fail:
        path_put(&path);
fail:
        *error = err;
        return NULL;
}
1066
1067static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
1068{
1069        struct unix_sock *u = unix_sk(sk);
1070        umode_t mode = S_IFSOCK |
1071               (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1072        struct user_namespace *ns; // barf...
1073        struct path parent;
1074        struct dentry *dentry;
1075        unsigned int hash;
1076        int err;
1077
1078        /*
1079         * Get the parent directory, calculate the hash for last
1080         * component.
1081         */
1082        dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1083        if (IS_ERR(dentry))
1084                return PTR_ERR(dentry);
1085        ns = mnt_user_ns(parent.mnt);
1086
1087        /*
1088         * All right, let's create it.
1089         */
1090        err = security_path_mknod(&parent, dentry, mode, 0);
1091        if (!err)
1092                err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1093        if (err)
1094                goto out;
1095        err = mutex_lock_interruptible(&u->bindlock);
1096        if (err)
1097                goto out_unlink;
1098        if (u->addr)
1099                goto out_unlock;
1100
1101        addr->hash = UNIX_HASH_SIZE;
1102        hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1103        spin_lock(&unix_table_lock);
1104        u->path.mnt = mntget(parent.mnt);
1105        u->path.dentry = dget(dentry);
1106        __unix_set_addr(sk, addr, hash);
1107        spin_unlock(&unix_table_lock);
1108        mutex_unlock(&u->bindlock);
1109        done_path_create(&parent, dentry);
1110        return 0;
1111
1112out_unlock:
1113        mutex_unlock(&u->bindlock);
1114        err = -EINVAL;
1115out_unlink:
1116        /* failed after successful mknod?  unlink what we'd created... */
1117        vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1118out:
1119        done_path_create(&parent, dentry);
1120        return err;
1121}
1122
1123static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
1124{
1125        struct unix_sock *u = unix_sk(sk);
1126        int err;
1127
1128        err = mutex_lock_interruptible(&u->bindlock);
1129        if (err)
1130                return err;
1131
1132        if (u->addr) {
1133                mutex_unlock(&u->bindlock);
1134                return -EINVAL;
1135        }
1136
1137        spin_lock(&unix_table_lock);
1138        if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
1139                                      addr->hash)) {
1140                spin_unlock(&unix_table_lock);
1141                mutex_unlock(&u->bindlock);
1142                return -EADDRINUSE;
1143        }
1144        __unix_set_addr(sk, addr, addr->hash);
1145        spin_unlock(&unix_table_lock);
1146        mutex_unlock(&u->bindlock);
1147        return 0;
1148}
1149
1150static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1151{
1152        struct sock *sk = sock->sk;
1153        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1154        char *sun_path = sunaddr->sun_path;
1155        int err;
1156        unsigned int hash;
1157        struct unix_address *addr;
1158
1159        if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1160            sunaddr->sun_family != AF_UNIX)
1161                return -EINVAL;
1162
1163        if (addr_len == sizeof(short))
1164                return unix_autobind(sock);
1165
1166        err = unix_mkname(sunaddr, addr_len, &hash);
1167        if (err < 0)
1168                return err;
1169        addr_len = err;
1170        addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1171        if (!addr)
1172                return -ENOMEM;
1173
1174        memcpy(addr->name, sunaddr, addr_len);
1175        addr->len = addr_len;
1176        addr->hash = hash ^ sk->sk_type;
1177        refcount_set(&addr->refcnt, 1);
1178
1179        if (sun_path[0])
1180                err = unix_bind_bsd(sk, addr);
1181        else
1182                err = unix_bind_abstract(sk, addr);
1183        if (err)
1184                unix_release_addr(addr);
1185        return err == -EEXIST ? -EADDRINUSE : err;
1186}
1187
/*
 * Take the state lock of @sk1 and, when it is a distinct non-NULL socket,
 * of @sk2 as well.  Two different sockets are always locked in ascending
 * address order, the second one with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
        struct sock *first, *second;

        if (unlikely(sk1 == sk2) || !sk2) {
                unix_state_lock(sk1);
                return;
        }

        if (sk1 < sk2) {
                first = sk1;
                second = sk2;
        } else {
                first = sk2;
                second = sk1;
        }

        unix_state_lock(first);
        unix_state_lock_nested(second);
}
1202
/*
 * Counterpart of unix_state_double_lock(): release the state lock of @sk1
 * and, when it is a distinct non-NULL socket, of @sk2 as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        unix_state_unlock(sk1);

        if (sk2 && sk2 != sk1)
                unix_state_unlock(sk2);
}
1212
1213static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1214                              int alen, int flags)
1215{
1216        struct sock *sk = sock->sk;
1217        struct net *net = sock_net(sk);
1218        struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1219        struct sock *other;
1220        unsigned int hash;
1221        int err;
1222
1223        err = -EINVAL;
1224        if (alen < offsetofend(struct sockaddr, sa_family))
1225                goto out;
1226
1227        if (addr->sa_family != AF_UNSPEC) {
1228                err = unix_mkname(sunaddr, alen, &hash);
1229                if (err < 0)
1230                        goto out;
1231                alen = err;
1232
1233                if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1234                    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1235                        goto out;
1236
1237restart:
1238                other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1239                if (!other)
1240                        goto out;
1241
1242                unix_state_double_lock(sk, other);
1243
1244                /* Apparently VFS overslept socket death. Retry. */
1245                if (sock_flag(other, SOCK_DEAD)) {
1246                        unix_state_double_unlock(sk, other);
1247                        sock_put(other);
1248                        goto restart;
1249                }
1250
1251                err = -EPERM;
1252                if (!unix_may_send(sk, other))
1253                        goto out_unlock;
1254
1255                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1256                if (err)
1257                        goto out_unlock;
1258
1259                sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1260        } else {
1261                /*
1262                 *      1003.1g breaking connected state with AF_UNSPEC
1263                 */
1264                other = NULL;
1265                unix_state_double_lock(sk, other);
1266        }
1267
1268        /*
1269         * If it was connected, reconnect.
1270         */
1271        if (unix_peer(sk)) {
1272                struct sock *old_peer = unix_peer(sk);
1273
1274                unix_peer(sk) = other;
1275                if (!other)
1276                        sk->sk_state = TCP_CLOSE;
1277                unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1278
1279                unix_state_double_unlock(sk, other);
1280
1281                if (other != old_peer)
1282                        unix_dgram_disconnected(sk, old_peer);
1283                sock_put(old_peer);
1284        } else {
1285                unix_peer(sk) = other;
1286                unix_state_double_unlock(sk, other);
1287        }
1288
1289        return 0;
1290
1291out_unlock:
1292        unix_state_double_unlock(sk, other);
1293        sock_put(other);
1294out:
1295        return err;
1296}
1297
1298static long unix_wait_for_peer(struct sock *other, long timeo)
1299        __releases(&unix_sk(other)->lock)
1300{
1301        struct unix_sock *u = unix_sk(other);
1302        int sched;
1303        DEFINE_WAIT(wait);
1304
1305        prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1306
1307        sched = !sock_flag(other, SOCK_DEAD) &&
1308                !(other->sk_shutdown & RCV_SHUTDOWN) &&
1309                unix_recvq_full(other);
1310
1311        unix_state_unlock(other);
1312
1313        if (sched)
1314                timeo = schedule_timeout(timeo);
1315
1316        finish_wait(&u->peer_wait, &wait);
1317        return timeo;
1318}
1319
/*
 * ->connect() for stream and seqpacket sockets.
 *
 * Outline:
 *  1. Normalise the address with unix_mkname() and autobind the caller if
 *     SOCK_PASSCRED is set and it has no address yet.
 *  2. Pre-allocate everything that may fail: the sock that becomes the
 *     server-side end of the connection (newsk) and a one-byte skb charged
 *     to it, which is later queued on the listener.
 *  3. Find the listener, lock it and check that it is alive, listening, not
 *     shut down for receive and has room in its queue (otherwise wait or
 *     fail with -EAGAIN, depending on the send timeout).
 *  4. Lock our own state nested inside the listener's lock, re-check it,
 *     run the security hook, then wire sk <-> newsk together and queue the
 *     skb on the listener's receive queue.
 *
 * Returns 0 on success or a negative errno: -EISCONN / -EINVAL for a socket
 * that is not in TCP_CLOSE, -ECONNREFUSED for a listener that cannot accept,
 * -EAGAIN for a full queue with no timeout, -ENOMEM, or whatever the helper
 * functions report.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct sock *newsk = NULL;
        struct sock *other = NULL;
        struct sk_buff *skb = NULL;
        unsigned int hash;
        int st;
        int err;
        long timeo;

        /* On success unix_mkname() returns the address length to use. */
        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        addr_len = err;

        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
            (err = unix_autobind(sock)) != 0)
                goto out;

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
           If we will make it after state is locked,
           we will have to recheck all again in any case.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(sock_net(sk), NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                /* Keep the common exit path from releasing an error pointer. */
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
        if (!other)
                goto out;

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* Drops other's state lock, see unix_wait_for_peer(). */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.

           It is tricky place. We need to grab our state lock and cannot
           drop lock on peer. It is dangerous because deadlock is
           possible. Connect to self case and simultaneous
           attempt to connect are eliminated by checking socket
           state. other is TCP_LISTEN, if sk is TCP_LISTEN we
           check this before attempt to grab lock.

           Well, and we have to recheck the state after socket locked.
         */
        st = sk->sk_state;

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk);

        /* Our state changed between the unlocked read and the lock: start over. */
        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Quickly set all the necessary fields... */

        /* newsk's peer pointer holds a reference on sk. */
        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under unix_table_lock.  Insertion
         * into the hash chain we'd found it in had been done
         * in an earlier critical area protected by unix_table_lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        sk->sk_state    = TCP_ESTABLISHED;
        /* sk's peer pointer holds a reference on newsk. */
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        /* Common cleanup: skb and newsk may still be NULL at this point. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1511
1512static int unix_socketpair(struct socket *socka, struct socket *sockb)
1513{
1514        struct sock *ska = socka->sk, *skb = sockb->sk;
1515
1516        /* Join our sockets back to back */
1517        sock_hold(ska);
1518        sock_hold(skb);
1519        unix_peer(ska) = skb;
1520        unix_peer(skb) = ska;
1521        init_peercred(ska);
1522        init_peercred(skb);
1523
1524        ska->sk_state = TCP_ESTABLISHED;
1525        skb->sk_state = TCP_ESTABLISHED;
1526        socka->state  = SS_CONNECTED;
1527        sockb->state  = SS_CONNECTED;
1528        return 0;
1529}
1530
1531static void unix_sock_inherit_flags(const struct socket *old,
1532                                    struct socket *new)
1533{
1534        if (test_bit(SOCK_PASSCRED, &old->flags))
1535                set_bit(SOCK_PASSCRED, &new->flags);
1536        if (test_bit(SOCK_PASSSEC, &old->flags))
1537                set_bit(SOCK_PASSSEC, &new->flags);
1538}
1539
1540static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1541                       bool kern)
1542{
1543        struct sock *sk = sock->sk;
1544        struct sock *tsk;
1545        struct sk_buff *skb;
1546        int err;
1547
1548        err = -EOPNOTSUPP;
1549        if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1550                goto out;
1551
1552        err = -EINVAL;
1553        if (sk->sk_state != TCP_LISTEN)
1554                goto out;
1555
1556        /* If socket state is TCP_LISTEN it cannot change (for now...),
1557         * so that no locks are necessary.
1558         */
1559
1560        skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1561        if (!skb) {
1562                /* This means receive shutdown. */
1563                if (err == 0)
1564                        err = -EINVAL;
1565                goto out;
1566        }
1567
1568        tsk = skb->sk;
1569        skb_free_datagram(sk, skb);
1570        wake_up_interruptible(&unix_sk(sk)->peer_wait);
1571
1572        /* attach accepted sock to socket */
1573        unix_state_lock(tsk);
1574        newsock->state = SS_CONNECTED;
1575        unix_sock_inherit_flags(sock, newsock);
1576        sock_graft(tsk, newsock);
1577        unix_state_unlock(tsk);
1578        return 0;
1579
1580out:
1581        return err;
1582}
1583
1584
1585static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1586{
1587        struct sock *sk = sock->sk;
1588        struct unix_address *addr;
1589        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1590        int err = 0;
1591
1592        if (peer) {
1593                sk = unix_peer_get(sk);
1594
1595                err = -ENOTCONN;
1596                if (!sk)
1597                        goto out;
1598                err = 0;
1599        } else {
1600                sock_hold(sk);
1601        }
1602
1603        addr = smp_load_acquire(&unix_sk(sk)->addr);
1604        if (!addr) {
1605                sunaddr->sun_family = AF_UNIX;
1606                sunaddr->sun_path[0] = 0;
1607                err = sizeof(short);
1608        } else {
1609                err = addr->len;
1610                memcpy(sunaddr, addr->name, addr->len);
1611        }
1612        sock_put(sk);
1613out:
1614        return err;
1615}
1616
/*
 * Give a MSG_PEEK receiver its own copy of the file list attached to @skb:
 * the list is duplicated into scm->fp, then unix_gc_lock is taken and
 * released once purely as a barrier against the garbage collector (see the
 * explanation in the body).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->fp = scm_fp_dup(UNIXCB(skb).fp);

        /*
         * Garbage collection of unix sockets starts by selecting a set of
         * candidate sockets which have reference only from being in flight
         * (total_refs == inflight_refs).  This condition is checked once during
         * the candidate collection phase, and candidates are marked as such, so
         * that non-candidates can later be ignored.  While inflight_refs is
         * protected by unix_gc_lock, total_refs (file count) is not, hence this
         * is an instantaneous decision.
         *
         * Once a candidate, however, the socket must not be reinstalled into a
         * file descriptor while the garbage collection is in progress.
         *
         * If the above conditions are met, then the directed graph of
         * candidates (*) does not change while unix_gc_lock is held.
         *
         * Any operation that changes the file count through file descriptors
         * (dup, close, sendmsg) does not change the graph since candidates are
         * not installed in fds.
         *
         * Dequeuing a candidate via recvmsg would install it into an fd, but
         * that takes unix_gc_lock to decrement the inflight count, so it's
         * serialized with garbage collection.
         *
         * MSG_PEEK is special in that it does not change the inflight count,
         * yet does install the socket into an fd.  The following lock/unlock
         * pair is to ensure serialization with garbage collection.  It must be
         * done between incrementing the file count and installing the file into
         * an fd.
         *
         * If garbage collection starts after the barrier provided by the
         * lock/unlock, then it will see the elevated refcount and not mark this
         * as a candidate.  If a garbage collection is already in progress
         * before the file count was incremented, then the lock/unlock pair will
         * ensure that garbage collection is finished before progressing to
         * installing the fd.
         *
         * (*) A -> B where B is on the queue of A or B is on the queue of C
         * which is on the queue of listening socket A.
         */
        spin_lock(&unix_gc_lock);
        spin_unlock(&unix_gc_lock);
}
1663
1664static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1665{
1666        int err = 0;
1667
1668        UNIXCB(skb).pid  = get_pid(scm->pid);
1669        UNIXCB(skb).uid = scm->creds.uid;
1670        UNIXCB(skb).gid = scm->creds.gid;
1671        UNIXCB(skb).fp = NULL;
1672        unix_get_secdata(scm, skb);
1673        if (scm->fp && send_fds)
1674                err = unix_attach_fds(scm, skb);
1675
1676        skb->destructor = unix_destruct_scm;
1677        return err;
1678}
1679
1680static bool unix_passcred_enabled(const struct socket *sock,
1681                                  const struct sock *other)
1682{
1683        return test_bit(SOCK_PASSCRED, &sock->flags) ||
1684               !other->sk_socket ||
1685               test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1686}
1687
1688/*
1689 * Some apps rely on write() giving SCM_CREDENTIALS
1690 * We include credentials if source or destination socket
1691 * asserted SOCK_PASSCRED.
1692 */
1693static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1694                            const struct sock *other)
1695{
1696        if (UNIXCB(skb).pid)
1697                return;
1698        if (unix_passcred_enabled(sock, other)) {
1699                UNIXCB(skb).pid  = get_pid(task_tgid(current));
1700                current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1701        }
1702}
1703
1704static int maybe_init_creds(struct scm_cookie *scm,
1705                            struct socket *socket,
1706                            const struct sock *other)
1707{
1708        int err;
1709        struct msghdr msg = { .msg_controllen = 0 };
1710
1711        err = scm_send(socket, &msg, scm, false);
1712        if (err)
1713                return err;
1714
1715        if (unix_passcred_enabled(socket, other)) {
1716                scm->pid = get_pid(task_tgid(current));
1717                current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1718        }
1719        return err;
1720}
1721
1722static bool unix_skb_scm_eq(struct sk_buff *skb,
1723                            struct scm_cookie *scm)
1724{
1725        const struct unix_skb_parms *u = &UNIXCB(skb);
1726
1727        return u->pid == scm->pid &&
1728               uid_eq(u->uid, scm->creds.uid) &&
1729               gid_eq(u->gid, scm->creds.gid) &&
1730               unix_secdata_eq(scm, skb);
1731}
1732
1733static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1734{
1735        struct scm_fp_list *fp = UNIXCB(skb).fp;
1736        struct unix_sock *u = unix_sk(sk);
1737
1738        if (unlikely(fp && fp->count))
1739                atomic_add(fp->count, &u->scm_stat.nr_fds);
1740}
1741
1742static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1743{
1744        struct scm_fp_list *fp = UNIXCB(skb).fp;
1745        struct unix_sock *u = unix_sk(sk);
1746
1747        if (unlikely(fp && fp->count))
1748                atomic_sub(fp->count, &u->scm_stat.nr_fds);
1749}
1750
1751/*
1752 *      Send AF_UNIX data.
1753 */
1754
1755static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1756                              size_t len)
1757{
1758        struct sock *sk = sock->sk;
1759        struct net *net = sock_net(sk);
1760        struct unix_sock *u = unix_sk(sk);
1761        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1762        struct sock *other = NULL;
1763        int namelen = 0; /* fake GCC */
1764        int err;
1765        unsigned int hash;
1766        struct sk_buff *skb;
1767        long timeo;
1768        struct scm_cookie scm;
1769        int data_len = 0;
1770        int sk_locked;
1771
1772        wait_for_unix_gc();
1773        err = scm_send(sock, msg, &scm, false);
1774        if (err < 0)
1775                return err;
1776
1777        err = -EOPNOTSUPP;
1778        if (msg->msg_flags&MSG_OOB)
1779                goto out;
1780
1781        if (msg->msg_namelen) {
1782                err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1783                if (err < 0)
1784                        goto out;
1785                namelen = err;
1786        } else {
1787                sunaddr = NULL;
1788                err = -ENOTCONN;
1789                other = unix_peer_get(sk);
1790                if (!other)
1791                        goto out;
1792        }
1793
1794        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1795            && (err = unix_autobind(sock)) != 0)
1796                goto out;
1797
1798        err = -EMSGSIZE;
1799        if (len > sk->sk_sndbuf - 32)
1800                goto out;
1801
1802        if (len > SKB_MAX_ALLOC) {
1803                data_len = min_t(size_t,
1804                                 len - SKB_MAX_ALLOC,
1805                                 MAX_SKB_FRAGS * PAGE_SIZE);
1806                data_len = PAGE_ALIGN(data_len);
1807
1808                BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1809        }
1810
1811        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1812                                   msg->msg_flags & MSG_DONTWAIT, &err,
1813                                   PAGE_ALLOC_COSTLY_ORDER);
1814        if (skb == NULL)
1815                goto out;
1816
1817        err = unix_scm_to_skb(&scm, skb, true);
1818        if (err < 0)
1819                goto out_free;
1820
1821        skb_put(skb, len - data_len);
1822        skb->data_len = data_len;
1823        skb->len = len;
1824        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1825        if (err)
1826                goto out_free;
1827
1828        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1829
1830restart:
1831        if (!other) {
1832                err = -ECONNRESET;
1833                if (sunaddr == NULL)
1834                        goto out_free;
1835
1836                other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1837                                        hash, &err);
1838                if (other == NULL)
1839                        goto out_free;
1840        }
1841
1842        if (sk_filter(other, skb) < 0) {
1843                /* Toss the packet but do not return any error to the sender */
1844                err = len;
1845                goto out_free;
1846        }
1847
1848        sk_locked = 0;
1849        unix_state_lock(other);
1850restart_locked:
1851        err = -EPERM;
1852        if (!unix_may_send(sk, other))
1853                goto out_unlock;
1854
1855        if (unlikely(sock_flag(other, SOCK_DEAD))) {
1856                /*
1857                 *      Check with 1003.1g - what should
1858                 *      datagram error
1859                 */
1860                unix_state_unlock(other);
1861                sock_put(other);
1862
1863                if (!sk_locked)
1864                        unix_state_lock(sk);
1865
1866                err = 0;
1867                if (unix_peer(sk) == other) {
1868                        unix_peer(sk) = NULL;
1869                        unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1870
1871                        unix_state_unlock(sk);
1872
1873                        sk->sk_state = TCP_CLOSE;
1874                        unix_dgram_disconnected(sk, other);
1875                        sock_put(other);
1876                        err = -ECONNREFUSED;
1877                } else {
1878                        unix_state_unlock(sk);
1879                }
1880
1881                other = NULL;
1882                if (err)
1883                        goto out_free;
1884                goto restart;
1885        }
1886
1887        err = -EPIPE;
1888        if (other->sk_shutdown & RCV_SHUTDOWN)
1889                goto out_unlock;
1890
1891        if (sk->sk_type != SOCK_SEQPACKET) {
1892                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1893                if (err)
1894                        goto out_unlock;
1895        }
1896
1897        /* other == sk && unix_peer(other) != sk if
1898         * - unix_peer(sk) == NULL, destination address bound to sk
1899         * - unix_peer(sk) == sk by time of get but disconnected before lock
1900         */
1901        if (other != sk &&
1902            unlikely(unix_peer(other) != sk &&
1903            unix_recvq_full_lockless(other))) {
1904                if (timeo) {
1905                        timeo = unix_wait_for_peer(other, timeo);
1906
1907                        err = sock_intr_errno(timeo);
1908                        if (signal_pending(current))
1909                                goto out_free;
1910
1911                        goto restart;
1912                }
1913
1914                if (!sk_locked) {
1915                        unix_state_unlock(other);
1916                        unix_state_double_lock(sk, other);
1917                }
1918
1919                if (unix_peer(sk) != other ||
1920                    unix_dgram_peer_wake_me(sk, other)) {
1921                        err = -EAGAIN;
1922                        sk_locked = 1;
1923                        goto out_unlock;
1924                }
1925
1926                if (!sk_locked) {
1927                        sk_locked = 1;
1928                        goto restart_locked;
1929                }
1930        }
1931
1932        if (unlikely(sk_locked))
1933                unix_state_unlock(sk);
1934
1935        if (sock_flag(other, SOCK_RCVTSTAMP))
1936                __net_timestamp(skb);
1937        maybe_add_creds(skb, sock, other);
1938        scm_stat_add(other, skb);
1939        skb_queue_tail(&other->sk_receive_queue, skb);
1940        unix_state_unlock(other);
1941        other->sk_data_ready(other);
1942        sock_put(other);
1943        scm_destroy(&scm);
1944        return len;
1945
1946out_unlock:
1947        if (sk_locked)
1948                unix_state_unlock(sk);
1949        unix_state_unlock(other);
1950out_free:
1951        kfree_skb(skb);
1952out:
1953        if (other)
1954                sock_put(other);
1955        scm_destroy(&scm);
1956        return err;
1957}
1958
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * The value is PAGE_SIZE shifted left by the page order that
 * get_order() reports for 32768 bytes.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1963
1964#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
1965static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
1966{
1967        struct unix_sock *ousk = unix_sk(other);
1968        struct sk_buff *skb;
1969        int err = 0;
1970
1971        skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
1972
1973        if (!skb)
1974                return err;
1975
1976        skb_put(skb, 1);
1977        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
1978
1979        if (err) {
1980                kfree_skb(skb);
1981                return err;
1982        }
1983
1984        unix_state_lock(other);
1985
1986        if (sock_flag(other, SOCK_DEAD) ||
1987            (other->sk_shutdown & RCV_SHUTDOWN)) {
1988                unix_state_unlock(other);
1989                kfree_skb(skb);
1990                return -EPIPE;
1991        }
1992
1993        maybe_add_creds(skb, sock, other);
1994        skb_get(skb);
1995
1996        if (ousk->oob_skb)
1997                consume_skb(ousk->oob_skb);
1998
1999        ousk->oob_skb = skb;
2000
2001        scm_stat_add(other, skb);
2002        skb_queue_tail(&other->sk_receive_queue, skb);
2003        sk_send_sigurg(other);
2004        unix_state_unlock(other);
2005        other->sk_data_ready(other);
2006
2007        return err;
2008}
2009#endif
2010
2011static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2012                               size_t len)
2013{
2014        struct sock *sk = sock->sk;
2015        struct sock *other = NULL;
2016        int err, size;
2017        struct sk_buff *skb;
2018        int sent = 0;
2019        struct scm_cookie scm;
2020        bool fds_sent = false;
2021        int data_len;
2022
2023        wait_for_unix_gc();
2024        err = scm_send(sock, msg, &scm, false);
2025        if (err < 0)
2026                return err;
2027
2028        err = -EOPNOTSUPP;
2029        if (msg->msg_flags & MSG_OOB) {
2030#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
2031                if (len)
2032                        len--;
2033                else
2034#endif
2035                        goto out_err;
2036        }
2037
2038        if (msg->msg_namelen) {
2039                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2040                goto out_err;
2041        } else {
2042                err = -ENOTCONN;
2043                other = unix_peer(sk);
2044                if (!other)
2045                        goto out_err;
2046        }
2047
2048        if (sk->sk_shutdown & SEND_SHUTDOWN)
2049                goto pipe_err;
2050
2051        while (sent < len) {
2052                size = len - sent;
2053
2054                /* Keep two messages in the pipe so it schedules better */
2055                size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2056
2057                /* allow fallback to order-0 allocations */
2058                size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2059
2060                data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2061
2062                data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2063
2064                skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2065                                           msg->msg_flags & MSG_DONTWAIT, &err,
2066                                           get_order(UNIX_SKB_FRAGS_SZ));
2067                if (!skb)
2068                        goto out_err;
2069
2070                /* Only send the fds in the first buffer */
2071                err = unix_scm_to_skb(&scm, skb, !fds_sent);
2072                if (err < 0) {
2073                        kfree_skb(skb);
2074                        goto out_err;
2075                }
2076                fds_sent = true;
2077
2078                skb_put(skb, size - data_len);
2079                skb->data_len = data_len;
2080                skb->len = size;
2081                err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2082                if (err) {
2083                        kfree_skb(skb);
2084                        goto out_err;
2085                }
2086
2087                unix_state_lock(other);
2088
2089                if (sock_flag(other, SOCK_DEAD) ||
2090                    (other->sk_shutdown & RCV_SHUTDOWN))
2091                        goto pipe_err_free;
2092
2093                maybe_add_creds(skb, sock, other);
2094                scm_stat_add(other, skb);
2095                skb_queue_tail(&other->sk_receive_queue, skb);
2096                unix_state_unlock(other);
2097                other->sk_data_ready(other);
2098                sent += size;
2099        }
2100
2101#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
2102        if (msg->msg_flags & MSG_OOB) {
2103                err = queue_oob(sock, msg, other);
2104                if (err)
2105                        goto out_err;
2106                sent++;
2107        }
2108#endif
2109
2110        scm_destroy(&scm);
2111
2112        return sent;
2113
2114pipe_err_free:
2115        unix_state_unlock(other);
2116        kfree_skb(skb);
2117pipe_err:
2118        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2119                send_sig(SIGPIPE, current, 0);
2120        err = -EPIPE;
2121out_err:
2122        scm_destroy(&scm);
2123        return sent ? : err;
2124}
2125
2126static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2127                                    int offset, size_t size, int flags)
2128{
2129        int err;
2130        bool send_sigpipe = false;
2131        bool init_scm = true;
2132        struct scm_cookie scm;
2133        struct sock *other, *sk = socket->sk;
2134        struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2135
2136        if (flags & MSG_OOB)
2137                return -EOPNOTSUPP;
2138
2139        other = unix_peer(sk);
2140        if (!other || sk->sk_state != TCP_ESTABLISHED)
2141                return -ENOTCONN;
2142
2143        if (false) {
2144alloc_skb:
2145                unix_state_unlock(other);
2146                mutex_unlock(&unix_sk(other)->iolock);
2147                newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2148                                              &err, 0);
2149                if (!newskb)
2150                        goto err;
2151        }
2152
2153        /* we must acquire iolock as we modify already present
2154         * skbs in the sk_receive_queue and mess with skb->len
2155         */
2156        err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2157        if (err) {
2158                err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2159                goto err;
2160        }
2161
2162        if (sk->sk_shutdown & SEND_SHUTDOWN) {
2163                err = -EPIPE;
2164                send_sigpipe = true;
2165                goto err_unlock;
2166        }
2167
2168        unix_state_lock(other);
2169
2170        if (sock_flag(other, SOCK_DEAD) ||
2171            other->sk_shutdown & RCV_SHUTDOWN) {
2172                err = -EPIPE;
2173                send_sigpipe = true;
2174                goto err_state_unlock;
2175        }
2176
2177        if (init_scm) {
2178                err = maybe_init_creds(&scm, socket, other);
2179                if (err)
2180                        goto err_state_unlock;
2181                init_scm = false;
2182        }
2183
2184        skb = skb_peek_tail(&other->sk_receive_queue);
2185        if (tail && tail == skb) {
2186                skb = newskb;
2187        } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2188                if (newskb) {
2189                        skb = newskb;
2190                } else {
2191                        tail = skb;
2192                        goto alloc_skb;
2193                }
2194        } else if (newskb) {
2195                /* this is fast path, we don't necessarily need to
2196                 * call to kfree_skb even though with newskb == NULL
2197                 * this - does no harm
2198                 */
2199                consume_skb(newskb);
2200                newskb = NULL;
2201        }
2202
2203        if (skb_append_pagefrags(skb, page, offset, size)) {
2204                tail = skb;
2205                goto alloc_skb;
2206        }
2207
2208        skb->len += size;
2209        skb->data_len += size;
2210        skb->truesize += size;
2211        refcount_add(size, &sk->sk_wmem_alloc);
2212
2213        if (newskb) {
2214                err = unix_scm_to_skb(&scm, skb, false);
2215                if (err)
2216                        goto err_state_unlock;
2217                spin_lock(&other->sk_receive_queue.lock);
2218                __skb_queue_tail(&other->sk_receive_queue, newskb);
2219                spin_unlock(&other->sk_receive_queue.lock);
2220        }
2221
2222        unix_state_unlock(other);
2223        mutex_unlock(&unix_sk(other)->iolock);
2224
2225        other->sk_data_ready(other);
2226        scm_destroy(&scm);
2227        return size;
2228
2229err_state_unlock:
2230        unix_state_unlock(other);
2231err_unlock:
2232        mutex_unlock(&unix_sk(other)->iolock);
2233err:
2234        kfree_skb(newskb);
2235        if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2236                send_sig(SIGPIPE, current, 0);
2237        if (!init_scm)
2238                scm_destroy(&scm);
2239        return err;
2240}
2241
2242static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2243                                  size_t len)
2244{
2245        int err;
2246        struct sock *sk = sock->sk;
2247
2248        err = sock_error(sk);
2249        if (err)
2250                return err;
2251
2252        if (sk->sk_state != TCP_ESTABLISHED)
2253                return -ENOTCONN;
2254
2255        if (msg->msg_namelen)
2256                msg->msg_namelen = 0;
2257
2258        return unix_dgram_sendmsg(sock, msg, len);
2259}
2260
2261static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2262                                  size_t size, int flags)
2263{
2264        struct sock *sk = sock->sk;
2265
2266        if (sk->sk_state != TCP_ESTABLISHED)
2267                return -ENOTCONN;
2268
2269        return unix_dgram_recvmsg(sock, msg, size, flags);
2270}
2271
2272static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2273{
2274        struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2275
2276        if (addr) {
2277                msg->msg_namelen = addr->len;
2278                memcpy(msg->msg_name, addr->name, addr->len);
2279        }
2280}
2281
2282int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2283                         int flags)
2284{
2285        struct scm_cookie scm;
2286        struct socket *sock = sk->sk_socket;
2287        struct unix_sock *u = unix_sk(sk);
2288        struct sk_buff *skb, *last;
2289        long timeo;
2290        int skip;
2291        int err;
2292
2293        err = -EOPNOTSUPP;
2294        if (flags&MSG_OOB)
2295                goto out;
2296
2297        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2298
2299        do {
2300                mutex_lock(&u->iolock);
2301
2302                skip = sk_peek_offset(sk, flags);
2303                skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2304                                              &skip, &err, &last);
2305                if (skb) {
2306                        if (!(flags & MSG_PEEK))
2307                                scm_stat_del(sk, skb);
2308                        break;
2309                }
2310
2311                mutex_unlock(&u->iolock);
2312
2313                if (err != -EAGAIN)
2314                        break;
2315        } while (timeo &&
2316                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2317                                              &err, &timeo, last));
2318
2319        if (!skb) { /* implies iolock unlocked */
2320                unix_state_lock(sk);
2321                /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2322                if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2323                    (sk->sk_shutdown & RCV_SHUTDOWN))
2324                        err = 0;
2325                unix_state_unlock(sk);
2326                goto out;
2327        }
2328
2329        if (wq_has_sleeper(&u->peer_wait))
2330                wake_up_interruptible_sync_poll(&u->peer_wait,
2331                                                EPOLLOUT | EPOLLWRNORM |
2332                                                EPOLLWRBAND);
2333
2334        if (msg->msg_name)
2335                unix_copy_addr(msg, skb->sk);
2336
2337        if (size > skb->len - skip)
2338                size = skb->len - skip;
2339        else if (size < skb->len - skip)
2340                msg->msg_flags |= MSG_TRUNC;
2341
2342        err = skb_copy_datagram_msg(skb, skip, msg, size);
2343        if (err)
2344                goto out_free;
2345
2346        if (sock_flag(sk, SOCK_RCVTSTAMP))
2347                __sock_recv_timestamp(msg, sk, skb);
2348
2349        memset(&scm, 0, sizeof(scm));
2350
2351        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2352        unix_set_secdata(&scm, skb);
2353
2354        if (!(flags & MSG_PEEK)) {
2355                if (UNIXCB(skb).fp)
2356                        unix_detach_fds(&scm, skb);
2357
2358                sk_peek_offset_bwd(sk, skb->len);
2359        } else {
2360                /* It is questionable: on PEEK we could:
2361                   - do not return fds - good, but too simple 8)
2362                   - return fds, and do not return them on read (old strategy,
2363                     apparently wrong)
2364                   - clone fds (I chose it for now, it is the most universal
2365                     solution)
2366
2367                   POSIX 1003.1g does not actually define this clearly
2368                   at all. POSIX 1003.1g doesn't define a lot of things
2369                   clearly however!
2370
2371                */
2372
2373                sk_peek_offset_fwd(sk, size);
2374
2375                if (UNIXCB(skb).fp)
2376                        unix_peek_fds(&scm, skb);
2377        }
2378        err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2379
2380        scm_recv(sock, msg, &scm, flags);
2381
2382out_free:
2383        skb_free_datagram(sk, skb);
2384        mutex_unlock(&u->iolock);
2385out:
2386        return err;
2387}
2388
2389static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2390                              int flags)
2391{
2392        struct sock *sk = sock->sk;
2393
2394#ifdef CONFIG_BPF_SYSCALL
2395        const struct proto *prot = READ_ONCE(sk->sk_prot);
2396
2397        if (prot != &unix_dgram_proto)
2398                return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2399                                            flags & ~MSG_DONTWAIT, NULL);
2400#endif
2401        return __unix_dgram_recvmsg(sk, msg, size, flags);
2402}
2403
2404static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
2405                          sk_read_actor_t recv_actor)
2406{
2407        int copied = 0;
2408
2409        while (1) {
2410                struct unix_sock *u = unix_sk(sk);
2411                struct sk_buff *skb;
2412                int used, err;
2413
2414                mutex_lock(&u->iolock);
2415                skb = skb_recv_datagram(sk, 0, 1, &err);
2416                mutex_unlock(&u->iolock);
2417                if (!skb)
2418                        return err;
2419
2420                used = recv_actor(desc, skb, 0, skb->len);
2421                if (used <= 0) {
2422                        if (!copied)
2423                                copied = used;
2424                        kfree_skb(skb);
2425                        break;
2426                } else if (used <= skb->len) {
2427                        copied += used;
2428                }
2429
2430                kfree_skb(skb);
2431                if (!desc->count)
2432                        break;
2433        }
2434
2435        return copied;
2436}
2437
2438/*
2439 *      Sleep until more data has arrived. But check for races..
2440 */
2441static long unix_stream_data_wait(struct sock *sk, long timeo,
2442                                  struct sk_buff *last, unsigned int last_len,
2443                                  bool freezable)
2444{
2445        struct sk_buff *tail;
2446        DEFINE_WAIT(wait);
2447
2448        unix_state_lock(sk);
2449
2450        for (;;) {
2451                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2452
2453                tail = skb_peek_tail(&sk->sk_receive_queue);
2454                if (tail != last ||
2455                    (tail && tail->len != last_len) ||
2456                    sk->sk_err ||
2457                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2458                    signal_pending(current) ||
2459                    !timeo)
2460                        break;
2461
2462                sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2463                unix_state_unlock(sk);
2464                if (freezable)
2465                        timeo = freezable_schedule_timeout(timeo);
2466                else
2467                        timeo = schedule_timeout(timeo);
2468                unix_state_lock(sk);
2469
2470                if (sock_flag(sk, SOCK_DEAD))
2471                        break;
2472
2473                sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2474        }
2475
2476        finish_wait(sk_sleep(sk), &wait);
2477        unix_state_unlock(sk);
2478        return timeo;
2479}
2480
2481static unsigned int unix_skb_len(const struct sk_buff *skb)
2482{
2483        return skb->len - UNIXCB(skb).consumed;
2484}
2485
2486struct unix_stream_read_state {
2487        int (*recv_actor)(struct sk_buff *, int, int,
2488                          struct unix_stream_read_state *);
2489        struct socket *socket;
2490        struct msghdr *msg;
2491        struct pipe_inode_info *pipe;
2492        size_t size;
2493        int flags;
2494        unsigned int splice_flags;
2495};
2496
2497#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2498static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2499{
2500        struct socket *sock = state->socket;
2501        struct sock *sk = sock->sk;
2502        struct unix_sock *u = unix_sk(sk);
2503        int chunk = 1;
2504        struct sk_buff *oob_skb;
2505
2506        mutex_lock(&u->iolock);
2507        unix_state_lock(sk);
2508
2509        if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2510                unix_state_unlock(sk);
2511                mutex_unlock(&u->iolock);
2512                return -EINVAL;
2513        }
2514
2515        oob_skb = u->oob_skb;
2516
2517        if (!(state->flags & MSG_PEEK)) {
2518                u->oob_skb = NULL;
2519        }
2520
2521        unix_state_unlock(sk);
2522
2523        chunk = state->recv_actor(oob_skb, 0, chunk, state);
2524
2525        if (!(state->flags & MSG_PEEK)) {
2526                UNIXCB(oob_skb).consumed += 1;
2527                kfree_skb(oob_skb);
2528        }
2529
2530        mutex_unlock(&u->iolock);
2531
2532        if (chunk < 0)
2533                return -EFAULT;
2534
2535        state->msg->msg_flags |= MSG_OOB;
2536        return 1;
2537}
2538
2539static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2540                                  int flags, int copied)
2541{
2542        struct unix_sock *u = unix_sk(sk);
2543
2544        if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2545                skb_unlink(skb, &sk->sk_receive_queue);
2546                consume_skb(skb);
2547                skb = NULL;
2548        } else {
2549                if (skb == u->oob_skb) {
2550                        if (copied) {
2551                                skb = NULL;
2552                        } else if (sock_flag(sk, SOCK_URGINLINE)) {
2553                                if (!(flags & MSG_PEEK)) {
2554                                        u->oob_skb = NULL;
2555                                        consume_skb(skb);
2556                                }
2557                        } else if (!(flags & MSG_PEEK)) {
2558                                skb_unlink(skb, &sk->sk_receive_queue);
2559                                consume_skb(skb);
2560                                skb = skb_peek(&sk->sk_receive_queue);
2561                        }
2562                }
2563        }
2564        return skb;
2565}
2566#endif
2567
2568static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
2569                                 sk_read_actor_t recv_actor)
2570{
2571        if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2572                return -ENOTCONN;
2573
2574        return unix_read_sock(sk, desc, recv_actor);
2575}
2576
2577static int unix_stream_read_generic(struct unix_stream_read_state *state,
2578                                    bool freezable)
2579{
2580        struct scm_cookie scm;
2581        struct socket *sock = state->socket;
2582        struct sock *sk = sock->sk;
2583        struct unix_sock *u = unix_sk(sk);
2584        int copied = 0;
2585        int flags = state->flags;
2586        int noblock = flags & MSG_DONTWAIT;
2587        bool check_creds = false;
2588        int target;
2589        int err = 0;
2590        long timeo;
2591        int skip;
2592        size_t size = state->size;
2593        unsigned int last_len;
2594
2595        if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2596                err = -EINVAL;
2597                goto out;
2598        }
2599
2600        if (unlikely(flags & MSG_OOB)) {
2601                err = -EOPNOTSUPP;
2602#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2603                err = unix_stream_recv_urg(state);
2604#endif
2605                goto out;
2606        }
2607
2608        target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2609        timeo = sock_rcvtimeo(sk, noblock);
2610
2611        memset(&scm, 0, sizeof(scm));
2612
2613        /* Lock the socket to prevent queue disordering
2614         * while sleeps in memcpy_tomsg
2615         */
2616        mutex_lock(&u->iolock);
2617
2618        skip = max(sk_peek_offset(sk, flags), 0);
2619
2620        do {
2621                int chunk;
2622                bool drop_skb;
2623                struct sk_buff *skb, *last;
2624
2625redo:
2626                unix_state_lock(sk);
2627                if (sock_flag(sk, SOCK_DEAD)) {
2628                        err = -ECONNRESET;
2629                        goto unlock;
2630                }
2631                last = skb = skb_peek(&sk->sk_receive_queue);
2632                last_len = last ? last->len : 0;
2633
2634#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2635                if (skb) {
2636                        skb = manage_oob(skb, sk, flags, copied);
2637                        if (!skb) {
2638                                unix_state_unlock(sk);
2639                                if (copied)
2640                                        break;
2641                                goto redo;
2642                        }
2643                }
2644#endif
2645again:
2646                if (skb == NULL) {
2647                        if (copied >= target)
2648                                goto unlock;
2649
2650                        /*
2651                         *      POSIX 1003.1g mandates this order.
2652                         */
2653
2654                        err = sock_error(sk);
2655                        if (err)
2656                                goto unlock;
2657                        if (sk->sk_shutdown & RCV_SHUTDOWN)
2658                                goto unlock;
2659
2660                        unix_state_unlock(sk);
2661                        if (!timeo) {
2662                                err = -EAGAIN;
2663                                break;
2664                        }
2665
2666                        mutex_unlock(&u->iolock);
2667
2668                        timeo = unix_stream_data_wait(sk, timeo, last,
2669                                                      last_len, freezable);
2670
2671                        if (signal_pending(current)) {
2672                                err = sock_intr_errno(timeo);
2673                                scm_destroy(&scm);
2674                                goto out;
2675                        }
2676
2677                        mutex_lock(&u->iolock);
2678                        goto redo;
2679unlock:
2680                        unix_state_unlock(sk);
2681                        break;
2682                }
2683
2684                while (skip >= unix_skb_len(skb)) {
2685                        skip -= unix_skb_len(skb);
2686                        last = skb;
2687                        last_len = skb->len;
2688                        skb = skb_peek_next(skb, &sk->sk_receive_queue);
2689                        if (!skb)
2690                                goto again;
2691                }
2692
2693                unix_state_unlock(sk);
2694
2695                if (check_creds) {
2696                        /* Never glue messages from different writers */
2697                        if (!unix_skb_scm_eq(skb, &scm))
2698                                break;
2699                } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2700                        /* Copy credentials */
2701                        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2702                        unix_set_secdata(&scm, skb);
2703                        check_creds = true;
2704                }
2705
2706                /* Copy address just once */
2707                if (state->msg && state->msg->msg_name) {
2708                        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2709                                         state->msg->msg_name);
2710                        unix_copy_addr(state->msg, skb->sk);
2711                        sunaddr = NULL;
2712                }
2713
2714                chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2715                skb_get(skb);
2716                chunk = state->recv_actor(skb, skip, chunk, state);
2717                drop_skb = !unix_skb_len(skb);
2718                /* skb is only safe to use if !drop_skb */
2719                consume_skb(skb);
2720                if (chunk < 0) {
2721                        if (copied == 0)
2722                                copied = -EFAULT;
2723                        break;
2724                }
2725                copied += chunk;
2726                size -= chunk;
2727
2728                if (drop_skb) {
2729                        /* the skb was touched by a concurrent reader;
2730                         * we should not expect anything from this skb
2731                         * anymore and assume it invalid - we can be
2732                         * sure it was dropped from the socket queue
2733                         *
2734                         * let's report a short read
2735                         */
2736                        err = 0;
2737                        break;
2738                }
2739
2740                /* Mark read part of skb as used */
2741                if (!(flags & MSG_PEEK)) {
2742                        UNIXCB(skb).consumed += chunk;
2743
2744                        sk_peek_offset_bwd(sk, chunk);
2745
2746                        if (UNIXCB(skb).fp) {
2747                                scm_stat_del(sk, skb);
2748                                unix_detach_fds(&scm, skb);
2749                        }
2750
2751                        if (unix_skb_len(skb))
2752                                break;
2753
2754                        skb_unlink(skb, &sk->sk_receive_queue);
2755                        consume_skb(skb);
2756
2757                        if (scm.fp)
2758                                break;
2759                } else {
2760                        /* It is questionable, see note in unix_dgram_recvmsg.
2761                         */
2762                        if (UNIXCB(skb).fp)
2763                                unix_peek_fds(&scm, skb);
2764
2765                        sk_peek_offset_fwd(sk, chunk);
2766
2767                        if (UNIXCB(skb).fp)
2768                                break;
2769
2770                        skip = 0;
2771                        last = skb;
2772                        last_len = skb->len;
2773                        unix_state_lock(sk);
2774                        skb = skb_peek_next(skb, &sk->sk_receive_queue);
2775                        if (skb)
2776                                goto again;
2777                        unix_state_unlock(sk);
2778                        break;
2779                }
2780        } while (size);
2781
2782        mutex_unlock(&u->iolock);
2783        if (state->msg)
2784                scm_recv(sock, state->msg, &scm, flags);
2785        else
2786                scm_destroy(&scm);
2787out:
2788        return copied ? : err;
2789}
2790
2791static int unix_stream_read_actor(struct sk_buff *skb,
2792                                  int skip, int chunk,
2793                                  struct unix_stream_read_state *state)
2794{
2795        int ret;
2796
2797        ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2798                                    state->msg, chunk);
2799        return ret ?: chunk;
2800}
2801
2802int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2803                          size_t size, int flags)
2804{
2805        struct unix_stream_read_state state = {
2806                .recv_actor = unix_stream_read_actor,
2807                .socket = sk->sk_socket,
2808                .msg = msg,
2809                .size = size,
2810                .flags = flags
2811        };
2812
2813        return unix_stream_read_generic(&state, true);
2814}
2815
2816static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2817                               size_t size, int flags)
2818{
2819        struct unix_stream_read_state state = {
2820                .recv_actor = unix_stream_read_actor,
2821                .socket = sock,
2822                .msg = msg,
2823                .size = size,
2824                .flags = flags
2825        };
2826
2827#ifdef CONFIG_BPF_SYSCALL
2828        struct sock *sk = sock->sk;
2829        const struct proto *prot = READ_ONCE(sk->sk_prot);
2830
2831        if (prot != &unix_stream_proto)
2832                return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2833                                            flags & ~MSG_DONTWAIT, NULL);
2834#endif
2835        return unix_stream_read_generic(&state, true);
2836}
2837
2838static int unix_stream_splice_actor(struct sk_buff *skb,
2839                                    int skip, int chunk,
2840                                    struct unix_stream_read_state *state)
2841{
2842        return skb_splice_bits(skb, state->socket->sk,
2843                               UNIXCB(skb).consumed + skip,
2844                               state->pipe, chunk, state->splice_flags);
2845}
2846
2847static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2848                                       struct pipe_inode_info *pipe,
2849                                       size_t size, unsigned int flags)
2850{
2851        struct unix_stream_read_state state = {
2852                .recv_actor = unix_stream_splice_actor,
2853                .socket = sock,
2854                .pipe = pipe,
2855                .size = size,
2856                .splice_flags = flags,
2857        };
2858
2859        if (unlikely(*ppos))
2860                return -ESPIPE;
2861
2862        if (sock->file->f_flags & O_NONBLOCK ||
2863            flags & SPLICE_F_NONBLOCK)
2864                state.flags = MSG_DONTWAIT;
2865
2866        return unix_stream_read_generic(&state, false);
2867}
2868
2869static int unix_shutdown(struct socket *sock, int mode)
2870{
2871        struct sock *sk = sock->sk;
2872        struct sock *other;
2873
2874        if (mode < SHUT_RD || mode > SHUT_RDWR)
2875                return -EINVAL;
2876        /* This maps:
2877         * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2878         * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2879         * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2880         */
2881        ++mode;
2882
2883        unix_state_lock(sk);
2884        sk->sk_shutdown |= mode;
2885        if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2886            mode == SHUTDOWN_MASK)
2887                sk->sk_state = TCP_CLOSE;
2888        other = unix_peer(sk);
2889        if (other)
2890                sock_hold(other);
2891        unix_state_unlock(sk);
2892        sk->sk_state_change(sk);
2893
2894        if (other &&
2895                (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2896
2897                int peer_mode = 0;
2898                const struct proto *prot = READ_ONCE(other->sk_prot);
2899
2900                if (prot->unhash)
2901                        prot->unhash(other);
2902                if (mode&RCV_SHUTDOWN)
2903                        peer_mode |= SEND_SHUTDOWN;
2904                if (mode&SEND_SHUTDOWN)
2905                        peer_mode |= RCV_SHUTDOWN;
2906                unix_state_lock(other);
2907                other->sk_shutdown |= peer_mode;
2908                unix_state_unlock(other);
2909                other->sk_state_change(other);
2910                if (peer_mode == SHUTDOWN_MASK)
2911                        sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2912                else if (peer_mode & RCV_SHUTDOWN)
2913                        sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2914        }
2915        if (other)
2916                sock_put(other);
2917
2918        return 0;
2919}
2920
2921long unix_inq_len(struct sock *sk)
2922{
2923        struct sk_buff *skb;
2924        long amount = 0;
2925
2926        if (sk->sk_state == TCP_LISTEN)
2927                return -EINVAL;
2928
2929        spin_lock(&sk->sk_receive_queue.lock);
2930        if (sk->sk_type == SOCK_STREAM ||
2931            sk->sk_type == SOCK_SEQPACKET) {
2932                skb_queue_walk(&sk->sk_receive_queue, skb)
2933                        amount += unix_skb_len(skb);
2934        } else {
2935                skb = skb_peek(&sk->sk_receive_queue);
2936                if (skb)
2937                        amount = skb->len;
2938        }
2939        spin_unlock(&sk->sk_receive_queue.lock);
2940
2941        return amount;
2942}
2943EXPORT_SYMBOL_GPL(unix_inq_len);
2944
/*
 * Report the amount of outgoing data accounted to @sk, as given by
 * sk_wmem_alloc_get().
 */
long unix_outq_len(struct sock *sk)
{
        long queued = sk_wmem_alloc_get(sk);

        return queued;
}
2949EXPORT_SYMBOL_GPL(unix_outq_len);
2950
2951static int unix_open_file(struct sock *sk)
2952{
2953        struct path path;
2954        struct file *f;
2955        int fd;
2956
2957        if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2958                return -EPERM;
2959
2960        if (!smp_load_acquire(&unix_sk(sk)->addr))
2961                return -ENOENT;
2962
2963        path = unix_sk(sk)->path;
2964        if (!path.dentry)
2965                return -ENOENT;
2966
2967        path_get(&path);
2968
2969        fd = get_unused_fd_flags(O_CLOEXEC);
2970        if (fd < 0)
2971                goto out;
2972
2973        f = dentry_open(&path, O_PATH, current_cred());
2974        if (IS_ERR(f)) {
2975                put_unused_fd(fd);
2976                fd = PTR_ERR(f);
2977                goto out;
2978        }
2979
2980        fd_install(fd, f);
2981out:
2982        path_put(&path);
2983
2984        return fd;
2985}
2986
2987static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2988{
2989        struct sock *sk = sock->sk;
2990        long amount = 0;
2991        int err;
2992
2993        switch (cmd) {
2994        case SIOCOUTQ:
2995                amount = unix_outq_len(sk);
2996                err = put_user(amount, (int __user *)arg);
2997                break;
2998        case SIOCINQ:
2999                amount = unix_inq_len(sk);
3000                if (amount < 0)
3001                        err = amount;
3002                else
3003                        err = put_user(amount, (int __user *)arg);
3004                break;
3005        case SIOCUNIXFILE:
3006                err = unix_open_file(sk);
3007                break;
3008#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3009        case SIOCATMARK:
3010                {
3011                        struct sk_buff *skb;
3012                        struct unix_sock *u = unix_sk(sk);
3013                        int answ = 0;
3014
3015                        skb = skb_peek(&sk->sk_receive_queue);
3016                        if (skb && skb == u->oob_skb)
3017                                answ = 1;
3018                        err = put_user(answ, (int __user *)arg);
3019                }
3020                break;
3021#endif
3022        default:
3023                err = -ENOIOCTLCMD;
3024                break;
3025        }
3026        return err;
3027}
3028
3029#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: converts @arg with compat_ptr() and forwards
 * the call to unix_ioctl().
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return unix_ioctl(sock, cmd, native_arg);
}
3034#endif
3035
3036static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3037{
3038        struct sock *sk = sock->sk;
3039        __poll_t mask;
3040
3041        sock_poll_wait(file, sock, wait);
3042        mask = 0;
3043
3044        /* exceptional events? */
3045        if (sk->sk_err)
3046                mask |= EPOLLERR;
3047        if (sk->sk_shutdown == SHUTDOWN_MASK)
3048                mask |= EPOLLHUP;
3049        if (sk->sk_shutdown & RCV_SHUTDOWN)
3050                mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3051
3052        /* readable? */
3053        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3054                mask |= EPOLLIN | EPOLLRDNORM;
3055        if (sk_is_readable(sk))
3056                mask |= EPOLLIN | EPOLLRDNORM;
3057
3058        /* Connection-based need to check for termination and startup */
3059        if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3060            sk->sk_state == TCP_CLOSE)
3061                mask |= EPOLLHUP;
3062
3063        /*
3064         * we set writable also when the other side has shut down the
3065         * connection. This prevents stuck sockets.
3066         */
3067        if (unix_writable(sk))
3068                mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3069
3070        return mask;
3071}
3072
3073static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3074                                    poll_table *wait)
3075{
3076        struct sock *sk = sock->sk, *other;
3077        unsigned int writable;
3078        __poll_t mask;
3079
3080        sock_poll_wait(file, sock, wait);
3081        mask = 0;
3082
3083        /* exceptional events? */
3084        if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3085                mask |= EPOLLERR |
3086                        (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3087
3088        if (sk->sk_shutdown & RCV_SHUTDOWN)
3089                mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3090        if (sk->sk_shutdown == SHUTDOWN_MASK)
3091                mask |= EPOLLHUP;
3092
3093        /* readable? */
3094        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3095                mask |= EPOLLIN | EPOLLRDNORM;
3096        if (sk_is_readable(sk))
3097                mask |= EPOLLIN | EPOLLRDNORM;
3098
3099        /* Connection-based need to check for termination and startup */
3100        if (sk->sk_type == SOCK_SEQPACKET) {
3101                if (sk->sk_state == TCP_CLOSE)
3102                        mask |= EPOLLHUP;
3103                /* connection hasn't started yet? */
3104                if (sk->sk_state == TCP_SYN_SENT)
3105                        return mask;
3106        }
3107
3108        /* No write status requested, avoid expensive OUT tests. */
3109        if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3110                return mask;
3111
3112        writable = unix_writable(sk);
3113        if (writable) {
3114                unix_state_lock(sk);
3115
3116                other = unix_peer(sk);
3117                if (other && unix_peer(other) != sk &&
3118                    unix_recvq_full_lockless(other) &&
3119                    unix_dgram_peer_wake_me(sk, other))
3120                        writable = 0;
3121
3122                unix_state_unlock(sk);
3123        }
3124
3125        if (writable)
3126                mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3127        else
3128                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3129
3130        return mask;
3131}
3132
3133#ifdef CONFIG_PROC_FS
3134
/*
 * A seq_file position for the socket table is split in two: the low
 * BUCKET_SPACE bits hold an offset within a hash bucket, the bits above
 * them hold the bucket index.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket index encoded in position @x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket encoded in position @x. */
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
/* Position that encodes offset @o within bucket @b. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
3140
3141static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3142{
3143        unsigned long offset = get_offset(*pos);
3144        unsigned long bucket = get_bucket(*pos);
3145        struct sock *sk;
3146        unsigned long count = 0;
3147
3148        for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
3149                if (sock_net(sk) != seq_file_net(seq))
3150                        continue;
3151                if (++count == offset)
3152                        break;
3153        }
3154
3155        return sk;
3156}
3157
3158static struct sock *unix_next_socket(struct seq_file *seq,
3159                                     struct sock *sk,
3160                                     loff_t *pos)
3161{
3162        unsigned long bucket;
3163
3164        while (sk > (struct sock *)SEQ_START_TOKEN) {
3165                sk = sk_next(sk);
3166                if (!sk)
3167                        goto next_bucket;
3168                if (sock_net(sk) == seq_file_net(seq))
3169                        return sk;
3170        }
3171
3172        do {
3173                sk = unix_from_bucket(seq, pos);
3174                if (sk)
3175                        return sk;
3176
3177next_bucket:
3178                bucket = get_bucket(*pos) + 1;
3179                *pos = set_bucket_offset(bucket, 1);
3180        } while (bucket < ARRAY_SIZE(unix_socket_table));
3181
3182        return NULL;
3183}
3184
3185static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3186        __acquires(unix_table_lock)
3187{
3188        spin_lock(&unix_table_lock);
3189
3190        if (!*pos)
3191                return SEQ_START_TOKEN;
3192
3193        if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
3194                return NULL;
3195
3196        return unix_next_socket(seq, NULL, pos);
3197}
3198
3199static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3200{
3201        ++*pos;
3202        return unix_next_socket(seq, v, pos);
3203}
3204
3205static void unix_seq_stop(struct seq_file *seq, void *v)
3206        __releases(unix_table_lock)
3207{
3208        spin_unlock(&unix_table_lock);
3209}
3210
3211static int unix_seq_show(struct seq_file *seq, void *v)
3212{
3213
3214        if (v == SEQ_START_TOKEN)
3215                seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3216                         "Inode Path\n");
3217        else {
3218                struct sock *s = v;
3219                struct unix_sock *u = unix_sk(s);
3220                unix_state_lock(s);
3221
3222                seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3223                        s,
3224                        refcount_read(&s->sk_refcnt),
3225                        0,
3226                        s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3227                        s->sk_type,
3228                        s->sk_socket ?
3229                        (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3230                        (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3231                        sock_i_ino(s));
3232
3233                if (u->addr) {  // under unix_table_lock here
3234                        int i, len;
3235                        seq_putc(seq, ' ');
3236
3237                        i = 0;
3238                        len = u->addr->len - sizeof(short);
3239                        if (!UNIX_ABSTRACT(s))
3240                                len--;
3241                        else {
3242                                seq_putc(seq, '@');
3243                                i++;
3244                        }
3245                        for ( ; i < len; i++)
3246                                seq_putc(seq, u->addr->name->sun_path[i] ?:
3247                                         '@');
3248                }
3249                unix_state_unlock(s);
3250                seq_putc(seq, '\n');
3251        }
3252
3253        return 0;
3254}
3255
3256static const struct seq_operations unix_seq_ops = {
3257        .start  = unix_seq_start,
3258        .next   = unix_seq_next,
3259        .stop   = unix_seq_stop,
3260        .show   = unix_seq_show,
3261};
3262
3263#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3264struct bpf_iter__unix {
3265        __bpf_md_ptr(struct bpf_iter_meta *, meta);
3266        __bpf_md_ptr(struct unix_sock *, unix_sk);
3267        uid_t uid __aligned(8);
3268};
3269
3270static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3271                              struct unix_sock *unix_sk, uid_t uid)
3272{
3273        struct bpf_iter__unix ctx;
3274
3275        meta->seq_num--;  /* skip SEQ_START_TOKEN */
3276        ctx.meta = meta;
3277        ctx.unix_sk = unix_sk;
3278        ctx.uid = uid;
3279        return bpf_iter_run_prog(prog, &ctx);
3280}
3281
3282static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3283{
3284        struct bpf_iter_meta meta;
3285        struct bpf_prog *prog;
3286        struct sock *sk = v;
3287        uid_t uid;
3288
3289        if (v == SEQ_START_TOKEN)
3290                return 0;
3291
3292        uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3293        meta.seq = seq;
3294        prog = bpf_iter_get_info(&meta, false);
3295        return unix_prog_seq_show(prog, &meta, v, uid);
3296}
3297
3298static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3299{
3300        struct bpf_iter_meta meta;
3301        struct bpf_prog *prog;
3302
3303        if (!v) {
3304                meta.seq = seq;
3305                prog = bpf_iter_get_info(&meta, true);
3306                if (prog)
3307                        (void)unix_prog_seq_show(prog, &meta, v, 0);
3308        }
3309
3310        unix_seq_stop(seq, v);
3311}
3312
3313static const struct seq_operations bpf_iter_unix_seq_ops = {
3314        .start  = unix_seq_start,
3315        .next   = unix_seq_next,
3316        .stop   = bpf_iter_unix_seq_stop,
3317        .show   = bpf_iter_unix_seq_show,
3318};
3319#endif
3320#endif
3321
3322static const struct net_proto_family unix_family_ops = {
3323        .family = PF_UNIX,
3324        .create = unix_create,
3325        .owner  = THIS_MODULE,
3326};
3327
3328
3329static int __net_init unix_net_init(struct net *net)
3330{
3331        int error = -ENOMEM;
3332
3333        net->unx.sysctl_max_dgram_qlen = 10;
3334        if (unix_sysctl_register(net))
3335                goto out;
3336
3337#ifdef CONFIG_PROC_FS
3338        if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3339                        sizeof(struct seq_net_private))) {
3340                unix_sysctl_unregister(net);
3341                goto out;
3342        }
3343#endif
3344        error = 0;
3345out:
3346        return error;
3347}
3348
3349static void __net_exit unix_net_exit(struct net *net)
3350{
3351        unix_sysctl_unregister(net);
3352        remove_proc_entry("unix", net->proc_net);
3353}
3354
3355static struct pernet_operations unix_net_ops = {
3356        .init = unix_net_init,
3357        .exit = unix_net_exit,
3358};
3359
3360#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3361DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3362                     struct unix_sock *unix_sk, uid_t uid)
3363
3364static const struct bpf_iter_seq_info unix_seq_info = {
3365        .seq_ops                = &bpf_iter_unix_seq_ops,
3366        .init_seq_private       = bpf_iter_init_seq_net,
3367        .fini_seq_private       = bpf_iter_fini_seq_net,
3368        .seq_priv_size          = sizeof(struct seq_net_private),
3369};
3370
3371static struct bpf_iter_reg unix_reg_info = {
3372        .target                 = "unix",
3373        .ctx_arg_info_size      = 1,
3374        .ctx_arg_info           = {
3375                { offsetof(struct bpf_iter__unix, unix_sk),
3376                  PTR_TO_BTF_ID_OR_NULL },
3377        },
3378        .seq_info               = &unix_seq_info,
3379};
3380
3381static void __init bpf_iter_register(void)
3382{
3383        unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3384        if (bpf_iter_reg_target(&unix_reg_info))
3385                pr_warn("Warning: could not register bpf iterator unix\n");
3386}
3387#endif
3388
3389static int __init af_unix_init(void)
3390{
3391        int rc = -1;
3392
3393        BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3394
3395        rc = proto_register(&unix_dgram_proto, 1);
3396        if (rc != 0) {
3397                pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3398                goto out;
3399        }
3400
3401        rc = proto_register(&unix_stream_proto, 1);
3402        if (rc != 0) {
3403                pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3404                goto out;
3405        }
3406
3407        sock_register(&unix_family_ops);
3408        register_pernet_subsys(&unix_net_ops);
3409        unix_bpf_build_proto();
3410
3411#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3412        bpf_iter_register();
3413#endif
3414
3415out:
3416        return rc;
3417}
3418
3419static void __exit af_unix_exit(void)
3420{
3421        sock_unregister(PF_UNIX);
3422        proto_unregister(&unix_dgram_proto);
3423        proto_unregister(&unix_stream_proto);
3424        unregister_pernet_subsys(&unix_net_ops);
3425}
3426
3427/* Earlier than device_initcall() so that other drivers invoking
3428   request_module() don't end up in a loop when modprobe tries
3429   to use a UNIX socket. But later than subsys_initcall() because
3430   we depend on stuff initialised there */
3431fs_initcall(af_unix_init);
3432module_exit(af_unix_exit);
3433
3434MODULE_LICENSE("GPL");
3435MODULE_ALIAS_NETPROTO(PF_UNIX);
3436