linux/kernel/exit.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/kernel/exit.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/mm.h>
   9#include <linux/slab.h>
  10#include <linux/sched/autogroup.h>
  11#include <linux/sched/mm.h>
  12#include <linux/sched/stat.h>
  13#include <linux/sched/task.h>
  14#include <linux/sched/task_stack.h>
  15#include <linux/sched/cputime.h>
  16#include <linux/interrupt.h>
  17#include <linux/module.h>
  18#include <linux/capability.h>
  19#include <linux/completion.h>
  20#include <linux/personality.h>
  21#include <linux/tty.h>
  22#include <linux/iocontext.h>
  23#include <linux/key.h>
  24#include <linux/cpu.h>
  25#include <linux/acct.h>
  26#include <linux/tsacct_kern.h>
  27#include <linux/file.h>
  28#include <linux/fdtable.h>
  29#include <linux/freezer.h>
  30#include <linux/binfmts.h>
  31#include <linux/nsproxy.h>
  32#include <linux/pid_namespace.h>
  33#include <linux/ptrace.h>
  34#include <linux/profile.h>
  35#include <linux/mount.h>
  36#include <linux/proc_fs.h>
  37#include <linux/kthread.h>
  38#include <linux/mempolicy.h>
  39#include <linux/taskstats_kern.h>
  40#include <linux/delayacct.h>
  41#include <linux/cgroup.h>
  42#include <linux/syscalls.h>
  43#include <linux/signal.h>
  44#include <linux/posix-timers.h>
  45#include <linux/cn_proc.h>
  46#include <linux/mutex.h>
  47#include <linux/futex.h>
  48#include <linux/pipe_fs_i.h>
  49#include <linux/audit.h> /* for audit_free() */
  50#include <linux/resource.h>
  51#include <linux/blkdev.h>
  52#include <linux/task_io_accounting_ops.h>
  53#include <linux/tracehook.h>
  54#include <linux/fs_struct.h>
  55#include <linux/init_task.h>
  56#include <linux/perf_event.h>
  57#include <trace/events/sched.h>
  58#include <linux/hw_breakpoint.h>
  59#include <linux/oom.h>
  60#include <linux/writeback.h>
  61#include <linux/shm.h>
  62#include <linux/kcov.h>
  63#include <linux/random.h>
  64#include <linux/rcuwait.h>
  65#include <linux/compat.h>
  66#include <linux/io_uring.h>
  67
  68#include <linux/uaccess.h>
  69#include <asm/unistd.h>
  70#include <asm/mmu_context.h>
  71
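/*
 * Drop @p from the pid hashes and the thread lists. If @group_dead, the
 * whole thread group is gone, so also detach the TGID/PGID/SID links and
 * take the group off the global task list. Called from __exit_signal()
 * with tasklist_lock write-held and the sighand lock taken.
 */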
  72static void __unhash_process(struct task_struct *p, bool group_dead)
  73{
  74        nr_threads--;
  75        detach_pid(p, PIDTYPE_PID);
  76        if (group_dead) {
  77                detach_pid(p, PIDTYPE_TGID);
  78                detach_pid(p, PIDTYPE_PGID);
  79                detach_pid(p, PIDTYPE_SID);
  80
  81                list_del_rcu(&p->tasks);
  82                list_del_init(&p->sibling);
  83                __this_cpu_dec(process_counts);
  84        }
  85        list_del_rcu(&p->thread_group);
  86        list_del_rcu(&p->thread_node);
  87}
  88
  89/*
  90 * This function expects the tasklist_lock write-locked.
  91 */
  92static void __exit_signal(struct task_struct *tsk)
  93{
  94        struct signal_struct *sig = tsk->signal;
  95        bool group_dead = thread_group_leader(tsk);
  96        struct sighand_struct *sighand;
  97        struct tty_struct *tty;
  98        u64 utime, stime;
  99
 100        sighand = rcu_dereference_check(tsk->sighand,
 101                                        lockdep_tasklist_lock_is_held());
 102        spin_lock(&sighand->siglock);
 103
 104#ifdef CONFIG_POSIX_TIMERS
 105        posix_cpu_timers_exit(tsk);
 106        if (group_dead)
 107                posix_cpu_timers_exit_group(tsk);
 108#endif
 109
 110        if (group_dead) {
 111                tty = sig->tty;
 112                sig->tty = NULL;
 113        } else {
 114                /*
 115                 * If there is any task waiting for the group exit
 116                 * then notify it:
 117                 */
 118                if (sig->notify_count > 0 && !--sig->notify_count)
 119                        wake_up_process(sig->group_exit_task);
 120
 121                if (tsk == sig->curr_target)
 122                        sig->curr_target = next_thread(tsk);
 123        }
 124
 125        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 126                              sizeof(unsigned long long));
 127
 128        /*
 129         * Accumulate here the counters for all threads as they die. We could
 130         * skip the group leader because it is the last user of signal_struct,
 131         * but we want to avoid the race with thread_group_cputime() which can
 132         * see the empty ->thread_head list.
 133         */
 134        task_cputime(tsk, &utime, &stime);
 135        write_seqlock(&sig->stats_lock);
 136        sig->utime += utime;
 137        sig->stime += stime;
 138        sig->gtime += task_gtime(tsk);
 139        sig->min_flt += tsk->min_flt;
 140        sig->maj_flt += tsk->maj_flt;
 141        sig->nvcsw += tsk->nvcsw;
 142        sig->nivcsw += tsk->nivcsw;
 143        sig->inblock += task_io_get_inblock(tsk);
 144        sig->oublock += task_io_get_oublock(tsk);
 145        task_io_accounting_add(&sig->ioac, &tsk->ioac);
 146        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 147        sig->nr_threads--;
 148        __unhash_process(tsk, group_dead);
 149        write_sequnlock(&sig->stats_lock);
 150
 151        /*
  152         * Do this under ->siglock; we can race with another thread
 153         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 154         */
 155        flush_sigqueue(&tsk->pending);
 156        tsk->sighand = NULL;
 157        spin_unlock(&sighand->siglock);
 158
 159        __cleanup_sighand(sighand);
 160        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 161        if (group_dead) {
 162                flush_sigqueue(&sig->shared_pending);
 163                tty_kref_put(tty);
 164        }
 165}
 166
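/*
 * Final task_struct references taken through ->rcu_users are dropped via
 * RCU: once the count hits zero, the real put_task_struct() is deferred
 * past a grace period so lock-free walkers under rcu_read_lock() can still
 * dereference the task safely.
 */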
 167static void delayed_put_task_struct(struct rcu_head *rhp)
 168{
 169        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 170
 171        perf_event_delayed_put(tsk);
 172        trace_sched_process_free(tsk);
 173        put_task_struct(tsk);
 174}
 175
 176void put_task_struct_rcu_user(struct task_struct *task)
 177{
 178        if (refcount_dec_and_test(&task->rcu_users))
 179                call_rcu(&task->rcu, delayed_put_task_struct);
 180}
 181
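/*
 * Final teardown of a dead task: drop its accounting, ptrace state and pid
 * links, flush its /proc entries, and put the reference that was held for
 * the zombie. If this was the last non-leader thread and the zombie group
 * leader can now be reaped too, loop and release the leader as well.
 */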
 182void release_task(struct task_struct *p)
 183{
 184        struct task_struct *leader;
 185        struct pid *thread_pid;
 186        int zap_leader;
 187repeat:
 188        /* don't need to get the RCU readlock here - the process is dead and
 189         * can't be modifying its own credentials. But shut RCU-lockdep up */
 190        rcu_read_lock();
 191        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 192        rcu_read_unlock();
 193
 194        cgroup_release(p);
 195
 196        write_lock_irq(&tasklist_lock);
 197        ptrace_release_task(p);
 198        thread_pid = get_pid(p->thread_pid);
 199        __exit_signal(p);
 200
 201        /*
 202         * If we are the last non-leader member of the thread
  203         * group, and the leader is a zombie, then notify the
  204         * group leader's parent process (if it wants notification).
 205         */
 206        zap_leader = 0;
 207        leader = p->group_leader;
 208        if (leader != p && thread_group_empty(leader)
 209                        && leader->exit_state == EXIT_ZOMBIE) {
 210                /*
 211                 * If we were the last child thread and the leader has
 212                 * exited already, and the leader's parent ignores SIGCHLD,
 213                 * then we are the one who should release the leader.
 214                 */
 215                zap_leader = do_notify_parent(leader, leader->exit_signal);
 216                if (zap_leader)
 217                        leader->exit_state = EXIT_DEAD;
 218        }
 219
 220        write_unlock_irq(&tasklist_lock);
 221        seccomp_filter_release(p);
 222        proc_flush_pid(thread_pid);
 223        put_pid(thread_pid);
 224        release_thread(p);
 225        put_task_struct_rcu_user(p);
 226
 227        p = leader;
 228        if (unlikely(zap_leader))
 229                goto repeat;
 230}
 231
 232int rcuwait_wake_up(struct rcuwait *w)
 233{
 234        int ret = 0;
 235        struct task_struct *task;
 236
 237        rcu_read_lock();
 238
 239        /*
 240         * Order condition vs @task, such that everything prior to the load
 241         * of @task is visible. This is the condition as to why the user called
 242         * rcuwait_wake() in the first place. Pairs with set_current_state()
 243         * barrier (A) in rcuwait_wait_event().
 244         *
 245         *    WAIT                WAKE
 246         *    [S] tsk = current   [S] cond = true
 247         *        MB (A)              MB (B)
 248         *    [L] cond            [L] tsk
 249         */
 250        smp_mb(); /* (B) */
 251
 252        task = rcu_dereference(w->task);
 253        if (task)
 254                ret = wake_up_process(task);
 255        rcu_read_unlock();
 256
 257        return ret;
 258}
 259EXPORT_SYMBOL_GPL(rcuwait_wake_up);
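
/*
 * Illustrative pairing for the barrier comment above (a sketch, not code
 * taken from this file): the wait side typically does
 *
 *	rcuwait_wait_event(&w, READ_ONCE(done), TASK_UNINTERRUPTIBLE);
 *
 * while the wake side sets the condition first:
 *
 *	WRITE_ONCE(done, true);
 *	rcuwait_wake_up(&w);
 *
 * so that barrier (A) in rcuwait_wait_event() pairs with barrier (B) above.
 */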
 260
 261/*
 262 * Determine if a process group is "orphaned", according to the POSIX
 263 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 264 * by terminal-generated stop signals.  Newly orphaned process groups are
 265 * to receive a SIGHUP and a SIGCONT.
 266 *
 267 * "I ask you, have you ever known what it is to be an orphan?"
 268 */
 269static int will_become_orphaned_pgrp(struct pid *pgrp,
 270                                        struct task_struct *ignored_task)
 271{
 272        struct task_struct *p;
 273
 274        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 275                if ((p == ignored_task) ||
 276                    (p->exit_state && thread_group_empty(p)) ||
 277                    is_global_init(p->real_parent))
 278                        continue;
 279
 280                if (task_pgrp(p->real_parent) != pgrp &&
 281                    task_session(p->real_parent) == task_session(p))
 282                        return 0;
 283        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 284
 285        return 1;
 286}
 287
 288int is_current_pgrp_orphaned(void)
 289{
 290        int retval;
 291
 292        read_lock(&tasklist_lock);
 293        retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
 294        read_unlock(&tasklist_lock);
 295
 296        return retval;
 297}
 298
 299static bool has_stopped_jobs(struct pid *pgrp)
 300{
 301        struct task_struct *p;
 302
 303        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 304                if (p->signal->flags & SIGNAL_STOP_STOPPED)
 305                        return true;
 306        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 307
 308        return false;
 309}
 310
 311/*
 312 * Check to see if any process groups have become orphaned as
 313 * a result of our exiting, and if they have any stopped jobs,
 314 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 315 */
 316static void
 317kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 318{
 319        struct pid *pgrp = task_pgrp(tsk);
 320        struct task_struct *ignored_task = tsk;
 321
 322        if (!parent)
 323                /* exit: our father is in a different pgrp than
 324                 * we are and we were the only connection outside.
 325                 */
 326                parent = tsk->real_parent;
 327        else
 328                /* reparent: our child is in a different pgrp than
 329                 * we are, and it was the only connection outside.
 330                 */
 331                ignored_task = NULL;
 332
 333        if (task_pgrp(parent) != pgrp &&
 334            task_session(parent) == task_session(tsk) &&
 335            will_become_orphaned_pgrp(pgrp, ignored_task) &&
 336            has_stopped_jobs(pgrp)) {
 337                __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 338                __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 339        }
 340}
 341
 342#ifdef CONFIG_MEMCG
 343/*
 344 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 345 */
 346void mm_update_next_owner(struct mm_struct *mm)
 347{
 348        struct task_struct *c, *g, *p = current;
 349
 350retry:
 351        /*
 352         * If the exiting or execing task is not the owner, it's
 353         * someone else's problem.
 354         */
 355        if (mm->owner != p)
 356                return;
 357        /*
 358         * The current owner is exiting/execing and there are no other
 359         * candidates.  Do not leave the mm pointing to a possibly
 360         * freed task structure.
 361         */
 362        if (atomic_read(&mm->mm_users) <= 1) {
 363                WRITE_ONCE(mm->owner, NULL);
 364                return;
 365        }
 366
 367        read_lock(&tasklist_lock);
 368        /*
 369         * Search in the children
 370         */
 371        list_for_each_entry(c, &p->children, sibling) {
 372                if (c->mm == mm)
 373                        goto assign_new_owner;
 374        }
 375
 376        /*
 377         * Search in the siblings
 378         */
 379        list_for_each_entry(c, &p->real_parent->children, sibling) {
 380                if (c->mm == mm)
 381                        goto assign_new_owner;
 382        }
 383
 384        /*
 385         * Search through everything else, we should not get here often.
 386         */
 387        for_each_process(g) {
 388                if (g->flags & PF_KTHREAD)
 389                        continue;
 390                for_each_thread(g, c) {
 391                        if (c->mm == mm)
 392                                goto assign_new_owner;
 393                        if (c->mm)
 394                                break;
 395                }
 396        }
 397        read_unlock(&tasklist_lock);
 398        /*
  399         * We found no owner, yet mm_users > 1: this implies that we are
 400         * most likely racing with swapoff (try_to_unuse()) or /proc or
 401         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 402         */
 403        WRITE_ONCE(mm->owner, NULL);
 404        return;
 405
 406assign_new_owner:
 407        BUG_ON(c == p);
 408        get_task_struct(c);
 409        /*
 410         * The task_lock protects c->mm from changing.
 411         * We always want mm->owner->mm == mm
 412         */
 413        task_lock(c);
 414        /*
 415         * Delay read_unlock() till we have the task_lock()
 416         * to ensure that c does not slip away underneath us
 417         */
 418        read_unlock(&tasklist_lock);
 419        if (c->mm != mm) {
 420                task_unlock(c);
 421                put_task_struct(c);
 422                goto retry;
 423        }
 424        WRITE_ONCE(mm->owner, c);
 425        task_unlock(c);
 426        put_task_struct(c);
 427}
 428#endif /* CONFIG_MEMCG */
 429
 430/*
 431 * Turn us into a lazy TLB process if we
  432 * aren't already.
 433 */
 434static void exit_mm(void)
 435{
 436        struct mm_struct *mm = current->mm;
 437        struct core_state *core_state;
 438
 439        exit_mm_release(current, mm);
 440        if (!mm)
 441                return;
 442        sync_mm_rss(mm);
 443        /*
 444         * Serialize with any possible pending coredump.
 445         * We must hold mmap_lock around checking core_state
 446         * and clearing tsk->mm.  The core-inducing thread
 447         * will increment ->nr_threads for each thread in the
 448         * group with ->mm != NULL.
 449         */
 450        mmap_read_lock(mm);
 451        core_state = mm->core_state;
 452        if (core_state) {
 453                struct core_thread self;
 454
 455                mmap_read_unlock(mm);
 456
 457                self.task = current;
 458                if (self.task->flags & PF_SIGNALED)
 459                        self.next = xchg(&core_state->dumper.next, &self);
 460                else
 461                        self.task = NULL;
 462                /*
 463                 * Implies mb(), the result of xchg() must be visible
 464                 * to core_state->dumper.
 465                 */
 466                if (atomic_dec_and_test(&core_state->nr_threads))
 467                        complete(&core_state->startup);
 468
 469                for (;;) {
 470                        set_current_state(TASK_UNINTERRUPTIBLE);
 471                        if (!self.task) /* see coredump_finish() */
 472                                break;
 473                        freezable_schedule();
 474                }
 475                __set_current_state(TASK_RUNNING);
 476                mmap_read_lock(mm);
 477        }
 478        mmgrab(mm);
 479        BUG_ON(mm != current->active_mm);
 480        /* more a memory barrier than a real lock */
 481        task_lock(current);
 482        /*
 483         * When a thread stops operating on an address space, the loop
 484         * in membarrier_private_expedited() may not observe that
 485         * tsk->mm, and the loop in membarrier_global_expedited() may
 486         * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
 487         * rq->membarrier_state, so those would not issue an IPI.
 488         * Membarrier requires a memory barrier after accessing
 489         * user-space memory, before clearing tsk->mm or the
 490         * rq->membarrier_state.
 491         */
 492        smp_mb__after_spinlock();
 493        local_irq_disable();
 494        current->mm = NULL;
 495        membarrier_update_current_mm(NULL);
 496        enter_lazy_tlb(mm, current);
 497        local_irq_enable();
 498        task_unlock(current);
 499        mmap_read_unlock(mm);
 500        mm_update_next_owner(mm);
 501        mmput(mm);
 502        if (test_thread_flag(TIF_MEMDIE))
 503                exit_oom_victim();
 504}
 505
 506static struct task_struct *find_alive_thread(struct task_struct *p)
 507{
 508        struct task_struct *t;
 509
 510        for_each_thread(p, t) {
 511                if (!(t->flags & PF_EXITING))
 512                        return t;
 513        }
 514        return NULL;
 515}
 516
 517static struct task_struct *find_child_reaper(struct task_struct *father,
 518                                                struct list_head *dead)
 519        __releases(&tasklist_lock)
 520        __acquires(&tasklist_lock)
 521{
 522        struct pid_namespace *pid_ns = task_active_pid_ns(father);
 523        struct task_struct *reaper = pid_ns->child_reaper;
 524        struct task_struct *p, *n;
 525
 526        if (likely(reaper != father))
 527                return reaper;
 528
 529        reaper = find_alive_thread(father);
 530        if (reaper) {
 531                pid_ns->child_reaper = reaper;
 532                return reaper;
 533        }
 534
 535        write_unlock_irq(&tasklist_lock);
 536
 537        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
 538                list_del_init(&p->ptrace_entry);
 539                release_task(p);
 540        }
 541
 542        zap_pid_ns_processes(pid_ns);
 543        write_lock_irq(&tasklist_lock);
 544
 545        return father;
 546}
 547
 548/*
 549 * When we die, we re-parent all our children, and try to:
 550 * 1. give them to another thread in our thread group, if such a member exists
  551 * 2. give them to the first ancestor process which prctl'd itself as a
  552 *    child_subreaper for its children (like a service manager)
  553 * 3. give them to the init process (PID 1) in our pid namespace
 554 */
 555static struct task_struct *find_new_reaper(struct task_struct *father,
 556                                           struct task_struct *child_reaper)
 557{
 558        struct task_struct *thread, *reaper;
 559
 560        thread = find_alive_thread(father);
 561        if (thread)
 562                return thread;
 563
 564        if (father->signal->has_child_subreaper) {
 565                unsigned int ns_level = task_pid(father)->level;
 566                /*
 567                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
 568                 * We can't check reaper != child_reaper to ensure we do not
 569                 * cross the namespaces, the exiting parent could be injected
 570                 * by setns() + fork().
 571                 * We check pid->level, this is slightly more efficient than
 572                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
 573                 */
 574                for (reaper = father->real_parent;
 575                     task_pid(reaper)->level == ns_level;
 576                     reaper = reaper->real_parent) {
 577                        if (reaper == &init_task)
 578                                break;
 579                        if (!reaper->signal->is_child_subreaper)
 580                                continue;
 581                        thread = find_alive_thread(reaper);
 582                        if (thread)
 583                                return thread;
 584                }
 585        }
 586
 587        return child_reaper;
 588}
 589
 590/*
  591 * Any that need to be release_task'd are put on the @dead list.
 592 */
 593static void reparent_leader(struct task_struct *father, struct task_struct *p,
 594                                struct list_head *dead)
 595{
 596        if (unlikely(p->exit_state == EXIT_DEAD))
 597                return;
 598
 599        /* We don't want people slaying init. */
 600        p->exit_signal = SIGCHLD;
 601
 602        /* If it has exited notify the new parent about this child's death. */
 603        if (!p->ptrace &&
 604            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 605                if (do_notify_parent(p, p->exit_signal)) {
 606                        p->exit_state = EXIT_DEAD;
 607                        list_add(&p->ptrace_entry, dead);
 608                }
 609        }
 610
 611        kill_orphaned_pgrp(p, father);
 612}
 613
 614/*
 615 * This does two things:
 616 *
 617 * A.  Make init inherit all the child processes
 618 * B.  Check to see if any process groups have become orphaned
 619 *      as a result of our exiting, and if they have any stopped
 620 *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 621 */
 622static void forget_original_parent(struct task_struct *father,
 623                                        struct list_head *dead)
 624{
 625        struct task_struct *p, *t, *reaper;
 626
 627        if (unlikely(!list_empty(&father->ptraced)))
 628                exit_ptrace(father, dead);
 629
 630        /* Can drop and reacquire tasklist_lock */
 631        reaper = find_child_reaper(father, dead);
 632        if (list_empty(&father->children))
 633                return;
 634
 635        reaper = find_new_reaper(father, reaper);
 636        list_for_each_entry(p, &father->children, sibling) {
 637                for_each_thread(p, t) {
 638                        RCU_INIT_POINTER(t->real_parent, reaper);
 639                        BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
 640                        if (likely(!t->ptrace))
 641                                t->parent = t->real_parent;
 642                        if (t->pdeath_signal)
 643                                group_send_sig_info(t->pdeath_signal,
 644                                                    SEND_SIG_NOINFO, t,
 645                                                    PIDTYPE_TGID);
 646                }
 647                /*
 648                 * If this is a threaded reparent there is no need to
  649                 * notify anyone that anything has happened.
 650                 */
 651                if (!same_thread_group(reaper, father))
 652                        reparent_leader(father, p, dead);
 653        }
 654        list_splice_tail_init(&father->children, &reaper->children);
 655}
 656
 657/*
 658 * Send signals to all our closest relatives so that they know
  659 * to properly mourn us.
 660 */
 661static void exit_notify(struct task_struct *tsk, int group_dead)
 662{
 663        bool autoreap;
 664        struct task_struct *p, *n;
 665        LIST_HEAD(dead);
 666
 667        write_lock_irq(&tasklist_lock);
 668        forget_original_parent(tsk, &dead);
 669
 670        if (group_dead)
 671                kill_orphaned_pgrp(tsk->group_leader, NULL);
 672
 673        tsk->exit_state = EXIT_ZOMBIE;
 674        if (unlikely(tsk->ptrace)) {
 675                int sig = thread_group_leader(tsk) &&
 676                                thread_group_empty(tsk) &&
 677                                !ptrace_reparented(tsk) ?
 678                        tsk->exit_signal : SIGCHLD;
 679                autoreap = do_notify_parent(tsk, sig);
 680        } else if (thread_group_leader(tsk)) {
 681                autoreap = thread_group_empty(tsk) &&
 682                        do_notify_parent(tsk, tsk->exit_signal);
 683        } else {
 684                autoreap = true;
 685        }
 686
 687        if (autoreap) {
 688                tsk->exit_state = EXIT_DEAD;
 689                list_add(&tsk->ptrace_entry, &dead);
 690        }
 691
 692        /* mt-exec, de_thread() is waiting for group leader */
 693        if (unlikely(tsk->signal->notify_count < 0))
 694                wake_up_process(tsk->signal->group_exit_task);
 695        write_unlock_irq(&tasklist_lock);
 696
 697        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 698                list_del_init(&p->ptrace_entry);
 699                release_task(p);
 700        }
 701}
 702
 703#ifdef CONFIG_DEBUG_STACK_USAGE
 704static void check_stack_usage(void)
 705{
 706        static DEFINE_SPINLOCK(low_water_lock);
 707        static int lowest_to_date = THREAD_SIZE;
 708        unsigned long free;
 709
 710        free = stack_not_used(current);
 711
 712        if (free >= lowest_to_date)
 713                return;
 714
 715        spin_lock(&low_water_lock);
 716        if (free < lowest_to_date) {
 717                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
 718                        current->comm, task_pid_nr(current), free);
 719                lowest_to_date = free;
 720        }
 721        spin_unlock(&low_water_lock);
 722}
 723#else
 724static inline void check_stack_usage(void) {}
 725#endif
 726
 727void __noreturn do_exit(long code)
 728{
 729        struct task_struct *tsk = current;
 730        int group_dead;
 731
 732        /*
 733         * We can get here from a kernel oops, sometimes with preemption off.
 734         * Start by checking for critical errors.
 735         * Then fix up important state like USER_DS and preemption.
 736         * Then do everything else.
 737         */
 738
 739        WARN_ON(blk_needs_flush_plug(tsk));
 740
 741        if (unlikely(in_interrupt()))
 742                panic("Aiee, killing interrupt handler!");
 743        if (unlikely(!tsk->pid))
 744                panic("Attempted to kill the idle task!");
 745
 746        /*
  747         * If do_exit is called because this process oopsed, it's possible
 748         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
 749         * continuing. Amongst other possible reasons, this is to prevent
 750         * mm_release()->clear_child_tid() from writing to a user-controlled
 751         * kernel address.
 752         */
 753        force_uaccess_begin();
 754
 755        if (unlikely(in_atomic())) {
 756                pr_info("note: %s[%d] exited with preempt_count %d\n",
 757                        current->comm, task_pid_nr(current),
 758                        preempt_count());
 759                preempt_count_set(PREEMPT_ENABLED);
 760        }
 761
 762        profile_task_exit(tsk);
 763        kcov_task_exit(tsk);
 764
 765        ptrace_event(PTRACE_EVENT_EXIT, code);
 766
 767        validate_creds_for_do_exit(tsk);
 768
 769        /*
 770         * We're taking recursive faults here in do_exit. Safest is to just
 771         * leave this task alone and wait for reboot.
 772         */
 773        if (unlikely(tsk->flags & PF_EXITING)) {
 774                pr_alert("Fixing recursive fault but reboot is needed!\n");
 775                futex_exit_recursive(tsk);
 776                set_current_state(TASK_UNINTERRUPTIBLE);
 777                schedule();
 778        }
 779
 780        io_uring_files_cancel(tsk->files);
 781        exit_signals(tsk);  /* sets PF_EXITING */
 782
 783        /* sync mm's RSS info before statistics gathering */
 784        if (tsk->mm)
 785                sync_mm_rss(tsk->mm);
 786        acct_update_integrals(tsk);
 787        group_dead = atomic_dec_and_test(&tsk->signal->live);
 788        if (group_dead) {
 789                /*
 790                 * If the last thread of global init has exited, panic
 791                 * immediately to get a useable coredump.
 792                 */
 793                if (unlikely(is_global_init(tsk)))
 794                        panic("Attempted to kill init! exitcode=0x%08x\n",
 795                                tsk->signal->group_exit_code ?: (int)code);
 796
 797#ifdef CONFIG_POSIX_TIMERS
 798                hrtimer_cancel(&tsk->signal->real_timer);
 799                exit_itimers(tsk->signal);
 800#endif
 801                if (tsk->mm)
 802                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 803        }
 804        acct_collect(code, group_dead);
 805        if (group_dead)
 806                tty_audit_exit();
 807        audit_free(tsk);
 808
 809        tsk->exit_code = code;
 810        taskstats_exit(tsk, group_dead);
 811
 812        exit_mm();
 813
 814        if (group_dead)
 815                acct_process();
 816        trace_sched_process_exit(tsk);
 817
 818        exit_sem(tsk);
 819        exit_shm(tsk);
 820        exit_files(tsk);
 821        exit_fs(tsk);
 822        if (group_dead)
 823                disassociate_ctty(1);
 824        exit_task_namespaces(tsk);
 825        exit_task_work(tsk);
 826        exit_thread(tsk);
 827
 828        /*
 829         * Flush inherited counters to the parent - before the parent
 830         * gets woken up by child-exit notifications.
 831         *
  832         * Because of cgroup mode, this must be called before cgroup_exit()
 833         */
 834        perf_event_exit_task(tsk);
 835
 836        sched_autogroup_exit_task(tsk);
 837        cgroup_exit(tsk);
 838
 839        /*
 840         * FIXME: do that only when needed, using sched_exit tracepoint
 841         */
 842        flush_ptrace_hw_breakpoint(tsk);
 843
 844        exit_tasks_rcu_start();
 845        exit_notify(tsk, group_dead);
 846        proc_exit_connector(tsk);
 847        mpol_put_task_policy(tsk);
 848#ifdef CONFIG_FUTEX
 849        if (unlikely(current->pi_state_cache))
 850                kfree(current->pi_state_cache);
 851#endif
 852        /*
 853         * Make sure we are holding no locks:
 854         */
 855        debug_check_no_locks_held();
 856
 857        if (tsk->io_context)
 858                exit_io_context(tsk);
 859
 860        if (tsk->splice_pipe)
 861                free_pipe_info(tsk->splice_pipe);
 862
 863        if (tsk->task_frag.page)
 864                put_page(tsk->task_frag.page);
 865
 866        validate_creds_for_do_exit(tsk);
 867
 868        check_stack_usage();
 869        preempt_disable();
 870        if (tsk->nr_dirtied)
 871                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 872        exit_rcu();
 873        exit_tasks_rcu_finish();
 874
 875        lockdep_free_task(tsk);
 876        do_task_dead();
 877}
 878EXPORT_SYMBOL_GPL(do_exit);
 879
 880void complete_and_exit(struct completion *comp, long code)
 881{
 882        if (comp)
 883                complete(comp);
 884
 885        do_exit(code);
 886}
 887EXPORT_SYMBOL(complete_and_exit);
 888
 889SYSCALL_DEFINE1(exit, int, error_code)
 890{
 891        do_exit((error_code&0xff)<<8);
 892}
 893
 894/*
 895 * Take down every thread in the group.  This is called by fatal signals
 896 * as well as by sys_exit_group (below).
 897 */
 898void
 899do_group_exit(int exit_code)
 900{
 901        struct signal_struct *sig = current->signal;
 902
 903        BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 904
 905        if (signal_group_exit(sig))
 906                exit_code = sig->group_exit_code;
 907        else if (!thread_group_empty(current)) {
 908                struct sighand_struct *const sighand = current->sighand;
 909
 910                spin_lock_irq(&sighand->siglock);
 911                if (signal_group_exit(sig))
 912                        /* Another thread got here before we took the lock.  */
 913                        exit_code = sig->group_exit_code;
 914                else {
 915                        sig->group_exit_code = exit_code;
 916                        sig->flags = SIGNAL_GROUP_EXIT;
 917                        zap_other_threads(current);
 918                }
 919                spin_unlock_irq(&sighand->siglock);
 920        }
 921
 922        do_exit(exit_code);
 923        /* NOTREACHED */
 924}
 925
 926/*
  927 * This kills every thread in the thread group. Note that any externally
 928 * wait4()-ing process will get the correct exit code - even if this
 929 * thread is not the thread group leader.
 930 */
 931SYSCALL_DEFINE1(exit_group, int, error_code)
 932{
 933        do_group_exit((error_code & 0xff) << 8);
 934        /* NOTREACHED */
 935        return 0;
 936}
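
/*
 * The (error_code & 0xff) << 8 encoding above is what the user-space
 * wait*() status macros decode. A minimal user-space sketch (not part of
 * this file):
 *
 *	int status;
 *	if (fork() == 0)
 *		exit(3);		// becomes (3 & 0xff) << 8 in the kernel
 *	wait(&status);
 *	// WIFEXITED(status) is true, WEXITSTATUS(status) == 3
 */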
 937
 938struct waitid_info {
 939        pid_t pid;
 940        uid_t uid;
 941        int status;
 942        int cause;
 943};
 944
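/*
 * Bookkeeping for one wait4()/waitid()-style call: what we are waiting for
 * (wo_type, wo_pid, wo_flags), where to report the result (wo_info,
 * wo_stat, wo_rusage), and the wait-queue entry used to sleep on the
 * parent's ->signal->wait_chldexit.
 */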
 945struct wait_opts {
 946        enum pid_type           wo_type;
 947        int                     wo_flags;
 948        struct pid              *wo_pid;
 949
 950        struct waitid_info      *wo_info;
 951        int                     wo_stat;
 952        struct rusage           *wo_rusage;
 953
 954        wait_queue_entry_t              child_wait;
 955        int                     notask_error;
 956};
 957
 958static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 959{
 960        return  wo->wo_type == PIDTYPE_MAX ||
 961                task_pid_type(p, wo->wo_type) == wo->wo_pid;
 962}
 963
 964static int
 965eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 966{
 967        if (!eligible_pid(wo, p))
 968                return 0;
 969
 970        /*
 971         * Wait for all children (clone and not) if __WALL is set or
 972         * if it is traced by us.
 973         */
 974        if (ptrace || (wo->wo_flags & __WALL))
 975                return 1;
 976
 977        /*
 978         * Otherwise, wait for clone children *only* if __WCLONE is set;
 979         * otherwise, wait for non-clone children *only*.
 980         *
 981         * Note: a "clone" child here is one that reports to its parent
 982         * using a signal other than SIGCHLD, or a non-leader thread which
 983         * we can only see if it is traced by us.
 984         */
 985        if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
 986                return 0;
 987
 988        return 1;
 989}
 990
 991/*
 992 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 993 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 994 * the lock and this task is uninteresting.  If we return nonzero, we have
 995 * released the lock and the system call should return.
 996 */
 997static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 998{
 999        int state, status;
1000        pid_t pid = task_pid_vnr(p);
1001        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1002        struct waitid_info *infop;
1003
1004        if (!likely(wo->wo_flags & WEXITED))
1005                return 0;
1006
1007        if (unlikely(wo->wo_flags & WNOWAIT)) {
1008                status = p->exit_code;
1009                get_task_struct(p);
1010                read_unlock(&tasklist_lock);
1011                sched_annotate_sleep();
1012                if (wo->wo_rusage)
1013                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1014                put_task_struct(p);
1015                goto out_info;
1016        }
1017        /*
1018         * Move the task's state to DEAD/TRACE, only one thread can do this.
1019         */
1020        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
1021                EXIT_TRACE : EXIT_DEAD;
1022        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1023                return 0;
1024        /*
1025         * We own this thread, nobody else can reap it.
1026         */
1027        read_unlock(&tasklist_lock);
1028        sched_annotate_sleep();
1029
1030        /*
1031         * Check thread_group_leader() to exclude the traced sub-threads.
1032         */
1033        if (state == EXIT_DEAD && thread_group_leader(p)) {
1034                struct signal_struct *sig = p->signal;
1035                struct signal_struct *psig = current->signal;
1036                unsigned long maxrss;
1037                u64 tgutime, tgstime;
1038
1039                /*
1040                 * The resource counters for the group leader are in its
1041                 * own task_struct.  Those for dead threads in the group
1042                 * are in its signal_struct, as are those for the child
1043                 * processes it has previously reaped.  All these
1044                 * accumulate in the parent's signal_struct c* fields.
1045                 *
1046                 * We don't bother to take a lock here to protect these
1047                 * p->signal fields because the whole thread group is dead
1048                 * and nobody can change them.
1049                 *
 1050                 * psig->stats_lock also protects us from our sub-threads
1051                 * which can reap other children at the same time. Until
1052                 * we change k_getrusage()-like users to rely on this lock
1053                 * we have to take ->siglock as well.
1054                 *
1055                 * We use thread_group_cputime_adjusted() to get times for
1056                 * the thread group, which consolidates times for all threads
1057                 * in the group including the group leader.
1058                 */
1059                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1060                spin_lock_irq(&current->sighand->siglock);
1061                write_seqlock(&psig->stats_lock);
1062                psig->cutime += tgutime + sig->cutime;
1063                psig->cstime += tgstime + sig->cstime;
1064                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1065                psig->cmin_flt +=
1066                        p->min_flt + sig->min_flt + sig->cmin_flt;
1067                psig->cmaj_flt +=
1068                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1069                psig->cnvcsw +=
1070                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
1071                psig->cnivcsw +=
1072                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
1073                psig->cinblock +=
1074                        task_io_get_inblock(p) +
1075                        sig->inblock + sig->cinblock;
1076                psig->coublock +=
1077                        task_io_get_oublock(p) +
1078                        sig->oublock + sig->coublock;
1079                maxrss = max(sig->maxrss, sig->cmaxrss);
1080                if (psig->cmaxrss < maxrss)
1081                        psig->cmaxrss = maxrss;
1082                task_io_accounting_add(&psig->ioac, &p->ioac);
1083                task_io_accounting_add(&psig->ioac, &sig->ioac);
1084                write_sequnlock(&psig->stats_lock);
1085                spin_unlock_irq(&current->sighand->siglock);
1086        }
1087
1088        if (wo->wo_rusage)
1089                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1090        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1091                ? p->signal->group_exit_code : p->exit_code;
1092        wo->wo_stat = status;
1093
1094        if (state == EXIT_TRACE) {
1095                write_lock_irq(&tasklist_lock);
1096                /* We dropped tasklist, ptracer could die and untrace */
1097                ptrace_unlink(p);
1098
1099                /* If parent wants a zombie, don't release it now */
1100                state = EXIT_ZOMBIE;
1101                if (do_notify_parent(p, p->exit_signal))
1102                        state = EXIT_DEAD;
1103                p->exit_state = state;
1104                write_unlock_irq(&tasklist_lock);
1105        }
1106        if (state == EXIT_DEAD)
1107                release_task(p);
1108
1109out_info:
1110        infop = wo->wo_info;
1111        if (infop) {
1112                if ((status & 0x7f) == 0) {
1113                        infop->cause = CLD_EXITED;
1114                        infop->status = status >> 8;
1115                } else {
1116                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1117                        infop->status = status & 0x7f;
1118                }
1119                infop->pid = pid;
1120                infop->uid = uid;
1121        }
1122
1123        return pid;
1124}
1125
1126static int *task_stopped_code(struct task_struct *p, bool ptrace)
1127{
1128        if (ptrace) {
1129                if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1130                        return &p->exit_code;
1131        } else {
1132                if (p->signal->flags & SIGNAL_STOP_STOPPED)
1133                        return &p->signal->group_exit_code;
1134        }
1135        return NULL;
1136}
1137
1138/**
1139 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1140 * @wo: wait options
1141 * @ptrace: is the wait for ptrace
1142 * @p: task to wait for
1143 *
1144 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1145 *
1146 * CONTEXT:
1147 * read_lock(&tasklist_lock), which is released if return value is
1148 * non-zero.  Also, grabs and releases @p->sighand->siglock.
1149 *
1150 * RETURNS:
1151 * 0 if wait condition didn't exist and search for other wait conditions
1152 * should continue.  Non-zero return, -errno on failure and @p's pid on
1153 * success, implies that tasklist_lock is released and wait condition
1154 * search should terminate.
1155 */
1156static int wait_task_stopped(struct wait_opts *wo,
1157                                int ptrace, struct task_struct *p)
1158{
1159        struct waitid_info *infop;
1160        int exit_code, *p_code, why;
1161        uid_t uid = 0; /* unneeded, required by compiler */
1162        pid_t pid;
1163
1164        /*
1165         * Traditionally we see ptrace'd stopped tasks regardless of options.
1166         */
1167        if (!ptrace && !(wo->wo_flags & WUNTRACED))
1168                return 0;
1169
1170        if (!task_stopped_code(p, ptrace))
1171                return 0;
1172
1173        exit_code = 0;
1174        spin_lock_irq(&p->sighand->siglock);
1175
1176        p_code = task_stopped_code(p, ptrace);
1177        if (unlikely(!p_code))
1178                goto unlock_sig;
1179
1180        exit_code = *p_code;
1181        if (!exit_code)
1182                goto unlock_sig;
1183
1184        if (!unlikely(wo->wo_flags & WNOWAIT))
1185                *p_code = 0;
1186
1187        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1188unlock_sig:
1189        spin_unlock_irq(&p->sighand->siglock);
1190        if (!exit_code)
1191                return 0;
1192
1193        /*
1194         * Now we are pretty sure this task is interesting.
1195         * Make sure it doesn't get reaped out from under us while we
1196         * give up the lock and then examine it below.  We don't want to
1197         * keep holding onto the tasklist_lock while we call getrusage and
1198         * possibly take page faults for user memory.
1199         */
1200        get_task_struct(p);
1201        pid = task_pid_vnr(p);
1202        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1203        read_unlock(&tasklist_lock);
1204        sched_annotate_sleep();
1205        if (wo->wo_rusage)
1206                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1207        put_task_struct(p);
1208
1209        if (likely(!(wo->wo_flags & WNOWAIT)))
1210                wo->wo_stat = (exit_code << 8) | 0x7f;
1211
1212        infop = wo->wo_info;
1213        if (infop) {
1214                infop->cause = why;
1215                infop->status = exit_code;
1216                infop->pid = pid;
1217                infop->uid = uid;
1218        }
1219        return pid;
1220}
1221
1222/*
1223 * Handle do_wait work for one task in a live, non-stopped state.
1224 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1225 * the lock and this task is uninteresting.  If we return nonzero, we have
1226 * released the lock and the system call should return.
1227 */
1228static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1229{
1230        struct waitid_info *infop;
1231        pid_t pid;
1232        uid_t uid;
1233
1234        if (!unlikely(wo->wo_flags & WCONTINUED))
1235                return 0;
1236
1237        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1238                return 0;
1239
1240        spin_lock_irq(&p->sighand->siglock);
1241        /* Re-check with the lock held.  */
1242        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1243                spin_unlock_irq(&p->sighand->siglock);
1244                return 0;
1245        }
1246        if (!unlikely(wo->wo_flags & WNOWAIT))
1247                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1248        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1249        spin_unlock_irq(&p->sighand->siglock);
1250
1251        pid = task_pid_vnr(p);
1252        get_task_struct(p);
1253        read_unlock(&tasklist_lock);
1254        sched_annotate_sleep();
1255        if (wo->wo_rusage)
1256                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1257        put_task_struct(p);
1258
1259        infop = wo->wo_info;
1260        if (!infop) {
1261                wo->wo_stat = 0xffff;
1262        } else {
1263                infop->cause = CLD_CONTINUED;
1264                infop->pid = pid;
1265                infop->uid = uid;
1266                infop->status = SIGCONT;
1267        }
1268        return pid;
1269}
1270
1271/*
1272 * Consider @p for a wait by @parent.
1273 *
1274 * -ECHILD should be in ->notask_error before the first call.
1275 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1276 * Returns zero if the search for a child should continue;
1277 * then ->notask_error is 0 if @p is an eligible child,
1278 * or still -ECHILD.
1279 */
1280static int wait_consider_task(struct wait_opts *wo, int ptrace,
1281                                struct task_struct *p)
1282{
1283        /*
1284         * We can race with wait_task_zombie() from another thread.
1285         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1286         * can't confuse the checks below.
1287         */
1288        int exit_state = READ_ONCE(p->exit_state);
1289        int ret;
1290
1291        if (unlikely(exit_state == EXIT_DEAD))
1292                return 0;
1293
1294        ret = eligible_child(wo, ptrace, p);
1295        if (!ret)
1296                return ret;
1297
1298        if (unlikely(exit_state == EXIT_TRACE)) {
1299                /*
1300                 * ptrace == 0 means we are the natural parent. In this case
1301                 * we should clear notask_error, debugger will notify us.
1302                 */
1303                if (likely(!ptrace))
1304                        wo->notask_error = 0;
1305                return 0;
1306        }
1307
1308        if (likely(!ptrace) && unlikely(p->ptrace)) {
1309                /*
1310                 * If it is traced by its real parent's group, just pretend
1311                 * the caller is ptrace_do_wait() and reap this child if it
1312                 * is zombie.
1313                 *
1314                 * This also hides group stop state from real parent; otherwise
1315                 * a single stop can be reported twice as group and ptrace stop.
1316                 * If a ptracer wants to distinguish these two events for its
1317                 * own children it should create a separate process which takes
1318                 * the role of real parent.
1319                 */
1320                if (!ptrace_reparented(p))
1321                        ptrace = 1;
1322        }
1323
1324        /* slay zombie? */
1325        if (exit_state == EXIT_ZOMBIE) {
1326                /* we don't reap group leaders with subthreads */
1327                if (!delay_group_leader(p)) {
1328                        /*
1329                         * A zombie ptracee is only visible to its ptracer.
1330                         * Notification and reaping will be cascaded to the
1331                         * real parent when the ptracer detaches.
1332                         */
1333                        if (unlikely(ptrace) || likely(!p->ptrace))
1334                                return wait_task_zombie(wo, p);
1335                }
1336
1337                /*
1338                 * Allow access to stopped/continued state via zombie by
1339                 * falling through.  Clearing of notask_error is complex.
1340                 *
1341                 * When !@ptrace:
1342                 *
 1343                 * If WEXITED is set, notask_error should naturally be
 1344                 * cleared.  If not, a subset of WSTOPPED|WCONTINUED is set,
 1345                 * so, if there are live subthreads, there are events to
 1346                 * wait for.  If all subthreads are dead, it's still safe
 1347                 * to clear - this function will be called again in a finite
 1348                 * amount of time once all the subthreads are released and
 1349                 * will then return without clearing.
1350                 *
1351                 * When @ptrace:
1352                 *
1353                 * Stopped state is per-task and thus can't change once the
1354                 * target task dies.  Only continued and exited can happen.
1355                 * Clear notask_error if WCONTINUED | WEXITED.
1356                 */
1357                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1358                        wo->notask_error = 0;
1359        } else {
1360                /*
1361                 * @p is alive and it's gonna stop, continue or exit, so
1362                 * there always is something to wait for.
1363                 */
1364                wo->notask_error = 0;
1365        }
1366
1367        /*
1368         * Wait for stopped.  Depending on @ptrace, different stopped state
1369         * is used and the two don't interact with each other.
1370         */
1371        ret = wait_task_stopped(wo, ptrace, p);
1372        if (ret)
1373                return ret;
1374
1375        /*
1376         * Wait for continued.  There's only one continued state and the
1377         * ptracer can consume it which can confuse the real parent.  Don't
1378         * use WCONTINUED from ptracer.  You don't need or want it.
1379         */
1380        return wait_task_continued(wo, p);
1381}
1382
1383/*
1384 * Do the work of do_wait() for one thread in the group, @tsk.
1385 *
1386 * -ECHILD should be in ->notask_error before the first call.
1387 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1388 * Returns zero if the search for a child should continue; then
1389 * ->notask_error is 0 if there were any eligible children,
1390 * or still -ECHILD.
1391 */
1392static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1393{
1394        struct task_struct *p;
1395
1396        list_for_each_entry(p, &tsk->children, sibling) {
1397                int ret = wait_consider_task(wo, 0, p);
1398
1399                if (ret)
1400                        return ret;
1401        }
1402
1403        return 0;
1404}
1405
1406static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1407{
1408        struct task_struct *p;
1409
1410        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1411                int ret = wait_consider_task(wo, 1, p);
1412
1413                if (ret)
1414                        return ret;
1415        }
1416
1417        return 0;
1418}
1419
1420static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1421                                int sync, void *key)
1422{
1423        struct wait_opts *wo = container_of(wait, struct wait_opts,
1424                                                child_wait);
1425        struct task_struct *p = key;
1426
1427        if (!eligible_pid(wo, p))
1428                return 0;
1429
1430        if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1431                return 0;
1432
1433        return default_wake_function(wait, mode, sync, key);
1434}
1435
1436void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1437{
1438        __wake_up_sync_key(&parent->signal->wait_chldexit,
1439                           TASK_INTERRUPTIBLE, p);
1440}
1441
1442static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
1443                                 struct task_struct *target)
1444{
1445        struct task_struct *parent =
1446                !ptrace ? target->real_parent : target->parent;
1447
1448        return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
1449                                     same_thread_group(current, parent));
1450}
1451
1452/*
1453 * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
1454 * and tracee lists to find the target task.
1455 */
1456static int do_wait_pid(struct wait_opts *wo)
1457{
1458        bool ptrace;
1459        struct task_struct *target;
1460        int retval;
1461
1462        ptrace = false;
1463        target = pid_task(wo->wo_pid, PIDTYPE_TGID);
1464        if (target && is_effectively_child(wo, ptrace, target)) {
1465                retval = wait_consider_task(wo, ptrace, target);
1466                if (retval)
1467                        return retval;
1468        }
1469
1470        ptrace = true;
1471        target = pid_task(wo->wo_pid, PIDTYPE_PID);
1472        if (target && target->ptrace &&
1473            is_effectively_child(wo, ptrace, target)) {
1474                retval = wait_consider_task(wo, ptrace, target);
1475                if (retval)
1476                        return retval;
1477        }
1478
1479        return 0;
1480}
1481
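/*
 * Core loop of the wait*() family: scan for an eligible child (with a fast
 * path for PIDTYPE_PID via do_wait_pid()), and if nothing is ready and
 * WNOHANG is not set, sleep on ->signal->wait_chldexit until
 * child_wait_callback() wakes us for a matching child.
 */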
1482static long do_wait(struct wait_opts *wo)
1483{
1484        int retval;
1485
1486        trace_sched_process_wait(wo->wo_pid);
1487
1488        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1489        wo->child_wait.private = current;
1490        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1491repeat:
1492        /*
1493         * If there is nothing that can match our criteria, just get out.
1494         * We will clear ->notask_error to zero if we see any child that
1495         * might later match our criteria, even if we are not able to reap
1496         * it yet.
1497         */
1498        wo->notask_error = -ECHILD;
1499        if ((wo->wo_type < PIDTYPE_MAX) &&
1500           (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
1501                goto notask;
1502
1503        set_current_state(TASK_INTERRUPTIBLE);
1504        read_lock(&tasklist_lock);
1505
1506        if (wo->wo_type == PIDTYPE_PID) {
1507                retval = do_wait_pid(wo);
1508                if (retval)
1509                        goto end;
1510        } else {
1511                struct task_struct *tsk = current;
1512
1513                do {
1514                        retval = do_wait_thread(wo, tsk);
1515                        if (retval)
1516                                goto end;
1517
1518                        retval = ptrace_do_wait(wo, tsk);
1519                        if (retval)
1520                                goto end;
1521
1522                        if (wo->wo_flags & __WNOTHREAD)
1523                                break;
1524                } while_each_thread(current, tsk);
1525        }
1526        read_unlock(&tasklist_lock);
1527
1528notask:
1529        retval = wo->notask_error;
1530        if (!retval && !(wo->wo_flags & WNOHANG)) {
1531                retval = -ERESTARTSYS;
1532                if (!signal_pending(current)) {
1533                        schedule();
1534                        goto repeat;
1535                }
1536        }
1537end:
1538        __set_current_state(TASK_RUNNING);
1539        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1540        return retval;
1541}
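/*
 * Illustrative sketch (not from the kernel sources): the WNOHANG handling
 * above is what gives userspace the choice between polling and blocking.
 * Roughly, with at least one waitable child:
 *
 *        int status;
 *
 *        waitpid(-1, &status, WNOHANG);  // returns 0 immediately if no
 *                                        // child has changed state yet
 *        waitpid(-1, &status, 0);        // sleeps in do_wait() until a
 *                                        // child wakes it, or a signal
 *                                        // arrives (-ERESTARTSYS, seen as
 *                                        // a restart or EINTR in userspace)
 */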
1542
1543static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1544                          int options, struct rusage *ru)
1545{
1546        struct wait_opts wo;
1547        struct pid *pid = NULL;
1548        enum pid_type type;
1549        long ret;
1550        unsigned int f_flags = 0;
1551
1552        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1553                        __WNOTHREAD|__WCLONE|__WALL))
1554                return -EINVAL;
1555        if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1556                return -EINVAL;
1557
1558        switch (which) {
1559        case P_ALL:
1560                type = PIDTYPE_MAX;
1561                break;
1562        case P_PID:
1563                type = PIDTYPE_PID;
1564                if (upid <= 0)
1565                        return -EINVAL;
1566
1567                pid = find_get_pid(upid);
1568                break;
1569        case P_PGID:
1570                type = PIDTYPE_PGID;
1571                if (upid < 0)
1572                        return -EINVAL;
1573
1574                if (upid)
1575                        pid = find_get_pid(upid);
1576                else
1577                        pid = get_task_pid(current, PIDTYPE_PGID);
1578                break;
1579        case P_PIDFD:
1580                type = PIDTYPE_PID;
1581                if (upid < 0)
1582                        return -EINVAL;
1583
1584                pid = pidfd_get_pid(upid, &f_flags);
1585                if (IS_ERR(pid))
1586                        return PTR_ERR(pid);
1587
1588                break;
1589        default:
1590                return -EINVAL;
1591        }
1592
1593        wo.wo_type      = type;
1594        wo.wo_pid       = pid;
1595        wo.wo_flags     = options;
1596        wo.wo_info      = infop;
1597        wo.wo_rusage    = ru;
1598        if (f_flags & O_NONBLOCK)
1599                wo.wo_flags |= WNOHANG;
1600
1601        ret = do_wait(&wo);
1602        if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
1603                ret = -EAGAIN;
1604
1605        put_pid(pid);
1606        return ret;
1607}
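/*
 * Illustrative sketch (not from the kernel sources; assumes headers that
 * define P_PIDFD, PIDFD_NONBLOCK and SYS_pidfd_open): the P_PIDFD case plus
 * the O_NONBLOCK -> WNOHANG / -EAGAIN handling above is what backs
 * non-blocking pidfd waits from userspace:
 *
 *        int pidfd = syscall(SYS_pidfd_open, child_pid, PIDFD_NONBLOCK);
 *        siginfo_t si = { 0 };
 *
 *        if (waitid((idtype_t)P_PIDFD, pidfd, &si, WEXITED) < 0 &&
 *            errno == EAGAIN)
 *                ;  // child has not exited yet; try again later
 */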
1608
1609SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1610                infop, int, options, struct rusage __user *, ru)
1611{
1612        struct rusage r;
1613        struct waitid_info info = {.status = 0};
1614        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1615        int signo = 0;
1616
1617        if (err > 0) {
1618                signo = SIGCHLD;
1619                err = 0;
1620                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1621                        return -EFAULT;
1622        }
1623        if (!infop)
1624                return err;
1625
1626        if (!user_write_access_begin(infop, sizeof(*infop)))
1627                return -EFAULT;
1628
1629        unsafe_put_user(signo, &infop->si_signo, Efault);
1630        unsafe_put_user(0, &infop->si_errno, Efault);
1631        unsafe_put_user(info.cause, &infop->si_code, Efault);
1632        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1633        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1634        unsafe_put_user(info.status, &infop->si_status, Efault);
1635        user_write_access_end();
1636        return err;
1637Efault:
1638        user_write_access_end();
1639        return -EFAULT;
1640}
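/*
 * Illustrative sketch (not from the kernel sources): the si_* fields filled
 * in above are what a waitid() caller inspects instead of a wait4() status
 * word, for example:
 *
 *        siginfo_t si = { 0 };
 *
 *        if (waitid(P_PID, child, &si, WEXITED) == 0 &&
 *            si.si_code == CLD_EXITED)
 *                printf("child %d exited with %d\n",
 *                       (int)si.si_pid, si.si_status);
 */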
1641
1642long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1643                  struct rusage *ru)
1644{
1645        struct wait_opts wo;
1646        struct pid *pid = NULL;
1647        enum pid_type type;
1648        long ret;
1649
1650        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1651                        __WNOTHREAD|__WCLONE|__WALL))
1652                return -EINVAL;
1653
1654        /* negating INT_MIN is undefined behaviour, so reject it outright */
1655        if (upid == INT_MIN)
1656                return -ESRCH;
1657
1658        if (upid == -1)
1659                type = PIDTYPE_MAX;
1660        else if (upid < 0) {
1661                type = PIDTYPE_PGID;
1662                pid = find_get_pid(-upid);
1663        } else if (upid == 0) {
1664                type = PIDTYPE_PGID;
1665                pid = get_task_pid(current, PIDTYPE_PGID);
1666        } else /* upid > 0 */ {
1667                type = PIDTYPE_PID;
1668                pid = find_get_pid(upid);
1669        }
1670
1671        wo.wo_type      = type;
1672        wo.wo_pid       = pid;
1673        wo.wo_flags     = options | WEXITED;
1674        wo.wo_info      = NULL;
1675        wo.wo_stat      = 0;
1676        wo.wo_rusage    = ru;
1677        ret = do_wait(&wo);
1678        put_pid(pid);
1679        if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1680                ret = -EFAULT;
1681
1682        return ret;
1683}
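/*
 * Illustrative sketch (not from the kernel sources): the upid decoding above
 * is the classic wait4()/waitpid() pid encoding:
 *
 *        waitpid(-1,    &status, 0);  // any child          (PIDTYPE_MAX)
 *        waitpid(0,     &status, 0);  // own process group  (PIDTYPE_PGID)
 *        waitpid(-pgid, &status, 0);  // process group pgid (PIDTYPE_PGID)
 *        waitpid(pid,   &status, 0);  // exactly pid        (PIDTYPE_PID)
 *
 * INT_MIN is rejected up front because -INT_MIN does not fit in an int.
 */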
1684
1685int kernel_wait(pid_t pid, int *stat)
1686{
1687        struct wait_opts wo = {
1688                .wo_type        = PIDTYPE_PID,
1689                .wo_pid         = find_get_pid(pid),
1690                .wo_flags       = WEXITED,
1691        };
1692        int ret;
1693
1694        ret = do_wait(&wo);
1695        if (ret > 0 && wo.wo_stat)
1696                *stat = wo.wo_stat;
1697        put_pid(wo.wo_pid);
1698        return ret;
1699}
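/*
 * Illustrative sketch (not from the kernel sources; my_helper_fn and this
 * call site are hypothetical): kernel_wait() lets kernel code reap a child
 * it spawned and read the wait status without any userspace buffer:
 *
 *        pid_t pid = kernel_thread(my_helper_fn, arg, SIGCHLD);
 *        int status = 0;
 *
 *        if (pid > 0 && kernel_wait(pid, &status) > 0)
 *                pr_info("helper exited, wait status 0x%x\n", status);
 */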
1700
1701SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1702                int, options, struct rusage __user *, ru)
1703{
1704        struct rusage r;
1705        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1706
1707        if (err > 0) {
1708                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1709                        return -EFAULT;
1710        }
1711        return err;
1712}
1713
1714#ifdef __ARCH_WANT_SYS_WAITPID
1715
1716/*
1717 * sys_waitpid() remains for compatibility. waitpid() should be
1718 * implemented by calling sys_wait4() from libc.a.
1719 */
1720SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1721{
1722        return kernel_wait4(pid, stat_addr, options, NULL);
1723}
1724
1725#endif
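/*
 * Illustrative sketch (not from the kernel sources): as the comment above
 * says, a C library can implement waitpid() purely in terms of wait4():
 *
 *        pid_t waitpid(pid_t pid, int *status, int options)
 *        {
 *                return wait4(pid, status, options, NULL);
 *        }
 */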
1726
1727#ifdef CONFIG_COMPAT
1728COMPAT_SYSCALL_DEFINE4(wait4,
1729        compat_pid_t, pid,
1730        compat_uint_t __user *, stat_addr,
1731        int, options,
1732        struct compat_rusage __user *, ru)
1733{
1734        struct rusage r;
1735        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1736        if (err > 0) {
1737                if (ru && put_compat_rusage(&r, ru))
1738                        return -EFAULT;
1739        }
1740        return err;
1741}
1742
1743COMPAT_SYSCALL_DEFINE5(waitid,
1744                int, which, compat_pid_t, pid,
1745                struct compat_siginfo __user *, infop, int, options,
1746                struct compat_rusage __user *, uru)
1747{
1748        struct rusage ru;
1749        struct waitid_info info = {.status = 0};
1750        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1751        int signo = 0;
1752        if (err > 0) {
1753                signo = SIGCHLD;
1754                err = 0;
1755                if (uru) {
1756                        /* kernel_waitid() overwrites everything in ru */
1757                        if (COMPAT_USE_64BIT_TIME)
1758                                err = copy_to_user(uru, &ru, sizeof(ru));
1759                        else
1760                                err = put_compat_rusage(&ru, uru);
1761                        if (err)
1762                                return -EFAULT;
1763                }
1764        }
1765
1766        if (!infop)
1767                return err;
1768
1769        if (!user_write_access_begin(infop, sizeof(*infop)))
1770                return -EFAULT;
1771
1772        unsafe_put_user(signo, &infop->si_signo, Efault);
1773        unsafe_put_user(0, &infop->si_errno, Efault);
1774        unsafe_put_user(info.cause, &infop->si_code, Efault);
1775        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1776        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1777        unsafe_put_user(info.status, &infop->si_status, Efault);
1778        user_write_access_end();
1779        return err;
1780Efault:
1781        user_write_access_end();
1782        return -EFAULT;
1783}
1784#endif
1785
1786/**
1787 * thread_group_exited - check that a thread group has exited
1788 * @pid: tgid of thread group to be checked.
1789 *
1790 * Test if the thread group represented by tgid has exited (all
1791 * threads are zombies, dead or completely gone).
1792 *
1793 * Return: true if the thread group has exited. false otherwise.
1794 */
1795bool thread_group_exited(struct pid *pid)
1796{
1797        struct task_struct *task;
1798        bool exited;
1799
1800        rcu_read_lock();
1801        task = pid_task(pid, PIDTYPE_PID);
1802        exited = !task ||
1803                (READ_ONCE(task->exit_state) && thread_group_empty(task));
1804        rcu_read_unlock();
1805
1806        return exited;
1807}
1808EXPORT_SYMBOL(thread_group_exited);
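/*
 * Illustrative sketch (not from the kernel sources): one user of this
 * predicate is pidfd polling (see pidfd_poll() in kernel/fork.c), which only
 * reports the pidfd readable once the whole thread group is done, so
 * userspace can combine polling with reaping:
 *
 *        struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
 *        siginfo_t si = { 0 };
 *
 *        if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
 *                waitid((idtype_t)P_PIDFD, pidfd, &si, WEXITED);
 */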
1809
1810__weak void abort(void)
1811{
1812        BUG();
1813
1814        /* if that doesn't kill us, halt */
1815        panic("Oops failed to kill thread");
1816}
1817EXPORT_SYMBOL(abort);
1818