// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>

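/*
 * Detach the exiting task from the pid hashes and the global thread/task
 * lists.  Called from __exit_signal() with tasklist_lock write-locked and
 * sighand->siglock held.
 */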
static void __unhash_process(struct task_struct *p, bool group_dead)
{
        nr_threads--;
        detach_pid(p, PIDTYPE_PID);
        if (group_dead) {
                detach_pid(p, PIDTYPE_TGID);
                detach_pid(p, PIDTYPE_PGID);
                detach_pid(p, PIDTYPE_SID);

                list_del_rcu(&p->tasks);
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
        }
        list_del_rcu(&p->thread_group);
        list_del_rcu(&p->thread_node);
}

/*
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        bool group_dead = thread_group_leader(tsk);
        struct sighand_struct *sighand;
        struct tty_struct *tty;
        u64 utime, stime;

        sighand = rcu_dereference_check(tsk->sighand,
                                        lockdep_tasklist_lock_is_held());
        spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
        posix_cpu_timers_exit(tsk);
        if (group_dead)
                posix_cpu_timers_exit_group(tsk);
#endif

        if (group_dead) {
                tty = sig->tty;
                sig->tty = NULL;
        } else {
                /*
                 * If there is any task waiting for the group exit
                 * then notify it:
                 */
                if (sig->notify_count > 0 && !--sig->notify_count)
                        wake_up_process(sig->group_exit_task);

                if (tsk == sig->curr_target)
                        sig->curr_target = next_thread(tsk);
        }

        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
                              sizeof(unsigned long long));

        /*
         * Accumulate here the counters for all threads as they die. We could
         * skip the group leader because it is the last user of signal_struct,
         * but we want to avoid the race with thread_group_cputime() which can
         * see the empty ->thread_head list.
         */
        task_cputime(tsk, &utime, &stime);
        write_seqlock(&sig->stats_lock);
        sig->utime += utime;
        sig->stime += stime;
        sig->gtime += task_gtime(tsk);
        sig->min_flt += tsk->min_flt;
        sig->maj_flt += tsk->maj_flt;
        sig->nvcsw += tsk->nvcsw;
        sig->nivcsw += tsk->nivcsw;
        sig->inblock += task_io_get_inblock(tsk);
        sig->oublock += task_io_get_oublock(tsk);
        task_io_accounting_add(&sig->ioac, &tsk->ioac);
        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
        sig->nr_threads--;
        __unhash_process(tsk, group_dead);
        write_sequnlock(&sig->stats_lock);

        /*
         * Do this under ->siglock, we can race with another thread
         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
         */
        flush_sigqueue(&tsk->pending);
        tsk->sighand = NULL;
        spin_unlock(&sighand->siglock);

        __cleanup_sighand(sighand);
        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
        if (group_dead) {
                flush_sigqueue(&sig->shared_pending);
                tty_kref_put(tty);
        }
}

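/*
 * Final task_struct reference drop, deferred by one RCU grace period so
 * that lockless walkers of the task lists never see a freed task.
 */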
static void delayed_put_task_struct(struct rcu_head *rhp)
{
        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

        perf_event_delayed_put(tsk);
        trace_sched_process_free(tsk);
        put_task_struct(tsk);
}

void put_task_struct_rcu_user(struct task_struct *task)
{
        if (refcount_dec_and_test(&task->rcu_users))
                call_rcu(&task->rcu, delayed_put_task_struct);
}

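/*
 * Final tear-down of a dead task: drop its per-user process count, unhash
 * it from the pid and thread lists, flush its /proc entries and release the
 * reference pinned by the tasklist.  May loop to also reap a zombie group
 * leader that was only waiting for this thread.
 */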
void release_task(struct task_struct *p)
{
        struct task_struct *leader;
        struct pid *thread_pid;
        int zap_leader;
repeat:
        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
        atomic_dec(&__task_cred(p)->user->processes);
        rcu_read_unlock();

        cgroup_release(p);

        write_lock_irq(&tasklist_lock);
        ptrace_release_task(p);
        thread_pid = get_pid(p->thread_pid);
        __exit_signal(p);

        /*
         * If we are the last non-leader member of the thread
         * group, and the leader is zombie, then notify the
         * group leader's parent process. (if it wants notification.)
         */
        zap_leader = 0;
        leader = p->group_leader;
        if (leader != p && thread_group_empty(leader)
                        && leader->exit_state == EXIT_ZOMBIE) {
                /*
                 * If we were the last child thread and the leader has
                 * exited already, and the leader's parent ignores SIGCHLD,
                 * then we are the one who should release the leader.
                 */
                zap_leader = do_notify_parent(leader, leader->exit_signal);
                if (zap_leader)
                        leader->exit_state = EXIT_DEAD;
        }

        write_unlock_irq(&tasklist_lock);
        seccomp_filter_release(p);
        proc_flush_pid(thread_pid);
        put_pid(thread_pid);
        release_thread(p);
        put_task_struct_rcu_user(p);

        p = leader;
        if (unlikely(zap_leader))
                goto repeat;
}

int rcuwait_wake_up(struct rcuwait *w)
{
        int ret = 0;
        struct task_struct *task;

        rcu_read_lock();

        /*
         * Order condition vs @task, such that everything prior to the load
         * of @task is visible. This is the condition as to why the user called
         * rcuwait_wake() in the first place. Pairs with set_current_state()
         * barrier (A) in rcuwait_wait_event().
         *
         *    WAIT                WAKE
         *    [S] tsk = current   [S] cond = true
         *        MB (A)              MB (B)
         *    [L] cond            [L] tsk
         */
        smp_mb(); /* (B) */

        task = rcu_dereference(w->task);
        if (task)
                ret = wake_up_process(task);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
                                        struct task_struct *ignored_task)
{
        struct task_struct *p;

        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                if ((p == ignored_task) ||
                    (p->exit_state && thread_group_empty(p)) ||
                    is_global_init(p->real_parent))
                        continue;

                if (task_pgrp(p->real_parent) != pgrp &&
                    task_session(p->real_parent) == task_session(p))
                        return 0;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);

        return 1;
}

int is_current_pgrp_orphaned(void)
{
        int retval;

        read_lock(&tasklist_lock);
        retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
        read_unlock(&tasklist_lock);

        return retval;
}

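/* Is any member of @pgrp currently in the job-control stopped state? */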
static bool has_stopped_jobs(struct pid *pgrp)
{
        struct task_struct *p;

        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                if (p->signal->flags & SIGNAL_STOP_STOPPED)
                        return true;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);

        return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
        struct pid *pgrp = task_pgrp(tsk);
        struct task_struct *ignored_task = tsk;

        if (!parent)
                /* exit: our father is in a different pgrp than
                 * we are and we were the only connection outside.
                 */
                parent = tsk->real_parent;
        else
                /* reparent: our child is in a different pgrp than
                 * we are, and it was the only connection outside.
                 */
                ignored_task = NULL;

        if (task_pgrp(parent) != pgrp &&
            task_session(parent) == task_session(tsk) &&
            will_become_orphaned_pgrp(pgrp, ignored_task) &&
            has_stopped_jobs(pgrp)) {
                __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
                __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
        }
}

#ifdef CONFIG_MEMCG
/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
        struct task_struct *c, *g, *p = current;

retry:
        /*
         * If the exiting or execing task is not the owner, it's
         * someone else's problem.
         */
        if (mm->owner != p)
                return;
        /*
         * The current owner is exiting/execing and there are no other
         * candidates.  Do not leave the mm pointing to a possibly
         * freed task structure.
         */
        if (atomic_read(&mm->mm_users) <= 1) {
                WRITE_ONCE(mm->owner, NULL);
                return;
        }

        read_lock(&tasklist_lock);
        /*
         * Search in the children
         */
        list_for_each_entry(c, &p->children, sibling) {
                if (c->mm == mm)
                        goto assign_new_owner;
        }

        /*
         * Search in the siblings
         */
        list_for_each_entry(c, &p->real_parent->children, sibling) {
                if (c->mm == mm)
                        goto assign_new_owner;
        }

        /*
         * Search through everything else, we should not get here often.
         */
        for_each_process(g) {
                if (g->flags & PF_KTHREAD)
                        continue;
                for_each_thread(g, c) {
                        if (c->mm == mm)
                                goto assign_new_owner;
                        if (c->mm)
                                break;
                }
        }
        read_unlock(&tasklist_lock);
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
         */
        WRITE_ONCE(mm->owner, NULL);
        return;

assign_new_owner:
        BUG_ON(c == p);
        get_task_struct(c);
        /*
         * The task_lock protects c->mm from changing.
         * We always want mm->owner->mm == mm
         */
        task_lock(c);
        /*
         * Delay read_unlock() till we have the task_lock()
         * to ensure that c does not slip away underneath us
         */
        read_unlock(&tasklist_lock);
        if (c->mm != mm) {
                task_unlock(c);
                put_task_struct(c);
                goto retry;
        }
        WRITE_ONCE(mm->owner, c);
        task_unlock(c);
        put_task_struct(c);
}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 */
static void exit_mm(void)
{
        struct mm_struct *mm = current->mm;
        struct core_state *core_state;

        exit_mm_release(current, mm);
        if (!mm)
                return;
        sync_mm_rss(mm);
        /*
         * Serialize with any possible pending coredump.
         * We must hold mmap_lock around checking core_state
         * and clearing tsk->mm.  The core-inducing thread
         * will increment ->nr_threads for each thread in the
         * group with ->mm != NULL.
         */
        mmap_read_lock(mm);
        core_state = mm->core_state;
        if (core_state) {
                struct core_thread self;

                mmap_read_unlock(mm);

                self.task = current;
                if (self.task->flags & PF_SIGNALED)
                        self.next = xchg(&core_state->dumper.next, &self);
                else
                        self.task = NULL;
                /*
                 * Implies mb(), the result of xchg() must be visible
                 * to core_state->dumper.
                 */
                if (atomic_dec_and_test(&core_state->nr_threads))
                        complete(&core_state->startup);

                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (!self.task) /* see coredump_finish() */
                                break;
                        freezable_schedule();
                }
                __set_current_state(TASK_RUNNING);
                mmap_read_lock(mm);
        }
        mmgrab(mm);
        BUG_ON(mm != current->active_mm);
        /* more a memory barrier than a real lock */
        task_lock(current);
        current->mm = NULL;
        mmap_read_unlock(mm);
        enter_lazy_tlb(mm, current);
        task_unlock(current);
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
                exit_oom_victim();
}

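/* Return the first live (!PF_EXITING) thread in @p's thread group, if any. */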
static struct task_struct *find_alive_thread(struct task_struct *p)
{
        struct task_struct *t;

        for_each_thread(p, t) {
                if (!(t->flags & PF_EXITING))
                        return t;
        }
        return NULL;
}

static struct task_struct *find_child_reaper(struct task_struct *father,
                                                struct list_head *dead)
        __releases(&tasklist_lock)
        __acquires(&tasklist_lock)
{
        struct pid_namespace *pid_ns = task_active_pid_ns(father);
        struct task_struct *reaper = pid_ns->child_reaper;
        struct task_struct *p, *n;

        if (likely(reaper != father))
                return reaper;

        reaper = find_alive_thread(father);
        if (reaper) {
                pid_ns->child_reaper = reaper;
                return reaper;
        }

        write_unlock_irq(&tasklist_lock);

        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }

        zap_pid_ns_processes(pid_ns);
        write_lock_irq(&tasklist_lock);

        return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give them to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give them to the init process (PID 1) in our pid namespace
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
                                           struct task_struct *child_reaper)
{
        struct task_struct *thread, *reaper;

        thread = find_alive_thread(father);
        if (thread)
                return thread;

        if (father->signal->has_child_subreaper) {
                unsigned int ns_level = task_pid(father)->level;
                /*
                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
                 * We can't check reaper != child_reaper to ensure we do not
                 * cross the namespaces, the exiting parent could be injected
                 * by setns() + fork().
                 * We check pid->level, this is slightly more efficient than
                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
                 */
                for (reaper = father->real_parent;
                     task_pid(reaper)->level == ns_level;
                     reaper = reaper->real_parent) {
                        if (reaper == &init_task)
                                break;
                        if (!reaper->signal->is_child_subreaper)
                                continue;
                        thread = find_alive_thread(reaper);
                        if (thread)
                                return thread;
                }
        }

        return child_reaper;
}

/*
 * Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
                                struct list_head *dead)
{
        if (unlikely(p->exit_state == EXIT_DEAD))
                return;

        /* We don't want people slaying init. */
        p->exit_signal = SIGCHLD;

        /* If it has exited notify the new parent about this child's death. */
        if (!p->ptrace &&
            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
                if (do_notify_parent(p, p->exit_signal)) {
                        p->exit_state = EXIT_DEAD;
                        list_add(&p->ptrace_entry, dead);
                }
        }

        kill_orphaned_pgrp(p, father);
}

/*
 * This does two things:
 *
 * A.  Make init inherit all the child processes
 * B.  Check to see if any process groups have become orphaned
 *      as a result of our exiting, and if they have any stopped
 *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void forget_original_parent(struct task_struct *father,
                                        struct list_head *dead)
{
        struct task_struct *p, *t, *reaper;

        if (unlikely(!list_empty(&father->ptraced)))
                exit_ptrace(father, dead);

        /* Can drop and reacquire tasklist_lock */
        reaper = find_child_reaper(father, dead);
        if (list_empty(&father->children))
                return;

        reaper = find_new_reaper(father, reaper);
        list_for_each_entry(p, &father->children, sibling) {
                for_each_thread(p, t) {
                        RCU_INIT_POINTER(t->real_parent, reaper);
                        BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
                        if (likely(!t->ptrace))
                                t->parent = t->real_parent;
                        if (t->pdeath_signal)
                                group_send_sig_info(t->pdeath_signal,
                                                    SEND_SIG_NOINFO, t,
                                                    PIDTYPE_TGID);
                }
                /*
                 * If this is a threaded reparent there is no need to
                 * notify anyone anything has happened.
                 */
                if (!same_thread_group(reaper, father))
                        reparent_leader(father, p, dead);
        }
        list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
        bool autoreap;
        struct task_struct *p, *n;
        LIST_HEAD(dead);

        write_lock_irq(&tasklist_lock);
        forget_original_parent(tsk, &dead);

        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);

        tsk->exit_state = EXIT_ZOMBIE;
        if (unlikely(tsk->ptrace)) {
                int sig = thread_group_leader(tsk) &&
                                thread_group_empty(tsk) &&
                                !ptrace_reparented(tsk) ?
                        tsk->exit_signal : SIGCHLD;
                autoreap = do_notify_parent(tsk, sig);
        } else if (thread_group_leader(tsk)) {
                autoreap = thread_group_empty(tsk) &&
                        do_notify_parent(tsk, tsk->exit_signal);
        } else {
                autoreap = true;
        }

        if (autoreap) {
                tsk->exit_state = EXIT_DEAD;
                list_add(&tsk->ptrace_entry, &dead);
        }

        /* mt-exec, de_thread() is waiting for group leader */
        if (unlikely(tsk->signal->notify_count < 0))
                wake_up_process(tsk->signal->group_exit_task);
        write_unlock_irq(&tasklist_lock);

        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }
}

#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
        static DEFINE_SPINLOCK(low_water_lock);
        static int lowest_to_date = THREAD_SIZE;
        unsigned long free;

        free = stack_not_used(current);

        if (free >= lowest_to_date)
                return;

        spin_lock(&low_water_lock);
        if (free < lowest_to_date) {
                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
                        current->comm, task_pid_nr(current), free);
                lowest_to_date = free;
        }
        spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif

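/*
 * Tear down the calling task: release its address space, files, namespaces
 * and other resources, notify the parent/reaper, and finally schedule away
 * for the last time.  Never returns.
 */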
void __noreturn do_exit(long code)
{
        struct task_struct *tsk = current;
        int group_dead;

        /*
         * We can get here from a kernel oops, sometimes with preemption off.
         * Start by checking for critical errors.
         * Then fix up important state like USER_DS and preemption.
         * Then do everything else.
         */

        WARN_ON(blk_needs_flush_plug(tsk));

        if (unlikely(in_interrupt()))
                panic("Aiee, killing interrupt handler!");
        if (unlikely(!tsk->pid))
                panic("Attempted to kill the idle task!");

        /*
         * If do_exit is called because this process oopsed, it's possible
         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
         * continuing. Amongst other possible reasons, this is to prevent
         * mm_release()->clear_child_tid() from writing to a user-controlled
         * kernel address.
         */
        force_uaccess_begin();

        if (unlikely(in_atomic())) {
                pr_info("note: %s[%d] exited with preempt_count %d\n",
                        current->comm, task_pid_nr(current),
                        preempt_count());
                preempt_count_set(PREEMPT_ENABLED);
        }

        profile_task_exit(tsk);
        kcov_task_exit(tsk);

        ptrace_event(PTRACE_EVENT_EXIT, code);

        validate_creds_for_do_exit(tsk);

        /*
         * We're taking recursive faults here in do_exit. Safest is to just
         * leave this task alone and wait for reboot.
         */
        if (unlikely(tsk->flags & PF_EXITING)) {
                pr_alert("Fixing recursive fault but reboot is needed!\n");
                futex_exit_recursive(tsk);
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule();
        }

        exit_signals(tsk);  /* sets PF_EXITING */

        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
                sync_mm_rss(tsk->mm);
        acct_update_integrals(tsk);
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                /*
                 * If the last thread of global init has exited, panic
                 * immediately to get a useable coredump.
                 */
                if (unlikely(is_global_init(tsk)))
                        panic("Attempted to kill init! exitcode=0x%08x\n",
                                tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
                hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk->signal);
#endif
                if (tsk->mm)
                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
        }
        acct_collect(code, group_dead);
        if (group_dead)
                tty_audit_exit();
        audit_free(tsk);

        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);

        exit_mm();

        if (group_dead)
                acct_process();
        trace_sched_process_exit(tsk);

        exit_sem(tsk);
        exit_shm(tsk);
        exit_files(tsk);
        exit_fs(tsk);
        if (group_dead)
                disassociate_ctty(1);
        exit_task_namespaces(tsk);
        exit_task_work(tsk);
        exit_thread(tsk);

        /*
         * Flush inherited counters to the parent - before the parent
         * gets woken up by child-exit notifications.
         *
         * because of cgroup mode, must be called before cgroup_exit()
         */
        perf_event_exit_task(tsk);

        sched_autogroup_exit_task(tsk);
        cgroup_exit(tsk);

        /*
         * FIXME: do that only when needed, using sched_exit tracepoint
         */
        flush_ptrace_hw_breakpoint(tsk);

        exit_tasks_rcu_start();
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
        mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
        if (unlikely(current->pi_state_cache))
                kfree(current->pi_state_cache);
#endif
        /*
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held();

        if (tsk->io_context)
                exit_io_context(tsk);

        if (tsk->splice_pipe)
                free_pipe_info(tsk->splice_pipe);

        if (tsk->task_frag.page)
                put_page(tsk->task_frag.page);

        validate_creds_for_do_exit(tsk);

        check_stack_usage();
        preempt_disable();
        if (tsk->nr_dirtied)
                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
        exit_rcu();
        exit_tasks_rcu_finish();

        lockdep_free_task(tsk);
        do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);

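/* Signal @comp (if any) and then terminate the calling task with @code. */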
void complete_and_exit(struct completion *comp, long code)
{
        if (comp)
                complete(comp);

        do_exit(code);
}
EXPORT_SYMBOL(complete_and_exit);

SYSCALL_DEFINE1(exit, int, error_code)
{
        do_exit((error_code&0xff)<<8);
}

/*
 * Take down every thread in the group.  This is called by fatal signals
 * as well as by sys_exit_group (below).
 */
void
do_group_exit(int exit_code)
{
        struct signal_struct *sig = current->signal;

        BUG_ON(exit_code & 0x80); /* core dumps don't get here */

        if (signal_group_exit(sig))
                exit_code = sig->group_exit_code;
        else if (!thread_group_empty(current)) {
                struct sighand_struct *const sighand = current->sighand;

                spin_lock_irq(&sighand->siglock);
                if (signal_group_exit(sig))
                        /* Another thread got here before we took the lock.  */
                        exit_code = sig->group_exit_code;
                else {
                        sig->group_exit_code = exit_code;
                        sig->flags = SIGNAL_GROUP_EXIT;
                        zap_other_threads(current);
                }
                spin_unlock_irq(&sighand->siglock);
        }

        do_exit(exit_code);
        /* NOTREACHED */
}

/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
        do_group_exit((error_code & 0xff) << 8);
        /* NOTREACHED */
        return 0;
}

struct waitid_info {
        pid_t pid;
        uid_t uid;
        int status;
        int cause;
};

struct wait_opts {
        enum pid_type           wo_type;
        int                     wo_flags;
        struct pid              *wo_pid;

        struct waitid_info      *wo_info;
        int                     wo_stat;
        struct rusage           *wo_rusage;

        wait_queue_entry_t              child_wait;
        int                     notask_error;
};

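/* Does @p match the pid this wait is looking for?  PIDTYPE_MAX waits for anything. */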
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
        return  wo->wo_type == PIDTYPE_MAX ||
                task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
        if (!eligible_pid(wo, p))
                return 0;

        /*
         * Wait for all children (clone and not) if __WALL is set or
         * if it is traced by us.
         */
        if (ptrace || (wo->wo_flags & __WALL))
                return 1;

        /*
         * Otherwise, wait for clone children *only* if __WCLONE is set;
         * otherwise, wait for non-clone children *only*.
         *
         * Note: a "clone" child here is one that reports to its parent
         * using a signal other than SIGCHLD, or a non-leader thread which
         * we can only see if it is traced by us.
         */
        if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
                return 0;

        return 1;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
        int state, status;
        pid_t pid = task_pid_vnr(p);
        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
        struct waitid_info *infop;

        if (!likely(wo->wo_flags & WEXITED))
                return 0;

        if (unlikely(wo->wo_flags & WNOWAIT)) {
                status = p->exit_code;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
                sched_annotate_sleep();
                if (wo->wo_rusage)
                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
                put_task_struct(p);
                goto out_info;
        }
        /*
         * Move the task's state to DEAD/TRACE, only one thread can do this.
         */
        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
                EXIT_TRACE : EXIT_DEAD;
        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
                return 0;
        /*
         * We own this thread, nobody else can reap it.
         */
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();

        /*
         * Check thread_group_leader() to exclude the traced sub-threads.
         */
        if (state == EXIT_DEAD && thread_group_leader(p)) {
                struct signal_struct *sig = p->signal;
                struct signal_struct *psig = current->signal;
                unsigned long maxrss;
                u64 tgutime, tgstime;

                /*
                 * The resource counters for the group leader are in its
                 * own task_struct.  Those for dead threads in the group
                 * are in its signal_struct, as are those for the child
                 * processes it has previously reaped.  All these
                 * accumulate in the parent's signal_struct c* fields.
                 *
                 * We don't bother to take a lock here to protect these
                 * p->signal fields because the whole thread group is dead
                 * and nobody can change them.
                 *
                 * psig->stats_lock also protects us from our sub-threads
                 * which can reap other children at the same time. Until
                 * we change k_getrusage()-like users to rely on this lock
                 * we have to take ->siglock as well.
                 *
                 * We use thread_group_cputime_adjusted() to get times for
                 * the thread group, which consolidates times for all threads
                 * in the group including the group leader.
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                spin_lock_irq(&current->sighand->siglock);
                write_seqlock(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
                psig->cnvcsw +=
                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
                psig->cnivcsw +=
                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
                psig->cinblock +=
                        task_io_get_inblock(p) +
                        sig->inblock + sig->cinblock;
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
                maxrss = max(sig->maxrss, sig->cmaxrss);
                if (psig->cmaxrss < maxrss)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
                write_sequnlock(&psig->stats_lock);
                spin_unlock_irq(&current->sighand->siglock);
        }

        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        wo->wo_stat = status;

        if (state == EXIT_TRACE) {
                write_lock_irq(&tasklist_lock);
                /* We dropped tasklist, ptracer could die and untrace */
                ptrace_unlink(p);

                /* If parent wants a zombie, don't release it now */
                state = EXIT_ZOMBIE;
                if (do_notify_parent(p, p->exit_signal))
                        state = EXIT_DEAD;
                p->exit_state = state;
                write_unlock_irq(&tasklist_lock);
        }
        if (state == EXIT_DEAD)
                release_task(p);

out_info:
        infop = wo->wo_info;
        if (infop) {
                if ((status & 0x7f) == 0) {
                        infop->cause = CLD_EXITED;
                        infop->status = status >> 8;
                } else {
                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        infop->status = status & 0x7f;
                }
                infop->pid = pid;
                infop->uid = uid;
        }

        return pid;
}

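/*
 * Return a pointer to the stop code to report for @p: the ptrace exit_code
 * for a ptrace wait, the group stop code otherwise, or NULL if @p is not
 * stopped in the relevant sense.
 */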
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
        if (ptrace) {
                if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
                        return &p->exit_code;
        } else {
                if (p->signal->flags & SIGNAL_STOP_STOPPED)
                        return &p->signal->group_exit_code;
        }
        return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
                                int ptrace, struct task_struct *p)
{
        struct waitid_info *infop;
        int exit_code, *p_code, why;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;

        /*
         * Traditionally we see ptrace'd stopped tasks regardless of options.
         */
        if (!ptrace && !(wo->wo_flags & WUNTRACED))
                return 0;

        if (!task_stopped_code(p, ptrace))
                return 0;

        exit_code = 0;
        spin_lock_irq(&p->sighand->siglock);

        p_code = task_stopped_code(p, ptrace);
        if (unlikely(!p_code))
                goto unlock_sig;

        exit_code = *p_code;
        if (!exit_code)
                goto unlock_sig;

        if (!unlikely(wo->wo_flags & WNOWAIT))
                *p_code = 0;

        uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
        spin_unlock_irq(&p->sighand->siglock);
        if (!exit_code)
                return 0;

        /*
         * Now we are pretty sure this task is interesting.
         * Make sure it doesn't get reaped out from under us while we
         * give up the lock and then examine it below.  We don't want to
         * keep holding onto the tasklist_lock while we call getrusage and
         * possibly take page faults for user memory.
         */
        get_task_struct(p);
        pid = task_pid_vnr(p);
        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        if (likely(!(wo->wo_flags & WNOWAIT)))
                wo->wo_stat = (exit_code << 8) | 0x7f;

        infop = wo->wo_info;
        if (infop) {
                infop->cause = why;
                infop->status = exit_code;
                infop->pid = pid;
                infop->uid = uid;
        }
        return pid;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
        struct waitid_info *infop;
        pid_t pid;
        uid_t uid;

        if (!unlikely(wo->wo_flags & WCONTINUED))
                return 0;

        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
                return 0;

        spin_lock_irq(&p->sighand->siglock);
        /* Re-check with the lock held.  */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
                spin_unlock_irq(&p->sighand->siglock);
                return 0;
        }
        if (!unlikely(wo->wo_flags & WNOWAIT))
                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
        uid = from_kuid_munged(current_user_ns(), task_uid(p));
        spin_unlock_irq(&p->sighand->siglock);

        pid = task_pid_vnr(p);
        get_task_struct(p);
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        infop = wo->wo_info;
        if (!infop) {
                wo->wo_stat = 0xffff;
        } else {
                infop->cause = CLD_CONTINUED;
                infop->pid = pid;
                infop->uid = uid;
                infop->status = SIGCONT;
        }
        return pid;
}

/*
 * Consider @p for a wait by @parent.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
                                struct task_struct *p)
{
        /*
         * We can race with wait_task_zombie() from another thread.
         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
         * can't confuse the checks below.
         */
        int exit_state = READ_ONCE(p->exit_state);
        int ret;

        if (unlikely(exit_state == EXIT_DEAD))
                return 0;

        ret = eligible_child(wo, ptrace, p);
        if (!ret)
                return ret;

        if (unlikely(exit_state == EXIT_TRACE)) {
                /*
                 * ptrace == 0 means we are the natural parent. In this case
                 * we should clear notask_error, debugger will notify us.
                 */
                if (likely(!ptrace))
                        wo->notask_error = 0;
                return 0;
        }

        if (likely(!ptrace) && unlikely(p->ptrace)) {
                /*
                 * If it is traced by its real parent's group, just pretend
                 * the caller is ptrace_do_wait() and reap this child if it
                 * is zombie.
                 *
                 * This also hides group stop state from real parent; otherwise
                 * a single stop can be reported twice as group and ptrace stop.
                 * If a ptracer wants to distinguish these two events for its
                 * own children it should create a separate process which takes
                 * the role of real parent.
                 */
                if (!ptrace_reparented(p))
                        ptrace = 1;
        }

        /* slay zombie? */
        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
                         * A zombie ptracee is only visible to its ptracer.
                         * Notification and reaping will be cascaded to the
                         * real parent when the ptracer detaches.
                         */
                        if (unlikely(ptrace) || likely(!p->ptrace))
                                return wait_task_zombie(wo, p);
                }

                /*
                 * Allow access to stopped/continued state via zombie by
                 * falling through.  Clearing of notask_error is complex.
                 *
                 * When !@ptrace:
                 *
                 * If WEXITED is set, notask_error should naturally be
                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
                 * so, if there are live subthreads, there are events to
                 * wait for.  If all subthreads are dead, it's still safe
                 * to clear - this function will be called again in a finite
                 * amount of time once all the subthreads are released and
                 * will then return without clearing.
                 *
                 * When @ptrace:
                 *
                 * Stopped state is per-task and thus can't change once the
                 * target task dies.  Only continued and exited can happen.
                 * Clear notask_error if WCONTINUED | WEXITED.
                 */
                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
                        wo->notask_error = 0;
        } else {
                /*
                 * @p is alive and it's gonna stop, continue or exit, so
                 * there always is something to wait for.
                 */
                wo->notask_error = 0;
        }

        /*
         * Wait for stopped.  Depending on @ptrace, different stopped state
         * is used and the two don't interact with each other.
         */
        ret = wait_task_stopped(wo, ptrace, p);
        if (ret)
                return ret;

        /*
         * Wait for continued.  There's only one continued state and the
         * ptracer can consume it which can confuse the real parent.  Don't
         * use WCONTINUED from ptracer.  You don't need or want it.
         */
        return wait_task_continued(wo, p);
}

/*
 * Do the work of do_wait() for one thread in the group, @tsk.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue; then
 * ->notask_error is 0 if there were any eligible children,
 * or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *p;

        list_for_each_entry(p, &tsk->children, sibling) {
                int ret = wait_consider_task(wo, 0, p);

                if (ret)
                        return ret;
        }

        return 0;
}

static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *p;

        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
                int ret = wait_consider_task(wo, 1, p);

                if (ret)
                        return ret;
        }

        return 0;
}

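/*
 * Wake-up callback hooked into the parent's wait_chldexit queue: only wake
 * the waiter if the child that changed state is one it could actually wait
 * for.
 */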
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                int sync, void *key)
{
        struct wait_opts *wo = container_of(wait, struct wait_opts,
                                                child_wait);
        struct task_struct *p = key;

        if (!eligible_pid(wo, p))
                return 0;

        if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
                return 0;

        return default_wake_function(wait, mode, sync, key);
}

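/* Wake up a parent sleeping in do_wait() because child @p changed state. */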
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
        __wake_up_sync_key(&parent->signal->wait_chldexit,
                           TASK_INTERRUPTIBLE, p);
}

static long do_wait(struct wait_opts *wo)
{
        struct task_struct *tsk;
        int retval;

        trace_sched_process_wait(wo->wo_pid);

        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
        wo->child_wait.private = current;
        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
        /*
         * If there is nothing that can match our criteria, just get out.
         * We will clear ->notask_error to zero if we see any child that
         * might later match our criteria, even if we are not able to reap
         * it yet.
         */
        wo->notask_error = -ECHILD;
        if ((wo->wo_type < PIDTYPE_MAX) &&
           (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
                goto notask;

        set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
        tsk = current;
        do {
                retval = do_wait_thread(wo, tsk);
                if (retval)
                        goto end;

                retval = ptrace_do_wait(wo, tsk);
                if (retval)
                        goto end;

                if (wo->wo_flags & __WNOTHREAD)
                        break;
        } while_each_thread(current, tsk);
        read_unlock(&tasklist_lock);

notask:
        retval = wo->notask_error;
        if (!retval && !(wo->wo_flags & WNOHANG)) {
                retval = -ERESTARTSYS;
                if (!signal_pending(current)) {
                        schedule();
                        goto repeat;
                }
        }
end:
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
}

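/*
 * Common backend for waitid(): translate the (which, upid) pair into a
 * struct pid and pid type, then hand off to do_wait().  P_PIDFD waits
 * honour the pidfd's O_NONBLOCK flag by turning it into WNOHANG.
 */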
1480static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1481                          int options, struct rusage *ru)
1482{
1483        struct wait_opts wo;
1484        struct pid *pid = NULL;
1485        enum pid_type type;
1486        long ret;
1487        unsigned int f_flags = 0;
1488
1489        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1490                        __WNOTHREAD|__WCLONE|__WALL))
1491                return -EINVAL;
1492        if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1493                return -EINVAL;
1494
1495        switch (which) {
1496        case P_ALL:
1497                type = PIDTYPE_MAX;
1498                break;
1499        case P_PID:
1500                type = PIDTYPE_PID;
1501                if (upid <= 0)
1502                        return -EINVAL;
1503
1504                pid = find_get_pid(upid);
1505                break;
1506        case P_PGID:
1507                type = PIDTYPE_PGID;
1508                if (upid < 0)
1509                        return -EINVAL;
1510
1511                if (upid)
1512                        pid = find_get_pid(upid);
1513                else
1514                        pid = get_task_pid(current, PIDTYPE_PGID);
1515                break;
1516        case P_PIDFD:
1517                type = PIDTYPE_PID;
1518                if (upid < 0)
1519                        return -EINVAL;
1520
1521                pid = pidfd_get_pid(upid, &f_flags);
1522                if (IS_ERR(pid))
1523                        return PTR_ERR(pid);
1524
1525                break;
1526        default:
1527                return -EINVAL;
1528        }
1529
1530        wo.wo_type      = type;
1531        wo.wo_pid       = pid;
1532        wo.wo_flags     = options;
1533        wo.wo_info      = infop;
1534        wo.wo_rusage    = ru;
1535        if (f_flags & O_NONBLOCK)
1536                wo.wo_flags |= WNOHANG;
1537
1538        ret = do_wait(&wo);
1539        if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
1540                ret = -EAGAIN;
1541
1542        put_pid(pid);
1543        return ret;
1544}
1545
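/*
 * Example (userspace, illustrative only): P_PIDFD lets a child be waited
 * for through a pidfd, and a pidfd opened with PIDFD_NONBLOCK turns the
 * wait into a non-blocking one via the O_NONBLOCK -> WNOHANG translation
 * in kernel_waitid() above, which then reports -EAGAIN:
 *
 *        int pidfd = syscall(SYS_pidfd_open, child_pid, PIDFD_NONBLOCK);
 *        siginfo_t si = { 0 };
 *        int ret = waitid(P_PIDFD, pidfd, &si, WEXITED);
 *
 * Here child_pid is assumed to be a child of the caller; ret == -1 with
 * errno == EAGAIN means the child has not exited yet, so poll the pidfd
 * for EPOLLIN and retry.
 */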
1546SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1547                infop, int, options, struct rusage __user *, ru)
1548{
1549        struct rusage r;
1550        struct waitid_info info = {.status = 0};
1551        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1552        int signo = 0;
1553
1554        if (err > 0) {
1555                signo = SIGCHLD;
1556                err = 0;
1557                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1558                        return -EFAULT;
1559        }
1560        if (!infop)
1561                return err;
1562
1563        if (!user_write_access_begin(infop, sizeof(*infop)))
1564                return -EFAULT;
1565
1566        unsafe_put_user(signo, &infop->si_signo, Efault);
1567        unsafe_put_user(0, &infop->si_errno, Efault);
1568        unsafe_put_user(info.cause, &infop->si_code, Efault);
1569        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1570        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1571        unsafe_put_user(info.status, &infop->si_status, Efault);
1572        user_write_access_end();
1573        return err;
1574Efault:
1575        user_write_access_end();
1576        return -EFAULT;
1577}
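/*
 * For a child that exited normally, the block above reaches user space as
 * si_signo == SIGCHLD, si_errno == 0, si_code == CLD_EXITED, si_pid/si_uid
 * identifying the child and si_status carrying its exit code; a child
 * killed by a signal reports CLD_KILLED (or CLD_DUMPED) with the signal
 * number in si_status instead.
 */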
1578
1579long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1580                  struct rusage *ru)
1581{
1582        struct wait_opts wo;
1583        struct pid *pid = NULL;
1584        enum pid_type type;
1585        long ret;
1586
1587        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1588                        __WNOTHREAD|__WCLONE|__WALL))
1589                return -EINVAL;
1590
1591        /* -INT_MIN overflows (undefined), so it cannot be negated into a pgrp below */
1592        if (upid == INT_MIN)
1593                return -ESRCH;
1594
1595        if (upid == -1)
1596                type = PIDTYPE_MAX;
1597        else if (upid < 0) {
1598                type = PIDTYPE_PGID;
1599                pid = find_get_pid(-upid);
1600        } else if (upid == 0) {
1601                type = PIDTYPE_PGID;
1602                pid = get_task_pid(current, PIDTYPE_PGID);
1603        } else /* upid > 0 */ {
1604                type = PIDTYPE_PID;
1605                pid = find_get_pid(upid);
1606        }
1607
1608        wo.wo_type      = type;
1609        wo.wo_pid       = pid;
1610        wo.wo_flags     = options | WEXITED;
1611        wo.wo_info      = NULL;
1612        wo.wo_stat      = 0;
1613        wo.wo_rusage    = ru;
1614        ret = do_wait(&wo);
1615        put_pid(pid);
1616        if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1617                ret = -EFAULT;
1618
1619        return ret;
1620}
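/*
 * The upid decoding above implements the classic wait4()/waitpid() pid
 * encoding.  Illustrative user-space calls and the case each one selects:
 *
 *        waitpid(-1, &status, 0);        any child (PIDTYPE_MAX)
 *        waitpid(0, &status, 0);         any child in the caller's process group
 *        waitpid(-pgid, &status, 0);     any child in process group pgid
 *        waitpid(pid, &status, 0);       the one child with that pid
 */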
1621
1622int kernel_wait(pid_t pid, int *stat)
1623{
1624        struct wait_opts wo = {
1625                .wo_type        = PIDTYPE_PID,
1626                .wo_pid         = find_get_pid(pid),
1627                .wo_flags       = WEXITED,
1628        };
1629        int ret;
1630
1631        ret = do_wait(&wo);
1632        if (ret > 0 && wo.wo_stat)
1633                *stat = wo.wo_stat;
1634        put_pid(wo.wo_pid);
1635        return ret;
1636}
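/*
 * In-kernel sketch (illustrative; helper_pid stands for a child the caller
 * forked itself): unlike kernel_wait4(), the status goes through a kernel
 * pointer, so no user-space access is involved.  Note that *stat is only
 * written when the collected status is non-zero, so initialize it first:
 *
 *        int status = 0;
 *
 *        if (kernel_wait(helper_pid, &status) > 0)
 *                pr_debug("helper exited, status 0x%x\n", status);
 */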
1637
1638SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1639                int, options, struct rusage __user *, ru)
1640{
1641        struct rusage r;
1642        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1643
1644        if (err > 0) {
1645                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1646                        return -EFAULT;
1647        }
1648        return err;
1649}
1650
1651#ifdef __ARCH_WANT_SYS_WAITPID
1652
1653/*
1654 * sys_waitpid() remains only for compatibility; waitpid() should be
1655 * implemented in user space by calling sys_wait4() from libc.
1656 */
1657SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1658{
1659        return kernel_wait4(pid, stat_addr, options, NULL);
1660}
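/*
 * Illustrative libc-side implementation matching the comment above:
 *
 *        pid_t waitpid(pid_t pid, int *status, int options)
 *        {
 *                return wait4(pid, status, options, NULL);
 *        }
 */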
1661
1662#endif
1663
1664#ifdef CONFIG_COMPAT
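/*
 * A 32-bit task on a 64-bit kernel uses a differently laid out struct
 * rusage (32-bit longs and time values), so the native structure filled in
 * by kernel_wait4()/kernel_waitid() has to be repacked with
 * put_compat_rusage() before being copied out.  The COMPAT_USE_64BIT_TIME
 * case in the waitid variant below covers ABIs such as x32 that already
 * share the 64-bit layout and can be copied verbatim.
 */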
1665COMPAT_SYSCALL_DEFINE4(wait4,
1666        compat_pid_t, pid,
1667        compat_uint_t __user *, stat_addr,
1668        int, options,
1669        struct compat_rusage __user *, ru)
1670{
1671        struct rusage r;
1672        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1673        if (err > 0) {
1674                if (ru && put_compat_rusage(&r, ru))
1675                        return -EFAULT;
1676        }
1677        return err;
1678}
1679
1680COMPAT_SYSCALL_DEFINE5(waitid,
1681                int, which, compat_pid_t, pid,
1682                struct compat_siginfo __user *, infop, int, options,
1683                struct compat_rusage __user *, uru)
1684{
1685        struct rusage ru;
1686        struct waitid_info info = {.status = 0};
1687        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1688        int signo = 0;
1689        if (err > 0) {
1690                signo = SIGCHLD;
1691                err = 0;
1692                if (uru) {
1693                        /* kernel_waitid() overwrites everything in ru */
1694                        if (COMPAT_USE_64BIT_TIME)
1695                                err = copy_to_user(uru, &ru, sizeof(ru));
1696                        else
1697                                err = put_compat_rusage(&ru, uru);
1698                        if (err)
1699                                return -EFAULT;
1700                }
1701        }
1702
1703        if (!infop)
1704                return err;
1705
1706        if (!user_write_access_begin(infop, sizeof(*infop)))
1707                return -EFAULT;
1708
1709        unsafe_put_user(signo, &infop->si_signo, Efault);
1710        unsafe_put_user(0, &infop->si_errno, Efault);
1711        unsafe_put_user(info.cause, &infop->si_code, Efault);
1712        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1713        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1714        unsafe_put_user(info.status, &infop->si_status, Efault);
1715        user_write_access_end();
1716        return err;
1717Efault:
1718        user_write_access_end();
1719        return -EFAULT;
1720}
1721#endif
1722
1723/**
1724 * thread_group_exited - check whether a thread group has exited
1725 * @pid: struct pid of the thread group (tgid) to be checked.
1726 *
1727 * Test if the thread group represented by tgid has exited (all
1728 * threads are zombies, dead or completely gone).
1729 *
1730 * Return: true if the thread group has exited, false otherwise.
1731 */
1732bool thread_group_exited(struct pid *pid)
1733{
1734        struct task_struct *task;
1735        bool exited;
1736
1737        rcu_read_lock();
1738        task = pid_task(pid, PIDTYPE_PID);
1739        exited = !task ||
1740                (READ_ONCE(task->exit_state) && thread_group_empty(task));
1741        rcu_read_unlock();
1742
1743        return exited;
1744}
1745EXPORT_SYMBOL(thread_group_exited);
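/*
 * Illustrative fragment (modelled on the pidfd poll handler in
 * kernel/fork.c): a poll method can use thread_group_exited() to report a
 * pidfd as readable once the whole thread group is gone.
 *
 *        poll_wait(file, &pid->wait_pidfd, pts);
 *        if (thread_group_exited(pid))
 *                poll_flags = EPOLLIN | EPOLLRDNORM;
 */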
1746
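/*
 * Generic fallback for abort(): the __weak linkage lets an architecture
 * provide its own, stronger definition instead.
 */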
1747__weak void abort(void)
1748{
1749        BUG();
1750
1751        /* if that somehow doesn't kill us, take the whole machine down */
1752        panic("Oops failed to kill thread");
1753}
1754EXPORT_SYMBOL(abort);
1755