linux/kernel/exit.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/kernel/exit.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/mm.h>
   9#include <linux/slab.h>
  10#include <linux/sched/autogroup.h>
  11#include <linux/sched/mm.h>
  12#include <linux/sched/stat.h>
  13#include <linux/sched/task.h>
  14#include <linux/sched/task_stack.h>
  15#include <linux/sched/cputime.h>
  16#include <linux/interrupt.h>
  17#include <linux/module.h>
  18#include <linux/capability.h>
  19#include <linux/completion.h>
  20#include <linux/personality.h>
  21#include <linux/tty.h>
  22#include <linux/iocontext.h>
  23#include <linux/key.h>
  24#include <linux/cpu.h>
  25#include <linux/acct.h>
  26#include <linux/tsacct_kern.h>
  27#include <linux/file.h>
  28#include <linux/fdtable.h>
  29#include <linux/freezer.h>
  30#include <linux/binfmts.h>
  31#include <linux/nsproxy.h>
  32#include <linux/pid_namespace.h>
  33#include <linux/ptrace.h>
  34#include <linux/profile.h>
  35#include <linux/mount.h>
  36#include <linux/proc_fs.h>
  37#include <linux/kthread.h>
  38#include <linux/mempolicy.h>
  39#include <linux/taskstats_kern.h>
  40#include <linux/delayacct.h>
  41#include <linux/cgroup.h>
  42#include <linux/syscalls.h>
  43#include <linux/signal.h>
  44#include <linux/posix-timers.h>
  45#include <linux/cn_proc.h>
  46#include <linux/mutex.h>
  47#include <linux/futex.h>
  48#include <linux/pipe_fs_i.h>
  49#include <linux/audit.h> /* for audit_free() */
  50#include <linux/resource.h>
  51#include <linux/blkdev.h>
  52#include <linux/task_io_accounting_ops.h>
  53#include <linux/tracehook.h>
  54#include <linux/fs_struct.h>
  55#include <linux/init_task.h>
  56#include <linux/perf_event.h>
  57#include <trace/events/sched.h>
  58#include <linux/hw_breakpoint.h>
  59#include <linux/oom.h>
  60#include <linux/writeback.h>
  61#include <linux/shm.h>
  62#include <linux/kcov.h>
  63#include <linux/random.h>
  64#include <linux/rcuwait.h>
  65#include <linux/compat.h>
  66
  67#include <linux/uaccess.h>
  68#include <asm/unistd.h>
  69#include <asm/pgtable.h>
  70#include <asm/mmu_context.h>
  71
  72static void __unhash_process(struct task_struct *p, bool group_dead)
  73{
  74        nr_threads--;
  75        detach_pid(p, PIDTYPE_PID);
  76        if (group_dead) {
  77                detach_pid(p, PIDTYPE_TGID);
  78                detach_pid(p, PIDTYPE_PGID);
  79                detach_pid(p, PIDTYPE_SID);
  80
  81                list_del_rcu(&p->tasks);
  82                list_del_init(&p->sibling);
  83                __this_cpu_dec(process_counts);
  84        }
  85        list_del_rcu(&p->thread_group);
  86        list_del_rcu(&p->thread_node);
  87}
  88
  89/*
  90 * This function expects the tasklist_lock write-locked.
  91 */
  92static void __exit_signal(struct task_struct *tsk)
  93{
  94        struct signal_struct *sig = tsk->signal;
  95        bool group_dead = thread_group_leader(tsk);
  96        struct sighand_struct *sighand;
  97        struct tty_struct *uninitialized_var(tty);
  98        u64 utime, stime;
  99
 100        sighand = rcu_dereference_check(tsk->sighand,
 101                                        lockdep_tasklist_lock_is_held());
 102        spin_lock(&sighand->siglock);
 103
 104#ifdef CONFIG_POSIX_TIMERS
 105        posix_cpu_timers_exit(tsk);
 106        if (group_dead) {
 107                posix_cpu_timers_exit_group(tsk);
 108        } else {
 109                /*
 110                 * This can only happen if the caller is de_thread().
 111                 * FIXME: this is the temporary hack, we should teach
 112                 * posix-cpu-timers to handle this case correctly.
 113                 */
 114                if (unlikely(has_group_leader_pid(tsk)))
 115                        posix_cpu_timers_exit_group(tsk);
 116        }
 117#endif
 118
 119        if (group_dead) {
 120                tty = sig->tty;
 121                sig->tty = NULL;
 122        } else {
 123                /*
 124                 * If there is any task waiting for the group exit
 125                 * then notify it:
 126                 */
 127                if (sig->notify_count > 0 && !--sig->notify_count)
 128                        wake_up_process(sig->group_exit_task);
 129
 130                if (tsk == sig->curr_target)
 131                        sig->curr_target = next_thread(tsk);
 132        }
 133
 134        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 135                              sizeof(unsigned long long));
 136
 137        /*
 138         * Accumulate here the counters for all threads as they die. We could
 139         * skip the group leader because it is the last user of signal_struct,
 140         * but we want to avoid the race with thread_group_cputime() which can
 141         * see the empty ->thread_head list.
 142         */
 143        task_cputime(tsk, &utime, &stime);
 144        write_seqlock(&sig->stats_lock);
 145        sig->utime += utime;
 146        sig->stime += stime;
 147        sig->gtime += task_gtime(tsk);
 148        sig->min_flt += tsk->min_flt;
 149        sig->maj_flt += tsk->maj_flt;
 150        sig->nvcsw += tsk->nvcsw;
 151        sig->nivcsw += tsk->nivcsw;
 152        sig->inblock += task_io_get_inblock(tsk);
 153        sig->oublock += task_io_get_oublock(tsk);
 154        task_io_accounting_add(&sig->ioac, &tsk->ioac);
 155        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 156        sig->nr_threads--;
 157        __unhash_process(tsk, group_dead);
 158        write_sequnlock(&sig->stats_lock);
 159
 160        /*
 161         * Do this under ->siglock, we can race with another thread
 162         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 163         */
 164        flush_sigqueue(&tsk->pending);
 165        tsk->sighand = NULL;
 166        spin_unlock(&sighand->siglock);
 167
 168        __cleanup_sighand(sighand);
 169        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 170        if (group_dead) {
 171                flush_sigqueue(&sig->shared_pending);
 172                tty_kref_put(tty);
 173        }
 174}
 175
 176static void delayed_put_task_struct(struct rcu_head *rhp)
 177{
 178        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 179
 180        perf_event_delayed_put(tsk);
 181        trace_sched_process_free(tsk);
 182        put_task_struct(tsk);
 183}
 184
 185void put_task_struct_rcu_user(struct task_struct *task)
 186{
 187        if (refcount_dec_and_test(&task->rcu_users))
 188                call_rcu(&task->rcu, delayed_put_task_struct);
 189}
 190
 191void release_task(struct task_struct *p)
 192{
 193        struct task_struct *leader;
 194        int zap_leader;
 195repeat:
 196        /* don't need to get the RCU readlock here - the process is dead and
 197         * can't be modifying its own credentials. But shut RCU-lockdep up */
 198        rcu_read_lock();
 199        atomic_dec(&__task_cred(p)->user->processes);
 200        rcu_read_unlock();
 201
 202        proc_flush_task(p);
 203        cgroup_release(p);
 204
 205        write_lock_irq(&tasklist_lock);
 206        ptrace_release_task(p);
 207        __exit_signal(p);
 208
 209        /*
 210         * If we are the last non-leader member of the thread
 211         * group, and the leader is zombie, then notify the
 212         * group leader's parent process. (if it wants notification.)
 213         */
 214        zap_leader = 0;
 215        leader = p->group_leader;
 216        if (leader != p && thread_group_empty(leader)
 217                        && leader->exit_state == EXIT_ZOMBIE) {
 218                /*
 219                 * If we were the last child thread and the leader has
 220                 * exited already, and the leader's parent ignores SIGCHLD,
 221                 * then we are the one who should release the leader.
 222                 */
 223                zap_leader = do_notify_parent(leader, leader->exit_signal);
 224                if (zap_leader)
 225                        leader->exit_state = EXIT_DEAD;
 226        }
 227
 228        write_unlock_irq(&tasklist_lock);
 229        release_thread(p);
 230        put_task_struct_rcu_user(p);
 231
 232        p = leader;
 233        if (unlikely(zap_leader))
 234                goto repeat;
 235}
 236
 237void rcuwait_wake_up(struct rcuwait *w)
 238{
 239        struct task_struct *task;
 240
 241        rcu_read_lock();
 242
 243        /*
 244         * Order condition vs @task, such that everything prior to the load
 245         * of @task is visible. This is the condition as to why the user called
 246         * rcuwait_trywake() in the first place. Pairs with set_current_state()
 247         * barrier (A) in rcuwait_wait_event().
 248         *
 249         *    WAIT                WAKE
 250         *    [S] tsk = current   [S] cond = true
 251         *        MB (A)              MB (B)
 252         *    [L] cond            [L] tsk
 253         */
 254        smp_mb(); /* (B) */
 255
 256        task = rcu_dereference(w->task);
 257        if (task)
 258                wake_up_process(task);
 259        rcu_read_unlock();
 260}
 261
 262/*
 263 * Determine if a process group is "orphaned", according to the POSIX
 264 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 265 * by terminal-generated stop signals.  Newly orphaned process groups are
 266 * to receive a SIGHUP and a SIGCONT.
 267 *
 268 * "I ask you, have you ever known what it is to be an orphan?"
 269 */
 270static int will_become_orphaned_pgrp(struct pid *pgrp,
 271                                        struct task_struct *ignored_task)
 272{
 273        struct task_struct *p;
 274
 275        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 276                if ((p == ignored_task) ||
 277                    (p->exit_state && thread_group_empty(p)) ||
 278                    is_global_init(p->real_parent))
 279                        continue;
 280
 281                if (task_pgrp(p->real_parent) != pgrp &&
 282                    task_session(p->real_parent) == task_session(p))
 283                        return 0;
 284        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 285
 286        return 1;
 287}
 288
 289int is_current_pgrp_orphaned(void)
 290{
 291        int retval;
 292
 293        read_lock(&tasklist_lock);
 294        retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
 295        read_unlock(&tasklist_lock);
 296
 297        return retval;
 298}
 299
 300static bool has_stopped_jobs(struct pid *pgrp)
 301{
 302        struct task_struct *p;
 303
 304        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 305                if (p->signal->flags & SIGNAL_STOP_STOPPED)
 306                        return true;
 307        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 308
 309        return false;
 310}
 311
 312/*
 313 * Check to see if any process groups have become orphaned as
 314 * a result of our exiting, and if they have any stopped jobs,
 315 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 316 */
 317static void
 318kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 319{
 320        struct pid *pgrp = task_pgrp(tsk);
 321        struct task_struct *ignored_task = tsk;
 322
 323        if (!parent)
 324                /* exit: our father is in a different pgrp than
 325                 * we are and we were the only connection outside.
 326                 */
 327                parent = tsk->real_parent;
 328        else
 329                /* reparent: our child is in a different pgrp than
 330                 * we are, and it was the only connection outside.
 331                 */
 332                ignored_task = NULL;
 333
 334        if (task_pgrp(parent) != pgrp &&
 335            task_session(parent) == task_session(tsk) &&
 336            will_become_orphaned_pgrp(pgrp, ignored_task) &&
 337            has_stopped_jobs(pgrp)) {
 338                __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 339                __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 340        }
 341}
 342
 #ifdef CONFIG_MEMCG
/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 *
 * Candidates are searched among the current task's children, then its
 * siblings, then every thread of every non-kernel-thread process.  If no
 * task using @mm is found, mm->owner is set to NULL.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
	struct task_struct *self = current;
	struct task_struct *cand, *proc;

retry:
	/*
	 * If the exiting or execing task is not the owner, it's
	 * someone else's problem.
	 */
	if (mm->owner != self)
		return;

	/*
	 * The current owner is exiting/execing and there are no other
	 * candidates.  Do not leave the mm pointing to a possibly
	 * freed task structure.
	 */
	if (atomic_read(&mm->mm_users) <= 1) {
		WRITE_ONCE(mm->owner, NULL);
		return;
	}

	read_lock(&tasklist_lock);

	/* First choice: one of our children. */
	list_for_each_entry(cand, &self->children, sibling) {
		if (cand->mm == mm)
			goto assign_new_owner;
	}

	/* Second choice: one of our siblings. */
	list_for_each_entry(cand, &self->real_parent->children, sibling) {
		if (cand->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search through everything else, we should not get here often.
	 * Within a process the scan stops at the first thread that has a
	 * (different) mm.
	 */
	for_each_process(proc) {
		if (proc->flags & PF_KTHREAD)
			continue;
		for_each_thread(proc, cand) {
			if (cand->mm == mm)
				goto assign_new_owner;
			if (cand->mm)
				break;
		}
	}
	read_unlock(&tasklist_lock);

	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
	 */
	WRITE_ONCE(mm->owner, NULL);
	return;

assign_new_owner:
	BUG_ON(cand == self);
	get_task_struct(cand);
	/*
	 * The task_lock protects cand->mm from changing.
	 * We always want mm->owner->mm == mm
	 */
	task_lock(cand);
	/*
	 * Delay read_unlock() till we have the task_lock()
	 * to ensure that cand does not slip away underneath us
	 */
	read_unlock(&tasklist_lock);
	if (cand->mm != mm) {
		/* The candidate changed its mm meanwhile: start over. */
		task_unlock(cand);
		put_task_struct(cand);
		goto retry;
	}
	WRITE_ONCE(mm->owner, cand);
	task_unlock(cand);
	put_task_struct(cand);
}
#endif /* CONFIG_MEMCG */
 430
 431/*
 432 * Turn us into a lazy TLB process if we
 433 * aren't already..
 434 */
 435static void exit_mm(void)
 436{
 437        struct mm_struct *mm = current->mm;
 438        struct core_state *core_state;
 439
 440        mm_release(current, mm);
 441        if (!mm)
 442                return;
 443        sync_mm_rss(mm);
 444        /*
 445         * Serialize with any possible pending coredump.
 446         * We must hold mmap_sem around checking core_state
 447         * and clearing tsk->mm.  The core-inducing thread
 448         * will increment ->nr_threads for each thread in the
 449         * group with ->mm != NULL.
 450         */
 451        down_read(&mm->mmap_sem);
 452        core_state = mm->core_state;
 453        if (core_state) {
 454                struct core_thread self;
 455
 456                up_read(&mm->mmap_sem);
 457
 458                self.task = current;
 459                self.next = xchg(&core_state->dumper.next, &self);
 460                /*
 461                 * Implies mb(), the result of xchg() must be visible
 462                 * to core_state->dumper.
 463                 */
 464                if (atomic_dec_and_test(&core_state->nr_threads))
 465                        complete(&core_state->startup);
 466
 467                for (;;) {
 468                        set_current_state(TASK_UNINTERRUPTIBLE);
 469                        if (!self.task) /* see coredump_finish() */
 470                                break;
 471                        freezable_schedule();
 472                }
 473                __set_current_state(TASK_RUNNING);
 474                down_read(&mm->mmap_sem);
 475        }
 476        mmgrab(mm);
 477        BUG_ON(mm != current->active_mm);
 478        /* more a memory barrier than a real lock */
 479        task_lock(current);
 480        current->mm = NULL;
 481        up_read(&mm->mmap_sem);
 482        enter_lazy_tlb(mm, current);
 483        task_unlock(current);
 484        mm_update_next_owner(mm);
 485        mmput(mm);
 486        if (test_thread_flag(TIF_MEMDIE))
 487                exit_oom_victim();
 488}
 489
 490static struct task_struct *find_alive_thread(struct task_struct *p)
 491{
 492        struct task_struct *t;
 493
 494        for_each_thread(p, t) {
 495                if (!(t->flags & PF_EXITING))
 496                        return t;
 497        }
 498        return NULL;
 499}
 500
 501static struct task_struct *find_child_reaper(struct task_struct *father,
 502                                                struct list_head *dead)
 503        __releases(&tasklist_lock)
 504        __acquires(&tasklist_lock)
 505{
 506        struct pid_namespace *pid_ns = task_active_pid_ns(father);
 507        struct task_struct *reaper = pid_ns->child_reaper;
 508        struct task_struct *p, *n;
 509
 510        if (likely(reaper != father))
 511                return reaper;
 512
 513        reaper = find_alive_thread(father);
 514        if (reaper) {
 515                pid_ns->child_reaper = reaper;
 516                return reaper;
 517        }
 518
 519        write_unlock_irq(&tasklist_lock);
 520        if (unlikely(pid_ns == &init_pid_ns)) {
 521                panic("Attempted to kill init! exitcode=0x%08x\n",
 522                        father->signal->group_exit_code ?: father->exit_code);
 523        }
 524
 525        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
 526                list_del_init(&p->ptrace_entry);
 527                release_task(p);
 528        }
 529
 530        zap_pid_ns_processes(pid_ns);
 531        write_lock_irq(&tasklist_lock);
 532
 533        return father;
 534}
 535
 536/*
 537 * When we die, we re-parent all our children, and try to:
 538 * 1. give them to another thread in our thread group, if such a member exists
 539 * 2. give it to the first ancestor process which prctl'd itself as a
 540 *    child_subreaper for its children (like a service manager)
 541 * 3. give it to the init process (PID 1) in our pid namespace
 542 */
 543static struct task_struct *find_new_reaper(struct task_struct *father,
 544                                           struct task_struct *child_reaper)
 545{
 546        struct task_struct *thread, *reaper;
 547
 548        thread = find_alive_thread(father);
 549        if (thread)
 550                return thread;
 551
 552        if (father->signal->has_child_subreaper) {
 553                unsigned int ns_level = task_pid(father)->level;
 554                /*
 555                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
 556                 * We can't check reaper != child_reaper to ensure we do not
 557                 * cross the namespaces, the exiting parent could be injected
 558                 * by setns() + fork().
 559                 * We check pid->level, this is slightly more efficient than
 560                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
 561                 */
 562                for (reaper = father->real_parent;
 563                     task_pid(reaper)->level == ns_level;
 564                     reaper = reaper->real_parent) {
 565                        if (reaper == &init_task)
 566                                break;
 567                        if (!reaper->signal->is_child_subreaper)
 568                                continue;
 569                        thread = find_alive_thread(reaper);
 570                        if (thread)
 571                                return thread;
 572                }
 573        }
 574
 575        return child_reaper;
 576}
 577
 578/*
 579* Any that need to be release_task'd are put on the @dead list.
 580 */
 581static void reparent_leader(struct task_struct *father, struct task_struct *p,
 582                                struct list_head *dead)
 583{
 584        if (unlikely(p->exit_state == EXIT_DEAD))
 585                return;
 586
 587        /* We don't want people slaying init. */
 588        p->exit_signal = SIGCHLD;
 589
 590        /* If it has exited notify the new parent about this child's death. */
 591        if (!p->ptrace &&
 592            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 593                if (do_notify_parent(p, p->exit_signal)) {
 594                        p->exit_state = EXIT_DEAD;
 595                        list_add(&p->ptrace_entry, dead);
 596                }
 597        }
 598
 599        kill_orphaned_pgrp(p, father);
 600}
 601
 602/*
 603 * This does two things:
 604 *
 605 * A.  Make init inherit all the child processes
 606 * B.  Check to see if any process groups have become orphaned
 607 *      as a result of our exiting, and if they have any stopped
 608 *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 609 */
 610static void forget_original_parent(struct task_struct *father,
 611                                        struct list_head *dead)
 612{
 613        struct task_struct *p, *t, *reaper;
 614
 615        if (unlikely(!list_empty(&father->ptraced)))
 616                exit_ptrace(father, dead);
 617
 618        /* Can drop and reacquire tasklist_lock */
 619        reaper = find_child_reaper(father, dead);
 620        if (list_empty(&father->children))
 621                return;
 622
 623        reaper = find_new_reaper(father, reaper);
 624        list_for_each_entry(p, &father->children, sibling) {
 625                for_each_thread(p, t) {
 626                        t->real_parent = reaper;
 627                        BUG_ON((!t->ptrace) != (t->parent == father));
 628                        if (likely(!t->ptrace))
 629                                t->parent = t->real_parent;
 630                        if (t->pdeath_signal)
 631                                group_send_sig_info(t->pdeath_signal,
 632                                                    SEND_SIG_NOINFO, t,
 633                                                    PIDTYPE_TGID);
 634                }
 635                /*
 636                 * If this is a threaded reparent there is no need to
 637                 * notify anyone anything has happened.
 638                 */
 639                if (!same_thread_group(reaper, father))
 640                        reparent_leader(father, p, dead);
 641        }
 642        list_splice_tail_init(&father->children, &reaper->children);
 643}
 644
 645/*
 646 * Send signals to all our closest relatives so that they know
 647 * to properly mourn us..
 648 */
 649static void exit_notify(struct task_struct *tsk, int group_dead)
 650{
 651        bool autoreap;
 652        struct task_struct *p, *n;
 653        LIST_HEAD(dead);
 654
 655        write_lock_irq(&tasklist_lock);
 656        forget_original_parent(tsk, &dead);
 657
 658        if (group_dead)
 659                kill_orphaned_pgrp(tsk->group_leader, NULL);
 660
 661        tsk->exit_state = EXIT_ZOMBIE;
 662        if (unlikely(tsk->ptrace)) {
 663                int sig = thread_group_leader(tsk) &&
 664                                thread_group_empty(tsk) &&
 665                                !ptrace_reparented(tsk) ?
 666                        tsk->exit_signal : SIGCHLD;
 667                autoreap = do_notify_parent(tsk, sig);
 668        } else if (thread_group_leader(tsk)) {
 669                autoreap = thread_group_empty(tsk) &&
 670                        do_notify_parent(tsk, tsk->exit_signal);
 671        } else {
 672                autoreap = true;
 673        }
 674
 675        if (autoreap) {
 676                tsk->exit_state = EXIT_DEAD;
 677                list_add(&tsk->ptrace_entry, &dead);
 678        }
 679
 680        /* mt-exec, de_thread() is waiting for group leader */
 681        if (unlikely(tsk->signal->notify_count < 0))
 682                wake_up_process(tsk->signal->group_exit_task);
 683        write_unlock_irq(&tasklist_lock);
 684
 685        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 686                list_del_init(&p->ptrace_entry);
 687                release_task(p);
 688        }
 689}
 690
 #ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Track the smallest amount of unused stack seen so far and log a message
 * each time the current task sets a new low-water mark.
 */
static void check_stack_usage(void)
{
	static DEFINE_SPINLOCK(low_water_lock);
	/*
	 * Same type as the value it is compared with and assigned from
	 * below, so there is no signed/unsigned comparison and no
	 * implicit narrowing.
	 */
	static unsigned long lowest_to_date = THREAD_SIZE;
	unsigned long free;

	free = stack_not_used(current);

	/* Unlocked check first; the decision is re-made under the lock. */
	if (free >= lowest_to_date)
		return;

	spin_lock(&low_water_lock);
	if (free < lowest_to_date) {
		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
			current->comm, task_pid_nr(current), free);
		lowest_to_date = free;
	}
	spin_unlock(&low_water_lock);
}
#else
/* Stack usage checking is compiled out: nothing to do. */
static inline void check_stack_usage(void) {}
#endif
 714
 715void __noreturn do_exit(long code)
 716{
 717        struct task_struct *tsk = current;
 718        int group_dead;
 719
 720        profile_task_exit(tsk);
 721        kcov_task_exit(tsk);
 722
 723        WARN_ON(blk_needs_flush_plug(tsk));
 724
 725        if (unlikely(in_interrupt()))
 726                panic("Aiee, killing interrupt handler!");
 727        if (unlikely(!tsk->pid))
 728                panic("Attempted to kill the idle task!");
 729
 730        /*
 731         * If do_exit is called because this processes oopsed, it's possible
 732         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
 733         * continuing. Amongst other possible reasons, this is to prevent
 734         * mm_release()->clear_child_tid() from writing to a user-controlled
 735         * kernel address.
 736         */
 737        set_fs(USER_DS);
 738
 739        ptrace_event(PTRACE_EVENT_EXIT, code);
 740
 741        validate_creds_for_do_exit(tsk);
 742
 743        /*
 744         * We're taking recursive faults here in do_exit. Safest is to just
 745         * leave this task alone and wait for reboot.
 746         */
 747        if (unlikely(tsk->flags & PF_EXITING)) {
 748                pr_alert("Fixing recursive fault but reboot is needed!\n");
 749                /*
 750                 * We can do this unlocked here. The futex code uses
 751                 * this flag just to verify whether the pi state
 752                 * cleanup has been done or not. In the worst case it
 753                 * loops once more. We pretend that the cleanup was
 754                 * done as there is no way to return. Either the
 755                 * OWNER_DIED bit is set by now or we push the blocked
 756                 * task into the wait for ever nirwana as well.
 757                 */
 758                tsk->flags |= PF_EXITPIDONE;
 759                set_current_state(TASK_UNINTERRUPTIBLE);
 760                schedule();
 761        }
 762
 763        exit_signals(tsk);  /* sets PF_EXITING */
 764        /*
 765         * Ensure that all new tsk->pi_lock acquisitions must observe
 766         * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
 767         */
 768        smp_mb();
 769        /*
 770         * Ensure that we must observe the pi_state in exit_mm() ->
 771         * mm_release() -> exit_pi_state_list().
 772         */
 773        raw_spin_lock_irq(&tsk->pi_lock);
 774        raw_spin_unlock_irq(&tsk->pi_lock);
 775
 776        if (unlikely(in_atomic())) {
 777                pr_info("note: %s[%d] exited with preempt_count %d\n",
 778                        current->comm, task_pid_nr(current),
 779                        preempt_count());
 780                preempt_count_set(PREEMPT_ENABLED);
 781        }
 782
 783        /* sync mm's RSS info before statistics gathering */
 784        if (tsk->mm)
 785                sync_mm_rss(tsk->mm);
 786        acct_update_integrals(tsk);
 787        group_dead = atomic_dec_and_test(&tsk->signal->live);
 788        if (group_dead) {
 789#ifdef CONFIG_POSIX_TIMERS
 790                hrtimer_cancel(&tsk->signal->real_timer);
 791                exit_itimers(tsk->signal);
 792#endif
 793                if (tsk->mm)
 794                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 795        }
 796        acct_collect(code, group_dead);
 797        if (group_dead)
 798                tty_audit_exit();
 799        audit_free(tsk);
 800
 801        tsk->exit_code = code;
 802        taskstats_exit(tsk, group_dead);
 803
 804        exit_mm();
 805
 806        if (group_dead)
 807                acct_process();
 808        trace_sched_process_exit(tsk);
 809
 810        exit_sem(tsk);
 811        exit_shm(tsk);
 812        exit_files(tsk);
 813        exit_fs(tsk);
 814        if (group_dead)
 815                disassociate_ctty(1);
 816        exit_task_namespaces(tsk);
 817        exit_task_work(tsk);
 818        exit_thread(tsk);
 819        exit_umh(tsk);
 820
 821        /*
 822         * Flush inherited counters to the parent - before the parent
 823         * gets woken up by child-exit notifications.
 824         *
 825         * because of cgroup mode, must be called before cgroup_exit()
 826         */
 827        perf_event_exit_task(tsk);
 828
 829        sched_autogroup_exit_task(tsk);
 830        cgroup_exit(tsk);
 831
 832        /*
 833         * FIXME: do that only when needed, using sched_exit tracepoint
 834         */
 835        flush_ptrace_hw_breakpoint(tsk);
 836
 837        exit_tasks_rcu_start();
 838        exit_notify(tsk, group_dead);
 839        proc_exit_connector(tsk);
 840        mpol_put_task_policy(tsk);
 841#ifdef CONFIG_FUTEX
 842        if (unlikely(current->pi_state_cache))
 843                kfree(current->pi_state_cache);
 844#endif
 845        /*
 846         * Make sure we are holding no locks:
 847         */
 848        debug_check_no_locks_held();
 849        /*
 850         * We can do this unlocked here. The futex code uses this flag
 851         * just to verify whether the pi state cleanup has been done
 852         * or not. In the worst case it loops once more.
 853         */
 854        tsk->flags |= PF_EXITPIDONE;
 855
 856        if (tsk->io_context)
 857                exit_io_context(tsk);
 858
 859        if (tsk->splice_pipe)
 860                free_pipe_info(tsk->splice_pipe);
 861
 862        if (tsk->task_frag.page)
 863                put_page(tsk->task_frag.page);
 864
 865        validate_creds_for_do_exit(tsk);
 866
 867        check_stack_usage();
 868        preempt_disable();
 869        if (tsk->nr_dirtied)
 870                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 871        exit_rcu();
 872        exit_tasks_rcu_finish();
 873
 874        lockdep_free_task(tsk);
 875        do_task_dead();
 876}
 877EXPORT_SYMBOL_GPL(do_exit);
 878
 /*
 * Signal @comp (when one is given) and then exit the current task with
 * @code through do_exit().
 */
void complete_and_exit(struct completion *comp, long code)
{
	/* A NULL completion just means there is nobody to notify. */
	if (comp)
		complete(comp);

	do_exit(code);
}
EXPORT_SYMBOL(complete_and_exit);
 887
 888SYSCALL_DEFINE1(exit, int, error_code)
 889{
 890        do_exit((error_code&0xff)<<8);
 891}
 892
 893/*
 894 * Take down every thread in the group.  This is called by fatal signals
 895 * as well as by sys_exit_group (below).
 896 */
 897void
 898do_group_exit(int exit_code)
 899{
 900        struct signal_struct *sig = current->signal;
 901
 902        BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 903
 904        if (signal_group_exit(sig))
 905                exit_code = sig->group_exit_code;
 906        else if (!thread_group_empty(current)) {
 907                struct sighand_struct *const sighand = current->sighand;
 908
 909                spin_lock_irq(&sighand->siglock);
 910                if (signal_group_exit(sig))
 911                        /* Another thread got here before we took the lock.  */
 912                        exit_code = sig->group_exit_code;
 913                else {
 914                        sig->group_exit_code = exit_code;
 915                        sig->flags = SIGNAL_GROUP_EXIT;
 916                        zap_other_threads(current);
 917                }
 918                spin_unlock_irq(&sighand->siglock);
 919        }
 920
 921        do_exit(exit_code);
 922        /* NOTREACHED */
 923}
 924
 925/*
 926 * this kills every thread in the thread group. Note that any externally
 927 * wait4()-ing process will get the correct exit code - even if this
 928 * thread is not the thread group leader.
 929 */
 930SYSCALL_DEFINE1(exit_group, int, error_code)
 931{
 932        do_group_exit((error_code & 0xff) << 8);
 933        /* NOTREACHED */
 934        return 0;
 935}
 936
/*
 * Result record filled in by the wait_task_*() helpers when the caller
 * supplied one through wait_opts::wo_info.  The waitid() entry points
 * copy these fields into the user's siginfo (si_pid, si_uid, si_status
 * and si_code respectively).
 */
struct waitid_info {
        pid_t pid;      /* pid of the reported task, from task_pid_vnr() */
        uid_t uid;      /* uid of the reported task, from from_kuid_munged() */
        int status;     /* exit status, stop code or SIGCONT, depending on cause */
        int cause;      /* one of the CLD_* codes */
};
 943
/*
 * Per-call state shared by do_wait() and its helpers.  The entry points
 * (kernel_waitid(), kernel_wait4()) fill in the request fields; the
 * helpers write the results.
 */
struct wait_opts {
        /* Which pid type wo_pid is matched against; PIDTYPE_MAX matches any. */
        enum pid_type           wo_type;
        /* Wait option bits (WEXITED, WNOHANG, __WALL, ...). */
        int                     wo_flags;
        /* The pid to match; unused when wo_type is PIDTYPE_MAX. */
        struct pid              *wo_pid;

        /* Optional waitid()-style result; may be NULL. */
        struct waitid_info      *wo_info;
        /* wait4()-style status word written by the wait_task_*() helpers. */
        int                     wo_stat;
        /* Optional resource usage output; may be NULL. */
        struct rusage           *wo_rusage;

        /* Entry queued on current->signal->wait_chldexit by do_wait(). */
        wait_queue_entry_t              child_wait;
        /* -ECHILD until a child that may match later is seen, then 0. */
        int                     notask_error;
};
 956
 957static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 958{
 959        return  wo->wo_type == PIDTYPE_MAX ||
 960                task_pid_type(p, wo->wo_type) == wo->wo_pid;
 961}
 962
 963static int
 964eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 965{
 966        if (!eligible_pid(wo, p))
 967                return 0;
 968
 969        /*
 970         * Wait for all children (clone and not) if __WALL is set or
 971         * if it is traced by us.
 972         */
 973        if (ptrace || (wo->wo_flags & __WALL))
 974                return 1;
 975
 976        /*
 977         * Otherwise, wait for clone children *only* if __WCLONE is set;
 978         * otherwise, wait for non-clone children *only*.
 979         *
 980         * Note: a "clone" child here is one that reports to its parent
 981         * using a signal other than SIGCHLD, or a non-leader thread which
 982         * we can only see if it is traced by us.
 983         */
 984        if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
 985                return 0;
 986
 987        return 1;
 988}
 989
/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * @wo: wait request; wo_stat, *wo_rusage and *wo_info are filled in as
 *      far as the caller asked for them.
 * @p:  the zombie being considered.
 *
 * The nonzero return value is @p's pid as seen by the caller
 * (task_pid_vnr()).  With WNOWAIT the task is only inspected: its
 * exit_state is left alone and wo->wo_stat is not written.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
        int state, status;
        /* Sampled up front, before tasklist_lock can be dropped below. */
        pid_t pid = task_pid_vnr(p);
        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
        struct waitid_info *infop;

        /* The caller did not ask for exited children. */
        if (!likely(wo->wo_flags & WEXITED))
                return 0;

        if (unlikely(wo->wo_flags & WNOWAIT)) {
                /*
                 * Peek only: pin the task so it stays valid once the
                 * lock is dropped, collect the data, and leave it a
                 * zombie.
                 */
                status = p->exit_code;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
                sched_annotate_sleep();
                if (wo->wo_rusage)
                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
                put_task_struct(p);
                goto out_info;
        }
        /*
         * Move the task's state to DEAD/TRACE, only one thread can do this.
         * Losing the cmpxchg() means somebody else claimed the zombie.
         */
        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
                EXIT_TRACE : EXIT_DEAD;
        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
                return 0;
        /*
         * We own this thread, nobody else can reap it.
         */
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();

        /*
         * Check thread_group_leader() to exclude the traced sub-threads.
         */
        if (state == EXIT_DEAD && thread_group_leader(p)) {
                struct signal_struct *sig = p->signal;
                struct signal_struct *psig = current->signal;
                unsigned long maxrss;
                u64 tgutime, tgstime;

                /*
                 * The resource counters for the group leader are in its
                 * own task_struct.  Those for dead threads in the group
                 * are in its signal_struct, as are those for the child
                 * processes it has previously reaped.  All these
                 * accumulate in the parent's signal_struct c* fields.
                 *
                 * We don't bother to take a lock here to protect these
                 * p->signal fields because the whole thread group is dead
                 * and nobody can change them.
                 *
                 * psig->stats_lock also protects us from our sub-threads
                 * which can reap other children at the same time. Until
                 * we change k_getrusage()-like users to rely on this lock
                 * we have to take ->siglock as well.
                 *
                 * We use thread_group_cputime_adjusted() to get times for
                 * the thread group, which consolidates times for all threads
                 * in the group including the group leader.
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                spin_lock_irq(&current->sighand->siglock);
                write_seqlock(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
                psig->cnvcsw +=
                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
                psig->cnivcsw +=
                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
                psig->cinblock +=
                        task_io_get_inblock(p) +
                        sig->inblock + sig->cinblock;
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
                /* cmaxrss is a high-water mark, not a sum. */
                maxrss = max(sig->maxrss, sig->cmaxrss);
                if (psig->cmaxrss < maxrss)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
                write_sequnlock(&psig->stats_lock);
                spin_unlock_irq(&current->sighand->siglock);
        }

        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        /* A group exit reports the group's code, not this task's own. */
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        wo->wo_stat = status;

        if (state == EXIT_TRACE) {
                write_lock_irq(&tasklist_lock);
                /* We dropped tasklist, ptracer could die and untrace */
                ptrace_unlink(p);

                /* If parent wants a zombie, don't release it now */
                state = EXIT_ZOMBIE;
                if (do_notify_parent(p, p->exit_signal))
                        state = EXIT_DEAD;
                p->exit_state = state;
                write_unlock_irq(&tasklist_lock);
        }
        if (state == EXIT_DEAD)
                release_task(p);

out_info:
        infop = wo->wo_info;
        if (infop) {
                /*
                 * Low 7 bits clear: normal exit, code is in bits 8 and up.
                 * Otherwise the low 7 bits are the status and bit 0x80
                 * selects CLD_DUMPED over CLD_KILLED.
                 */
                if ((status & 0x7f) == 0) {
                        infop->cause = CLD_EXITED;
                        infop->status = status >> 8;
                } else {
                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        infop->status = status & 0x7f;
                }
                infop->pid = pid;
                infop->uid = uid;
        }

        return pid;
}
1124
1125static int *task_stopped_code(struct task_struct *p, bool ptrace)
1126{
1127        if (ptrace) {
1128                if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1129                        return &p->exit_code;
1130        } else {
1131                if (p->signal->flags & SIGNAL_STOP_STOPPED)
1132                        return &p->signal->group_exit_code;
1133        }
1134        return NULL;
1135}
1136
1137/**
1138 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1139 * @wo: wait options
1140 * @ptrace: is the wait for ptrace
1141 * @p: task to wait for
1142 *
1143 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1144 *
1145 * CONTEXT:
1146 * read_lock(&tasklist_lock), which is released if return value is
1147 * non-zero.  Also, grabs and releases @p->sighand->siglock.
1148 *
1149 * RETURNS:
1150 * 0 if wait condition didn't exist and search for other wait conditions
1151 * should continue.  Non-zero return, -errno on failure and @p's pid on
1152 * success, implies that tasklist_lock is released and wait condition
1153 * search should terminate.
1154 */
1155static int wait_task_stopped(struct wait_opts *wo,
1156                                int ptrace, struct task_struct *p)
1157{
1158        struct waitid_info *infop;
1159        int exit_code, *p_code, why;
1160        uid_t uid = 0; /* unneeded, required by compiler */
1161        pid_t pid;
1162
1163        /*
1164         * Traditionally we see ptrace'd stopped tasks regardless of options.
1165         */
1166        if (!ptrace && !(wo->wo_flags & WUNTRACED))
1167                return 0;
1168
1169        if (!task_stopped_code(p, ptrace))
1170                return 0;
1171
1172        exit_code = 0;
1173        spin_lock_irq(&p->sighand->siglock);
1174
1175        p_code = task_stopped_code(p, ptrace);
1176        if (unlikely(!p_code))
1177                goto unlock_sig;
1178
1179        exit_code = *p_code;
1180        if (!exit_code)
1181                goto unlock_sig;
1182
1183        if (!unlikely(wo->wo_flags & WNOWAIT))
1184                *p_code = 0;
1185
1186        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1187unlock_sig:
1188        spin_unlock_irq(&p->sighand->siglock);
1189        if (!exit_code)
1190                return 0;
1191
1192        /*
1193         * Now we are pretty sure this task is interesting.
1194         * Make sure it doesn't get reaped out from under us while we
1195         * give up the lock and then examine it below.  We don't want to
1196         * keep holding onto the tasklist_lock while we call getrusage and
1197         * possibly take page faults for user memory.
1198         */
1199        get_task_struct(p);
1200        pid = task_pid_vnr(p);
1201        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1202        read_unlock(&tasklist_lock);
1203        sched_annotate_sleep();
1204        if (wo->wo_rusage)
1205                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1206        put_task_struct(p);
1207
1208        if (likely(!(wo->wo_flags & WNOWAIT)))
1209                wo->wo_stat = (exit_code << 8) | 0x7f;
1210
1211        infop = wo->wo_info;
1212        if (infop) {
1213                infop->cause = why;
1214                infop->status = exit_code;
1215                infop->pid = pid;
1216                infop->uid = uid;
1217        }
1218        return pid;
1219}
1220
1221/*
1222 * Handle do_wait work for one task in a live, non-stopped state.
1223 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1224 * the lock and this task is uninteresting.  If we return nonzero, we have
1225 * released the lock and the system call should return.
1226 */
1227static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1228{
1229        struct waitid_info *infop;
1230        pid_t pid;
1231        uid_t uid;
1232
1233        if (!unlikely(wo->wo_flags & WCONTINUED))
1234                return 0;
1235
1236        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1237                return 0;
1238
1239        spin_lock_irq(&p->sighand->siglock);
1240        /* Re-check with the lock held.  */
1241        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1242                spin_unlock_irq(&p->sighand->siglock);
1243                return 0;
1244        }
1245        if (!unlikely(wo->wo_flags & WNOWAIT))
1246                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1247        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1248        spin_unlock_irq(&p->sighand->siglock);
1249
1250        pid = task_pid_vnr(p);
1251        get_task_struct(p);
1252        read_unlock(&tasklist_lock);
1253        sched_annotate_sleep();
1254        if (wo->wo_rusage)
1255                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1256        put_task_struct(p);
1257
1258        infop = wo->wo_info;
1259        if (!infop) {
1260                wo->wo_stat = 0xffff;
1261        } else {
1262                infop->cause = CLD_CONTINUED;
1263                infop->pid = pid;
1264                infop->uid = uid;
1265                infop->status = SIGCONT;
1266        }
1267        return pid;
1268}
1269
/*
 * Consider @p for a wait by @parent.
 *
 * @wo:     wait request and result state.
 * @ptrace: nonzero when @p was found on a ->ptraced list (see
 *          ptrace_do_wait()), zero when it came from a ->children list
 *          (see do_wait_thread()).
 * @p:      candidate task.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
                                struct task_struct *p)
{
        /*
         * We can race with wait_task_zombie() from another thread.
         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
         * can't confuse the checks below: exit_state is read exactly
         * once and only the local copy is tested from here on.
         */
        int exit_state = READ_ONCE(p->exit_state);
        int ret;

        /* Already claimed for release by somebody else; nothing to report. */
        if (unlikely(exit_state == EXIT_DEAD))
                return 0;

        ret = eligible_child(wo, ptrace, p);
        if (!ret)
                return ret;

        if (unlikely(exit_state == EXIT_TRACE)) {
                /*
                 * ptrace == 0 means we are the natural parent. In this case
                 * we should clear notask_error, debugger will notify us.
                 */
                if (likely(!ptrace))
                        wo->notask_error = 0;
                return 0;
        }

        if (likely(!ptrace) && unlikely(p->ptrace)) {
                /*
                 * If it is traced by its real parent's group, just pretend
                 * the caller is ptrace_do_wait() and reap this child if it
                 * is zombie.
                 *
                 * This also hides group stop state from real parent; otherwise
                 * a single stop can be reported twice as group and ptrace stop.
                 * If a ptracer wants to distinguish these two events for its
                 * own children it should create a separate process which takes
                 * the role of real parent.
                 */
                if (!ptrace_reparented(p))
                        ptrace = 1;
        }

        /* slay zombie? */
        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
                         * A zombie ptracee is only visible to its ptracer.
                         * Notification and reaping will be cascaded to the
                         * real parent when the ptracer detaches.
                         */
                        if (unlikely(ptrace) || likely(!p->ptrace))
                                return wait_task_zombie(wo, p);
                }

                /*
                 * Allow access to stopped/continued state via zombie by
                 * falling through.  Clearing of notask_error is complex.
                 *
                 * When !@ptrace:
                 *
                 * If WEXITED is set, notask_error should naturally be
                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
                 * so, if there are live subthreads, there are events to
                 * wait for.  If all subthreads are dead, it's still safe
                 * to clear - this function will be called again in a finite
                 * amount of time once all the subthreads are released and
                 * will then return without clearing.
                 *
                 * When @ptrace:
                 *
                 * Stopped state is per-task and thus can't change once the
                 * target task dies.  Only continued and exited can happen.
                 * Clear notask_error if WCONTINUED | WEXITED.
                 */
                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
                        wo->notask_error = 0;
        } else {
                /*
                 * @p is alive and it's gonna stop, continue or exit, so
                 * there always is something to wait for.
                 */
                wo->notask_error = 0;
        }

        /*
         * Wait for stopped.  Depending on @ptrace, different stopped state
         * is used and the two don't interact with each other.
         */
        ret = wait_task_stopped(wo, ptrace, p);
        if (ret)
                return ret;

        /*
         * Wait for continued.  There's only one continued state and the
         * ptracer can consume it which can confuse the real parent.  Don't
         * use WCONTINUED from ptracer.  You don't need or want it.
         */
        return wait_task_continued(wo, p);
}
1381
1382/*
1383 * Do the work of do_wait() for one thread in the group, @tsk.
1384 *
1385 * -ECHILD should be in ->notask_error before the first call.
1386 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1387 * Returns zero if the search for a child should continue; then
1388 * ->notask_error is 0 if there were any eligible children,
1389 * or still -ECHILD.
1390 */
1391static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1392{
1393        struct task_struct *p;
1394
1395        list_for_each_entry(p, &tsk->children, sibling) {
1396                int ret = wait_consider_task(wo, 0, p);
1397
1398                if (ret)
1399                        return ret;
1400        }
1401
1402        return 0;
1403}
1404
1405static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1406{
1407        struct task_struct *p;
1408
1409        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1410                int ret = wait_consider_task(wo, 1, p);
1411
1412                if (ret)
1413                        return ret;
1414        }
1415
1416        return 0;
1417}
1418
1419static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1420                                int sync, void *key)
1421{
1422        struct wait_opts *wo = container_of(wait, struct wait_opts,
1423                                                child_wait);
1424        struct task_struct *p = key;
1425
1426        if (!eligible_pid(wo, p))
1427                return 0;
1428
1429        if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1430                return 0;
1431
1432        return default_wake_function(wait, mode, sync, key);
1433}
1434
1435void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1436{
1437        __wake_up_sync_key(&parent->signal->wait_chldexit,
1438                                TASK_INTERRUPTIBLE, 1, p);
1439}
1440
/*
 * Core of the wait family: scan the children and tracees of the calling
 * thread (and, unless __WNOTHREAD is set, of every other thread in its
 * group) for something matching @wo, sleeping until there is a result.
 *
 * Returns the nonzero value produced by wait_consider_task() (the pid of
 * the reported task) when a task was reported, -ECHILD when nothing can
 * ever match, 0 when WNOHANG is set and nothing is ready yet, or
 * -ERESTARTSYS when a signal is pending.
 */
static long do_wait(struct wait_opts *wo)
{
        struct task_struct *tsk;
        int retval;

        trace_sched_process_wait(wo->wo_pid);

        /* Queue ourselves on wait_chldexit with a filtering wake function. */
        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
        wo->child_wait.private = current;
        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
        /*
         * If there is nothing that can match our criteria, just get out.
         * We will clear ->notask_error to zero if we see any child that
         * might later match our criteria, even if we are not able to reap
         * it yet.
         */
        wo->notask_error = -ECHILD;
        if ((wo->wo_type < PIDTYPE_MAX) &&
           (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
                goto notask;

        /*
         * The task state is set before the scan rather than right before
         * schedule() - presumably so that a wakeup arriving while we scan
         * is not lost.
         */
        set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
        tsk = current;
        do {
                /* A nonzero result means tasklist_lock was already dropped. */
                retval = do_wait_thread(wo, tsk);
                if (retval)
                        goto end;

                retval = ptrace_do_wait(wo, tsk);
                if (retval)
                        goto end;

                /* __WNOTHREAD: only the calling thread's own lists count. */
                if (wo->wo_flags & __WNOTHREAD)
                        break;
        } while_each_thread(current, tsk);
        read_unlock(&tasklist_lock);

notask:
        retval = wo->notask_error;
        if (!retval && !(wo->wo_flags & WNOHANG)) {
                /* Something may match later: sleep, then scan again. */
                retval = -ERESTARTSYS;
                if (!signal_pending(current)) {
                        schedule();
                        goto repeat;
                }
        }
end:
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
}
1494
1495static struct pid *pidfd_get_pid(unsigned int fd)
1496{
1497        struct fd f;
1498        struct pid *pid;
1499
1500        f = fdget(fd);
1501        if (!f.file)
1502                return ERR_PTR(-EBADF);
1503
1504        pid = pidfd_pid(f.file);
1505        if (!IS_ERR(pid))
1506                get_pid(pid);
1507
1508        fdput(f);
1509        return pid;
1510}
1511
1512static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1513                          int options, struct rusage *ru)
1514{
1515        struct wait_opts wo;
1516        struct pid *pid = NULL;
1517        enum pid_type type;
1518        long ret;
1519
1520        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1521                        __WNOTHREAD|__WCLONE|__WALL))
1522                return -EINVAL;
1523        if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1524                return -EINVAL;
1525
1526        switch (which) {
1527        case P_ALL:
1528                type = PIDTYPE_MAX;
1529                break;
1530        case P_PID:
1531                type = PIDTYPE_PID;
1532                if (upid <= 0)
1533                        return -EINVAL;
1534
1535                pid = find_get_pid(upid);
1536                break;
1537        case P_PGID:
1538                type = PIDTYPE_PGID;
1539                if (upid < 0)
1540                        return -EINVAL;
1541
1542                if (upid)
1543                        pid = find_get_pid(upid);
1544                else
1545                        pid = get_task_pid(current, PIDTYPE_PGID);
1546                break;
1547        case P_PIDFD:
1548                type = PIDTYPE_PID;
1549                if (upid < 0)
1550                        return -EINVAL;
1551
1552                pid = pidfd_get_pid(upid);
1553                if (IS_ERR(pid))
1554                        return PTR_ERR(pid);
1555                break;
1556        default:
1557                return -EINVAL;
1558        }
1559
1560        wo.wo_type      = type;
1561        wo.wo_pid       = pid;
1562        wo.wo_flags     = options;
1563        wo.wo_info      = infop;
1564        wo.wo_rusage    = ru;
1565        ret = do_wait(&wo);
1566
1567        put_pid(pid);
1568        return ret;
1569}
1570
1571SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1572                infop, int, options, struct rusage __user *, ru)
1573{
1574        struct rusage r;
1575        struct waitid_info info = {.status = 0};
1576        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1577        int signo = 0;
1578
1579        if (err > 0) {
1580                signo = SIGCHLD;
1581                err = 0;
1582                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1583                        return -EFAULT;
1584        }
1585        if (!infop)
1586                return err;
1587
1588        if (!user_access_begin(infop, sizeof(*infop)))
1589                return -EFAULT;
1590
1591        unsafe_put_user(signo, &infop->si_signo, Efault);
1592        unsafe_put_user(0, &infop->si_errno, Efault);
1593        unsafe_put_user(info.cause, &infop->si_code, Efault);
1594        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1595        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1596        unsafe_put_user(info.status, &infop->si_status, Efault);
1597        user_access_end();
1598        return err;
1599Efault:
1600        user_access_end();
1601        return -EFAULT;
1602}
1603
1604long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1605                  struct rusage *ru)
1606{
1607        struct wait_opts wo;
1608        struct pid *pid = NULL;
1609        enum pid_type type;
1610        long ret;
1611
1612        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1613                        __WNOTHREAD|__WCLONE|__WALL))
1614                return -EINVAL;
1615
1616        /* -INT_MIN is not defined */
1617        if (upid == INT_MIN)
1618                return -ESRCH;
1619
1620        if (upid == -1)
1621                type = PIDTYPE_MAX;
1622        else if (upid < 0) {
1623                type = PIDTYPE_PGID;
1624                pid = find_get_pid(-upid);
1625        } else if (upid == 0) {
1626                type = PIDTYPE_PGID;
1627                pid = get_task_pid(current, PIDTYPE_PGID);
1628        } else /* upid > 0 */ {
1629                type = PIDTYPE_PID;
1630                pid = find_get_pid(upid);
1631        }
1632
1633        wo.wo_type      = type;
1634        wo.wo_pid       = pid;
1635        wo.wo_flags     = options | WEXITED;
1636        wo.wo_info      = NULL;
1637        wo.wo_stat      = 0;
1638        wo.wo_rusage    = ru;
1639        ret = do_wait(&wo);
1640        put_pid(pid);
1641        if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1642                ret = -EFAULT;
1643
1644        return ret;
1645}
1646
1647SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1648                int, options, struct rusage __user *, ru)
1649{
1650        struct rusage r;
1651        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1652
1653        if (err > 0) {
1654                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1655                        return -EFAULT;
1656        }
1657        return err;
1658}
1659
1660#ifdef __ARCH_WANT_SYS_WAITPID
1661
1662/*
1663 * sys_waitpid() remains for compatibility. waitpid() should be
1664 * implemented by calling sys_wait4() from libc.a.
1665 */
1666SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1667{
1668        return kernel_wait4(pid, stat_addr, options, NULL);
1669}
1670
1671#endif
1672
1673#ifdef CONFIG_COMPAT
1674COMPAT_SYSCALL_DEFINE4(wait4,
1675        compat_pid_t, pid,
1676        compat_uint_t __user *, stat_addr,
1677        int, options,
1678        struct compat_rusage __user *, ru)
1679{
1680        struct rusage r;
1681        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1682        if (err > 0) {
1683                if (ru && put_compat_rusage(&r, ru))
1684                        return -EFAULT;
1685        }
1686        return err;
1687}
1688
1689COMPAT_SYSCALL_DEFINE5(waitid,
1690                int, which, compat_pid_t, pid,
1691                struct compat_siginfo __user *, infop, int, options,
1692                struct compat_rusage __user *, uru)
1693{
1694        struct rusage ru;
1695        struct waitid_info info = {.status = 0};
1696        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1697        int signo = 0;
1698        if (err > 0) {
1699                signo = SIGCHLD;
1700                err = 0;
1701                if (uru) {
1702                        /* kernel_waitid() overwrites everything in ru */
1703                        if (COMPAT_USE_64BIT_TIME)
1704                                err = copy_to_user(uru, &ru, sizeof(ru));
1705                        else
1706                                err = put_compat_rusage(&ru, uru);
1707                        if (err)
1708                                return -EFAULT;
1709                }
1710        }
1711
1712        if (!infop)
1713                return err;
1714
1715        if (!user_access_begin(infop, sizeof(*infop)))
1716                return -EFAULT;
1717
1718        unsafe_put_user(signo, &infop->si_signo, Efault);
1719        unsafe_put_user(0, &infop->si_errno, Efault);
1720        unsafe_put_user(info.cause, &infop->si_code, Efault);
1721        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1722        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1723        unsafe_put_user(info.status, &infop->si_status, Efault);
1724        user_access_end();
1725        return err;
1726Efault:
1727        user_access_end();
1728        return -EFAULT;
1729}
1730#endif
1731
/*
 * Default abort() implementation.  It is declared __weak, so another
 * definition of abort() elsewhere takes precedence over this one.
 * It raises BUG() and, should execution continue past that, panics.
 */
__weak void abort(void)
{
        BUG();

        /* if that doesn't kill us, halt */
        panic("Oops failed to kill thread");
}
1739EXPORT_SYMBOL(abort);
1740