LXR linux/kernel/exit.c

   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/kernel/exit.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/mm.h>
   9#include <linux/slab.h>
  10#include <linux/sched/autogroup.h>
  11#include <linux/sched/mm.h>
  12#include <linux/sched/stat.h>
  13#include <linux/sched/task.h>
  14#include <linux/sched/task_stack.h>
  15#include <linux/sched/cputime.h>
  16#include <linux/interrupt.h>
  17#include <linux/module.h>
  18#include <linux/capability.h>
  19#include <linux/completion.h>
  20#include <linux/personality.h>
  21#include <linux/tty.h>
  22#include <linux/iocontext.h>
  23#include <linux/key.h>
  24#include <linux/cpu.h>
  25#include <linux/acct.h>
  26#include <linux/tsacct_kern.h>
  27#include <linux/file.h>
  28#include <linux/fdtable.h>
  29#include <linux/freezer.h>
  30#include <linux/binfmts.h>
  31#include <linux/nsproxy.h>
  32#include <linux/pid_namespace.h>
  33#include <linux/ptrace.h>
  34#include <linux/profile.h>
  35#include <linux/mount.h>
  36#include <linux/proc_fs.h>
  37#include <linux/kthread.h>
  38#include <linux/mempolicy.h>
  39#include <linux/taskstats_kern.h>
  40#include <linux/delayacct.h>
  41#include <linux/cgroup.h>
  42#include <linux/syscalls.h>
  43#include <linux/signal.h>
  44#include <linux/posix-timers.h>
  45#include <linux/cn_proc.h>
  46#include <linux/mutex.h>
  47#include <linux/futex.h>
  48#include <linux/pipe_fs_i.h>
  49#include <linux/audit.h> /* for audit_free() */
  50#include <linux/resource.h>
  51#include <linux/blkdev.h>
  52#include <linux/task_io_accounting_ops.h>
  53#include <linux/tracehook.h>
  54#include <linux/fs_struct.h>
  55#include <linux/init_task.h>
  56#include <linux/perf_event.h>
  57#include <trace/events/sched.h>
  58#include <linux/hw_breakpoint.h>
  59#include <linux/oom.h>
  60#include <linux/writeback.h>
  61#include <linux/shm.h>
  62#include <linux/kcov.h>
  63#include <linux/random.h>
  64#include <linux/rcuwait.h>
  65#include <linux/compat.h>
  66
  67#include <linux/uaccess.h>
  68#include <asm/unistd.h>
  69#include <asm/pgtable.h>
  70#include <asm/mmu_context.h>
  71
  72static void __unhash_process(struct task_struct *p, bool group_dead)
  73{
  74        nr_threads--;
  75        detach_pid(p, PIDTYPE_PID);
  76        if (group_dead) {
  77                detach_pid(p, PIDTYPE_TGID);
  78                detach_pid(p, PIDTYPE_PGID);
  79                detach_pid(p, PIDTYPE_SID);
  80
  81                list_del_rcu(&p->tasks);
  82                list_del_init(&p->sibling);
  83                __this_cpu_dec(process_counts);
  84        }
  85        list_del_rcu(&p->thread_group);
  86        list_del_rcu(&p->thread_node);
  87}
  88
  89/*
  90 * This function expects the tasklist_lock write-locked.
  91 */
  92static void __exit_signal(struct task_struct *tsk)
  93{
  94        struct signal_struct *sig = tsk->signal;
  95        bool group_dead = thread_group_leader(tsk);
  96        struct sighand_struct *sighand;
  97        struct tty_struct *uninitialized_var(tty);
  98        u64 utime, stime;
  99
 100        sighand = rcu_dereference_check(tsk->sighand,
 101                                        lockdep_tasklist_lock_is_held());
 102        spin_lock(&sighand->siglock);
 103
 104#ifdef CONFIG_POSIX_TIMERS
 105        posix_cpu_timers_exit(tsk);
 106        if (group_dead) {
 107                posix_cpu_timers_exit_group(tsk);
 108        } else {
 109                /*
 110                 * This can only happen if the caller is de_thread().
 111                 * FIXME: this is the temporary hack, we should teach
 112                 * posix-cpu-timers to handle this case correctly.
 113                 */
 114                if (unlikely(has_group_leader_pid(tsk)))
 115                        posix_cpu_timers_exit_group(tsk);
 116        }
 117#endif
 118
 119        if (group_dead) {
 120                tty = sig->tty;
 121                sig->tty = NULL;
 122        } else {
 123                /*
 124                 * If there is any task waiting for the group exit
 125                 * then notify it:
 126                 */
 127                if (sig->notify_count > 0 && !--sig->notify_count)
 128                        wake_up_process(sig->group_exit_task);
 129
 130                if (tsk == sig->curr_target)
 131                        sig->curr_target = next_thread(tsk);
 132        }
 133
 134        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 135                              sizeof(unsigned long long));
 136
 137        /*
 138         * Accumulate here the counters for all threads as they die. We could
 139         * skip the group leader because it is the last user of signal_struct,
 140         * but we want to avoid the race with thread_group_cputime() which can
 141         * see the empty ->thread_head list.
 142         */
 143        task_cputime(tsk, &utime, &stime);
 144        write_seqlock(&sig->stats_lock);
 145        sig->utime += utime;
 146        sig->stime += stime;
 147        sig->gtime += task_gtime(tsk);
 148        sig->min_flt += tsk->min_flt;
 149        sig->maj_flt += tsk->maj_flt;
 150        sig->nvcsw += tsk->nvcsw;
 151        sig->nivcsw += tsk->nivcsw;
 152        sig->inblock += task_io_get_inblock(tsk);
 153        sig->oublock += task_io_get_oublock(tsk);
 154        task_io_accounting_add(&sig->ioac, &tsk->ioac);
 155        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 156        sig->nr_threads--;
 157        __unhash_process(tsk, group_dead);
 158        write_sequnlock(&sig->stats_lock);
 159
 160        /*
 161         * Do this under ->siglock, we can race with another thread
 162         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 163         */
 164        flush_sigqueue(&tsk->pending);
 165        tsk->sighand = NULL;
 166        spin_unlock(&sighand->siglock);
 167
 168        __cleanup_sighand(sighand);
 169        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 170        if (group_dead) {
 171                flush_sigqueue(&sig->shared_pending);
 172                tty_kref_put(tty);
 173        }
 174}
 175
 176static void delayed_put_task_struct(struct rcu_head *rhp)
 177{
 178        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 179
 180        perf_event_delayed_put(tsk);
 181        trace_sched_process_free(tsk);
 182        put_task_struct(tsk);
 183}
 184
 185void put_task_struct_rcu_user(struct task_struct *task)
 186{
 187        if (refcount_dec_and_test(&task->rcu_users))
 188                call_rcu(&task->rcu, delayed_put_task_struct);
 189}
 190
 191void release_task(struct task_struct *p)
 192{
 193        struct task_struct *leader;
 194        int zap_leader;
 195repeat:
 196        /* don't need to get the RCU readlock here - the process is dead and
 197         * can't be modifying its own credentials. But shut RCU-lockdep up */
 198        rcu_read_lock();
 199        atomic_dec(&__task_cred(p)->user->processes);
 200        rcu_read_unlock();
 201
 202        proc_flush_task(p);
 203        cgroup_release(p);
 204
 205        write_lock_irq(&tasklist_lock);
 206        ptrace_release_task(p);
 207        __exit_signal(p);
 208
 209        /*
 210         * If we are the last non-leader member of the thread
 211         * group, and the leader is zombie, then notify the
 212         * group leader's parent process. (if it wants notification.)
 213         */
 214        zap_leader = 0;
 215        leader = p->group_leader;
 216        if (leader != p && thread_group_empty(leader)
 217                        && leader->exit_state == EXIT_ZOMBIE) {
 218                /*
 219                 * If we were the last child thread and the leader has
 220                 * exited already, and the leader's parent ignores SIGCHLD,
 221                 * then we are the one who should release the leader.
 222                 */
 223                zap_leader = do_notify_parent(leader, leader->exit_signal);
 224                if (zap_leader)
 225                        leader->exit_state = EXIT_DEAD;
 226        }
 227
 228        write_unlock_irq(&tasklist_lock);
 229        release_thread(p);
 230        put_task_struct_rcu_user(p);
 231
 232        p = leader;
 233        if (unlikely(zap_leader))
 234                goto repeat;
 235}
 236
 237void rcuwait_wake_up(struct rcuwait *w)
 238{
 239        struct task_struct *task;
 240
 241        rcu_read_lock();
 242
 243        /*
 244         * Order condition vs @task, such that everything prior to the load
 245         * of @task is visible. This is the condition as to why the user called
 246         * rcuwait_trywake() in the first place. Pairs with set_current_state()
 247         * barrier (A) in rcuwait_wait_event().
 248         *
 249         *    WAIT                WAKE
 250         *    [S] tsk = current   [S] cond = true
 251         *        MB (A)              MB (B)
 252         *    [L] cond            [L] tsk
 253         */
 254        smp_mb(); /* (B) */
 255
 256        task = rcu_dereference(w->task);
 257        if (task)
 258                wake_up_process(task);
 259        rcu_read_unlock();
 260}
 261
 262/*
 263 * Determine if a process group is "orphaned", according to the POSIX
 264 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 265 * by terminal-generated stop signals.  Newly orphaned process groups are
 266 * to receive a SIGHUP and a SIGCONT.
 267 *
 268 * "I ask you, have you ever known what it is to be an orphan?"
 269 */
 270static int will_become_orphaned_pgrp(struct pid *pgrp,
 271                                        struct task_struct *ignored_task)
 272{
 273        struct task_struct *p;
 274
 275        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 276                if ((p == ignored_task) ||
 277                    (p->exit_state && thread_group_empty(p)) ||
 278                    is_global_init(p->real_parent))
 279                        continue;
 280
 281                if (task_pgrp(p->real_parent) != pgrp &&
 282                    task_session(p->real_parent) == task_session(p))
 283                        return 0;
 284        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 285
 286        return 1;
 287}
 288
 289int is_current_pgrp_orphaned(void)
 290{
 291        int retval;
 292
 293        read_lock(&tasklist_lock);
 294        retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
 295        read_unlock(&tasklist_lock);
 296
 297        return retval;
 298}
 299
 300static bool has_stopped_jobs(struct pid *pgrp)
 301{
 302        struct task_struct *p;
 303
 304        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 305                if (p->signal->flags & SIGNAL_STOP_STOPPED)
 306                        return true;
 307        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 308
 309        return false;
 310}
 311
 312/*
 313 * Check to see if any process groups have become orphaned as
 314 * a result of our exiting, and if they have any stopped jobs,
 315 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 316 */
 317static void
 318kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 319{
 320        struct pid *pgrp = task_pgrp(tsk);
 321        struct task_struct *ignored_task = tsk;
 322
 323        if (!parent)
 324                /* exit: our father is in a different pgrp than
 325                 * we are and we were the only connection outside.
 326                 */
 327                parent = tsk->real_parent;
 328        else
 329                /* reparent: our child is in a different pgrp than
 330                 * we are, and it was the only connection outside.
 331                 */
 332                ignored_task = NULL;
 333
 334        if (task_pgrp(parent) != pgrp &&
 335            task_session(parent) == task_session(tsk) &&
 336            will_become_orphaned_pgrp(pgrp, ignored_task) &&
 337            has_stopped_jobs(pgrp)) {
 338                __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 339                __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 340        }
 341}
 342
 343#ifdef CONFIG_MEMCG
 344/*
 345 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 346 */
 347void mm_update_next_owner(struct mm_struct *mm)
 348{
 349        struct task_struct *c, *g, *p = current;
 350
 351retry:
 352        /*
 353         * If the exiting or execing task is not the owner, it's
 354         * someone else's problem.
 355         */
 356        if (mm->owner != p)
 357                return;
 358        /*
 359         * The current owner is exiting/execing and there are no other
 360         * candidates.  Do not leave the mm pointing to a possibly
 361         * freed task structure.
 362         */
 363        if (atomic_read(&mm->mm_users) <= 1) {
 364                WRITE_ONCE(mm->owner, NULL);
 365                return;
 366        }
 367
 368        read_lock(&tasklist_lock);
 369        /*
 370         * Search in the children
 371         */
 372        list_for_each_entry(c, &p->children, sibling) {
 373                if (c->mm == mm)
 374                        goto assign_new_owner;
 375        }
 376
 377        /*
 378         * Search in the siblings
 379         */
 380        list_for_each_entry(c, &p->real_parent->children, sibling) {
 381                if (c->mm == mm)
 382                        goto assign_new_owner;
 383        }
 384
 385        /*
 386         * Search through everything else, we should not get here often.
 387         */
 388        for_each_process(g) {
 389                if (g->flags & PF_KTHREAD)
 390                        continue;
 391                for_each_thread(g, c) {
 392                        if (c->mm == mm)
 393                                goto assign_new_owner;
 394                        if (c->mm)
 395                                break;
 396                }
 397        }
 398        read_unlock(&tasklist_lock);
 399        /*
 400         * We found no owner yet mm_users > 1: this implies that we are
 401         * most likely racing with swapoff (try_to_unuse()) or /proc or
 402         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 403         */
 404        WRITE_ONCE(mm->owner, NULL);
 405        return;
 406
 407assign_new_owner:
 408        BUG_ON(c == p);
 409        get_task_struct(c);
 410        /*
 411         * The task_lock protects c->mm from changing.
 412         * We always want mm->owner->mm == mm
 413         */
 414        task_lock(c);
 415        /*
 416         * Delay read_unlock() till we have the task_lock()
 417         * to ensure that c does not slip away underneath us
 418         */
 419        read_unlock(&tasklist_lock);
 420        if (c->mm != mm) {
 421                task_unlock(c);
 422                put_task_struct(c);
 423                goto retry;
 424        }
 425        WRITE_ONCE(mm->owner, c);
 426        task_unlock(c);
 427        put_task_struct(c);
 428}
 429#endif /* CONFIG_MEMCG */
 430
 431/*
 432 * Turn us into a lazy TLB process if we
 433 * aren't already..
 434 */
 435static void exit_mm(void)
 436{
 437        struct mm_struct *mm = current->mm;
 438        struct core_state *core_state;
 439
 440        exit_mm_release(current, mm);
 441        if (!mm)
 442                return;
 443        sync_mm_rss(mm);
 444        /*
 445         * Serialize with any possible pending coredump.
 446         * We must hold mmap_sem around checking core_state
 447         * and clearing tsk->mm.  The core-inducing thread
 448         * will increment ->nr_threads for each thread in the
 449         * group with ->mm != NULL.
 450         */
 451        down_read(&mm->mmap_sem);
 452        core_state = mm->core_state;
 453        if (core_state) {
 454                struct core_thread self;
 455
 456                up_read(&mm->mmap_sem);
 457
 458                self.task = current;
 459                self.next = xchg(&core_state->dumper.next, &self);
 460                /*
 461                 * Implies mb(), the result of xchg() must be visible
 462                 * to core_state->dumper.
 463                 */
 464                if (atomic_dec_and_test(&core_state->nr_threads))
 465                        complete(&core_state->startup);
 466
 467                for (;;) {
 468                        set_current_state(TASK_UNINTERRUPTIBLE);
 469                        if (!self.task) /* see coredump_finish() */
 470                                break;
 471                        freezable_schedule();
 472                }
 473                __set_current_state(TASK_RUNNING);
 474                down_read(&mm->mmap_sem);
 475        }
 476        mmgrab(mm);
 477        BUG_ON(mm != current->active_mm);
 478        /* more a memory barrier than a real lock */
 479        task_lock(current);
 480        current->mm = NULL;
 481        up_read(&mm->mmap_sem);
 482        enter_lazy_tlb(mm, current);
 483        task_unlock(current);
 484        mm_update_next_owner(mm);
 485        mmput(mm);
 486        if (test_thread_flag(TIF_MEMDIE))
 487                exit_oom_victim();
 488}
 489
 490static struct task_struct *find_alive_thread(struct task_struct *p)
 491{
 492        struct task_struct *t;
 493
 494        for_each_thread(p, t) {
 495                if (!(t->flags & PF_EXITING))
 496                        return t;
 497        }
 498        return NULL;
 499}
 500
 501static struct task_struct *find_child_reaper(struct task_struct *father,
 502                                                struct list_head *dead)
 503        __releases(&tasklist_lock)
 504        __acquires(&tasklist_lock)
 505{
 506        struct pid_namespace *pid_ns = task_active_pid_ns(father);
 507        struct task_struct *reaper = pid_ns->child_reaper;
 508        struct task_struct *p, *n;
 509
 510        if (likely(reaper != father))
 511                return reaper;
 512
 513        reaper = find_alive_thread(father);
 514        if (reaper) {
 515                pid_ns->child_reaper = reaper;
 516                return reaper;
 517        }
 518
 519        write_unlock_irq(&tasklist_lock);
 520
 521        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
 522                list_del_init(&p->ptrace_entry);
 523                release_task(p);
 524        }
 525
 526        zap_pid_ns_processes(pid_ns);
 527        write_lock_irq(&tasklist_lock);
 528
 529        return father;
 530}
 531
 532/*
 533 * When we die, we re-parent all our children, and try to:
 534 * 1. give them to another thread in our thread group, if such a member exists
 535 * 2. give it to the first ancestor process which prctl'd itself as a
 536 *    child_subreaper for its children (like a service manager)
 537 * 3. give it to the init process (PID 1) in our pid namespace
 538 */
 539static struct task_struct *find_new_reaper(struct task_struct *father,
 540                                           struct task_struct *child_reaper)
 541{
 542        struct task_struct *thread, *reaper;
 543
 544        thread = find_alive_thread(father);
 545        if (thread)
 546                return thread;
 547
 548        if (father->signal->has_child_subreaper) {
 549                unsigned int ns_level = task_pid(father)->level;
 550                /*
 551                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
 552                 * We can't check reaper != child_reaper to ensure we do not
 553                 * cross the namespaces, the exiting parent could be injected
 554                 * by setns() + fork().
 555                 * We check pid->level, this is slightly more efficient than
 556                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
 557                 */
 558                for (reaper = father->real_parent;
 559                     task_pid(reaper)->level == ns_level;
 560                     reaper = reaper->real_parent) {
 561                        if (reaper == &init_task)
 562                                break;
 563                        if (!reaper->signal->is_child_subreaper)
 564                                continue;
 565                        thread = find_alive_thread(reaper);
 566                        if (thread)
 567                                return thread;
 568                }
 569        }
 570
 571        return child_reaper;
 572}
 573
 574/*
 575* Any that need to be release_task'd are put on the @dead list.
 576 */
 577static void reparent_leader(struct task_struct *father, struct task_struct *p,
 578                                struct list_head *dead)
 579{
 580        if (unlikely(p->exit_state == EXIT_DEAD))
 581                return;
 582
 583        /* We don't want people slaying init. */
 584        p->exit_signal = SIGCHLD;
 585
 586        /* If it has exited notify the new parent about this child's death. */
 587        if (!p->ptrace &&
 588            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 589                if (do_notify_parent(p, p->exit_signal)) {
 590                        p->exit_state = EXIT_DEAD;
 591                        list_add(&p->ptrace_entry, dead);
 592                }
 593        }
 594
 595        kill_orphaned_pgrp(p, father);
 596}
 597
 598/*
 599 * This does two things:
 600 *
 601 * A.  Make init inherit all the child processes
 602 * B.  Check to see if any process groups have become orphaned
 603 *      as a result of our exiting, and if they have any stopped
 604 *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 605 */
 606static void forget_original_parent(struct task_struct *father,
 607                                        struct list_head *dead)
 608{
 609        struct task_struct *p, *t, *reaper;
 610
 611        if (unlikely(!list_empty(&father->ptraced)))
 612                exit_ptrace(father, dead);
 613
 614        /* Can drop and reacquire tasklist_lock */
 615        reaper = find_child_reaper(father, dead);
 616        if (list_empty(&father->children))
 617                return;
 618
 619        reaper = find_new_reaper(father, reaper);
 620        list_for_each_entry(p, &father->children, sibling) {
 621                for_each_thread(p, t) {
 622                        t->real_parent = reaper;
 623                        BUG_ON((!t->ptrace) != (t->parent == father));
 624                        if (likely(!t->ptrace))
 625                                t->parent = t->real_parent;
 626                        if (t->pdeath_signal)
 627                                group_send_sig_info(t->pdeath_signal,
 628                                                    SEND_SIG_NOINFO, t,
 629                                                    PIDTYPE_TGID);
 630                }
 631                /*
 632                 * If this is a threaded reparent there is no need to
 633                 * notify anyone anything has happened.
 634                 */
 635                if (!same_thread_group(reaper, father))
 636                        reparent_leader(father, p, dead);
 637        }
 638        list_splice_tail_init(&father->children, &reaper->children);
 639}
 640
 641/*
 642 * Send signals to all our closest relatives so that they know
 643 * to properly mourn us..
 644 */
 645static void exit_notify(struct task_struct *tsk, int group_dead)
 646{
 647        bool autoreap;
 648        struct task_struct *p, *n;
 649        LIST_HEAD(dead);
 650
 651        write_lock_irq(&tasklist_lock);
 652        forget_original_parent(tsk, &dead);
 653
 654        if (group_dead)
 655                kill_orphaned_pgrp(tsk->group_leader, NULL);
 656
 657        tsk->exit_state = EXIT_ZOMBIE;
 658        if (unlikely(tsk->ptrace)) {
 659                int sig = thread_group_leader(tsk) &&
 660                                thread_group_empty(tsk) &&
 661                                !ptrace_reparented(tsk) ?
 662                        tsk->exit_signal : SIGCHLD;
 663                autoreap = do_notify_parent(tsk, sig);
 664        } else if (thread_group_leader(tsk)) {
 665                autoreap = thread_group_empty(tsk) &&
 666                        do_notify_parent(tsk, tsk->exit_signal);
 667        } else {
 668                autoreap = true;
 669        }
 670
 671        if (autoreap) {
 672                tsk->exit_state = EXIT_DEAD;
 673                list_add(&tsk->ptrace_entry, &dead);
 674        }
 675
 676        /* mt-exec, de_thread() is waiting for group leader */
 677        if (unlikely(tsk->signal->notify_count < 0))
 678                wake_up_process(tsk->signal->group_exit_task);
 679        write_unlock_irq(&tasklist_lock);
 680
 681        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 682                list_del_init(&p->ptrace_entry);
 683                release_task(p);
 684        }
 685}
 686
 687#ifdef CONFIG_DEBUG_STACK_USAGE
 688static void check_stack_usage(void)
 689{
 690        static DEFINE_SPINLOCK(low_water_lock);
 691        static int lowest_to_date = THREAD_SIZE;
 692        unsigned long free;
 693
 694        free = stack_not_used(current);
 695
 696        if (free >= lowest_to_date)
 697                return;
 698
 699        spin_lock(&low_water_lock);
 700        if (free < lowest_to_date) {
 701                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
 702                        current->comm, task_pid_nr(current), free);
 703                lowest_to_date = free;
 704        }
 705        spin_unlock(&low_water_lock);
 706}
 707#else
 708static inline void check_stack_usage(void) {}
 709#endif
 710
 711void __noreturn do_exit(long code)
 712{
 713        struct task_struct *tsk = current;
 714        int group_dead;
 715
 716        profile_task_exit(tsk);
 717        kcov_task_exit(tsk);
 718
 719        WARN_ON(blk_needs_flush_plug(tsk));
 720
 721        if (unlikely(in_interrupt()))
 722                panic("Aiee, killing interrupt handler!");
 723        if (unlikely(!tsk->pid))
 724                panic("Attempted to kill the idle task!");
 725
 726        /*
 727         * If do_exit is called because this processes oopsed, it's possible
 728         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
 729         * continuing. Amongst other possible reasons, this is to prevent
 730         * mm_release()->clear_child_tid() from writing to a user-controlled
 731         * kernel address.
 732         */
 733        set_fs(USER_DS);
 734
 735        ptrace_event(PTRACE_EVENT_EXIT, code);
 736
 737        validate_creds_for_do_exit(tsk);
 738
 739        /*
 740         * We're taking recursive faults here in do_exit. Safest is to just
 741         * leave this task alone and wait for reboot.
 742         */
 743        if (unlikely(tsk->flags & PF_EXITING)) {
 744                pr_alert("Fixing recursive fault but reboot is needed!\n");
 745                futex_exit_recursive(tsk);
 746                set_current_state(TASK_UNINTERRUPTIBLE);
 747                schedule();
 748        }
 749
 750        exit_signals(tsk);  /* sets PF_EXITING */
 751
 752        if (unlikely(in_atomic())) {
 753                pr_info("note: %s[%d] exited with preempt_count %d\n",
 754                        current->comm, task_pid_nr(current),
 755                        preempt_count());
 756                preempt_count_set(PREEMPT_ENABLED);
 757        }
 758
 759        /* sync mm's RSS info before statistics gathering */
 760        if (tsk->mm)
 761                sync_mm_rss(tsk->mm);
 762        acct_update_integrals(tsk);
 763        group_dead = atomic_dec_and_test(&tsk->signal->live);
 764        if (group_dead) {
 765                /*
 766                 * If the last thread of global init has exited, panic
 767                 * immediately to get a useable coredump.
 768                 */
 769                if (unlikely(is_global_init(tsk)))
 770                        panic("Attempted to kill init! exitcode=0x%08x\n",
 771                                tsk->signal->group_exit_code ?: (int)code);
 772
 773#ifdef CONFIG_POSIX_TIMERS
 774                hrtimer_cancel(&tsk->signal->real_timer);
 775                exit_itimers(tsk->signal);
 776#endif
 777                if (tsk->mm)
 778                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 779        }
 780        acct_collect(code, group_dead);
 781        if (group_dead)
 782                tty_audit_exit();
 783        audit_free(tsk);
 784
 785        tsk->exit_code = code;
 786        taskstats_exit(tsk, group_dead);
 787
 788        exit_mm();
 789
 790        if (group_dead)
 791                acct_process();
 792        trace_sched_process_exit(tsk);
 793
 794        exit_sem(tsk);
 795        exit_shm(tsk);
 796        exit_files(tsk);
 797        exit_fs(tsk);
 798        if (group_dead)
 799                disassociate_ctty(1);
 800        exit_task_namespaces(tsk);
 801        exit_task_work(tsk);
 802        exit_thread(tsk);
 803        exit_umh(tsk);
 804
 805        /*
 806         * Flush inherited counters to the parent - before the parent
 807         * gets woken up by child-exit notifications.
 808         *
 809         * because of cgroup mode, must be called before cgroup_exit()
 810         */
 811        perf_event_exit_task(tsk);
 812
 813        sched_autogroup_exit_task(tsk);
 814        cgroup_exit(tsk);
 815
 816        /*
 817         * FIXME: do that only when needed, using sched_exit tracepoint
 818         */
 819        flush_ptrace_hw_breakpoint(tsk);
 820
 821        exit_tasks_rcu_start();
 822        exit_notify(tsk, group_dead);
 823        proc_exit_connector(tsk);
 824        mpol_put_task_policy(tsk);
 825#ifdef CONFIG_FUTEX
 826        if (unlikely(current->pi_state_cache))
 827                kfree(current->pi_state_cache);
 828#endif
 829        /*
 830         * Make sure we are holding no locks:
 831         */
 832        debug_check_no_locks_held();
 833
 834        if (tsk->io_context)
 835                exit_io_context(tsk);
 836
 837        if (tsk->splice_pipe)
 838                free_pipe_info(tsk->splice_pipe);
 839
 840        if (tsk->task_frag.page)
 841                put_page(tsk->task_frag.page);
 842
 843        validate_creds_for_do_exit(tsk);
 844
 845        check_stack_usage();
 846        preempt_disable();
 847        if (tsk->nr_dirtied)
 848                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 849        exit_rcu();
 850        exit_tasks_rcu_finish();
 851
 852        lockdep_free_task(tsk);
 853        do_task_dead();
 854}
 855EXPORT_SYMBOL_GPL(do_exit);
 856
 857void complete_and_exit(struct completion *comp, long code)
 858{
 859        if (comp)
 860                complete(comp);
 861
 862        do_exit(code);
 863}
 864EXPORT_SYMBOL(complete_and_exit);
 865
 866SYSCALL_DEFINE1(exit, int, error_code)
 867{
 868        do_exit((error_code&0xff)<<8);
 869}
 870
 871/*
 872 * Take down every thread in the group.  This is called by fatal signals
 873 * as well as by sys_exit_group (below).
 874 */
 875void
 876do_group_exit(int exit_code)
 877{
 878        struct signal_struct *sig = current->signal;
 879
 880        BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 881
 882        if (signal_group_exit(sig))
 883                exit_code = sig->group_exit_code;
 884        else if (!thread_group_empty(current)) {
 885                struct sighand_struct *const sighand = current->sighand;
 886
 887                spin_lock_irq(&sighand->siglock);
 888                if (signal_group_exit(sig))
 889                        /* Another thread got here before we took the lock.  */
 890                        exit_code = sig->group_exit_code;
 891                else {
 892                        sig->group_exit_code = exit_code;
 893                        sig->flags = SIGNAL_GROUP_EXIT;
 894                        zap_other_threads(current);
 895                }
 896                spin_unlock_irq(&sighand->siglock);
 897        }
 898
 899        do_exit(exit_code);
 900        /* NOTREACHED */
 901}
 902
 903/*
 904 * this kills every thread in the thread group. Note that any externally
 905 * wait4()-ing process will get the correct exit code - even if this
 906 * thread is not the thread group leader.
 907 */
 908SYSCALL_DEFINE1(exit_group, int, error_code)
 909{
 910        do_group_exit((error_code & 0xff) << 8);
 911        /* NOTREACHED */
 912        return 0;
 913}
 914
 915struct waitid_info {
 916        pid_t pid;
 917        uid_t uid;
 918        int status;
 919        int cause;
 920};
 921
 922struct wait_opts {
 923        enum pid_type           wo_type;
 924        int                     wo_flags;
 925        struct pid              *wo_pid;
 926
 927        struct waitid_info      *wo_info;
 928        int                     wo_stat;
 929        struct rusage           *wo_rusage;
 930
 931        wait_queue_entry_t              child_wait;
 932        int                     notask_error;
 933};
 934
 935static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 936{
 937        return  wo->wo_type == PIDTYPE_MAX ||
 938                task_pid_type(p, wo->wo_type) == wo->wo_pid;
 939}
 940
 941static int
 942eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 943{
 944        if (!eligible_pid(wo, p))
 945                return 0;
 946
 947        /*
 948         * Wait for all children (clone and not) if __WALL is set or
 949         * if it is traced by us.
 950         */
 951        if (ptrace || (wo->wo_flags & __WALL))
 952                return 1;
 953
 954        /*
 955         * Otherwise, wait for clone children *only* if __WCLONE is set;
 956         * otherwise, wait for non-clone children *only*.
 957         *
 958         * Note: a "clone" child here is one that reports to its parent
 959         * using a signal other than SIGCHLD, or a non-leader thread which
 960         * we can only see if it is traced by us.
 961         */
 962        if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
 963                return 0;
 964
 965        return 1;
 966}
 967
 968/*
 969 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 970 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 971 * the lock and this task is uninteresting.  If we return nonzero, we have
 972 * released the lock and the system call should return.
 973 */
 974static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 975{
 976        int state, status;
 977        pid_t pid = task_pid_vnr(p);
 978        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
 979        struct waitid_info *infop;
 980
 981        if (!likely(wo->wo_flags & WEXITED))
 982                return 0;
 983
 984        if (unlikely(wo->wo_flags & WNOWAIT)) {
 985                status = p->exit_code;
 986                get_task_struct(p);
 987                read_unlock(&tasklist_lock);
 988                sched_annotate_sleep();
 989                if (wo->wo_rusage)
 990                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 991                put_task_struct(p);
 992                goto out_info;
 993        }
 994        /*
 995         * Move the task's state to DEAD/TRACE, only one thread can do this.
 996         */
 997        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
 998                EXIT_TRACE : EXIT_DEAD;
 999        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1000                return 0;

1001        /*
1002         * We own this thread, nobody else can reap it.
1003         */
1004        read_unlock(&tasklist_lock);
1005        sched_annotate_sleep();
1006
1007        /*
1008         * Check thread_group_leader() to exclude the traced sub-threads.
1009         */
1010        if (state == EXIT_DEAD && thread_group_leader(p)) {
1011                struct signal_struct *sig = p->signal;
1012                struct signal_struct *psig = current->signal;
1013                unsigned long maxrss;
1014                u64 tgutime, tgstime;
1015
1016                /*
1017                 * The resource counters for the group leader are in its
1018                 * own task_struct.  Those for dead threads in the group
1019                 * are in its signal_struct, as are those for the child
1020                 * processes it has previously reaped.  All these
1021                 * accumulate in the parent's signal_struct c* fields.
1022                 *
1023                 * We don't bother to take a lock here to protect these
1024                 * p->signal fields because the whole thread group is dead
1025                 * and nobody can change them.
1026                 *
1027                 * psig->stats_lock also protects us from our sub-theads
1028                 * which can reap other children at the same time. Until
1029                 * we change k_getrusage()-like users to rely on this lock
1030                 * we have to take ->siglock as well.
1031                 *
1032                 * We use thread_group_cputime_adjusted() to get times for
1033                 * the thread group, which consolidates times for all threads
1034                 * in the group including the group leader.
1035                 */
1036                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1037                spin_lock_irq(&current->sighand->siglock);
1038                write_seqlock(&psig->stats_lock);
1039                psig->cutime += tgutime + sig->cutime;
1040                psig->cstime += tgstime + sig->cstime;
1041                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1042                psig->cmin_flt +=
1043                        p->min_flt + sig->min_flt + sig->cmin_flt;
1044                psig->cmaj_flt +=
1045                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1046                psig->cnvcsw +=
1047                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
1048                psig->cnivcsw +=
1049                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
1050                psig->cinblock +=
1051                        task_io_get_inblock(p) +
1052                        sig->inblock + sig->cinblock;
1053                psig->coublock +=
1054                        task_io_get_oublock(p) +
1055                        sig->oublock + sig->coublock;
1056                maxrss = max(sig->maxrss, sig->cmaxrss);
1057                if (psig->cmaxrss < maxrss)
1058                        psig->cmaxrss = maxrss;
1059                task_io_accounting_add(&psig->ioac, &p->ioac);
1060                task_io_accounting_add(&psig->ioac, &sig->ioac);
1061                write_sequnlock(&psig->stats_lock);
1062                spin_unlock_irq(&current->sighand->siglock);
1063        }
1064
1065        if (wo->wo_rusage)
1066                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1067        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1068                ? p->signal->group_exit_code : p->exit_code;
1069        wo->wo_stat = status;
1070
1071        if (state == EXIT_TRACE) {
1072                write_lock_irq(&tasklist_lock);
1073                /* We dropped tasklist, ptracer could die and untrace */
1074                ptrace_unlink(p);
1075
1076                /* If parent wants a zombie, don't release it now */
1077                state = EXIT_ZOMBIE;
1078                if (do_notify_parent(p, p->exit_signal))
1079                        state = EXIT_DEAD;
1080                p->exit_state = state;
1081                write_unlock_irq(&tasklist_lock);
1082        }
1083        if (state == EXIT_DEAD)
1084                release_task(p);
1085
1086out_info:
1087        infop = wo->wo_info;
1088        if (infop) {
1089                if ((status & 0x7f) == 0) {
1090                        infop->cause = CLD_EXITED;
1091                        infop->status = status >> 8;
1092                } else {
1093                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1094                        infop->status = status & 0x7f;
1095                }
1096                infop->pid = pid;
1097                infop->uid = uid;
1098        }
1099
1100        return pid;
1101}
1102
1103static int *task_stopped_code(struct task_struct *p, bool ptrace)
1104{
1105        if (ptrace) {
1106                if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1107                        return &p->exit_code;
1108        } else {
1109                if (p->signal->flags & SIGNAL_STOP_STOPPED)
1110                        return &p->signal->group_exit_code;
1111        }
1112        return NULL;
1113}
1114
1115/**
1116 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1117 * @wo: wait options
1118 * @ptrace: is the wait for ptrace
1119 * @p: task to wait for
1120 *
1121 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1122 *
1123 * CONTEXT:
1124 * read_lock(&tasklist_lock), which is released if return value is
1125 * non-zero.  Also, grabs and releases @p->sighand->siglock.
1126 *
1127 * RETURNS:
1128 * 0 if wait condition didn't exist and search for other wait conditions
1129 * should continue.  Non-zero return, -errno on failure and @p's pid on
1130 * success, implies that tasklist_lock is released and wait condition
1131 * search should terminate.
1132 */
1133static int wait_task_stopped(struct wait_opts *wo,
1134                                int ptrace, struct task_struct *p)
1135{
1136        struct waitid_info *infop;
1137        int exit_code, *p_code, why;
1138        uid_t uid = 0; /* unneeded, required by compiler */
1139        pid_t pid;
1140
1141        /*
1142         * Traditionally we see ptrace'd stopped tasks regardless of options.
1143         */
1144        if (!ptrace && !(wo->wo_flags & WUNTRACED))
1145                return 0;
1146
1147        if (!task_stopped_code(p, ptrace))
1148                return 0;
1149
1150        exit_code = 0;
1151        spin_lock_irq(&p->sighand->siglock);
1152
1153        p_code = task_stopped_code(p, ptrace);
1154        if (unlikely(!p_code))
1155                goto unlock_sig;
1156
1157        exit_code = *p_code;
1158        if (!exit_code)
1159                goto unlock_sig;
1160
1161        if (!unlikely(wo->wo_flags & WNOWAIT))
1162                *p_code = 0;
1163
1164        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1165unlock_sig:
1166        spin_unlock_irq(&p->sighand->siglock);
1167        if (!exit_code)
1168                return 0;
1169
1170        /*
1171         * Now we are pretty sure this task is interesting.
1172         * Make sure it doesn't get reaped out from under us while we
1173         * give up the lock and then examine it below.  We don't want to
1174         * keep holding onto the tasklist_lock while we call getrusage and
1175         * possibly take page faults for user memory.
1176         */
1177        get_task_struct(p);
1178        pid = task_pid_vnr(p);
1179        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1180        read_unlock(&tasklist_lock);
1181        sched_annotate_sleep();
1182        if (wo->wo_rusage)
1183                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1184        put_task_struct(p);
1185
1186        if (likely(!(wo->wo_flags & WNOWAIT)))
1187                wo->wo_stat = (exit_code << 8) | 0x7f;
1188
1189        infop = wo->wo_info;
1190        if (infop) {
1191                infop->cause = why;
1192                infop->status = exit_code;
1193                infop->pid = pid;
1194                infop->uid = uid;
1195        }
1196        return pid;
1197}
1198
1199/*
1200 * Handle do_wait work for one task in a live, non-stopped state.
1201 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1202 * the lock and this task is uninteresting.  If we return nonzero, we have
1203 * released the lock and the system call should return.
1204 */
1205static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1206{
1207        struct waitid_info *infop;
1208        pid_t pid;
1209        uid_t uid;
1210
1211        if (!unlikely(wo->wo_flags & WCONTINUED))
1212                return 0;
1213
1214        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1215                return 0;
1216
1217        spin_lock_irq(&p->sighand->siglock);
1218        /* Re-check with the lock held.  */
1219        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1220                spin_unlock_irq(&p->sighand->siglock);
1221                return 0;
1222        }
1223        if (!unlikely(wo->wo_flags & WNOWAIT))
1224                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1225        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1226        spin_unlock_irq(&p->sighand->siglock);
1227
1228        pid = task_pid_vnr(p);
1229        get_task_struct(p);
1230        read_unlock(&tasklist_lock);
1231        sched_annotate_sleep();
1232        if (wo->wo_rusage)
1233                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1234        put_task_struct(p);
1235
1236        infop = wo->wo_info;
1237        if (!infop) {
1238                wo->wo_stat = 0xffff;
1239        } else {
1240                infop->cause = CLD_CONTINUED;
1241                infop->pid = pid;
1242                infop->uid = uid;
1243                infop->status = SIGCONT;
1244        }
1245        return pid;
1246}
1247
1248/*
1249 * Consider @p for a wait by @parent.
1250 *
1251 * -ECHILD should be in ->notask_error before the first call.
1252 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1253 * Returns zero if the search for a child should continue;
1254 * then ->notask_error is 0 if @p is an eligible child,
1255 * or still -ECHILD.
1256 */
1257static int wait_consider_task(struct wait_opts *wo, int ptrace,
1258                                struct task_struct *p)
1259{
1260        /*
1261         * We can race with wait_task_zombie() from another thread.
1262         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1263         * can't confuse the checks below.
1264         */
1265        int exit_state = READ_ONCE(p->exit_state);
1266        int ret;
1267
1268        if (unlikely(exit_state == EXIT_DEAD))
1269                return 0;
1270
1271        ret = eligible_child(wo, ptrace, p);
1272        if (!ret)
1273                return ret;
1274
1275        if (unlikely(exit_state == EXIT_TRACE)) {
1276                /*
1277                 * ptrace == 0 means we are the natural parent. In this case
1278                 * we should clear notask_error, debugger will notify us.
1279                 */
1280                if (likely(!ptrace))
1281                        wo->notask_error = 0;
1282                return 0;
1283        }
1284
1285        if (likely(!ptrace) && unlikely(p->ptrace)) {
1286                /*
1287                 * If it is traced by its real parent's group, just pretend
1288                 * the caller is ptrace_do_wait() and reap this child if it
1289                 * is zombie.
1290                 *
1291                 * This also hides group stop state from real parent; otherwise
1292                 * a single stop can be reported twice as group and ptrace stop.
1293                 * If a ptracer wants to distinguish these two events for its
1294                 * own children it should create a separate process which takes
1295                 * the role of real parent.
1296                 */
1297                if (!ptrace_reparented(p))
1298                        ptrace = 1;
1299        }
1300
1301        /* slay zombie? */
1302        if (exit_state == EXIT_ZOMBIE) {
1303                /* we don't reap group leaders with subthreads */
1304                if (!delay_group_leader(p)) {
1305                        /*
1306                         * A zombie ptracee is only visible to its ptracer.
1307                         * Notification and reaping will be cascaded to the
1308                         * real parent when the ptracer detaches.
1309                         */
1310                        if (unlikely(ptrace) || likely(!p->ptrace))
1311                                return wait_task_zombie(wo, p);
1312                }
1313
1314                /*
1315                 * Allow access to stopped/continued state via zombie by
1316                 * falling through.  Clearing of notask_error is complex.
1317                 *
1318                 * When !@ptrace:
1319                 *
1320                 * If WEXITED is set, notask_error should naturally be
1321                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
1322                 * so, if there are live subthreads, there are events to
1323                 * wait for.  If all subthreads are dead, it's still safe
1324                 * to clear - this function will be called again in finite
1325                 * amount time once all the subthreads are released and
1326                 * will then return without clearing.
1327                 *
1328                 * When @ptrace:
1329                 *
1330                 * Stopped state is per-task and thus can't change once the
1331                 * target task dies.  Only continued and exited can happen.
1332                 * Clear notask_error if WCONTINUED | WEXITED.
1333                 */
1334                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1335                        wo->notask_error = 0;
1336        } else {
1337                /*
1338                 * @p is alive and it's gonna stop, continue or exit, so
1339                 * there always is something to wait for.
1340                 */
1341                wo->notask_error = 0;
1342        }
1343
1344        /*
1345         * Wait for stopped.  Depending on @ptrace, different stopped state
1346         * is used and the two don't interact with each other.
1347         */
1348        ret = wait_task_stopped(wo, ptrace, p);
1349        if (ret)
1350                return ret;
1351
1352        /*
1353         * Wait for continued.  There's only one continued state and the
1354         * ptracer can consume it which can confuse the real parent.  Don't
1355         * use WCONTINUED from ptracer.  You don't need or want it.
1356         */
1357        return wait_task_continued(wo, p);
1358}
1359
1360/*
1361 * Do the work of do_wait() for one thread in the group, @tsk.
1362 *
1363 * -ECHILD should be in ->notask_error before the first call.
1364 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1365 * Returns zero if the search for a child should continue; then
1366 * ->notask_error is 0 if there were any eligible children,
1367 * or still -ECHILD.
1368 */
1369static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1370{
1371        struct task_struct *p;
1372
1373        list_for_each_entry(p, &tsk->children, sibling) {
1374                int ret = wait_consider_task(wo, 0, p);
1375
1376                if (ret)
1377                        return ret;
1378        }
1379
1380        return 0;
1381}
1382
1383static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1384{
1385        struct task_struct *p;
1386
1387        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1388                int ret = wait_consider_task(wo, 1, p);
1389
1390                if (ret)
1391                        return ret;
1392        }
1393
1394        return 0;
1395}
1396
1397static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1398                                int sync, void *key)
1399{
1400        struct wait_opts *wo = container_of(wait, struct wait_opts,
1401                                                child_wait);
1402        struct task_struct *p = key;
1403
1404        if (!eligible_pid(wo, p))
1405                return 0;
1406
1407        if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1408                return 0;
1409
1410        return default_wake_function(wait, mode, sync, key);
1411}
1412
1413void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1414{
1415        __wake_up_sync_key(&parent->signal->wait_chldexit,
1416                           TASK_INTERRUPTIBLE, p);
1417}
1418
1419static long do_wait(struct wait_opts *wo)
1420{
1421        struct task_struct *tsk;
1422        int retval;
1423
1424        trace_sched_process_wait(wo->wo_pid);
1425
1426        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1427        wo->child_wait.private = current;
1428        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1429repeat:
1430        /*
1431         * If there is nothing that can match our criteria, just get out.
1432         * We will clear ->notask_error to zero if we see any child that
1433         * might later match our criteria, even if we are not able to reap
1434         * it yet.
1435         */
1436        wo->notask_error = -ECHILD;
1437        if ((wo->wo_type < PIDTYPE_MAX) &&
1438           (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
1439                goto notask;
1440
1441        set_current_state(TASK_INTERRUPTIBLE);
1442        read_lock(&tasklist_lock);
1443        tsk = current;
1444        do {
1445                retval = do_wait_thread(wo, tsk);
1446                if (retval)
1447                        goto end;
1448
1449                retval = ptrace_do_wait(wo, tsk);
1450                if (retval)
1451                        goto end;
1452
1453                if (wo->wo_flags & __WNOTHREAD)
1454                        break;
1455        } while_each_thread(current, tsk);
1456        read_unlock(&tasklist_lock);
1457
1458notask:
1459        retval = wo->notask_error;
1460        if (!retval && !(wo->wo_flags & WNOHANG)) {
1461                retval = -ERESTARTSYS;
1462                if (!signal_pending(current)) {
1463                        schedule();
1464                        goto repeat;
1465                }
1466        }
1467end:
1468        __set_current_state(TASK_RUNNING);
1469        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1470        return retval;
1471}
1472
1473static struct pid *pidfd_get_pid(unsigned int fd)
1474{
1475        struct fd f;
1476        struct pid *pid;
1477
1478        f = fdget(fd);
1479        if (!f.file)
1480                return ERR_PTR(-EBADF);
1481
1482        pid = pidfd_pid(f.file);
1483        if (!IS_ERR(pid))
1484                get_pid(pid);
1485
1486        fdput(f);
1487        return pid;
1488}
1489
1490static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1491                          int options, struct rusage *ru)
1492{
1493        struct wait_opts wo;
1494        struct pid *pid = NULL;
1495        enum pid_type type;
1496        long ret;
1497
1498        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1499                        __WNOTHREAD|__WCLONE|__WALL))
1500                return -EINVAL;
1501        if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1502                return -EINVAL;
1503
1504        switch (which) {
1505        case P_ALL:
1506                type = PIDTYPE_MAX;
1507                break;
1508        case P_PID:
1509                type = PIDTYPE_PID;
1510                if (upid <= 0)
1511                        return -EINVAL;
1512
1513                pid = find_get_pid(upid);
1514                break;
1515        case P_PGID:
1516                type = PIDTYPE_PGID;
1517                if (upid < 0)
1518                        return -EINVAL;
1519
1520                if (upid)
1521                        pid = find_get_pid(upid);
1522                else
1523                        pid = get_task_pid(current, PIDTYPE_PGID);
1524                break;
1525        case P_PIDFD:
1526                type = PIDTYPE_PID;
1527                if (upid < 0)
1528                        return -EINVAL;
1529
1530                pid = pidfd_get_pid(upid);
1531                if (IS_ERR(pid))
1532                        return PTR_ERR(pid);
1533                break;
1534        default:
1535                return -EINVAL;
1536        }
1537
1538        wo.wo_type      = type;
1539        wo.wo_pid       = pid;
1540        wo.wo_flags     = options;
1541        wo.wo_info      = infop;
1542        wo.wo_rusage    = ru;
1543        ret = do_wait(&wo);
1544
1545        put_pid(pid);
1546        return ret;
1547}
1548
1549SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1550                infop, int, options, struct rusage __user *, ru)
1551{
1552        struct rusage r;
1553        struct waitid_info info = {.status = 0};
1554        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1555        int signo = 0;
1556
1557        if (err > 0) {
1558                signo = SIGCHLD;
1559                err = 0;
1560                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1561                        return -EFAULT;
1562        }
1563        if (!infop)
1564                return err;
1565
1566        if (!user_access_begin(infop, sizeof(*infop)))
1567                return -EFAULT;
1568
1569        unsafe_put_user(signo, &infop->si_signo, Efault);
1570        unsafe_put_user(0, &infop->si_errno, Efault);
1571        unsafe_put_user(info.cause, &infop->si_code, Efault);
1572        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1573        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1574        unsafe_put_user(info.status, &infop->si_status, Efault);
1575        user_access_end();
1576        return err;
1577Efault:
1578        user_access_end();
1579        return -EFAULT;
1580}
1581
1582long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1583                  struct rusage *ru)
1584{
1585        struct wait_opts wo;
1586        struct pid *pid = NULL;
1587        enum pid_type type;
1588        long ret;
1589
1590        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1591                        __WNOTHREAD|__WCLONE|__WALL))
1592                return -EINVAL;
1593
1594        /* -INT_MIN is not defined */
1595        if (upid == INT_MIN)
1596                return -ESRCH;
1597
1598        if (upid == -1)
1599                type = PIDTYPE_MAX;
1600        else if (upid < 0) {
1601                type = PIDTYPE_PGID;
1602                pid = find_get_pid(-upid);
1603        } else if (upid == 0) {
1604                type = PIDTYPE_PGID;
1605                pid = get_task_pid(current, PIDTYPE_PGID);
1606        } else /* upid > 0 */ {
1607                type = PIDTYPE_PID;
1608                pid = find_get_pid(upid);
1609        }
1610
1611        wo.wo_type      = type;
1612        wo.wo_pid       = pid;
1613        wo.wo_flags     = options | WEXITED;
1614        wo.wo_info      = NULL;
1615        wo.wo_stat      = 0;
1616        wo.wo_rusage    = ru;
1617        ret = do_wait(&wo);
1618        put_pid(pid);
1619        if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1620                ret = -EFAULT;
1621
1622        return ret;
1623}
1624
1625SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1626                int, options, struct rusage __user *, ru)
1627{
1628        struct rusage r;
1629        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1630
1631        if (err > 0) {
1632                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1633                        return -EFAULT;
1634        }
1635        return err;
1636}
1637
1638#ifdef __ARCH_WANT_SYS_WAITPID
1639
1640/*
1641 * sys_waitpid() remains for compatibility. waitpid() should be
1642 * implemented by calling sys_wait4() from libc.a.
1643 */
1644SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1645{
1646        return kernel_wait4(pid, stat_addr, options, NULL);
1647}
1648
1649#endif
1650
1651#ifdef CONFIG_COMPAT
1652COMPAT_SYSCALL_DEFINE4(wait4,
1653        compat_pid_t, pid,
1654        compat_uint_t __user *, stat_addr,
1655        int, options,
1656        struct compat_rusage __user *, ru)
1657{
1658        struct rusage r;
1659        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1660        if (err > 0) {
1661                if (ru && put_compat_rusage(&r, ru))
1662                        return -EFAULT;
1663        }
1664        return err;
1665}
1666
1667COMPAT_SYSCALL_DEFINE5(waitid,
1668                int, which, compat_pid_t, pid,
1669                struct compat_siginfo __user *, infop, int, options,
1670                struct compat_rusage __user *, uru)
1671{
1672        struct rusage ru;
1673        struct waitid_info info = {.status = 0};
1674        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1675        int signo = 0;
1676        if (err > 0) {
1677                signo = SIGCHLD;
1678                err = 0;
1679                if (uru) {
1680                        /* kernel_waitid() overwrites everything in ru */
1681                        if (COMPAT_USE_64BIT_TIME)
1682                                err = copy_to_user(uru, &ru, sizeof(ru));
1683                        else
1684                                err = put_compat_rusage(&ru, uru);
1685                        if (err)
1686                                return -EFAULT;
1687                }
1688        }
1689
1690        if (!infop)
1691                return err;
1692
1693        if (!user_access_begin(infop, sizeof(*infop)))
1694                return -EFAULT;
1695
1696        unsafe_put_user(signo, &infop->si_signo, Efault);
1697        unsafe_put_user(0, &infop->si_errno, Efault);
1698        unsafe_put_user(info.cause, &infop->si_code, Efault);
1699        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1700        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1701        unsafe_put_user(info.status, &infop->si_status, Efault);
1702        user_access_end();
1703        return err;
1704Efault:
1705        user_access_end();
1706        return -EFAULT;
1707}
1708#endif
1709
1710__weak void abort(void)
1711{
1712        BUG();
1713
1714        /* if that doesn't kill us, halt */
1715        panic("Oops failed to kill thread");
1716}
1717EXPORT_SYMBOL(abort);
1718