   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/kernel/exit.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/mm.h>
   9#include <linux/slab.h>
  10#include <linux/sched/autogroup.h>
  11#include <linux/sched/mm.h>
  12#include <linux/sched/stat.h>
  13#include <linux/sched/task.h>
  14#include <linux/sched/task_stack.h>
  15#include <linux/sched/cputime.h>
  16#include <linux/interrupt.h>
  17#include <linux/module.h>
  18#include <linux/capability.h>
  19#include <linux/completion.h>
  20#include <linux/personality.h>
  21#include <linux/tty.h>
  22#include <linux/iocontext.h>
  23#include <linux/key.h>
  24#include <linux/cpu.h>
  25#include <linux/acct.h>
  26#include <linux/tsacct_kern.h>
  27#include <linux/file.h>
  28#include <linux/fdtable.h>
  29#include <linux/freezer.h>
  30#include <linux/binfmts.h>
  31#include <linux/nsproxy.h>
  32#include <linux/pid_namespace.h>
  33#include <linux/ptrace.h>
  34#include <linux/profile.h>
  35#include <linux/mount.h>
  36#include <linux/proc_fs.h>
  37#include <linux/kthread.h>
  38#include <linux/mempolicy.h>
  39#include <linux/taskstats_kern.h>
  40#include <linux/delayacct.h>
  41#include <linux/cgroup.h>
  42#include <linux/syscalls.h>
  43#include <linux/signal.h>
  44#include <linux/posix-timers.h>
  45#include <linux/cn_proc.h>
  46#include <linux/mutex.h>
  47#include <linux/futex.h>
  48#include <linux/pipe_fs_i.h>
  49#include <linux/audit.h> /* for audit_free() */
  50#include <linux/resource.h>
  51#include <linux/blkdev.h>
  52#include <linux/task_io_accounting_ops.h>
  53#include <linux/tracehook.h>
  54#include <linux/fs_struct.h>
  55#include <linux/init_task.h>
  56#include <linux/perf_event.h>
  57#include <trace/events/sched.h>
  58#include <linux/hw_breakpoint.h>
  59#include <linux/oom.h>
  60#include <linux/writeback.h>
  61#include <linux/shm.h>
  62#include <linux/kcov.h>
  63#include <linux/random.h>
  64#include <linux/rcuwait.h>
  65#include <linux/compat.h>
  66
  67#include <linux/uaccess.h>
  68#include <asm/unistd.h>
  69#include <asm/mmu_context.h>
  70
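     /*
      * Detach the dying task from the pid hashes and the thread lists;
      * when the whole group is dead, also drop its tgid/pgid/sid links.
      * Called from __exit_signal() with tasklist_lock write-held and
      * sighand->siglock held.
      */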
  71static void __unhash_process(struct task_struct *p, bool group_dead)
  72{
  73        nr_threads--;
  74        detach_pid(p, PIDTYPE_PID);
  75        if (group_dead) {
  76                detach_pid(p, PIDTYPE_TGID);
  77                detach_pid(p, PIDTYPE_PGID);
  78                detach_pid(p, PIDTYPE_SID);
  79
  80                list_del_rcu(&p->tasks);
  81                list_del_init(&p->sibling);
  82                __this_cpu_dec(process_counts);
  83        }
  84        list_del_rcu(&p->thread_group);
  85        list_del_rcu(&p->thread_node);
  86}
  87
  88/*
  89 * This function expects the tasklist_lock write-locked.
  90 */
  91static void __exit_signal(struct task_struct *tsk)
  92{
  93        struct signal_struct *sig = tsk->signal;
  94        bool group_dead = thread_group_leader(tsk);
  95        struct sighand_struct *sighand;
  96        struct tty_struct *tty;
  97        u64 utime, stime;
  98
  99        sighand = rcu_dereference_check(tsk->sighand,
 100                                        lockdep_tasklist_lock_is_held());
 101        spin_lock(&sighand->siglock);
 102
 103#ifdef CONFIG_POSIX_TIMERS
 104        posix_cpu_timers_exit(tsk);
 105        if (group_dead)
 106                posix_cpu_timers_exit_group(tsk);
 107#endif
 108
 109        if (group_dead) {
 110                tty = sig->tty;
 111                sig->tty = NULL;
 112        } else {
 113                /*
 114                 * If there is any task waiting for the group exit
 115                 * then notify it:
 116                 */
 117                if (sig->notify_count > 0 && !--sig->notify_count)
 118                        wake_up_process(sig->group_exit_task);
 119
 120                if (tsk == sig->curr_target)
 121                        sig->curr_target = next_thread(tsk);
 122        }
 123
 124        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 125                              sizeof(unsigned long long));
 126
 127        /*
 128         * Accumulate here the counters for all threads as they die. We could
 129         * skip the group leader because it is the last user of signal_struct,
 130         * but we want to avoid the race with thread_group_cputime() which can
 131         * see the empty ->thread_head list.
 132         */
 133        task_cputime(tsk, &utime, &stime);
 134        write_seqlock(&sig->stats_lock);
 135        sig->utime += utime;
 136        sig->stime += stime;
 137        sig->gtime += task_gtime(tsk);
 138        sig->min_flt += tsk->min_flt;
 139        sig->maj_flt += tsk->maj_flt;
 140        sig->nvcsw += tsk->nvcsw;
 141        sig->nivcsw += tsk->nivcsw;
 142        sig->inblock += task_io_get_inblock(tsk);
 143        sig->oublock += task_io_get_oublock(tsk);
 144        task_io_accounting_add(&sig->ioac, &tsk->ioac);
 145        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 146        sig->nr_threads--;
 147        __unhash_process(tsk, group_dead);
 148        write_sequnlock(&sig->stats_lock);
 149
 150        /*
 151         * Do this under ->siglock, we can race with another thread
 152         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 153         */
 154        flush_sigqueue(&tsk->pending);
 155        tsk->sighand = NULL;
 156        spin_unlock(&sighand->siglock);
 157
 158        __cleanup_sighand(sighand);
 159        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 160        if (group_dead) {
 161                flush_sigqueue(&sig->shared_pending);
 162                tty_kref_put(tty);
 163        }
 164}
 165
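     /*
      * put_task_struct_rcu_user() below defers the final put of the
      * task_struct through an RCU grace period (via this callback) so
      * that code which looked the task up under rcu_read_lock() can
      * still safely dereference it.
      */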
 166static void delayed_put_task_struct(struct rcu_head *rhp)
 167{
 168        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 169
 170        perf_event_delayed_put(tsk);
 171        trace_sched_process_free(tsk);
 172        put_task_struct(tsk);
 173}
 174
 175void put_task_struct_rcu_user(struct task_struct *task)
 176{
 177        if (refcount_dec_and_test(&task->rcu_users))
 178                call_rcu(&task->rcu, delayed_put_task_struct);
 179}
 180
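     /*
      * Final release of a dead task: drop its per-user process count,
      * detach it from ptrace and the pid hashes under tasklist_lock, and
      * free the remaining thread state.  If this was the last non-leader
      * thread and the zombie leader turns out to be auto-reaped, loop
      * once more to release the leader too.
      */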
 181void release_task(struct task_struct *p)
 182{
 183        struct task_struct *leader;
 184        struct pid *thread_pid;
 185        int zap_leader;
 186repeat:
 187        /* don't need to get the RCU readlock here - the process is dead and
 188         * can't be modifying its own credentials. But shut RCU-lockdep up */
 189        rcu_read_lock();
 190        atomic_dec(&__task_cred(p)->user->processes);
 191        rcu_read_unlock();
 192
 193        cgroup_release(p);
 194
 195        write_lock_irq(&tasklist_lock);
 196        ptrace_release_task(p);
 197        thread_pid = get_pid(p->thread_pid);
 198        __exit_signal(p);
 199
 200        /*
 201         * If we are the last non-leader member of the thread
 202         * group, and the leader is zombie, then notify the
 203         * group leader's parent process. (if it wants notification.)
 204         */
 205        zap_leader = 0;
 206        leader = p->group_leader;
 207        if (leader != p && thread_group_empty(leader)
 208                        && leader->exit_state == EXIT_ZOMBIE) {
 209                /*
 210                 * If we were the last child thread and the leader has
 211                 * exited already, and the leader's parent ignores SIGCHLD,
 212                 * then we are the one who should release the leader.
 213                 */
 214                zap_leader = do_notify_parent(leader, leader->exit_signal);
 215                if (zap_leader)
 216                        leader->exit_state = EXIT_DEAD;
 217        }
 218
 219        write_unlock_irq(&tasklist_lock);
 220        seccomp_filter_release(p);
 221        proc_flush_pid(thread_pid);
 222        put_pid(thread_pid);
 223        release_thread(p);
 224        put_task_struct_rcu_user(p);
 225
 226        p = leader;
 227        if (unlikely(zap_leader))
 228                goto repeat;
 229}
 230
 231int rcuwait_wake_up(struct rcuwait *w)
 232{
 233        int ret = 0;
 234        struct task_struct *task;
 235
 236        rcu_read_lock();
 237
 238        /*
 239         * Order condition vs @task, such that everything prior to the load
 240         * of @task is visible. This is the condition as to why the user called
 241         * rcuwait_wake() in the first place. Pairs with set_current_state()
  242         * rcuwait_wake_up() in the first place. Pairs with set_current_state()
 243         *
 244         *    WAIT                WAKE
 245         *    [S] tsk = current   [S] cond = true
 246         *        MB (A)              MB (B)
 247         *    [L] cond            [L] tsk
 248         */
 249        smp_mb(); /* (B) */
 250
 251        task = rcu_dereference(w->task);
 252        if (task)
 253                ret = wake_up_process(task);
 254        rcu_read_unlock();
 255
 256        return ret;
 257}
 258EXPORT_SYMBOL_GPL(rcuwait_wake_up);
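     /*
      * Illustrative pairing (a sketch, not code from this file; "done"
      * stands for whatever condition the caller actually waits on): the
      * waiter side typically does
      *
      *      struct rcuwait w;
      *
      *      rcuwait_init(&w);
      *      rcuwait_wait_event(&w, READ_ONCE(done), TASK_UNINTERRUPTIBLE);
      *
      * while the waker publishes the condition first and then calls
      *
      *      WRITE_ONCE(done, true);
      *      rcuwait_wake_up(&w);
      *
      * The smp_mb() above (B) orders the waker's condition store against
      * its load of w->task, pairing with barrier (A) set by
      * set_current_state() inside rcuwait_wait_event().
      */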
 259
 260/*
 261 * Determine if a process group is "orphaned", according to the POSIX
 262 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 263 * by terminal-generated stop signals.  Newly orphaned process groups are
 264 * to receive a SIGHUP and a SIGCONT.
 265 *
 266 * "I ask you, have you ever known what it is to be an orphan?"
 267 */
 268static int will_become_orphaned_pgrp(struct pid *pgrp,
 269                                        struct task_struct *ignored_task)
 270{
 271        struct task_struct *p;
 272
 273        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 274                if ((p == ignored_task) ||
 275                    (p->exit_state && thread_group_empty(p)) ||
 276                    is_global_init(p->real_parent))
 277                        continue;
 278
 279                if (task_pgrp(p->real_parent) != pgrp &&
 280                    task_session(p->real_parent) == task_session(p))
 281                        return 0;
 282        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 283
 284        return 1;
 285}
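     /*
      * For example, when a session-leader shell exits, a process group it
      * leaves behind becomes orphaned once no member has a parent in a
      * different group of the same session; if that group contains stopped
      * jobs, kill_orphaned_pgrp() below sends it SIGHUP and then SIGCONT.
      */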
 286
 287int is_current_pgrp_orphaned(void)
 288{
 289        int retval;
 290
 291        read_lock(&tasklist_lock);
 292        retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
 293        read_unlock(&tasklist_lock);
 294
 295        return retval;
 296}
 297
 298static bool has_stopped_jobs(struct pid *pgrp)
 299{
 300        struct task_struct *p;
 301
 302        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 303                if (p->signal->flags & SIGNAL_STOP_STOPPED)
 304                        return true;
 305        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 306
 307        return false;
 308}
 309
 310/*
 311 * Check to see if any process groups have become orphaned as
 312 * a result of our exiting, and if they have any stopped jobs,
 313 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 314 */
 315static void
 316kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 317{
 318        struct pid *pgrp = task_pgrp(tsk);
 319        struct task_struct *ignored_task = tsk;
 320
 321        if (!parent)
 322                /* exit: our father is in a different pgrp than
 323                 * we are and we were the only connection outside.
 324                 */
 325                parent = tsk->real_parent;
 326        else
 327                /* reparent: our child is in a different pgrp than
 328                 * we are, and it was the only connection outside.
 329                 */
 330                ignored_task = NULL;
 331
 332        if (task_pgrp(parent) != pgrp &&
 333            task_session(parent) == task_session(tsk) &&
 334            will_become_orphaned_pgrp(pgrp, ignored_task) &&
 335            has_stopped_jobs(pgrp)) {
 336                __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 337                __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 338        }
 339}
 340
 341#ifdef CONFIG_MEMCG
 342/*
 343 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 344 */
 345void mm_update_next_owner(struct mm_struct *mm)
 346{
 347        struct task_struct *c, *g, *p = current;
 348
 349retry:
 350        /*
 351         * If the exiting or execing task is not the owner, it's
 352         * someone else's problem.
 353         */
 354        if (mm->owner != p)
 355                return;
 356        /*
 357         * The current owner is exiting/execing and there are no other
 358         * candidates.  Do not leave the mm pointing to a possibly
 359         * freed task structure.
 360         */
 361        if (atomic_read(&mm->mm_users) <= 1) {
 362                WRITE_ONCE(mm->owner, NULL);
 363                return;
 364        }
 365
 366        read_lock(&tasklist_lock);
 367        /*
 368         * Search in the children
 369         */
 370        list_for_each_entry(c, &p->children, sibling) {
 371                if (c->mm == mm)
 372                        goto assign_new_owner;
 373        }
 374
 375        /*
 376         * Search in the siblings
 377         */
 378        list_for_each_entry(c, &p->real_parent->children, sibling) {
 379                if (c->mm == mm)
 380                        goto assign_new_owner;
 381        }
 382
 383        /*
 384         * Search through everything else, we should not get here often.
 385         */
 386        for_each_process(g) {
 387                if (g->flags & PF_KTHREAD)
 388                        continue;
 389                for_each_thread(g, c) {
 390                        if (c->mm == mm)
 391                                goto assign_new_owner;
 392                        if (c->mm)
 393                                break;
 394                }
 395        }
 396        read_unlock(&tasklist_lock);
 397        /*
 398         * We found no owner yet mm_users > 1: this implies that we are
 399         * most likely racing with swapoff (try_to_unuse()) or /proc or
 400         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 401         */
 402        WRITE_ONCE(mm->owner, NULL);
 403        return;
 404
 405assign_new_owner:
 406        BUG_ON(c == p);
 407        get_task_struct(c);
 408        /*
 409         * The task_lock protects c->mm from changing.
 410         * We always want mm->owner->mm == mm
 411         */
 412        task_lock(c);
 413        /*
 414         * Delay read_unlock() till we have the task_lock()
 415         * to ensure that c does not slip away underneath us
 416         */
 417        read_unlock(&tasklist_lock);
 418        if (c->mm != mm) {
 419                task_unlock(c);
 420                put_task_struct(c);
 421                goto retry;
 422        }
 423        WRITE_ONCE(mm->owner, c);
 424        task_unlock(c);
 425        put_task_struct(c);
 426}
 427#endif /* CONFIG_MEMCG */
 428
 429/*
 430 * Turn us into a lazy TLB process if we
 431 * aren't already..
 432 */
 433static void exit_mm(void)
 434{
 435        struct mm_struct *mm = current->mm;
 436        struct core_state *core_state;
 437
 438        exit_mm_release(current, mm);
 439        if (!mm)
 440                return;
 441        sync_mm_rss(mm);
 442        /*
 443         * Serialize with any possible pending coredump.
 444         * We must hold mmap_lock around checking core_state
 445         * and clearing tsk->mm.  The core-inducing thread
 446         * will increment ->nr_threads for each thread in the
 447         * group with ->mm != NULL.
 448         */
 449        mmap_read_lock(mm);
 450        core_state = mm->core_state;
 451        if (core_state) {
 452                struct core_thread self;
 453
 454                mmap_read_unlock(mm);
 455
 456                self.task = current;
 457                self.next = xchg(&core_state->dumper.next, &self);
 458                /*
 459                 * Implies mb(), the result of xchg() must be visible
 460                 * to core_state->dumper.
 461                 */
 462                if (atomic_dec_and_test(&core_state->nr_threads))
 463                        complete(&core_state->startup);
 464
 465                for (;;) {
 466                        set_current_state(TASK_UNINTERRUPTIBLE);
 467                        if (!self.task) /* see coredump_finish() */
 468                                break;
 469                        freezable_schedule();
 470                }
 471                __set_current_state(TASK_RUNNING);
 472                mmap_read_lock(mm);
 473        }
 474        mmgrab(mm);
 475        BUG_ON(mm != current->active_mm);
 476        /* more a memory barrier than a real lock */
 477        task_lock(current);
 478        current->mm = NULL;
 479        mmap_read_unlock(mm);
 480        enter_lazy_tlb(mm, current);
 481        task_unlock(current);
 482        mm_update_next_owner(mm);
 483        mmput(mm);
 484        if (test_thread_flag(TIF_MEMDIE))
 485                exit_oom_victim();
 486}
 487
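     /* Return any live (non-PF_EXITING) thread in @p's group, or NULL. */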
 488static struct task_struct *find_alive_thread(struct task_struct *p)
 489{
 490        struct task_struct *t;
 491
 492        for_each_thread(p, t) {
 493                if (!(t->flags & PF_EXITING))
 494                        return t;
 495        }
 496        return NULL;
 497}
 498
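     /*
      * Pick the task that will reap @father's dead children.  Normally the
      * pid namespace's child_reaper does this; if @father *is* that reaper,
      * hand the role to a live thread in its group, and if none is left,
      * release the children already on @dead and let zap_pid_ns_processes()
      * wind the namespace down.
      */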
 499static struct task_struct *find_child_reaper(struct task_struct *father,
 500                                                struct list_head *dead)
 501        __releases(&tasklist_lock)
 502        __acquires(&tasklist_lock)
 503{
 504        struct pid_namespace *pid_ns = task_active_pid_ns(father);
 505        struct task_struct *reaper = pid_ns->child_reaper;
 506        struct task_struct *p, *n;
 507
 508        if (likely(reaper != father))
 509                return reaper;
 510
 511        reaper = find_alive_thread(father);
 512        if (reaper) {
 513                pid_ns->child_reaper = reaper;
 514                return reaper;
 515        }
 516
 517        write_unlock_irq(&tasklist_lock);
 518
 519        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
 520                list_del_init(&p->ptrace_entry);
 521                release_task(p);
 522        }
 523
 524        zap_pid_ns_processes(pid_ns);
 525        write_lock_irq(&tasklist_lock);
 526
 527        return father;
 528}
 529
 530/*
 531 * When we die, we re-parent all our children, and try to:
 532 * 1. give them to another thread in our thread group, if such a member exists
  533 * 2. give them to the first ancestor process which prctl'd itself as a
  534 *    child_subreaper for its children (like a service manager)
  535 * 3. give them to the init process (PID 1) in our pid namespace
 536 */
 537static struct task_struct *find_new_reaper(struct task_struct *father,
 538                                           struct task_struct *child_reaper)
 539{
 540        struct task_struct *thread, *reaper;
 541
 542        thread = find_alive_thread(father);
 543        if (thread)
 544                return thread;
 545
 546        if (father->signal->has_child_subreaper) {
 547                unsigned int ns_level = task_pid(father)->level;
 548                /*
 549                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
 550                 * We can't check reaper != child_reaper to ensure we do not
 551                 * cross the namespaces, the exiting parent could be injected
 552                 * by setns() + fork().
 553                 * We check pid->level, this is slightly more efficient than
 554                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
 555                 */
 556                for (reaper = father->real_parent;
 557                     task_pid(reaper)->level == ns_level;
 558                     reaper = reaper->real_parent) {
 559                        if (reaper == &init_task)
 560                                break;
 561                        if (!reaper->signal->is_child_subreaper)
 562                                continue;
 563                        thread = find_alive_thread(reaper);
 564                        if (thread)
 565                                return thread;
 566                }
 567        }
 568
 569        return child_reaper;
 570}
 571
 572/*
  573 * Any that need to be release_task'd are put on the @dead list.
 574 */
 575static void reparent_leader(struct task_struct *father, struct task_struct *p,
 576                                struct list_head *dead)
 577{
 578        if (unlikely(p->exit_state == EXIT_DEAD))
 579                return;
 580
 581        /* We don't want people slaying init. */
 582        p->exit_signal = SIGCHLD;
 583
 584        /* If it has exited notify the new parent about this child's death. */
 585        if (!p->ptrace &&
 586            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 587                if (do_notify_parent(p, p->exit_signal)) {
 588                        p->exit_state = EXIT_DEAD;
 589                        list_add(&p->ptrace_entry, dead);
 590                }
 591        }
 592
 593        kill_orphaned_pgrp(p, father);
 594}
 595
 596/*
 597 * This does two things:
 598 *
 599 * A.  Make init inherit all the child processes
 600 * B.  Check to see if any process groups have become orphaned
 601 *      as a result of our exiting, and if they have any stopped
 602 *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 603 */
 604static void forget_original_parent(struct task_struct *father,
 605                                        struct list_head *dead)
 606{
 607        struct task_struct *p, *t, *reaper;
 608
 609        if (unlikely(!list_empty(&father->ptraced)))
 610                exit_ptrace(father, dead);
 611
 612        /* Can drop and reacquire tasklist_lock */
 613        reaper = find_child_reaper(father, dead);
 614        if (list_empty(&father->children))
 615                return;
 616
 617        reaper = find_new_reaper(father, reaper);
 618        list_for_each_entry(p, &father->children, sibling) {
 619                for_each_thread(p, t) {
 620                        RCU_INIT_POINTER(t->real_parent, reaper);
 621                        BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
 622                        if (likely(!t->ptrace))
 623                                t->parent = t->real_parent;
 624                        if (t->pdeath_signal)
 625                                group_send_sig_info(t->pdeath_signal,
 626                                                    SEND_SIG_NOINFO, t,
 627                                                    PIDTYPE_TGID);
 628                }
 629                /*
 630                 * If this is a threaded reparent there is no need to
  631                 * notify anyone that anything has happened.
 632                 */
 633                if (!same_thread_group(reaper, father))
 634                        reparent_leader(father, p, dead);
 635        }
 636        list_splice_tail_init(&father->children, &reaper->children);
 637}
 638
 639/*
 640 * Send signals to all our closest relatives so that they know
 641 * to properly mourn us..
 642 */
 643static void exit_notify(struct task_struct *tsk, int group_dead)
 644{
 645        bool autoreap;
 646        struct task_struct *p, *n;
 647        LIST_HEAD(dead);
 648
 649        write_lock_irq(&tasklist_lock);
 650        forget_original_parent(tsk, &dead);
 651
 652        if (group_dead)
 653                kill_orphaned_pgrp(tsk->group_leader, NULL);
 654
 655        tsk->exit_state = EXIT_ZOMBIE;
 656        if (unlikely(tsk->ptrace)) {
 657                int sig = thread_group_leader(tsk) &&
 658                                thread_group_empty(tsk) &&
 659                                !ptrace_reparented(tsk) ?
 660                        tsk->exit_signal : SIGCHLD;
 661                autoreap = do_notify_parent(tsk, sig);
 662        } else if (thread_group_leader(tsk)) {
 663                autoreap = thread_group_empty(tsk) &&
 664                        do_notify_parent(tsk, tsk->exit_signal);
 665        } else {
 666                autoreap = true;
 667        }
 668
 669        if (autoreap) {
 670                tsk->exit_state = EXIT_DEAD;
 671                list_add(&tsk->ptrace_entry, &dead);
 672        }
 673
 674        /* mt-exec, de_thread() is waiting for group leader */
 675        if (unlikely(tsk->signal->notify_count < 0))
 676                wake_up_process(tsk->signal->group_exit_task);
 677        write_unlock_irq(&tasklist_lock);
 678
 679        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 680                list_del_init(&p->ptrace_entry);
 681                release_task(p);
 682        }
 683}
 684
 685#ifdef CONFIG_DEBUG_STACK_USAGE
 686static void check_stack_usage(void)
 687{
 688        static DEFINE_SPINLOCK(low_water_lock);
 689        static int lowest_to_date = THREAD_SIZE;
 690        unsigned long free;
 691
 692        free = stack_not_used(current);
 693
 694        if (free >= lowest_to_date)
 695                return;
 696
 697        spin_lock(&low_water_lock);
 698        if (free < lowest_to_date) {
 699                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
 700                        current->comm, task_pid_nr(current), free);
 701                lowest_to_date = free;
 702        }
 703        spin_unlock(&low_water_lock);
 704}
 705#else
 706static inline void check_stack_usage(void) {}
 707#endif
 708
 709void __noreturn do_exit(long code)
 710{
 711        struct task_struct *tsk = current;
 712        int group_dead;
 713
 714        /*
 715         * We can get here from a kernel oops, sometimes with preemption off.
 716         * Start by checking for critical errors.
 717         * Then fix up important state like USER_DS and preemption.
 718         * Then do everything else.
 719         */
 720
 721        WARN_ON(blk_needs_flush_plug(tsk));
 722
 723        if (unlikely(in_interrupt()))
 724                panic("Aiee, killing interrupt handler!");
 725        if (unlikely(!tsk->pid))
 726                panic("Attempted to kill the idle task!");
 727
 728        /*
  729         * If do_exit is called because this process oopsed, it's possible
 730         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
 731         * continuing. Amongst other possible reasons, this is to prevent
 732         * mm_release()->clear_child_tid() from writing to a user-controlled
 733         * kernel address.
 734         */
 735        force_uaccess_begin();
 736
 737        if (unlikely(in_atomic())) {
 738                pr_info("note: %s[%d] exited with preempt_count %d\n",
 739                        current->comm, task_pid_nr(current),
 740                        preempt_count());
 741                preempt_count_set(PREEMPT_ENABLED);
 742        }
 743
 744        profile_task_exit(tsk);
 745        kcov_task_exit(tsk);
 746
 747        ptrace_event(PTRACE_EVENT_EXIT, code);
 748
 749        validate_creds_for_do_exit(tsk);
 750
 751        /*
 752         * We're taking recursive faults here in do_exit. Safest is to just
 753         * leave this task alone and wait for reboot.
 754         */
 755        if (unlikely(tsk->flags & PF_EXITING)) {
 756                pr_alert("Fixing recursive fault but reboot is needed!\n");
 757                futex_exit_recursive(tsk);
 758                set_current_state(TASK_UNINTERRUPTIBLE);
 759                schedule();
 760        }
 761
 762        exit_signals(tsk);  /* sets PF_EXITING */
 763
 764        /* sync mm's RSS info before statistics gathering */
 765        if (tsk->mm)
 766                sync_mm_rss(tsk->mm);
 767        acct_update_integrals(tsk);
 768        group_dead = atomic_dec_and_test(&tsk->signal->live);
 769        if (group_dead) {
 770                /*
 771                 * If the last thread of global init has exited, panic
  772                 * immediately to get a usable coredump.
 773                 */
 774                if (unlikely(is_global_init(tsk)))
 775                        panic("Attempted to kill init! exitcode=0x%08x\n",
 776                                tsk->signal->group_exit_code ?: (int)code);
 777
 778#ifdef CONFIG_POSIX_TIMERS
 779                hrtimer_cancel(&tsk->signal->real_timer);
 780                exit_itimers(tsk->signal);
 781#endif
 782                if (tsk->mm)
 783                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 784        }
 785        acct_collect(code, group_dead);
 786        if (group_dead)
 787                tty_audit_exit();
 788        audit_free(tsk);
 789
 790        tsk->exit_code = code;
 791        taskstats_exit(tsk, group_dead);
 792
 793        exit_mm();
 794
 795        if (group_dead)
 796                acct_process();
 797        trace_sched_process_exit(tsk);
 798
 799        exit_sem(tsk);
 800        exit_shm(tsk);
 801        exit_files(tsk);
 802        exit_fs(tsk);
 803        if (group_dead)
 804                disassociate_ctty(1);
 805        exit_task_namespaces(tsk);
 806        exit_task_work(tsk);
 807        exit_thread(tsk);
 808
 809        /*
 810         * Flush inherited counters to the parent - before the parent
 811         * gets woken up by child-exit notifications.
 812         *
  813         * Because of cgroup mode, this must be called before cgroup_exit().
 814         */
 815        perf_event_exit_task(tsk);
 816
 817        sched_autogroup_exit_task(tsk);
 818        cgroup_exit(tsk);
 819
 820        /*
 821         * FIXME: do that only when needed, using sched_exit tracepoint
 822         */
 823        flush_ptrace_hw_breakpoint(tsk);
 824
 825        exit_tasks_rcu_start();
 826        exit_notify(tsk, group_dead);
 827        proc_exit_connector(tsk);
 828        mpol_put_task_policy(tsk);
 829#ifdef CONFIG_FUTEX
 830        if (unlikely(current->pi_state_cache))
 831                kfree(current->pi_state_cache);
 832#endif
 833        /*
 834         * Make sure we are holding no locks:
 835         */
 836        debug_check_no_locks_held();
 837
 838        if (tsk->io_context)
 839                exit_io_context(tsk);
 840
 841        if (tsk->splice_pipe)
 842                free_pipe_info(tsk->splice_pipe);
 843
 844        if (tsk->task_frag.page)
 845                put_page(tsk->task_frag.page);
 846
 847        validate_creds_for_do_exit(tsk);
 848
 849        check_stack_usage();
 850        preempt_disable();
 851        if (tsk->nr_dirtied)
 852                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 853        exit_rcu();
 854        exit_tasks_rcu_finish();
 855
 856        lockdep_free_task(tsk);
 857        do_task_dead();
 858}
 859EXPORT_SYMBOL_GPL(do_exit);
 860
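     /*
      * Complete @comp (when non-NULL) and then terminate the current task
      * with @code, letting a kernel thread wake a waiter and exit in one
      * call.
      */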
 861void complete_and_exit(struct completion *comp, long code)
 862{
 863        if (comp)
 864                complete(comp);
 865
 866        do_exit(code);
 867}
 868EXPORT_SYMBOL(complete_and_exit);
 869
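     /*
      * The user-supplied status lands in bits 8-15 of the exit code - the
      * same place WEXITSTATUS() reads it from the value wait*() returns.
      */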
 870SYSCALL_DEFINE1(exit, int, error_code)
 871{
 872        do_exit((error_code&0xff)<<8);
 873}
 874
 875/*
 876 * Take down every thread in the group.  This is called by fatal signals
 877 * as well as by sys_exit_group (below).
 878 */
 879void
 880do_group_exit(int exit_code)
 881{
 882        struct signal_struct *sig = current->signal;
 883
 884        BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 885
 886        if (signal_group_exit(sig))
 887                exit_code = sig->group_exit_code;
 888        else if (!thread_group_empty(current)) {
 889                struct sighand_struct *const sighand = current->sighand;
 890
 891                spin_lock_irq(&sighand->siglock);
 892                if (signal_group_exit(sig))
 893                        /* Another thread got here before we took the lock.  */
 894                        exit_code = sig->group_exit_code;
 895                else {
 896                        sig->group_exit_code = exit_code;
 897                        sig->flags = SIGNAL_GROUP_EXIT;
 898                        zap_other_threads(current);
 899                }
 900                spin_unlock_irq(&sighand->siglock);
 901        }
 902
 903        do_exit(exit_code);
 904        /* NOTREACHED */
 905}
 906
 907/*
 908 * this kills every thread in the thread group. Note that any externally
 909 * wait4()-ing process will get the correct exit code - even if this
 910 * thread is not the thread group leader.
 911 */
 912SYSCALL_DEFINE1(exit_group, int, error_code)
 913{
 914        do_group_exit((error_code & 0xff) << 8);
 915        /* NOTREACHED */
 916        return 0;
 917}
 918
 919struct waitid_info {
 920        pid_t pid;
 921        uid_t uid;
 922        int status;
 923        int cause;
 924};
 925
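     /*
      * Bundles the parameters and per-call state of one wait*() invocation
      * as it is threaded through do_wait() and its helpers.
      */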
 926struct wait_opts {
 927        enum pid_type           wo_type;
 928        int                     wo_flags;
 929        struct pid              *wo_pid;
 930
 931        struct waitid_info      *wo_info;
 932        int                     wo_stat;
 933        struct rusage           *wo_rusage;
 934
 935        wait_queue_entry_t              child_wait;
 936        int                     notask_error;
 937};
 938
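     /*
      * wo_type == PIDTYPE_MAX means "no pid filter": any child matches
      * (P_ALL, or wait4() with pid -1).
      */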
 939static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 940{
 941        return  wo->wo_type == PIDTYPE_MAX ||
 942                task_pid_type(p, wo->wo_type) == wo->wo_pid;
 943}
 944
 945static int
 946eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 947{
 948        if (!eligible_pid(wo, p))
 949                return 0;
 950
 951        /*
 952         * Wait for all children (clone and not) if __WALL is set or
 953         * if it is traced by us.
 954         */
 955        if (ptrace || (wo->wo_flags & __WALL))
 956                return 1;
 957
 958        /*
 959         * Otherwise, wait for clone children *only* if __WCLONE is set;
 960         * otherwise, wait for non-clone children *only*.
 961         *
 962         * Note: a "clone" child here is one that reports to its parent
 963         * using a signal other than SIGCHLD, or a non-leader thread which
 964         * we can only see if it is traced by us.
 965         */
 966        if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
 967                return 0;
 968
 969        return 1;
 970}
 971
 972/*
 973 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 974 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 975 * the lock and this task is uninteresting.  If we return nonzero, we have
 976 * released the lock and the system call should return.
 977 */
 978static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 979{
 980        int state, status;
 981        pid_t pid = task_pid_vnr(p);
 982        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
 983        struct waitid_info *infop;
 984
 985        if (!likely(wo->wo_flags & WEXITED))
 986                return 0;
 987
 988        if (unlikely(wo->wo_flags & WNOWAIT)) {
 989                status = p->exit_code;
 990                get_task_struct(p);
 991                read_unlock(&tasklist_lock);
 992                sched_annotate_sleep();
 993                if (wo->wo_rusage)
 994                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 995                put_task_struct(p);
 996                goto out_info;
 997        }
 998        /*
 999         * Move the task's state to DEAD/TRACE, only one thread can do this.
1000         */
1001        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
1002                EXIT_TRACE : EXIT_DEAD;
1003        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1004                return 0;
1005        /*
1006         * We own this thread, nobody else can reap it.
1007         */
1008        read_unlock(&tasklist_lock);
1009        sched_annotate_sleep();
1010
1011        /*
1012         * Check thread_group_leader() to exclude the traced sub-threads.
1013         */
1014        if (state == EXIT_DEAD && thread_group_leader(p)) {
1015                struct signal_struct *sig = p->signal;
1016                struct signal_struct *psig = current->signal;
1017                unsigned long maxrss;
1018                u64 tgutime, tgstime;
1019
1020                /*
1021                 * The resource counters for the group leader are in its
1022                 * own task_struct.  Those for dead threads in the group
1023                 * are in its signal_struct, as are those for the child
1024                 * processes it has previously reaped.  All these
1025                 * accumulate in the parent's signal_struct c* fields.
1026                 *
1027                 * We don't bother to take a lock here to protect these
1028                 * p->signal fields because the whole thread group is dead
1029                 * and nobody can change them.
1030                 *
 1031                 * psig->stats_lock also protects us from our sub-threads
1032                 * which can reap other children at the same time. Until
1033                 * we change k_getrusage()-like users to rely on this lock
1034                 * we have to take ->siglock as well.
1035                 *
1036                 * We use thread_group_cputime_adjusted() to get times for
1037                 * the thread group, which consolidates times for all threads
1038                 * in the group including the group leader.
1039                 */
1040                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1041                spin_lock_irq(&current->sighand->siglock);
1042                write_seqlock(&psig->stats_lock);
1043                psig->cutime += tgutime + sig->cutime;
1044                psig->cstime += tgstime + sig->cstime;
1045                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1046                psig->cmin_flt +=
1047                        p->min_flt + sig->min_flt + sig->cmin_flt;
1048                psig->cmaj_flt +=
1049                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1050                psig->cnvcsw +=
1051                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
1052                psig->cnivcsw +=
1053                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
1054                psig->cinblock +=
1055                        task_io_get_inblock(p) +
1056                        sig->inblock + sig->cinblock;
1057                psig->coublock +=
1058                        task_io_get_oublock(p) +
1059                        sig->oublock + sig->coublock;
1060                maxrss = max(sig->maxrss, sig->cmaxrss);
1061                if (psig->cmaxrss < maxrss)
1062                        psig->cmaxrss = maxrss;
1063                task_io_accounting_add(&psig->ioac, &p->ioac);
1064                task_io_accounting_add(&psig->ioac, &sig->ioac);
1065                write_sequnlock(&psig->stats_lock);
1066                spin_unlock_irq(&current->sighand->siglock);
1067        }
1068
1069        if (wo->wo_rusage)
1070                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1071        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1072                ? p->signal->group_exit_code : p->exit_code;
1073        wo->wo_stat = status;
1074
1075        if (state == EXIT_TRACE) {
1076                write_lock_irq(&tasklist_lock);
1077                /* We dropped tasklist, ptracer could die and untrace */
1078                ptrace_unlink(p);
1079
1080                /* If parent wants a zombie, don't release it now */
1081                state = EXIT_ZOMBIE;
1082                if (do_notify_parent(p, p->exit_signal))
1083                        state = EXIT_DEAD;
1084                p->exit_state = state;
1085                write_unlock_irq(&tasklist_lock);
1086        }
1087        if (state == EXIT_DEAD)
1088                release_task(p);
1089
1090out_info:
1091        infop = wo->wo_info;
1092        if (infop) {
1093                if ((status & 0x7f) == 0) {
1094                        infop->cause = CLD_EXITED;
1095                        infop->status = status >> 8;
1096                } else {
1097                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1098                        infop->status = status & 0x7f;
1099                }
1100                infop->pid = pid;
1101                infop->uid = uid;
1102        }
1103
1104        return pid;
1105}
1106
1107static int *task_stopped_code(struct task_struct *p, bool ptrace)
1108{
1109        if (ptrace) {
1110                if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1111                        return &p->exit_code;
1112        } else {
1113                if (p->signal->flags & SIGNAL_STOP_STOPPED)
1114                        return &p->signal->group_exit_code;
1115        }
1116        return NULL;
1117}
1118
1119/**
1120 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1121 * @wo: wait options
1122 * @ptrace: is the wait for ptrace
1123 * @p: task to wait for
1124 *
1125 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1126 *
1127 * CONTEXT:
1128 * read_lock(&tasklist_lock), which is released if return value is
1129 * non-zero.  Also, grabs and releases @p->sighand->siglock.
1130 *
1131 * RETURNS:
1132 * 0 if wait condition didn't exist and search for other wait conditions
1133 * should continue.  Non-zero return, -errno on failure and @p's pid on
1134 * success, implies that tasklist_lock is released and wait condition
1135 * search should terminate.
1136 */
1137static int wait_task_stopped(struct wait_opts *wo,
1138                                int ptrace, struct task_struct *p)
1139{
1140        struct waitid_info *infop;
1141        int exit_code, *p_code, why;
1142        uid_t uid = 0; /* unneeded, required by compiler */
1143        pid_t pid;
1144
1145        /*
1146         * Traditionally we see ptrace'd stopped tasks regardless of options.
1147         */
1148        if (!ptrace && !(wo->wo_flags & WUNTRACED))
1149                return 0;
1150
1151        if (!task_stopped_code(p, ptrace))
1152                return 0;
1153
1154        exit_code = 0;
1155        spin_lock_irq(&p->sighand->siglock);
1156
1157        p_code = task_stopped_code(p, ptrace);
1158        if (unlikely(!p_code))
1159                goto unlock_sig;
1160
1161        exit_code = *p_code;
1162        if (!exit_code)
1163                goto unlock_sig;
1164
1165        if (!unlikely(wo->wo_flags & WNOWAIT))
1166                *p_code = 0;
1167
1168        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1169unlock_sig:
1170        spin_unlock_irq(&p->sighand->siglock);
1171        if (!exit_code)
1172                return 0;
1173
1174        /*
1175         * Now we are pretty sure this task is interesting.
1176         * Make sure it doesn't get reaped out from under us while we
1177         * give up the lock and then examine it below.  We don't want to
1178         * keep holding onto the tasklist_lock while we call getrusage and
1179         * possibly take page faults for user memory.
1180         */
1181        get_task_struct(p);
1182        pid = task_pid_vnr(p);
1183        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1184        read_unlock(&tasklist_lock);
1185        sched_annotate_sleep();
1186        if (wo->wo_rusage)
1187                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1188        put_task_struct(p);
1189
1190        if (likely(!(wo->wo_flags & WNOWAIT)))
1191                wo->wo_stat = (exit_code << 8) | 0x7f;
1192
1193        infop = wo->wo_info;
1194        if (infop) {
1195                infop->cause = why;
1196                infop->status = exit_code;
1197                infop->pid = pid;
1198                infop->uid = uid;
1199        }
1200        return pid;
1201}
1202
1203/*
1204 * Handle do_wait work for one task in a live, non-stopped state.
1205 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1206 * the lock and this task is uninteresting.  If we return nonzero, we have
1207 * released the lock and the system call should return.
1208 */
1209static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1210{
1211        struct waitid_info *infop;
1212        pid_t pid;
1213        uid_t uid;
1214
1215        if (!unlikely(wo->wo_flags & WCONTINUED))
1216                return 0;
1217
1218        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1219                return 0;
1220
1221        spin_lock_irq(&p->sighand->siglock);
1222        /* Re-check with the lock held.  */
1223        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1224                spin_unlock_irq(&p->sighand->siglock);
1225                return 0;
1226        }
1227        if (!unlikely(wo->wo_flags & WNOWAIT))
1228                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1229        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1230        spin_unlock_irq(&p->sighand->siglock);
1231
1232        pid = task_pid_vnr(p);
1233        get_task_struct(p);
1234        read_unlock(&tasklist_lock);
1235        sched_annotate_sleep();
1236        if (wo->wo_rusage)
1237                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1238        put_task_struct(p);
1239
1240        infop = wo->wo_info;
1241        if (!infop) {
1242                wo->wo_stat = 0xffff;
1243        } else {
1244                infop->cause = CLD_CONTINUED;
1245                infop->pid = pid;
1246                infop->uid = uid;
1247                infop->status = SIGCONT;
1248        }
1249        return pid;
1250}
1251
1252/*
1253 * Consider @p for a wait by @parent.
1254 *
1255 * -ECHILD should be in ->notask_error before the first call.
1256 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1257 * Returns zero if the search for a child should continue;
1258 * then ->notask_error is 0 if @p is an eligible child,
1259 * or still -ECHILD.
1260 */
1261static int wait_consider_task(struct wait_opts *wo, int ptrace,
1262                                struct task_struct *p)
1263{
1264        /*
1265         * We can race with wait_task_zombie() from another thread.
1266         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1267         * can't confuse the checks below.
1268         */
1269        int exit_state = READ_ONCE(p->exit_state);
1270        int ret;
1271
1272        if (unlikely(exit_state == EXIT_DEAD))
1273                return 0;
1274
1275        ret = eligible_child(wo, ptrace, p);
1276        if (!ret)
1277                return ret;
1278
1279        if (unlikely(exit_state == EXIT_TRACE)) {
1280                /*
1281                 * ptrace == 0 means we are the natural parent. In this case
1282                 * we should clear notask_error, debugger will notify us.
1283                 */
1284                if (likely(!ptrace))
1285                        wo->notask_error = 0;
1286                return 0;
1287        }
1288
1289        if (likely(!ptrace) && unlikely(p->ptrace)) {
1290                /*
1291                 * If it is traced by its real parent's group, just pretend
1292                 * the caller is ptrace_do_wait() and reap this child if it
1293                 * is zombie.
1294                 *
1295                 * This also hides group stop state from real parent; otherwise
1296                 * a single stop can be reported twice as group and ptrace stop.
1297                 * If a ptracer wants to distinguish these two events for its
1298                 * own children it should create a separate process which takes
1299                 * the role of real parent.
1300                 */
1301                if (!ptrace_reparented(p))
1302                        ptrace = 1;
1303        }
1304
1305        /* slay zombie? */
1306        if (exit_state == EXIT_ZOMBIE) {
1307                /* we don't reap group leaders with subthreads */
1308                if (!delay_group_leader(p)) {
1309                        /*
1310                         * A zombie ptracee is only visible to its ptracer.
1311                         * Notification and reaping will be cascaded to the
1312                         * real parent when the ptracer detaches.
1313                         */
1314                        if (unlikely(ptrace) || likely(!p->ptrace))
1315                                return wait_task_zombie(wo, p);
1316                }
1317
1318                /*
1319                 * Allow access to stopped/continued state via zombie by
1320                 * falling through.  Clearing of notask_error is complex.
1321                 *
1322                 * When !@ptrace:
1323                 *
1324                 * If WEXITED is set, notask_error should naturally be
1325                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
1326                 * so, if there are live subthreads, there are events to
1327                 * wait for.  If all subthreads are dead, it's still safe
 1328                 * to clear - this function will be called again in a finite
 1329                 * amount of time once all the subthreads are released and
1330                 * will then return without clearing.
1331                 *
1332                 * When @ptrace:
1333                 *
1334                 * Stopped state is per-task and thus can't change once the
1335                 * target task dies.  Only continued and exited can happen.
1336                 * Clear notask_error if WCONTINUED | WEXITED.
1337                 */
1338                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1339                        wo->notask_error = 0;
1340        } else {
1341                /*
1342                 * @p is alive and it's gonna stop, continue or exit, so
1343                 * there always is something to wait for.
1344                 */
1345                wo->notask_error = 0;
1346        }
1347
1348        /*
1349         * Wait for stopped.  Depending on @ptrace, different stopped state
1350         * is used and the two don't interact with each other.
1351         */
1352        ret = wait_task_stopped(wo, ptrace, p);
1353        if (ret)
1354                return ret;
1355
1356        /*
1357         * Wait for continued.  There's only one continued state and the
1358         * ptracer can consume it which can confuse the real parent.  Don't
1359         * use WCONTINUED from ptracer.  You don't need or want it.
1360         */
1361        return wait_task_continued(wo, p);
1362}
1363
1364/*
1365 * Do the work of do_wait() for one thread in the group, @tsk.
1366 *
1367 * -ECHILD should be in ->notask_error before the first call.
1368 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1369 * Returns zero if the search for a child should continue; then
1370 * ->notask_error is 0 if there were any eligible children,
1371 * or still -ECHILD.
1372 */
1373static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1374{
1375        struct task_struct *p;
1376
1377        list_for_each_entry(p, &tsk->children, sibling) {
1378                int ret = wait_consider_task(wo, 0, p);
1379
1380                if (ret)
1381                        return ret;
1382        }
1383
1384        return 0;
1385}
1386
1387static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1388{
1389        struct task_struct *p;
1390
1391        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1392                int ret = wait_consider_task(wo, 1, p);
1393
1394                if (ret)
1395                        return ret;
1396        }
1397
1398        return 0;
1399}
1400
1401static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1402                                int sync, void *key)
1403{
1404        struct wait_opts *wo = container_of(wait, struct wait_opts,
1405                                                child_wait);
1406        struct task_struct *p = key;
1407
1408        if (!eligible_pid(wo, p))
1409                return 0;
1410
1411        if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1412                return 0;
1413
1414        return default_wake_function(wait, mode, sync, key);
1415}
1416
1417void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1418{
1419        __wake_up_sync_key(&parent->signal->wait_chldexit,
1420                           TASK_INTERRUPTIBLE, p);
1421}
1422
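     /*
      * Core of the wait*() family: register on ->wait_chldexit, scan each
      * thread's children and ptraced lists for a task matching @wo, and
      * sleep in TASK_INTERRUPTIBLE until a matching child changes state,
      * a signal arrives, or WNOHANG cuts the wait short.
      */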
1423static long do_wait(struct wait_opts *wo)
1424{
1425        struct task_struct *tsk;
1426        int retval;
1427
1428        trace_sched_process_wait(wo->wo_pid);
1429
1430        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1431        wo->child_wait.private = current;
1432        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1433repeat:
1434        /*
1435         * If there is nothing that can match our criteria, just get out.
1436         * We will clear ->notask_error to zero if we see any child that
1437         * might later match our criteria, even if we are not able to reap
1438         * it yet.
1439         */
1440        wo->notask_error = -ECHILD;
1441        if ((wo->wo_type < PIDTYPE_MAX) &&
1442           (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
1443                goto notask;
1444
1445        set_current_state(TASK_INTERRUPTIBLE);
1446        read_lock(&tasklist_lock);
1447        tsk = current;
1448        do {
1449                retval = do_wait_thread(wo, tsk);
1450                if (retval)
1451                        goto end;
1452
1453                retval = ptrace_do_wait(wo, tsk);
1454                if (retval)
1455                        goto end;
1456
1457                if (wo->wo_flags & __WNOTHREAD)
1458                        break;
1459        } while_each_thread(current, tsk);
1460        read_unlock(&tasklist_lock);
1461
1462notask:
1463        retval = wo->notask_error;
1464        if (!retval && !(wo->wo_flags & WNOHANG)) {
1465                retval = -ERESTARTSYS;
1466                if (!signal_pending(current)) {
1467                        schedule();
1468                        goto repeat;
1469                }
1470        }
1471end:
1472        __set_current_state(TASK_RUNNING);
1473        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1474        return retval;
1475}
1476
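     /* Translate a pidfd into a counted struct pid, or an ERR_PTR on failure. */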
1477static struct pid *pidfd_get_pid(unsigned int fd)
1478{
1479        struct fd f;
1480        struct pid *pid;
1481
1482        f = fdget(fd);
1483        if (!f.file)
1484                return ERR_PTR(-EBADF);
1485
1486        pid = pidfd_pid(f.file);
1487        if (!IS_ERR(pid))
1488                get_pid(pid);
1489
1490        fdput(f);
1491        return pid;
1492}
1493
1494static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1495                          int options, struct rusage *ru)
1496{
1497        struct wait_opts wo;
1498        struct pid *pid = NULL;
1499        enum pid_type type;
1500        long ret;
1501
1502        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1503                        __WNOTHREAD|__WCLONE|__WALL))
1504                return -EINVAL;
1505        if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1506                return -EINVAL;
1507
1508        switch (which) {
1509        case P_ALL:
1510                type = PIDTYPE_MAX;
1511                break;
1512        case P_PID:
1513                type = PIDTYPE_PID;
1514                if (upid <= 0)
1515                        return -EINVAL;
1516
1517                pid = find_get_pid(upid);
1518                break;
1519        case P_PGID:
1520                type = PIDTYPE_PGID;
1521                if (upid < 0)
1522                        return -EINVAL;
1523
1524                if (upid)
1525                        pid = find_get_pid(upid);
1526                else
1527                        pid = get_task_pid(current, PIDTYPE_PGID);
1528                break;
1529        case P_PIDFD:
1530                type = PIDTYPE_PID;
1531                if (upid < 0)
1532                        return -EINVAL;
1533
1534                pid = pidfd_get_pid(upid);
1535                if (IS_ERR(pid))
1536                        return PTR_ERR(pid);
1537                break;
1538        default:
1539                return -EINVAL;
1540        }
1541
1542        wo.wo_type      = type;
1543        wo.wo_pid       = pid;
1544        wo.wo_flags     = options;
1545        wo.wo_info      = infop;
1546        wo.wo_rusage    = ru;
1547        ret = do_wait(&wo);
1548
1549        put_pid(pid);
1550        return ret;
1551}
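
/*
 * The P_PIDFD case above is reachable from userspace via waitid() on a
 * pidfd.  Sketch (assumes Linux >= 5.4; P_PIDFD is defined by hand in
 * case the libc headers do not provide it yet):
 *
 *	#define _GNU_SOURCE
 *	#include <sys/wait.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	#ifndef P_PIDFD
 *	#define P_PIDFD 3
 *	#endif
 *
 *	int main(void)
 *	{
 *		pid_t child = fork();
 *
 *		if (child == 0)
 *			_exit(42);
 *
 *		int pidfd = syscall(SYS_pidfd_open, child, 0);
 *		siginfo_t info = { 0 };
 *
 *		// pidfd_get_pid() above turns this fd back into a struct pid
 *		if (pidfd >= 0 && waitid(P_PIDFD, pidfd, &info, WEXITED) == 0)
 *			printf("pid %d exited with status %d\n",
 *			       info.si_pid, info.si_status);
 *		close(pidfd);
 *		return 0;
 *	}
 */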
1552
1553SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1554                infop, int, options, struct rusage __user *, ru)
1555{
1556        struct rusage r;
1557        struct waitid_info info = {.status = 0};
1558        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1559        int signo = 0;
1560
1561        if (err > 0) {
1562                signo = SIGCHLD;
1563                err = 0;
1564                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1565                        return -EFAULT;
1566        }
1567        if (!infop)
1568                return err;
1569
1570        if (!user_write_access_begin(infop, sizeof(*infop)))
1571                return -EFAULT;
1572
1573        unsafe_put_user(signo, &infop->si_signo, Efault);
1574        unsafe_put_user(0, &infop->si_errno, Efault);
1575        unsafe_put_user(info.cause, &infop->si_code, Efault);
1576        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1577        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1578        unsafe_put_user(info.status, &infop->si_status, Efault);
1579        user_write_access_end();
1580        return err;
1581Efault:
1582        user_write_access_end();
1583        return -EFAULT;
1584}
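
/*
 * Note on the success-without-a-child case visible above: when
 * kernel_waitid() returns 0 (WNOHANG and nothing to reap yet), signo
 * stays 0 but the siginfo is still written back, so callers should test
 * si_pid/si_signo rather than the return value alone.  Userspace sketch:
 *
 *	siginfo_t info;
 *
 *	info.si_pid = 0;
 *	if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) == 0 &&
 *	    info.si_pid == 0) {
 *		// children exist, but none of them has exited yet
 *	}
 */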
1585
1586long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1587                  struct rusage *ru)
1588{
1589        struct wait_opts wo;
1590        struct pid *pid = NULL;
1591        enum pid_type type;
1592        long ret;
1593
1594        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1595                        __WNOTHREAD|__WCLONE|__WALL))
1596                return -EINVAL;
1597
1598        /* -INT_MIN overflows and is undefined, so it cannot be negated below */
1599        if (upid == INT_MIN)
1600                return -ESRCH;
1601
1602        if (upid == -1)
1603                type = PIDTYPE_MAX;
1604        else if (upid < 0) {
1605                type = PIDTYPE_PGID;
1606                pid = find_get_pid(-upid);
1607        } else if (upid == 0) {
1608                type = PIDTYPE_PGID;
1609                pid = get_task_pid(current, PIDTYPE_PGID);
1610        } else /* upid > 0 */ {
1611                type = PIDTYPE_PID;
1612                pid = find_get_pid(upid);
1613        }
1614
1615        wo.wo_type      = type;
1616        wo.wo_pid       = pid;
1617        wo.wo_flags     = options | WEXITED;
1618        wo.wo_info      = NULL;
1619        wo.wo_stat      = 0;
1620        wo.wo_rusage    = ru;
1621        ret = do_wait(&wo);
1622        put_pid(pid);
1623        if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1624                ret = -EFAULT;
1625
1626        return ret;
1627}
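
/*
 * The upid encoding above is the classic wait4()/waitpid() contract:
 * upid > 0 waits for that pid, upid == 0 for the caller's process group,
 * upid < -1 for process group -upid, and upid == -1 for any child.
 * Userspace sketch that also uses the rusage side channel:
 *
 *	#define _GNU_SOURCE
 *	#include <sys/wait.h>
 *	#include <sys/resource.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		pid_t child = fork();
 *		int status;
 *		struct rusage ru;
 *
 *		if (child == 0)
 *			_exit(7);
 *
 *		if (wait4(child, &status, 0, &ru) == child &&
 *		    WIFEXITED(status))
 *			printf("exit %d, maxrss %ld kB\n",
 *			       WEXITSTATUS(status), ru.ru_maxrss);
 *		return 0;
 *	}
 */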
1628
1629int kernel_wait(pid_t pid, int *stat)
1630{
1631        struct wait_opts wo = {
1632                .wo_type        = PIDTYPE_PID,
1633                .wo_pid         = find_get_pid(pid),
1634                .wo_flags       = WEXITED,
1635        };
1636        int ret;
1637
1638        ret = do_wait(&wo);
1639        if (ret > 0 && wo.wo_stat)
1640                *stat = wo.wo_stat;
1641        put_pid(wo.wo_pid);
1642        return ret;
1643}
1644
1645SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1646                int, options, struct rusage __user *, ru)
1647{
1648        struct rusage r;
1649        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1650
1651        if (err > 0) {
1652                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1653                        return -EFAULT;
1654        }
1655        return err;
1656}
1657
1658#ifdef __ARCH_WANT_SYS_WAITPID
1659
1660/*
1661 * sys_waitpid() remains for compatibility. waitpid() should be
1662 * implemented in userspace by calling wait4() from libc.
1663 */
1664SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1665{
1666        return kernel_wait4(pid, stat_addr, options, NULL);
1667}
1668
1669#endif
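
/*
 * As the comment above says, the userspace wrapper can express waitpid()
 * in terms of wait4() (sketch of the equivalence, not libc's actual code):
 *
 *	#define _GNU_SOURCE
 *	#include <sys/wait.h>
 *
 *	pid_t my_waitpid(pid_t pid, int *status, int options)
 *	{
 *		return wait4(pid, status, options, NULL);
 *	}
 */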
1670
1671#ifdef CONFIG_COMPAT
1672COMPAT_SYSCALL_DEFINE4(wait4,
1673        compat_pid_t, pid,
1674        compat_uint_t __user *, stat_addr,
1675        int, options,
1676        struct compat_rusage __user *, ru)
1677{
1678        struct rusage r;
1679        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1680        if (err > 0) {
1681                if (ru && put_compat_rusage(&r, ru))
1682                        return -EFAULT;
1683        }
1684        return err;
1685}
1686
1687COMPAT_SYSCALL_DEFINE5(waitid,
1688                int, which, compat_pid_t, pid,
1689                struct compat_siginfo __user *, infop, int, options,
1690                struct compat_rusage __user *, uru)
1691{
1692        struct rusage ru;
1693        struct waitid_info info = {.status = 0};
1694        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1695        int signo = 0;
1696        if (err > 0) {
1697                signo = SIGCHLD;
1698                err = 0;
1699                if (uru) {
1700                        /* kernel_waitid() overwrites everything in ru */
1701                        if (COMPAT_USE_64BIT_TIME)
1702                                err = copy_to_user(uru, &ru, sizeof(ru));
1703                        else
1704                                err = put_compat_rusage(&ru, uru);
1705                        if (err)
1706                                return -EFAULT;
1707                }
1708        }
1709
1710        if (!infop)
1711                return err;
1712
1713        if (!user_write_access_begin(infop, sizeof(*infop)))
1714                return -EFAULT;
1715
1716        unsafe_put_user(signo, &infop->si_signo, Efault);
1717        unsafe_put_user(0, &infop->si_errno, Efault);
1718        unsafe_put_user(info.cause, &infop->si_code, Efault);
1719        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1720        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1721        unsafe_put_user(info.status, &infop->si_status, Efault);
1722        user_write_access_end();
1723        return err;
1724Efault:
1725        user_write_access_end();
1726        return -EFAULT;
1727}
1728#endif
1729
1730/**
1731 * thread_group_exited - check that a thread group has exited
1732 * @pid: tgid of thread group to be checked.
1733 *
1734 * Test if the thread group represented by tgid has exited (all
1735 * threads are zombies, dead or completely gone).
1736 *
1737 * Return: true if the thread group has exited. false otherwise.
1738 */
1739bool thread_group_exited(struct pid *pid)
1740{
1741        struct task_struct *task;
1742        bool exited;
1743
1744        rcu_read_lock();
1745        task = pid_task(pid, PIDTYPE_PID);
1746        exited = !task ||
1747                (READ_ONCE(task->exit_state) && thread_group_empty(task));
1748        rcu_read_unlock();
1749
1750        return exited;
1751}
1752EXPORT_SYMBOL(thread_group_exited);
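
/*
 * thread_group_exited() is what lets a pidfd signal readiness once the
 * whole thread group is gone (it backs the pidfd poll path).  Userspace
 * sketch (assumes Linux >= 5.3, where pidfds are pollable):
 *
 *	#define _GNU_SOURCE
 *	#include <poll.h>
 *	#include <sys/syscall.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	int wait_for_exit(pid_t pid)
 *	{
 *		struct pollfd pfd = {
 *			.fd = syscall(SYS_pidfd_open, pid, 0),
 *			.events = POLLIN,
 *		};
 *
 *		if (pfd.fd < 0)
 *			return -1;
 *		poll(&pfd, 1, -1);	// returns once the process has exited
 *		close(pfd.fd);
 *		return 0;
 *	}
 */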
1753
1754__weak void abort(void)
1755{
1756        BUG();
1757
1758        /* if that doesn't kill us, halt */
1759        panic("Oops failed to kill thread");
1760}
1761EXPORT_SYMBOL(abort);
1762