linux/kernel/exit.c
   1/*
   2 *  linux/kernel/exit.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/slab.h>
   9#include <linux/sched/autogroup.h>
  10#include <linux/sched/mm.h>
  11#include <linux/sched/stat.h>
  12#include <linux/sched/task.h>
  13#include <linux/sched/task_stack.h>
  14#include <linux/sched/cputime.h>
  15#include <linux/interrupt.h>
  16#include <linux/module.h>
  17#include <linux/capability.h>
  18#include <linux/completion.h>
  19#include <linux/personality.h>
  20#include <linux/tty.h>
  21#include <linux/iocontext.h>
  22#include <linux/key.h>
  23#include <linux/cpu.h>
  24#include <linux/acct.h>
  25#include <linux/tsacct_kern.h>
  26#include <linux/file.h>
  27#include <linux/fdtable.h>
  28#include <linux/freezer.h>
  29#include <linux/binfmts.h>
  30#include <linux/nsproxy.h>
  31#include <linux/pid_namespace.h>
  32#include <linux/ptrace.h>
  33#include <linux/profile.h>
  34#include <linux/mount.h>
  35#include <linux/proc_fs.h>
  36#include <linux/kthread.h>
  37#include <linux/mempolicy.h>
  38#include <linux/taskstats_kern.h>
  39#include <linux/delayacct.h>
  40#include <linux/cgroup.h>
  41#include <linux/syscalls.h>
  42#include <linux/signal.h>
  43#include <linux/posix-timers.h>
  44#include <linux/cn_proc.h>
  45#include <linux/mutex.h>
  46#include <linux/futex.h>
  47#include <linux/pipe_fs_i.h>
  48#include <linux/audit.h> /* for audit_free() */
  49#include <linux/resource.h>
  50#include <linux/blkdev.h>
  51#include <linux/task_io_accounting_ops.h>
  52#include <linux/tracehook.h>
  53#include <linux/fs_struct.h>
  54#include <linux/init_task.h>
  55#include <linux/perf_event.h>
  56#include <trace/events/sched.h>
  57#include <linux/hw_breakpoint.h>
  58#include <linux/oom.h>
  59#include <linux/writeback.h>
  60#include <linux/shm.h>
  61#include <linux/kcov.h>
  62#include <linux/random.h>
  63#include <linux/rcuwait.h>
  64#include <linux/compat.h>
  65
  66#include <linux/uaccess.h>
  67#include <asm/unistd.h>
  68#include <asm/pgtable.h>
  69#include <asm/mmu_context.h>
  70
  71static void __unhash_process(struct task_struct *p, bool group_dead)
  72{
  73        nr_threads--;
  74        detach_pid(p, PIDTYPE_PID);
  75        if (group_dead) {
  76                detach_pid(p, PIDTYPE_TGID);
  77                detach_pid(p, PIDTYPE_PGID);
  78                detach_pid(p, PIDTYPE_SID);
  79
  80                list_del_rcu(&p->tasks);
  81                list_del_init(&p->sibling);
  82                __this_cpu_dec(process_counts);
  83        }
  84        list_del_rcu(&p->thread_group);
  85        list_del_rcu(&p->thread_node);
  86}
  87
  88/*
  89 * This function expects the tasklist_lock write-locked.
  90 */
  91static void __exit_signal(struct task_struct *tsk)
  92{
  93        struct signal_struct *sig = tsk->signal;
  94        bool group_dead = thread_group_leader(tsk);
  95        struct sighand_struct *sighand;
  96        struct tty_struct *uninitialized_var(tty);
  97        u64 utime, stime;
  98
  99        sighand = rcu_dereference_check(tsk->sighand,
 100                                        lockdep_tasklist_lock_is_held());
 101        spin_lock(&sighand->siglock);
 102
 103#ifdef CONFIG_POSIX_TIMERS
 104        posix_cpu_timers_exit(tsk);
 105        if (group_dead) {
 106                posix_cpu_timers_exit_group(tsk);
 107        } else {
 108                /*
 109                 * This can only happen if the caller is de_thread().
 110                 * FIXME: this is a temporary hack; we should teach
 111                 * posix-cpu-timers to handle this case correctly.
 112                 */
 113                if (unlikely(has_group_leader_pid(tsk)))
 114                        posix_cpu_timers_exit_group(tsk);
 115        }
 116#endif
 117
 118        if (group_dead) {
 119                tty = sig->tty;
 120                sig->tty = NULL;
 121        } else {
 122                /*
 123                 * If there is any task waiting for the group exit
 124                 * then notify it:
 125                 */
 126                if (sig->notify_count > 0 && !--sig->notify_count)
 127                        wake_up_process(sig->group_exit_task);
 128
 129                if (tsk == sig->curr_target)
 130                        sig->curr_target = next_thread(tsk);
 131        }
 132
 133        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 134                              sizeof(unsigned long long));
 135
 136        /*
 137         * Accumulate here the counters for all threads as they die. We could
 138         * skip the group leader because it is the last user of signal_struct,
 139         * but we want to avoid the race with thread_group_cputime() which can
 140         * see the empty ->thread_head list.
 141         */
 142        task_cputime(tsk, &utime, &stime);
 143        write_seqlock(&sig->stats_lock);
 144        sig->utime += utime;
 145        sig->stime += stime;
 146        sig->gtime += task_gtime(tsk);
 147        sig->min_flt += tsk->min_flt;
 148        sig->maj_flt += tsk->maj_flt;
 149        sig->nvcsw += tsk->nvcsw;
 150        sig->nivcsw += tsk->nivcsw;
 151        sig->inblock += task_io_get_inblock(tsk);
 152        sig->oublock += task_io_get_oublock(tsk);
 153        task_io_accounting_add(&sig->ioac, &tsk->ioac);
 154        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 155        sig->nr_threads--;
 156        __unhash_process(tsk, group_dead);
 157        write_sequnlock(&sig->stats_lock);
 158
 159        /*
 160         * Do this under ->siglock, we can race with another thread
 161         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 162         */
 163        flush_sigqueue(&tsk->pending);
 164        tsk->sighand = NULL;
 165        spin_unlock(&sighand->siglock);
 166
 167        __cleanup_sighand(sighand);
 168        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 169        if (group_dead) {
 170                flush_sigqueue(&sig->shared_pending);
 171                tty_kref_put(tty);
 172        }
 173}
 174
 175static void delayed_put_task_struct(struct rcu_head *rhp)
 176{
 177        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 178
 179        perf_event_delayed_put(tsk);
 180        trace_sched_process_free(tsk);
 181        put_task_struct(tsk);
 182}
 183
 184
 185void release_task(struct task_struct *p)
 186{
 187        struct task_struct *leader;
 188        int zap_leader;
 189repeat:
 190        /* don't need to get the RCU readlock here - the process is dead and
 191         * can't be modifying its own credentials. But shut RCU-lockdep up */
 192        rcu_read_lock();
 193        atomic_dec(&__task_cred(p)->user->processes);
 194        rcu_read_unlock();
 195
 196        proc_flush_task(p);
 197
 198        write_lock_irq(&tasklist_lock);
 199        ptrace_release_task(p);
 200        __exit_signal(p);
 201
 202        /*
 203         * If we are the last non-leader member of the thread
 204         * group, and the leader is zombie, then notify the
 205         * group leader's parent process. (if it wants notification.)
 206         */
 207        zap_leader = 0;
 208        leader = p->group_leader;
 209        if (leader != p && thread_group_empty(leader)
 210                        && leader->exit_state == EXIT_ZOMBIE) {
 211                /*
 212                 * If we were the last child thread and the leader has
 213                 * exited already, and the leader's parent ignores SIGCHLD,
 214                 * then we are the one who should release the leader.
 215                 */
 216                zap_leader = do_notify_parent(leader, leader->exit_signal);
 217                if (zap_leader)
 218                        leader->exit_state = EXIT_DEAD;
 219        }
 220
 221        write_unlock_irq(&tasklist_lock);
 222        release_thread(p);
 223        call_rcu(&p->rcu, delayed_put_task_struct);
 224
 225        p = leader;
 226        if (unlikely(zap_leader))
 227                goto repeat;
 228}
 229
 230/*
 231 * Note that if this function returns a valid task_struct pointer (!NULL)
 232 * task->usage must remain >0 for the duration of the RCU critical section.
 233 */
 234struct task_struct *task_rcu_dereference(struct task_struct **ptask)
 235{
 236        struct sighand_struct *sighand;
 237        struct task_struct *task;
 238
 239        /*
 240         * We need to verify that release_task() was not called and thus
 241         * delayed_put_task_struct() can't run and drop the last reference
 242         * before rcu_read_unlock(). We check task->sighand != NULL,
 243         * but we can read the already freed and reused memory.
 244         */
 245retry:
 246        task = rcu_dereference(*ptask);
 247        if (!task)
 248                return NULL;
 249
 250        probe_kernel_address(&task->sighand, sighand);
 251
 252        /*
 253         * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
 254         * was already freed we can not miss the preceding update of this
 255         * pointer.
 256         */
 257        smp_rmb();
 258        if (unlikely(task != READ_ONCE(*ptask)))
 259                goto retry;
 260
 261        /*
 262         * We've re-checked that "task == *ptask", now we have two different
 263         * cases:
 264         *
 265         * 1. This is actually the same task/task_struct. In this case
 266         *    sighand != NULL tells us it is still alive.
 267         *
 268         * 2. This is another task which got the same memory for task_struct.
 269         *    We can't know this of course, and we can not trust
 270         *    sighand != NULL.
 271         *
 272         *    In this case we actually return a random value, but this is
 273         *    correct.
 274         *
 275         *    If we return NULL - we can pretend that we actually noticed that
 276         *    *ptask was updated when the previous task has exited. Or pretend
 277         *    that probe_kernel_address(&sighand) reads NULL.
 278         *
 279         *    If we return the new task (because sighand is not NULL for any
 280         *    reason) - this is fine too. This (new) task can't go away before
 281         *    another gp pass.
 282         *
 283         *    And note: We could even eliminate the false positive if we re-read
 284         *    task->sighand once again to avoid the false NULL. But this case
 285         *    is very unlikely so we don't care.
 286         */
 287        if (!sighand)
 288                return NULL;
 289
 290        return task;
 291}
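
/*
 * Illustrative usage sketch (editorial note, not part of the original file):
 * a caller pins the result for the duration of an RCU read-side critical
 * section, e.g. when peeking at another CPU's current task.  The rq->curr
 * and do_something() names are assumed for the example only.
 *
 *	rcu_read_lock();
 *	task = task_rcu_dereference(&rq->curr);
 *	if (task)
 *		do_something(task);	(task->usage stays > 0 until unlock)
 *	rcu_read_unlock();
 */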
 292
 293void rcuwait_wake_up(struct rcuwait *w)
 294{
 295        struct task_struct *task;
 296
 297        rcu_read_lock();
 298
 299        /*
 300         * Order condition vs @task, such that everything prior to the load
 301         * of @task is visible. This is the condition for which the user called
 302         * rcuwait_wake_up() in the first place. Pairs with set_current_state()
 303         * barrier (A) in rcuwait_wait_event().
 304         *
 305         *    WAIT                WAKE
 306         *    [S] tsk = current   [S] cond = true
 307         *        MB (A)              MB (B)
 308         *    [L] cond            [L] tsk
 309         */
 310        smp_rmb(); /* (B) */
 311
 312        /*
 313         * Avoid using task_rcu_dereference() magic as long as we are careful,
 314         * see comment in rcuwait_wait_event() regarding ->exit_state.
 315         */
 316        task = rcu_dereference(w->task);
 317        if (task)
 318                wake_up_process(task);
 319        rcu_read_unlock();
 320}
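
/*
 * Illustrative pairing sketch (editorial note, not part of the original file):
 * the waiter side is rcuwait_wait_event() from <linux/rcuwait.h>, which
 * publishes current into w->task with barrier (A) and re-checks the
 * condition.  The "done" flag is an assumed example.
 *
 *	waiter:	rcuwait_wait_event(&w, READ_ONCE(done));
 *
 *	waker:	WRITE_ONCE(done, true);
 *		rcuwait_wake_up(&w);	(pairs with (A) via barrier (B))
 */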
 321
 322/*
 323 * Determine if a process group is "orphaned", according to the POSIX
 324 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 325 * by terminal-generated stop signals.  Newly orphaned process groups are
 326 * to receive a SIGHUP and a SIGCONT.
 327 *
 328 * "I ask you, have you ever known what it is to be an orphan?"
 329 */
 330static int will_become_orphaned_pgrp(struct pid *pgrp,
 331                                        struct task_struct *ignored_task)
 332{
 333        struct task_struct *p;
 334
 335        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 336                if ((p == ignored_task) ||
 337                    (p->exit_state && thread_group_empty(p)) ||
 338                    is_global_init(p->real_parent))
 339                        continue;
 340
 341                if (task_pgrp(p->real_parent) != pgrp &&
 342                    task_session(p->real_parent) == task_session(p))
 343                        return 0;
 344        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 345
 346        return 1;
 347}
 348
 349int is_current_pgrp_orphaned(void)
 350{
 351        int retval;
 352
 353        read_lock(&tasklist_lock);
 354        retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
 355        read_unlock(&tasklist_lock);
 356
 357        return retval;
 358}
 359
 360static bool has_stopped_jobs(struct pid *pgrp)
 361{
 362        struct task_struct *p;
 363
 364        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 365                if (p->signal->flags & SIGNAL_STOP_STOPPED)
 366                        return true;
 367        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 368
 369        return false;
 370}
 371
 372/*
 373 * Check to see if any process groups have become orphaned as
 374 * a result of our exiting, and if they have any stopped jobs,
 375 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 376 */
 377static void
 378kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 379{
 380        struct pid *pgrp = task_pgrp(tsk);
 381        struct task_struct *ignored_task = tsk;
 382
 383        if (!parent)
 384                /* exit: our father is in a different pgrp than
 385                 * we are and we were the only connection outside.
 386                 */
 387                parent = tsk->real_parent;
 388        else
 389                /* reparent: our child is in a different pgrp than
 390                 * we are, and it was the only connection outside.
 391                 */
 392                ignored_task = NULL;
 393
 394        if (task_pgrp(parent) != pgrp &&
 395            task_session(parent) == task_session(tsk) &&
 396            will_become_orphaned_pgrp(pgrp, ignored_task) &&
 397            has_stopped_jobs(pgrp)) {
 398                __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 399                __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 400        }
 401}
 402
 403#ifdef CONFIG_MEMCG
 404/*
 405 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 406 */
 407void mm_update_next_owner(struct mm_struct *mm)
 408{
 409        struct task_struct *c, *g, *p = current;
 410
 411retry:
 412        /*
 413         * If the exiting or execing task is not the owner, it's
 414         * someone else's problem.
 415         */
 416        if (mm->owner != p)
 417                return;
 418        /*
 419         * The current owner is exiting/execing and there are no other
 420         * candidates.  Do not leave the mm pointing to a possibly
 421         * freed task structure.
 422         */
 423        if (atomic_read(&mm->mm_users) <= 1) {
 424                mm->owner = NULL;
 425                return;
 426        }
 427
 428        read_lock(&tasklist_lock);
 429        /*
 430         * Search in the children
 431         */
 432        list_for_each_entry(c, &p->children, sibling) {
 433                if (c->mm == mm)
 434                        goto assign_new_owner;
 435        }
 436
 437        /*
 438         * Search in the siblings
 439         */
 440        list_for_each_entry(c, &p->real_parent->children, sibling) {
 441                if (c->mm == mm)
 442                        goto assign_new_owner;
 443        }
 444
 445        /*
 446         * Search through everything else, we should not get here often.
 447         */
 448        for_each_process(g) {
 449                if (g->flags & PF_KTHREAD)
 450                        continue;
 451                for_each_thread(g, c) {
 452                        if (c->mm == mm)
 453                                goto assign_new_owner;
 454                        if (c->mm)
 455                                break;
 456                }
 457        }
 458        read_unlock(&tasklist_lock);
 459        /*
 460         * We found no owner yet mm_users > 1: this implies that we are
 461         * most likely racing with swapoff (try_to_unuse()) or /proc or
 462         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 463         */
 464        mm->owner = NULL;
 465        return;
 466
 467assign_new_owner:
 468        BUG_ON(c == p);
 469        get_task_struct(c);
 470        /*
 471         * The task_lock protects c->mm from changing.
 472         * We always want mm->owner->mm == mm
 473         */
 474        task_lock(c);
 475        /*
 476         * Delay read_unlock() till we have the task_lock()
 477         * to ensure that c does not slip away underneath us
 478         */
 479        read_unlock(&tasklist_lock);
 480        if (c->mm != mm) {
 481                task_unlock(c);
 482                put_task_struct(c);
 483                goto retry;
 484        }
 485        mm->owner = c;
 486        task_unlock(c);
 487        put_task_struct(c);
 488}
 489#endif /* CONFIG_MEMCG */
 490
 491/*
 492 * Turn us into a lazy TLB process if we
 493 * aren't already..
 494 */
 495static void exit_mm(void)
 496{
 497        struct mm_struct *mm = current->mm;
 498        struct core_state *core_state;
 499
 500        mm_release(current, mm);
 501        if (!mm)
 502                return;
 503        sync_mm_rss(mm);
 504        /*
 505         * Serialize with any possible pending coredump.
 506         * We must hold mmap_sem around checking core_state
 507         * and clearing tsk->mm.  The core-inducing thread
 508         * will increment ->nr_threads for each thread in the
 509         * group with ->mm != NULL.
 510         */
 511        down_read(&mm->mmap_sem);
 512        core_state = mm->core_state;
 513        if (core_state) {
 514                struct core_thread self;
 515
 516                up_read(&mm->mmap_sem);
 517
 518                self.task = current;
 519                self.next = xchg(&core_state->dumper.next, &self);
 520                /*
 521                 * Implies mb(), the result of xchg() must be visible
 522                 * to core_state->dumper.
 523                 */
 524                if (atomic_dec_and_test(&core_state->nr_threads))
 525                        complete(&core_state->startup);
 526
 527                for (;;) {
 528                        set_current_state(TASK_UNINTERRUPTIBLE);
 529                        if (!self.task) /* see coredump_finish() */
 530                                break;
 531                        freezable_schedule();
 532                }
 533                __set_current_state(TASK_RUNNING);
 534                down_read(&mm->mmap_sem);
 535        }
 536        mmgrab(mm);
 537        BUG_ON(mm != current->active_mm);
 538        /* more a memory barrier than a real lock */
 539        task_lock(current);
 540        current->mm = NULL;
 541        up_read(&mm->mmap_sem);
 542        enter_lazy_tlb(mm, current);
 543        task_unlock(current);
 544        mm_update_next_owner(mm);
 545        mmput(mm);
 546        if (test_thread_flag(TIF_MEMDIE))
 547                exit_oom_victim();
 548}
 549
 550static struct task_struct *find_alive_thread(struct task_struct *p)
 551{
 552        struct task_struct *t;
 553
 554        for_each_thread(p, t) {
 555                if (!(t->flags & PF_EXITING))
 556                        return t;
 557        }
 558        return NULL;
 559}
 560
 561static struct task_struct *find_child_reaper(struct task_struct *father)
 562        __releases(&tasklist_lock)
 563        __acquires(&tasklist_lock)
 564{
 565        struct pid_namespace *pid_ns = task_active_pid_ns(father);
 566        struct task_struct *reaper = pid_ns->child_reaper;
 567
 568        if (likely(reaper != father))
 569                return reaper;
 570
 571        reaper = find_alive_thread(father);
 572        if (reaper) {
 573                pid_ns->child_reaper = reaper;
 574                return reaper;
 575        }
 576
 577        write_unlock_irq(&tasklist_lock);
 578        if (unlikely(pid_ns == &init_pid_ns)) {
 579                panic("Attempted to kill init! exitcode=0x%08x\n",
 580                        father->signal->group_exit_code ?: father->exit_code);
 581        }
 582        zap_pid_ns_processes(pid_ns);
 583        write_lock_irq(&tasklist_lock);
 584
 585        return father;
 586}
 587
 588/*
 589 * When we die, we re-parent all our children, and try to:
 590 * 1. give them to another thread in our thread group, if such a member exists
 591 * 2. give them to the first ancestor process which prctl'd itself as a
 592 *    child_subreaper for its children (like a service manager)
 593 * 3. give them to the init process (PID 1) in our pid namespace
 594 */
 595static struct task_struct *find_new_reaper(struct task_struct *father,
 596                                           struct task_struct *child_reaper)
 597{
 598        struct task_struct *thread, *reaper;
 599
 600        thread = find_alive_thread(father);
 601        if (thread)
 602                return thread;
 603
 604        if (father->signal->has_child_subreaper) {
 605                unsigned int ns_level = task_pid(father)->level;
 606                /*
 607                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
 608                 * We can't check reaper != child_reaper to ensure we do not
 609                 * cross the namespaces, the exiting parent could be injected
 610                 * by setns() + fork().
 611                 * We check pid->level, this is slightly more efficient than
 612                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
 613                 */
 614                for (reaper = father->real_parent;
 615                     task_pid(reaper)->level == ns_level;
 616                     reaper = reaper->real_parent) {
 617                        if (reaper == &init_task)
 618                                break;
 619                        if (!reaper->signal->is_child_subreaper)
 620                                continue;
 621                        thread = find_alive_thread(reaper);
 622                        if (thread)
 623                                return thread;
 624                }
 625        }
 626
 627        return child_reaper;
 628}
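
/*
 * Illustrative userspace sketch (editorial note, not part of the original
 * file): a service manager opts in to step 2 of the reparenting rules above
 * with
 *
 *	prctl(PR_SET_CHILD_SUBREAPER, 1);
 *
 * after which orphaned descendants below it are reparented to it rather than
 * to the pid namespace's init.
 */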
 629
 630/*
 631 * Any that need to be release_task'd are put on the @dead list.
 632 */
 633static void reparent_leader(struct task_struct *father, struct task_struct *p,
 634                                struct list_head *dead)
 635{
 636        if (unlikely(p->exit_state == EXIT_DEAD))
 637                return;
 638
 639        /* We don't want people slaying init. */
 640        p->exit_signal = SIGCHLD;
 641
 642        /* If it has exited notify the new parent about this child's death. */
 643        if (!p->ptrace &&
 644            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 645                if (do_notify_parent(p, p->exit_signal)) {
 646                        p->exit_state = EXIT_DEAD;
 647                        list_add(&p->ptrace_entry, dead);
 648                }
 649        }
 650
 651        kill_orphaned_pgrp(p, father);
 652}
 653
 654/*
 655 * This does two things:
 656 *
 657 * A.  Make init inherit all the child processes
 658 * B.  Check to see if any process groups have become orphaned
 659 *      as a result of our exiting, and if they have any stopped
 660 *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 661 */
 662static void forget_original_parent(struct task_struct *father,
 663                                        struct list_head *dead)
 664{
 665        struct task_struct *p, *t, *reaper;
 666
 667        if (unlikely(!list_empty(&father->ptraced)))
 668                exit_ptrace(father, dead);
 669
 670        /* Can drop and reacquire tasklist_lock */
 671        reaper = find_child_reaper(father);
 672        if (list_empty(&father->children))
 673                return;
 674
 675        reaper = find_new_reaper(father, reaper);
 676        list_for_each_entry(p, &father->children, sibling) {
 677                for_each_thread(p, t) {
 678                        t->real_parent = reaper;
 679                        BUG_ON((!t->ptrace) != (t->parent == father));
 680                        if (likely(!t->ptrace))
 681                                t->parent = t->real_parent;
 682                        if (t->pdeath_signal)
 683                                group_send_sig_info(t->pdeath_signal,
 684                                                    SEND_SIG_NOINFO, t,
 685                                                    PIDTYPE_TGID);
 686                }
 687                /*
 688                 * If this is a threaded reparent there is no need to
 689                 * notify anyone that anything has happened.
 690                 */
 691                if (!same_thread_group(reaper, father))
 692                        reparent_leader(father, p, dead);
 693        }
 694        list_splice_tail_init(&father->children, &reaper->children);
 695}
 696
 697/*
 698 * Send signals to all our closest relatives so that they know
 699 * to properly mourn us..
 700 */
 701static void exit_notify(struct task_struct *tsk, int group_dead)
 702{
 703        bool autoreap;
 704        struct task_struct *p, *n;
 705        LIST_HEAD(dead);
 706
 707        write_lock_irq(&tasklist_lock);
 708        forget_original_parent(tsk, &dead);
 709
 710        if (group_dead)
 711                kill_orphaned_pgrp(tsk->group_leader, NULL);
 712
 713        if (unlikely(tsk->ptrace)) {
 714                int sig = thread_group_leader(tsk) &&
 715                                thread_group_empty(tsk) &&
 716                                !ptrace_reparented(tsk) ?
 717                        tsk->exit_signal : SIGCHLD;
 718                autoreap = do_notify_parent(tsk, sig);
 719        } else if (thread_group_leader(tsk)) {
 720                autoreap = thread_group_empty(tsk) &&
 721                        do_notify_parent(tsk, tsk->exit_signal);
 722        } else {
 723                autoreap = true;
 724        }
 725
 726        tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
 727        if (tsk->exit_state == EXIT_DEAD)
 728                list_add(&tsk->ptrace_entry, &dead);
 729
 730        /* mt-exec, de_thread() is waiting for group leader */
 731        if (unlikely(tsk->signal->notify_count < 0))
 732                wake_up_process(tsk->signal->group_exit_task);
 733        write_unlock_irq(&tasklist_lock);
 734
 735        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 736                list_del_init(&p->ptrace_entry);
 737                release_task(p);
 738        }
 739}
 740
 741#ifdef CONFIG_DEBUG_STACK_USAGE
 742static void check_stack_usage(void)
 743{
 744        static DEFINE_SPINLOCK(low_water_lock);
 745        static int lowest_to_date = THREAD_SIZE;
 746        unsigned long free;
 747
 748        free = stack_not_used(current);
 749
 750        if (free >= lowest_to_date)
 751                return;
 752
 753        spin_lock(&low_water_lock);
 754        if (free < lowest_to_date) {
 755                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
 756                        current->comm, task_pid_nr(current), free);
 757                lowest_to_date = free;
 758        }
 759        spin_unlock(&low_water_lock);
 760}
 761#else
 762static inline void check_stack_usage(void) {}
 763#endif
 764
 765void __noreturn do_exit(long code)
 766{
 767        struct task_struct *tsk = current;
 768        int group_dead;
 769
 770        profile_task_exit(tsk);
 771        kcov_task_exit(tsk);
 772
 773        WARN_ON(blk_needs_flush_plug(tsk));
 774
 775        if (unlikely(in_interrupt()))
 776                panic("Aiee, killing interrupt handler!");
 777        if (unlikely(!tsk->pid))
 778                panic("Attempted to kill the idle task!");
 779
 780        /*
 781         * If do_exit is called because this process oopsed, it's possible
 782         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
 783         * continuing. Amongst other possible reasons, this is to prevent
 784         * mm_release()->clear_child_tid() from writing to a user-controlled
 785         * kernel address.
 786         */
 787        set_fs(USER_DS);
 788
 789        ptrace_event(PTRACE_EVENT_EXIT, code);
 790
 791        validate_creds_for_do_exit(tsk);
 792
 793        /*
 794         * We're taking recursive faults here in do_exit. Safest is to just
 795         * leave this task alone and wait for reboot.
 796         */
 797        if (unlikely(tsk->flags & PF_EXITING)) {
 798                pr_alert("Fixing recursive fault but reboot is needed!\n");
 799                /*
 800                 * We can do this unlocked here. The futex code uses
 801                 * this flag just to verify whether the pi state
 802                 * cleanup has been done or not. In the worst case it
 803                 * loops once more. We pretend that the cleanup was
 804                 * done as there is no way to return. Either the
 805                 * OWNER_DIED bit is set by now or we push the blocked
 806                 * task into the wait-forever nirvana as well.
 807                 */
 808                tsk->flags |= PF_EXITPIDONE;
 809                set_current_state(TASK_UNINTERRUPTIBLE);
 810                schedule();
 811        }
 812
 813        exit_signals(tsk);  /* sets PF_EXITING */
 814        /*
 815         * Ensure that all new tsk->pi_lock acquisitions must observe
 816         * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
 817         */
 818        smp_mb();
 819        /*
 820         * Ensure that we must observe the pi_state in exit_mm() ->
 821         * mm_release() -> exit_pi_state_list().
 822         */
 823        raw_spin_lock_irq(&tsk->pi_lock);
 824        raw_spin_unlock_irq(&tsk->pi_lock);
 825
 826        if (unlikely(in_atomic())) {
 827                pr_info("note: %s[%d] exited with preempt_count %d\n",
 828                        current->comm, task_pid_nr(current),
 829                        preempt_count());
 830                preempt_count_set(PREEMPT_ENABLED);
 831        }
 832
 833        /* sync mm's RSS info before statistics gathering */
 834        if (tsk->mm)
 835                sync_mm_rss(tsk->mm);
 836        acct_update_integrals(tsk);
 837        group_dead = atomic_dec_and_test(&tsk->signal->live);
 838        if (group_dead) {
 839#ifdef CONFIG_POSIX_TIMERS
 840                hrtimer_cancel(&tsk->signal->real_timer);
 841                exit_itimers(tsk->signal);
 842#endif
 843                if (tsk->mm)
 844                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 845        }
 846        acct_collect(code, group_dead);
 847        if (group_dead)
 848                tty_audit_exit();
 849        audit_free(tsk);
 850
 851        tsk->exit_code = code;
 852        taskstats_exit(tsk, group_dead);
 853
 854        exit_mm();
 855
 856        if (group_dead)
 857                acct_process();
 858        trace_sched_process_exit(tsk);
 859
 860        exit_sem(tsk);
 861        exit_shm(tsk);
 862        exit_files(tsk);
 863        exit_fs(tsk);
 864        if (group_dead)
 865                disassociate_ctty(1);
 866        exit_task_namespaces(tsk);
 867        exit_task_work(tsk);
 868        exit_thread(tsk);
 869
 870        /*
 871         * Flush inherited counters to the parent - before the parent
 872         * gets woken up by child-exit notifications.
 873         *
 874         * Because of cgroup mode, this must be called before cgroup_exit().
 875         */
 876        perf_event_exit_task(tsk);
 877
 878        sched_autogroup_exit_task(tsk);
 879        cgroup_exit(tsk);
 880
 881        /*
 882         * FIXME: do that only when needed, using sched_exit tracepoint
 883         */
 884        flush_ptrace_hw_breakpoint(tsk);
 885
 886        exit_tasks_rcu_start();
 887        exit_notify(tsk, group_dead);
 888        proc_exit_connector(tsk);
 889        mpol_put_task_policy(tsk);
 890#ifdef CONFIG_FUTEX
 891        if (unlikely(current->pi_state_cache))
 892                kfree(current->pi_state_cache);
 893#endif
 894        /*
 895         * Make sure we are holding no locks:
 896         */
 897        debug_check_no_locks_held();
 898        /*
 899         * We can do this unlocked here. The futex code uses this flag
 900         * just to verify whether the pi state cleanup has been done
 901         * or not. In the worst case it loops once more.
 902         */
 903        tsk->flags |= PF_EXITPIDONE;
 904
 905        if (tsk->io_context)
 906                exit_io_context(tsk);
 907
 908        if (tsk->splice_pipe)
 909                free_pipe_info(tsk->splice_pipe);
 910
 911        if (tsk->task_frag.page)
 912                put_page(tsk->task_frag.page);
 913
 914        validate_creds_for_do_exit(tsk);
 915
 916        check_stack_usage();
 917        preempt_disable();
 918        if (tsk->nr_dirtied)
 919                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 920        exit_rcu();
 921        exit_tasks_rcu_finish();
 922
 923        lockdep_free_task(tsk);
 924        do_task_dead();
 925}
 926EXPORT_SYMBOL_GPL(do_exit);
 927
 928void complete_and_exit(struct completion *comp, long code)
 929{
 930        if (comp)
 931                complete(comp);
 932
 933        do_exit(code);
 934}
 935EXPORT_SYMBOL(complete_and_exit);
 936
 937SYSCALL_DEFINE1(exit, int, error_code)
 938{
 939        do_exit((error_code&0xff)<<8);
 940}
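
/*
 * Encoding note (editorial): only the low byte of the userspace status is
 * kept and shifted into bits 8-15 of the exit code, so e.g. exit(3) stores
 * 0x300 and the parent's wait status satisfies WIFEXITED() with
 * WEXITSTATUS() == 3.
 */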
 941
 942/*
 943 * Take down every thread in the group.  This is called by fatal signals
 944 * as well as by sys_exit_group (below).
 945 */
 946void
 947do_group_exit(int exit_code)
 948{
 949        struct signal_struct *sig = current->signal;
 950
 951        BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 952
 953        if (signal_group_exit(sig))
 954                exit_code = sig->group_exit_code;
 955        else if (!thread_group_empty(current)) {
 956                struct sighand_struct *const sighand = current->sighand;
 957
 958                spin_lock_irq(&sighand->siglock);
 959                if (signal_group_exit(sig))
 960                        /* Another thread got here before we took the lock.  */
 961                        exit_code = sig->group_exit_code;
 962                else {
 963                        sig->group_exit_code = exit_code;
 964                        sig->flags = SIGNAL_GROUP_EXIT;
 965                        zap_other_threads(current);
 966                }
 967                spin_unlock_irq(&sighand->siglock);
 968        }
 969
 970        do_exit(exit_code);
 971        /* NOTREACHED */
 972}
 973
 974/*
 975 * this kills every thread in the thread group. Note that any externally
 976 * wait4()-ing process will get the correct exit code - even if this
 977 * thread is not the thread group leader.
 978 */
 979SYSCALL_DEFINE1(exit_group, int, error_code)
 980{
 981        do_group_exit((error_code & 0xff) << 8);
 982        /* NOTREACHED */
 983        return 0;
 984}
 985
 986struct waitid_info {
 987        pid_t pid;
 988        uid_t uid;
 989        int status;
 990        int cause;
 991};
 992
 993struct wait_opts {
 994        enum pid_type           wo_type;
 995        int                     wo_flags;
 996        struct pid              *wo_pid;
 997
 998        struct waitid_info      *wo_info;
 999        int                     wo_stat;
1000        struct rusage           *wo_rusage;
1001
1002        wait_queue_entry_t              child_wait;
1003        int                     notask_error;
1004};
1005
1006static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1007{
1008        return  wo->wo_type == PIDTYPE_MAX ||
1009                task_pid_type(p, wo->wo_type) == wo->wo_pid;
1010}
1011
1012static int
1013eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
1014{
1015        if (!eligible_pid(wo, p))
1016                return 0;
1017
1018        /*
1019         * Wait for all children (clone and not) if __WALL is set or
1020         * if the child is traced by us.
1021         */
1022        if (ptrace || (wo->wo_flags & __WALL))
1023                return 1;
1024
1025        /*
1026         * Otherwise, wait for clone children *only* if __WCLONE is set;
1027         * otherwise, wait for non-clone children *only*.
1028         *
1029         * Note: a "clone" child here is one that reports to its parent
1030         * using a signal other than SIGCHLD, or a non-leader thread which
1031         * we can only see if it is traced by us.
1032         */
1033        if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1034                return 0;
1035
1036        return 1;
1037}
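
/*
 * Editorial note on the XOR test above: a child that reports with SIGCHLD is
 * eligible only when __WCLONE is clear, while a "clone" child (any other
 * exit_signal, or a traced non-leader thread) is eligible only when __WCLONE
 * is set; __WALL, handled earlier, accepts both.
 */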
1038
1039/*
1040 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
1041 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1042 * the lock and this task is uninteresting.  If we return nonzero, we have
1043 * released the lock and the system call should return.
1044 */
1045static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1046{
1047        int state, status;
1048        pid_t pid = task_pid_vnr(p);
1049        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1050        struct waitid_info *infop;
1051
1052        if (!likely(wo->wo_flags & WEXITED))
1053                return 0;
1054
1055        if (unlikely(wo->wo_flags & WNOWAIT)) {
1056                status = p->exit_code;
1057                get_task_struct(p);
1058                read_unlock(&tasklist_lock);
1059                sched_annotate_sleep();
1060                if (wo->wo_rusage)
1061                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1062                put_task_struct(p);
1063                goto out_info;
1064        }
1065        /*
1066         * Move the task's state to DEAD/TRACE, only one thread can do this.
1067         */
1068        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
1069                EXIT_TRACE : EXIT_DEAD;
1070        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1071                return 0;
1072        /*
1073         * We own this thread, nobody else can reap it.
1074         */
1075        read_unlock(&tasklist_lock);
1076        sched_annotate_sleep();
1077
1078        /*
1079         * Check thread_group_leader() to exclude the traced sub-threads.
1080         */
1081        if (state == EXIT_DEAD && thread_group_leader(p)) {
1082                struct signal_struct *sig = p->signal;
1083                struct signal_struct *psig = current->signal;
1084                unsigned long maxrss;
1085                u64 tgutime, tgstime;
1086
1087                /*
1088                 * The resource counters for the group leader are in its
1089                 * own task_struct.  Those for dead threads in the group
1090                 * are in its signal_struct, as are those for the child
1091                 * processes it has previously reaped.  All these
1092                 * accumulate in the parent's signal_struct c* fields.
1093                 *
1094                 * We don't bother to take a lock here to protect these
1095                 * p->signal fields because the whole thread group is dead
1096                 * and nobody can change them.
1097                 *
1098                 * psig->stats_lock also protects us from our sub-threads
1099                 * which can reap other children at the same time. Until
1100                 * we change k_getrusage()-like users to rely on this lock
1101                 * we have to take ->siglock as well.
1102                 *
1103                 * We use thread_group_cputime_adjusted() to get times for
1104                 * the thread group, which consolidates times for all threads
1105                 * in the group including the group leader.
1106                 */
1107                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1108                spin_lock_irq(&current->sighand->siglock);
1109                write_seqlock(&psig->stats_lock);
1110                psig->cutime += tgutime + sig->cutime;
1111                psig->cstime += tgstime + sig->cstime;
1112                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1113                psig->cmin_flt +=
1114                        p->min_flt + sig->min_flt + sig->cmin_flt;
1115                psig->cmaj_flt +=
1116                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1117                psig->cnvcsw +=
1118                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
1119                psig->cnivcsw +=
1120                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
1121                psig->cinblock +=
1122                        task_io_get_inblock(p) +
1123                        sig->inblock + sig->cinblock;
1124                psig->coublock +=
1125                        task_io_get_oublock(p) +
1126                        sig->oublock + sig->coublock;
1127                maxrss = max(sig->maxrss, sig->cmaxrss);
1128                if (psig->cmaxrss < maxrss)
1129                        psig->cmaxrss = maxrss;
1130                task_io_accounting_add(&psig->ioac, &p->ioac);
1131                task_io_accounting_add(&psig->ioac, &sig->ioac);
1132                write_sequnlock(&psig->stats_lock);
1133                spin_unlock_irq(&current->sighand->siglock);
1134        }
1135
1136        if (wo->wo_rusage)
1137                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1138        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1139                ? p->signal->group_exit_code : p->exit_code;
1140        wo->wo_stat = status;
1141
1142        if (state == EXIT_TRACE) {
1143                write_lock_irq(&tasklist_lock);
1144                /* We dropped tasklist, ptracer could die and untrace */
1145                ptrace_unlink(p);
1146
1147                /* If parent wants a zombie, don't release it now */
1148                state = EXIT_ZOMBIE;
1149                if (do_notify_parent(p, p->exit_signal))
1150                        state = EXIT_DEAD;
1151                p->exit_state = state;
1152                write_unlock_irq(&tasklist_lock);
1153        }
1154        if (state == EXIT_DEAD)
1155                release_task(p);
1156
1157out_info:
1158        infop = wo->wo_info;
1159        if (infop) {
1160                if ((status & 0x7f) == 0) {
1161                        infop->cause = CLD_EXITED;
1162                        infop->status = status >> 8;
1163                } else {
1164                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1165                        infop->status = status & 0x7f;
1166                }
1167                infop->pid = pid;
1168                infop->uid = uid;
1169        }
1170
1171        return pid;
1172}
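
/*
 * Editorial note: the decoding above follows the classic wait status layout -
 * zero low 7 bits means a normal exit (status >> 8 is the exit value),
 * otherwise the low 7 bits are the fatal signal and bit 0x80 flags a core
 * dump.
 */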
1173
1174static int *task_stopped_code(struct task_struct *p, bool ptrace)
1175{
1176        if (ptrace) {
1177                if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1178                        return &p->exit_code;
1179        } else {
1180                if (p->signal->flags & SIGNAL_STOP_STOPPED)
1181                        return &p->signal->group_exit_code;
1182        }
1183        return NULL;
1184}
1185
1186/**
1187 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1188 * @wo: wait options
1189 * @ptrace: is the wait for ptrace
1190 * @p: task to wait for
1191 *
1192 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1193 *
1194 * CONTEXT:
1195 * read_lock(&tasklist_lock), which is released if return value is
1196 * non-zero.  Also, grabs and releases @p->sighand->siglock.
1197 *
1198 * RETURNS:
1199 * 0 if wait condition didn't exist and search for other wait conditions
1200 * should continue.  Non-zero return, -errno on failure and @p's pid on
1201 * success, implies that tasklist_lock is released and wait condition
1202 * search should terminate.
1203 */
1204static int wait_task_stopped(struct wait_opts *wo,
1205                                int ptrace, struct task_struct *p)
1206{
1207        struct waitid_info *infop;
1208        int exit_code, *p_code, why;
1209        uid_t uid = 0; /* unneeded, required by compiler */
1210        pid_t pid;
1211
1212        /*
1213         * Traditionally we see ptrace'd stopped tasks regardless of options.
1214         */
1215        if (!ptrace && !(wo->wo_flags & WUNTRACED))
1216                return 0;
1217
1218        if (!task_stopped_code(p, ptrace))
1219                return 0;
1220
1221        exit_code = 0;
1222        spin_lock_irq(&p->sighand->siglock);
1223
1224        p_code = task_stopped_code(p, ptrace);
1225        if (unlikely(!p_code))
1226                goto unlock_sig;
1227
1228        exit_code = *p_code;
1229        if (!exit_code)
1230                goto unlock_sig;
1231
1232        if (!unlikely(wo->wo_flags & WNOWAIT))
1233                *p_code = 0;
1234
1235        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1236unlock_sig:
1237        spin_unlock_irq(&p->sighand->siglock);
1238        if (!exit_code)
1239                return 0;
1240
1241        /*
1242         * Now we are pretty sure this task is interesting.
1243         * Make sure it doesn't get reaped out from under us while we
1244         * give up the lock and then examine it below.  We don't want to
1245         * keep holding onto the tasklist_lock while we call getrusage and
1246         * possibly take page faults for user memory.
1247         */
1248        get_task_struct(p);
1249        pid = task_pid_vnr(p);
1250        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1251        read_unlock(&tasklist_lock);
1252        sched_annotate_sleep();
1253        if (wo->wo_rusage)
1254                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1255        put_task_struct(p);
1256
1257        if (likely(!(wo->wo_flags & WNOWAIT)))
1258                wo->wo_stat = (exit_code << 8) | 0x7f;
1259
1260        infop = wo->wo_info;
1261        if (infop) {
1262                infop->cause = why;
1263                infop->status = exit_code;
1264                infop->pid = pid;
1265                infop->uid = uid;
1266        }
1267        return pid;
1268}
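
/*
 * Encoding note (editorial): the (exit_code << 8) | 0x7f status stored above
 * is what makes the parent's WIFSTOPPED() true, with WSTOPSIG() recovering
 * exit_code (the stop signal, or the ptrace event word) from bits 8-15.
 */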
1269
1270/*
1271 * Handle do_wait work for one task in a live, non-stopped state.
1272 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1273 * the lock and this task is uninteresting.  If we return nonzero, we have
1274 * released the lock and the system call should return.
1275 */
1276static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1277{
1278        struct waitid_info *infop;
1279        pid_t pid;
1280        uid_t uid;
1281
1282        if (!unlikely(wo->wo_flags & WCONTINUED))
1283                return 0;
1284
1285        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1286                return 0;
1287
1288        spin_lock_irq(&p->sighand->siglock);
1289        /* Re-check with the lock held.  */
1290        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1291                spin_unlock_irq(&p->sighand->siglock);
1292                return 0;
1293        }
1294        if (!unlikely(wo->wo_flags & WNOWAIT))
1295                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1296        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1297        spin_unlock_irq(&p->sighand->siglock);
1298
1299        pid = task_pid_vnr(p);
1300        get_task_struct(p);
1301        read_unlock(&tasklist_lock);
1302        sched_annotate_sleep();
1303        if (wo->wo_rusage)
1304                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1305        put_task_struct(p);
1306
1307        infop = wo->wo_info;
1308        if (!infop) {
1309                wo->wo_stat = 0xffff;
1310        } else {
1311                infop->cause = CLD_CONTINUED;
1312                infop->pid = pid;
1313                infop->uid = uid;
1314                infop->status = SIGCONT;
1315        }
1316        return pid;
1317}
1318
1319/*
1320 * Consider @p for a wait by @parent.
1321 *
1322 * -ECHILD should be in ->notask_error before the first call.
1323 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1324 * Returns zero if the search for a child should continue;
1325 * then ->notask_error is 0 if @p is an eligible child,
1326 * or still -ECHILD.
1327 */
1328static int wait_consider_task(struct wait_opts *wo, int ptrace,
1329                                struct task_struct *p)
1330{
1331        /*
1332         * We can race with wait_task_zombie() from another thread.
1333         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1334         * can't confuse the checks below.
1335         */
1336        int exit_state = READ_ONCE(p->exit_state);
1337        int ret;
1338
1339        if (unlikely(exit_state == EXIT_DEAD))
1340                return 0;
1341
1342        ret = eligible_child(wo, ptrace, p);
1343        if (!ret)
1344                return ret;
1345
1346        if (unlikely(exit_state == EXIT_TRACE)) {
1347                /*
1348                 * ptrace == 0 means we are the natural parent. In this case
1349                 * we should clear notask_error, debugger will notify us.
1350                 */
1351                if (likely(!ptrace))
1352                        wo->notask_error = 0;
1353                return 0;
1354        }
1355
1356        if (likely(!ptrace) && unlikely(p->ptrace)) {
1357                /*
1358                 * If it is traced by its real parent's group, just pretend
1359                 * the caller is ptrace_do_wait() and reap this child if it
1360                 * is zombie.
1361                 *
1362                 * This also hides group stop state from real parent; otherwise
1363                 * a single stop can be reported twice as group and ptrace stop.
1364                 * If a ptracer wants to distinguish these two events for its
1365                 * own children it should create a separate process which takes
1366                 * the role of real parent.
1367                 */
1368                if (!ptrace_reparented(p))
1369                        ptrace = 1;
1370        }
1371
1372        /* slay zombie? */
1373        if (exit_state == EXIT_ZOMBIE) {
1374                /* we don't reap group leaders with subthreads */
1375                if (!delay_group_leader(p)) {
1376                        /*
1377                         * A zombie ptracee is only visible to its ptracer.
1378                         * Notification and reaping will be cascaded to the
1379                         * real parent when the ptracer detaches.
1380                         */
1381                        if (unlikely(ptrace) || likely(!p->ptrace))
1382                                return wait_task_zombie(wo, p);
1383                }
1384
1385                /*
1386                 * Allow access to stopped/continued state via zombie by
1387                 * falling through.  Clearing of notask_error is complex.
1388                 *
1389                 * When !@ptrace:
1390                 *
1391                 * If WEXITED is set, notask_error should naturally be
1392                 * cleared.  If not, a subset of WSTOPPED|WCONTINUED is set,
1393                 * so if there are live subthreads, there are events to
1394                 * wait for.  If all subthreads are dead, it's still safe
1395                 * to clear - this function will be called again in a finite
1396                 * amount of time once all the subthreads are released and
1397                 * will then return without clearing.
1398                 *
1399                 * When @ptrace:
1400                 *
1401                 * Stopped state is per-task and thus can't change once the
1402                 * target task dies.  Only continued and exited can happen.
1403                 * Clear notask_error if WCONTINUED | WEXITED.
1404                 */
1405                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1406                        wo->notask_error = 0;
1407        } else {
1408                /*
1409                 * @p is alive and it's gonna stop, continue or exit, so
1410                 * there always is something to wait for.
1411                 */
1412                wo->notask_error = 0;
1413        }
1414
1415        /*
1416         * Wait for stopped.  Depending on @ptrace, different stopped state
1417         * is used and the two don't interact with each other.
1418         */
1419        ret = wait_task_stopped(wo, ptrace, p);
1420        if (ret)
1421                return ret;
1422
1423        /*
1424         * Wait for continued.  There's only one continued state and the
1425         * ptracer can consume it which can confuse the real parent.  Don't
1426         * use WCONTINUED from ptracer.  You don't need or want it.
1427         */
1428        return wait_task_continued(wo, p);
1429}
1430
1431/*
1432 * Do the work of do_wait() for one thread in the group, @tsk.
1433 *
1434 * -ECHILD should be in ->notask_error before the first call.
1435 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1436 * Returns zero if the search for a child should continue; then
1437 * ->notask_error is 0 if there were any eligible children,
1438 * or still -ECHILD.
1439 */
1440static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1441{
1442        struct task_struct *p;
1443
1444        list_for_each_entry(p, &tsk->children, sibling) {
1445                int ret = wait_consider_task(wo, 0, p);
1446
1447                if (ret)
1448                        return ret;
1449        }
1450
1451        return 0;
1452}
1453
1454static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1455{
1456        struct task_struct *p;
1457
1458        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1459                int ret = wait_consider_task(wo, 1, p);
1460
1461                if (ret)
1462                        return ret;
1463        }
1464
1465        return 0;
1466}
1467
1468static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1469                                int sync, void *key)
1470{
1471        struct wait_opts *wo = container_of(wait, struct wait_opts,
1472                                                child_wait);
1473        struct task_struct *p = key;
1474
1475        if (!eligible_pid(wo, p))
1476                return 0;
1477
1478        if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1479                return 0;
1480
1481        return default_wake_function(wait, mode, sync, key);
1482}
1483
1484void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1485{
1486        __wake_up_sync_key(&parent->signal->wait_chldexit,
1487                                TASK_INTERRUPTIBLE, 1, p);
1488}
1489
1490static long do_wait(struct wait_opts *wo)
1491{
1492        struct task_struct *tsk;
1493        int retval;
1494
1495        trace_sched_process_wait(wo->wo_pid);
1496
1497        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1498        wo->child_wait.private = current;
1499        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1500repeat:
1501        /*
1502         * If there is nothing that can match our criteria, just get out.
1503         * We will clear ->notask_error to zero if we see any child that
1504         * might later match our criteria, even if we are not able to reap
1505         * it yet.
1506         */
1507        wo->notask_error = -ECHILD;
1508        if ((wo->wo_type < PIDTYPE_MAX) &&
1509           (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1510                goto notask;
1511
1512        set_current_state(TASK_INTERRUPTIBLE);
1513        read_lock(&tasklist_lock);
1514        tsk = current;
1515        do {
1516                retval = do_wait_thread(wo, tsk);
1517                if (retval)
1518                        goto end;
1519
1520                retval = ptrace_do_wait(wo, tsk);
1521                if (retval)
1522                        goto end;
1523
1524                if (wo->wo_flags & __WNOTHREAD)
1525                        break;
1526        } while_each_thread(current, tsk);
1527        read_unlock(&tasklist_lock);
1528
1529notask:
1530        retval = wo->notask_error;
1531        if (!retval && !(wo->wo_flags & WNOHANG)) {
1532                retval = -ERESTARTSYS;
1533                if (!signal_pending(current)) {
1534                        schedule();
1535                        goto repeat;
1536                }
1537        }
1538end:
1539        __set_current_state(TASK_RUNNING);
1540        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1541        return retval;
1542}
1543
1544static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1545                          int options, struct rusage *ru)
1546{
1547        struct wait_opts wo;
1548        struct pid *pid = NULL;
1549        enum pid_type type;
1550        long ret;
1551
1552        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1553                        __WNOTHREAD|__WCLONE|__WALL))
1554                return -EINVAL;
1555        if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1556                return -EINVAL;
1557
1558        switch (which) {
1559        case P_ALL:
1560                type = PIDTYPE_MAX;
1561                break;
1562        case P_PID:
1563                type = PIDTYPE_PID;
1564                if (upid <= 0)
1565                        return -EINVAL;
1566                break;
1567        case P_PGID:
1568                type = PIDTYPE_PGID;
1569                if (upid <= 0)
1570                        return -EINVAL;
1571                break;
1572        default:
1573                return -EINVAL;
1574        }
1575
1576        if (type < PIDTYPE_MAX)
1577                pid = find_get_pid(upid);
1578
1579        wo.wo_type      = type;
1580        wo.wo_pid       = pid;
1581        wo.wo_flags     = options;
1582        wo.wo_info      = infop;
1583        wo.wo_rusage    = ru;
1584        ret = do_wait(&wo);
1585
1586        put_pid(pid);
1587        return ret;
1588}
1589
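    /*
     * Illustrative userspace counterpart of the syscall below; not part
     * of this file and assumes the libc waitid() wrapper ("pid" is a
     * previously forked child):
     *
     *      siginfo_t si = { 0 };
     *      if (waitid(P_PID, pid, &si, WEXITED | WNOHANG) == 0 && si.si_pid)
     *              printf("child %d: si_code %d, status %d\n",
     *                     si.si_pid, si.si_code, si.si_status);
     *
     * At least one of WEXITED, WSTOPPED or WCONTINUED must be passed,
     * matching the -EINVAL check in kernel_waitid() above.
     */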
1590SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1591                infop, int, options, struct rusage __user *, ru)
1592{
1593        struct rusage r;
1594        struct waitid_info info = {.status = 0};
1595        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1596        int signo = 0;
1597
1598        if (err > 0) {
1599                signo = SIGCHLD;
1600                err = 0;
1601                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1602                        return -EFAULT;
1603        }
1604        if (!infop)
1605                return err;
1606
1607        if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
1608                return -EFAULT;
1609
1610        user_access_begin();
1611        unsafe_put_user(signo, &infop->si_signo, Efault);
1612        unsafe_put_user(0, &infop->si_errno, Efault);
1613        unsafe_put_user(info.cause, &infop->si_code, Efault);
1614        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1615        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1616        unsafe_put_user(info.status, &infop->si_status, Efault);
1617        user_access_end();
1618        return err;
1619Efault:
1620        user_access_end();
1621        return -EFAULT;
1622}
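    /*
     * Note the WNOHANG corner case above: if kernel_waitid() finds
     * nothing to report it returns 0, so signo stays 0 and the
     * zero-initialized waitid_info is copied out; userspace then sees
     * si_pid == 0 rather than an error.
     */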
1623
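    /*
     * kernel_wait4() keeps the classic wait4()/waitpid() encoding of
     * @upid: -1 waits for any child, < -1 for any child in process
     * group -upid, 0 for any child in the caller's own process group,
     * and > 0 for the single child with that pid.  WEXITED is always
     * implied.
     */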
1624long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1625                  struct rusage *ru)
1626{
1627        struct wait_opts wo;
1628        struct pid *pid = NULL;
1629        enum pid_type type;
1630        long ret;
1631
1632        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1633                        __WNOTHREAD|__WCLONE|__WALL))
1634                return -EINVAL;
1635
1636        /* negating INT_MIN below would be undefined, so reject it up front */
1637        if (upid == INT_MIN)
1638                return -ESRCH;
1639
1640        if (upid == -1)
1641                type = PIDTYPE_MAX;
1642        else if (upid < 0) {
1643                type = PIDTYPE_PGID;
1644                pid = find_get_pid(-upid);
1645        } else if (upid == 0) {
1646                type = PIDTYPE_PGID;
1647                pid = get_task_pid(current, PIDTYPE_PGID);
1648        } else /* upid > 0 */ {
1649                type = PIDTYPE_PID;
1650                pid = find_get_pid(upid);
1651        }
1652
1653        wo.wo_type      = type;
1654        wo.wo_pid       = pid;
1655        wo.wo_flags     = options | WEXITED;
1656        wo.wo_info      = NULL;
1657        wo.wo_stat      = 0;
1658        wo.wo_rusage    = ru;
1659        ret = do_wait(&wo);
1660        put_pid(pid);
1661        if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1662                ret = -EFAULT;
1663
1664        return ret;
1665}
1666
1667SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1668                int, options, struct rusage __user *, ru)
1669{
1670        struct rusage r;
1671        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1672
1673        if (err > 0) {
1674                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1675                        return -EFAULT;
1676        }
1677        return err;
1678}
1679
1680#ifdef __ARCH_WANT_SYS_WAITPID
1681
1682/*
1683 * sys_waitpid() remains for compatibility.  waitpid() should be
1684 * implemented in userspace by having libc call sys_wait4().
1685 */
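    /*
     * Illustrative equivalence (not part of this file; assumes the libc
     * wrappers):
     *
     *      waitpid(pid, &status, options);
     *      wait4(pid, &status, options, NULL);
     *
     * Either way the request ends up in kernel_wait4() above, with
     * ru == NULL in the waitpid() case.
     */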
1686SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1687{
1688        return kernel_wait4(pid, stat_addr, options, NULL);
1689}
1690
1691#endif
1692
1693#ifdef CONFIG_COMPAT
1694COMPAT_SYSCALL_DEFINE4(wait4,
1695        compat_pid_t, pid,
1696        compat_uint_t __user *, stat_addr,
1697        int, options,
1698        struct compat_rusage __user *, ru)
1699{
1700        struct rusage r;
1701        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1702        if (err > 0) {
1703                if (ru && put_compat_rusage(&r, ru))
1704                        return -EFAULT;
1705        }
1706        return err;
1707}
1708
1709COMPAT_SYSCALL_DEFINE5(waitid,
1710                int, which, compat_pid_t, pid,
1711                struct compat_siginfo __user *, infop, int, options,
1712                struct compat_rusage __user *, uru)
1713{
1714        struct rusage ru;
1715        struct waitid_info info = {.status = 0};
1716        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1717        int signo = 0;
1718        if (err > 0) {
1719                signo = SIGCHLD;
1720                err = 0;
1721                if (uru) {
1722                        /* kernel_waitid() overwrites everything in ru */
1723                        if (COMPAT_USE_64BIT_TIME)
1724                                err = copy_to_user(uru, &ru, sizeof(ru));
1725                        else
1726                                err = put_compat_rusage(&ru, uru);
1727                        if (err)
1728                                return -EFAULT;
1729                }
1730        }
1731
1732        if (!infop)
1733                return err;
1734
1735        if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
1736                return -EFAULT;
1737
1738        user_access_begin();
1739        unsafe_put_user(signo, &infop->si_signo, Efault);
1740        unsafe_put_user(0, &infop->si_errno, Efault);
1741        unsafe_put_user(info.cause, &infop->si_code, Efault);
1742        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1743        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1744        unsafe_put_user(info.status, &infop->si_status, Efault);
1745        user_access_end();
1746        return err;
1747Efault:
1748        user_access_end();
1749        return -EFAULT;
1750}
1751#endif
1752
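    /*
     * Generic fallback for abort(); __weak lets an architecture supply
     * its own version.  BUG() is not expected to return, and the panic()
     * below is only a backstop for configurations where it does.
     */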
1753__weak void abort(void)
1754{
1755        BUG();
1756
1757        /* if that somehow doesn't kill us, panic */
1758        panic("Oops failed to kill thread");
1759}
1760EXPORT_SYMBOL(abort);
1761