linux/kernel/exit.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/exit.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/slab.h>
   9#include <linux/sched/autogroup.h>
  10#include <linux/sched/mm.h>
  11#include <linux/sched/stat.h>
  12#include <linux/sched/task.h>
  13#include <linux/sched/task_stack.h>
  14#include <linux/sched/cputime.h>
  15#include <linux/interrupt.h>
  16#include <linux/module.h>
  17#include <linux/capability.h>
  18#include <linux/completion.h>
  19#include <linux/personality.h>
  20#include <linux/tty.h>
  21#include <linux/iocontext.h>
  22#include <linux/key.h>
  23#include <linux/cpu.h>
  24#include <linux/acct.h>
  25#include <linux/tsacct_kern.h>
  26#include <linux/file.h>
  27#include <linux/fdtable.h>
  28#include <linux/freezer.h>
  29#include <linux/binfmts.h>
  30#include <linux/nsproxy.h>
  31#include <linux/pid_namespace.h>
  32#include <linux/ptrace.h>
  33#include <linux/profile.h>
  34#include <linux/mount.h>
  35#include <linux/proc_fs.h>
  36#include <linux/kthread.h>
  37#include <linux/mempolicy.h>
  38#include <linux/taskstats_kern.h>
  39#include <linux/delayacct.h>
  40#include <linux/cgroup.h>
  41#include <linux/syscalls.h>
  42#include <linux/signal.h>
  43#include <linux/posix-timers.h>
  44#include <linux/cn_proc.h>
  45#include <linux/mutex.h>
  46#include <linux/futex.h>
  47#include <linux/pipe_fs_i.h>
  48#include <linux/audit.h> /* for audit_free() */
  49#include <linux/resource.h>
  50#include <linux/blkdev.h>
  51#include <linux/task_io_accounting_ops.h>
  52#include <linux/tracehook.h>
  53#include <linux/fs_struct.h>
  54#include <linux/init_task.h>
  55#include <linux/perf_event.h>
  56#include <trace/events/sched.h>
  57#include <linux/hw_breakpoint.h>
  58#include <linux/oom.h>
  59#include <linux/writeback.h>
  60#include <linux/shm.h>
  61#include <linux/kcov.h>
  62#include <linux/random.h>
  63#include <linux/rcuwait.h>
  64#include <linux/compat.h>
  65
  66#include <linux/uaccess.h>
  67#include <asm/unistd.h>
  68#include <asm/pgtable.h>
  69#include <asm/mmu_context.h>
  70
  71static void __unhash_process(struct task_struct *p, bool group_dead)
  72{
  73        nr_threads--;
  74        detach_pid(p, PIDTYPE_PID);
  75        if (group_dead) {
  76                detach_pid(p, PIDTYPE_PGID);
  77                detach_pid(p, PIDTYPE_SID);
  78
  79                list_del_rcu(&p->tasks);
  80                list_del_init(&p->sibling);
  81                __this_cpu_dec(process_counts);
  82        }
  83        list_del_rcu(&p->thread_group);
  84        list_del_rcu(&p->thread_node);
  85}
  86
  87/*
  88 * This function expects the tasklist_lock write-locked.
  89 */
  90static void __exit_signal(struct task_struct *tsk)
  91{
  92        struct signal_struct *sig = tsk->signal;
  93        bool group_dead = thread_group_leader(tsk);
  94        struct sighand_struct *sighand;
  95        struct tty_struct *uninitialized_var(tty);
  96        u64 utime, stime;
  97
  98        sighand = rcu_dereference_check(tsk->sighand,
  99                                        lockdep_tasklist_lock_is_held());
 100        spin_lock(&sighand->siglock);
 101
 102#ifdef CONFIG_POSIX_TIMERS
 103        posix_cpu_timers_exit(tsk);
 104        if (group_dead) {
 105                posix_cpu_timers_exit_group(tsk);
 106        } else {
 107                /*
 108                 * This can only happen if the caller is de_thread().
 109                 * FIXME: this is the temporary hack, we should teach
 110                 * posix-cpu-timers to handle this case correctly.
 111                 */
 112                if (unlikely(has_group_leader_pid(tsk)))
 113                        posix_cpu_timers_exit_group(tsk);
 114        }
 115#endif
 116
 117        if (group_dead) {
 118                tty = sig->tty;
 119                sig->tty = NULL;
 120        } else {
 121                /*
 122                 * If there is any task waiting for the group exit
 123                 * then notify it:
 124                 */
 125                if (sig->notify_count > 0 && !--sig->notify_count)
 126                        wake_up_process(sig->group_exit_task);
 127
 128                if (tsk == sig->curr_target)
 129                        sig->curr_target = next_thread(tsk);
 130        }
 131
 132        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
 133                              sizeof(unsigned long long));
 134
 135        /*
 136         * Accumulate here the counters for all threads as they die. We could
 137         * skip the group leader because it is the last user of signal_struct,
 138         * but we want to avoid the race with thread_group_cputime() which can
 139         * see the empty ->thread_head list.
 140         */
 141        task_cputime(tsk, &utime, &stime);
 142        write_seqlock(&sig->stats_lock);
 143        sig->utime += utime;
 144        sig->stime += stime;
 145        sig->gtime += task_gtime(tsk);
 146        sig->min_flt += tsk->min_flt;
 147        sig->maj_flt += tsk->maj_flt;
 148        sig->nvcsw += tsk->nvcsw;
 149        sig->nivcsw += tsk->nivcsw;
 150        sig->inblock += task_io_get_inblock(tsk);
 151        sig->oublock += task_io_get_oublock(tsk);
 152        task_io_accounting_add(&sig->ioac, &tsk->ioac);
 153        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 154        sig->nr_threads--;
 155        __unhash_process(tsk, group_dead);
 156        write_sequnlock(&sig->stats_lock);
 157
 158        /*
 159         * Do this under ->siglock, we can race with another thread
 160         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 161         */
 162        flush_sigqueue(&tsk->pending);
 163        tsk->sighand = NULL;
 164        spin_unlock(&sighand->siglock);
 165
 166        __cleanup_sighand(sighand);
 167        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 168        if (group_dead) {
 169                flush_sigqueue(&sig->shared_pending);
 170                tty_kref_put(tty);
 171        }
 172}
 173
 174static void delayed_put_task_struct(struct rcu_head *rhp)
 175{
 176        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 177
 178        perf_event_delayed_put(tsk);
 179        trace_sched_process_free(tsk);
 180        put_task_struct(tsk);
 181}
 182
 183
 184void release_task(struct task_struct *p)
 185{
 186        struct task_struct *leader;
 187        int zap_leader;
 188repeat:
 189        /* don't need to get the RCU readlock here - the process is dead and
 190         * can't be modifying its own credentials. But shut RCU-lockdep up */
 191        rcu_read_lock();
 192        atomic_dec(&__task_cred(p)->user->processes);
 193        rcu_read_unlock();
 194
 195        proc_flush_task(p);
 196
 197        write_lock_irq(&tasklist_lock);
 198        ptrace_release_task(p);
 199        __exit_signal(p);
 200
 201        /*
 202         * If we are the last non-leader member of the thread
 203         * group, and the leader is zombie, then notify the
 204         * group leader's parent process. (if it wants notification.)
 205         */
 206        zap_leader = 0;
 207        leader = p->group_leader;
 208        if (leader != p && thread_group_empty(leader)
 209                        && leader->exit_state == EXIT_ZOMBIE) {
 210                /*
 211                 * If we were the last child thread and the leader has
 212                 * exited already, and the leader's parent ignores SIGCHLD,
 213                 * then we are the one who should release the leader.
 214                 */
 215                zap_leader = do_notify_parent(leader, leader->exit_signal);
 216                if (zap_leader)
 217                        leader->exit_state = EXIT_DEAD;
 218        }
 219
 220        write_unlock_irq(&tasklist_lock);
 221        release_thread(p);
 222        call_rcu(&p->rcu, delayed_put_task_struct);
 223
 224        p = leader;
 225        if (unlikely(zap_leader))
 226                goto repeat;
 227}
 228
 229/*
 230 * Note that if this function returns a valid task_struct pointer (!NULL)
 231 * task->usage must remain >0 for the duration of the RCU critical section.
 232 */
 233struct task_struct *task_rcu_dereference(struct task_struct **ptask)
 234{
 235        struct sighand_struct *sighand;
 236        struct task_struct *task;
 237
 238        /*
 239         * We need to verify that release_task() was not called and thus
 240         * delayed_put_task_struct() can't run and drop the last reference
 241         * before rcu_read_unlock(). We check task->sighand != NULL,
 242         * but we can read the already freed and reused memory.
 243         */
 244retry:
 245        task = rcu_dereference(*ptask);
 246        if (!task)
 247                return NULL;
 248
 249        probe_kernel_address(&task->sighand, sighand);
 250
 251        /*
 252         * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
 253         * was already freed we can not miss the preceding update of this
 254         * pointer.
 255         */
 256        smp_rmb();
 257        if (unlikely(task != READ_ONCE(*ptask)))
 258                goto retry;
 259
 260        /*
 261         * We've re-checked that "task == *ptask", now we have two different
 262         * cases:
 263         *
 264         * 1. This is actually the same task/task_struct. In this case
 265         *    sighand != NULL tells us it is still alive.
 266         *
 267         * 2. This is another task which got the same memory for task_struct.
 268         *    We can't know this of course, and we can not trust
 269         *    sighand != NULL.
 270         *
 271         *    In this case we actually return a random value, but this is
 272         *    correct.
 273         *
 274         *    If we return NULL - we can pretend that we actually noticed that
 275         *    *ptask was updated when the previous task has exited. Or pretend
 276         *    that probe_slab_address(&sighand) reads NULL.
 277         *
 278         *    If we return the new task (because sighand is not NULL for any
 279         *    reason) - this is fine too. This (new) task can't go away before
 280         *    another gp pass.
 281         *
 282         *    And note: We could even eliminate the false positive if re-read
 283         *    task->sighand once again to avoid the falsely NULL. But this case
 284         *    is very unlikely so we don't care.
 285         */
 286        if (!sighand)
 287                return NULL;
 288
 289        return task;
 290}
 291
 292void rcuwait_wake_up(struct rcuwait *w)
 293{
 294        struct task_struct *task;
 295
 296        rcu_read_lock();
 297
 298        /*
 299         * Order condition vs @task, such that everything prior to the load
 300         * of @task is visible. This is the condition as to why the user called
 301         * rcuwait_trywake() in the first place. Pairs with set_current_state()
 302         * barrier (A) in rcuwait_wait_event().
 303         *
 304         *    WAIT                WAKE
 305         *    [S] tsk = current   [S] cond = true
 306         *        MB (A)              MB (B)
 307         *    [L] cond            [L] tsk
 308         */
 309        smp_rmb(); /* (B) */
 310
 311        /*
 312         * Avoid using task_rcu_dereference() magic as long as we are careful,
 313         * see comment in rcuwait_wait_event() regarding ->exit_state.
 314         */
 315        task = rcu_dereference(w->task);
 316        if (task)
 317                wake_up_process(task);
 318        rcu_read_unlock();
 319}
 320
 321/*
 322 * Determine if a process group is "orphaned", according to the POSIX
 323 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 324 * by terminal-generated stop signals.  Newly orphaned process groups are
 325 * to receive a SIGHUP and a SIGCONT.
 326 *
 327 * "I ask you, have you ever known what it is to be an orphan?"
 328 */
 329static int will_become_orphaned_pgrp(struct pid *pgrp,
 330                                        struct task_struct *ignored_task)
 331{
 332        struct task_struct *p;
 333
 334        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 335                if ((p == ignored_task) ||
 336                    (p->exit_state && thread_group_empty(p)) ||
 337                    is_global_init(p->real_parent))
 338                        continue;
 339
 340                if (task_pgrp(p->real_parent) != pgrp &&
 341                    task_session(p->real_parent) == task_session(p))
 342                        return 0;
 343        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 344
 345        return 1;
 346}
 347
 348int is_current_pgrp_orphaned(void)
 349{
 350        int retval;
 351
 352        read_lock(&tasklist_lock);
 353        retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
 354        read_unlock(&tasklist_lock);
 355
 356        return retval;
 357}
 358
 359static bool has_stopped_jobs(struct pid *pgrp)
 360{
 361        struct task_struct *p;
 362
 363        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 364                if (p->signal->flags & SIGNAL_STOP_STOPPED)
 365                        return true;
 366        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 367
 368        return false;
 369}
 370
 371/*
 372 * Check to see if any process groups have become orphaned as
 373 * a result of our exiting, and if they have any stopped jobs,
 374 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 375 */
 376static void
 377kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 378{
 379        struct pid *pgrp = task_pgrp(tsk);
 380        struct task_struct *ignored_task = tsk;
 381
 382        if (!parent)
 383                /* exit: our father is in a different pgrp than
 384                 * we are and we were the only connection outside.
 385                 */
 386                parent = tsk->real_parent;
 387        else
 388                /* reparent: our child is in a different pgrp than
 389                 * we are, and it was the only connection outside.
 390                 */
 391                ignored_task = NULL;
 392
 393        if (task_pgrp(parent) != pgrp &&
 394            task_session(parent) == task_session(tsk) &&
 395            will_become_orphaned_pgrp(pgrp, ignored_task) &&
 396            has_stopped_jobs(pgrp)) {
 397                __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 398                __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 399        }
 400}
 401
 402#ifdef CONFIG_MEMCG
 403/*
 404 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 405 */
 406void mm_update_next_owner(struct mm_struct *mm)
 407{
 408        struct task_struct *c, *g, *p = current;
 409
 410retry:
 411        /*
 412         * If the exiting or execing task is not the owner, it's
 413         * someone else's problem.
 414         */
 415        if (mm->owner != p)
 416                return;
 417        /*
 418         * The current owner is exiting/execing and there are no other
 419         * candidates.  Do not leave the mm pointing to a possibly
 420         * freed task structure.
 421         */
 422        if (atomic_read(&mm->mm_users) <= 1) {
 423                mm->owner = NULL;
 424                return;
 425        }
 426
 427        read_lock(&tasklist_lock);
 428        /*
 429         * Search in the children
 430         */
 431        list_for_each_entry(c, &p->children, sibling) {
 432                if (c->mm == mm)
 433                        goto assign_new_owner;
 434        }
 435
 436        /*
 437         * Search in the siblings
 438         */
 439        list_for_each_entry(c, &p->real_parent->children, sibling) {
 440                if (c->mm == mm)
 441                        goto assign_new_owner;
 442        }
 443
 444        /*
 445         * Search through everything else, we should not get here often.
 446         */
 447        for_each_process(g) {
 448                if (g->flags & PF_KTHREAD)
 449                        continue;
 450                for_each_thread(g, c) {
 451                        if (c->mm == mm)
 452                                goto assign_new_owner;
 453                        if (c->mm)
 454                                break;
 455                }
 456        }
 457        read_unlock(&tasklist_lock);
 458        /*
 459         * We found no owner yet mm_users > 1: this implies that we are
 460         * most likely racing with swapoff (try_to_unuse()) or /proc or
 461         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 462         */
 463        mm->owner = NULL;
 464        return;
 465
 466assign_new_owner:
 467        BUG_ON(c == p);
 468        get_task_struct(c);
 469        /*
 470         * The task_lock protects c->mm from changing.
 471         * We always want mm->owner->mm == mm
 472         */
 473        task_lock(c);
 474        /*
 475         * Delay read_unlock() till we have the task_lock()
 476         * to ensure that c does not slip away underneath us
 477         */
 478        read_unlock(&tasklist_lock);
 479        if (c->mm != mm) {
 480                task_unlock(c);
 481                put_task_struct(c);
 482                goto retry;
 483        }
 484        mm->owner = c;
 485        task_unlock(c);
 486        put_task_struct(c);
 487}
 488#endif /* CONFIG_MEMCG */
 489
 490/*
 491 * Turn us into a lazy TLB process if we
 492 * aren't already..
 493 */
 494static void exit_mm(void)
 495{
 496        struct mm_struct *mm = current->mm;
 497        struct core_state *core_state;
 498
 499        mm_release(current, mm);
 500        if (!mm)
 501                return;
 502        sync_mm_rss(mm);
 503        /*
 504         * Serialize with any possible pending coredump.
 505         * We must hold mmap_sem around checking core_state
 506         * and clearing tsk->mm.  The core-inducing thread
 507         * will increment ->nr_threads for each thread in the
 508         * group with ->mm != NULL.
 509         */
 510        down_read(&mm->mmap_sem);
 511        core_state = mm->core_state;
 512        if (core_state) {
 513                struct core_thread self;
 514
 515                up_read(&mm->mmap_sem);
 516
 517                self.task = current;
 518                self.next = xchg(&core_state->dumper.next, &self);
 519                /*
 520                 * Implies mb(), the result of xchg() must be visible
 521                 * to core_state->dumper.
 522                 */
 523                if (atomic_dec_and_test(&core_state->nr_threads))
 524                        complete(&core_state->startup);
 525
 526                for (;;) {
 527                        set_current_state(TASK_UNINTERRUPTIBLE);
 528                        if (!self.task) /* see coredump_finish() */
 529                                break;
 530                        freezable_schedule();
 531                }
 532                __set_current_state(TASK_RUNNING);
 533                down_read(&mm->mmap_sem);
 534        }
 535        mmgrab(mm);
 536        BUG_ON(mm != current->active_mm);
 537        /* more a memory barrier than a real lock */
 538        task_lock(current);
 539        current->mm = NULL;
 540        up_read(&mm->mmap_sem);
 541        enter_lazy_tlb(mm, current);
 542        task_unlock(current);
 543        mm_update_next_owner(mm);
 544        mmput(mm);
 545        if (test_thread_flag(TIF_MEMDIE))
 546                exit_oom_victim();
 547}
 548
 549static struct task_struct *find_alive_thread(struct task_struct *p)
 550{
 551        struct task_struct *t;
 552
 553        for_each_thread(p, t) {
 554                if (!(t->flags & PF_EXITING))
 555                        return t;
 556        }
 557        return NULL;
 558}
 559
 560static struct task_struct *find_child_reaper(struct task_struct *father)
 561        __releases(&tasklist_lock)
 562        __acquires(&tasklist_lock)
 563{
 564        struct pid_namespace *pid_ns = task_active_pid_ns(father);
 565        struct task_struct *reaper = pid_ns->child_reaper;
 566
 567        if (likely(reaper != father))
 568                return reaper;
 569
 570        reaper = find_alive_thread(father);
 571        if (reaper) {
 572                pid_ns->child_reaper = reaper;
 573                return reaper;
 574        }
 575
 576        write_unlock_irq(&tasklist_lock);
 577        if (unlikely(pid_ns == &init_pid_ns)) {
 578                panic("Attempted to kill init! exitcode=0x%08x\n",
 579                        father->signal->group_exit_code ?: father->exit_code);
 580        }
 581        zap_pid_ns_processes(pid_ns);
 582        write_lock_irq(&tasklist_lock);
 583
 584        return father;
 585}
 586
 587/*
 588 * When we die, we re-parent all our children, and try to:
 589 * 1. give them to another thread in our thread group, if such a member exists
 590 * 2. give it to the first ancestor process which prctl'd itself as a
 591 *    child_subreaper for its children (like a service manager)
 592 * 3. give it to the init process (PID 1) in our pid namespace
 593 */
 594static struct task_struct *find_new_reaper(struct task_struct *father,
 595                                           struct task_struct *child_reaper)
 596{
 597        struct task_struct *thread, *reaper;
 598
 599        thread = find_alive_thread(father);
 600        if (thread)
 601                return thread;
 602
 603        if (father->signal->has_child_subreaper) {
 604                unsigned int ns_level = task_pid(father)->level;
 605                /*
 606                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
 607                 * We can't check reaper != child_reaper to ensure we do not
 608                 * cross the namespaces, the exiting parent could be injected
 609                 * by setns() + fork().
 610                 * We check pid->level, this is slightly more efficient than
 611                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
 612                 */
 613                for (reaper = father->real_parent;
 614                     task_pid(reaper)->level == ns_level;
 615                     reaper = reaper->real_parent) {
 616                        if (reaper == &init_task)
 617                                break;
 618                        if (!reaper->signal->is_child_subreaper)
 619                                continue;
 620                        thread = find_alive_thread(reaper);
 621                        if (thread)
 622                                return thread;
 623                }
 624        }
 625
 626        return child_reaper;
 627}
 628
 629/*
 630* Any that need to be release_task'd are put on the @dead list.
 631 */
 632static void reparent_leader(struct task_struct *father, struct task_struct *p,
 633                                struct list_head *dead)
 634{
 635        if (unlikely(p->exit_state == EXIT_DEAD))
 636                return;
 637
 638        /* We don't want people slaying init. */
 639        p->exit_signal = SIGCHLD;
 640
 641        /* If it has exited notify the new parent about this child's death. */
 642        if (!p->ptrace &&
 643            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 644                if (do_notify_parent(p, p->exit_signal)) {
 645                        p->exit_state = EXIT_DEAD;
 646                        list_add(&p->ptrace_entry, dead);
 647                }
 648        }
 649
 650        kill_orphaned_pgrp(p, father);
 651}
 652
 653/*
 654 * This does two things:
 655 *
 656 * A.  Make init inherit all the child processes
 657 * B.  Check to see if any process groups have become orphaned
 658 *      as a result of our exiting, and if they have any stopped
 659 *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 660 */
 661static void forget_original_parent(struct task_struct *father,
 662                                        struct list_head *dead)
 663{
 664        struct task_struct *p, *t, *reaper;
 665
 666        if (unlikely(!list_empty(&father->ptraced)))
 667                exit_ptrace(father, dead);
 668
 669        /* Can drop and reacquire tasklist_lock */
 670        reaper = find_child_reaper(father);
 671        if (list_empty(&father->children))
 672                return;
 673
 674        reaper = find_new_reaper(father, reaper);
 675        list_for_each_entry(p, &father->children, sibling) {
 676                for_each_thread(p, t) {
 677                        t->real_parent = reaper;
 678                        BUG_ON((!t->ptrace) != (t->parent == father));
 679                        if (likely(!t->ptrace))
 680                                t->parent = t->real_parent;
 681                        if (t->pdeath_signal)
 682                                group_send_sig_info(t->pdeath_signal,
 683                                                    SEND_SIG_NOINFO, t);
 684                }
 685                /*
 686                 * If this is a threaded reparent there is no need to
 687                 * notify anyone anything has happened.
 688                 */
 689                if (!same_thread_group(reaper, father))
 690                        reparent_leader(father, p, dead);
 691        }
 692        list_splice_tail_init(&father->children, &reaper->children);
 693}
 694
 695/*
 696 * Send signals to all our closest relatives so that they know
 697 * to properly mourn us..
 698 */
 699static void exit_notify(struct task_struct *tsk, int group_dead)
 700{
 701        bool autoreap;
 702        struct task_struct *p, *n;
 703        LIST_HEAD(dead);
 704
 705        write_lock_irq(&tasklist_lock);
 706        forget_original_parent(tsk, &dead);
 707
 708        if (group_dead)
 709                kill_orphaned_pgrp(tsk->group_leader, NULL);
 710
 711        if (unlikely(tsk->ptrace)) {
 712                int sig = thread_group_leader(tsk) &&
 713                                thread_group_empty(tsk) &&
 714                                !ptrace_reparented(tsk) ?
 715                        tsk->exit_signal : SIGCHLD;
 716                autoreap = do_notify_parent(tsk, sig);
 717        } else if (thread_group_leader(tsk)) {
 718                autoreap = thread_group_empty(tsk) &&
 719                        do_notify_parent(tsk, tsk->exit_signal);
 720        } else {
 721                autoreap = true;
 722        }
 723
 724        tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
 725        if (tsk->exit_state == EXIT_DEAD)
 726                list_add(&tsk->ptrace_entry, &dead);
 727
 728        /* mt-exec, de_thread() is waiting for group leader */
 729        if (unlikely(tsk->signal->notify_count < 0))
 730                wake_up_process(tsk->signal->group_exit_task);
 731        write_unlock_irq(&tasklist_lock);
 732
 733        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 734                list_del_init(&p->ptrace_entry);
 735                release_task(p);
 736        }
 737}
 738
 739#ifdef CONFIG_DEBUG_STACK_USAGE
 740static void check_stack_usage(void)
 741{
 742        static DEFINE_SPINLOCK(low_water_lock);
 743        static int lowest_to_date = THREAD_SIZE;
 744        unsigned long free;
 745
 746        free = stack_not_used(current);
 747
 748        if (free >= lowest_to_date)
 749                return;
 750
 751        spin_lock(&low_water_lock);
 752        if (free < lowest_to_date) {
 753                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
 754                        current->comm, task_pid_nr(current), free);
 755                lowest_to_date = free;
 756        }
 757        spin_unlock(&low_water_lock);
 758}
 759#else
 760static inline void check_stack_usage(void) {}
 761#endif
 762
 763void __noreturn do_exit(long code)
 764{
 765        struct task_struct *tsk = current;
 766        int group_dead;
 767
 768        profile_task_exit(tsk);
 769        kcov_task_exit(tsk);
 770
 771        WARN_ON(blk_needs_flush_plug(tsk));
 772
 773        if (unlikely(in_interrupt()))
 774                panic("Aiee, killing interrupt handler!");
 775        if (unlikely(!tsk->pid))
 776                panic("Attempted to kill the idle task!");
 777
 778        /*
 779         * If do_exit is called because this processes oopsed, it's possible
 780         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
 781         * continuing. Amongst other possible reasons, this is to prevent
 782         * mm_release()->clear_child_tid() from writing to a user-controlled
 783         * kernel address.
 784         */
 785        set_fs(USER_DS);
 786
 787        ptrace_event(PTRACE_EVENT_EXIT, code);
 788
 789        validate_creds_for_do_exit(tsk);
 790
 791        /*
 792         * We're taking recursive faults here in do_exit. Safest is to just
 793         * leave this task alone and wait for reboot.
 794         */
 795        if (unlikely(tsk->flags & PF_EXITING)) {
 796                pr_alert("Fixing recursive fault but reboot is needed!\n");
 797                /*
 798                 * We can do this unlocked here. The futex code uses
 799                 * this flag just to verify whether the pi state
 800                 * cleanup has been done or not. In the worst case it
 801                 * loops once more. We pretend that the cleanup was
 802                 * done as there is no way to return. Either the
 803                 * OWNER_DIED bit is set by now or we push the blocked
 804                 * task into the wait for ever nirwana as well.
 805                 */
 806                tsk->flags |= PF_EXITPIDONE;
 807                set_current_state(TASK_UNINTERRUPTIBLE);
 808                schedule();
 809        }
 810
 811        exit_signals(tsk);  /* sets PF_EXITING */
 812        /*
 813         * Ensure that all new tsk->pi_lock acquisitions must observe
 814         * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
 815         */
 816        smp_mb();
 817        /*
 818         * Ensure that we must observe the pi_state in exit_mm() ->
 819         * mm_release() -> exit_pi_state_list().
 820         */
 821        raw_spin_lock_irq(&tsk->pi_lock);
 822        raw_spin_unlock_irq(&tsk->pi_lock);
 823
 824        if (unlikely(in_atomic())) {
 825                pr_info("note: %s[%d] exited with preempt_count %d\n",
 826                        current->comm, task_pid_nr(current),
 827                        preempt_count());
 828                preempt_count_set(PREEMPT_ENABLED);
 829        }
 830
 831        /* sync mm's RSS info before statistics gathering */
 832        if (tsk->mm)
 833                sync_mm_rss(tsk->mm);
 834        acct_update_integrals(tsk);
 835        group_dead = atomic_dec_and_test(&tsk->signal->live);
 836        if (group_dead) {
 837#ifdef CONFIG_POSIX_TIMERS
 838                hrtimer_cancel(&tsk->signal->real_timer);
 839                exit_itimers(tsk->signal);
 840#endif
 841                if (tsk->mm)
 842                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 843        }
 844        acct_collect(code, group_dead);
 845        if (group_dead)
 846                tty_audit_exit();
 847        audit_free(tsk);
 848
 849        tsk->exit_code = code;
 850        taskstats_exit(tsk, group_dead);
 851
 852        exit_mm();
 853
 854        if (group_dead)
 855                acct_process();
 856        trace_sched_process_exit(tsk);
 857
 858        exit_sem(tsk);
 859        exit_shm(tsk);
 860        exit_files(tsk);
 861        exit_fs(tsk);
 862        if (group_dead)
 863                disassociate_ctty(1);
 864        exit_task_namespaces(tsk);
 865        exit_task_work(tsk);
 866        exit_thread(tsk);
 867
 868        /*
 869         * Flush inherited counters to the parent - before the parent
 870         * gets woken up by child-exit notifications.
 871         *
 872         * because of cgroup mode, must be called before cgroup_exit()
 873         */
 874        perf_event_exit_task(tsk);
 875
 876        sched_autogroup_exit_task(tsk);
 877        cgroup_exit(tsk);
 878
 879        /*
 880         * FIXME: do that only when needed, using sched_exit tracepoint
 881         */
 882        flush_ptrace_hw_breakpoint(tsk);
 883
 884        exit_tasks_rcu_start();
 885        exit_notify(tsk, group_dead);
 886        proc_exit_connector(tsk);
 887        mpol_put_task_policy(tsk);
 888#ifdef CONFIG_FUTEX
 889        if (unlikely(current->pi_state_cache))
 890                kfree(current->pi_state_cache);
 891#endif
 892        /*
 893         * Make sure we are holding no locks:
 894         */
 895        debug_check_no_locks_held();
 896        /*
 897         * We can do this unlocked here. The futex code uses this flag
 898         * just to verify whether the pi state cleanup has been done
 899         * or not. In the worst case it loops once more.
 900         */
 901        tsk->flags |= PF_EXITPIDONE;
 902
 903        if (tsk->io_context)
 904                exit_io_context(tsk);
 905
 906        if (tsk->splice_pipe)
 907                free_pipe_info(tsk->splice_pipe);
 908
 909        if (tsk->task_frag.page)
 910                put_page(tsk->task_frag.page);
 911
 912        validate_creds_for_do_exit(tsk);
 913
 914        check_stack_usage();
 915        preempt_disable();
 916        if (tsk->nr_dirtied)
 917                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 918        exit_rcu();
 919        exit_tasks_rcu_finish();
 920
 921        lockdep_free_task(tsk);
 922        do_task_dead();
 923}
 924EXPORT_SYMBOL_GPL(do_exit);
 925
 926void complete_and_exit(struct completion *comp, long code)
 927{
 928        if (comp)
 929                complete(comp);
 930
 931        do_exit(code);
 932}
 933EXPORT_SYMBOL(complete_and_exit);
 934
 935SYSCALL_DEFINE1(exit, int, error_code)
 936{
 937        do_exit((error_code&0xff)<<8);
 938}
 939
 940/*
 941 * Take down every thread in the group.  This is called by fatal signals
 942 * as well as by sys_exit_group (below).
 943 */
 944void
 945do_group_exit(int exit_code)
 946{
 947        struct signal_struct *sig = current->signal;
 948
 949        BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 950
 951        if (signal_group_exit(sig))
 952                exit_code = sig->group_exit_code;
 953        else if (!thread_group_empty(current)) {
 954                struct sighand_struct *const sighand = current->sighand;
 955
 956                spin_lock_irq(&sighand->siglock);
 957                if (signal_group_exit(sig))
 958                        /* Another thread got here before we took the lock.  */
 959                        exit_code = sig->group_exit_code;
 960                else {
 961                        sig->group_exit_code = exit_code;
 962                        sig->flags = SIGNAL_GROUP_EXIT;
 963                        zap_other_threads(current);
 964                }
 965                spin_unlock_irq(&sighand->siglock);
 966        }
 967
 968        do_exit(exit_code);
 969        /* NOTREACHED */
 970}
 971
 972/*
 973 * this kills every thread in the thread group. Note that any externally
 974 * wait4()-ing process will get the correct exit code - even if this
 975 * thread is not the thread group leader.
 976 */
 977SYSCALL_DEFINE1(exit_group, int, error_code)
 978{
 979        do_group_exit((error_code & 0xff) << 8);
 980        /* NOTREACHED */
 981        return 0;
 982}
 983
 984struct waitid_info {
 985        pid_t pid;
 986        uid_t uid;
 987        int status;
 988        int cause;
 989};
 990
 991struct wait_opts {
 992        enum pid_type           wo_type;
 993        int                     wo_flags;
 994        struct pid              *wo_pid;
 995
 996        struct waitid_info      *wo_info;
 997        int                     wo_stat;
 998        struct rusage           *wo_rusage;
 999
1000        wait_queue_entry_t              child_wait;
1001        int                     notask_error;
1002};
1003
1004static inline
1005struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1006{
1007        if (type != PIDTYPE_PID)
1008                task = task->group_leader;
1009        return task->pids[type].pid;
1010}
1011
1012static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1013{
1014        return  wo->wo_type == PIDTYPE_MAX ||
1015                task_pid_type(p, wo->wo_type) == wo->wo_pid;
1016}
1017
1018static int
1019eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
1020{
1021        if (!eligible_pid(wo, p))
1022                return 0;
1023
1024        /*
1025         * Wait for all children (clone and not) if __WALL is set or
1026         * if it is traced by us.
1027         */
1028        if (ptrace || (wo->wo_flags & __WALL))
1029                return 1;
1030
1031        /*
1032         * Otherwise, wait for clone children *only* if __WCLONE is set;
1033         * otherwise, wait for non-clone children *only*.
1034         *
1035         * Note: a "clone" child here is one that reports to its parent
1036         * using a signal other than SIGCHLD, or a non-leader thread which
1037         * we can only see if it is traced by us.
1038         */
1039        if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1040                return 0;
1041
1042        return 1;
1043}
1044
1045/*
1046 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
1047 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1048 * the lock and this task is uninteresting.  If we return nonzero, we have
1049 * released the lock and the system call should return.
1050 */
1051static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1052{
1053        int state, status;
1054        pid_t pid = task_pid_vnr(p);
1055        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1056        struct waitid_info *infop;
1057
1058        if (!likely(wo->wo_flags & WEXITED))
1059                return 0;
1060
1061        if (unlikely(wo->wo_flags & WNOWAIT)) {
1062                status = p->exit_code;
1063                get_task_struct(p);
1064                read_unlock(&tasklist_lock);
1065                sched_annotate_sleep();
1066                if (wo->wo_rusage)
1067                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1068                put_task_struct(p);
1069                goto out_info;
1070        }
1071        /*
1072         * Move the task's state to DEAD/TRACE, only one thread can do this.
1073         */
1074        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
1075                EXIT_TRACE : EXIT_DEAD;
1076        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1077                return 0;
1078        /*
1079         * We own this thread, nobody else can reap it.
1080         */
1081        read_unlock(&tasklist_lock);
1082        sched_annotate_sleep();
1083
1084        /*
1085         * Check thread_group_leader() to exclude the traced sub-threads.
1086         */
1087        if (state == EXIT_DEAD && thread_group_leader(p)) {
1088                struct signal_struct *sig = p->signal;
1089                struct signal_struct *psig = current->signal;
1090                unsigned long maxrss;
1091                u64 tgutime, tgstime;
1092
1093                /*
1094                 * The resource counters for the group leader are in its
1095                 * own task_struct.  Those for dead threads in the group
1096                 * are in its signal_struct, as are those for the child
1097                 * processes it has previously reaped.  All these
1098                 * accumulate in the parent's signal_struct c* fields.
1099                 *
1100                 * We don't bother to take a lock here to protect these
1101                 * p->signal fields because the whole thread group is dead
1102                 * and nobody can change them.
1103                 *
1104                 * psig->stats_lock also protects us from our sub-theads
1105                 * which can reap other children at the same time. Until
1106                 * we change k_getrusage()-like users to rely on this lock
1107                 * we have to take ->siglock as well.
1108                 *
1109                 * We use thread_group_cputime_adjusted() to get times for
1110                 * the thread group, which consolidates times for all threads
1111                 * in the group including the group leader.
1112                 */
1113                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1114                spin_lock_irq(&current->sighand->siglock);
1115                write_seqlock(&psig->stats_lock);
1116                psig->cutime += tgutime + sig->cutime;
1117                psig->cstime += tgstime + sig->cstime;
1118                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1119                psig->cmin_flt +=
1120                        p->min_flt + sig->min_flt + sig->cmin_flt;
1121                psig->cmaj_flt +=
1122                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1123                psig->cnvcsw +=
1124                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
1125                psig->cnivcsw +=
1126                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
1127                psig->cinblock +=
1128                        task_io_get_inblock(p) +
1129                        sig->inblock + sig->cinblock;
1130                psig->coublock +=
1131                        task_io_get_oublock(p) +
1132                        sig->oublock + sig->coublock;
1133                maxrss = max(sig->maxrss, sig->cmaxrss);
1134                if (psig->cmaxrss < maxrss)
1135                        psig->cmaxrss = maxrss;
1136                task_io_accounting_add(&psig->ioac, &p->ioac);
1137                task_io_accounting_add(&psig->ioac, &sig->ioac);
1138                write_sequnlock(&psig->stats_lock);
1139                spin_unlock_irq(&current->sighand->siglock);
1140        }
1141
1142        if (wo->wo_rusage)
1143                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1144        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1145                ? p->signal->group_exit_code : p->exit_code;
1146        wo->wo_stat = status;
1147
1148        if (state == EXIT_TRACE) {
1149                write_lock_irq(&tasklist_lock);
1150                /* We dropped tasklist, ptracer could die and untrace */
1151                ptrace_unlink(p);
1152
1153                /* If parent wants a zombie, don't release it now */
1154                state = EXIT_ZOMBIE;
1155                if (do_notify_parent(p, p->exit_signal))
1156                        state = EXIT_DEAD;
1157                p->exit_state = state;
1158                write_unlock_irq(&tasklist_lock);
1159        }
1160        if (state == EXIT_DEAD)
1161                release_task(p);
1162
1163out_info:
1164        infop = wo->wo_info;
1165        if (infop) {
1166                if ((status & 0x7f) == 0) {
1167                        infop->cause = CLD_EXITED;
1168                        infop->status = status >> 8;
1169                } else {
1170                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1171                        infop->status = status & 0x7f;
1172                }
1173                infop->pid = pid;
1174                infop->uid = uid;
1175        }
1176
1177        return pid;
1178}
1179
1180static int *task_stopped_code(struct task_struct *p, bool ptrace)
1181{
1182        if (ptrace) {
1183                if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1184                        return &p->exit_code;
1185        } else {
1186                if (p->signal->flags & SIGNAL_STOP_STOPPED)
1187                        return &p->signal->group_exit_code;
1188        }
1189        return NULL;
1190}
1191
1192/**
1193 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1194 * @wo: wait options
1195 * @ptrace: is the wait for ptrace
1196 * @p: task to wait for
1197 *
1198 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1199 *
1200 * CONTEXT:
1201 * read_lock(&tasklist_lock), which is released if return value is
1202 * non-zero.  Also, grabs and releases @p->sighand->siglock.
1203 *
1204 * RETURNS:
1205 * 0 if wait condition didn't exist and search for other wait conditions
1206 * should continue.  Non-zero return, -errno on failure and @p's pid on
1207 * success, implies that tasklist_lock is released and wait condition
1208 * search should terminate.
1209 */
1210static int wait_task_stopped(struct wait_opts *wo,
1211                                int ptrace, struct task_struct *p)
1212{
1213        struct waitid_info *infop;
1214        int exit_code, *p_code, why;
1215        uid_t uid = 0; /* unneeded, required by compiler */
1216        pid_t pid;
1217
1218        /*
1219         * Traditionally we see ptrace'd stopped tasks regardless of options.
1220         */
1221        if (!ptrace && !(wo->wo_flags & WUNTRACED))
1222                return 0;
1223
1224        if (!task_stopped_code(p, ptrace))
1225                return 0;
1226
1227        exit_code = 0;
1228        spin_lock_irq(&p->sighand->siglock);
1229
1230        p_code = task_stopped_code(p, ptrace);
1231        if (unlikely(!p_code))
1232                goto unlock_sig;
1233
1234        exit_code = *p_code;
1235        if (!exit_code)
1236                goto unlock_sig;
1237
1238        if (!unlikely(wo->wo_flags & WNOWAIT))
1239                *p_code = 0;
1240
1241        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1242unlock_sig:
1243        spin_unlock_irq(&p->sighand->siglock);
1244        if (!exit_code)
1245                return 0;
1246
1247        /*
1248         * Now we are pretty sure this task is interesting.
1249         * Make sure it doesn't get reaped out from under us while we
1250         * give up the lock and then examine it below.  We don't want to
1251         * keep holding onto the tasklist_lock while we call getrusage and
1252         * possibly take page faults for user memory.
1253         */
1254        get_task_struct(p);
1255        pid = task_pid_vnr(p);
1256        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1257        read_unlock(&tasklist_lock);
1258        sched_annotate_sleep();
1259        if (wo->wo_rusage)
1260                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1261        put_task_struct(p);
1262
1263        if (likely(!(wo->wo_flags & WNOWAIT)))
1264                wo->wo_stat = (exit_code << 8) | 0x7f;
1265
1266        infop = wo->wo_info;
1267        if (infop) {
1268                infop->cause = why;
1269                infop->status = exit_code;
1270                infop->pid = pid;
1271                infop->uid = uid;
1272        }
1273        return pid;
1274}
1275
1276/*
1277 * Handle do_wait work for one task in a live, non-stopped state.
1278 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1279 * the lock and this task is uninteresting.  If we return nonzero, we have
1280 * released the lock and the system call should return.
1281 */
1282static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1283{
1284        struct waitid_info *infop;
1285        pid_t pid;
1286        uid_t uid;
1287
1288        if (!unlikely(wo->wo_flags & WCONTINUED))
1289                return 0;
1290
1291        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1292                return 0;
1293
1294        spin_lock_irq(&p->sighand->siglock);
1295        /* Re-check with the lock held.  */
1296        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1297                spin_unlock_irq(&p->sighand->siglock);
1298                return 0;
1299        }
1300        if (!unlikely(wo->wo_flags & WNOWAIT))
1301                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1302        uid = from_kuid_munged(current_user_ns(), task_uid(p));
1303        spin_unlock_irq(&p->sighand->siglock);
1304
1305        pid = task_pid_vnr(p);
1306        get_task_struct(p);
1307        read_unlock(&tasklist_lock);
1308        sched_annotate_sleep();
1309        if (wo->wo_rusage)
1310                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1311        put_task_struct(p);
1312
1313        infop = wo->wo_info;
1314        if (!infop) {
1315                wo->wo_stat = 0xffff;
1316        } else {
1317                infop->cause = CLD_CONTINUED;
1318                infop->pid = pid;
1319                infop->uid = uid;
1320                infop->status = SIGCONT;
1321        }
1322        return pid;
1323}
1324
1325/*
1326 * Consider @p for a wait by @parent.
1327 *
1328 * -ECHILD should be in ->notask_error before the first call.
1329 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1330 * Returns zero if the search for a child should continue;
1331 * then ->notask_error is 0 if @p is an eligible child,
1332 * or still -ECHILD.
1333 */
1334static int wait_consider_task(struct wait_opts *wo, int ptrace,
1335                                struct task_struct *p)
1336{
1337        /*
1338         * We can race with wait_task_zombie() from another thread.
1339         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1340         * can't confuse the checks below.
1341         */
1342        int exit_state = ACCESS_ONCE(p->exit_state);
1343        int ret;
1344
1345        if (unlikely(exit_state == EXIT_DEAD))
1346                return 0;
1347
1348        ret = eligible_child(wo, ptrace, p);
1349        if (!ret)
1350                return ret;
1351
1352        if (unlikely(exit_state == EXIT_TRACE)) {
1353                /*
1354                 * ptrace == 0 means we are the natural parent. In this case
1355                 * we should clear notask_error, debugger will notify us.
1356                 */
1357                if (likely(!ptrace))
1358                        wo->notask_error = 0;
1359                return 0;
1360        }
1361
1362        if (likely(!ptrace) && unlikely(p->ptrace)) {
1363                /*
1364                 * If it is traced by its real parent's group, just pretend
1365                 * the caller is ptrace_do_wait() and reap this child if it
1366                 * is zombie.
1367                 *
1368                 * This also hides group stop state from real parent; otherwise
1369                 * a single stop can be reported twice as group and ptrace stop.
1370                 * If a ptracer wants to distinguish these two events for its
1371                 * own children it should create a separate process which takes
1372                 * the role of real parent.
1373                 */
1374                if (!ptrace_reparented(p))
1375                        ptrace = 1;
1376        }
1377
1378        /* slay zombie? */
1379        if (exit_state == EXIT_ZOMBIE) {
1380                /* we don't reap group leaders with subthreads */
1381                if (!delay_group_leader(p)) {
1382                        /*
1383                         * A zombie ptracee is only visible to its ptracer.
1384                         * Notification and reaping will be cascaded to the
1385                         * real parent when the ptracer detaches.
1386                         */
1387                        if (unlikely(ptrace) || likely(!p->ptrace))
1388                                return wait_task_zombie(wo, p);
1389                }
1390
1391                /*
1392                 * Allow access to stopped/continued state via zombie by
1393                 * falling through.  Clearing of notask_error is complex.
1394                 *
1395                 * When !@ptrace:
1396                 *
1397                 * If WEXITED is set, notask_error should naturally be
1398                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
1399                 * so, if there are live subthreads, there are events to
1400                 * wait for.  If all subthreads are dead, it's still safe
1401                 * to clear - this function will be called again in finite
1402                 * amount time once all the subthreads are released and
1403                 * will then return without clearing.
1404                 *
1405                 * When @ptrace:
1406                 *
1407                 * Stopped state is per-task and thus can't change once the
1408                 * target task dies.  Only continued and exited can happen.
1409                 * Clear notask_error if WCONTINUED | WEXITED.
1410                 */
1411                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1412                        wo->notask_error = 0;
1413        } else {
1414                /*
1415                 * @p is alive and it's gonna stop, continue or exit, so
1416                 * there always is something to wait for.
1417                 */
1418                wo->notask_error = 0;
1419        }
1420
1421        /*
1422         * Wait for stopped.  Depending on @ptrace, different stopped state
1423         * is used and the two don't interact with each other.
1424         */
1425        ret = wait_task_stopped(wo, ptrace, p);
1426        if (ret)
1427                return ret;
1428
1429        /*
1430         * Wait for continued.  There's only one continued state and the
1431         * ptracer can consume it which can confuse the real parent.  Don't
1432         * use WCONTINUED from ptracer.  You don't need or want it.
1433         */
1434        return wait_task_continued(wo, p);
1435}
1436
1437/*
1438 * Do the work of do_wait() for one thread in the group, @tsk.
1439 *
1440 * -ECHILD should be in ->notask_error before the first call.
1441 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1442 * Returns zero if the search for a child should continue; then
1443 * ->notask_error is 0 if there were any eligible children,
1444 * or still -ECHILD.
1445 */
1446static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1447{
1448        struct task_struct *p;
1449
1450        list_for_each_entry(p, &tsk->children, sibling) {
1451                int ret = wait_consider_task(wo, 0, p);
1452
1453                if (ret)
1454                        return ret;
1455        }
1456
1457        return 0;
1458}
1459
1460static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1461{
1462        struct task_struct *p;
1463
1464        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1465                int ret = wait_consider_task(wo, 1, p);
1466
1467                if (ret)
1468                        return ret;
1469        }
1470
1471        return 0;
1472}
1473
1474static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1475                                int sync, void *key)
1476{
1477        struct wait_opts *wo = container_of(wait, struct wait_opts,
1478                                                child_wait);
1479        struct task_struct *p = key;
1480
1481        if (!eligible_pid(wo, p))
1482                return 0;
1483
1484        if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1485                return 0;
1486
1487        return default_wake_function(wait, mode, sync, key);
1488}
1489
1490void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1491{
1492        __wake_up_sync_key(&parent->signal->wait_chldexit,
1493                                TASK_INTERRUPTIBLE, 1, p);
1494}
1495
1496static long do_wait(struct wait_opts *wo)
1497{
1498        struct task_struct *tsk;
1499        int retval;
1500
1501        trace_sched_process_wait(wo->wo_pid);
1502
1503        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1504        wo->child_wait.private = current;
1505        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1506repeat:
1507        /*
1508         * If there is nothing that can match our criteria, just get out.
1509         * We will clear ->notask_error to zero if we see any child that
1510         * might later match our criteria, even if we are not able to reap
1511         * it yet.
1512         */
1513        wo->notask_error = -ECHILD;
1514        if ((wo->wo_type < PIDTYPE_MAX) &&
1515           (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1516                goto notask;
1517
1518        set_current_state(TASK_INTERRUPTIBLE);
1519        read_lock(&tasklist_lock);
1520        tsk = current;
1521        do {
1522                retval = do_wait_thread(wo, tsk);
1523                if (retval)
1524                        goto end;
1525
1526                retval = ptrace_do_wait(wo, tsk);
1527                if (retval)
1528                        goto end;
1529
1530                if (wo->wo_flags & __WNOTHREAD)
1531                        break;
1532        } while_each_thread(current, tsk);
1533        read_unlock(&tasklist_lock);
1534
1535notask:
1536        retval = wo->notask_error;
1537        if (!retval && !(wo->wo_flags & WNOHANG)) {
1538                retval = -ERESTARTSYS;
1539                if (!signal_pending(current)) {
1540                        schedule();
1541                        goto repeat;
1542                }
1543        }
1544end:
1545        __set_current_state(TASK_RUNNING);
1546        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1547        return retval;
1548}
1549
1550static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1551                          int options, struct rusage *ru)
1552{
1553        struct wait_opts wo;
1554        struct pid *pid = NULL;
1555        enum pid_type type;
1556        long ret;
1557
1558        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1559                        __WNOTHREAD|__WCLONE|__WALL))
1560                return -EINVAL;
1561        if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1562                return -EINVAL;
1563
1564        switch (which) {
1565        case P_ALL:
1566                type = PIDTYPE_MAX;
1567                break;
1568        case P_PID:
1569                type = PIDTYPE_PID;
1570                if (upid <= 0)
1571                        return -EINVAL;
1572                break;
1573        case P_PGID:
1574                type = PIDTYPE_PGID;
1575                if (upid <= 0)
1576                        return -EINVAL;
1577                break;
1578        default:
1579                return -EINVAL;
1580        }
1581
1582        if (type < PIDTYPE_MAX)
1583                pid = find_get_pid(upid);
1584
1585        wo.wo_type      = type;
1586        wo.wo_pid       = pid;
1587        wo.wo_flags     = options;
1588        wo.wo_info      = infop;
1589        wo.wo_rusage    = ru;
1590        ret = do_wait(&wo);
1591
1592        put_pid(pid);
1593        return ret;
1594}
1595
1596SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1597                infop, int, options, struct rusage __user *, ru)
1598{
1599        struct rusage r;
1600        struct waitid_info info = {.status = 0};
1601        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1602        int signo = 0;
1603
1604        if (err > 0) {
1605                signo = SIGCHLD;
1606                err = 0;
1607                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1608                        return -EFAULT;
1609        }
1610        if (!infop)
1611                return err;
1612
1613        if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
1614                return -EFAULT;
1615
1616        user_access_begin();
1617        unsafe_put_user(signo, &infop->si_signo, Efault);
1618        unsafe_put_user(0, &infop->si_errno, Efault);
1619        unsafe_put_user(info.cause, &infop->si_code, Efault);
1620        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1621        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1622        unsafe_put_user(info.status, &infop->si_status, Efault);
1623        user_access_end();
1624        return err;
1625Efault:
1626        user_access_end();
1627        return -EFAULT;
1628}
1629
1630long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1631                  struct rusage *ru)
1632{
1633        struct wait_opts wo;
1634        struct pid *pid = NULL;
1635        enum pid_type type;
1636        long ret;
1637
1638        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1639                        __WNOTHREAD|__WCLONE|__WALL))
1640                return -EINVAL;
1641
1642        /* -INT_MIN is not defined */
1643        if (upid == INT_MIN)
1644                return -ESRCH;
1645
1646        if (upid == -1)
1647                type = PIDTYPE_MAX;
1648        else if (upid < 0) {
1649                type = PIDTYPE_PGID;
1650                pid = find_get_pid(-upid);
1651        } else if (upid == 0) {
1652                type = PIDTYPE_PGID;
1653                pid = get_task_pid(current, PIDTYPE_PGID);
1654        } else /* upid > 0 */ {
1655                type = PIDTYPE_PID;
1656                pid = find_get_pid(upid);
1657        }
1658
1659        wo.wo_type      = type;
1660        wo.wo_pid       = pid;
1661        wo.wo_flags     = options | WEXITED;
1662        wo.wo_info      = NULL;
1663        wo.wo_stat      = 0;
1664        wo.wo_rusage    = ru;
1665        ret = do_wait(&wo);
1666        put_pid(pid);
1667        if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1668                ret = -EFAULT;
1669
1670        return ret;
1671}
1672
1673SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1674                int, options, struct rusage __user *, ru)
1675{
1676        struct rusage r;
1677        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1678
1679        if (err > 0) {
1680                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1681                        return -EFAULT;
1682        }
1683        return err;
1684}
1685
1686#ifdef __ARCH_WANT_SYS_WAITPID
1687
1688/*
1689 * sys_waitpid() remains for compatibility. waitpid() should be
1690 * implemented by calling sys_wait4() from libc.a.
1691 */
1692SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1693{
1694        return sys_wait4(pid, stat_addr, options, NULL);
1695}
1696
1697#endif
1698
1699#ifdef CONFIG_COMPAT
1700COMPAT_SYSCALL_DEFINE4(wait4,
1701        compat_pid_t, pid,
1702        compat_uint_t __user *, stat_addr,
1703        int, options,
1704        struct compat_rusage __user *, ru)
1705{
1706        struct rusage r;
1707        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1708        if (err > 0) {
1709                if (ru && put_compat_rusage(&r, ru))
1710                        return -EFAULT;
1711        }
1712        return err;
1713}
1714
1715COMPAT_SYSCALL_DEFINE5(waitid,
1716                int, which, compat_pid_t, pid,
1717                struct compat_siginfo __user *, infop, int, options,
1718                struct compat_rusage __user *, uru)
1719{
1720        struct rusage ru;
1721        struct waitid_info info = {.status = 0};
1722        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1723        int signo = 0;
1724        if (err > 0) {
1725                signo = SIGCHLD;
1726                err = 0;
1727                if (uru) {
1728                        /* kernel_waitid() overwrites everything in ru */
1729                        if (COMPAT_USE_64BIT_TIME)
1730                                err = copy_to_user(uru, &ru, sizeof(ru));
1731                        else
1732                                err = put_compat_rusage(&ru, uru);
1733                        if (err)
1734                                return -EFAULT;
1735                }
1736        }
1737
1738        if (!infop)
1739                return err;
1740
1741        if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
1742                return -EFAULT;
1743
1744        user_access_begin();
1745        unsafe_put_user(signo, &infop->si_signo, Efault);
1746        unsafe_put_user(0, &infop->si_errno, Efault);
1747        unsafe_put_user(info.cause, &infop->si_code, Efault);
1748        unsafe_put_user(info.pid, &infop->si_pid, Efault);
1749        unsafe_put_user(info.uid, &infop->si_uid, Efault);
1750        unsafe_put_user(info.status, &infop->si_status, Efault);
1751        user_access_end();
1752        return err;
1753Efault:
1754        user_access_end();
1755        return -EFAULT;
1756}
1757#endif
1758