linux/mm/oom_kill.c
   1/*
   2 *  linux/mm/oom_kill.c
   3 * 
   4 *  Copyright (C)  1998,2000  Rik van Riel
   5 *      Thanks go out to Claus Fischer for some serious inspiration and
   6 *      for goading me into coding this file...
   7 *  Copyright (C)  2010  Google, Inc.
   8 *      Rewritten by David Rientjes
   9 *
  10 *  The routines in this file are used to kill a process when
  11 *  we're seriously out of memory. This gets called from __alloc_pages()
  12 *  in mm/page_alloc.c when we really run out of memory.
  13 *
  14 *  Since we won't call these routines often (on a well-configured
  15 *  machine) this file will double as a 'coding guide' and a signpost
  16 *  for newbie kernel hackers. It features several pointers to major
  17 *  kernel subsystems and hints as to where to find out what things do.
  18 */
  19
  20#include <linux/oom.h>
  21#include <linux/mm.h>
  22#include <linux/err.h>
  23#include <linux/gfp.h>
  24#include <linux/sched.h>
  25#include <linux/swap.h>
  26#include <linux/timex.h>
  27#include <linux/jiffies.h>
  28#include <linux/cpuset.h>
  29#include <linux/module.h>
  30#include <linux/notifier.h>
  31#include <linux/memcontrol.h>
  32#include <linux/mempolicy.h>
  33#include <linux/security.h>
  34
  35int sysctl_panic_on_oom;
  36int sysctl_oom_kill_allocating_task;
  37int sysctl_oom_dump_tasks = 1;
  38static DEFINE_SPINLOCK(zone_scan_lock);
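
/*
 * The three sysctl knobs above are exposed to userspace under /proc/sys/vm/
 * (vm.panic_on_oom, vm.oom_kill_allocating_task, vm.oom_dump_tasks); the
 * sysctl table entries themselves are registered outside this file (in
 * kernel/sysctl.c).  panic_on_oom panics instead of killing (1 = only for
 * unconstrained ooms, 2 = always, see check_panic_on_oom() below),
 * oom_kill_allocating_task kills current rather than scanning the tasklist,
 * and oom_dump_tasks (on by default) prints the eligible-task table via
 * dump_tasks() whenever the oom killer fires.
 */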
  39
  40#ifdef CONFIG_NUMA
  41/**
   42 * has_intersects_mems_allowed() - check task eligibility for kill
  43 * @tsk: task struct of which task to consider
  44 * @mask: nodemask passed to page allocator for mempolicy ooms
  45 *
   46 * Task eligibility is determined by whether or not a candidate task, @tsk,
   47 * shares the same mempolicy nodes as current if it is bound by such a policy,
   48 * or, otherwise, by whether its set of allowed cpuset nodes intersects current's.
  49 */
  50static bool has_intersects_mems_allowed(struct task_struct *tsk,
  51                                        const nodemask_t *mask)
  52{
  53        struct task_struct *start = tsk;
  54
  55        do {
  56                if (mask) {
  57                        /*
  58                         * If this is a mempolicy constrained oom, tsk's
  59                         * cpuset is irrelevant.  Only return true if its
  60                         * mempolicy intersects current, otherwise it may be
  61                         * needlessly killed.
  62                         */
  63                        if (mempolicy_nodemask_intersects(tsk, mask))
  64                                return true;
  65                } else {
  66                        /*
  67                         * This is not a mempolicy constrained oom, so only
  68                         * check the mems of tsk's cpuset.
  69                         */
  70                        if (cpuset_mems_allowed_intersects(current, tsk))
  71                                return true;
  72                }
  73        } while_each_thread(start, tsk);
  74
  75        return false;
  76}
  77#else
  78static bool has_intersects_mems_allowed(struct task_struct *tsk,
  79                                        const nodemask_t *mask)
  80{
  81        return true;
  82}
  83#endif /* CONFIG_NUMA */
  84
  85/*
  86 * If this is a system OOM (not a memcg OOM) and the task selected to be
  87 * killed is not already running at high (RT) priorities, speed up the
  88 * recovery by boosting the dying task to the lowest FIFO priority.
  89 * That helps with the recovery and avoids interfering with RT tasks.
  90 */
  91static void boost_dying_task_prio(struct task_struct *p,
  92                                  struct mem_cgroup *mem)
  93{
  94        struct sched_param param = { .sched_priority = 1 };
  95
  96        if (mem)
  97                return;
  98
  99        if (!rt_task(p))
 100                sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 101}
 102
 103/*
 104 * The process p may have detached its own ->mm while exiting or through
 105 * use_mm(), but one or more of its subthreads may still have a valid
 106 * pointer.  Return p, or any of its subthreads with a valid ->mm, with
 107 * task_lock() held.
 108 */
 109struct task_struct *find_lock_task_mm(struct task_struct *p)
 110{
 111        struct task_struct *t = p;
 112
 113        do {
 114                task_lock(t);
 115                if (likely(t->mm))
 116                        return t;
 117                task_unlock(t);
 118        } while_each_thread(p, t);
 119
 120        return NULL;
 121}
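
Every caller in this file (oom_badness(), dump_tasks() and oom_kill_task()) follows the same pattern: dereference the thread returned by find_lock_task_mm() rather than the original argument, and drop the lock as soon as the ->mm fields have been read. A minimal sketch of that convention, using a hypothetical helper name:

/* Hypothetical example: read total_vm from whichever thread still owns the mm. */
static unsigned long example_total_vm(struct task_struct *p)
{
	struct task_struct *t = find_lock_task_mm(p);
	unsigned long total_vm = 0;

	if (t) {
		total_vm = t->mm->total_vm;	/* t->mm is stable while task_lock(t) is held */
		task_unlock(t);
	}
	return total_vm;
}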
 122
  123/* Return true if the task is not suitable as a candidate victim task. */
 124static bool oom_unkillable_task(struct task_struct *p,
 125                const struct mem_cgroup *mem, const nodemask_t *nodemask)
 126{
 127        if (is_global_init(p))
 128                return true;
 129        if (p->flags & PF_KTHREAD)
 130                return true;
 131
  132        /* When called from mem_cgroup_out_of_memory() and p is not in the memcg */
 133        if (mem && !task_in_mem_cgroup(p, mem))
 134                return true;
 135
 136        /* p may not have freeable memory in nodemask */
 137        if (!has_intersects_mems_allowed(p, nodemask))
 138                return true;
 139
 140        return false;
 141}
 142
 143/**
 144 * oom_badness - heuristic function to determine which candidate task to kill
  145 * @p: task struct of the task whose badness score we should calculate
  146 * @totalpages: total pages (RAM plus swap) available to the allocation
 147 *
 148 * The heuristic for determining which task to kill is made to be as simple and
 149 * predictable as possible.  The goal is to return the highest value for the
 150 * task consuming the most memory to avoid subsequent oom failures.
 151 */
 152unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 153                      const nodemask_t *nodemask, unsigned long totalpages)
 154{
 155        int points;
 156
 157        if (oom_unkillable_task(p, mem, nodemask))
 158                return 0;
 159
 160        p = find_lock_task_mm(p);
 161        if (!p)
 162                return 0;
 163
 164        /*
  165         * Shortcut check: if any task sharing p->mm has oom_score_adj set to
  166         * OOM_SCORE_ADJ_MIN, the mm cannot be oom killed, so the rest of the
  167         * heuristic need not run.
 168         */
 169        if (atomic_read(&p->mm->oom_disable_count)) {
 170                task_unlock(p);
 171                return 0;
 172        }
 173
 174        /*
 175         * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
 176         * priority for oom killing.
 177         */
 178        if (p->flags & PF_OOM_ORIGIN) {
 179                task_unlock(p);
 180                return 1000;
 181        }
 182
 183        /*
 184         * The memory controller may have a limit of 0 bytes, so avoid a divide
 185         * by zero, if necessary.
 186         */
 187        if (!totalpages)
 188                totalpages = 1;
 189
 190        /*
  191         * The baseline for the badness score is the proportion of available
  192         * memory (RAM plus swap) that the task's rss and swap space use.
 193         */
 194        points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
 195                        totalpages;
 196        task_unlock(p);
 197
 198        /*
  199         * Root processes get a 3% bonus, just like the __vm_enough_memory()
 200         * implementation used by LSMs.
 201         */
 202        if (has_capability_noaudit(p, CAP_SYS_ADMIN))
 203                points -= 30;
 204
 205        /*
 206         * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
 207         * either completely disable oom killing or always prefer a certain
 208         * task.
 209         */
 210        points += p->signal->oom_score_adj;
 211
 212        /*
 213         * Never return 0 for an eligible task that may be killed since it's
 214         * possible that no single user task uses more than 0.1% of memory and
  215         * no single admin task uses more than 3.0%.
 216         */
 217        if (points <= 0)
 218                return 1;
 219        return (points < 1000) ? points : 1000;
 220}
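
As a concrete (made-up) illustration of the arithmetic above: on a machine with roughly one million pages of RAM plus swap, a root-owned task using 250,000 pages of rss and swap starts at 250, drops to 220 after the 3% root bonus, and an oom_score_adj of -100 brings it to a final score of 120 (the result is always clamped to the range [1, 1000] for eligible tasks). The oom_score_adj term is set from userspace through /proc/<pid>/oom_score_adj, and the resulting badness is visible in /proc/<pid>/oom_score. A minimal userspace sketch, assuming those proc files:

/* Illustrative userspace program, not kernel code: make the calling process
 * a more attractive oom victim by raising its oom_score_adj. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f)
		return 1;
	fputs("500\n", f);	/* -1000 disables oom killing for the task, +1000 prefers it */
	return fclose(f) ? 1 : 0;
}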
 221
 222/*
 223 * Determine the type of allocation constraint.
 224 */
 225#ifdef CONFIG_NUMA
 226static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 227                                gfp_t gfp_mask, nodemask_t *nodemask,
 228                                unsigned long *totalpages)
 229{
 230        struct zone *zone;
 231        struct zoneref *z;
 232        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 233        bool cpuset_limited = false;
 234        int nid;
 235
 236        /* Default to all available memory */
 237        *totalpages = totalram_pages + total_swap_pages;
 238
 239        if (!zonelist)
 240                return CONSTRAINT_NONE;
 241        /*
  242         * We only get here for a __GFP_THISNODE allocation when __GFP_NOFAIL is
  243         * also used, so don't single out current; fall back to killing an arbitrary
  244         * task.  Ideally this would be CONSTRAINT_THISNODE, but that isn't handled yet.
 245         */
 246        if (gfp_mask & __GFP_THISNODE)
 247                return CONSTRAINT_NONE;
 248
 249        /*
 250         * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
 251         * the page allocator means a mempolicy is in effect.  Cpuset policy
 252         * is enforced in get_page_from_freelist().
 253         */
 254        if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
 255                *totalpages = total_swap_pages;
 256                for_each_node_mask(nid, *nodemask)
 257                        *totalpages += node_spanned_pages(nid);
 258                return CONSTRAINT_MEMORY_POLICY;
 259        }
 260
  261        /* Check whether this allocation failure is caused by cpuset's wall function */
 262        for_each_zone_zonelist_nodemask(zone, z, zonelist,
 263                        high_zoneidx, nodemask)
 264                if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
 265                        cpuset_limited = true;
 266
 267        if (cpuset_limited) {
 268                *totalpages = total_swap_pages;
 269                for_each_node_mask(nid, cpuset_current_mems_allowed)
 270                        *totalpages += node_spanned_pages(nid);
 271                return CONSTRAINT_CPUSET;
 272        }
 273        return CONSTRAINT_NONE;
 274}
 275#else
 276static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 277                                gfp_t gfp_mask, nodemask_t *nodemask,
 278                                unsigned long *totalpages)
 279{
 280        *totalpages = totalram_pages + total_swap_pages;
 281        return CONSTRAINT_NONE;
 282}
 283#endif
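
/*
 * Summary of the constraint types returned above (the CONSTRAINT_* values come
 * from <linux/oom.h>) and the memory base subsequently used for scoring:
 *   CONSTRAINT_NONE          - unconstrained; totalpages = all RAM + swap
 *   CONSTRAINT_MEMORY_POLICY - mempolicy-bound; totalpages = mempolicy nodes + swap
 *   CONSTRAINT_CPUSET        - cpuset-bound; totalpages = cpuset mems + swap
 *   CONSTRAINT_MEMCG         - memcg limit hit; totalpages = the memcg limit,
 *                              see mem_cgroup_out_of_memory() below
 */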
 284
 285/*
   286 * Simple selection loop. We choose the process with the highest
   287 * number of 'points'.  The caller must hold tasklist_lock.
 288 *
 289 * (not docbooked, we don't want this one cluttering up the manual)
 290 */
 291static struct task_struct *select_bad_process(unsigned int *ppoints,
 292                unsigned long totalpages, struct mem_cgroup *mem,
 293                const nodemask_t *nodemask)
 294{
 295        struct task_struct *p;
 296        struct task_struct *chosen = NULL;
 297        *ppoints = 0;
 298
 299        for_each_process(p) {
 300                unsigned int points;
 301
 302                if (oom_unkillable_task(p, mem, nodemask))
 303                        continue;
 304
 305                /*
 306                 * This task already has access to memory reserves and is
 307                 * being killed. Don't allow any other task access to the
 308                 * memory reserve.
 309                 *
 310                 * Note: this may have a chance of deadlock if it gets
 311                 * blocked waiting for another task which itself is waiting
 312                 * for memory. Is there a better alternative?
 313                 */
 314                if (test_tsk_thread_flag(p, TIF_MEMDIE))
 315                        return ERR_PTR(-1UL);
 316
 317                /*
 318                 * This is in the process of releasing memory so wait for it
 319                 * to finish before killing some other task by mistake.
 320                 *
 321                 * However, if p is the current task, we allow the 'kill' to
 322                 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
 323                 * which will allow it to gain access to memory reserves in
 324                 * the process of exiting and releasing its resources.
 325                 * Otherwise we could get an easy OOM deadlock.
 326                 */
 327                if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
 328                        if (p != current)
 329                                return ERR_PTR(-1UL);
 330
 331                        chosen = p;
 332                        *ppoints = 1000;
 333                }
 334
 335                points = oom_badness(p, mem, nodemask, totalpages);
 336                if (points > *ppoints) {
 337                        chosen = p;
 338                        *ppoints = points;
 339                }
 340        }
 341
 342        return chosen;
 343}
 344
 345/**
 346 * dump_tasks - dump current memory state of all system tasks
 347 * @mem: current's memory controller, if constrained
 348 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 349 *
 350 * Dumps the current memory state of all eligible tasks.  Tasks not in the same
 351 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 352 * are not shown.
 353 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
 354 * value, oom_score_adj value, and name.
 355 *
 356 * Call with tasklist_lock read-locked.
 357 */
 358static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
 359{
 360        struct task_struct *p;
 361        struct task_struct *task;
 362
 363        pr_info("[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name\n");
 364        for_each_process(p) {
 365                if (oom_unkillable_task(p, mem, nodemask))
 366                        continue;
 367
 368                task = find_lock_task_mm(p);
 369                if (!task) {
 370                        /*
 371                         * This is a kthread or all of p's threads have already
 372                         * detached their mm's.  There's no need to report
 373                         * them; they can't be oom killed anyway.
 374                         */
 375                        continue;
 376                }
 377
 378                pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5d %s\n",
 379                        task->pid, task_uid(task), task->tgid,
 380                        task->mm->total_vm, get_mm_rss(task->mm),
 381                        task_cpu(task), task->signal->oom_adj,
 382                        task->signal->oom_score_adj, task->comm);
 383                task_unlock(task);
 384        }
 385}
 386
 387static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 388                        struct mem_cgroup *mem, const nodemask_t *nodemask)
 389{
 390        task_lock(current);
 391        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
 392                "oom_adj=%d, oom_score_adj=%d\n",
 393                current->comm, gfp_mask, order, current->signal->oom_adj,
 394                current->signal->oom_score_adj);
 395        cpuset_print_task_mems_allowed(current);
 396        task_unlock(current);
 397        dump_stack();
 398        mem_cgroup_print_oom_info(mem, p);
 399        show_mem();
 400        if (sysctl_oom_dump_tasks)
 401                dump_tasks(mem, nodemask);
 402}
 403
 404#define K(x) ((x) << (PAGE_SHIFT-10))
 405static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 406{
 407        struct task_struct *q;
 408        struct mm_struct *mm;
 409
 410        p = find_lock_task_mm(p);
 411        if (!p)
 412                return 1;
 413
 414        /* mm cannot be safely dereferenced after task_unlock(p) */
 415        mm = p->mm;
 416
 417        pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
 418                task_pid_nr(p), p->comm, K(p->mm->total_vm),
 419                K(get_mm_counter(p->mm, MM_ANONPAGES)),
 420                K(get_mm_counter(p->mm, MM_FILEPAGES)));
 421        task_unlock(p);
 422
 423        /*
 424         * Kill all processes sharing p->mm in other thread groups, if any.
 425         * They don't get access to memory reserves or a higher scheduler
 426         * priority, though, to avoid depletion of all memory or task
 427         * starvation.  This prevents mm->mmap_sem livelock when an oom killed
  428         * task cannot exit because it requires the semaphore and it's contended
 429         * by another thread trying to allocate memory itself.  That thread will
 430         * now get access to memory reserves since it has a pending fatal
 431         * signal.
 432         */
 433        for_each_process(q)
 434                if (q->mm == mm && !same_thread_group(q, p)) {
 435                        task_lock(q);   /* Protect ->comm from prctl() */
 436                        pr_err("Kill process %d (%s) sharing same memory\n",
 437                                task_pid_nr(q), q->comm);
 438                        task_unlock(q);
 439                        force_sig(SIGKILL, q);
 440                }
 441
 442        set_tsk_thread_flag(p, TIF_MEMDIE);
 443        force_sig(SIGKILL, p);
 444
 445        /*
 446         * We give our sacrificial lamb high priority and access to
 447         * all the memory it needs. That way it should be able to
 448         * exit() and clear out its resources quickly...
 449         */
 450        boost_dying_task_prio(p, mem);
 451
 452        return 0;
 453}
 454#undef K
 455
 456static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 457                            unsigned int points, unsigned long totalpages,
 458                            struct mem_cgroup *mem, nodemask_t *nodemask,
 459                            const char *message)
 460{
 461        struct task_struct *victim = p;
 462        struct task_struct *child;
 463        struct task_struct *t = p;
 464        unsigned int victim_points = 0;
 465
 466        if (printk_ratelimit())
 467                dump_header(p, gfp_mask, order, mem, nodemask);
 468
 469        /*
 470         * If the task is already exiting, don't alarm the sysadmin or kill
  471         * its children or threads; just set TIF_MEMDIE so it can die quickly.
 472         */
 473        if (p->flags & PF_EXITING) {
 474                set_tsk_thread_flag(p, TIF_MEMDIE);
 475                boost_dying_task_prio(p, mem);
 476                return 0;
 477        }
 478
 479        task_lock(p);
 480        pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
 481                message, task_pid_nr(p), p->comm, points);
 482        task_unlock(p);
 483
 484        /*
 485         * If any of p's children has a different mm and is eligible for kill,
 486         * the one with the highest badness() score is sacrificed for its
 487         * parent.  This attempts to lose the minimal amount of work done while
 488         * still freeing memory.
 489         */
 490        do {
 491                list_for_each_entry(child, &t->children, sibling) {
 492                        unsigned int child_points;
 493
 494                        /*
 495                         * oom_badness() returns 0 if the thread is unkillable
 496                         */
 497                        child_points = oom_badness(child, mem, nodemask,
 498                                                                totalpages);
 499                        if (child_points > victim_points) {
 500                                victim = child;
 501                                victim_points = child_points;
 502                        }
 503                }
 504        } while_each_thread(p, t);
 505
 506        return oom_kill_task(victim, mem);
 507}
 508
 509/*
 510 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 511 */
 512static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 513                                int order, const nodemask_t *nodemask)
 514{
 515        if (likely(!sysctl_panic_on_oom))
 516                return;
 517        if (sysctl_panic_on_oom != 2) {
 518                /*
 519                 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
 520                 * does not panic for cpuset, mempolicy, or memcg allocation
 521                 * failures.
 522                 */
 523                if (constraint != CONSTRAINT_NONE)
 524                        return;
 525        }
 526        read_lock(&tasklist_lock);
 527        dump_header(NULL, gfp_mask, order, NULL, nodemask);
 528        read_unlock(&tasklist_lock);
 529        panic("Out of memory: %s panic_on_oom is enabled\n",
 530                sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
 531}
 532
 533#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 534void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 535{
 536        unsigned long limit;
 537        unsigned int points = 0;
 538        struct task_struct *p;
 539
 540        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
 541        limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
 542        read_lock(&tasklist_lock);
 543retry:
 544        p = select_bad_process(&points, limit, mem, NULL);
 545        if (!p || PTR_ERR(p) == -1UL)
 546                goto out;
 547
 548        if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
 549                                "Memory cgroup out of memory"))
 550                goto retry;
 551out:
 552        read_unlock(&tasklist_lock);
 553}
 554#endif
 555
 556static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
 557
 558int register_oom_notifier(struct notifier_block *nb)
 559{
 560        return blocking_notifier_chain_register(&oom_notify_list, nb);
 561}
 562EXPORT_SYMBOL_GPL(register_oom_notifier);
 563
 564int unregister_oom_notifier(struct notifier_block *nb)
 565{
 566        return blocking_notifier_chain_unregister(&oom_notify_list, nb);
 567}
 568EXPORT_SYMBOL_GPL(unregister_oom_notifier);
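
The notifier chain gives other subsystems a chance to release memory before any killing is attempted: out_of_memory() below runs the chain first and returns early if a callback reports freed pages. A minimal sketch of a client, with a hypothetical shrink_private_reserve() standing in for whatever reserve the caller can actually release:

/* Hypothetical notifier client: report how many pages were released so that
 * out_of_memory() can bail out without killing anything. */
static unsigned long shrink_private_reserve(void);	/* assumed helper */

static int example_oom_notify(struct notifier_block *nb,
			      unsigned long unused, void *parm)
{
	unsigned long *freed = parm;

	*freed += shrink_private_reserve();
	return NOTIFY_OK;
}

static struct notifier_block example_oom_nb = {
	.notifier_call = example_oom_notify,
};

/* ... and in the client's init path: register_oom_notifier(&example_oom_nb); */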
 569
 570/*
 571 * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
 572 * if a parallel OOM killing is already taking place that includes a zone in
 573 * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
 574 */
 575int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 576{
 577        struct zoneref *z;
 578        struct zone *zone;
 579        int ret = 1;
 580
 581        spin_lock(&zone_scan_lock);
 582        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 583                if (zone_is_oom_locked(zone)) {
 584                        ret = 0;
 585                        goto out;
 586                }
 587        }
 588
 589        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 590                /*
 591                 * Lock each zone in the zonelist under zone_scan_lock so a
 592                 * parallel invocation of try_set_zonelist_oom() doesn't succeed
 593                 * when it shouldn't.
 594                 */
 595                zone_set_flag(zone, ZONE_OOM_LOCKED);
 596        }
 597
 598out:
 599        spin_unlock(&zone_scan_lock);
 600        return ret;
 601}
 602
 603/*
 604 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
 605 * allocation attempts with zonelists containing them may now recall the OOM
 606 * killer, if necessary.
 607 */
 608void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 609{
 610        struct zoneref *z;
 611        struct zone *zone;
 612
 613        spin_lock(&zone_scan_lock);
 614        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 615                zone_clear_flag(zone, ZONE_OOM_LOCKED);
 616        }
 617        spin_unlock(&zone_scan_lock);
 618}
 619
 620/*
 621 * Try to acquire the oom killer lock for all system zones.  Returns zero if a
 622 * parallel oom killing is taking place, otherwise locks all zones and returns
 623 * non-zero.
 624 */
 625static int try_set_system_oom(void)
 626{
 627        struct zone *zone;
 628        int ret = 1;
 629
 630        spin_lock(&zone_scan_lock);
 631        for_each_populated_zone(zone)
 632                if (zone_is_oom_locked(zone)) {
 633                        ret = 0;
 634                        goto out;
 635                }
 636        for_each_populated_zone(zone)
 637                zone_set_flag(zone, ZONE_OOM_LOCKED);
 638out:
 639        spin_unlock(&zone_scan_lock);
 640        return ret;
 641}
 642
 643/*
 644 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
 645 * attempts or page faults may now recall the oom killer, if necessary.
 646 */
 647static void clear_system_oom(void)
 648{
 649        struct zone *zone;
 650
 651        spin_lock(&zone_scan_lock);
 652        for_each_populated_zone(zone)
 653                zone_clear_flag(zone, ZONE_OOM_LOCKED);
 654        spin_unlock(&zone_scan_lock);
 655}
 656
 657/**
 658 * out_of_memory - kill the "best" process when we run out of memory
 659 * @zonelist: zonelist pointer
 660 * @gfp_mask: memory allocation flags
 661 * @order: amount of memory being requested as a power of 2
 662 * @nodemask: nodemask passed to page allocator
 663 *
 664 * If we run out of memory, we have the choice between either
 665 * killing a random task (bad), letting the system crash (worse)
   666 * or trying to be smart about which process to kill. Note that we
 667 * don't have to be perfect here, we just have to be good.
 668 */
 669void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 670                int order, nodemask_t *nodemask)
 671{
 672        const nodemask_t *mpol_mask;
 673        struct task_struct *p;
 674        unsigned long totalpages;
 675        unsigned long freed = 0;
 676        unsigned int points;
 677        enum oom_constraint constraint = CONSTRAINT_NONE;
 678        int killed = 0;
 679
 680        blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 681        if (freed > 0)
 682                /* Got some memory back in the last second. */
 683                return;
 684
 685        /*
 686         * If current has a pending SIGKILL, then automatically select it.  The
 687         * goal is to allow it to allocate so that it may quickly exit and free
 688         * its memory.
 689         */
 690        if (fatal_signal_pending(current)) {
 691                set_thread_flag(TIF_MEMDIE);
 692                boost_dying_task_prio(current, NULL);
 693                return;
 694        }
 695
 696        /*
 697         * Check if there were limitations on the allocation (only relevant for
 698         * NUMA) that may require different handling.
 699         */
 700        constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
 701                                                &totalpages);
 702        mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
 703        check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
 704
 705        read_lock(&tasklist_lock);
 706        if (sysctl_oom_kill_allocating_task &&
 707            !oom_unkillable_task(current, NULL, nodemask) &&
 708            current->mm && !atomic_read(&current->mm->oom_disable_count)) {
 709                /*
 710                 * oom_kill_process() needs tasklist_lock held.  If it returns
  711                 * non-zero, current could not be killed so we must fall back to
 712                 * the tasklist scan.
 713                 */
 714                if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
 715                                NULL, nodemask,
 716                                "Out of memory (oom_kill_allocating_task)"))
 717                        goto out;
 718        }
 719
 720retry:
 721        p = select_bad_process(&points, totalpages, NULL, mpol_mask);
 722        if (PTR_ERR(p) == -1UL)
 723                goto out;
 724
 725        /* Found nothing?!?! Either we hang forever, or we panic. */
 726        if (!p) {
 727                dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
 728                read_unlock(&tasklist_lock);
 729                panic("Out of memory and no killable processes...\n");
 730        }
 731
 732        if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
 733                                nodemask, "Out of memory"))
 734                goto retry;
 735        killed = 1;
 736out:
 737        read_unlock(&tasklist_lock);
 738
 739        /*
 740         * Give "p" a good chance of killing itself before we
  741         * retry the allocation, unless "p" is current.
 742         */
 743        if (killed && !test_thread_flag(TIF_MEMDIE))
 744                schedule_timeout_uninterruptible(1);
 745}
 746
 747/*
 748 * The pagefault handler calls here because it is out of memory, so kill a
 749 * memory-hogging task.  If a populated zone has ZONE_OOM_LOCKED set, a parallel
 750 * oom killing is already in progress so do nothing.  If a task is found with
 751 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
 752 */
 753void pagefault_out_of_memory(void)
 754{
 755        if (try_set_system_oom()) {
 756                out_of_memory(NULL, 0, 0, NULL);
 757                clear_system_oom();
 758        }
 759        if (!test_thread_flag(TIF_MEMDIE))
 760                schedule_timeout_uninterruptible(1);
 761}
 762