   1/*
   2 *  linux/mm/oom_kill.c
   3 * 
   4 *  Copyright (C)  1998,2000  Rik van Riel
   5 *      Thanks go out to Claus Fischer for some serious inspiration and
   6 *      for goading me into coding this file...
   7 *  Copyright (C)  2010  Google, Inc.
   8 *      Rewritten by David Rientjes
   9 *
  10 *  The routines in this file are used to kill a process when
  11 *  we're seriously out of memory. This gets called from __alloc_pages()
  12 *  in mm/page_alloc.c when we really run out of memory.
  13 *
  14 *  Since we won't call these routines often (on a well-configured
  15 *  machine) this file will double as a 'coding guide' and a signpost
  16 *  for newbie kernel hackers. It features several pointers to major
  17 *  kernel subsystems and hints as to where to find out what things do.
  18 */
  19
  20#include <linux/oom.h>
  21#include <linux/mm.h>
  22#include <linux/err.h>
  23#include <linux/gfp.h>
  24#include <linux/sched.h>
  25#include <linux/swap.h>
  26#include <linux/timex.h>
  27#include <linux/jiffies.h>
  28#include <linux/cpuset.h>
  29#include <linux/export.h>
  30#include <linux/notifier.h>
  31#include <linux/memcontrol.h>
  32#include <linux/mempolicy.h>
  33#include <linux/security.h>
  34#include <linux/ptrace.h>
  35#include <linux/freezer.h>
  36#include <linux/ftrace.h>
  37#include <linux/ratelimit.h>
  38#include <linux/kthread.h>
  39#include <linux/init.h>
  40
  41#include <asm/tlb.h>
  42#include "internal.h"
  43
  44#define CREATE_TRACE_POINTS
  45#include <trace/events/oom.h>
  46
  47int sysctl_panic_on_oom;
  48int sysctl_oom_kill_allocating_task;
  49int sysctl_oom_dump_tasks = 1;
  50
  51DEFINE_MUTEX(oom_lock);
  52
  53#ifdef CONFIG_NUMA
  54/**
   55 * has_intersects_mems_allowed() - check task eligibility for kill
   56 * @start: task struct of the task whose threads should be considered
   57 * @mask: nodemask passed to page allocator for mempolicy ooms
   58 *
   59 * Task eligibility is determined by whether any thread of @start shares the
   60 * same mempolicy nodes as current if it is bound by such a policy, and by
   61 * whether or not it has the same set of allowed cpuset nodes.
  62 */
  63static bool has_intersects_mems_allowed(struct task_struct *start,
  64                                        const nodemask_t *mask)
  65{
  66        struct task_struct *tsk;
  67        bool ret = false;
  68
  69        rcu_read_lock();
  70        for_each_thread(start, tsk) {
  71                if (mask) {
  72                        /*
  73                         * If this is a mempolicy constrained oom, tsk's
  74                         * cpuset is irrelevant.  Only return true if its
  75                         * mempolicy intersects current, otherwise it may be
  76                         * needlessly killed.
  77                         */
  78                        ret = mempolicy_nodemask_intersects(tsk, mask);
  79                } else {
  80                        /*
  81                         * This is not a mempolicy constrained oom, so only
  82                         * check the mems of tsk's cpuset.
  83                         */
  84                        ret = cpuset_mems_allowed_intersects(current, tsk);
  85                }
  86                if (ret)
  87                        break;
  88        }
  89        rcu_read_unlock();
  90
  91        return ret;
  92}
  93#else
  94static bool has_intersects_mems_allowed(struct task_struct *tsk,
  95                                        const nodemask_t *mask)
  96{
  97        return true;
  98}
  99#endif /* CONFIG_NUMA */
 100
 101/*
 102 * The process p may have detached its own ->mm while exiting or through
 103 * use_mm(), but one or more of its subthreads may still have a valid
 104 * pointer.  Return p, or any of its subthreads with a valid ->mm, with
 105 * task_lock() held.
 106 */
 107struct task_struct *find_lock_task_mm(struct task_struct *p)
 108{
 109        struct task_struct *t;
 110
 111        rcu_read_lock();
 112
 113        for_each_thread(p, t) {
 114                task_lock(t);
 115                if (likely(t->mm))
 116                        goto found;
 117                task_unlock(t);
 118        }
 119        t = NULL;
 120found:
 121        rcu_read_unlock();
 122
 123        return t;
 124}
 125
 126/*
  127 * order == -1 means the oom kill was requested via sysrq; any other value
  128 * is only used for display purposes.
 129 */
 130static inline bool is_sysrq_oom(struct oom_control *oc)
 131{
 132        return oc->order == -1;
 133}
 134
  135/* Return true if the task is not suitable as an oom victim candidate. */
 136static bool oom_unkillable_task(struct task_struct *p,
 137                struct mem_cgroup *memcg, const nodemask_t *nodemask)
 138{
 139        if (is_global_init(p))
 140                return true;
 141        if (p->flags & PF_KTHREAD)
 142                return true;
 143
  144        /* For mem_cgroup_out_of_memory(), p must be a member of the given memcg */
 145        if (memcg && !task_in_mem_cgroup(p, memcg))
 146                return true;
 147
 148        /* p may not have freeable memory in nodemask */
 149        if (!has_intersects_mems_allowed(p, nodemask))
 150                return true;
 151
 152        return false;
 153}
 154
 155/**
 156 * oom_badness - heuristic function to determine which candidate task to kill
  157 * @p: task struct of the task whose badness score we should calculate
 158 * @totalpages: total present RAM allowed for page allocation
 159 *
 160 * The heuristic for determining which task to kill is made to be as simple and
 161 * predictable as possible.  The goal is to return the highest value for the
 162 * task consuming the most memory to avoid subsequent oom failures.
 163 */
 164unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 165                          const nodemask_t *nodemask, unsigned long totalpages)
 166{
 167        long points;
 168        long adj;
 169
 170        if (oom_unkillable_task(p, memcg, nodemask))
 171                return 0;
 172
 173        p = find_lock_task_mm(p);
 174        if (!p)
 175                return 0;
 176
 177        adj = (long)p->signal->oom_score_adj;
 178        if (adj == OOM_SCORE_ADJ_MIN) {
 179                task_unlock(p);
 180                return 0;
 181        }
 182
 183        /*
 184         * The baseline for the badness score is the proportion of RAM that each
 185         * task's rss, pagetable and swap space use.
 186         */
 187        points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
 188                atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
 189        task_unlock(p);
 190
 191        /*
  192         * Root processes get a 3% bonus, just like the __vm_enough_memory()
 193         * implementation used by LSMs.
 194         */
 195        if (has_capability_noaudit(p, CAP_SYS_ADMIN))
 196                points -= (points * 3) / 100;
 197
 198        /* Normalize to oom_score_adj units */
 199        adj *= totalpages / 1000;
 200        points += adj;
 201
 202        /*
 203         * Never return 0 for an eligible task regardless of the root bonus and
 204         * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
 205         */
 206        return points > 0 ? points : 1;
 207}
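
/*
 * Worked example of the score above (illustrative numbers only, not taken
 * from any real system): with totalpages = 1,000,000 pages, a task whose
 * rss + swap entries + page tables add up to 300,000 pages starts at
 * points = 300,000 (~30% of allowable memory).  If it has CAP_SYS_ADMIN,
 * the root bonus subtracts 3%, leaving 291,000.  An oom_score_adj of +100
 * then adds 100 * (1,000,000 / 1000) = 100,000, for a final badness of
 * 391,000.  A task with oom_score_adj = OOM_SCORE_ADJ_MIN would have been
 * rejected earlier and scored 0.
 */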
 208
 209/*
 210 * Determine the type of allocation constraint.
 211 */
 212#ifdef CONFIG_NUMA
 213static enum oom_constraint constrained_alloc(struct oom_control *oc,
 214                                             unsigned long *totalpages)
 215{
 216        struct zone *zone;
 217        struct zoneref *z;
 218        enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
 219        bool cpuset_limited = false;
 220        int nid;
 221
 222        /* Default to all available memory */
 223        *totalpages = totalram_pages + total_swap_pages;
 224
 225        if (!oc->zonelist)
 226                return CONSTRAINT_NONE;
 227        /*
  228         * Reach here only when __GFP_NOFAIL is used, so we should avoid
  229         * killing current; a random task kill is what would really be needed.
  230         * Ideally this would be CONSTRAINT_THISNODE, but we cannot handle that yet.
 231         */
 232        if (oc->gfp_mask & __GFP_THISNODE)
 233                return CONSTRAINT_NONE;
 234
 235        /*
 236         * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
 237         * the page allocator means a mempolicy is in effect.  Cpuset policy
 238         * is enforced in get_page_from_freelist().
 239         */
 240        if (oc->nodemask &&
 241            !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
 242                *totalpages = total_swap_pages;
 243                for_each_node_mask(nid, *oc->nodemask)
 244                        *totalpages += node_spanned_pages(nid);
 245                return CONSTRAINT_MEMORY_POLICY;
 246        }
 247
  248        /* Check whether this allocation failure is caused by the cpuset's mems limit */
 249        for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
 250                        high_zoneidx, oc->nodemask)
 251                if (!cpuset_zone_allowed(zone, oc->gfp_mask))
 252                        cpuset_limited = true;
 253
 254        if (cpuset_limited) {
 255                *totalpages = total_swap_pages;
 256                for_each_node_mask(nid, cpuset_current_mems_allowed)
 257                        *totalpages += node_spanned_pages(nid);
 258                return CONSTRAINT_CPUSET;
 259        }
 260        return CONSTRAINT_NONE;
 261}
 262#else
 263static enum oom_constraint constrained_alloc(struct oom_control *oc,
 264                                             unsigned long *totalpages)
 265{
 266        *totalpages = totalram_pages + total_swap_pages;
 267        return CONSTRAINT_NONE;
 268}
 269#endif
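
/*
 * Example of the accounting above (hypothetical two-node machine): if a
 * mempolicy restricts the allocation to a node with 2,000,000 spanned pages
 * and the machine has 500,000 pages of swap, badness scores are normalized
 * against totalpages = 2,500,000 rather than against all of RAM, so a
 * CONSTRAINT_MEMORY_POLICY (or CONSTRAINT_CPUSET) oom ranks tasks relative
 * to the memory they could actually free up for the constrained allocation.
 */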
 270
 271enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 272                        struct task_struct *task, unsigned long totalpages)
 273{
 274        if (oom_unkillable_task(task, NULL, oc->nodemask))
 275                return OOM_SCAN_CONTINUE;
 276
 277        /*
 278         * This task already has access to memory reserves and is being killed.
 279         * Don't allow any other task to have access to the reserves.
 280         */
 281        if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
 282                if (!is_sysrq_oom(oc))
 283                        return OOM_SCAN_ABORT;
 284        }
 285        if (!task->mm)
 286                return OOM_SCAN_CONTINUE;
 287
 288        /*
 289         * If task is allocating a lot of memory and has been marked to be
 290         * killed first if it triggers an oom, then select it.
 291         */
 292        if (oom_task_origin(task))
 293                return OOM_SCAN_SELECT;
 294
 295        return OOM_SCAN_OK;
 296}
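
/*
 * Summary of how select_bad_process() below reacts to these results:
 * OOM_SCAN_CONTINUE skips the task, OOM_SCAN_ABORT cancels the whole scan
 * (a previously selected victim still holds TIF_MEMDIE), OOM_SCAN_SELECT
 * picks the task unconditionally, and OOM_SCAN_OK defers to oom_badness().
 */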
 297
 298/*
  299 * Simple selection loop. We choose the process with the highest
 300 * number of 'points'.  Returns -1 on scan abort.
 301 */
 302static struct task_struct *select_bad_process(struct oom_control *oc,
 303                unsigned int *ppoints, unsigned long totalpages)
 304{
 305        struct task_struct *g, *p;
 306        struct task_struct *chosen = NULL;
 307        unsigned long chosen_points = 0;
 308
 309        rcu_read_lock();
 310        for_each_process_thread(g, p) {
 311                unsigned int points;
 312
 313                switch (oom_scan_process_thread(oc, p, totalpages)) {
 314                case OOM_SCAN_SELECT:
 315                        chosen = p;
 316                        chosen_points = ULONG_MAX;
 317                        /* fall through */
 318                case OOM_SCAN_CONTINUE:
 319                        continue;
 320                case OOM_SCAN_ABORT:
 321                        rcu_read_unlock();
 322                        return (struct task_struct *)(-1UL);
 323                case OOM_SCAN_OK:
 324                        break;
  325                }
 326                points = oom_badness(p, NULL, oc->nodemask, totalpages);
 327                if (!points || points < chosen_points)
 328                        continue;
 329                /* Prefer thread group leaders for display purposes */
 330                if (points == chosen_points && thread_group_leader(chosen))
 331                        continue;
 332
 333                chosen = p;
 334                chosen_points = points;
 335        }
 336        if (chosen)
 337                get_task_struct(chosen);
 338        rcu_read_unlock();
 339
 340        *ppoints = chosen_points * 1000 / totalpages;
 341        return chosen;
 342}
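
/*
 * Example of the *ppoints scaling above: with totalpages = 1,000,000 and
 * chosen_points = 391,000 (see the oom_badness() example), the reported
 * value is 391,000 * 1000 / 1,000,000 = 391, i.e. badness expressed per
 * mille of allowable memory, the same scale used for /proc/<pid>/oom_score.
 */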
 343
 344/**
 345 * dump_tasks - dump current memory state of all system tasks
 346 * @memcg: current's memory controller, if constrained
 347 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 348 *
 349 * Dumps the current memory state of all eligible tasks.  Tasks not in the same
 350 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 351 * are not shown.
  352 * State information includes the task's pid, uid, tgid, vm size, rss, nr_ptes,
  353 * nr_pmds, swapents, oom_score_adj value, and name.
 354 */
 355static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 356{
 357        struct task_struct *p;
 358        struct task_struct *task;
 359
 360        pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
 361        rcu_read_lock();
 362        for_each_process(p) {
 363                if (oom_unkillable_task(p, memcg, nodemask))
 364                        continue;
 365
 366                task = find_lock_task_mm(p);
 367                if (!task) {
 368                        /*
 369                         * This is a kthread or all of p's threads have already
 370                         * detached their mm's.  There's no need to report
 371                         * them; they can't be oom killed anyway.
 372                         */
 373                        continue;
 374                }
 375
 376                pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
 377                        task->pid, from_kuid(&init_user_ns, task_uid(task)),
 378                        task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 379                        atomic_long_read(&task->mm->nr_ptes),
 380                        mm_nr_pmds(task->mm),
 381                        get_mm_counter(task->mm, MM_SWAPENTS),
 382                        task->signal->oom_score_adj, task->comm);
 383                task_unlock(task);
 384        }
 385        rcu_read_unlock();
 386}
 387
 388static void dump_header(struct oom_control *oc, struct task_struct *p,
 389                        struct mem_cgroup *memcg)
 390{
 391        pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
 392                current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
 393                current->signal->oom_score_adj);
 394
 395        cpuset_print_current_mems_allowed();
 396        dump_stack();
 397        if (memcg)
 398                mem_cgroup_print_oom_info(memcg, p);
 399        else
 400                show_mem(SHOW_MEM_FILTER_NODES);
 401        if (sysctl_oom_dump_tasks)
 402                dump_tasks(memcg, oc->nodemask);
 403}
 404
 405/*
 406 * Number of OOM victims in flight
 407 */
 408static atomic_t oom_victims = ATOMIC_INIT(0);
 409static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 410
 411bool oom_killer_disabled __read_mostly;
 412
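/* K(x) converts a count of pages into kilobytes for the messages below. */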
 413#define K(x) ((x) << (PAGE_SHIFT-10))
 414
 415#ifdef CONFIG_MMU
 416/*
 417 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 418 * victim (if that is possible) to help the OOM killer to move on.
 419 */
 420static struct task_struct *oom_reaper_th;
 421static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 422static struct task_struct *oom_reaper_list;
 423static DEFINE_SPINLOCK(oom_reaper_lock);
 424
 425
 426static bool __oom_reap_task(struct task_struct *tsk)
 427{
 428        struct mmu_gather tlb;
 429        struct vm_area_struct *vma;
 430        struct mm_struct *mm;
 431        struct task_struct *p;
 432        struct zap_details details = {.check_swap_entries = true,
 433                                      .ignore_dirty = true};
 434        bool ret = true;
 435
 436        /*
 437         * Make sure we find the associated mm_struct even when the particular
 438         * thread has already terminated and cleared its mm.
  439         * We might race with the exit path, so consider our work done if there
  440         * is no mm.
 441         */
 442        p = find_lock_task_mm(tsk);
 443        if (!p)
 444                return true;
 445
 446        mm = p->mm;
 447        if (!atomic_inc_not_zero(&mm->mm_users)) {
 448                task_unlock(p);
 449                return true;
 450        }
 451
 452        task_unlock(p);
 453
 454        if (!down_read_trylock(&mm->mmap_sem)) {
 455                ret = false;
 456                goto out;
 457        }
 458
 459        tlb_gather_mmu(&tlb, mm, 0, -1);
 460        for (vma = mm->mmap ; vma; vma = vma->vm_next) {
 461                if (is_vm_hugetlb_page(vma))
 462                        continue;
 463
 464                /*
 465                 * mlocked VMAs require explicit munlocking before unmap.
 466                 * Let's keep it simple here and skip such VMAs.
 467                 */
 468                if (vma->vm_flags & VM_LOCKED)
 469                        continue;
 470
 471                /*
 472                 * Only anonymous pages have a good chance to be dropped
 473                 * without additional steps which we cannot afford as we
 474                 * are OOM already.
 475                 *
  476                 * We do not even care about fs-backed pages, because everything
  477                 * reclaimable there has already been reclaimed and we do not
  478                 * want to block exit_mmap by keeping the mm refcount elevated
  479                 * without a good reason.
 480                 */
 481                if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
 482                        unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
 483                                         &details);
 484        }
 485        tlb_finish_mmu(&tlb, 0, -1);
 486        pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 487                        task_pid_nr(tsk), tsk->comm,
 488                        K(get_mm_counter(mm, MM_ANONPAGES)),
 489                        K(get_mm_counter(mm, MM_FILEPAGES)),
 490                        K(get_mm_counter(mm, MM_SHMEMPAGES)));
 491        up_read(&mm->mmap_sem);
 492
 493        /*
  494         * Clear TIF_MEMDIE because the task shouldn't be sitting on any
  495         * reasonably reclaimable memory anymore. The OOM killer can continue
  496         * by selecting another victim if unmapping hasn't led to any
  497         * improvement. This also means that selecting this task again
  498         * doesn't make any sense.
 499         */
 500        tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
 501        exit_oom_victim(tsk);
 502out:
 503        mmput(mm);
 504        return ret;
 505}
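
/*
 * To summarize the loop above: the reaper only unmaps anonymous and private
 * (non-VM_SHARED) mappings.  Hugetlb and mlocked VMAs are skipped outright,
 * and shared mappings are left alone because tearing them down could require
 * writeback or other work that cannot be done while we are already OOM.
 */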
 506
 507#define MAX_OOM_REAP_RETRIES 10
 508static void oom_reap_task(struct task_struct *tsk)
 509{
 510        int attempts = 0;
 511
 512        /* Retry the down_read_trylock(mmap_sem) a few times */
 513        while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
 514                schedule_timeout_idle(HZ/10);
 515
 516        if (attempts > MAX_OOM_REAP_RETRIES) {
 517                pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 518                                task_pid_nr(tsk), tsk->comm);
 519                debug_show_all_locks();
 520        }
 521
 522        /* Drop a reference taken by wake_oom_reaper */
 523        put_task_struct(tsk);
 524}
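
/*
 * Back-of-the-envelope retry budget: MAX_OOM_REAP_RETRIES (10) attempts
 * separated by schedule_timeout_idle(HZ/10) give the victim's mmap_sem
 * roughly one second to become available before we give up and print the
 * failure message above.
 */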
 525
 526static int oom_reaper(void *unused)
 527{
 528        set_freezable();
 529
 530        while (true) {
 531                struct task_struct *tsk = NULL;
 532
 533                wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
 534                spin_lock(&oom_reaper_lock);
 535                if (oom_reaper_list != NULL) {
 536                        tsk = oom_reaper_list;
 537                        oom_reaper_list = tsk->oom_reaper_list;
 538                }
 539                spin_unlock(&oom_reaper_lock);
 540
 541                if (tsk)
 542                        oom_reap_task(tsk);
 543        }
 544
 545        return 0;
 546}
 547
 548static void wake_oom_reaper(struct task_struct *tsk)
 549{
 550        if (!oom_reaper_th)
 551                return;
 552
 553        /* tsk is already queued? */
 554        if (tsk == oom_reaper_list || tsk->oom_reaper_list)
 555                return;
 556
 557        get_task_struct(tsk);
 558
 559        spin_lock(&oom_reaper_lock);
 560        tsk->oom_reaper_list = oom_reaper_list;
 561        oom_reaper_list = tsk;
 562        spin_unlock(&oom_reaper_lock);
 563        wake_up(&oom_reaper_wait);
 564}
 565
 566static int __init oom_init(void)
 567{
 568        oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
 569        if (IS_ERR(oom_reaper_th)) {
 570                pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
 571                                PTR_ERR(oom_reaper_th));
 572                oom_reaper_th = NULL;
 573        }
 574        return 0;
 575}
 576subsys_initcall(oom_init)
 577#else
 578static void wake_oom_reaper(struct task_struct *tsk)
 579{
 580}
 581#endif
 582
 583/**
 584 * mark_oom_victim - mark the given task as OOM victim
 585 * @tsk: task to mark
 586 *
  587 * Has to be called with oom_lock held, and never after the oom killer
  588 * has already been disabled.
 589 */
 590void mark_oom_victim(struct task_struct *tsk)
 591{
 592        WARN_ON(oom_killer_disabled);
 593        /* OOM killer might race with memcg OOM */
 594        if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 595                return;
 596        /*
 597         * Make sure that the task is woken up from uninterruptible sleep
  598         * if it is frozen, because otherwise the OOM killer could not free
  599         * any memory and would livelock. freezing_slow_path will tell the
  600         * freezer that TIF_MEMDIE tasks should be ignored.
 601         */
 602        __thaw_task(tsk);
 603        atomic_inc(&oom_victims);
 604}
 605
 606/**
 607 * exit_oom_victim - note the exit of an OOM victim
 608 */
 609void exit_oom_victim(struct task_struct *tsk)
 610{
 611        if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
 612                return;
 613
 614        if (!atomic_dec_return(&oom_victims))
 615                wake_up_all(&oom_victims_wait);
 616}
 617
 618/**
 619 * oom_killer_disable - disable OOM killer
 620 *
  621 * Forces all page allocations to fail rather than trigger the OOM killer.
 622 * Will block and wait until all OOM victims are killed.
 623 *
  624 * The function cannot be called when there are runnable user tasks, because
  625 * userspace would see unexpected allocation failures as a result. Any new
  626 * usage of this function should be discussed with the MM people first.
 627 *
 628 * Returns true if successful and false if the OOM killer cannot be
 629 * disabled.
 630 */
 631bool oom_killer_disable(void)
 632{
 633        /*
  634         * Make sure not to race with an ongoing OOM kill. Check that the current
  635         * task has not been killed (possibly due to sharing the victim's memory).
 636         */
 637        if (mutex_lock_killable(&oom_lock))
 638                return false;
 639        oom_killer_disabled = true;
 640        mutex_unlock(&oom_lock);
 641
 642        wait_event(oom_victims_wait, !atomic_read(&oom_victims));
 643
 644        return true;
 645}
 646
 647/**
 648 * oom_killer_enable - enable OOM killer
 649 */
 650void oom_killer_enable(void)
 651{
 652        oom_killer_disabled = false;
 653}
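
/*
 * Typical pairing of the two functions above (a sketch only; the real call
 * sites live outside this file, e.g. in the suspend/hibernation freezing
 * path):
 *
 *      if (!oom_killer_disable())
 *              return -EBUSY;          (caller got a fatal signal while waiting)
 *      ... work that must not race with the OOM killer ...
 *      oom_killer_enable();
 */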
 654
 655/*
 656 * task->mm can be NULL if the task is the exited group leader.  So to
 657 * determine whether the task is using a particular mm, we examine all the
 658 * task's threads: if one of those is using this mm then this task was also
 659 * using it.
 660 */
 661static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 662{
 663        struct task_struct *t;
 664
 665        for_each_thread(p, t) {
 666                struct mm_struct *t_mm = READ_ONCE(t->mm);
 667                if (t_mm)
 668                        return t_mm == mm;
 669        }
 670        return false;
 671}
 672
 673/*
 674 * Must be called while holding a reference to p, which will be released upon
 675 * returning.
 676 */
 677void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 678                      unsigned int points, unsigned long totalpages,
 679                      struct mem_cgroup *memcg, const char *message)
 680{
 681        struct task_struct *victim = p;
 682        struct task_struct *child;
 683        struct task_struct *t;
 684        struct mm_struct *mm;
 685        unsigned int victim_points = 0;
 686        static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 687                                              DEFAULT_RATELIMIT_BURST);
 688        bool can_oom_reap = true;
 689
 690        /*
 691         * If the task is already exiting, don't alarm the sysadmin or kill
 692         * its children or threads, just set TIF_MEMDIE so it can die quickly
 693         */
 694        task_lock(p);
 695        if (p->mm && task_will_free_mem(p)) {
 696                mark_oom_victim(p);
 697                task_unlock(p);
 698                put_task_struct(p);
 699                return;
 700        }
 701        task_unlock(p);
 702
 703        if (__ratelimit(&oom_rs))
 704                dump_header(oc, p, memcg);
 705
 706        pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
 707                message, task_pid_nr(p), p->comm, points);
 708
 709        /*
 710         * If any of p's children has a different mm and is eligible for kill,
 711         * the one with the highest oom_badness() score is sacrificed for its
 712         * parent.  This attempts to lose the minimal amount of work done while
 713         * still freeing memory.
 714         */
 715        read_lock(&tasklist_lock);
 716        for_each_thread(p, t) {
 717                list_for_each_entry(child, &t->children, sibling) {
 718                        unsigned int child_points;
 719
 720                        if (process_shares_mm(child, p->mm))
 721                                continue;
 722                        /*
 723                         * oom_badness() returns 0 if the thread is unkillable
 724                         */
 725                        child_points = oom_badness(child, memcg, oc->nodemask,
 726                                                                totalpages);
 727                        if (child_points > victim_points) {
 728                                put_task_struct(victim);
 729                                victim = child;
 730                                victim_points = child_points;
 731                                get_task_struct(victim);
 732                        }
 733                }
 734        }
 735        read_unlock(&tasklist_lock);
 736
 737        p = find_lock_task_mm(victim);
 738        if (!p) {
 739                put_task_struct(victim);
 740                return;
 741        } else if (victim != p) {
 742                get_task_struct(p);
 743                put_task_struct(victim);
 744                victim = p;
 745        }
 746
 747        /* Get a reference to safely compare mm after task_unlock(victim) */
 748        mm = victim->mm;
 749        atomic_inc(&mm->mm_count);
 750        /*
 751         * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
  752         * the OOM victim from depleting the memory reserves from user space
  753         * under its control.
 754         */
 755        do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
 756        mark_oom_victim(victim);
 757        pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 758                task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
 759                K(get_mm_counter(victim->mm, MM_ANONPAGES)),
 760                K(get_mm_counter(victim->mm, MM_FILEPAGES)),
 761                K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
 762        task_unlock(victim);
 763
 764        /*
 765         * Kill all user processes sharing victim->mm in other thread groups, if
 766         * any.  They don't get access to memory reserves, though, to avoid
 767         * depletion of all memory.  This prevents mm->mmap_sem livelock when an
 768         * oom killed thread cannot exit because it requires the semaphore and
  769         * it's contended by another thread trying to allocate memory itself.
 770         * That thread will now get access to memory reserves since it has a
 771         * pending fatal signal.
 772         */
 773        rcu_read_lock();
 774        for_each_process(p) {
 775                if (!process_shares_mm(p, mm))
 776                        continue;
 777                if (same_thread_group(p, victim))
 778                        continue;
 779                if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
 780                    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
 781                        /*
 782                         * We cannot use oom_reaper for the mm shared by this
 783                         * process because it wouldn't get killed and so the
  784                         * memory might still be in use.
 785                         */
 786                        can_oom_reap = false;
 787                        continue;
 788                }
 789                do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
 790        }
 791        rcu_read_unlock();
 792
 793        if (can_oom_reap)
 794                wake_oom_reaper(victim);
 795
 796        mmdrop(mm);
 797        put_task_struct(victim);
 798}
 799#undef K
 800
 801/*
 802 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 803 */
 804void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
 805                        struct mem_cgroup *memcg)
 806{
 807        if (likely(!sysctl_panic_on_oom))
 808                return;
 809        if (sysctl_panic_on_oom != 2) {
 810                /*
 811                 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
 812                 * does not panic for cpuset, mempolicy, or memcg allocation
 813                 * failures.
 814                 */
 815                if (constraint != CONSTRAINT_NONE)
 816                        return;
 817        }
 818        /* Do not panic for oom kills triggered by sysrq */
 819        if (is_sysrq_oom(oc))
 820                return;
 821        dump_header(oc, NULL, memcg);
 822        panic("Out of memory: %s panic_on_oom is enabled\n",
 823                sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
 824}
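
/*
 * In other words: panic_on_oom == 0 never panics here, == 1 panics only for
 * system-wide (CONSTRAINT_NONE) ooms, and == 2 panics for any constraint,
 * while an oom kill triggered by sysrq never panics regardless of the value.
 */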
 825
 826static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
 827
 828int register_oom_notifier(struct notifier_block *nb)
 829{
 830        return blocking_notifier_chain_register(&oom_notify_list, nb);
 831}
 832EXPORT_SYMBOL_GPL(register_oom_notifier);
 833
 834int unregister_oom_notifier(struct notifier_block *nb)
 835{
 836        return blocking_notifier_chain_unregister(&oom_notify_list, nb);
 837}
 838EXPORT_SYMBOL_GPL(unregister_oom_notifier);
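
/*
 * Illustrative sketch of a notifier-chain client (the callback and the
 * cache-shrinking helper below are hypothetical, not part of this file).
 * out_of_memory() invokes the chain with a pointer to an unsigned long;
 * each callback adds the number of pages it freed so the allocation can be
 * retried instead of killing a task:
 *
 *      static int example_oom_notify(struct notifier_block *nb,
 *                                    unsigned long action, void *arg)
 *      {
 *              unsigned long *freed = arg;
 *
 *              *freed += example_shrink_private_cache();
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block example_oom_nb = {
 *              .notifier_call = example_oom_notify,
 *      };
 *
 *      register_oom_notifier(&example_oom_nb);
 */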
 839
 840/**
 841 * out_of_memory - kill the "best" process when we run out of memory
 842 * @oc: pointer to struct oom_control
 843 *
 844 * If we run out of memory, we have the choice between either
 845 * killing a random task (bad), letting the system crash (worse)
 846 * OR try to be smart about which process to kill. Note that we
 847 * don't have to be perfect here, we just have to be good.
 848 */
 849bool out_of_memory(struct oom_control *oc)
 850{
 851        struct task_struct *p;
 852        unsigned long totalpages;
 853        unsigned long freed = 0;
 854        unsigned int uninitialized_var(points);
 855        enum oom_constraint constraint = CONSTRAINT_NONE;
 856
 857        if (oom_killer_disabled)
 858                return false;
 859
 860        blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 861        if (freed > 0)
 862                /* Got some memory back in the last second. */
 863                return true;
 864
 865        /*
 866         * If current has a pending SIGKILL or is exiting, then automatically
 867         * select it.  The goal is to allow it to allocate so that it may
 868         * quickly exit and free its memory.
 869         *
 870         * But don't select if current has already released its mm and cleared
 871         * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
 872         */
 873        if (current->mm &&
 874            (fatal_signal_pending(current) || task_will_free_mem(current))) {
 875                mark_oom_victim(current);
 876                return true;
 877        }
 878
 879        /*
 880         * Check if there were limitations on the allocation (only relevant for
 881         * NUMA) that may require different handling.
 882         */
 883        constraint = constrained_alloc(oc, &totalpages);
 884        if (constraint != CONSTRAINT_MEMORY_POLICY)
 885                oc->nodemask = NULL;
 886        check_panic_on_oom(oc, constraint, NULL);
 887
 888        if (sysctl_oom_kill_allocating_task && current->mm &&
 889            !oom_unkillable_task(current, NULL, oc->nodemask) &&
 890            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 891                get_task_struct(current);
 892                oom_kill_process(oc, current, 0, totalpages, NULL,
 893                                 "Out of memory (oom_kill_allocating_task)");
 894                return true;
 895        }
 896
 897        p = select_bad_process(oc, &points, totalpages);
 898        /* Found nothing?!?! Either we hang forever, or we panic. */
 899        if (!p && !is_sysrq_oom(oc)) {
 900                dump_header(oc, NULL, NULL);
 901                panic("Out of memory and no killable processes...\n");
 902        }
 903        if (p && p != (void *)-1UL) {
 904                oom_kill_process(oc, p, points, totalpages, NULL,
 905                                 "Out of memory");
 906                /*
 907                 * Give the killed process a good chance to exit before trying
 908                 * to allocate memory again.
 909                 */
 910                schedule_timeout_killable(1);
 911        }
 912        return true;
 913}
 914
 915/*
 916 * The pagefault handler calls here because it is out of memory, so kill a
  917 * memory-hogging task.  If the oom_lock mutex is already held, a parallel
  918 * oom kill is already in progress, so do nothing.
 919 */
 920void pagefault_out_of_memory(void)
 921{
 922        struct oom_control oc = {
 923                .zonelist = NULL,
 924                .nodemask = NULL,
 925                .gfp_mask = 0,
 926                .order = 0,
 927        };
 928
 929        if (mem_cgroup_oom_synchronize(true))
 930                return;
 931
 932        if (!mutex_trylock(&oom_lock))
 933                return;
 934
 935        if (!out_of_memory(&oc)) {
 936                /*
 937                 * There shouldn't be any user tasks runnable while the
 938                 * OOM killer is disabled, so the current task has to
  939                 * be a racing OOM victim which oom_killer_disable()
  940                 * is waiting for.
 941                 */
 942                WARN_ON(test_thread_flag(TIF_MEMDIE));
 943        }
 944
 945        mutex_unlock(&oom_lock);
 946}
 947