linux/mm/vmstat.c
   1/*
   2 *  linux/mm/vmstat.c
   3 *
   4 *  Manages VM statistics
   5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6 *
   7 *  zoned VM statistics
   8 *  Copyright (C) 2006 Silicon Graphics, Inc.,
   9 *              Christoph Lameter <christoph@lameter.com>
  10 *  Copyright (C) 2008-2014 Christoph Lameter
  11 */
  12#include <linux/fs.h>
  13#include <linux/mm.h>
  14#include <linux/err.h>
  15#include <linux/module.h>
  16#include <linux/slab.h>
  17#include <linux/cpu.h>
  18#include <linux/cpumask.h>
  19#include <linux/vmstat.h>
  20#include <linux/proc_fs.h>
  21#include <linux/seq_file.h>
  22#include <linux/debugfs.h>
  23#include <linux/sched.h>
  24#include <linux/math64.h>
  25#include <linux/writeback.h>
  26#include <linux/compaction.h>
  27#include <linux/mm_inline.h>
  28#include <linux/page_ext.h>
  29#include <linux/page_owner.h>
  30
  31#include "internal.h"
  32
  33#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
  34
  35#ifdef CONFIG_NUMA
  36int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
  37
  38/* zero numa counters within a zone */
  39static void zero_zone_numa_counters(struct zone *zone)
  40{
  41        int item, cpu;
  42
  43        for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
  44                atomic_long_set(&zone->vm_numa_stat[item], 0);
  45                for_each_online_cpu(cpu)
  46                        per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
  47                                                = 0;
  48        }
  49}
  50
  51/* zero numa counters of all the populated zones */
  52static void zero_zones_numa_counters(void)
  53{
  54        struct zone *zone;
  55
  56        for_each_populated_zone(zone)
  57                zero_zone_numa_counters(zone);
  58}
  59
  60/* zero global numa counters */
  61static void zero_global_numa_counters(void)
  62{
  63        int item;
  64
  65        for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
  66                atomic_long_set(&vm_numa_stat[item], 0);
  67}
  68
  69static void invalid_numa_statistics(void)
  70{
  71        zero_zones_numa_counters();
  72        zero_global_numa_counters();
  73}
  74
  75static DEFINE_MUTEX(vm_numa_stat_lock);
  76
  77int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
  78                void __user *buffer, size_t *length, loff_t *ppos)
  79{
  80        int ret, oldval;
  81
  82        mutex_lock(&vm_numa_stat_lock);
  83        if (write)
  84                oldval = sysctl_vm_numa_stat;
  85        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
  86        if (ret || !write)
  87                goto out;
  88
  89        if (oldval == sysctl_vm_numa_stat)
  90                goto out;
  91        else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
  92                static_branch_enable(&vm_numa_stat_key);
  93                pr_info("enable numa statistics\n");
  94        } else {
  95                static_branch_disable(&vm_numa_stat_key);
  96                invalid_numa_statistics();
  97                pr_info("disable numa statistics, and clear numa counters\n");
  98        }
  99
 100out:
 101        mutex_unlock(&vm_numa_stat_lock);
 102        return ret;
 103}
 104#endif
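
/*
 * Editor's note: a minimal userspace sketch of how the handler above is
 * normally exercised, assuming the knob is exposed as /proc/sys/vm/numa_stat
 * (vm.numa_stat).  Writing 0 disables NUMA statistics collection and clears
 * the counters; writing 1 re-enables it.  Illustrative only, never built.
 */
#if 0
#include <stdio.h>

static int set_numa_stat(int enable)
{
        FILE *f = fopen("/proc/sys/vm/numa_stat", "w");

        if (!f)
                return -1;
        fprintf(f, "%d\n", enable ? 1 : 0);
        return fclose(f);
}
#endif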
 105
 106#ifdef CONFIG_VM_EVENT_COUNTERS
 107DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 108EXPORT_PER_CPU_SYMBOL(vm_event_states);
 109
 110static void sum_vm_events(unsigned long *ret)
 111{
 112        int cpu;
 113        int i;
 114
 115        memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
 116
 117        for_each_online_cpu(cpu) {
 118                struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 119
 120                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
 121                        ret[i] += this->event[i];
 122        }
 123}
 124
 125/*
 126 * Accumulate the vm event counters across all CPUs.
 127 * The result is unavoidably approximate - it can change
 128 * during and after execution of this function.
  129 */
 130void all_vm_events(unsigned long *ret)
 131{
 132        get_online_cpus();
 133        sum_vm_events(ret);
 134        put_online_cpus();
 135}
 136EXPORT_SYMBOL_GPL(all_vm_events);
 137
 138/*
 139 * Fold the foreign cpu events into our own.
 140 *
 141 * This is adding to the events on one processor
 142 * but keeps the global counts constant.
 143 */
 144void vm_events_fold_cpu(int cpu)
 145{
 146        struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
 147        int i;
 148
 149        for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
 150                count_vm_events(i, fold_state->event[i]);
 151                fold_state->event[i] = 0;
 152        }
 153}
 154
 155#endif /* CONFIG_VM_EVENT_COUNTERS */
 156
 157/*
 158 * Manage combined zone based / global counters
 159 *
 160 * vm_stat contains the global counters
 161 */
 162atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
 163atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
 164atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
 165EXPORT_SYMBOL(vm_zone_stat);
 166EXPORT_SYMBOL(vm_numa_stat);
 167EXPORT_SYMBOL(vm_node_stat);
 168
 169#ifdef CONFIG_SMP
 170
 171int calculate_pressure_threshold(struct zone *zone)
 172{
 173        int threshold;
 174        int watermark_distance;
 175
 176        /*
 177         * As vmstats are not up to date, there is drift between the estimated
 178         * and real values. For high thresholds and a high number of CPUs, it
 179         * is possible for the min watermark to be breached while the estimated
 180         * value looks fine. The pressure threshold is a reduced value such
 181         * that even the maximum amount of drift will not accidentally breach
  182         * the min watermark.
 183         */
 184        watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
 185        threshold = max(1, (int)(watermark_distance / num_online_cpus()));
 186
 187        /*
 188         * Maximum threshold is 125
 189         */
 190        threshold = min(125, threshold);
 191
 192        return threshold;
 193}
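
/*
 * Editor's note: a standalone sketch of the arithmetic above with made-up
 * watermark numbers, kept under #if 0 so it is never built.  It mirrors the
 * clamp to [1, 125] that calculate_pressure_threshold() applies.
 */
#if 0
#include <stdio.h>

int main(void)
{
        int low_wmark = 1536, min_wmark = 512, online_cpus = 8;
        int distance = low_wmark - min_wmark;           /* 1024 pages */
        int threshold = distance / online_cpus;         /* 128 */

        if (threshold < 1)
                threshold = 1;
        if (threshold > 125)
                threshold = 125;                        /* capped, as above */
        printf("pressure threshold = %d\n", threshold); /* prints 125 */
        return 0;
}
#endif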
 194
 195int calculate_normal_threshold(struct zone *zone)
 196{
 197        int threshold;
 198        int mem;        /* memory in 128 MB units */
 199
 200        /*
 201         * The threshold scales with the number of processors and the amount
 202         * of memory per zone. More memory means that we can defer updates for
 203         * longer, more processors could lead to more contention.
 204         * fls() is used to have a cheap way of logarithmic scaling.
 205         *
 206         * Some sample thresholds:
 207         *
 208         * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
 209         * ------------------------------------------------------------------
 210         * 8            1               1       0.9-1 GB        4
 211         * 16           2               2       0.9-1 GB        4
 212         * 20           2               2       1-2 GB          5
 213         * 24           2               2       2-4 GB          6
 214         * 28           2               2       4-8 GB          7
 215         * 32           2               2       8-16 GB         8
 216         * 4            2               2       <128M           1
 217         * 30           4               3       2-4 GB          5
 218         * 48           4               3       8-16 GB         8
 219         * 32           8               4       1-2 GB          4
 220         * 32           8               4       0.9-1GB         4
 221         * 10           16              5       <128M           1
 222         * 40           16              5       900M            4
 223         * 70           64              7       2-4 GB          5
 224         * 84           64              7       4-8 GB          6
 225         * 108          512             9       4-8 GB          6
 226         * 125          1024            10      8-16 GB         8
 227         * 125          1024            10      16-32 GB        9
 228         */
 229
 230        mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
 231
 232        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
 233
 234        /*
 235         * Maximum threshold is 125
 236         */
 237        threshold = min(125, threshold);
 238
 239        return threshold;
 240}
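
/*
 * Editor's note: a worked instance of the formula above, matching the
 * "16 / 2 processors / 0.9-1 GB" sample row.  Made-up zone size; never
 * built.  fls() is approximated with a GCC builtin for the sketch.
 */
#if 0
static int fls_sketch(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

static int normal_threshold_sketch(void)
{
        int online_cpus = 2;
        int mem = (900 * 1024 * 1024UL) >> 27;  /* ~900 MB zone => 7 units of 128 MB */
        int threshold = 2 * fls_sketch(online_cpus) * (1 + fls_sketch(mem));

        /* 2 * fls(2) * (1 + fls(7)) = 2 * 2 * 4 = 16, below the 125 cap */
        return threshold < 125 ? threshold : 125;
}
#endif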
 241
 242/*
 243 * Refresh the thresholds for each zone.
 244 */
 245void refresh_zone_stat_thresholds(void)
 246{
 247        struct pglist_data *pgdat;
 248        struct zone *zone;
 249        int cpu;
 250        int threshold;
 251
 252        /* Zero current pgdat thresholds */
 253        for_each_online_pgdat(pgdat) {
 254                for_each_online_cpu(cpu) {
 255                        per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
 256                }
 257        }
 258
 259        for_each_populated_zone(zone) {
 260                struct pglist_data *pgdat = zone->zone_pgdat;
 261                unsigned long max_drift, tolerate_drift;
 262
 263                threshold = calculate_normal_threshold(zone);
 264
 265                for_each_online_cpu(cpu) {
 266                        int pgdat_threshold;
 267
 268                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 269                                                        = threshold;
 270
 271                        /* Base nodestat threshold on the largest populated zone. */
 272                        pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
 273                        per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
 274                                = max(threshold, pgdat_threshold);
 275                }
 276
 277                /*
 278                 * Only set percpu_drift_mark if there is a danger that
 279                 * NR_FREE_PAGES reports the low watermark is ok when in fact
 280                 * the min watermark could be breached by an allocation
  281                 * the min watermark could be breached by an allocation.
 282                tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
 283                max_drift = num_online_cpus() * threshold;
 284                if (max_drift > tolerate_drift)
 285                        zone->percpu_drift_mark = high_wmark_pages(zone) +
 286                                        max_drift;
 287        }
 288}
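
/*
 * Editor's note: illustrative numbers for the drift check above (made up,
 * not from the source).  With 64 online CPUs and a per-cpu threshold of
 * 125, NR_FREE_PAGES can lag reality by up to 64 * 125 = 8000 pages; if
 * low_wmark - min_wmark is only, say, 2048 pages, percpu_drift_mark is set
 * to high_wmark + 8000 so that callers can fall back to a more precise
 * (but slower) snapshot of the counter once free pages drop below it.
 */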
 289
 290void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 291                                int (*calculate_pressure)(struct zone *))
 292{
 293        struct zone *zone;
 294        int cpu;
 295        int threshold;
 296        int i;
 297
 298        for (i = 0; i < pgdat->nr_zones; i++) {
 299                zone = &pgdat->node_zones[i];
 300                if (!zone->percpu_drift_mark)
 301                        continue;
 302
 303                threshold = (*calculate_pressure)(zone);
 304                for_each_online_cpu(cpu)
 305                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 306                                                        = threshold;
 307        }
 308}
 309
 310/*
 311 * For use when we know that interrupts are disabled,
 312 * or when we know that preemption is disabled and that
 313 * particular counter cannot be updated from interrupt context.
 314 */
 315void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 316                           long delta)
 317{
 318        struct per_cpu_pageset __percpu *pcp = zone->pageset;
 319        s8 __percpu *p = pcp->vm_stat_diff + item;
 320        long x;
 321        long t;
 322
 323        x = delta + __this_cpu_read(*p);
 324
 325        t = __this_cpu_read(pcp->stat_threshold);
 326
 327        if (unlikely(x > t || x < -t)) {
 328                zone_page_state_add(x, zone, item);
 329                x = 0;
 330        }
 331        __this_cpu_write(*p, x);
 332}
 333EXPORT_SYMBOL(__mod_zone_page_state);
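
/*
 * Editor's note: a userspace sketch (C11 atomics, never built) of the
 * pattern used by __mod_zone_page_state() above: updates accumulate in a
 * small per-thread ("per-cpu") diff and are folded into the shared counter
 * only once they cross the threshold, trading accuracy of the global value
 * for far fewer writes to shared cachelines.
 */
#if 0
#include <stdatomic.h>

static atomic_long global_stat;                 /* analogue of vm_zone_stat[] */
static _Thread_local long local_diff;           /* analogue of vm_stat_diff[] */
static _Thread_local long local_threshold = 32; /* analogue of stat_threshold */

static void mod_stat_sketch(long delta)
{
        long x = local_diff + delta;

        if (x > local_threshold || x < -local_threshold) {
                atomic_fetch_add(&global_stat, x);      /* fold into global */
                x = 0;
        }
        local_diff = x;
}
#endif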
 334
 335void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 336                                long delta)
 337{
 338        struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 339        s8 __percpu *p = pcp->vm_node_stat_diff + item;
 340        long x;
 341        long t;
 342
 343        if (vmstat_item_in_bytes(item)) {
 344                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 345                delta >>= PAGE_SHIFT;
 346        }
 347
 348        x = delta + __this_cpu_read(*p);
 349
 350        t = __this_cpu_read(pcp->stat_threshold);
 351
 352        if (unlikely(x > t || x < -t)) {
 353                node_page_state_add(x, pgdat, item);
 354                x = 0;
 355        }
 356        __this_cpu_write(*p, x);
 357}
 358EXPORT_SYMBOL(__mod_node_page_state);
 359
 360/*
 361 * Optimized increment and decrement functions.
 362 *
 363 * These are only for a single page and therefore can take a struct page *
 364 * argument instead of struct zone *. This allows the inclusion of the code
 365 * generated for page_zone(page) into the optimized functions.
 366 *
 367 * No overflow check is necessary and therefore the differential can be
 368 * incremented or decremented in place which may allow the compilers to
 369 * generate better code.
 370 * The increment or decrement is known and therefore one boundary check can
 371 * be omitted.
 372 *
 373 * NOTE: These functions are very performance sensitive. Change only
 374 * with care.
 375 *
 376 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 377 * However, the code must first determine the differential location in a zone
 378 * based on the processor number and then inc/dec the counter. There is no
 379 * guarantee without disabling preemption that the processor will not change
 380 * in between and therefore the atomicity vs. interrupt cannot be exploited
 381 * in a useful way here.
 382 */
 383void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 384{
 385        struct per_cpu_pageset __percpu *pcp = zone->pageset;
 386        s8 __percpu *p = pcp->vm_stat_diff + item;
 387        s8 v, t;
 388
 389        v = __this_cpu_inc_return(*p);
 390        t = __this_cpu_read(pcp->stat_threshold);
 391        if (unlikely(v > t)) {
 392                s8 overstep = t >> 1;
 393
 394                zone_page_state_add(v + overstep, zone, item);
 395                __this_cpu_write(*p, -overstep);
 396        }
 397}
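
/*
 * Editor's note: a small simulation (never built) of the "overstep" trick
 * above.  After a fold the per-cpu diff is left at -t/2 rather than 0, so a
 * run of increments folds roughly every 1.5 * t updates instead of every t,
 * while the global counter plus the diff still always equals the true count.
 */
#if 0
#include <stdio.h>

int main(void)
{
        int t = 32, overstep = t >> 1, diff = 0, i;
        long folded = 0;

        for (i = 0; i < 100; i++) {             /* 100 increments */
                if (++diff > t) {
                        folded += diff + overstep;
                        diff = -overstep;
                }
        }
        /* prints "folded=98 diff=2": 98 + 2 == 100 increments accounted for */
        printf("folded=%ld diff=%d\n", folded, diff);
        return 0;
}
#endif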
 398
 399void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 400{
 401        struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 402        s8 __percpu *p = pcp->vm_node_stat_diff + item;
 403        s8 v, t;
 404
 405        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
 406
 407        v = __this_cpu_inc_return(*p);
 408        t = __this_cpu_read(pcp->stat_threshold);
 409        if (unlikely(v > t)) {
 410                s8 overstep = t >> 1;
 411
 412                node_page_state_add(v + overstep, pgdat, item);
 413                __this_cpu_write(*p, -overstep);
 414        }
 415}
 416
 417void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 418{
 419        __inc_zone_state(page_zone(page), item);
 420}
 421EXPORT_SYMBOL(__inc_zone_page_state);
 422
 423void __inc_node_page_state(struct page *page, enum node_stat_item item)
 424{
 425        __inc_node_state(page_pgdat(page), item);
 426}
 427EXPORT_SYMBOL(__inc_node_page_state);
 428
 429void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 430{
 431        struct per_cpu_pageset __percpu *pcp = zone->pageset;
 432        s8 __percpu *p = pcp->vm_stat_diff + item;
 433        s8 v, t;
 434
 435        v = __this_cpu_dec_return(*p);
 436        t = __this_cpu_read(pcp->stat_threshold);
  437        if (unlikely(v < -t)) {
 438                s8 overstep = t >> 1;
 439
 440                zone_page_state_add(v - overstep, zone, item);
 441                __this_cpu_write(*p, overstep);
 442        }
 443}
 444
 445void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 446{
 447        struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 448        s8 __percpu *p = pcp->vm_node_stat_diff + item;
 449        s8 v, t;
 450
 451        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
 452
 453        v = __this_cpu_dec_return(*p);
 454        t = __this_cpu_read(pcp->stat_threshold);
  455        if (unlikely(v < -t)) {
 456                s8 overstep = t >> 1;
 457
 458                node_page_state_add(v - overstep, pgdat, item);
 459                __this_cpu_write(*p, overstep);
 460        }
 461}
 462
 463void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 464{
 465        __dec_zone_state(page_zone(page), item);
 466}
 467EXPORT_SYMBOL(__dec_zone_page_state);
 468
 469void __dec_node_page_state(struct page *page, enum node_stat_item item)
 470{
 471        __dec_node_state(page_pgdat(page), item);
 472}
 473EXPORT_SYMBOL(__dec_node_page_state);
 474
 475#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 476/*
 477 * If we have cmpxchg_local support then we do not need to incur the overhead
 478 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 479 *
 480 * mod_state() modifies the zone counter state through atomic per cpu
 481 * operations.
 482 *
  483 * Overstep mode specifies how overstep should be handled:
 484 *     0       No overstepping
 485 *     1       Overstepping half of threshold
 486 *     -1      Overstepping minus half of threshold
  487 */
 488static inline void mod_zone_state(struct zone *zone,
 489       enum zone_stat_item item, long delta, int overstep_mode)
 490{
 491        struct per_cpu_pageset __percpu *pcp = zone->pageset;
 492        s8 __percpu *p = pcp->vm_stat_diff + item;
 493        long o, n, t, z;
 494
 495        do {
 496                z = 0;  /* overflow to zone counters */
 497
 498                /*
 499                 * The fetching of the stat_threshold is racy. We may apply
  500                 * a counter threshold to the wrong cpu if we get
  501                 * rescheduled while executing here. However, the next
  502                 * counter update will apply the threshold again and
  503                 * therefore bring the counter under the threshold again.
  504                 *
  505                 * Most of the time the thresholds are the same anyway
 506                 * for all cpus in a zone.
 507                 */
 508                t = this_cpu_read(pcp->stat_threshold);
 509
 510                o = this_cpu_read(*p);
 511                n = delta + o;
 512
 513                if (n > t || n < -t) {
  514                        int os = overstep_mode * (t >> 1);
 515
 516                        /* Overflow must be added to zone counters */
 517                        z = n + os;
 518                        n = -os;
 519                }
 520        } while (this_cpu_cmpxchg(*p, o, n) != o);
 521
 522        if (z)
 523                zone_page_state_add(z, zone, item);
 524}
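
/*
 * Editor's note: a userspace sketch (C11, never built) of the lockless retry
 * loop above, with the overstep handling left out for brevity: read the
 * current diff, compute the new value, and publish it with a
 * compare-and-swap; if something else changed the diff in the meantime the
 * cmpxchg fails and the whole computation is redone.
 */
#if 0
#include <stdatomic.h>

static atomic_long shared_global;               /* analogue of vm_zone_stat[] */
static _Thread_local atomic_long cpu_diff;      /* stand-in for the percpu diff */

static void mod_stat_cmpxchg(long delta, long threshold)
{
        long o, n, z;

        do {
                z = 0;
                o = atomic_load(&cpu_diff);
                n = o + delta;
                if (n > threshold || n < -threshold) {
                        z = n;                  /* overflow goes to the global */
                        n = 0;
                }
        } while (!atomic_compare_exchange_weak(&cpu_diff, &o, n));

        if (z)
                atomic_fetch_add(&shared_global, z);
}
#endif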
 525
 526void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 527                         long delta)
 528{
 529        mod_zone_state(zone, item, delta, 0);
 530}
 531EXPORT_SYMBOL(mod_zone_page_state);
 532
 533void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 534{
 535        mod_zone_state(page_zone(page), item, 1, 1);
 536}
 537EXPORT_SYMBOL(inc_zone_page_state);
 538
 539void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 540{
 541        mod_zone_state(page_zone(page), item, -1, -1);
 542}
 543EXPORT_SYMBOL(dec_zone_page_state);
 544
 545static inline void mod_node_state(struct pglist_data *pgdat,
 546       enum node_stat_item item, int delta, int overstep_mode)
 547{
 548        struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 549        s8 __percpu *p = pcp->vm_node_stat_diff + item;
 550        long o, n, t, z;
 551
 552        if (vmstat_item_in_bytes(item)) {
 553                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 554                delta >>= PAGE_SHIFT;
 555        }
 556
 557        do {
 558                z = 0;  /* overflow to node counters */
 559
 560                /*
 561                 * The fetching of the stat_threshold is racy. We may apply
  562                 * a counter threshold to the wrong cpu if we get
  563                 * rescheduled while executing here. However, the next
  564                 * counter update will apply the threshold again and
  565                 * therefore bring the counter under the threshold again.
  566                 *
  567                 * Most of the time the thresholds are the same anyway
 568                 * for all cpus in a node.
 569                 */
 570                t = this_cpu_read(pcp->stat_threshold);
 571
 572                o = this_cpu_read(*p);
 573                n = delta + o;
 574
 575                if (n > t || n < -t) {
  576                        int os = overstep_mode * (t >> 1);
 577
 578                        /* Overflow must be added to node counters */
 579                        z = n + os;
 580                        n = -os;
 581                }
 582        } while (this_cpu_cmpxchg(*p, o, n) != o);
 583
 584        if (z)
 585                node_page_state_add(z, pgdat, item);
 586}
 587
 588void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 589                                        long delta)
 590{
 591        mod_node_state(pgdat, item, delta, 0);
 592}
 593EXPORT_SYMBOL(mod_node_page_state);
 594
 595void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 596{
 597        mod_node_state(pgdat, item, 1, 1);
 598}
 599
 600void inc_node_page_state(struct page *page, enum node_stat_item item)
 601{
 602        mod_node_state(page_pgdat(page), item, 1, 1);
 603}
 604EXPORT_SYMBOL(inc_node_page_state);
 605
 606void dec_node_page_state(struct page *page, enum node_stat_item item)
 607{
 608        mod_node_state(page_pgdat(page), item, -1, -1);
 609}
 610EXPORT_SYMBOL(dec_node_page_state);
 611#else
 612/*
 613 * Use interrupt disable to serialize counter updates
 614 */
 615void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 616                         long delta)
 617{
 618        unsigned long flags;
 619
 620        local_irq_save(flags);
 621        __mod_zone_page_state(zone, item, delta);
 622        local_irq_restore(flags);
 623}
 624EXPORT_SYMBOL(mod_zone_page_state);
 625
 626void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 627{
 628        unsigned long flags;
 629        struct zone *zone;
 630
 631        zone = page_zone(page);
 632        local_irq_save(flags);
 633        __inc_zone_state(zone, item);
 634        local_irq_restore(flags);
 635}
 636EXPORT_SYMBOL(inc_zone_page_state);
 637
 638void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 639{
 640        unsigned long flags;
 641
 642        local_irq_save(flags);
 643        __dec_zone_page_state(page, item);
 644        local_irq_restore(flags);
 645}
 646EXPORT_SYMBOL(dec_zone_page_state);
 647
 648void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 649{
 650        unsigned long flags;
 651
 652        local_irq_save(flags);
 653        __inc_node_state(pgdat, item);
 654        local_irq_restore(flags);
 655}
 656EXPORT_SYMBOL(inc_node_state);
 657
 658void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 659                                        long delta)
 660{
 661        unsigned long flags;
 662
 663        local_irq_save(flags);
 664        __mod_node_page_state(pgdat, item, delta);
 665        local_irq_restore(flags);
 666}
 667EXPORT_SYMBOL(mod_node_page_state);
 668
 669void inc_node_page_state(struct page *page, enum node_stat_item item)
 670{
 671        unsigned long flags;
 672        struct pglist_data *pgdat;
 673
 674        pgdat = page_pgdat(page);
 675        local_irq_save(flags);
 676        __inc_node_state(pgdat, item);
 677        local_irq_restore(flags);
 678}
 679EXPORT_SYMBOL(inc_node_page_state);
 680
 681void dec_node_page_state(struct page *page, enum node_stat_item item)
 682{
 683        unsigned long flags;
 684
 685        local_irq_save(flags);
 686        __dec_node_page_state(page, item);
 687        local_irq_restore(flags);
 688}
 689EXPORT_SYMBOL(dec_node_page_state);
 690#endif
 691
 692/*
 693 * Fold a differential into the global counters.
 694 * Returns the number of counters updated.
 695 */
 696#ifdef CONFIG_NUMA
 697static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
 698{
 699        int i;
 700        int changes = 0;
 701
 702        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 703                if (zone_diff[i]) {
 704                        atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
 705                        changes++;
 706        }
 707
 708        for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
 709                if (numa_diff[i]) {
 710                        atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
 711                        changes++;
 712        }
 713
 714        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 715                if (node_diff[i]) {
 716                        atomic_long_add(node_diff[i], &vm_node_stat[i]);
 717                        changes++;
 718        }
 719        return changes;
 720}
 721#else
 722static int fold_diff(int *zone_diff, int *node_diff)
 723{
 724        int i;
 725        int changes = 0;
 726
 727        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 728                if (zone_diff[i]) {
 729                        atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
 730                        changes++;
 731        }
 732
 733        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 734                if (node_diff[i]) {
 735                        atomic_long_add(node_diff[i], &vm_node_stat[i]);
 736                        changes++;
 737        }
 738        return changes;
 739}
 740#endif /* CONFIG_NUMA */
 741
 742/*
 743 * Update the zone counters for the current cpu.
 744 *
 745 * Note that refresh_cpu_vm_stats strives to only access
 746 * node local memory. The per cpu pagesets on remote zones are placed
 747 * in the memory local to the processor using that pageset. So the
 748 * loop over all zones will access a series of cachelines local to
 749 * the processor.
 750 *
 751 * The call to zone_page_state_add updates the cachelines with the
 752 * statistics in the remote zone struct as well as the global cachelines
  753 * with the global counters. These could cause remote node cache line
  754 * bouncing, so such updates should only be done when necessary.
 755 *
 756 * The function returns the number of global counters updated.
 757 */
 758static int refresh_cpu_vm_stats(bool do_pagesets)
 759{
 760        struct pglist_data *pgdat;
 761        struct zone *zone;
 762        int i;
 763        int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 764#ifdef CONFIG_NUMA
 765        int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
 766#endif
 767        int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 768        int changes = 0;
 769
 770        for_each_populated_zone(zone) {
 771                struct per_cpu_pageset __percpu *p = zone->pageset;
 772
 773                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 774                        int v;
 775
 776                        v = this_cpu_xchg(p->vm_stat_diff[i], 0);
 777                        if (v) {
 778
 779                                atomic_long_add(v, &zone->vm_stat[i]);
 780                                global_zone_diff[i] += v;
 781#ifdef CONFIG_NUMA
 782                                /* 3 seconds idle till flush */
 783                                __this_cpu_write(p->expire, 3);
 784#endif
 785                        }
 786                }
 787#ifdef CONFIG_NUMA
 788                for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
 789                        int v;
 790
 791                        v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
 792                        if (v) {
 793
 794                                atomic_long_add(v, &zone->vm_numa_stat[i]);
 795                                global_numa_diff[i] += v;
 796                                __this_cpu_write(p->expire, 3);
 797                        }
 798                }
 799
 800                if (do_pagesets) {
 801                        cond_resched();
 802                        /*
 803                         * Deal with draining the remote pageset of this
  804                         * processor.
  805                         *
  806                         * Check if there are pages remaining in this pageset;
  807                         * if not, there is nothing to expire.
 808                         */
 809                        if (!__this_cpu_read(p->expire) ||
 810                               !__this_cpu_read(p->pcp.count))
 811                                continue;
 812
 813                        /*
 814                         * We never drain zones local to this processor.
 815                         */
 816                        if (zone_to_nid(zone) == numa_node_id()) {
 817                                __this_cpu_write(p->expire, 0);
 818                                continue;
 819                        }
 820
 821                        if (__this_cpu_dec_return(p->expire))
 822                                continue;
 823
 824                        if (__this_cpu_read(p->pcp.count)) {
 825                                drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
 826                                changes++;
 827                        }
 828                }
 829#endif
 830        }
 831
 832        for_each_online_pgdat(pgdat) {
 833                struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
 834
 835                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 836                        int v;
 837
 838                        v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
 839                        if (v) {
 840                                atomic_long_add(v, &pgdat->vm_stat[i]);
 841                                global_node_diff[i] += v;
 842                        }
 843                }
 844        }
 845
 846#ifdef CONFIG_NUMA
 847        changes += fold_diff(global_zone_diff, global_numa_diff,
 848                             global_node_diff);
 849#else
 850        changes += fold_diff(global_zone_diff, global_node_diff);
 851#endif
 852        return changes;
 853}
 854
 855/*
 856 * Fold the data for an offline cpu into the global array.
 857 * There cannot be any access by the offline cpu and therefore
 858 * synchronization is simplified.
 859 */
 860void cpu_vm_stats_fold(int cpu)
 861{
 862        struct pglist_data *pgdat;
 863        struct zone *zone;
 864        int i;
 865        int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 866#ifdef CONFIG_NUMA
 867        int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
 868#endif
 869        int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 870
 871        for_each_populated_zone(zone) {
 872                struct per_cpu_pageset *p;
 873
 874                p = per_cpu_ptr(zone->pageset, cpu);
 875
 876                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 877                        if (p->vm_stat_diff[i]) {
 878                                int v;
 879
 880                                v = p->vm_stat_diff[i];
 881                                p->vm_stat_diff[i] = 0;
 882                                atomic_long_add(v, &zone->vm_stat[i]);
 883                                global_zone_diff[i] += v;
 884                        }
 885
 886#ifdef CONFIG_NUMA
 887                for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
 888                        if (p->vm_numa_stat_diff[i]) {
 889                                int v;
 890
 891                                v = p->vm_numa_stat_diff[i];
 892                                p->vm_numa_stat_diff[i] = 0;
 893                                atomic_long_add(v, &zone->vm_numa_stat[i]);
 894                                global_numa_diff[i] += v;
 895                        }
 896#endif
 897        }
 898
 899        for_each_online_pgdat(pgdat) {
 900                struct per_cpu_nodestat *p;
 901
 902                p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
 903
 904                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 905                        if (p->vm_node_stat_diff[i]) {
 906                                int v;
 907
 908                                v = p->vm_node_stat_diff[i];
 909                                p->vm_node_stat_diff[i] = 0;
 910                                atomic_long_add(v, &pgdat->vm_stat[i]);
 911                                global_node_diff[i] += v;
 912                        }
 913        }
 914
 915#ifdef CONFIG_NUMA
 916        fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
 917#else
 918        fold_diff(global_zone_diff, global_node_diff);
 919#endif
 920}
 921
 922/*
  923 * This is only called if !populated_zone(zone), which implies no other users of
  924 * pset->vm_stat_diff[] exist.
 925 */
 926void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 927{
 928        int i;
 929
 930        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 931                if (pset->vm_stat_diff[i]) {
 932                        int v = pset->vm_stat_diff[i];
 933                        pset->vm_stat_diff[i] = 0;
 934                        atomic_long_add(v, &zone->vm_stat[i]);
 935                        atomic_long_add(v, &vm_zone_stat[i]);
 936                }
 937
 938#ifdef CONFIG_NUMA
 939        for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
 940                if (pset->vm_numa_stat_diff[i]) {
 941                        int v = pset->vm_numa_stat_diff[i];
 942
 943                        pset->vm_numa_stat_diff[i] = 0;
 944                        atomic_long_add(v, &zone->vm_numa_stat[i]);
 945                        atomic_long_add(v, &vm_numa_stat[i]);
 946                }
 947#endif
 948}
 949#endif
 950
 951#ifdef CONFIG_NUMA
 952void __inc_numa_state(struct zone *zone,
 953                                 enum numa_stat_item item)
 954{
 955        struct per_cpu_pageset __percpu *pcp = zone->pageset;
 956        u16 __percpu *p = pcp->vm_numa_stat_diff + item;
 957        u16 v;
 958
 959        v = __this_cpu_inc_return(*p);
 960
 961        if (unlikely(v > NUMA_STATS_THRESHOLD)) {
 962                zone_numa_state_add(v, zone, item);
 963                __this_cpu_write(*p, 0);
 964        }
 965}
 966
 967/*
 968 * Determine the per node value of a stat item. This function
 969 * is called frequently in a NUMA machine, so try to be as
 970 * frugal as possible.
 971 */
 972unsigned long sum_zone_node_page_state(int node,
 973                                 enum zone_stat_item item)
 974{
 975        struct zone *zones = NODE_DATA(node)->node_zones;
 976        int i;
 977        unsigned long count = 0;
 978
 979        for (i = 0; i < MAX_NR_ZONES; i++)
 980                count += zone_page_state(zones + i, item);
 981
 982        return count;
 983}
 984
 985/*
 986 * Determine the per node value of a numa stat item. To avoid deviation,
 987 * the per cpu stat number in vm_numa_stat_diff[] is also included.
 988 */
 989unsigned long sum_zone_numa_state(int node,
 990                                 enum numa_stat_item item)
 991{
 992        struct zone *zones = NODE_DATA(node)->node_zones;
 993        int i;
 994        unsigned long count = 0;
 995
 996        for (i = 0; i < MAX_NR_ZONES; i++)
 997                count += zone_numa_state_snapshot(zones + i, item);
 998
 999        return count;
1000}
1001
1002/*
1003 * Determine the per node value of a stat item.
1004 */
1005unsigned long node_page_state_pages(struct pglist_data *pgdat,
1006                                    enum node_stat_item item)
1007{
1008        long x = atomic_long_read(&pgdat->vm_stat[item]);
1009#ifdef CONFIG_SMP
1010        if (x < 0)
1011                x = 0;
1012#endif
1013        return x;
1014}
1015
1016unsigned long node_page_state(struct pglist_data *pgdat,
1017                              enum node_stat_item item)
1018{
1019        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1020
1021        return node_page_state_pages(pgdat, item);
1022}
1023#endif
1024
1025#ifdef CONFIG_COMPACTION
1026
1027struct contig_page_info {
1028        unsigned long free_pages;
1029        unsigned long free_blocks_total;
1030        unsigned long free_blocks_suitable;
1031};
1032
1033/*
1034 * Calculate the number of free pages in a zone, how many contiguous
1035 * pages are free and how many are large enough to satisfy an allocation of
1036 * the target size. Note that this function makes no attempt to estimate
1037 * how many suitable free blocks there *might* be if MOVABLE pages were
1038 * migrated. Calculating that is possible, but expensive and can be
 1039 * figured out from userspace.
1040 */
1041static void fill_contig_page_info(struct zone *zone,
1042                                unsigned int suitable_order,
1043                                struct contig_page_info *info)
1044{
1045        unsigned int order;
1046
1047        info->free_pages = 0;
1048        info->free_blocks_total = 0;
1049        info->free_blocks_suitable = 0;
1050
1051        for (order = 0; order < MAX_ORDER; order++) {
1052                unsigned long blocks;
1053
1054                /* Count number of free blocks */
1055                blocks = zone->free_area[order].nr_free;
1056                info->free_blocks_total += blocks;
1057
1058                /* Count free base pages */
1059                info->free_pages += blocks << order;
1060
1061                /* Count the suitable free blocks */
1062                if (order >= suitable_order)
1063                        info->free_blocks_suitable += blocks <<
1064                                                (order - suitable_order);
1065        }
1066}
1067
1068/*
1069 * A fragmentation index only makes sense if an allocation of a requested
1070 * size would fail. If that is true, the fragmentation index indicates
1071 * whether external fragmentation or a lack of memory was the problem.
1072 * The value can be used to determine if page reclaim or compaction
 1073 * should be used.
1074 */
1075static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1076{
1077        unsigned long requested = 1UL << order;
1078
1079        if (WARN_ON_ONCE(order >= MAX_ORDER))
1080                return 0;
1081
1082        if (!info->free_blocks_total)
1083                return 0;
1084
1085        /* Fragmentation index only makes sense when a request would fail */
1086        if (info->free_blocks_suitable)
1087                return -1000;
1088
1089        /*
1090         * Index is between 0 and 1 so return within 3 decimal places
1091         *
1092         * 0 => allocation would fail due to lack of memory
1093         * 1 => allocation would fail due to fragmentation
1094         */
 1095        return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
1096}
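
/*
 * Editor's note: a worked example of the index above with made-up per-order
 * free counts, never built.  Ten order-0, four order-1 and three order-2
 * free blocks give 30 free pages in 17 blocks but nothing suitable for an
 * order-3 (8 page) request, so the request would fail; the index comes out
 * at 1000 - (1000 + 30 * 1000 / 8) / 17 = 721, i.e. ~0.72, pointing at
 * external fragmentation rather than a plain lack of memory.
 */
#if 0
#include <stdio.h>

int main(void)
{
        unsigned long free_pages = 10 + (4 << 1) + (3 << 2);    /* 30 */
        unsigned long free_blocks_total = 10 + 4 + 3;           /* 17 */
        unsigned long requested = 1UL << 3;                     /* order 3 */
        long index = 1000 - (long)((1000 + free_pages * 1000 / requested) /
                                   free_blocks_total);

        printf("fragmentation index = %ld/1000\n", index);      /* 721 */
        return 0;
}
#endif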
1097
1098/*
1099 * Calculates external fragmentation within a zone wrt the given order.
1100 * It is defined as the percentage of pages found in blocks of size
1101 * less than 1 << order. It returns values in range [0, 100].
1102 */
1103unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
1104{
1105        struct contig_page_info info;
1106
1107        fill_contig_page_info(zone, order, &info);
1108        if (info.free_pages == 0)
1109                return 0;
1110
1111        return div_u64((info.free_pages -
1112                        (info.free_blocks_suitable << order)) * 100,
1113                        info.free_pages);
1114}
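
/*
 * Editor's note: with the same made-up free lists as in the sketch above
 * (30 free pages: ten order-0, four order-1 and three order-2 blocks),
 * extfrag_for_order(zone, 2) would be (30 - 3 * 4) * 100 / 30 = 60, i.e.
 * 60% of free memory sits in blocks smaller than order 2, and for order 3
 * it would be 100% since no block of that size exists.  Illustrative
 * arithmetic only.
 */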
1115
1116/* Same as __fragmentation index but allocs contig_page_info on stack */
1117int fragmentation_index(struct zone *zone, unsigned int order)
1118{
1119        struct contig_page_info info;
1120
1121        fill_contig_page_info(zone, order, &info);
1122        return __fragmentation_index(order, &info);
1123}
1124#endif
1125
1126#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1127    defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
1128#ifdef CONFIG_ZONE_DMA
1129#define TEXT_FOR_DMA(xx) xx "_dma",
1130#else
1131#define TEXT_FOR_DMA(xx)
1132#endif
1133
1134#ifdef CONFIG_ZONE_DMA32
1135#define TEXT_FOR_DMA32(xx) xx "_dma32",
1136#else
1137#define TEXT_FOR_DMA32(xx)
1138#endif
1139
1140#ifdef CONFIG_HIGHMEM
1141#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1142#else
1143#define TEXT_FOR_HIGHMEM(xx)
1144#endif
1145
1146#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1147                                        TEXT_FOR_HIGHMEM(xx) xx "_movable",
1148
1149const char * const vmstat_text[] = {
1150        /* enum zone_stat_item counters */
1151        "nr_free_pages",
1152        "nr_zone_inactive_anon",
1153        "nr_zone_active_anon",
1154        "nr_zone_inactive_file",
1155        "nr_zone_active_file",
1156        "nr_zone_unevictable",
1157        "nr_zone_write_pending",
1158        "nr_mlock",
1159        "nr_bounce",
1160#if IS_ENABLED(CONFIG_ZSMALLOC)
1161        "nr_zspages",
1162#endif
1163        "nr_free_cma",
1164
1165        /* enum numa_stat_item counters */
1166#ifdef CONFIG_NUMA
1167        "numa_hit",
1168        "numa_miss",
1169        "numa_foreign",
1170        "numa_interleave",
1171        "numa_local",
1172        "numa_other",
1173#endif
1174
1175        /* enum node_stat_item counters */
1176        "nr_inactive_anon",
1177        "nr_active_anon",
1178        "nr_inactive_file",
1179        "nr_active_file",
1180        "nr_unevictable",
1181        "nr_slab_reclaimable",
1182        "nr_slab_unreclaimable",
1183        "nr_isolated_anon",
1184        "nr_isolated_file",
1185        "workingset_nodes",
1186        "workingset_refault_anon",
1187        "workingset_refault_file",
1188        "workingset_activate_anon",
1189        "workingset_activate_file",
1190        "workingset_restore_anon",
1191        "workingset_restore_file",
1192        "workingset_nodereclaim",
1193        "nr_anon_pages",
1194        "nr_mapped",
1195        "nr_file_pages",
1196        "nr_dirty",
1197        "nr_writeback",
1198        "nr_writeback_temp",
1199        "nr_shmem",
1200        "nr_shmem_hugepages",
1201        "nr_shmem_pmdmapped",
1202        "nr_file_hugepages",
1203        "nr_file_pmdmapped",
1204        "nr_anon_transparent_hugepages",
1205        "nr_vmscan_write",
1206        "nr_vmscan_immediate_reclaim",
1207        "nr_dirtied",
1208        "nr_written",
1209        "nr_kernel_misc_reclaimable",
1210        "nr_foll_pin_acquired",
1211        "nr_foll_pin_released",
1212        "nr_kernel_stack",
1213#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1214        "nr_shadow_call_stack",
1215#endif
1216        "nr_page_table_pages",
1217#ifdef CONFIG_SWAP
1218        "nr_swapcached",
1219#endif
1220
1221        /* enum writeback_stat_item counters */
1222        "nr_dirty_threshold",
1223        "nr_dirty_background_threshold",
1224
1225#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
1226        /* enum vm_event_item counters */
1227        "pgpgin",
1228        "pgpgout",
1229        "pswpin",
1230        "pswpout",
1231
1232        TEXTS_FOR_ZONES("pgalloc")
1233        TEXTS_FOR_ZONES("allocstall")
1234        TEXTS_FOR_ZONES("pgskip")
1235
1236        "pgfree",
1237        "pgactivate",
1238        "pgdeactivate",
1239        "pglazyfree",
1240
1241        "pgfault",
1242        "pgmajfault",
1243        "pglazyfreed",
1244
1245        "pgrefill",
1246        "pgsteal_kswapd",
1247        "pgsteal_direct",
1248        "pgscan_kswapd",
1249        "pgscan_direct",
1250        "pgscan_direct_throttle",
1251        "pgscan_anon",
1252        "pgscan_file",
1253        "pgsteal_anon",
1254        "pgsteal_file",
1255
1256#ifdef CONFIG_NUMA
1257        "zone_reclaim_failed",
1258#endif
1259        "pginodesteal",
1260        "slabs_scanned",
1261        "kswapd_inodesteal",
1262        "kswapd_low_wmark_hit_quickly",
1263        "kswapd_high_wmark_hit_quickly",
1264        "pageoutrun",
1265
1266        "pgrotated",
1267
1268        "drop_pagecache",
1269        "drop_slab",
1270        "oom_kill",
1271
1272#ifdef CONFIG_NUMA_BALANCING
1273        "numa_pte_updates",
1274        "numa_huge_pte_updates",
1275        "numa_hint_faults",
1276        "numa_hint_faults_local",
1277        "numa_pages_migrated",
1278#endif
1279#ifdef CONFIG_MIGRATION
1280        "pgmigrate_success",
1281        "pgmigrate_fail",
1282#endif
1283#ifdef CONFIG_COMPACTION
1284        "compact_migrate_scanned",
1285        "compact_free_scanned",
1286        "compact_isolated",
1287        "compact_stall",
1288        "compact_fail",
1289        "compact_success",
1290        "compact_daemon_wake",
1291        "compact_daemon_migrate_scanned",
1292        "compact_daemon_free_scanned",
1293#endif
1294
1295#ifdef CONFIG_HUGETLB_PAGE
1296        "htlb_buddy_alloc_success",
1297        "htlb_buddy_alloc_fail",
1298#endif
1299        "unevictable_pgs_culled",
1300        "unevictable_pgs_scanned",
1301        "unevictable_pgs_rescued",
1302        "unevictable_pgs_mlocked",
1303        "unevictable_pgs_munlocked",
1304        "unevictable_pgs_cleared",
1305        "unevictable_pgs_stranded",
1306
1307#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1308        "thp_fault_alloc",
1309        "thp_fault_fallback",
1310        "thp_fault_fallback_charge",
1311        "thp_collapse_alloc",
1312        "thp_collapse_alloc_failed",
1313        "thp_file_alloc",
1314        "thp_file_fallback",
1315        "thp_file_fallback_charge",
1316        "thp_file_mapped",
1317        "thp_split_page",
1318        "thp_split_page_failed",
1319        "thp_deferred_split_page",
1320        "thp_split_pmd",
1321#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1322        "thp_split_pud",
1323#endif
1324        "thp_zero_page_alloc",
1325        "thp_zero_page_alloc_failed",
1326        "thp_swpout",
1327        "thp_swpout_fallback",
1328#endif
1329#ifdef CONFIG_MEMORY_BALLOON
1330        "balloon_inflate",
1331        "balloon_deflate",
1332#ifdef CONFIG_BALLOON_COMPACTION
1333        "balloon_migrate",
1334#endif
1335#endif /* CONFIG_MEMORY_BALLOON */
1336#ifdef CONFIG_DEBUG_TLBFLUSH
1337        "nr_tlb_remote_flush",
1338        "nr_tlb_remote_flush_received",
1339        "nr_tlb_local_flush_all",
1340        "nr_tlb_local_flush_one",
1341#endif /* CONFIG_DEBUG_TLBFLUSH */
1342
1343#ifdef CONFIG_DEBUG_VM_VMACACHE
1344        "vmacache_find_calls",
1345        "vmacache_find_hits",
1346#endif
1347#ifdef CONFIG_SWAP
1348        "swap_ra",
1349        "swap_ra_hit",
1350#endif
1351#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
1352};
1353#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
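
/*
 * Editor's note: the strings above end up as the per-line keys of
 * /proc/vmstat (and related files).  A minimal userspace reader, assuming
 * the usual "<name> <value>" line layout; illustrative only, never built.
 */
#if 0
#include <stdio.h>
#include <string.h>

static long read_vmstat(const char *name)
{
        char key[64];
        long val;
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return -1;
        while (fscanf(f, "%63s %ld", key, &val) == 2) {
                if (!strcmp(key, name)) {
                        fclose(f);
                        return val;
                }
        }
        fclose(f);
        return -1;
}
#endif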
1354
1355#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1356     defined(CONFIG_PROC_FS)
1357static void *frag_start(struct seq_file *m, loff_t *pos)
1358{
1359        pg_data_t *pgdat;
1360        loff_t node = *pos;
1361
1362        for (pgdat = first_online_pgdat();
1363             pgdat && node;
1364             pgdat = next_online_pgdat(pgdat))
1365                --node;
1366
1367        return pgdat;
1368}
1369
1370static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1371{
1372        pg_data_t *pgdat = (pg_data_t *)arg;
1373
1374        (*pos)++;
1375        return next_online_pgdat(pgdat);
1376}
1377
1378static void frag_stop(struct seq_file *m, void *arg)
1379{
1380}
1381
1382/*
1383 * Walk zones in a node and print using a callback.
1384 * If @assert_populated is true, only use callback for zones that are populated.
1385 */
1386static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1387                bool assert_populated, bool nolock,
1388                void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1389{
1390        struct zone *zone;
1391        struct zone *node_zones = pgdat->node_zones;
1392        unsigned long flags;
1393
1394        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1395                if (assert_populated && !populated_zone(zone))
1396                        continue;
1397
1398                if (!nolock)
1399                        spin_lock_irqsave(&zone->lock, flags);
1400                print(m, pgdat, zone);
1401                if (!nolock)
1402                        spin_unlock_irqrestore(&zone->lock, flags);
1403        }
1404}
1405#endif
1406
1407#ifdef CONFIG_PROC_FS
1408static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1409                                                struct zone *zone)
1410{
1411        int order;
1412
1413        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1414        for (order = 0; order < MAX_ORDER; ++order)
1415                seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1416        seq_putc(m, '\n');
1417}
1418
1419/*
1420 * This walks the free areas for each zone.
1421 */
1422static int frag_show(struct seq_file *m, void *arg)
1423{
1424        pg_data_t *pgdat = (pg_data_t *)arg;
1425        walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1426        return 0;
1427}
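
/*
 * Editor's note: frag_show() above generates the per-order free block
 * counts that are conventionally exposed as /proc/buddyinfo (the proc
 * registration lives further down in this file, outside this excerpt).
 * A trivial userspace dump of that file, assuming the path; never built.
 */
#if 0
#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/buddyinfo", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* e.g. "Node 0, zone   Normal  37 12 ..." */
        fclose(f);
        return 0;
}
#endif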
1428
1429static void pagetypeinfo_showfree_print(struct seq_file *m,
1430                                        pg_data_t *pgdat, struct zone *zone)
1431{
1432        int order, mtype;
1433
1434        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1435                seq_printf(m, "Node %4d, zone %8s, type %12s ",
1436                                        pgdat->node_id,
1437                                        zone->name,
1438                                        migratetype_names[mtype]);
1439                for (order = 0; order < MAX_ORDER; ++order) {
1440                        unsigned long freecount = 0;
1441                        struct free_area *area;
1442                        struct list_head *curr;
1443                        bool overflow = false;
1444
1445                        area = &(zone->free_area[order]);
1446
1447                        list_for_each(curr, &area->free_list[mtype]) {
1448                                /*
1449                                 * Cap the free_list iteration because it might
1450                                 * be really large and we are under a spinlock
1451                                 * so a long time spent here could trigger a
1452                                 * hard lockup detector. Anyway this is a
1453                                 * debugging tool so knowing there is a handful
1454                                 * of pages of this order should be more than
1455                                 * sufficient.
1456                                 */
1457                                if (++freecount >= 100000) {
1458                                        overflow = true;
1459                                        break;
1460                                }
1461                        }
1462                        seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1463                        spin_unlock_irq(&zone->lock);
1464                        cond_resched();
1465                        spin_lock_irq(&zone->lock);
1466                }
1467                seq_putc(m, '\n');
1468        }
1469}
1470
 1471/* Print out the free pages at each order for each migratetype */
1472static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
1473{
1474        int order;
1475        pg_data_t *pgdat = (pg_data_t *)arg;
1476
1477        /* Print header */
1478        seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1479        for (order = 0; order < MAX_ORDER; ++order)
1480                seq_printf(m, "%6d ", order);
1481        seq_putc(m, '\n');
1482
1483        walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1484
1485        return 0;
1486}
1487
1488static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1489                                        pg_data_t *pgdat, struct zone *zone)
1490{
1491        int mtype;
1492        unsigned long pfn;
1493        unsigned long start_pfn = zone->zone_start_pfn;
1494        unsigned long end_pfn = zone_end_pfn(zone);
1495        unsigned long count[MIGRATE_TYPES] = { 0, };
1496
1497        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1498                struct page *page;
1499
1500                page = pfn_to_online_page(pfn);
1501                if (!page)
1502                        continue;
1503
1504                /* Watch for unexpected holes punched in the memmap */
1505                if (!memmap_valid_within(pfn, page, zone))
1506                        continue;
1507
1508                if (page_zone(page) != zone)
1509                        continue;
1510
1511                mtype = get_pageblock_migratetype(page);
1512
1513                if (mtype < MIGRATE_TYPES)
1514                        count[mtype]++;
1515        }
1516
1517        /* Print counts */
1518        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1519        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1520                seq_printf(m, "%12lu ", count[mtype]);
1521        seq_putc(m, '\n');
1522}
1523
1524/* Print out the number of pageblocks for each migratetype */
1525static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1526{
1527        int mtype;
1528        pg_data_t *pgdat = (pg_data_t *)arg;
1529
1530        seq_printf(m, "\n%-23s", "Number of blocks type ");
1531        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1532                seq_printf(m, "%12s ", migratetype_names[mtype]);
1533        seq_putc(m, '\n');
1534        walk_zones_in_node(m, pgdat, true, false,
1535                pagetypeinfo_showblockcount_print);
1536
1537        return 0;
1538}
1539
1540/*
1541 * Print out the number of pageblocks for each migratetype that contain pages
1542 * of other types. This gives an indication of how well fallbacks are being
1543 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
 1544 * to determine what is going on.
1545 */
1546static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1547{
1548#ifdef CONFIG_PAGE_OWNER
1549        int mtype;
1550
1551        if (!static_branch_unlikely(&page_owner_inited))
1552                return;
1553
1554        drain_all_pages(NULL);
1555
1556        seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1557        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1558                seq_printf(m, "%12s ", migratetype_names[mtype]);
1559        seq_putc(m, '\n');
1560
1561        walk_zones_in_node(m, pgdat, true, true,
1562                pagetypeinfo_showmixedcount_print);
1563#endif /* CONFIG_PAGE_OWNER */
1564}
1565
1566/*
1567 * This prints out statistics in relation to grouping pages by mobility.
1568 * It is expensive to collect so do not constantly read the file.
1569 */
1570static int pagetypeinfo_show(struct seq_file *m, void *arg)
1571{
1572        pg_data_t *pgdat = (pg_data_t *)arg;
1573
1574        /* check memoryless node */
1575        if (!node_state(pgdat->node_id, N_MEMORY))
1576                return 0;
1577
1578        seq_printf(m, "Page block order: %d\n", pageblock_order);
1579        seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1580        seq_putc(m, '\n');
1581        pagetypeinfo_showfree(m, pgdat);
1582        pagetypeinfo_showblockcount(m, pgdat);
1583        pagetypeinfo_showmixedcount(m, pgdat);
1584
1585        return 0;
1586}
1587
1588static const struct seq_operations fragmentation_op = {
1589        .start  = frag_start,
1590        .next   = frag_next,
1591        .stop   = frag_stop,
1592        .show   = frag_show,
1593};
1594
1595static const struct seq_operations pagetypeinfo_op = {
1596        .start  = frag_start,
1597        .next   = frag_next,
1598        .stop   = frag_stop,
1599        .show   = pagetypeinfo_show,
1600};
1601
1602static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1603{
1604        int zid;
1605
1606        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1607                struct zone *compare = &pgdat->node_zones[zid];
1608
1609                if (populated_zone(compare))
1610                        return zone == compare;
1611        }
1612
1613        return false;
1614}
1615
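/*
 * Emit one zone's section of /proc/zoneinfo: the per-node stats (for the
 * first populated zone only), the watermarks and page counts, the per-zone
 * and NUMA counters, and the per-cpu pageset state.
 */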
1616static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1617                                                        struct zone *zone)
1618{
1619        int i;
1620        seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1621        if (is_zone_first_populated(pgdat, zone)) {
1622                seq_printf(m, "\n  per-node stats");
1623                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1624                        unsigned long pages = node_page_state_pages(pgdat, i);
1625
1626                        if (vmstat_item_print_in_thp(i))
1627                                pages /= HPAGE_PMD_NR;
1628                        seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
1629                                   pages);
1630                }
1631        }
1632        seq_printf(m,
1633                   "\n  pages free     %lu"
1634                   "\n        min      %lu"
1635                   "\n        low      %lu"
1636                   "\n        high     %lu"
1637                   "\n        spanned  %lu"
1638                   "\n        present  %lu"
1639                   "\n        managed  %lu",
1640                   zone_page_state(zone, NR_FREE_PAGES),
1641                   min_wmark_pages(zone),
1642                   low_wmark_pages(zone),
1643                   high_wmark_pages(zone),
1644                   zone->spanned_pages,
1645                   zone->present_pages,
1646                   zone_managed_pages(zone));
1647
1648        seq_printf(m,
1649                   "\n        protection: (%ld",
1650                   zone->lowmem_reserve[0]);
1651        for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1652                seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1653        seq_putc(m, ')');
1654
1655        /* If unpopulated, no other information is useful */
1656        if (!populated_zone(zone)) {
1657                seq_putc(m, '\n');
1658                return;
1659        }
1660
1661        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1662                seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
1663                           zone_page_state(zone, i));
1664
1665#ifdef CONFIG_NUMA
1666        for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
1667                seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
1668                           zone_numa_state_snapshot(zone, i));
1669#endif
1670
1671        seq_printf(m, "\n  pagesets");
1672        for_each_online_cpu(i) {
1673                struct per_cpu_pageset *pageset;
1674
1675                pageset = per_cpu_ptr(zone->pageset, i);
1676                seq_printf(m,
1677                           "\n    cpu: %i"
1678                           "\n              count: %i"
1679                           "\n              high:  %i"
1680                           "\n              batch: %i",
1681                           i,
1682                           pageset->pcp.count,
1683                           pageset->pcp.high,
1684                           pageset->pcp.batch);
1685#ifdef CONFIG_SMP
1686                seq_printf(m, "\n  vm stats threshold: %d",
1687                                pageset->stat_threshold);
1688#endif
1689        }
1690        seq_printf(m,
1691                   "\n  node_unreclaimable:  %u"
1692                   "\n  start_pfn:           %lu",
1693                   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1694                   zone->zone_start_pfn);
1695        seq_putc(m, '\n');
1696}
1697
1698/*
1699 * Output information about zones in @pgdat.  All zones are printed regardless
1700 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1701 * set of all zones and userspace would not be aware of such zones if they are
1702 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1703 */
1704static int zoneinfo_show(struct seq_file *m, void *arg)
1705{
1706        pg_data_t *pgdat = (pg_data_t *)arg;
1707        walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1708        return 0;
1709}
1710
1711static const struct seq_operations zoneinfo_op = {
1712        /* iterate over all zones - the same as for fragmentation */
1713        .start  = frag_start,
1714        .next   = frag_next,
1715        .stop   = frag_stop,
1716        .show   = zoneinfo_show,
1717};
1718
1719#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
1720                         NR_VM_NUMA_STAT_ITEMS + \
1721                         NR_VM_NODE_STAT_ITEMS + \
1722                         NR_VM_WRITEBACK_STAT_ITEMS + \
1723                         (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1724                          NR_VM_EVENT_ITEMS : 0))
1725
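/*
 * seq_file start method for /proc/vmstat: snapshot every counter into a
 * single array (zone, NUMA, node, writeback thresholds and, if configured,
 * event counters), in the same order as the names in vmstat_text[].
 */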
1726static void *vmstat_start(struct seq_file *m, loff_t *pos)
1727{
1728        unsigned long *v;
1729        int i;
1730
1731        if (*pos >= NR_VMSTAT_ITEMS)
1732                return NULL;
1733
1734        BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1735        v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1736        m->private = v;
1737        if (!v)
1738                return ERR_PTR(-ENOMEM);
1739        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1740                v[i] = global_zone_page_state(i);
1741        v += NR_VM_ZONE_STAT_ITEMS;
1742
1743#ifdef CONFIG_NUMA
1744        for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
1745                v[i] = global_numa_state(i);
1746        v += NR_VM_NUMA_STAT_ITEMS;
1747#endif
1748
1749        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1750                v[i] = global_node_page_state_pages(i);
1751                if (vmstat_item_print_in_thp(i))
1752                        v[i] /= HPAGE_PMD_NR;
1753        }
1754        v += NR_VM_NODE_STAT_ITEMS;
1755
1756        global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1757                            v + NR_DIRTY_THRESHOLD);
1758        v += NR_VM_WRITEBACK_STAT_ITEMS;
1759
1760#ifdef CONFIG_VM_EVENT_COUNTERS
1761        all_vm_events(v);
1762        v[PGPGIN] /= 2;         /* sectors -> kbytes */
1763        v[PGPGOUT] /= 2;
1764#endif
1765        return (unsigned long *)m->private + *pos;
1766}
1767
1768static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1769{
1770        (*pos)++;
1771        if (*pos >= NR_VMSTAT_ITEMS)
1772                return NULL;
1773        return (unsigned long *)m->private + *pos;
1774}
1775
1776static int vmstat_show(struct seq_file *m, void *arg)
1777{
1778        unsigned long *l = arg;
1779        unsigned long off = l - (unsigned long *)m->private;
1780
1781        seq_puts(m, vmstat_text[off]);
1782        seq_put_decimal_ull(m, " ", *l);
1783        seq_putc(m, '\n');
1784
1785        if (off == NR_VMSTAT_ITEMS - 1) {
1786                /*
1787                 * We've come to the end - add any deprecated counters to avoid
1788                 * breaking userspace which might depend on them being present.
1789                 */
1790                seq_puts(m, "nr_unstable 0\n");
1791        }
1792        return 0;
1793}
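/*
 * Each line of /proc/vmstat is "<name> <value>", for example (values are
 * hypothetical):
 *
 *	nr_free_pages 263775
 *	pgpgin 4218289
 *	nr_unstable 0
 */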
1794
1795static void vmstat_stop(struct seq_file *m, void *arg)
1796{
1797        kfree(m->private);
1798        m->private = NULL;
1799}
1800
1801static const struct seq_operations vmstat_op = {
1802        .start  = vmstat_start,
1803        .next   = vmstat_next,
1804        .stop   = vmstat_stop,
1805        .show   = vmstat_show,
1806};
1807#endif /* CONFIG_PROC_FS */
1808
1809#ifdef CONFIG_SMP
1810static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1811int sysctl_stat_interval __read_mostly = HZ;
1812
1813#ifdef CONFIG_PROC_FS
1814static void refresh_vm_stats(struct work_struct *work)
1815{
1816        refresh_cpu_vm_stats(true);
1817}
1818
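/*
 * sysctl handler for /proc/sys/vm/stat_refresh: fold the per-cpu
 * differentials into the global counters on every CPU, then warn and return
 * -EINVAL if any global zone or NUMA counter has gone negative.  Typical
 * (root-only) use, as described in the comment below:
 *
 *	echo 1 > /proc/sys/vm/stat_refresh
 */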
1819int vmstat_refresh(struct ctl_table *table, int write,
1820                   void __user *buffer, size_t *lenp, loff_t *ppos)
1821{
1822        long val;
1823        int err;
1824        int i;
1825
1826        /*
1827         * The regular update, every sysctl_stat_interval, may come later
1828         * than expected, leaving a significant amount in per_cpu buckets.
1829         * This is particularly misleading when checking a quantity of HUGE
1830         * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
1831         * which can equally be echo'ed to or cat'ted from (by root),
1832         * can be used to update the stats just before reading them.
1833         *
1834         * Oh, and since global_zone_page_state() etc. are so careful to hide
1835         * transiently negative values, report an error here if any of
1836         * the stats is negative, so we know to go looking for imbalance.
1837         */
1838        err = schedule_on_each_cpu(refresh_vm_stats);
1839        if (err)
1840                return err;
1841        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1842                val = atomic_long_read(&vm_zone_stat[i]);
1843                if (val < 0) {
1844                        pr_warn("%s: %s %ld\n",
1845                                __func__, zone_stat_name(i), val);
1846                        err = -EINVAL;
1847                }
1848        }
1849#ifdef CONFIG_NUMA
1850        for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
1851                val = atomic_long_read(&vm_numa_stat[i]);
1852                if (val < 0) {
1853                        pr_warn("%s: %s %ld\n",
1854                                __func__, numa_stat_name(i), val);
1855                        err = -EINVAL;
1856                }
1857        }
1858#endif
1859        if (err)
1860                return err;
1861        if (write)
1862                *ppos += *lenp;
1863        else
1864                *lenp = 0;
1865        return 0;
1866}
1867#endif /* CONFIG_PROC_FS */
1868
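/*
 * Per-cpu deferrable work: fold this CPU's differentials into the global
 * counters and, if anything was folded, rearm after sysctl_stat_interval.
 * Otherwise the work goes idle until vmstat_shepherd below notices pending
 * updates and requeues it.
 */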
1869static void vmstat_update(struct work_struct *w)
1870{
1871        if (refresh_cpu_vm_stats(true)) {
1872                /*
1873                 * Counters were updated so we expect more updates
1874                 * to occur in the future. Keep on running the
1875                 * update worker thread.
1876                 */
1877                queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
1878                                this_cpu_ptr(&vmstat_work),
1879                                round_jiffies_relative(sysctl_stat_interval));
1880        }
1881}
1882
1888/*
1889 * Check if the diffs for a certain cpu indicate that
1890 * an update is needed.
1891 */
1892static bool need_update(int cpu)
1893{
1894        struct zone *zone;
1895
1896        for_each_populated_zone(zone) {
1897                struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1898
1899                BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
1900#ifdef CONFIG_NUMA
1901                BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
1902#endif
1903
1904                /*
1905                 * The fast way of checking if there are any vmstat diffs.
1906                 */
1907                if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
1908                               sizeof(p->vm_stat_diff[0])))
1909                        return true;
1910#ifdef CONFIG_NUMA
1911                if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
1912                               sizeof(p->vm_numa_stat_diff[0])))
1913                        return true;
1914#endif
1915        }
1916        return false;
1917}
1918
1919/*
1920 * Switch off vmstat processing and then fold all the remaining differentials
1921 * until the diffs stay at zero. The function is used by NOHZ and can only be
1922 * invoked when tick processing is not active.
1923 */
1924void quiet_vmstat(void)
1925{
1926        if (system_state != SYSTEM_RUNNING)
1927                return;
1928
1929        if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
1930                return;
1931
1932        if (!need_update(smp_processor_id()))
1933                return;
1934
1935        /*
1936         * Just refresh the counters and do not care about the pending
1937         * delayed vmstat_update. It does not fire often enough to matter,
1938         * and cancelling it would be too expensive from this path.
1939         * vmstat_shepherd will take care of it for us.
1940         */
1941        refresh_cpu_vm_stats(false);
1942}
1943
1944/*
1945 * Shepherd work item that periodically checks the differentials of
1946 * CPUs whose vmstat update workers have been switched off because
1947 * of inactivity, and requeues their work when updates are
1948 * pending.
1949 */
1950static void vmstat_shepherd(struct work_struct *w);
1951
1952static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
1953
1954static void vmstat_shepherd(struct work_struct *w)
1955{
1956        int cpu;
1957
1958        get_online_cpus();
1959        /* Check processors whose vmstat worker threads have been disabled */
1960        for_each_online_cpu(cpu) {
1961                struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
1962
1963                if (!delayed_work_pending(dw) && need_update(cpu))
1964                        queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
1965        }
1966        put_online_cpus();
1967
1968        schedule_delayed_work(&shepherd,
1969                round_jiffies_relative(sysctl_stat_interval));
1970}
1971
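/*
 * Initialize each CPU's deferrable vmstat work and arm the shepherd for the
 * first time.
 */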
1972static void __init start_shepherd_timer(void)
1973{
1974        int cpu;
1975
1976        for_each_possible_cpu(cpu)
1977                INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
1978                        vmstat_update);
1979
1980        schedule_delayed_work(&shepherd,
1981                round_jiffies_relative(sysctl_stat_interval));
1982}
1983
1984static void __init init_cpu_node_state(void)
1985{
1986        int node;
1987
1988        for_each_online_node(node) {
1989                if (cpumask_weight(cpumask_of_node(node)) > 0)
1990                        node_set_state(node, N_CPU);
1991        }
1992}
1993
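/*
 * CPU hotplug callbacks: recompute the per-zone stat thresholds and keep the
 * N_CPU node state in sync when a CPU comes online or dies, and cancel the
 * CPU's vmstat work before it is taken down.
 */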
1994static int vmstat_cpu_online(unsigned int cpu)
1995{
1996        refresh_zone_stat_thresholds();
1997        node_set_state(cpu_to_node(cpu), N_CPU);
1998        return 0;
1999}
2000
2001static int vmstat_cpu_down_prep(unsigned int cpu)
2002{
2003        cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2004        return 0;
2005}
2006
2007static int vmstat_cpu_dead(unsigned int cpu)
2008{
2009        const struct cpumask *node_cpus;
2010        int node;
2011
2012        node = cpu_to_node(cpu);
2013
2014        refresh_zone_stat_thresholds();
2015        node_cpus = cpumask_of_node(node);
2016        if (cpumask_weight(node_cpus) > 0)
2017                return 0;
2018
2019        node_clear_state(node, N_CPU);
2020        return 0;
2021}
2022
2023#endif
2024
2025struct workqueue_struct *mm_percpu_wq;
2026
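/*
 * Early init: create mm_percpu_wq, register the vmstat CPU hotplug
 * callbacks, start the shepherd and create the /proc interfaces
 * (buddyinfo, pagetypeinfo, vmstat, zoneinfo).
 */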
2027void __init init_mm_internals(void)
2028{
2029        int ret __maybe_unused;
2030
2031        mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
2032
2033#ifdef CONFIG_SMP
2034        ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2035                                        NULL, vmstat_cpu_dead);
2036        if (ret < 0)
2037                pr_err("vmstat: failed to register 'dead' hotplug state\n");
2038
2039        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2040                                        vmstat_cpu_online,
2041                                        vmstat_cpu_down_prep);
2042        if (ret < 0)
2043                pr_err("vmstat: failed to register 'online' hotplug state\n");
2044
2045        get_online_cpus();
2046        init_cpu_node_state();
2047        put_online_cpus();
2048
2049        start_shepherd_timer();
2050#endif
2051#ifdef CONFIG_PROC_FS
2052        proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
2053        proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
2054        proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2055        proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
2056#endif
2057}
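/*
 * The proc files created above are read with ordinary file I/O from
 * userspace.  Minimal sketch (illustrative only, not part of this file):
 *
 *	FILE *f = fopen("/proc/vmstat", "r");
 *	char name[64];
 *	unsigned long val;
 *
 *	while (f && fscanf(f, "%63s %lu", name, &val) == 2)
 *		printf("%s = %lu\n", name, val);
 *	if (f)
 *		fclose(f);
 */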
2058
2059#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2060
2061/*
2062 * Return an index indicating how much of the available free memory is
2063 * unusable for an allocation of the requested size.
2064 */
2065static int unusable_free_index(unsigned int order,
2066                                struct contig_page_info *info)
2067{
2068        /* No free memory is treated as all free memory being unusable */
2069        if (info->free_pages == 0)
2070                return 1000;
2071
2072        /*
2073         * The index is conceptually a value between 0 and 1; return it
2074         * scaled by 1000, i.e. to 3 decimal places.
2075         *
2076         * 0 => no fragmentation
2077         * 1 => high fragmentation
2078         */
2079        return div_u64((info->free_pages -
2080                        (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2081}
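/*
 * Worked example (hypothetical numbers): with free_pages = 1000 and
 * free_blocks_suitable = 96 at order 3, the suitable blocks cover
 * 96 << 3 = 768 pages, so the index is (1000 - 768) * 1000 / 1000 = 232,
 * which unusable_show_print() below prints as 0.232.
 */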
2082
2083static void unusable_show_print(struct seq_file *m,
2084                                        pg_data_t *pgdat, struct zone *zone)
2085{
2086        unsigned int order;
2087        int index;
2088        struct contig_page_info info;
2089
2090        seq_printf(m, "Node %d, zone %8s ",
2091                                pgdat->node_id,
2092                                zone->name);
2093        for (order = 0; order < MAX_ORDER; ++order) {
2094                fill_contig_page_info(zone, order, &info);
2095                index = unusable_free_index(order, &info);
2096                seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2097        }
2098
2099        seq_putc(m, '\n');
2100}
2101
2102/*
2103 * Display the unusable free space index.
2104 *
2105 * The unusable free space index measures how much of the available free
2106 * memory cannot be used to satisfy an allocation of a given size and is a
2107 * value between 0 and 1. The higher the value, the more of the free memory
2108 * is unusable and, by implication, the worse the external fragmentation is.
2109 * It can be expressed as a percentage by multiplying by 100.
2110 */
2111static int unusable_show(struct seq_file *m, void *arg)
2112{
2113        pg_data_t *pgdat = (pg_data_t *)arg;
2114
2115        /* check memoryless node */
2116        if (!node_state(pgdat->node_id, N_MEMORY))
2117                return 0;
2118
2119        walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
2120
2121        return 0;
2122}
2123
2124static const struct seq_operations unusable_op = {
2125        .start  = frag_start,
2126        .next   = frag_next,
2127        .stop   = frag_stop,
2128        .show   = unusable_show,
2129};
2130
2131static int unusable_open(struct inode *inode, struct file *file)
2132{
2133        return seq_open(file, &unusable_op);
2134}
2135
2136static const struct file_operations unusable_file_ops = {
2137        .open           = unusable_open,
2138        .read           = seq_read,
2139        .llseek         = seq_lseek,
2140        .release        = seq_release,
2141};
2142
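/*
 * Print the fragmentation index for every order in @zone, scaled to three
 * decimal places.
 */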
2143static void extfrag_show_print(struct seq_file *m,
2144                                        pg_data_t *pgdat, struct zone *zone)
2145{
2146        unsigned int order;
2147        int index;
2148
2149        /* Alloc on stack as interrupts are disabled for zone walk */
2150        struct contig_page_info info;
2151
2152        seq_printf(m, "Node %d, zone %8s ",
2153                                pgdat->node_id,
2154                                zone->name);
2155        for (order = 0; order < MAX_ORDER; ++order) {
2156                fill_contig_page_info(zone, order, &info);
2157                index = __fragmentation_index(order, &info);
2158                seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2159        }
2160
2161        seq_putc(m, '\n');
2162}
2163
2164/*
2165 * Display the fragmentation index for orders at which allocations would fail
2166 */
2167static int extfrag_show(struct seq_file *m, void *arg)
2168{
2169        pg_data_t *pgdat = (pg_data_t *)arg;
2170
2171        walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
2172
2173        return 0;
2174}
2175
2176static const struct seq_operations extfrag_op = {
2177        .start  = frag_start,
2178        .next   = frag_next,
2179        .stop   = frag_stop,
2180        .show   = extfrag_show,
2181};
2182
2183static int extfrag_open(struct inode *inode, struct file *file)
2184{
2185        return seq_open(file, &extfrag_op);
2186}
2187
2188static const struct file_operations extfrag_file_ops = {
2189        .open           = extfrag_open,
2190        .read           = seq_read,
2191        .llseek         = seq_lseek,
2192        .release        = seq_release,
2193};
2194
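/*
 * Create the debugfs files for the two indices above.  Assuming debugfs is
 * mounted in the usual place, they appear as
 * /sys/kernel/debug/extfrag/unusable_index and
 * /sys/kernel/debug/extfrag/extfrag_index.
 */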
2195static int __init extfrag_debug_init(void)
2196{
2197        struct dentry *extfrag_debug_root;
2198
2199        extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2200        if (!extfrag_debug_root)
2201                return -ENOMEM;
2202
2203        if (!debugfs_create_file("unusable_index", 0444,
2204                        extfrag_debug_root, NULL, &unusable_file_ops))
2205                goto fail;
2206
2207        if (!debugfs_create_file("extfrag_index", 0444,
2208                        extfrag_debug_root, NULL, &extfrag_file_ops))
2209                goto fail;
2210
2211        return 0;
2212fail:
2213        debugfs_remove_recursive(extfrag_debug_root);
2214        return -ENOMEM;
2215}
2216
2217module_init(extfrag_debug_init);
2218#endif
2219