linux/mm/page_owner.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/sched/clock.h>

#include "internal.h"

/*
 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 * to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

struct page_owner {
        unsigned short order;
        short last_migrate_reason;
        gfp_t gfp_mask;
        depot_stack_handle_t handle;
        depot_stack_handle_t free_handle;
        u64 ts_nsec;
        u64 free_ts_nsec;
        char comm[TASK_COMM_LEN];
        pid_t pid;
        pid_t tgid;
        pid_t free_pid;
        pid_t free_tgid;
};

struct stack {
        struct stack_record *stack_record;
        struct stack *next;
};
static struct stack dummy_stack;
static struct stack failure_stack;
static struct stack *stack_list;
static DEFINE_SPINLOCK(stack_list_lock);

static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
static depot_stack_handle_t early_handle;

static void init_early_allocated_pages(void);

static inline void set_current_in_page_owner(void)
{
        /*
         * Avoid recursion.
         *
         * The page_owner code may itself need to allocate more memory, so
         * flag the current task; save_stack() checks the flag and bails
         * out instead of recursing.
         */
        current->in_page_owner = 1;
}

static inline void unset_current_in_page_owner(void)
{
        current->in_page_owner = 0;
}

static int __init early_page_owner_param(char *buf)
{
        int ret = kstrtobool(buf, &page_owner_enabled);

        if (page_owner_enabled)
                stack_depot_request_early_init();

        return ret;
}
early_param("page_owner", early_page_owner_param);
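
/*
 * Example: page_owner is enabled at boot time via the kernel command line
 * (see Documentation/mm/page_owner.rst):
 *
 *	page_owner=on
 *
 * kstrtobool() also accepts "1"/"y"; any other value leaves tracking
 * disabled.
 */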

static __init bool need_page_owner(void)
{
        return page_owner_enabled;
}

static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
        unsigned long entries[4];
        unsigned int nr_entries;

        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
        return stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static noinline void register_dummy_stack(void)
{
        dummy_handle = create_dummy_stack();
}

static noinline void register_failure_stack(void)
{
        failure_handle = create_dummy_stack();
}

static noinline void register_early_stack(void)
{
        early_handle = create_dummy_stack();
}

static __init void init_page_owner(void)
{
        if (!page_owner_enabled)
                return;

        register_dummy_stack();
        register_failure_stack();
        register_early_stack();
        init_early_allocated_pages();
        /* Initialize dummy and failure stacks and link them to stack_list */
        dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
        failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
        if (dummy_stack.stack_record)
                refcount_set(&dummy_stack.stack_record->count, 1);
        if (failure_stack.stack_record)
                refcount_set(&failure_stack.stack_record->count, 1);
        dummy_stack.next = &failure_stack;
        stack_list = &dummy_stack;
        static_branch_enable(&page_owner_inited);
}

struct page_ext_operations page_owner_ops = {
        .size = sizeof(struct page_owner),
        .need = need_page_owner,
        .init = init_page_owner,
        .need_shared_flags = true,
};

static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
        return page_ext_data(page_ext, &page_owner_ops);
}

static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
        unsigned long entries[PAGE_OWNER_STACK_DEPTH];
        depot_stack_handle_t handle;
        unsigned int nr_entries;

        if (current->in_page_owner)
                return dummy_handle;

        set_current_in_page_owner();
        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
        handle = stack_depot_save(entries, nr_entries, flags);
        if (!handle)
                handle = failure_handle;
        unset_current_in_page_owner();

        return handle;
}

static void add_stack_record_to_list(struct stack_record *stack_record,
                                     gfp_t gfp_mask)
{
        unsigned long flags;
        struct stack *stack;

        set_current_in_page_owner();
        stack = kmalloc(sizeof(*stack), gfp_nested_mask(gfp_mask));
        if (!stack) {
                unset_current_in_page_owner();
                return;
        }
        unset_current_in_page_owner();

        stack->stack_record = stack_record;
        stack->next = NULL;

        spin_lock_irqsave(&stack_list_lock, flags);
        stack->next = stack_list;
        /*
         * This pairs with smp_load_acquire() from function
         * stack_start(). This guarantees that stack_start()
         * will see an updated stack_list before starting to
         * traverse the list.
         */
        smp_store_release(&stack_list, stack);
        spin_unlock_irqrestore(&stack_list_lock, flags);
}

static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
                                   int nr_base_pages)
{
        struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

        if (!stack_record)
                return;

        /*
         * New stack_records that do not use STACK_DEPOT_FLAG_GET start
         * with a refcount of REFCOUNT_SATURATED to catch spurious
         * increments.
         * Since we do not use the STACK_DEPOT_FLAG_GET API, set a
         * refcount of 1 ourselves.
         */
        if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
                int old = REFCOUNT_SATURATED;

                if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1))
                        /* Add the new stack_record to our list */
                        add_stack_record_to_list(stack_record, gfp_mask);
        }
        refcount_add(nr_base_pages, &stack_record->count);
}
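
/*
 * Accounting sketch: the stack_record refcount stores nr_base_pages + 1,
 * the extra 1 being the initial reference set above. For example, the first
 * order-3 allocation hitting a new stack adds 1 << 3 = 8 base pages:
 *
 *	refcount: REFCOUNT_SATURATED -> 1 -> 9
 *
 * and the matching free subtracts 8 again. stack_print() reports
 * refcount_read() - 1 as nr_base_pages for exactly this reason.
 */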

static void dec_stack_record_count(depot_stack_handle_t handle,
                                   int nr_base_pages)
{
        struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

        if (!stack_record)
                return;

        if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
                pr_warn("%s: refcount went to 0 for handle %u\n", __func__,
                        handle);
}

static inline void __update_page_owner_handle(struct page *page,
                                              depot_stack_handle_t handle,
                                              unsigned short order,
                                              gfp_t gfp_mask,
                                              short last_migrate_reason, u64 ts_nsec,
                                              pid_t pid, pid_t tgid, char *comm)
{
        struct page_ext_iter iter;
        struct page_ext *page_ext;
        struct page_owner *page_owner;

        rcu_read_lock();
        for_each_page_ext(page, 1 << order, page_ext, iter) {
                page_owner = get_page_owner(page_ext);
                page_owner->handle = handle;
                page_owner->order = order;
                page_owner->gfp_mask = gfp_mask;
                page_owner->last_migrate_reason = last_migrate_reason;
                page_owner->pid = pid;
                page_owner->tgid = tgid;
                page_owner->ts_nsec = ts_nsec;
                strscpy(page_owner->comm, comm,
                        sizeof(page_owner->comm));
                __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
                __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
        }
        rcu_read_unlock();
}

static inline void __update_page_owner_free_handle(struct page *page,
                                                   depot_stack_handle_t handle,
                                                   unsigned short order,
                                                   pid_t pid, pid_t tgid,
                                                   u64 free_ts_nsec)
{
        struct page_ext_iter iter;
        struct page_ext *page_ext;
        struct page_owner *page_owner;

        rcu_read_lock();
        for_each_page_ext(page, 1 << order, page_ext, iter) {
                page_owner = get_page_owner(page_ext);
                /* Only __reset_page_owner() wants to clear the bit */
                if (handle) {
                        __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
                        page_owner->free_handle = handle;
                }
                page_owner->free_ts_nsec = free_ts_nsec;
                page_owner->free_pid = current->pid;
                page_owner->free_tgid = current->tgid;
        }
        rcu_read_unlock();
}

void __reset_page_owner(struct page *page, unsigned short order)
{
        struct page_ext *page_ext;
        depot_stack_handle_t handle;
        depot_stack_handle_t alloc_handle;
        struct page_owner *page_owner;
        u64 free_ts_nsec = local_clock();

        page_ext = page_ext_get(page);
        if (unlikely(!page_ext))
                return;

        page_owner = get_page_owner(page_ext);
        alloc_handle = page_owner->handle;
        page_ext_put(page_ext);

        /*
         * Do not pass GFP_NOWAIT: that makes gfpflags_allow_spinning()
         * return false, which tells stack_depot_save() to avoid taking
         * spinlocks. This mirrors the gfp flags of alloc_pages_nolock(),
         * but is only used here to signal stack_depot.
         */
        handle = save_stack(__GFP_NOWARN);
        __update_page_owner_free_handle(page, handle, order, current->pid,
                                        current->tgid, free_ts_nsec);

        if (alloc_handle != early_handle)
                /*
                 * early_handle is set as the handle for all pages
                 * allocated early in boot; see init_pages_in_zone().
                 * Their refcount was never incremented because the
                 * machinery was not ready yet, so it must not be
                 * decremented either.
                 */
                dec_stack_record_count(alloc_handle, 1 << order);
}

noinline void __set_page_owner(struct page *page, unsigned short order,
                                        gfp_t gfp_mask)
{
        u64 ts_nsec = local_clock();
        depot_stack_handle_t handle;

        handle = save_stack(gfp_mask);
        __update_page_owner_handle(page, handle, order, gfp_mask, -1,
                                   ts_nsec, current->pid, current->tgid,
                                   current->comm);
        inc_stack_record_count(handle, gfp_mask, 1 << order);
}

void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
{
        struct page_ext *page_ext = page_ext_get(&folio->page);
        struct page_owner *page_owner;

        if (unlikely(!page_ext))
                return;

        page_owner = get_page_owner(page_ext);
        page_owner->last_migrate_reason = reason;
        page_ext_put(page_ext);
}

void __split_page_owner(struct page *page, int old_order, int new_order)
{
        struct page_ext_iter iter;
        struct page_ext *page_ext;
        struct page_owner *page_owner;

        rcu_read_lock();
        for_each_page_ext(page, 1 << old_order, page_ext, iter) {
                page_owner = get_page_owner(page_ext);
                page_owner->order = new_order;
        }
        rcu_read_unlock();
}

void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
        struct page_ext *page_ext;
        struct page_ext_iter iter;
        struct page_owner *old_page_owner;
        struct page_owner *new_page_owner;
        depot_stack_handle_t migrate_handle;

        page_ext = page_ext_get(&old->page);
        if (unlikely(!page_ext))
                return;

        old_page_owner = get_page_owner(page_ext);
        page_ext_put(page_ext);

        page_ext = page_ext_get(&newfolio->page);
        if (unlikely(!page_ext))
                return;

        new_page_owner = get_page_owner(page_ext);
        page_ext_put(page_ext);

        migrate_handle = new_page_owner->handle;
        __update_page_owner_handle(&newfolio->page, old_page_owner->handle,
                                   old_page_owner->order, old_page_owner->gfp_mask,
                                   old_page_owner->last_migrate_reason,
                                   old_page_owner->ts_nsec, old_page_owner->pid,
                                   old_page_owner->tgid, old_page_owner->comm);
        /*
         * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
         * will be freed after migration. Keep them until then as they may be
         * useful.
         */
        __update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order,
                                        old_page_owner->free_pid,
                                        old_page_owner->free_tgid,
                                        old_page_owner->free_ts_nsec);
        /*
         * We linked the original stack to the new folio, so now do the same
         * for the new stack and the old folio; otherwise there will be an
         * imbalance when subtracting those pages from the stack.
         */
        rcu_read_lock();
        for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) {
                old_page_owner = get_page_owner(page_ext);
                old_page_owner->handle = migrate_handle;
        }
        rcu_read_unlock();
}

void pagetypeinfo_showmixedcount_print(struct seq_file *m,
                                       pg_data_t *pgdat, struct zone *zone)
{
        struct page *page;
        struct page_ext *page_ext;
        struct page_owner *page_owner;
        unsigned long pfn, block_end_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long count[MIGRATE_TYPES] = { 0, };
        int pageblock_mt, page_mt;
        int i;

        /* Scan block by block. First and last block may be incomplete */
        pfn = zone->zone_start_pfn;

        /*
         * Walk the zone in pageblock_nr_pages steps. If a page block spans
         * a zone boundary, it will be double counted between zones. This does
         * not matter as the mixed block count will still be correct
         */
        for (; pfn < end_pfn; ) {
                page = pfn_to_online_page(pfn);
                if (!page) {
                        pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
                        continue;
                }

                block_end_pfn = pageblock_end_pfn(pfn);
                block_end_pfn = min(block_end_pfn, end_pfn);

                pageblock_mt = get_pageblock_migratetype(page);

                for (; pfn < block_end_pfn; pfn++) {
                        /* The pageblock is online, no need to recheck. */
                        page = pfn_to_page(pfn);

                        if (page_zone(page) != zone)
                                continue;

                        if (PageBuddy(page)) {
                                unsigned long freepage_order;

                                freepage_order = buddy_order_unsafe(page);
                                if (freepage_order <= MAX_PAGE_ORDER)
                                        pfn += (1UL << freepage_order) - 1;
                                continue;
                        }

                        if (PageReserved(page))
                                continue;

                        page_ext = page_ext_get(page);
                        if (unlikely(!page_ext))
                                continue;

                        if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                                goto ext_put_continue;

                        page_owner = get_page_owner(page_ext);
                        page_mt = gfp_migratetype(page_owner->gfp_mask);
                        if (pageblock_mt != page_mt) {
                                if (is_migrate_cma(pageblock_mt))
                                        count[MIGRATE_MOVABLE]++;
                                else
                                        count[pageblock_mt]++;

                                pfn = block_end_pfn;
                                page_ext_put(page_ext);
                                break;
                        }
                        pfn += (1UL << page_owner->order) - 1;
ext_put_continue:
                        page_ext_put(page_ext);
                }
        }

        /* Print counts */
        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
        for (i = 0; i < MIGRATE_TYPES; i++)
                seq_printf(m, "%12lu ", count[i]);
        seq_putc(m, '\n');
}

/*
 * Look up the memcg information for the page and print it out
 */
static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
                                         struct page *page)
{
#ifdef CONFIG_MEMCG
        unsigned long memcg_data;
        struct mem_cgroup *memcg;
        bool online;
        char name[80];

        rcu_read_lock();
        memcg_data = READ_ONCE(page->memcg_data);
        if (!memcg_data || PageTail(page))
                goto out_unlock;

        if (memcg_data & MEMCG_DATA_OBJEXTS)
                ret += scnprintf(kbuf + ret, count - ret,
                                "Slab cache page\n");

        memcg = page_memcg_check(page);
        if (!memcg)
                goto out_unlock;

        online = (memcg->css.flags & CSS_ONLINE);
        cgroup_name(memcg->css.cgroup, name, sizeof(name));
        ret += scnprintf(kbuf + ret, count - ret,
                        "Charged %sto %smemcg %s\n",
                        PageMemcgKmem(page) ? "(via objcg) " : "",
                        online ? "" : "offline ",
                        name);
out_unlock:
        rcu_read_unlock();
#endif /* CONFIG_MEMCG */

        return ret;
}

static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
                struct page *page, struct page_owner *page_owner,
                depot_stack_handle_t handle)
{
        int ret, pageblock_mt, page_mt;
        char *kbuf;

        count = min_t(size_t, count, PAGE_SIZE);
        kbuf = kmalloc(count, GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        ret = scnprintf(kbuf, count,
                        "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n",
                        page_owner->order, page_owner->gfp_mask,
                        &page_owner->gfp_mask, page_owner->pid,
                        page_owner->tgid, page_owner->comm,
                        page_owner->ts_nsec);

        /* Print information relevant to grouping pages by mobility */
        pageblock_mt = get_pageblock_migratetype(page);
        page_mt = gfp_migratetype(page_owner->gfp_mask);
        ret += scnprintf(kbuf + ret, count - ret,
                        "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
                        pfn,
                        migratetype_names[page_mt],
                        pfn >> pageblock_order,
                        migratetype_names[pageblock_mt],
                        &page->flags);

        ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
        if (ret >= count)
                goto err;

        if (page_owner->last_migrate_reason != -1) {
                ret += scnprintf(kbuf + ret, count - ret,
                        "Page has been migrated, last migrate reason: %s\n",
                        migrate_reason_names[page_owner->last_migrate_reason]);
        }

        ret = print_page_owner_memcg(kbuf, count, ret, page);

        ret += snprintf(kbuf + ret, count - ret, "\n");
        if (ret >= count)
                goto err;

        if (copy_to_user(buf, kbuf, ret))
                ret = -EFAULT;

        kfree(kbuf);
        return ret;

err:
        kfree(kbuf);
        return -ENOMEM;
}
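
/*
 * Output sketch (illustrative values): each record read from
 * /sys/kernel/debug/page_owner has the shape
 *
 *	Page allocated via order 0, mask 0x...(GFP_KERNEL), pid 1, tgid 1 (swapper/0), ts 1234567 ns
 *	PFN 0x1000 type Unmovable Block 8 type Unmovable Flags 0x...
 *	 <allocation stack trace printed by stack_depot_snprint()>
 *
 * followed by optional migrate-reason and memcg lines and a blank separator.
 */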

void __dump_page_owner(const struct page *page)
{
        struct page_ext *page_ext = page_ext_get((void *)page);
        struct page_owner *page_owner;
        depot_stack_handle_t handle;
        gfp_t gfp_mask;
        int mt;

        if (unlikely(!page_ext)) {
                pr_alert("There is no page extension available.\n");
                return;
        }

        page_owner = get_page_owner(page_ext);
        gfp_mask = page_owner->gfp_mask;
        mt = gfp_migratetype(gfp_mask);

        if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
                pr_alert("page_owner info is not present (never set?)\n");
                page_ext_put(page_ext);
                return;
        }

        if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                pr_alert("page_owner tracks the page as allocated\n");
        else
                pr_alert("page_owner tracks the page as freed\n");

        pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
                 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
                 page_owner->pid, page_owner->tgid, page_owner->comm,
                 page_owner->ts_nsec, page_owner->free_ts_nsec);

        handle = READ_ONCE(page_owner->handle);
        if (!handle)
                pr_alert("page_owner allocation stack trace missing\n");
        else
                stack_depot_print(handle);

        handle = READ_ONCE(page_owner->free_handle);
        if (!handle) {
                pr_alert("page_owner free stack trace missing\n");
        } else {
                pr_alert("page last free pid %d tgid %d stack trace:\n",
                          page_owner->free_pid, page_owner->free_tgid);
                stack_depot_print(handle);
        }

        if (page_owner->last_migrate_reason != -1)
                pr_alert("page has been migrated, last migrate reason: %s\n",
                        migrate_reason_names[page_owner->last_migrate_reason]);
        page_ext_put(page_ext);
}

static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        unsigned long pfn;
        struct page *page;
        struct page_ext *page_ext;
        struct page_owner *page_owner;
        depot_stack_handle_t handle;

        if (!static_branch_unlikely(&page_owner_inited))
                return -EINVAL;

        page = NULL;
        if (*ppos == 0)
                pfn = min_low_pfn;
        else
                pfn = *ppos;
        /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
        while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
                pfn++;

        /* Find an allocated page */
        for (; pfn < max_pfn; pfn++) {
                /*
                 * This temporary copy of the page_owner data is required so
                 * that we can release page_ext (and the RCU lock it may
                 * hold) before potentially sleeping in copy_to_user() or in
                 * GFP_KERNEL allocations done by print_page_owner().
                 */
                struct page_owner page_owner_tmp;

                /*
                 * If the new page is in a new MAX_ORDER_NR_PAGES area,
                 * validate the area as existing, skip it if not
                 */
                if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
                        pfn += MAX_ORDER_NR_PAGES - 1;
                        continue;
                }

                page = pfn_to_page(pfn);
                if (PageBuddy(page)) {
                        unsigned long freepage_order = buddy_order_unsafe(page);

                        if (freepage_order <= MAX_PAGE_ORDER)
                                pfn += (1UL << freepage_order) - 1;
                        continue;
                }

                page_ext = page_ext_get(page);
                if (unlikely(!page_ext))
                        continue;

                /*
                 * Some pages could be missed by concurrent allocation or free,
                 * because we don't hold the zone lock.
                 */
                if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
                        goto ext_put_continue;

                /*
                 * Although we do have the info about past allocation of free
                 * pages, it's not relevant for current memory usage.
                 */
                if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                        goto ext_put_continue;

                page_owner = get_page_owner(page_ext);

                /*
                 * Don't print "tail" pages of high-order allocations as that
                 * would inflate the stats.
                 */
                if (!IS_ALIGNED(pfn, 1 << page_owner->order))
                        goto ext_put_continue;

                /*
                 * Access to page_owner->handle isn't synchronized, so be
                 * careful when reading it.
                 */
                handle = READ_ONCE(page_owner->handle);
                if (!handle)
                        goto ext_put_continue;

                /* Record the next PFN to read in the file offset */
                *ppos = pfn + 1;

                page_owner_tmp = *page_owner;
                page_ext_put(page_ext);
                return print_page_owner(buf, count, pfn, page,
                                &page_owner_tmp, handle);
ext_put_continue:
                page_ext_put(page_ext);
        }

        return 0;
}

static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
{
        switch (orig) {
        case SEEK_SET:
                file->f_pos = offset;
                break;
        case SEEK_CUR:
                file->f_pos += offset;
                break;
        default:
                return -EINVAL;
        }
        return file->f_pos;
}
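
/*
 * Usage sketch: read_page_owner() stores the next PFN to scan in the file
 * offset (*ppos = pfn + 1), so lseek() on the debugfs file selects the PFN
 * where the next read resumes. Illustrative userspace snippet:
 *
 *	lseek(fd, target_pfn, SEEK_SET);
 *	read(fd, buf, sizeof(buf));	// first record at or after target_pfn
 */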

static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
        unsigned long pfn = zone->zone_start_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long count = 0;

        /*
         * Walk the zone in pageblock_nr_pages steps. If a page block spans
         * a zone boundary, it will be double counted between zones. This does
         * not matter as the mixed block count will still be correct
         */
        for (; pfn < end_pfn; ) {
                unsigned long block_end_pfn;

                if (!pfn_valid(pfn)) {
                        pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
                        continue;
                }

                block_end_pfn = pageblock_end_pfn(pfn);
                block_end_pfn = min(block_end_pfn, end_pfn);

                for (; pfn < block_end_pfn; pfn++) {
                        struct page *page = pfn_to_page(pfn);
                        struct page_ext *page_ext;

                        if (page_zone(page) != zone)
                                continue;

                        /*
                         * To avoid having to grab zone->lock, be a little
                         * careful when reading buddy page order. The only
                         * danger is that we skip too much and potentially miss
                         * some early allocated pages, which is better than
                         * heavy lock contention.
                         */
                        if (PageBuddy(page)) {
                                unsigned long order = buddy_order_unsafe(page);

                                if (order > 0 && order <= MAX_PAGE_ORDER)
                                        pfn += (1UL << order) - 1;
                                continue;
                        }

                        if (PageReserved(page))
                                continue;

                        page_ext = page_ext_get(page);
                        if (unlikely(!page_ext))
                                continue;

                        /* Maybe overlapping zone */
                        if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
                                goto ext_put_continue;

                        /* Found early allocated page */
                        __update_page_owner_handle(page, early_handle, 0, 0,
                                                   -1, local_clock(), current->pid,
                                                   current->tgid, current->comm);
                        count++;
ext_put_continue:
                        page_ext_put(page_ext);
                }
                cond_resched();
        }

        pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
                pgdat->node_id, zone->name, count);
}

static void init_zones_in_node(pg_data_t *pgdat)
{
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
                if (!populated_zone(zone))
                        continue;

                init_pages_in_zone(pgdat, zone);
        }
}

static void init_early_allocated_pages(void)
{
        pg_data_t *pgdat;

        for_each_online_pgdat(pgdat)
                init_zones_in_node(pgdat);
}

static const struct file_operations proc_page_owner_operations = {
        .read           = read_page_owner,
        .llseek         = lseek_page_owner,
};

static void *stack_start(struct seq_file *m, loff_t *ppos)
{
        struct stack *stack;

        if (*ppos == -1UL)
                return NULL;

        if (!*ppos) {
                /*
                 * This pairs with smp_store_release() from function
                 * add_stack_record_to_list(), so we get a consistent
                 * value of stack_list.
                 */
                stack = smp_load_acquire(&stack_list);
                m->private = stack;
        } else {
                stack = m->private;
        }

        return stack;
}

static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
{
        struct stack *stack = v;

        stack = stack->next;
        *ppos = stack ? *ppos + 1 : -1UL;
        m->private = stack;

        return stack;
}

static unsigned long page_owner_pages_threshold;

static int stack_print(struct seq_file *m, void *v)
{
        int i, nr_base_pages;
        struct stack *stack = v;
        unsigned long *entries;
        unsigned long nr_entries;
        struct stack_record *stack_record = stack->stack_record;

        if (!stack_record)
                return 0;

        nr_entries = stack_record->size;
        entries = stack_record->entries;
        nr_base_pages = refcount_read(&stack_record->count) - 1;

        if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
                return 0;

        for (i = 0; i < nr_entries; i++)
                seq_printf(m, " %pS\n", (void *)entries[i]);
        seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);

        return 0;
}
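
/*
 * Output sketch for page_owner_stacks/show_stacks (symbols and counts are
 * illustrative only):
 *
 *	 post_alloc_hook+0x.../0x...
 *	 get_page_from_freelist+0x.../0x...
 *	 __alloc_pages+0x.../0x...
 *	nr_base_pages: 20824
 *
 * One block per recorded stack, printed only when nr_base_pages is at
 * least page_owner_pages_threshold (see count_threshold below).
 */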

static void stack_stop(struct seq_file *m, void *v)
{
}

static const struct seq_operations page_owner_stack_op = {
        .start  = stack_start,
        .next   = stack_next,
        .stop   = stack_stop,
        .show   = stack_print
};

static int page_owner_stack_open(struct inode *inode, struct file *file)
{
        return seq_open_private(file, &page_owner_stack_op, 0);
}

static const struct file_operations page_owner_stack_operations = {
        .open           = page_owner_stack_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static int page_owner_threshold_get(void *data, u64 *val)
{
        *val = READ_ONCE(page_owner_pages_threshold);
        return 0;
}

static int page_owner_threshold_set(void *data, u64 val)
{
        WRITE_ONCE(page_owner_pages_threshold, val);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get,
                        &page_owner_threshold_set, "%llu");

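/*
 * Example: once booted with page_owner=on and debugfs mounted at
 * /sys/kernel/debug, the files created below can be used as follows
 * (see Documentation/mm/page_owner.rst):
 *
 *	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 *	echo 1000 > /sys/kernel/debug/page_owner_stacks/count_threshold
 *	cat /sys/kernel/debug/page_owner_stacks/show_stacks
 */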
static int __init pageowner_init(void)
{
        struct dentry *dir;

        if (!static_branch_unlikely(&page_owner_inited)) {
                pr_info("page_owner is disabled\n");
                return 0;
        }

        debugfs_create_file("page_owner", 0400, NULL, NULL,
                            &proc_page_owner_operations);
        dir = debugfs_create_dir("page_owner_stacks", NULL);
        debugfs_create_file("show_stacks", 0400, dir, NULL,
                            &page_owner_stack_operations);
        debugfs_create_file("count_threshold", 0600, dir, NULL,
                            &proc_page_owner_threshold);

        return 0;
}
late_initcall(pageowner_init)