linux/mm/page_ext.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>

/*
 * struct page extension
 *
 * This feature manages memory for extended data per page.
 *
 * Without it, storing extra data per page means modifying struct page itself,
 * which requires rebuilding the kernel, a really time-consuming process that
 * is sometimes impossible due to third-party module dependencies.  Enlarging
 * struct page can also cause unwanted changes in system behaviour.
 *
 * This feature is intended to overcome those problems.  It allocates memory
 * for extended data per page in a separate place rather than in struct page
 * itself, and that memory is accessed through the accessor functions provided
 * by this code.  During boot, the core checks whether the (potentially huge)
 * allocation is needed at all; if not, no memory is allocated.  This makes it
 * possible to enable the feature in the kernel by default while avoiding
 * rebuilds and the problems described above.
 *
 * To make this work, clients provide two callbacks.  The need callback is
 * mandatory if a client wants to avoid useless memory allocation at boot
 * time.  The init callback is optional and is used to do proper
 * initialization after the memory has been allocated.
 *
 * The need callback decides whether the extended memory allocation is needed
 * at all.  A user may deactivate some features for a given boot, making the
 * extra memory unnecessary, so each client reports its need for extra memory
 * through this callback.  If any of the need callbacks returns true, the page
 * extension core allocates memory for page extension.  If none of them
 * returns true, the memory isn't needed for this boot and the core skips the
 * allocation entirely, so no memory is wasted.
 *
 * The init callback is used to do proper initialization after page extension
 * is completely initialized.  On sparse memory systems, the memory for page
 * extension is allocated some time later than the memmap, i.e. its lifetime
 * is not the same as that of the memmap for struct page.  Clients therefore
 * cannot store extra data until page extension is initialized, even though
 * pages may already be allocated and freely used.  This could leave the
 * per-page extra data in an inadequate state, so clients can use this
 * callback to initialize it correctly.
 */
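
/*
 * Illustrative sketch, not code taken from any client: a hypothetical client
 * ("page_foo") would plug into the page_ext_ops[] table below with a
 * need/init pair along these lines.  need_page_foo(), init_page_foo() and
 * page_foo_enabled are invented names for this example; the real clients are
 * debug_guardpage, page_poisoning, page_owner and page_idle.
 *
 *      static bool need_page_foo(void)
 *      {
 *              // claim page_ext memory only if the feature is enabled
 *              return page_foo_enabled;
 *      }
 *
 *      static void init_page_foo(void)
 *      {
 *              // page_ext memory exists by now; set up per-page state here
 *      }
 *
 *      struct page_ext_operations page_foo_ops = {
 *              .need = need_page_foo,
 *              .init = init_page_foo,
 *      };
 */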

static struct page_ext_operations *page_ext_ops[] = {
        &debug_guardpage_ops,
#ifdef CONFIG_PAGE_POISONING
        &page_poisoning_ops,
#endif
#ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
        &page_idle_ops,
#endif
};

static unsigned long total_usage;

static bool __init invoke_need_callbacks(void)
{
        int i;
        int entries = ARRAY_SIZE(page_ext_ops);

        for (i = 0; i < entries; i++) {
                if (page_ext_ops[i]->need && page_ext_ops[i]->need())
                        return true;
        }

        return false;
}

static void __init invoke_init_callbacks(void)
{
        int i;
        int entries = ARRAY_SIZE(page_ext_ops);

        for (i = 0; i < entries; i++) {
                if (page_ext_ops[i]->init)
                        page_ext_ops[i]->init();
        }
}

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
        pgdat->node_page_ext = NULL;
}

struct page_ext *lookup_page_ext(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_ext *base;

        base = NODE_DATA(page_to_nid(page))->node_page_ext;
#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         *
         * This check is also necessary for ensuring page poisoning
         * works as expected when enabled.
         */
        if (unlikely(!base))
                return NULL;
#endif
        offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
                                        MAX_ORDER_NR_PAGES);
        return base + offset;
}
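
/*
 * Typical caller pattern (illustrative sketch; PAGE_EXT_FOO is an invented
 * flag bit, real users such as page_idle use the bits declared in
 * include/linux/page_ext.h):
 *
 *      struct page_ext *page_ext = lookup_page_ext(page);
 *
 *      if (unlikely(!page_ext))
 *              return;         // no page_ext (yet); see the comment above
 *      __set_bit(PAGE_EXT_FOO, &page_ext->flags);
 */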

static int __init alloc_node_page_ext(int nid)
{
        struct page_ext *base;
        unsigned long table_size;
        unsigned long nr_pages;

        nr_pages = NODE_DATA(nid)->node_spanned_pages;
        if (!nr_pages)
                return 0;

        /*
         * Need extra space if the node range is not aligned with
         * MAX_ORDER_NR_PAGES.  When the page allocator's buddy algorithm
         * checks a buddy's status, the range it touches could fall outside
         * the exact node range.
         */
        if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
                !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
                nr_pages += MAX_ORDER_NR_PAGES;

        table_size = sizeof(struct page_ext) * nr_pages;

        base = memblock_virt_alloc_try_nid_nopanic(
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
                        BOOTMEM_ALLOC_ACCESSIBLE, nid);
        if (!base)
                return -ENOMEM;
        NODE_DATA(nid)->node_page_ext = base;
        total_usage += table_size;
        return 0;
}

void __init page_ext_init_flatmem(void)
{
        int nid, fail;

        if (!invoke_need_callbacks())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_ext(nid);
                if (fail)
                        goto fail;
        }
        pr_info("allocated %lu bytes of page_ext\n", total_usage);
        invoke_init_callbacks();
        return;

fail:
        pr_crit("allocation of page_ext failed.\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

struct page_ext *lookup_page_ext(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         *
         * This check is also necessary for ensuring page poisoning
         * works as expected when enabled.
         */
        if (!section->page_ext)
                return NULL;
#endif
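        /*
         * section->page_ext is stored as (base - section-aligned pfn) by
         * init_section_page_ext() below, so adding the raw pfn here lands
         * on this page's entry within the section's table.
         */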
        return section->page_ext + pfn;
}

static void *__meminit alloc_page_ext(size_t size, int nid)
{
        gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *addr = NULL;

        addr = alloc_pages_exact_nid(nid, size, flags);
        if (addr) {
                kmemleak_alloc(addr, size, 1, flags);
                return addr;
        }

        if (node_state(nid, N_HIGH_MEMORY))
                addr = vzalloc_node(size, nid);
        else
                addr = vzalloc(size);

        return addr;
}

static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
        struct mem_section *section;
        struct page_ext *base;
        unsigned long table_size;

        section = __pfn_to_section(pfn);

        if (section->page_ext)
                return 0;

        table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
        base = alloc_page_ext(table_size, nid);

        /*
         * The value stored in section->page_ext is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(base);

        if (!base) {
                pr_err("page ext allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The passed "pfn" may not be aligned to SECTION.  For the calculation
         * we need to apply a mask.
         */
        pfn &= PAGE_SECTION_MASK;
        section->page_ext = base - pfn;
        total_usage += table_size;
        return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_ext(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
        } else {
                struct page *page = virt_to_page(addr);
                size_t table_size;

                table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;

                BUG_ON(PageReserved(page));
                free_pages_exact(addr, table_size);
        }
}

static void __free_page_ext(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_ext *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_ext)
                return;
        base = ms->page_ext + pfn;
        free_page_ext(base);
        ms->page_ext = NULL;
}

static int __meminit online_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages,
                                int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        if (nid == -1) {
                /*
                 * In this case the node already exists and contains valid
                 * memory.  The "start_pfn" passed to us is the pfn that was
                 * passed to online_pages(), so it must exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_state(nid, N_ONLINE));
        }

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_ext(pfn, nid);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);

        return -ENOMEM;
}

static int __meminit offline_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);
        return 0;
}

static int __meminit page_ext_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_ext(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
                offline_page_ext(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_CANCEL_ONLINE:
                offline_page_ext(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        return notifier_from_errno(ret);
}

#endif

void __init page_ext_init(void)
{
        unsigned long pfn;
        int nid;

        if (!invoke_need_callbacks())
                return;

        for_each_node_state(nid, N_MEMORY) {
                unsigned long start_pfn, end_pfn;

                start_pfn = node_start_pfn(nid);
                end_pfn = node_end_pfn(nid);
                /*
                 * start_pfn and end_pfn may not be aligned to SECTION and the
                 * page->flags of pages outside the node are not initialized.
                 * So we scan [start_pfn, the biggest section's pfn < end_pfn)
                 * here.
                 */
                for (pfn = start_pfn; pfn < end_pfn;
                        pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

                        if (!pfn_valid(pfn))
                                continue;
                        /*
                         * Nodes' pfn ranges can overlap.
                         * We know some architectures can have a node layout
                         * such as:
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2|....
                         */
                        if (pfn_to_nid(pfn) != nid)
                                continue;
                        if (init_section_page_ext(pfn, nid))
                                goto oom;
                }
        }
        hotplug_memory_notifier(page_ext_callback, 0);
        pr_info("allocated %lu bytes of page_ext\n", total_usage);
        invoke_init_callbacks();
        return;

oom:
        panic("Out of memory");
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif