linux/mm/page_ext.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>

/*
 * struct page extension
 *
 * This feature manages memory for extended data per page.
 *
 * Previously, storing extra data per page meant modifying struct page
 * itself. That requires rebuilding the kernel, which is really time
 * consuming and sometimes impossible due to third-party module
 * dependencies. On top of that, enlarging struct page can cause unwanted
 * changes in system behaviour.
 *
 * This feature is intended to overcome the problems mentioned above. It
 * allocates memory for extended data per page in a separate place rather
 * than in struct page itself. The memory is accessed through the accessor
 * functions provided by this code. During the boot process, we check
 * whether allocating this huge chunk of memory is needed at all; if not,
 * no memory is allocated. Thanks to this, the feature can be built into
 * the kernel by default, avoiding rebuilds and the problems above.
 *
 * To make this work, there are two callbacks for clients. One is the need
 * callback, which is mandatory if the client wants to avoid a useless
 * memory allocation at boot time. The other, the init callback, is
 * optional and is used to do proper initialization after the memory has
 * been allocated.
 *
 * The need callback decides whether the extended memory allocation is
 * needed. Sometimes users deactivate certain features for a given boot,
 * making the extra memory unnecessary. To avoid allocating a huge chunk
 * of memory in that case, each client reports its need for extra memory
 * through the need callback. If any of the need callbacks returns true,
 * someone needs the extra memory and the page extension core allocates
 * it. If none of them return true, the memory isn't needed for this boot
 * and the page extension core skips the allocation, so no memory is
 * wasted.
 *
 * The init callback is used to do proper initialization after the page
 * extension is completely set up. On sparse memory systems, the extra
 * memory is allocated some time later than the memmap, so the lifetime of
 * the page extension memory is not the same as that of the memmap for
 * struct page. Therefore, clients can't store extra data until the page
 * extension is initialized, even though pages may already be allocated
 * and freely used. This could leave the per-page extra data in an
 * inadequate state, so clients can use this callback to initialize that
 * state correctly.
 */
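/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * client would register a struct page_ext_operations with a need and an
 * init callback and add it to page_ext_ops[] below.  The my_feature_*
 * names are made up for the example; the real clients here are
 * debug_guardpage_ops, page_poisoning_ops and page_owner_ops.
 *
 *      static bool my_feature_enabled;
 *
 *      static bool need_my_feature(void)
 *      {
 *              // ask for per-page memory only if the feature is enabled
 *              return my_feature_enabled;
 *      }
 *
 *      static void init_my_feature(void)
 *      {
 *              // page_ext is usable from here on; set up per-page state
 *      }
 *
 *      struct page_ext_operations my_feature_ops = {
 *              .need = need_my_feature,
 *              .init = init_my_feature,
 *      };
 */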

static struct page_ext_operations *page_ext_ops[] = {
        &debug_guardpage_ops,
#ifdef CONFIG_PAGE_POISONING
        &page_poisoning_ops,
#endif
#ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
#endif
};

static unsigned long total_usage;

static bool __init invoke_need_callbacks(void)
{
        int i;
        int entries = ARRAY_SIZE(page_ext_ops);

        for (i = 0; i < entries; i++) {
                if (page_ext_ops[i]->need && page_ext_ops[i]->need())
                        return true;
        }

        return false;
}

static void __init invoke_init_callbacks(void)
{
        int i;
        int entries = ARRAY_SIZE(page_ext_ops);

        for (i = 0; i < entries; i++) {
                if (page_ext_ops[i]->init)
                        page_ext_ops[i]->init();
        }
}

#if !defined(CONFIG_SPARSEMEM)


void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
        pgdat->node_page_ext = NULL;
}

struct page_ext *lookup_page_ext(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_ext *base;

        base = NODE_DATA(page_to_nid(page))->node_page_ext;
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (unlikely(!base))
                return NULL;
#endif
        offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
                                        MAX_ORDER_NR_PAGES);
        return base + offset;
}

static int __init alloc_node_page_ext(int nid)
{
        struct page_ext *base;
        unsigned long table_size;
        unsigned long nr_pages;

        nr_pages = NODE_DATA(nid)->node_spanned_pages;
        if (!nr_pages)
                return 0;

        /*
         * Extra space is needed if the node range is not aligned to
         * MAX_ORDER_NR_PAGES.  When the page allocator's buddy algorithm
         * checks a buddy's status, the pfn it looks at can fall just
         * outside the exact node range.
         */
        if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
                !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
                nr_pages += MAX_ORDER_NR_PAGES;
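        /*
         * Worked example (the numbers are illustrative only; the actual
         * MAX_ORDER_NR_PAGES value depends on MAX_ORDER and the config):
         * with MAX_ORDER_NR_PAGES == 2048, a node spanning pfns
         * 1000..4999 is unaligned on both ends, so the table is sized for
         * 4000 + 2048 = 6048 entries instead of 4000, leaving room for
         * buddy checks that step slightly past the node boundaries.
         */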

        table_size = sizeof(struct page_ext) * nr_pages;

        base = memblock_virt_alloc_try_nid_nopanic(
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
                        BOOTMEM_ALLOC_ACCESSIBLE, nid);
        if (!base)
                return -ENOMEM;
        NODE_DATA(nid)->node_page_ext = base;
        total_usage += table_size;
        return 0;
}

void __init page_ext_init_flatmem(void)
{

        int nid, fail;

        if (!invoke_need_callbacks())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_ext(nid);
                if (fail)
                        goto fail;
        }
        pr_info("allocated %ld bytes of page_ext\n", total_usage);
        invoke_init_callbacks();
        return;

fail:
        pr_crit("allocation of page_ext failed.\n");
        panic("Out of memory");
}

#else /* CONFIG_FLAT_NODE_MEM_MAP */

struct page_ext *lookup_page_ext(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (!section->page_ext)
                return NULL;
#endif
        return section->page_ext + pfn;
}

static void *__meminit alloc_page_ext(size_t size, int nid)
{
        gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *addr = NULL;

        addr = alloc_pages_exact_nid(nid, size, flags);
        if (addr) {
                kmemleak_alloc(addr, size, 1, flags);
                return addr;
        }

        if (node_state(nid, N_HIGH_MEMORY))
                addr = vzalloc_node(size, nid);
        else
                addr = vzalloc(size);

        return addr;
}

static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
        struct mem_section *section;
        struct page_ext *base;
        unsigned long table_size;

        section = __pfn_to_section(pfn);

        if (section->page_ext)
                return 0;

        table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
        base = alloc_page_ext(table_size, nid);

        /*
         * The value stored in section->page_ext is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(base);

        if (!base) {
                pr_err("page ext allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The passed "pfn" may not be aligned to SECTION.  For the calculation
         * we need to apply a mask.
         */
        pfn &= PAGE_SECTION_MASK;
        section->page_ext = base - pfn;
        total_usage += table_size;
        return 0;
}
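/*
 * Illustration of the (base - pfn) trick above, with hypothetical numbers
 * (PAGES_PER_SECTION is architecture/config dependent; 32768 is only an
 * example value): for a section covering pfns 32768..65535, base points at
 * an array of 32768 page_ext entries and we store
 * section->page_ext = base - 32768.  lookup_page_ext() for pfn 40000 then
 * returns section->page_ext + 40000 == base + (40000 - 32768) == base + 7232,
 * i.e. the entry for that pfn, with no extra subtraction at lookup time.
 */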
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_ext(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
        } else {
                struct page *page = virt_to_page(addr);
                size_t table_size;

                table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;

                BUG_ON(PageReserved(page));
                free_pages_exact(addr, table_size);
        }
}

static void __free_page_ext(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_ext *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_ext)
                return;
        base = ms->page_ext + pfn;
        free_page_ext(base);
        ms->page_ext = NULL;
}

static int __meminit online_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages,
                                int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        if (nid == -1) {
                /*
                 * In this case, "nid" already exists and contains valid memory.
                 * "start_pfn" passed to us is a pfn which is an argument to
                 * online_pages(), and start_pfn should exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_state(nid, N_ONLINE));
        }

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_ext(pfn, nid);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);

        return -ENOMEM;
}

static int __meminit offline_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);
        return 0;
}

static int __meminit page_ext_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_ext(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
                offline_page_ext(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_CANCEL_ONLINE:
                offline_page_ext(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        return notifier_from_errno(ret);
}

#endif

void __init page_ext_init(void)
{
        unsigned long pfn;
        int nid;

        if (!invoke_need_callbacks())
                return;

        for_each_node_state(nid, N_MEMORY) {
                unsigned long start_pfn, end_pfn;

                start_pfn = node_start_pfn(nid);
                end_pfn = node_end_pfn(nid);
                /*
                 * start_pfn and end_pfn may not be aligned to SECTION and the
                 * page->flags of pages outside the node are not initialized.
                 * So we scan [start_pfn, the biggest section-aligned pfn
                 * < end_pfn) here.
                 */
                for (pfn = start_pfn; pfn < end_pfn;
                        pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

                        if (!pfn_valid(pfn))
                                continue;
                        /*
                         * Nodes' pfn ranges can overlap.
                         * Some architectures can have a node layout such as
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2 | ...
                         */
                        if (pfn_to_nid(pfn) != nid)
                                continue;
                        if (init_section_page_ext(pfn, nid))
                                goto oom;
                }
        }
        hotplug_memory_notifier(page_ext_callback, 0);
        pr_info("allocated %ld bytes of page_ext\n", total_usage);
        invoke_init_callbacks();
        return;

oom:
        panic("Out of memory");
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif