linux/mm/page_cgroup.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)


void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_cgroup *base;

        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_cgroup arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (unlikely(!base))
                return NULL;
#endif
        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
}
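/*
 * Worked example of the lookup above (illustrative values, not taken from
 * any particular machine): on a flatmem node whose node_start_pfn is 0x1000,
 * the page_cgroup for pfn 0x1234 lives at base + (0x1234 - 0x1000), i.e.
 * the per-node array is indexed by the pfn offset within the node, one
 * entry per page.
 */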

static int __init alloc_node_page_cgroup(int nid)
{
        struct page_cgroup *base;
        unsigned long table_size;
        unsigned long nr_pages;

        nr_pages = NODE_DATA(nid)->node_spanned_pages;
        if (!nr_pages)
                return 0;

        table_size = sizeof(struct page_cgroup) * nr_pages;

        base = memblock_virt_alloc_try_nid_nopanic(
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
                        BOOTMEM_ALLOC_ACCESSIBLE, nid);
        if (!base)
                return -ENOMEM;
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
}
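/*
 * Rough sizing note (assumes sizeof(struct page_cgroup) == 16 bytes and
 * 4 KiB pages; both depend on the configuration): a node spanning 1 GiB has
 * 262144 pages, so its table is 262144 * 16 = 4 MiB, roughly 0.4% of the
 * node's memory.
 */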

void __init page_cgroup_init_flatmem(void)
{
        int nid, fail;

        if (mem_cgroup_disabled())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_cgroup(nid);
                if (fail)
                        goto fail;
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
                         "don't want memory cgroups\n");
        return;
fail:
        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_cgroup arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (!section->page_cgroup)
                return NULL;
#endif
        return section->page_cgroup + pfn;
}
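/*
 * Illustration of the biased pointer used above (numbers are made up): with
 * PAGES_PER_SECTION == 0x8000, init_section_page_cgroup() below stores
 * "base - 0x18000" for the section starting at pfn 0x18000, so the lookup
 * is simply section->page_cgroup + pfn for any pfn in [0x18000, 0x20000),
 * without subtracting the section's start pfn each time.
 */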

static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
        gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *addr = NULL;

        addr = alloc_pages_exact_nid(nid, size, flags);
        if (addr) {
                kmemleak_alloc(addr, size, 1, flags);
                return addr;
        }

        if (node_state(nid, N_HIGH_MEMORY))
                addr = vzalloc_node(size, nid);
        else
                addr = vzalloc(size);

        return addr;
}

static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
        struct mem_section *section;
        struct page_cgroup *base;
        unsigned long table_size;

        section = __pfn_to_section(pfn);

        if (section->page_cgroup)
                return 0;

        table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
        base = alloc_page_cgroup(table_size, nid);

        /*
         * The value stored in section->page_cgroup is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(base);

        if (!base) {
                printk(KERN_ERR "page cgroup allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The passed "pfn" may not be aligned to SECTION.  For the calculation
         * we need to apply a mask.
         */
        pfn &= PAGE_SECTION_MASK;
        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_cgroup(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
        } else {
                struct page *page = virt_to_page(addr);
                size_t table_size =
                        sizeof(struct page_cgroup) * PAGES_PER_SECTION;

                BUG_ON(PageReserved(page));
                kmemleak_free(addr);
                free_pages_exact(addr, table_size);
        }
}

static void __free_page_cgroup(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_cgroup *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_cgroup)
                return;
        base = ms->page_cgroup + pfn;
        free_page_cgroup(base);
        ms->page_cgroup = NULL;
}

static int __meminit online_page_cgroup(unsigned long start_pfn,
                                unsigned long nr_pages,
                                int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        if (nid == -1) {
                /*
                 * In this case, the node already exists and contains valid
                 * memory.  "start_pfn" passed to us is a pfn which is an
                 * argument of online_pages(), and it should exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_state(nid, N_ONLINE));
        }

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn, nid);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);

        return -ENOMEM;
}
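/*
 * Example of the alignment above (values assume PAGES_PER_SECTION == 0x8000,
 * which varies by architecture): onlining pfns [0x19000, 0x1b000) is widened
 * to the section-aligned range [0x18000, 0x20000), so page_cgroup tables are
 * always allocated and freed one whole section at a time.
 */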

static int __meminit offline_page_cgroup(unsigned long start_pfn,
                                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);
        return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
        case MEM_CANCEL_ONLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        return notifier_from_errno(ret);
}

#endif /* CONFIG_MEMORY_HOTPLUG */


void __init page_cgroup_init(void)
{
        unsigned long pfn;
        int nid;

        if (mem_cgroup_disabled())
                return;

        for_each_node_state(nid, N_MEMORY) {
                unsigned long start_pfn, end_pfn;

                start_pfn = node_start_pfn(nid);
                end_pfn = node_end_pfn(nid);
                /*
                 * start_pfn and end_pfn may not be aligned to SECTION, and
                 * page->flags of out-of-node pages are not initialized.  So
                 * we scan [start_pfn, the biggest section's pfn < end_pfn)
                 * here.
                 */
                for (pfn = start_pfn;
                     pfn < end_pfn;
                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

                        if (!pfn_valid(pfn))
                                continue;
                        /*
                         * Nodes' pfns can overlap.
                         * We know some architectures can have a node layout
                         * such as
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2 | ....
                         */
                        if (pfn_to_nid(pfn) != nid)
                                continue;
                        if (init_section_page_cgroup(pfn, nid))
                                goto oom;
                }
        }
        hotplug_memory_notifier(page_cgroup_callback, 0);
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
                         "don't want memory cgroups\n");
        return;
oom:
        printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}
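/*
 * Rough cost of the tables built above (assumes sizeof(struct page_cgroup)
 * == 16 bytes and 4 KiB pages; both depend on the configuration): every
 * populated section gets PAGES_PER_SECTION entries, so a machine with 8 GiB
 * of memory carries about 8 GiB / 4 KiB * 16 = 32 MiB of page_cgroup data,
 * which is what the "allocated %ld bytes" line reports.
 */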

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        return;
}

#endif


#ifdef CONFIG_MEMCG_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
        struct page **map;
        unsigned long length;
        spinlock_t      lock;
};

static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
        unsigned short          id;
};
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
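/*
 * Sizing example (assumes 4 KiB pages): sizeof(struct swap_cgroup) is 2
 * bytes, so SC_PER_PAGE is 4096 / 2 = 2048 and each page of the map covers
 * 2048 swap entries.
 */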

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache.  At swap_free(), this is accessed directly from swap.
 *
 * This means:
 *  - there is no race in "exchange" when we're accessed via SwapCache,
 *    because SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no other user of this entry and
 *    hence no race.
 * So no lock is needed around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
        struct page *page;
        struct swap_cgroup_ctrl *ctrl;
        unsigned long idx, max;

        ctrl = &swap_cgroup_ctrl[type];

        for (idx = 0; idx < ctrl->length; idx++) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto not_enough_page;
                ctrl->map[idx] = page;
        }
        return 0;
not_enough_page:
        max = idx;
        for (idx = 0; idx < max; idx++)
                __free_page(ctrl->map[idx]);

        return -ENOMEM;
}

static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
                                        struct swap_cgroup_ctrl **ctrlp)
{
        pgoff_t offset = swp_offset(ent);
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;

        ctrl = &swap_cgroup_ctrl[swp_type(ent)];
        if (ctrlp)
                *ctrlp = ctrl;

        mappage = ctrl->map[offset / SC_PER_PAGE];
        sc = page_address(mappage);
        return sc + offset % SC_PER_PAGE;
}
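/*
 * Lookup example (illustrative values, assuming SC_PER_PAGE == 2048): a swap
 * entry with offset 5000 maps to page ctrl->map[5000 / 2048] = map[2] and to
 * entry 5000 % 2048 = 904 within that page.
 */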

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (No mem_cgroup uses 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new)
{
        struct swap_cgroup_ctrl *ctrl;
        struct swap_cgroup *sc;
        unsigned long flags;
        unsigned short retval;

        sc = lookup_swap_cgroup(ent, &ctrl);

        spin_lock_irqsave(&ctrl->lock, flags);
        retval = sc->id;
        if (retval == old)
                sc->id = new;
        else
                retval = 0;
        spin_unlock_irqrestore(&ctrl->lock, flags);
        return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup to be recorded
 *
 * Returns the old value recorded for @ent (which may of course be 0).
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
        struct swap_cgroup_ctrl *ctrl;
        struct swap_cgroup *sc;
        unsigned short old;
        unsigned long flags;

        sc = lookup_swap_cgroup(ent, &ctrl);

        spin_lock_irqsave(&ctrl->lock, flags);
        old = sc->id;
        sc->id = id;
        spin_unlock_irqrestore(&ctrl->lock, flags);

        return old;
}

/**
 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns ID of mem_cgroup on success, 0 on failure (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
        return lookup_swap_cgroup(ent, NULL)->id;
}

int swap_cgroup_swapon(int type, unsigned long max_pages)
{
        void *array;
        unsigned long array_size;
        unsigned long length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return 0;

        length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
        array_size = length * sizeof(void *);

        array = vzalloc(array_size);
        if (!array)
                goto nomem;

        ctrl = &swap_cgroup_ctrl[type];
        mutex_lock(&swap_cgroup_mutex);
        ctrl->length = length;
        ctrl->map = array;
        spin_lock_init(&ctrl->lock);
        if (swap_cgroup_prepare(type)) {
                /* memory shortage */
                ctrl->map = NULL;
                ctrl->length = 0;
                mutex_unlock(&swap_cgroup_mutex);
                vfree(array);
                goto nomem;
        }
        mutex_unlock(&swap_cgroup_mutex);

        return 0;
nomem:
        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
        printk(KERN_INFO
                "swap_cgroup can be disabled by swapaccount=0 boot option\n");
        return -ENOMEM;
}
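/*
 * Sizing example for the map set up above (assumes a 64-bit kernel and
 * 4 KiB pages, so SC_PER_PAGE == 2048): a swap device with max_pages ==
 * 1048576 entries (4 GiB of swap) needs length == 512 map pages, i.e. a
 * 512 * 8 = 4 KiB pointer array plus 512 * 4 KiB = 2 MiB of swap_cgroup
 * data.
 */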

void swap_cgroup_swapoff(int type)
{
        struct page **map;
        unsigned long i, length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return;

        mutex_lock(&swap_cgroup_mutex);
        ctrl = &swap_cgroup_ctrl[type];
        map = ctrl->map;
        length = ctrl->length;
        ctrl->map = NULL;
        ctrl->length = 0;
        mutex_unlock(&swap_cgroup_mutex);

        if (map) {
                for (i = 0; i < length; i++) {
                        struct page *page = map[i];

                        if (page)
                                __free_page(page);
                }
                vfree(map);
        }
}

#endif /* CONFIG_MEMCG_SWAP */