linux/mm/page_cgroup.c
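/*
 * Per-page accounting metadata for the memory controller: page_cgroup
 * lookup tables for both the flat and sparse memory models and, with
 * CONFIG_MEMCG_SWAP, the swap_cgroup records that remember which
 * mem_cgroup owns each swap entry.
 */
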
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        pgdat->node_page_cgroup = NULL;
}

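/*
 * Look up the page_cgroup descriptor for @page in the flat per-node array.
 * May return NULL (under CONFIG_DEBUG_VM) if the array has not been
 * allocated yet during early boot or memory hotplug.
 */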
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_cgroup *base;

        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_cgroup arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (unlikely(!base))
                return NULL;
#endif
        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
}

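/*
 * Allocate the page_cgroup array covering all of node @nid's spanned pages
 * from bootmem and hang it off the node's pglist_data.
 */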
static int __init alloc_node_page_cgroup(int nid)
{
        struct page_cgroup *base;
        unsigned long table_size;
        unsigned long nr_pages;

        nr_pages = NODE_DATA(nid)->node_spanned_pages;
        if (!nr_pages)
                return 0;

        table_size = sizeof(struct page_cgroup) * nr_pages;

        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!base)
                return -ENOMEM;
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
}

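/*
 * Boot-time initialization for !SPARSEMEM: allocate a page_cgroup array
 * for every online node, or panic if that fails.
 */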
void __init page_cgroup_init_flatmem(void)
{
        int nid, fail;

        if (mem_cgroup_disabled())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_cgroup(nid);
                if (fail)
                        goto fail;
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
                         "don't want memory cgroups\n");
        invoke_page_ext_init_callbacks();
        return;
fail:
        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

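/*
 * Look up the page_cgroup descriptor for @page via its mem_section.  The
 * pointer stored in the section is pre-biased by the section's start pfn,
 * so it can be indexed directly with the raw pfn.
 */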
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_cgroup arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (!section->page_cgroup)
                return NULL;
#endif
        return section->page_cgroup + pfn;
}

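/*
 * Allocate a section's worth of page_cgroup descriptors, preferring
 * node-local, physically contiguous pages and falling back to vmalloc
 * space when that fails.
 */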
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
        gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *addr = NULL;

        addr = alloc_pages_exact_nid(nid, size, flags);
        if (addr) {
                kmemleak_alloc(addr, size, 1, flags);
                return addr;
        }

        if (node_state(nid, N_HIGH_MEMORY))
                addr = vzalloc_node(size, nid);
        else
                addr = vzalloc(size);

        return addr;
}

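/*
 * Allocate and install the page_cgroup array for the memory section
 * containing @pfn; does nothing if the section already has one.
 */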
static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
        struct mem_section *section;
        struct page_cgroup *base;
        unsigned long table_size;

        section = __pfn_to_section(pfn);

        if (section->page_cgroup)
                return 0;

        table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
        base = alloc_page_cgroup(table_size, nid);

        /*
         * The value stored in section->page_cgroup is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(base);

        if (!base) {
                printk(KERN_ERR "page cgroup allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The passed "pfn" may not be aligned to SECTION, so mask it down
         * to the section boundary before computing the biased base pointer.
         */
        pfn &= PAGE_SECTION_MASK;
        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
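/*
 * Free a table obtained from alloc_page_cgroup(), using vfree() or
 * free_pages_exact() depending on how it was allocated.
 */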
static void free_page_cgroup(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
        } else {
                struct page *page = virt_to_page(addr);
                size_t table_size =
                        sizeof(struct page_cgroup) * PAGES_PER_SECTION;

                BUG_ON(PageReserved(page));
                free_pages_exact(addr, table_size);
        }
}

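/*
 * Tear down the page_cgroup array of the section containing @pfn, if any.
 */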
void __free_page_cgroup(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_cgroup *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_cgroup)
                return;
        base = ms->page_cgroup + pfn;
        free_page_cgroup(base);
        ms->page_cgroup = NULL;
}

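/*
 * Memory hotplug: make sure every present section in the range being
 * onlined has a page_cgroup array, rolling everything back on failure.
 */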
int __meminit online_page_cgroup(unsigned long start_pfn,
                        unsigned long nr_pages,
                        int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        if (nid == -1) {
                /*
                 * In this case the node already exists and contains valid
                 * memory.  The "start_pfn" passed to us is the pfn that was
                 * handed to online_pages(), so it must exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_state(nid, N_ONLINE));
        }

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn, nid);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);

        return -ENOMEM;
}

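/*
 * Memory hotplug: free the page_cgroup arrays of all sections touched by
 * the range being offlined.
 */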
int __meminit offline_page_cgroup(unsigned long start_pfn,
                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);
        return 0;
}

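/*
 * Memory hotplug notifier: allocate page_cgroup before a memory block goes
 * online and free it again when onlining is cancelled or the block is
 * taken offline.
 */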
static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
        case MEM_CANCEL_ONLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        return notifier_from_errno(ret);
}

#endif /* CONFIG_MEMORY_HOTPLUG */

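/*
 * Boot-time initialization for SPARSEMEM: walk every section of every node
 * that has memory and allocate its page_cgroup array, then register the
 * memory hotplug notifier.
 */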
void __init page_cgroup_init(void)
{
        unsigned long pfn;
        int nid;

        if (mem_cgroup_disabled())
                return;

        for_each_node_state(nid, N_MEMORY) {
                unsigned long start_pfn, end_pfn;

                start_pfn = node_start_pfn(nid);
                end_pfn = node_end_pfn(nid);
                /*
                 * start_pfn and end_pfn may not be aligned to SECTION and
                 * the page->flags of out-of-node pages are not initialized.
                 * So we scan [start_pfn, the biggest section's pfn < end_pfn)
                 * here.
                 */
                for (pfn = start_pfn;
                     pfn < end_pfn;
                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

                        if (!pfn_valid(pfn))
                                continue;
                        /*
                         * Nodes' pfn ranges can overlap.
                         * We know some architectures can have a node layout
                         * such as
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2 | ...
                         * Skip the section if it starts in a higher node.
                         */
                        if (early_pfn_to_nid(pfn) > nid)
                                continue;
                        if (init_section_page_cgroup(pfn, nid))
                                goto oom;
                }
        }
        hotplug_memory_notifier(page_cgroup_callback, 0);
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
                         "don't want memory cgroups\n");
        invoke_page_ext_init_callbacks();
        return;
oom:
        printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        return;
}

#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_MEMCG_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
        struct page **map;
        unsigned long length;
        spinlock_t      lock;
};

static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
        unsigned short          id;
};
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * path against SwapCache.  At swap_free(), it is accessed directly from
 * the swap code.
 *
 * This means:
 *  - there is no race in "exchange" when we are accessed via SwapCache,
 *    because the SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no user of the entry and hence
 *    no race.
 * So no lock is needed around "exchange" itself.
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * Allocate the buffer pages for swap device @type's swap_cgroup records.
 */
static int swap_cgroup_prepare(int type)
{
        struct page *page;
        struct swap_cgroup_ctrl *ctrl;
        unsigned long idx, max;

        ctrl = &swap_cgroup_ctrl[type];

        for (idx = 0; idx < ctrl->length; idx++) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto not_enough_page;
                ctrl->map[idx] = page;
        }
        return 0;
not_enough_page:
        max = idx;
        for (idx = 0; idx < max; idx++)
                __free_page(ctrl->map[idx]);

        return -ENOMEM;
}

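/*
 * Return the swap_cgroup record for @ent and, if @ctrlp is non-NULL, the
 * per-swapfile control structure that covers it.
 */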
static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
                                        struct swap_cgroup_ctrl **ctrlp)
{
        pgoff_t offset = swp_offset(ent);
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;

        ctrl = &swap_cgroup_ctrl[swp_type(ent)];
        if (ctrlp)
                *ctrlp = ctrl;

        mappage = ctrl->map[offset / SC_PER_PAGE];
        sc = page_address(mappage);
        return sc + offset % SC_PER_PAGE;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg the mem_cgroup id recorded for a swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (No mem_cgroup uses 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new)
{
        struct swap_cgroup_ctrl *ctrl;
        struct swap_cgroup *sc;
        unsigned long flags;
        unsigned short retval;

        sc = lookup_swap_cgroup(ent, &ctrl);

        spin_lock_irqsave(&ctrl->lock, flags);
        retval = sc->id;
        if (retval == old)
                sc->id = new;
        else
                retval = 0;
        spin_unlock_irqrestore(&ctrl->lock, flags);
        return retval;
}

/**
 * swap_cgroup_record - record a mem_cgroup id for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup id to be recorded
 *
 * Returns the previously recorded value (which, of course, can be 0).
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
        struct swap_cgroup_ctrl *ctrl;
        struct swap_cgroup *sc;
        unsigned short old;
        unsigned long flags;

        sc = lookup_swap_cgroup(ent, &ctrl);

        spin_lock_irqsave(&ctrl->lock, flags);
        old = sc->id;
        sc->id = id;
        spin_unlock_irqrestore(&ctrl->lock, flags);

        return old;
}

/**
 * lookup_swap_cgroup_id - look up the mem_cgroup id tied to a swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the mem_cgroup on success, 0 on failure
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
        return lookup_swap_cgroup(ent, NULL)->id;
}

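/*
 * Called at swapon time: size and allocate the swap_cgroup map for swap
 * device @type so that each of its @max_pages entries can be tracked.
 * Does nothing unless swap accounting (do_swap_account) is enabled.
 */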
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
        void *array;
        unsigned long array_size;
        unsigned long length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return 0;

        length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
        array_size = length * sizeof(void *);

        array = vzalloc(array_size);
        if (!array)
                goto nomem;

        ctrl = &swap_cgroup_ctrl[type];
        mutex_lock(&swap_cgroup_mutex);
        ctrl->length = length;
        ctrl->map = array;
        spin_lock_init(&ctrl->lock);
        if (swap_cgroup_prepare(type)) {
                /* memory shortage */
                ctrl->map = NULL;
                ctrl->length = 0;
                mutex_unlock(&swap_cgroup_mutex);
                vfree(array);
                goto nomem;
        }
        mutex_unlock(&swap_cgroup_mutex);

        return 0;
nomem:
        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
        printk(KERN_INFO
                "swap_cgroup can be disabled by swapaccount=0 boot option\n");
        return -ENOMEM;
}

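/*
 * Called at swapoff time: detach and free the swap_cgroup map of swap
 * device @type.  Does nothing unless swap accounting is enabled.
 */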
void swap_cgroup_swapoff(int type)
{
        struct page **map;
        unsigned long i, length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return;

        mutex_lock(&swap_cgroup_mutex);
        ctrl = &swap_cgroup_ctrl[type];
        map = ctrl->map;
        length = ctrl->length;
        ctrl->map = NULL;
        ctrl->length = 0;
        mutex_unlock(&swap_cgroup_mutex);

        if (map) {
                for (i = 0; i < length; i++) {
                        struct page *page = map[i];
                        if (page)
                                __free_page(page);
                }
                vfree(map);
        }
}

#endif /* CONFIG_MEMCG_SWAP */