linux/mm/page_cgroup.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
        pc->flags = 0;
        set_page_cgroup_array_id(pc, id);
        pc->mem_cgroup = NULL;
        INIT_LIST_HEAD(&pc->lru);
}

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_cgroup *base;

        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
        if (unlikely(!base))
                return NULL;

        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
}
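
/*
 * Illustrative sketch (not part of the kernel source; the numbers are
 * hypothetical): with node_start_pfn == 0x10000, the page at pfn 0x10234
 * maps to node_page_cgroup[0x234], so the lookup is one subtraction and
 * one addition. A caller typically pins the entry with the real
 * lock_page_cgroup() helper from <linux/page_cgroup.h>:
 *
 *      struct page_cgroup *pc = lookup_page_cgroup(page);
 *
 *      if (pc) {
 *              lock_page_cgroup(pc);
 *              ... inspect or update pc->mem_cgroup ...
 *              unlock_page_cgroup(pc);
 *      }
 */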

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
        unsigned long pfn;
        struct page *page;
        pg_data_t *pgdat;

        pgdat = NODE_DATA(page_cgroup_array_id(pc));
        pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
        page = pfn_to_page(pfn);
        VM_BUG_ON(pc != lookup_page_cgroup(page));
        return page;
}

static int __init alloc_node_page_cgroup(int nid)
{
        struct page_cgroup *base, *pc;
        unsigned long table_size;
        unsigned long start_pfn, nr_pages, index;

        start_pfn = NODE_DATA(nid)->node_start_pfn;
        nr_pages = NODE_DATA(nid)->node_spanned_pages;

        if (!nr_pages)
                return 0;

        table_size = sizeof(struct page_cgroup) * nr_pages;

        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!base)
                return -ENOMEM;
        for (index = 0; index < nr_pages; index++) {
                pc = base + index;
                init_page_cgroup(pc, nid);
        }
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
}

void __init page_cgroup_init_flatmem(void)
{
        int nid, fail;

        if (mem_cgroup_disabled())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_cgroup(nid);
                if (fail)
                        goto fail;
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
               " don't want memory cgroups\n");
        return;
fail:
        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}
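
/*
 * Back-of-envelope sizing (illustrative; the exact size of struct
 * page_cgroup depends on the configuration): the flat table costs
 * sizeof(struct page_cgroup) per page frame. Assuming 32 bytes per
 * entry on a 64-bit build, a node spanning 4 GiB of 4 KiB pages needs
 *
 *      (4 GiB / 4 KiB) * 32 bytes = 1048576 * 32 = 32 MiB
 *
 * which is the figure the "allocated %ld bytes" message above reports.
 */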

#else /* CONFIG_SPARSEMEM */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);

        if (!section->page_cgroup)
                return NULL;
        return section->page_cgroup + pfn;
}

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
        struct mem_section *section;
        struct page *page;
        unsigned long nr;

        nr = page_cgroup_array_id(pc);
        section = __nr_to_section(nr);
        page = pfn_to_page(pc - section->page_cgroup);
        VM_BUG_ON(pc != lookup_page_cgroup(page));
        return page;
}
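
/*
 * Round-trip sketch (illustrative, assuming the section's table has
 * been initialized): because section->page_cgroup stores a base biased
 * by the section's start pfn, the two lookups above are exact inverses:
 *
 *      struct page_cgroup *pc = lookup_page_cgroup(page);
 *
 *      VM_BUG_ON(lookup_cgroup_page(pc) != page);      // always holds
 */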

static void *__init_refok alloc_page_cgroup(size_t size, int nid)
{
        void *addr = NULL;

        /* First try physically contiguous pages on the target node. */
        addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
        if (addr)
                return addr;

        /* Fall back to vmalloc, node-local only if the node has memory. */
        if (node_state(nid, N_HIGH_MEMORY))
                addr = vmalloc_node(size, nid);
        else
                addr = vmalloc(size);

        return addr;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_cgroup(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
        } else {
                struct page *page = virt_to_page(addr);
                size_t table_size =
                        sizeof(struct page_cgroup) * PAGES_PER_SECTION;

                BUG_ON(PageReserved(page));
                free_pages_exact(addr, table_size);
        }
}
#endif
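
/*
 * Pairing sketch (illustrative): alloc_page_cgroup() may satisfy a
 * request from either the page allocator or vmalloc, so the hotplug
 * teardown path keys off is_vmalloc_addr() to pick the matching
 * release primitive:
 *
 *      void *table = alloc_page_cgroup(table_size, nid);
 *      ...
 *      free_page_cgroup(table);        // vfree() or free_pages_exact()
 */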

static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
        struct page_cgroup *base, *pc;
        struct mem_section *section;
        unsigned long table_size;
        unsigned long nr;
        int nid, index;

        nr = pfn_to_section_nr(pfn);
        section = __nr_to_section(nr);

        if (section->page_cgroup)
                return 0;

        nid = page_to_nid(pfn_to_page(pfn));
        table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
        base = alloc_page_cgroup(table_size, nid);

        /*
         * The value stored in section->page_cgroup is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(base);

        /*
         * kmemleak_not_leak() ignores a NULL pointer, so the annotation
         * above is safe even on allocation failure.
         */
        if (!base) {
                printk(KERN_ERR "page cgroup allocation failure\n");
                return -ENOMEM;
        }

        for (index = 0; index < PAGES_PER_SECTION; index++) {
                pc = base + index;
                init_page_cgroup(pc, nr);
        }

        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
}
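
/*
 * Worked example of the biased base (illustrative, hypothetical
 * numbers): suppose a section covers pfns [0x8000, 0x10000) and base
 * points at its freshly allocated table. Storing
 *
 *      section->page_cgroup = base - 0x8000;
 *
 * lets a later lookup of pfn 0x8123 compute
 *
 *      section->page_cgroup + 0x8123 == base + 0x123
 *
 * i.e. entry 0x123 of the table, with no per-lookup subtraction of the
 * section start. (All arithmetic is in units of struct page_cgroup.)
 */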
#ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_cgroup *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_cgroup)
                return;
        base = ms->page_cgroup + pfn;
        free_page_cgroup(base);
        ms->page_cgroup = NULL;
}

int __meminit online_page_cgroup(unsigned long start_pfn,
                        unsigned long nr_pages,
                        int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);

        return -ENOMEM;
}
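
/*
 * Alignment sketch (illustrative; PAGES_PER_SECTION is 1 << 15 on
 * x86_64 but varies by architecture): the hotplugged range is widened
 * to whole sections before tables are allocated. With
 * PAGES_PER_SECTION == 0x8000, onlining start_pfn = 0x12345 with
 * nr_pages = 0x100 gives
 *
 *      start = 0x12345 & ~0x7fff      = 0x10000
 *      end   = ALIGN(0x12445, 0x8000) = 0x18000
 *
 * so exactly one section's page_cgroup table is initialized.
 */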

int __meminit offline_page_cgroup(unsigned long start_pfn,
                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);
        return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_CANCEL_ONLINE:
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        return notifier_from_errno(ret);
}

#endif

void __init page_cgroup_init(void)
{
        unsigned long pfn;
        int fail = 0;

        if (mem_cgroup_disabled())
                return;

        for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn);
        }
        if (fail) {
                printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
                panic("Out of memory");
        } else {
                hotplug_memory_notifier(page_cgroup_callback, 0);
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
               " want memory cgroups\n");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        return;
}

#endif

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
        struct page **map;
        unsigned long length;
        spinlock_t      lock;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
        unsigned short          id;
};
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK     (SC_PER_PAGE - 1)
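
/*
 * Index arithmetic (illustrative; assumes 4 KiB pages, so with the
 * 2-byte struct swap_cgroup above SC_PER_PAGE == 2048): swap offset
 * 5000 lands in
 *
 *      idx = 5000 / 2048       = 2    (third page in ctrl->map)
 *      pos = 5000 & (2048 - 1) = 904  (entry within that page)
 *
 * SC_POS_MASK only works this way because SC_PER_PAGE is a power of two.
 */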

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup entries are accessed via memcg's
 * charge/uncharge operations against SwapCache. At swap_free(), they are
 * accessed directly from the swap code.
 *
 * This means:
 *  - there is no race on "exchange" when we are reached via SwapCache,
 *    because the SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no user of the entry and hence
 *    no race.
 * So no lock is needed around "exchange" in those paths.
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
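
/*
 * Usage sketch (illustrative only; the ids and call sites are
 * hypothetical): the memcg swap-out path records the owning cgroup's
 * css id, and swap-in or swap_free() reads it back:
 *
 *      unsigned short old;
 *
 *      old = swap_cgroup_record(ent, my_css_id);  // swap-out: remember owner
 *      ...
 *      if (lookup_swap_cgroup(ent) == my_css_id)  // swap-in: who owns it?
 *              ...
 *      swap_cgroup_record(ent, 0);                // swap_free(): clear
 */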

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
        struct page *page;
        struct swap_cgroup_ctrl *ctrl;
        unsigned long idx, max;

        ctrl = &swap_cgroup_ctrl[type];

        for (idx = 0; idx < ctrl->length; idx++) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto not_enough_page;
                ctrl->map[idx] = page;
        }
        return 0;
not_enough_page:
        max = idx;
        for (idx = 0; idx < max; idx++)
                __free_page(ctrl->map[idx]);

        return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (There is no mem_cgroup using 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned long flags;
        unsigned short retval;

        ctrl = &swap_cgroup_ctrl[type];

        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        retval = sc->id;
        if (retval == old)
                sc->id = new;
        else
                retval = 0;
        spin_unlock_irqrestore(&ctrl->lock, flags);
        return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup's css id to be recorded
 *
 * Returns the old value on success, 0 on failure.
 * (Of course, the old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short old;
        unsigned long flags;

        ctrl = &swap_cgroup_ctrl[type];

        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        old = sc->id;
        sc->id = id;
        spin_unlock_irqrestore(&ctrl->lock, flags);

        return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup's id tied to a swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the mem_cgroup on success, 0 on failure
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short ret;

        ctrl = &swap_cgroup_ctrl[type];
        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        ret = sc->id;
        return ret;
}

int swap_cgroup_swapon(int type, unsigned long max_pages)
{
        void *array;
        unsigned long array_size;
        unsigned long length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return 0;

        length = ((max_pages/SC_PER_PAGE) + 1);
        array_size = length * sizeof(void *);

        array = vmalloc(array_size);
        if (!array)
                goto nomem;

        memset(array, 0, array_size);
        ctrl = &swap_cgroup_ctrl[type];
        mutex_lock(&swap_cgroup_mutex);
        ctrl->length = length;
        ctrl->map = array;
        spin_lock_init(&ctrl->lock);
        if (swap_cgroup_prepare(type)) {
                /* memory shortage */
                ctrl->map = NULL;
                ctrl->length = 0;
                vfree(array);
                mutex_unlock(&swap_cgroup_mutex);
                goto nomem;
        }
        mutex_unlock(&swap_cgroup_mutex);

        return 0;
nomem:
        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
        printk(KERN_INFO
                "swap_cgroup can be disabled by noswapaccount boot option\n");
        return -ENOMEM;
}
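
/*
 * Sizing sketch (illustrative; assumes 4 KiB pages and the 2-byte
 * entries above, so SC_PER_PAGE == 2048): a 1 GiB swap device has
 * 262144 slots, hence
 *
 *      length     = 262144 / 2048 + 1 = 129 map pages
 *      array_size = 129 * sizeof(void *) ~= 1 KiB
 *
 * plus 129 * 4 KiB ~= 516 KiB of swap_cgroup entries, i.e. roughly
 * 2 bytes of bookkeeping per swap slot.
 */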

void swap_cgroup_swapoff(int type)
{
        int i;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return;

        mutex_lock(&swap_cgroup_mutex);
        ctrl = &swap_cgroup_ctrl[type];
        if (ctrl->map) {
                for (i = 0; i < ctrl->length; i++) {
                        struct page *page = ctrl->map[i];
                        if (page)
                                __free_page(page);
                }
                vfree(ctrl->map);
                ctrl->map = NULL;
                ctrl->length = 0;
        }
        mutex_unlock(&swap_cgroup_mutex);
}

#endif