linux/mm/page_cgroup.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

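/*
 * Initialize a single page_cgroup: no flags, no owning memcg, a back
 * pointer to its struct page, and an empty LRU list.
 */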
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
        pc->flags = 0;
        pc->mem_cgroup = NULL;
        pc->page = pfn_to_page(pfn);
        INIT_LIST_HEAD(&pc->lru);
}
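
/* Total bytes allocated for page_cgroup arrays, reported at boot. */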
static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        pgdat->node_page_cgroup = NULL;
}

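/*
 * Flat (non-sparsemem) lookup: index the per-node array by the offset of
 * the page's pfn from the node's first pfn.
 */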
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_cgroup *base;

        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
        if (unlikely(!base))
                return NULL;

        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
}

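/*
 * Allocate and initialize the flat page_cgroup array for one node from
 * bootmem, covering every pfn the node spans.
 */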
static int __init alloc_node_page_cgroup(int nid)
{
        struct page_cgroup *base, *pc;
        unsigned long table_size;
        unsigned long start_pfn, nr_pages, index;

        start_pfn = NODE_DATA(nid)->node_start_pfn;
        nr_pages = NODE_DATA(nid)->node_spanned_pages;

        if (!nr_pages)
                return 0;

        table_size = sizeof(struct page_cgroup) * nr_pages;

        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!base)
                return -ENOMEM;
        for (index = 0; index < nr_pages; index++) {
                pc = base + index;
                __init_page_cgroup(pc, start_pfn + index);
        }
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
}

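/*
 * Boot-time setup for !CONFIG_SPARSEMEM: allocate page_cgroup for every
 * online node, or panic if that is impossible.
 */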
void __init page_cgroup_init_flatmem(void)
{
        int nid, fail;

        if (mem_cgroup_disabled())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_cgroup(nid);
                if (fail)
                        goto fail;
        }
        printk(KERN_INFO "allocated %lu bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
                " don't want memory cgroups\n");
        return;
fail:
        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

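/*
 * Sparsemem lookup: each mem_section stores (base - pfn), so adding the
 * pfn yields that page's page_cgroup directly.
 */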
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);

        if (!section->page_cgroup)
                return NULL;
        return section->page_cgroup + pfn;
}

/* __alloc_bootmem...() is protected by !slab_available() */
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
        struct mem_section *section = __pfn_to_section(pfn);
        struct page_cgroup *base, *pc;
        unsigned long table_size;
        int nid, index;

        if (!section->page_cgroup) {
                nid = page_to_nid(pfn_to_page(pfn));
                table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
                VM_BUG_ON(!slab_is_available());
                if (node_state(nid, N_HIGH_MEMORY)) {
                        base = kmalloc_node(table_size,
                                GFP_KERNEL | __GFP_NOWARN, nid);
                        if (!base)
                                base = vmalloc_node(table_size, nid);
                } else {
                        base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
                        if (!base)
                                base = vmalloc(table_size);
                }
                /*
                 * The value stored in section->page_cgroup is (base - pfn)
                 * and it does not point to the memory block allocated above,
                 * causing kmemleak false positives.
                 */
                kmemleak_not_leak(base);
        } else {
                /*
                 * We don't have to allocate page_cgroup again, but the
                 * address of the memmap may have changed, so the entries
                 * have to be initialized again.
                 */
                base = section->page_cgroup + pfn;
                table_size = 0;
                /* check whether the address of the memmap has changed */
                if (base->page == pfn_to_page(pfn))
                        return 0;
        }

        if (!base) {
                printk(KERN_ERR "page cgroup allocation failure\n");
                return -ENOMEM;
        }

        for (index = 0; index < PAGES_PER_SECTION; index++) {
                pc = base + index;
                __init_page_cgroup(pc, pfn + index);
        }

        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
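/*
 * Free a section's page_cgroup array unless it came from bootmem (whose
 * pages are marked reserved and must not be kfree()d).
 */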
void __free_page_cgroup(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_cgroup *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_cgroup)
                return;
        base = ms->page_cgroup + pfn;
        if (is_vmalloc_addr(base)) {
                vfree(base);
                ms->page_cgroup = NULL;
        } else {
                struct page *page = virt_to_page(base);
                if (!PageReserved(page)) { /* Is bootmem ? */
                        kfree(base);
                        ms->page_cgroup = NULL;
                }
        }
}

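/*
 * Memory hotplug: allocate page_cgroup for every present section in the
 * range being onlined; roll back on failure.
 */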
int __meminit online_page_cgroup(unsigned long start_pfn,
                        unsigned long nr_pages,
                        int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);

        return -ENOMEM;
}

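/* Memory hotplug: free the page_cgroup arrays covering an offlined range. */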
int __meminit offline_page_cgroup(unsigned long start_pfn,
                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);
        return 0;
}

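/*
 * Memory hotplug notifier: set up page_cgroup before a block goes online
 * and tear it down after it goes offline.
 */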
static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;
        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_CANCEL_ONLINE:
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        if (ret)
                ret = notifier_from_errno(ret);
        else
                ret = NOTIFY_OK;

        return ret;
}

#endif /* CONFIG_MEMORY_HOTPLUG */

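/*
 * Boot-time setup for CONFIG_SPARSEMEM: allocate page_cgroup for every
 * present section, then register the memory-hotplug notifier.
 */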
void __init page_cgroup_init(void)
{
        unsigned long pfn;
        int fail = 0;

        if (mem_cgroup_disabled())
                return;

        for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn);
        }
        if (fail) {
                printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
                panic("Out of memory");
        } else {
                hotplug_memory_notifier(page_cgroup_callback, 0);
        }
        printk(KERN_INFO "allocated %lu bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
                " want memory cgroups\n");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        return;
}

#endif /* !CONFIG_SPARSEMEM */

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
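
/*
 * Per-swapfile control block: an array of pages, each packed with
 * SC_PER_PAGE swap_cgroup entries, indexed by swap offset.
 */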
struct swap_cgroup_ctrl {
        struct page **map;
        unsigned long length;
        spinlock_t      lock;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
        unsigned short          id;
};
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK     (SC_PER_PAGE - 1)

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * operations against SwapCache. At swap_free(), it is accessed directly
 * from the swap code.
 *
 * This means:
 *  - there is no race in "exchange" when we are accessed via SwapCache,
 *    because the SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no other user of the entry and
 *    hence no race.
 * So no lock is needed around "exchange".
 *
 * TODO: these buffers could be pushed out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
        struct page *page;
        struct swap_cgroup_ctrl *ctrl;
        unsigned long idx, max;

        ctrl = &swap_cgroup_ctrl[type];

        for (idx = 0; idx < ctrl->length; idx++) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto not_enough_page;
                ctrl->map[idx] = page;
        }
        return 0;
not_enough_page:
        max = idx;
        for (idx = 0; idx < max; idx++)
                __free_page(ctrl->map[idx]);

        return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (There is no mem_cgroup using 0 as its id)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned long flags;
        unsigned short retval;

        ctrl = &swap_cgroup_ctrl[type];

        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        retval = sc->id;
        if (retval == old)
                sc->id = new;
        else
                retval = 0;
        spin_unlock_irqrestore(&ctrl->lock, flags);
        return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup's css ID to be recorded
 *
 * Returns the old value on success, 0 on failure.
 * (Of course, the old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short old;
        unsigned long flags;

        ctrl = &swap_cgroup_ctrl[type];

        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        old = sc->id;
        sc->id = id;
        spin_unlock_irqrestore(&ctrl->lock, flags);

        return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns CSS ID of mem_cgroup on success, 0 on failure. (0 is an invalid ID)
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short ret;

        ctrl = &swap_cgroup_ctrl[type];
        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        ret = sc->id;
        return ret;
}

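/*
 * Called at swapon: size and allocate the swap_cgroup map for this swap
 * type, one unsigned short per swap slot.
 */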
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
        void *array;
        unsigned long array_size;
        unsigned long length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return 0;

        length = ((max_pages/SC_PER_PAGE) + 1);
        array_size = length * sizeof(void *);

        array = vmalloc(array_size);
        if (!array)
                goto nomem;

        memset(array, 0, array_size);
        ctrl = &swap_cgroup_ctrl[type];
        mutex_lock(&swap_cgroup_mutex);
        ctrl->length = length;
        ctrl->map = array;
        spin_lock_init(&ctrl->lock);
        if (swap_cgroup_prepare(type)) {
                /* memory shortage */
                ctrl->map = NULL;
                ctrl->length = 0;
                vfree(array);
                mutex_unlock(&swap_cgroup_mutex);
                goto nomem;
        }
        mutex_unlock(&swap_cgroup_mutex);

        return 0;
nomem:
        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
        printk(KERN_INFO
                "swap_cgroup can be disabled by noswapaccount boot option\n");
        return -ENOMEM;
}

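/* Called at swapoff: free the swap_cgroup map for this swap type. */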
void swap_cgroup_swapoff(int type)
{
        unsigned long i;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return;

        mutex_lock(&swap_cgroup_mutex);
        ctrl = &swap_cgroup_ctrl[type];
        if (ctrl->map) {
                for (i = 0; i < ctrl->length; i++) {
                        struct page *page = ctrl->map[i];
                        if (page)
                                __free_page(page);
                }
                vfree(ctrl->map);
                ctrl->map = NULL;
                ctrl->length = 0;
        }
        mutex_unlock(&swap_cgroup_mutex);
}

#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */