linux/mm/page_cgroup.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>

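/*
 * Initialize a single page_cgroup descriptor: clear its flags and
 * mem_cgroup pointer, bind it to its struct page and reset its LRU link.
 */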
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
	pc->flags = 0;
	pc->mem_cgroup = NULL;
	pc->page = pfn_to_page(pfn);
	INIT_LIST_HEAD(&pc->lru);
}
static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

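/*
 * With flat/discontig memory, the page_cgroup array hangs off the node's
 * pglist_data; it stays NULL until boot-time allocation fills it in.
 */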
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}

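/*
 * Flat-memory lookup: the node's page_cgroup array is indexed by the
 * pfn's offset from the node's first pfn.
 */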
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long offset;
	struct page_cgroup *base;

	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
	if (unlikely(!base))
		return NULL;

	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	return base + offset;
}

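/*
 * Allocate and initialize one page_cgroup per page spanned by the node,
 * using bootmem since this runs before the slab allocator is up.
 */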
static int __init alloc_node_page_cgroup(int nid)
{
	struct page_cgroup *base, *pc;
	unsigned long table_size;
	unsigned long start_pfn, nr_pages, index;

	start_pfn = NODE_DATA(nid)->node_start_pfn;
	nr_pages = NODE_DATA(nid)->node_spanned_pages;

	if (!nr_pages)
		return 0;

	table_size = sizeof(struct page_cgroup) * nr_pages;

	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!base)
		return -ENOMEM;
	for (index = 0; index < nr_pages; index++) {
		pc = base + index;
		__init_page_cgroup(pc, start_pfn + index);
	}
	NODE_DATA(nid)->node_page_cgroup = base;
	total_usage += table_size;
	return 0;
}

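/*
 * Called early from mm init: build the page_cgroup arrays for every
 * online node, or panic if any allocation fails.
 */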
void __init page_cgroup_init_flatmem(void)
{
	int nid, fail;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_cgroup(nid);
		if (fail)
			goto fail;
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
	" don't want memory cgroups\n");
	return;
fail:
	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

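/*
 * Sparsemem lookup: each mem_section keeps a page_cgroup pointer biased
 * by the section's first pfn, so indexing with the raw pfn works directly.
 */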
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);

	if (!section->page_cgroup)
		return NULL;
	return section->page_cgroup + pfn;
}

/* allocates a section's page_cgroup table; slab must already be available */
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
	struct mem_section *section = __pfn_to_section(pfn);
	struct page_cgroup *base, *pc;
	unsigned long table_size;
	int nid, index;

	if (!section->page_cgroup) {
		nid = page_to_nid(pfn_to_page(pfn));
		table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
		VM_BUG_ON(!slab_is_available());
		if (node_state(nid, N_HIGH_MEMORY)) {
			base = kmalloc_node(table_size,
				GFP_KERNEL | __GFP_NOWARN, nid);
			if (!base)
				base = vmalloc_node(table_size, nid);
		} else {
			base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
			if (!base)
				base = vmalloc(table_size);
		}
	} else {
		/*
		 * We don't have to allocate page_cgroup again, but the
		 * address of the memmap may have changed, so we have to
		 * initialize it again.
		 */
		base = section->page_cgroup + pfn;
		table_size = 0;
		/* check whether the address of the memmap has changed */
		if (base->page == pfn_to_page(pfn))
			return 0;
	}

	if (!base) {
		printk(KERN_ERR "page cgroup allocation failure\n");
		return -ENOMEM;
	}

	for (index = 0; index < PAGES_PER_SECTION; index++) {
		pc = base + index;
		__init_page_cgroup(pc, pfn + index);
	}

	section->page_cgroup = base - pfn;
	total_usage += table_size;
	return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
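/*
 * Free a section's page_cgroup table.  vmalloc'ed tables are vfree'd,
 * kmalloc'ed ones are kfree'd, and bootmem-backed (reserved) pages are
 * left alone.
 */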
void __free_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_cgroup *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_cgroup)
		return;
	base = ms->page_cgroup + pfn;
	if (is_vmalloc_addr(base)) {
		vfree(base);
		ms->page_cgroup = NULL;
	} else {
		struct page *page = virt_to_page(base);
		if (!PageReserved(page)) { /* Is bootmem? */
			kfree(base);
			ms->page_cgroup = NULL;
		}
	}
}

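/*
 * Memory hotplug: set up page_cgroup tables for every section touched by
 * the range being onlined, rolling all of them back on any failure.
 */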
int __meminit online_page_cgroup(unsigned long start_pfn,
			unsigned long nr_pages,
			int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}

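/*
 * Memory hotplug: tear down the page_cgroup tables for every section
 * covered by the range being offlined.
 */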
int __meminit offline_page_cgroup(unsigned long start_pfn,
		unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);
	return 0;
}

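/*
 * Memory hotplug notifier: allocate tables before a range goes online and
 * free them once it has gone offline; other transitions need no action.
 */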
static int __meminit page_cgroup_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;
	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_cgroup(mn->start_pfn,
				   mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	if (ret)
		ret = notifier_from_errno(ret);
	else
		ret = NOTIFY_OK;

	return ret;
}

#endif /* CONFIG_MEMORY_HOTPLUG */

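/*
 * Sparsemem boot-time setup: walk all present sections up to max_pfn,
 * allocate their page_cgroup tables, and register the hotplug notifier.
 */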
void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int fail = 0;

	if (mem_cgroup_disabled())
		return;

	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (fail) {
		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
		panic("Out of memory");
	} else {
		hotplug_memory_notifier(page_cgroup_callback, 0);
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
	" want memory cgroups\n");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	return;
}

#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
	struct page **map;
	unsigned long length;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
	unsigned short		id;
};
#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)
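/*
 * Each map[] page packs SC_PER_PAGE entries.  For example, with 4KB pages
 * and a 2-byte struct swap_cgroup, SC_PER_PAGE is 2048, so swap offset
 * 5000 lands in map[2] at position 904 (5000 = 2 * 2048 + 904).
 */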

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * operations against SwapCache. At swap_free(), it is accessed directly
 * from the swap code.
 *
 * This means:
 *  - there is no race in "exchange" when accessed via SwapCache, because
 *    the SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no user of the entry and hence
 *    no race.
 * So no lock is needed around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
	struct page *page;
	struct swap_cgroup_ctrl *ctrl;
	unsigned long idx, max;

	ctrl = &swap_cgroup_ctrl[type];

	for (idx = 0; idx < ctrl->length; idx++) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			goto not_enough_page;
		ctrl->map[idx] = page;
	}
	return 0;
not_enough_page:
	max = idx;
	for (idx = 0; idx < max; idx++)
		__free_page(ctrl->map[idx]);

	return -ENOMEM;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup's css ID to be recorded
 *
 * Returns the old value on success, 0 on failure.
 * (Of course, the old value can itself be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short old;

	ctrl = &swap_cgroup_ctrl[type];

	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	old = sc->id;
	sc->id = id;

	return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the mem_cgroup on success, 0 on failure
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short ret;

	ctrl = &swap_cgroup_ctrl[type];
	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	ret = sc->id;
	return ret;
}

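/*
 * Called from swapon: size and allocate the per-swap-device map[] array
 * (one slot per SC_PER_PAGE swap entries), then let swap_cgroup_prepare()
 * populate it with zeroed pages.
 */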
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	void *array;
	unsigned long array_size;
	unsigned long length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return 0;

	length = ((max_pages/SC_PER_PAGE) + 1);
	array_size = length * sizeof(void *);

	array = vmalloc(array_size);
	if (!array)
		goto nomem;

	memset(array, 0, array_size);
	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = length;
	ctrl->map = array;
	if (swap_cgroup_prepare(type)) {
		/* memory shortage */
		ctrl->map = NULL;
		ctrl->length = 0;
		vfree(array);
		mutex_unlock(&swap_cgroup_mutex);
		goto nomem;
	}
	mutex_unlock(&swap_cgroup_mutex);

	return 0;
nomem:
	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	printk(KERN_INFO
		"swap_cgroup can be disabled by noswapaccount boot option\n");
	return -ENOMEM;
}

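/*
 * Called from swapoff: free every page in the device's map[] under the
 * mutex, then release the map array itself.
 */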
void swap_cgroup_swapoff(int type)
{
	int i;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	if (ctrl->map) {
		for (i = 0; i < ctrl->length; i++) {
			struct page *page = ctrl->map[i];
			if (page)
				__free_page(page);
		}
		vfree(ctrl->map);
		ctrl->map = NULL;
		ctrl->length = 0;
	}
	mutex_unlock(&swap_cgroup_mutex);
}

#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */