linux/mm/page_cgroup.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

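/*
 * Reset a page_cgroup descriptor and remember which table it belongs to:
 * @id is the node id in the FLATMEM layout and the section number under
 * SPARSEMEM, which lets lookup_cgroup_page() map a descriptor back to its
 * struct page.
 */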
static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
        pc->flags = 0;
        set_page_cgroup_array_id(pc, id);
        pc->mem_cgroup = NULL;
        INIT_LIST_HEAD(&pc->lru);
}

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        pgdat->node_page_cgroup = NULL;
}

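/*
 * FLATMEM: each node owns one contiguous table indexed by the pfn's offset
 * from node_start_pfn.  Returns NULL if the table was never allocated
 * (e.g. when memory cgroups are disabled).
 */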
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_cgroup *base;

        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
        if (unlikely(!base))
                return NULL;

        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
}

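/*
 * Inverse of lookup_page_cgroup(): the array id stored at init time is the
 * node id, so the pfn is recovered from the descriptor's offset within that
 * node's table.
 */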
struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
        unsigned long pfn;
        struct page *page;
        pg_data_t *pgdat;

        pgdat = NODE_DATA(page_cgroup_array_id(pc));
        pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
        page = pfn_to_page(pfn);
        VM_BUG_ON(pc != lookup_page_cgroup(page));
        return page;
}

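/*
 * Allocate one bootmem table covering every pfn spanned by @nid.  The cost
 * is sizeof(struct page_cgroup) per page; as a rough illustration, with a
 * 32-byte descriptor on a 64-bit build, 1GB of RAM needs 8MB of table.
 */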
static int __init alloc_node_page_cgroup(int nid)
{
        struct page_cgroup *base, *pc;
        unsigned long table_size;
        unsigned long start_pfn, nr_pages, index;

        start_pfn = NODE_DATA(nid)->node_start_pfn;
        nr_pages = NODE_DATA(nid)->node_spanned_pages;

        if (!nr_pages)
                return 0;

        table_size = sizeof(struct page_cgroup) * nr_pages;

        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!base)
                return -ENOMEM;
        for (index = 0; index < nr_pages; index++) {
                pc = base + index;
                init_page_cgroup(pc, nid);
        }
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
}

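/*
 * Early-boot entry point for the !SPARSEMEM case: allocate a table for
 * every online node.  Failure is fatal; the suggested way out is booting
 * with cgroup_disable=memory.
 */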
void __init page_cgroup_init_flatmem(void)
{
        int nid, fail;

        if (mem_cgroup_disabled())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_cgroup(nid);
                if (fail)
                        goto fail;
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
               " don't want memory cgroups\n");
        return;
fail:
        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

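/*
 * SPARSEMEM: each mem_section carries its own table.  The pointer is stored
 * pre-biased by the section's first pfn (see init_section_page_cgroup()),
 * so the lookup is a single addition.
 */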
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);

        if (!section->page_cgroup)
                return NULL;
        return section->page_cgroup + pfn;
}

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
        struct mem_section *section;
        struct page *page;
        unsigned long nr;

        nr = page_cgroup_array_id(pc);
        section = __nr_to_section(nr);
        page = pfn_to_page(pc - section->page_cgroup);
        VM_BUG_ON(pc != lookup_page_cgroup(page));
        return page;
}

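/*
 * Grab node-local pages from the page allocator first (quietly, hence
 * __GFP_NOWARN), then fall back to vmalloc space; node-local vmalloc is
 * only attempted if the node actually has memory.
 */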
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
        void *addr = NULL;

        addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
        if (addr)
                return addr;

        if (node_state(nid, N_HIGH_MEMORY))
                addr = vmalloc_node(size, nid);
        else
                addr = vmalloc(size);

        return addr;
}

#ifdef CONFIG_MEMORY_HOTPLUG
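/*
 * Undo alloc_page_cgroup(): the table came either from vmalloc or from the
 * page allocator, and the two are told apart by the address.
 */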
static void free_page_cgroup(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
        } else {
                struct page *page = virt_to_page(addr);
                size_t table_size =
                        sizeof(struct page_cgroup) * PAGES_PER_SECTION;

                BUG_ON(PageReserved(page));
                free_pages_exact(addr, table_size);
        }
}
#endif

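/*
 * Allocate and initialize the table for the section containing @pfn.
 * A no-op if the section is already covered (both the boot and the hotplug
 * paths end up here).  The stored pointer is biased by the section's first
 * pfn so that lookup_page_cgroup() needs no subtraction.
 */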
static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
        struct page_cgroup *base, *pc;
        struct mem_section *section;
        unsigned long table_size;
        unsigned long nr;
        int index;

        nr = pfn_to_section_nr(pfn);
        section = __nr_to_section(nr);

        if (section->page_cgroup)
                return 0;

        table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
        base = alloc_page_cgroup(table_size, nid);

        /*
         * The value stored in section->page_cgroup is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(base);

        if (!base) {
                printk(KERN_ERR "page cgroup allocation failure\n");
                return -ENOMEM;
        }

        for (index = 0; index < PAGES_PER_SECTION; index++) {
                pc = base + index;
                init_page_cgroup(pc, nr);
        }
        /*
         * The passed "pfn" may not be aligned to SECTION.  For the calculation
         * we need to apply a mask.
         */
        pfn &= PAGE_SECTION_MASK;
        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
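/* Free the table of the section containing @pfn, if one was allocated. */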
void __free_page_cgroup(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_cgroup *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_cgroup)
                return;
        base = ms->page_cgroup + pfn;
        free_page_cgroup(base);
        ms->page_cgroup = NULL;
}

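/*
 * Memory hotplug: populate tables for every present section in the range
 * going online.  On failure the whole range is rolled back and -ENOMEM is
 * returned, so the online operation can be refused cleanly.
 */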
int __meminit online_page_cgroup(unsigned long start_pfn,
                        unsigned long nr_pages,
                        int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        if (nid == -1) {
                /*
                 * The node already exists and contains valid memory:
                 * "start_pfn" was passed down from online_pages(), so a
                 * page at start_pfn must exist and its nid can be used.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_state(nid, N_ONLINE));
        }

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn, nid);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);

        return -ENOMEM;
}

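/* Memory hotplug: drop the tables covering a range that has gone offline. */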
int __meminit offline_page_cgroup(unsigned long start_pfn,
                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);
        return 0;
}

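/*
 * Hotplug notifier: allocate tables while the range is still going online,
 * so every onlined page already has a descriptor, and free them only after
 * the range has actually gone offline.
 */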
static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_CANCEL_ONLINE:
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        return notifier_from_errno(ret);
}

#endif

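/*
 * Boot-time setup for the SPARSEMEM case: walk every node that has memory,
 * allocate a table for each section holding valid pfns that belong to the
 * node, then register the hotplug notifier so memory onlined later is
 * covered as well.
 */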
void __init page_cgroup_init(void)
{
        unsigned long pfn;
        int nid;

        if (mem_cgroup_disabled())
                return;

        for_each_node_state(nid, N_HIGH_MEMORY) {
                unsigned long start_pfn, end_pfn;

                start_pfn = node_start_pfn(nid);
                end_pfn = node_end_pfn(nid);
                /*
                 * start_pfn and end_pfn may not be aligned to SECTION and the
                 * page->flags of out-of-node pages are not initialized, so we
                 * scan [start_pfn, end_pfn) one section at a time and check
                 * each section's node explicitly.
                 */
                for (pfn = start_pfn;
                     pfn < end_pfn;
                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

                        if (!pfn_valid(pfn))
                                continue;
                        /*
                         * Nodes' pfn ranges can overlap.
                         * Some architectures have a node layout such as
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2 | ...
                         */
                        if (pfn_to_nid(pfn) != nid)
                                continue;
                        if (init_section_page_cgroup(pfn, nid))
                                goto oom;
                }
        }
        hotplug_memory_notifier(page_cgroup_callback, 0);
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
                         "don't want memory cgroups\n");
        return;
oom:
        printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        return;
}

#endif

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
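/*
 * Per swap area bookkeeping: @map is an array of pages, each holding
 * SC_PER_PAGE struct swap_cgroup entries, i.e. one unsigned short memcg id
 * per swap slot of the area.
 */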
struct swap_cgroup_ctrl {
        struct page **map;
        unsigned long length;
        spinlock_t      lock;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
        unsigned short          id;
};
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK     (SC_PER_PAGE - 1)

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), this is accessed directly from swap.
 *
 * This means,
 *  - we have no race in "exchange" when we're accessed via SwapCache because
 *    SwapCache (and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * The per-area ctrl->lock still serializes the read-modify-write of each
 * entry against concurrent updaters.
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
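
/*
 * Illustrative usage from the memcg swap-accounting side (a sketch of the
 * typical calls, not the exact call sites):
 *
 *      swap_cgroup_record(ent, css_id(&memcg->css));   when a charge moves to swap
 *      id = lookup_swap_cgroup(ent);                   when swapping back in
 *      swap_cgroup_cmpxchg(ent, old_id, new_id);       when moving charges
 */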

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
        struct page *page;
        struct swap_cgroup_ctrl *ctrl;
        unsigned long idx, max;

        ctrl = &swap_cgroup_ctrl[type];

        for (idx = 0; idx < ctrl->length; idx++) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto not_enough_page;
                ctrl->map[idx] = page;
        }
        return 0;
not_enough_page:
        max = idx;
        for (idx = 0; idx < max; idx++)
                __free_page(ctrl->map[idx]);

        return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns old id on success, 0 on failure.
 * (There is no mem_cgroup using 0 as its id)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned long flags;
        unsigned short retval;

        ctrl = &swap_cgroup_ctrl[type];

        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        retval = sc->id;
        if (retval == old)
                sc->id = new;
        else
                retval = 0;
        spin_unlock_irqrestore(&ctrl->lock, flags);
        return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup's css id to be recorded
 *
 * Returns the previously recorded id (which can of course be 0).
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short old;
        unsigned long flags;

        ctrl = &swap_cgroup_ctrl[type];

        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        old = sc->id;
        sc->id = id;
        spin_unlock_irqrestore(&ctrl->lock, flags);

        return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the recorded mem_cgroup, or 0 if none is recorded
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short ret;

        ctrl = &swap_cgroup_ctrl[type];
        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        ret = sc->id;
        return ret;
}

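/*
 * Called from the swapon path when swap accounting is enabled: size the map
 * for @max_pages swap slots and back every slot with a zeroed page.
 */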
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
        void *array;
        unsigned long array_size;
        unsigned long length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return 0;

        length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
        array_size = length * sizeof(void *);

        array = vmalloc(array_size);
        if (!array)
                goto nomem;

        memset(array, 0, array_size);
        ctrl = &swap_cgroup_ctrl[type];
        mutex_lock(&swap_cgroup_mutex);
        ctrl->length = length;
        ctrl->map = array;
        spin_lock_init(&ctrl->lock);
        if (swap_cgroup_prepare(type)) {
                /* memory shortage */
                ctrl->map = NULL;
                ctrl->length = 0;
                mutex_unlock(&swap_cgroup_mutex);
                vfree(array);
                goto nomem;
        }
        mutex_unlock(&swap_cgroup_mutex);

        return 0;
nomem:
        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
        printk(KERN_INFO
                "swap_cgroup can be disabled by swapaccount=0 boot option\n");
        return -ENOMEM;
}

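/*
 * Tear down a swap area's map: detach it under the mutex, then free the
 * backing pages and the pointer array without holding any lock.
 */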
void swap_cgroup_swapoff(int type)
{
        struct page **map;
        unsigned long i, length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return;

        mutex_lock(&swap_cgroup_mutex);
        ctrl = &swap_cgroup_ctrl[type];
        map = ctrl->map;
        length = ctrl->length;
        ctrl->map = NULL;
        ctrl->length = 0;
        mutex_unlock(&swap_cgroup_mutex);

        if (map) {
                for (i = 0; i < length; i++) {
                        struct page *page = map[i];
                        if (page)
                                __free_page(page);
                }
                vfree(map);
        }
}

#endif