linux/mm/hugetlb_cgroup.c
/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

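/*
 * Memory events tracked per hstate. HUGETLB_MAX counts the number of
 * times an allocation failed because the limit was hit.
 */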
enum hugetlb_memory_event {
        HUGETLB_MAX,
        HUGETLB_NR_MEMORY_EVENTS,
};

struct hugetlb_cgroup {
        struct cgroup_subsys_state css;

        /*
         * Per-hstate counters accounting for the huge pages charged to
         * this cgroup.
         */
        struct page_counter hugepage[HUGE_MAX_HSTATE];

        atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
        atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];

        /* Handle for "hugetlb.events" */
        struct cgroup_file events_file[HUGE_MAX_HSTATE];

        /* Handle for "hugetlb.events.local" */
        struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
};

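/*
 * cft->private packs two values into a single long: the hstate index in
 * the upper bits and a resource attribute (one of the RES_* values
 * defined below) in the low 16 bits.
 */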
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

#define hugetlb_cgroup_from_counter(counter, idx)                   \
        container_of(counter, struct hugetlb_cgroup, hugepage[idx])

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
        return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
        return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
        return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
        return hugetlb_cgroup_from_css(h_cg->css.parent);
}

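/* Returns true if any hstate's page counter still holds charged pages. */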
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
        int idx;

        for (idx = 0; idx < hugetlb_max_hstate; idx++) {
                if (page_counter_read(&h_cg->hugepage[idx]))
                        return true;
        }
        return false;
}

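/*
 * Initialize the per-hstate page counters, chaining each one to the
 * matching counter in the parent cgroup so that charges propagate up
 * the hierarchy. Each limit starts at the largest value that is a
 * whole multiple of the hstate's huge page size.
 */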
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
                                struct hugetlb_cgroup *parent_h_cgroup)
{
        int idx;

        for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
                struct page_counter *counter = &h_cgroup->hugepage[idx];
                struct page_counter *parent = NULL;
                unsigned long limit;
                int ret;

                if (parent_h_cgroup)
                        parent = &parent_h_cgroup->hugepage[idx];
                page_counter_init(counter, parent);

                limit = round_down(PAGE_COUNTER_MAX,
                                   1 << huge_page_order(&hstates[idx]));
                ret = page_counter_set_max(counter, limit);
                VM_BUG_ON(ret);
        }
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
        struct hugetlb_cgroup *h_cgroup;

        h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
        if (!h_cgroup)
                return ERR_PTR(-ENOMEM);

        if (!parent_h_cgroup)
                root_h_cgroup = h_cgroup;

        hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
        return &h_cgroup->css;
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct hugetlb_cgroup *h_cgroup;

        h_cgroup = hugetlb_cgroup_from_css(css);
        kfree(h_cgroup);
}

/*
 * Should be called with hugetlb_lock held.
 * Since we hold hugetlb_lock, pages cannot be moved off the active list
 * or uncharged from the cgroup, so there is no need to take a page
 * reference or check that the page is active here. This function
 * cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
                                       struct page *page)
{
        unsigned int nr_pages;
        struct page_counter *counter;
        struct hugetlb_cgroup *page_hcg;
        struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

        page_hcg = hugetlb_cgroup_from_page(page);
        /*
         * We can have pages on the active list that belong to no cgroup,
         * i.e., huge pages with fewer than 3 pages. We can safely ignore
         * those pages.
         */
        if (!page_hcg || page_hcg != h_cg)
                goto out;

        nr_pages = compound_nr(page);
        if (!parent) {
                parent = root_h_cgroup;
                /* root has no limit */
                page_counter_charge(&parent->hugepage[idx], nr_pages);
        }
        counter = &h_cg->hugepage[idx];
        /* Take the pages off the local counter */
        page_counter_cancel(counter, nr_pages);

        set_hugetlb_cgroup(page, parent);
out:
        return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
        struct hstate *h;
        struct page *page;
        int idx;

        do {
                /* Restart the hstate index on every pass */
                idx = 0;
                for_each_hstate(h) {
                        spin_lock(&hugetlb_lock);
                        list_for_each_entry(page, &h->hugepage_activelist, lru)
                                hugetlb_cgroup_move_parent(idx, h_cg, page);

                        spin_unlock(&hugetlb_lock);
                        idx++;
                }
                cond_resched();
        } while (hugetlb_cgroup_have_usage(h_cg));
}

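/*
 * Record a memory event. The local count is bumped only on the cgroup
 * where the event occurred, while the hierarchical count is bumped on
 * that cgroup and every ancestor below the root, mirroring the
 * events vs. events.local split used by the memory controller.
 */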
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
                                 enum hugetlb_memory_event event)
{
        atomic_long_inc(&hugetlb->events_local[idx][event]);
        cgroup_file_notify(&hugetlb->events_local_file[idx]);

        do {
                atomic_long_inc(&hugetlb->events[idx][event]);
                cgroup_file_notify(&hugetlb->events_file[idx]);
        } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
                 !hugetlb_cgroup_is_root(hugetlb));
}

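/*
 * Charge @nr_pages huge pages of hstate @idx to the current task's
 * hugetlb cgroup. On success, returns 0 and sets *@ptr to the cgroup
 * to pass to hugetlb_cgroup_commit_charge() (NULL when no charging is
 * needed); returns -ENOMEM and records a HUGETLB_MAX event when the
 * limit is hit.
 */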
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                 struct hugetlb_cgroup **ptr)
{
        int ret = 0;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = NULL;

        if (hugetlb_cgroup_disabled())
                goto done;
        /*
         * We don't charge any cgroup if the compound page has fewer
         * than 3 pages.
         */
        if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
                goto done;
again:
        rcu_read_lock();
        h_cg = hugetlb_cgroup_from_task(current);
        if (!css_tryget(&h_cg->css)) {
                /* The css is dying; re-read the task's cgroup and retry. */
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
                                     &counter)) {
                ret = -ENOMEM;
                hugetlb_event(h_cg, idx, HUGETLB_MAX);
        }
        css_put(&h_cg->css);
done:
        *ptr = h_cg;
        return ret;
}

/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                  struct hugetlb_cgroup *h_cg,
                                  struct page *page)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        set_hugetlb_cgroup(page, h_cg);
}

/*
 * Should be called with hugetlb_lock held
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
                                  struct page *page)
{
        struct hugetlb_cgroup *h_cg;

        if (hugetlb_cgroup_disabled())
                return;
        lockdep_assert_held(&hugetlb_lock);
        h_cg = hugetlb_cgroup_from_page(page);
        if (unlikely(!h_cg))
                return;
        set_hugetlb_cgroup(page, NULL);
        page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                    struct hugetlb_cgroup *h_cg)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
                return;

        page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
}

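/* Resource attributes encoded in the low bits of cft->private. */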
enum {
        RES_USAGE,
        RES_LIMIT,
        RES_MAX_USAGE,
        RES_FAILCNT,
};

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
                                   struct cftype *cft)
{
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

        counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                return (u64)page_counter_read(counter) * PAGE_SIZE;
        case RES_LIMIT:
                return (u64)counter->max * PAGE_SIZE;
        case RES_MAX_USAGE:
                return (u64)counter->watermark * PAGE_SIZE;
        case RES_FAILCNT:
                return counter->failcnt;
        default:
                BUG();
        }
}

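/*
 * seq_show handler for the cgroup v2 files. Values are reported in
 * bytes; the limit prints "max" while it still sits at its default
 * ceiling.
 */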
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
        int idx;
        u64 val;
        struct cftype *cft = seq_cft(seq);
        unsigned long limit;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

        idx = MEMFILE_IDX(cft->private);
        counter = &h_cg->hugepage[idx];

        limit = round_down(PAGE_COUNTER_MAX,
                           1 << huge_page_order(&hstates[idx]));

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                val = (u64)page_counter_read(counter);
                seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        case RES_LIMIT:
                val = (u64)counter->max;
                if (val == limit)
                        seq_puts(seq, "max\n");
                else
                        seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        default:
                BUG();
        }

        return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

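/*
 * Parse a limit write. The value is rounded down to a whole number of
 * huge pages; @max is the keyword meaning "no limit" ("-1" on the
 * legacy hierarchy, "max" on cgroup v2).
 */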
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off,
                                    const char *max)
{
        int ret, idx;
        unsigned long nr_pages;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

        if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
                return -EINVAL;

        buf = strstrip(buf);
        ret = page_counter_memparse(buf, max, &nr_pages);
        if (ret)
                return ret;

        idx = MEMFILE_IDX(of_cft(of)->private);
        nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_LIMIT:
                mutex_lock(&hugetlb_limit_mutex);
                ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
                mutex_unlock(&hugetlb_limit_mutex);
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
                                        char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        int ret = 0;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

        counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_MAX_USAGE:
                page_counter_reset_watermark(counter);
                break;
        case RES_FAILCNT:
                counter->failcnt = 0;
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

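/*
 * Format a huge page size as a human-readable string, e.g. 2097152
 * becomes "2MB" and 1073741824 becomes "1GB"; used to build the
 * per-size file names below.
 */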
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
        if (hsize >= (1UL << 30))
                snprintf(buf, size, "%luGB", hsize >> 30);
        else if (hsize >= (1UL << 20))
                snprintf(buf, size, "%luMB", hsize >> 20);
        else
                snprintf(buf, size, "%luKB", hsize >> 10);
        return buf;
}

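/* Common backend for the "events" and "events.local" seq_show handlers. */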
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
        int idx;
        long max;
        struct cftype *cft = seq_cft(seq);
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

        idx = MEMFILE_IDX(cft->private);

        if (local)
                max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
        else
                max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

        seq_printf(seq, "max %lu\n", max);

        return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, false);
}

static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, true);
}

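/*
 * Register the cgroup v2 (default hierarchy) files for one hstate. For
 * a 2MB hstate this creates hugetlb.2MB.max, hugetlb.2MB.current,
 * hugetlb.2MB.events and hugetlb.2MB.events.local.
 */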
static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
        char buf[32];
        struct cftype *cft;
        struct hstate *h = &hstates[idx];

        /* format the size */
        mem_fmt(buf, 32, huge_page_size(h));

        /* Add the limit file */
        cft = &h->cgroup_files_dfl[0];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->write = hugetlb_cgroup_write_dfl;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the current usage file */
        cft = &h->cgroup_files_dfl[1];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the events file */
        cft = &h->cgroup_files_dfl[2];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_events_show;
        cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the events.local file */
        cft = &h->cgroup_files_dfl[3];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_events_local_show;
        cft->file_offset = offsetof(struct hugetlb_cgroup,
                                    events_local_file[idx]);
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* NULL terminate the last cft */
        cft = &h->cgroup_files_dfl[4];
        memset(cft, 0, sizeof(*cft));

        WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
                                       h->cgroup_files_dfl));
}

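/*
 * Register the cgroup v1 (legacy hierarchy) files for one hstate. For a
 * 2MB hstate this creates hugetlb.2MB.limit_in_bytes,
 * hugetlb.2MB.usage_in_bytes, hugetlb.2MB.max_usage_in_bytes and
 * hugetlb.2MB.failcnt.
 */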
static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
        char buf[32];
        struct cftype *cft;
        struct hstate *h = &hstates[idx];

        /* format the size */
        mem_fmt(buf, 32, huge_page_size(h));

        /* Add the limit file */
        cft = &h->cgroup_files_legacy[0];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
        cft->read_u64 = hugetlb_cgroup_read_u64;
        cft->write = hugetlb_cgroup_write_legacy;

        /* Add the usage file */
        cft = &h->cgroup_files_legacy[1];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the MAX usage file */
        cft = &h->cgroup_files_legacy[2];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the failcnt file */
        cft = &h->cgroup_files_legacy[3];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* NULL terminate the last cft */
        cft = &h->cgroup_files_legacy[4];
        memset(cft, 0, sizeof(*cft));

        WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
                                          h->cgroup_files_legacy));
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
        __hugetlb_cgroup_file_dfl_init(idx);
        __hugetlb_cgroup_file_legacy_init(idx);
}

void __init hugetlb_cgroup_file_init(void)
{
        struct hstate *h;

        for_each_hstate(h) {
                /*
                 * Add cgroup control files only if the huge page consists
                 * of more than two normal pages. This is because we use
                 * page[2].private for storing cgroup details.
                 */
                if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
                        __hugetlb_cgroup_file_init(hstate_index(h));
        }
}

/*
 * Holding hugetlb_lock ensures that a parallel cgroup rmdir cannot race
 * with hugepage migration.
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
        struct hugetlb_cgroup *h_cg;
        struct hstate *h = page_hstate(oldhpage);

        if (hugetlb_cgroup_disabled())
                return;

        VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
        spin_lock(&hugetlb_lock);
        h_cg = hugetlb_cgroup_from_page(oldhpage);
        set_hugetlb_cgroup(oldhpage, NULL);

        /* move the h_cg details to new cgroup */
        set_hugetlb_cgroup(newhpage, h_cg);
        list_move(&newhpage->lru, &h->hugepage_activelist);
        spin_unlock(&hugetlb_lock);
}

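/*
 * The per-hstate control files are built at boot by
 * hugetlb_cgroup_file_init(); this static array only supplies the
 * mandatory terminating entry.
 */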
static struct cftype hugetlb_files[] = {
        {} /* terminate */
};

struct cgroup_subsys hugetlb_cgrp_subsys = {
        .css_alloc      = hugetlb_cgroup_css_alloc,
        .css_offline    = hugetlb_cgroup_css_offline,
        .css_free       = hugetlb_cgroup_css_free,
        .dfl_cftypes    = hugetlb_files,
        .legacy_cftypes = hugetlb_files,
};