linux/mm/hugetlb_cgroup.c
<<
>>
Prefs
   1/*
   2 *
   3 * Copyright IBM Corporation, 2012
   4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
   5 *
   6 * Cgroup v2
   7 * Copyright (C) 2019 Red Hat, Inc.
   8 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
   9 *
  10 * This program is free software; you can redistribute it and/or modify it
  11 * under the terms of version 2.1 of the GNU Lesser General Public License
  12 * as published by the Free Software Foundation.
  13 *
  14 * This program is distributed in the hope that it would be useful, but
  15 * WITHOUT ANY WARRANTY; without even the implied warranty of
  16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  17 *
  18 */
  19
  20#include <linux/cgroup.h>
  21#include <linux/page_counter.h>
  22#include <linux/slab.h>
  23#include <linux/hugetlb.h>
  24#include <linux/hugetlb_cgroup.h>
  25
  26#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
  27#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
  28#define MEMFILE_ATTR(val)       ((val) & 0xffff)
  29
  30/* Use t->m[0] to encode the offset */
  31#define MEMFILE_OFFSET(t, m0)   (((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
  32#define MEMFILE_OFFSET0(val)    (((val) >> 16) & 0xffff)
  33#define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff)
  34
  35#define DFL_TMPL_SIZE           ARRAY_SIZE(hugetlb_dfl_tmpl)
  36#define LEGACY_TMPL_SIZE        ARRAY_SIZE(hugetlb_legacy_tmpl)
  37
  38static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
  39static struct cftype *dfl_files;
  40static struct cftype *legacy_files;
  41
  42static inline struct page_counter *
  43__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
  44                                     bool rsvd)
  45{
  46        if (rsvd)
  47                return &h_cg->rsvd_hugepage[idx];
  48        return &h_cg->hugepage[idx];
  49}
  50
  51static inline struct page_counter *
  52hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
  53{
  54        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
  55}
  56
  57static inline struct page_counter *
  58hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
  59{
  60        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
  61}
  62
  63static inline
  64struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
  65{
  66        return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
  67}
  68
  69static inline
  70struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
  71{
  72        return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
  73}
  74
  75static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
  76{
  77        return (h_cg == root_h_cgroup);
  78}
  79
  80static inline struct hugetlb_cgroup *
  81parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
  82{
  83        return hugetlb_cgroup_from_css(h_cg->css.parent);
  84}
  85
  86static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
  87{
  88        struct hstate *h;
  89
  90        for_each_hstate(h) {
  91                if (page_counter_read(
  92                    hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
  93                        return true;
  94        }
  95        return false;
  96}
  97
  98static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
  99                                struct hugetlb_cgroup *parent_h_cgroup)
 100{
 101        int idx;
 102
 103        for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
 104                struct page_counter *fault, *fault_parent = NULL;
 105                struct page_counter *rsvd, *rsvd_parent = NULL;
 106                unsigned long limit;
 107
 108                if (parent_h_cgroup) {
 109                        fault_parent = hugetlb_cgroup_counter_from_cgroup(
 110                                parent_h_cgroup, idx);
 111                        rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
 112                                parent_h_cgroup, idx);
 113                }
 114                fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
 115                rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);
 116
 117                page_counter_init(fault, fault_parent, false);
 118                page_counter_init(rsvd, rsvd_parent, false);
 119
 120                if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
 121                        fault->track_failcnt = true;
 122                        rsvd->track_failcnt = true;
 123                }
 124
 125                limit = round_down(PAGE_COUNTER_MAX,
 126                                   pages_per_huge_page(&hstates[idx]));
 127
 128                VM_BUG_ON(page_counter_set_max(fault, limit));
 129                VM_BUG_ON(page_counter_set_max(rsvd, limit));
 130        }
 131}
 132
 133static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
 134{
 135        int node;
 136
 137        for_each_node(node)
 138                kfree(h_cgroup->nodeinfo[node]);
 139        kfree(h_cgroup);
 140}
 141
 142static struct cgroup_subsys_state *
 143hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 144{
 145        struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
 146        struct hugetlb_cgroup *h_cgroup;
 147        int node;
 148
 149        h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
 150                           GFP_KERNEL);
 151
 152        if (!h_cgroup)
 153                return ERR_PTR(-ENOMEM);
 154
 155        if (!parent_h_cgroup)
 156                root_h_cgroup = h_cgroup;
 157
 158        /*
 159         * TODO: this routine can waste much memory for nodes which will
 160         * never be onlined. It's better to use memory hotplug callback
 161         * function.
 162         */
 163        for_each_node(node) {
 164                /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
 165                int node_to_alloc =
 166                        node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
 167                h_cgroup->nodeinfo[node] =
 168                        kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
 169                                     GFP_KERNEL, node_to_alloc);
 170                if (!h_cgroup->nodeinfo[node])
 171                        goto fail_alloc_nodeinfo;
 172        }
 173
 174        hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
 175        return &h_cgroup->css;
 176
 177fail_alloc_nodeinfo:
 178        hugetlb_cgroup_free(h_cgroup);
 179        return ERR_PTR(-ENOMEM);
 180}
 181
 182static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
 183{
 184        hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
 185}
 186
 187/*
 188 * Should be called with hugetlb_lock held.
 189 * Since we are holding hugetlb_lock, pages cannot get moved from
 190 * active list or uncharged from the cgroup, So no need to get
 191 * page reference and test for page active here. This function
 192 * cannot fail.
 193 */
 194static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
 195                                       struct folio *folio)
 196{
 197        unsigned int nr_pages;
 198        struct page_counter *counter;
 199        struct hugetlb_cgroup *hcg;
 200        struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
 201
 202        hcg = hugetlb_cgroup_from_folio(folio);
 203        /*
 204         * We can have pages in active list without any cgroup
 205         * ie, hugepage with less than 3 pages. We can safely
 206         * ignore those pages.
 207         */
 208        if (!hcg || hcg != h_cg)
 209                goto out;
 210
 211        nr_pages = folio_nr_pages(folio);
 212        if (!parent) {
 213                parent = root_h_cgroup;
 214                /* root has no limit */
 215                page_counter_charge(&parent->hugepage[idx], nr_pages);
 216        }
 217        counter = &h_cg->hugepage[idx];
 218        /* Take the pages off the local counter */
 219        page_counter_cancel(counter, nr_pages);
 220
 221        set_hugetlb_cgroup(folio, parent);
 222out:
 223        return;
 224}
 225
 226/*
 227 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 228 * the parent cgroup.
 229 */
 230static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
 231{
 232        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
 233        struct hstate *h;
 234        struct folio *folio;
 235
 236        do {
 237                for_each_hstate(h) {
 238                        spin_lock_irq(&hugetlb_lock);
 239                        list_for_each_entry(folio, &h->hugepage_activelist, lru)
 240                                hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);
 241
 242                        spin_unlock_irq(&hugetlb_lock);
 243                }
 244                cond_resched();
 245        } while (hugetlb_cgroup_have_usage(h_cg));
 246}
 247
 248static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
 249                                 enum hugetlb_memory_event event)
 250{
 251        atomic_long_inc(&hugetlb->events_local[idx][event]);
 252        cgroup_file_notify(&hugetlb->events_local_file[idx]);
 253
 254        do {
 255                atomic_long_inc(&hugetlb->events[idx][event]);
 256                cgroup_file_notify(&hugetlb->events_file[idx]);
 257        } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
 258                 !hugetlb_cgroup_is_root(hugetlb));
 259}
 260
 261static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 262                                          struct hugetlb_cgroup **ptr,
 263                                          bool rsvd)
 264{
 265        int ret = 0;
 266        struct page_counter *counter;
 267        struct hugetlb_cgroup *h_cg = NULL;
 268
 269        if (hugetlb_cgroup_disabled())
 270                goto done;
 271again:
 272        rcu_read_lock();
 273        h_cg = hugetlb_cgroup_from_task(current);
 274        if (!css_tryget(&h_cg->css)) {
 275                rcu_read_unlock();
 276                goto again;
 277        }
 278        rcu_read_unlock();
 279
 280        if (!page_counter_try_charge(
 281                    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
 282                    nr_pages, &counter)) {
 283                ret = -ENOMEM;
 284                hugetlb_event(h_cg, idx, HUGETLB_MAX);
 285                css_put(&h_cg->css);
 286                goto done;
 287        }
 288        /* Reservations take a reference to the css because they do not get
 289         * reparented.
 290         */
 291        if (!rsvd)
 292                css_put(&h_cg->css);
 293done:
 294        *ptr = h_cg;
 295        return ret;
 296}
 297
 298int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 299                                 struct hugetlb_cgroup **ptr)
 300{
 301        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
 302}
 303
 304int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
 305                                      struct hugetlb_cgroup **ptr)
 306{
 307        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
 308}
 309
 310/* Should be called with hugetlb_lock held */
 311static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
 312                                           struct hugetlb_cgroup *h_cg,
 313                                           struct folio *folio, bool rsvd)
 314{
 315        if (hugetlb_cgroup_disabled() || !h_cg)
 316                return;
 317        lockdep_assert_held(&hugetlb_lock);
 318        __set_hugetlb_cgroup(folio, h_cg, rsvd);
 319        if (!rsvd) {
 320                unsigned long usage =
 321                        h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
 322                /*
 323                 * This write is not atomic due to fetching usage and writing
 324                 * to it, but that's fine because we call this with
 325                 * hugetlb_lock held anyway.
 326                 */
 327                WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
 328                           usage + nr_pages);
 329        }
 330}
 331
 332void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
 333                                  struct hugetlb_cgroup *h_cg,
 334                                  struct folio *folio)
 335{
 336        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
 337}
 338
 339void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
 340                                       struct hugetlb_cgroup *h_cg,
 341                                       struct folio *folio)
 342{
 343        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
 344}
 345
 346/*
 347 * Should be called with hugetlb_lock held
 348 */
 349static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
 350                                           struct folio *folio, bool rsvd)
 351{
 352        struct hugetlb_cgroup *h_cg;
 353
 354        if (hugetlb_cgroup_disabled())
 355                return;
 356        lockdep_assert_held(&hugetlb_lock);
 357        h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
 358        if (unlikely(!h_cg))
 359                return;
 360        __set_hugetlb_cgroup(folio, NULL, rsvd);
 361
 362        page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
 363                                                                   rsvd),
 364                              nr_pages);
 365
 366        if (rsvd)
 367                css_put(&h_cg->css);
 368        else {
 369                unsigned long usage =
 370                        h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
 371                /*
 372                 * This write is not atomic due to fetching usage and writing
 373                 * to it, but that's fine because we call this with
 374                 * hugetlb_lock held anyway.
 375                 */
 376                WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
 377                           usage - nr_pages);
 378        }
 379}
 380
 381void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
 382                                  struct folio *folio)
 383{
 384        __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
 385}
 386
 387void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
 388                                       struct folio *folio)
 389{
 390        __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
 391}
 392
 393static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 394                                             struct hugetlb_cgroup *h_cg,
 395                                             bool rsvd)
 396{
 397        if (hugetlb_cgroup_disabled() || !h_cg)
 398                return;
 399
 400        page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
 401                                                                   rsvd),
 402                              nr_pages);
 403
 404        if (rsvd)
 405                css_put(&h_cg->css);
 406}
 407
 408void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 409                                    struct hugetlb_cgroup *h_cg)
 410{
 411        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
 412}
 413
 414void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
 415                                         struct hugetlb_cgroup *h_cg)
 416{
 417        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
 418}
 419
 420void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
 421                                     unsigned long end)
 422{
 423        if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
 424            !resv->css)
 425                return;
 426
 427        page_counter_uncharge(resv->reservation_counter,
 428                              (end - start) * resv->pages_per_hpage);
 429        css_put(resv->css);
 430}
 431
 432void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
 433                                         struct file_region *rg,
 434                                         unsigned long nr_pages,
 435                                         bool region_del)
 436{
 437        if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
 438                return;
 439
 440        if (rg->reservation_counter && resv->pages_per_hpage &&
 441            !resv->reservation_counter) {
 442                page_counter_uncharge(rg->reservation_counter,
 443                                      nr_pages * resv->pages_per_hpage);
 444                /*
 445                 * Only do css_put(rg->css) when we delete the entire region
 446                 * because one file_region must hold exactly one css reference.
 447                 */
 448                if (region_del)
 449                        css_put(rg->css);
 450        }
 451}
 452
 453enum {
 454        RES_USAGE,
 455        RES_RSVD_USAGE,
 456        RES_LIMIT,
 457        RES_RSVD_LIMIT,
 458        RES_MAX_USAGE,
 459        RES_RSVD_MAX_USAGE,
 460        RES_FAILCNT,
 461        RES_RSVD_FAILCNT,
 462};
 463
 464static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
 465{
 466        int nid;
 467        struct cftype *cft = seq_cft(seq);
 468        int idx = MEMFILE_IDX(cft->private);
 469        bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
 470        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
 471        struct cgroup_subsys_state *css;
 472        unsigned long usage;
 473
 474        if (legacy) {
 475                /* Add up usage across all nodes for the non-hierarchical total. */
 476                usage = 0;
 477                for_each_node_state(nid, N_MEMORY)
 478                        usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
 479                seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
 480
 481                /* Simply print the per-node usage for the non-hierarchical total. */
 482                for_each_node_state(nid, N_MEMORY)
 483                        seq_printf(seq, " N%d=%lu", nid,
 484                                   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
 485                                           PAGE_SIZE);
 486                seq_putc(seq, '\n');
 487        }
 488
 489        /*
 490         * The hierarchical total is pretty much the value recorded by the
 491         * counter, so use that.
 492         */
 493        seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
 494                   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
 495
 496        /*
 497         * For each node, transverse the css tree to obtain the hierarchical
 498         * node usage.
 499         */
 500        for_each_node_state(nid, N_MEMORY) {
 501                usage = 0;
 502                rcu_read_lock();
 503                css_for_each_descendant_pre(css, &h_cg->css) {
 504                        usage += READ_ONCE(hugetlb_cgroup_from_css(css)
 505                                                   ->nodeinfo[nid]
 506                                                   ->usage[idx]);
 507                }
 508                rcu_read_unlock();
 509                seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
 510        }
 511
 512        seq_putc(seq, '\n');
 513
 514        return 0;
 515}
 516
 517static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
 518                                   struct cftype *cft)
 519{
 520        struct page_counter *counter;
 521        struct page_counter *rsvd_counter;
 522        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
 523
 524        counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
 525        rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
 526
 527        switch (MEMFILE_ATTR(cft->private)) {
 528        case RES_USAGE:
 529                return (u64)page_counter_read(counter) * PAGE_SIZE;
 530        case RES_RSVD_USAGE:
 531                return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
 532        case RES_LIMIT:
 533                return (u64)counter->max * PAGE_SIZE;
 534        case RES_RSVD_LIMIT:
 535                return (u64)rsvd_counter->max * PAGE_SIZE;
 536        case RES_MAX_USAGE:
 537                return (u64)counter->watermark * PAGE_SIZE;
 538        case RES_RSVD_MAX_USAGE:
 539                return (u64)rsvd_counter->watermark * PAGE_SIZE;
 540        case RES_FAILCNT:
 541                return counter->failcnt;
 542        case RES_RSVD_FAILCNT:
 543                return rsvd_counter->failcnt;
 544        default:
 545                BUG();
 546        }
 547}
 548
 549static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
 550{
 551        int idx;
 552        u64 val;
 553        struct cftype *cft = seq_cft(seq);
 554        unsigned long limit;
 555        struct page_counter *counter;
 556        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
 557
 558        idx = MEMFILE_IDX(cft->private);
 559        counter = &h_cg->hugepage[idx];
 560
 561        limit = round_down(PAGE_COUNTER_MAX,
 562                           pages_per_huge_page(&hstates[idx]));
 563
 564        switch (MEMFILE_ATTR(cft->private)) {
 565        case RES_RSVD_USAGE:
 566                counter = &h_cg->rsvd_hugepage[idx];
 567                fallthrough;
 568        case RES_USAGE:
 569                val = (u64)page_counter_read(counter);
 570                seq_printf(seq, "%llu\n", val * PAGE_SIZE);
 571                break;
 572        case RES_RSVD_LIMIT:
 573                counter = &h_cg->rsvd_hugepage[idx];
 574                fallthrough;
 575        case RES_LIMIT:
 576                val = (u64)counter->max;
 577                if (val == limit)
 578                        seq_puts(seq, "max\n");
 579                else
 580                        seq_printf(seq, "%llu\n", val * PAGE_SIZE);
 581                break;
 582        default:
 583                BUG();
 584        }
 585
 586        return 0;
 587}
 588
 589static DEFINE_MUTEX(hugetlb_limit_mutex);
 590
 591static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
 592                                    char *buf, size_t nbytes, loff_t off,
 593                                    const char *max)
 594{
 595        int ret, idx;
 596        unsigned long nr_pages;
 597        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
 598        bool rsvd = false;
 599
 600        if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
 601                return -EINVAL;
 602
 603        buf = strstrip(buf);
 604        ret = page_counter_memparse(buf, max, &nr_pages);
 605        if (ret)
 606                return ret;
 607
 608        idx = MEMFILE_IDX(of_cft(of)->private);
 609        nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
 610
 611        switch (MEMFILE_ATTR(of_cft(of)->private)) {
 612        case RES_RSVD_LIMIT:
 613                rsvd = true;
 614                fallthrough;
 615        case RES_LIMIT:
 616                mutex_lock(&hugetlb_limit_mutex);
 617                ret = page_counter_set_max(
 618                        __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
 619                        nr_pages);
 620                mutex_unlock(&hugetlb_limit_mutex);
 621                break;
 622        default:
 623                ret = -EINVAL;
 624                break;
 625        }
 626        return ret ?: nbytes;
 627}
 628
 629static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
 630                                           char *buf, size_t nbytes, loff_t off)
 631{
 632        return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
 633}
 634
 635static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
 636                                        char *buf, size_t nbytes, loff_t off)
 637{
 638        return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
 639}
 640
 641static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
 642                                    char *buf, size_t nbytes, loff_t off)
 643{
 644        int ret = 0;
 645        struct page_counter *counter, *rsvd_counter;
 646        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
 647
 648        counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
 649        rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
 650
 651        switch (MEMFILE_ATTR(of_cft(of)->private)) {
 652        case RES_MAX_USAGE:
 653                page_counter_reset_watermark(counter);
 654                break;
 655        case RES_RSVD_MAX_USAGE:
 656                page_counter_reset_watermark(rsvd_counter);
 657                break;
 658        case RES_FAILCNT:
 659                counter->failcnt = 0;
 660                break;
 661        case RES_RSVD_FAILCNT:
 662                rsvd_counter->failcnt = 0;
 663                break;
 664        default:
 665                ret = -EINVAL;
 666                break;
 667        }
 668        return ret ?: nbytes;
 669}
 670
 671static char *mem_fmt(char *buf, int size, unsigned long hsize)
 672{
 673        if (hsize >= SZ_1G)
 674                snprintf(buf, size, "%luGB", hsize / SZ_1G);
 675        else if (hsize >= SZ_1M)
 676                snprintf(buf, size, "%luMB", hsize / SZ_1M);
 677        else
 678                snprintf(buf, size, "%luKB", hsize / SZ_1K);
 679        return buf;
 680}
 681
 682static int __hugetlb_events_show(struct seq_file *seq, bool local)
 683{
 684        int idx;
 685        long max;
 686        struct cftype *cft = seq_cft(seq);
 687        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
 688
 689        idx = MEMFILE_IDX(cft->private);
 690
 691        if (local)
 692                max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
 693        else
 694                max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);
 695
 696        seq_printf(seq, "max %lu\n", max);
 697
 698        return 0;
 699}
 700
 701static int hugetlb_events_show(struct seq_file *seq, void *v)
 702{
 703        return __hugetlb_events_show(seq, false);
 704}
 705
 706static int hugetlb_events_local_show(struct seq_file *seq, void *v)
 707{
 708        return __hugetlb_events_show(seq, true);
 709}
 710
 711static struct cftype hugetlb_dfl_tmpl[] = {
 712        {
 713                .name = "max",
 714                .private = RES_LIMIT,
 715                .seq_show = hugetlb_cgroup_read_u64_max,
 716                .write = hugetlb_cgroup_write_dfl,
 717                .flags = CFTYPE_NOT_ON_ROOT,
 718        },
 719        {
 720                .name = "rsvd.max",
 721                .private = RES_RSVD_LIMIT,
 722                .seq_show = hugetlb_cgroup_read_u64_max,
 723                .write = hugetlb_cgroup_write_dfl,
 724                .flags = CFTYPE_NOT_ON_ROOT,
 725        },
 726        {
 727                .name = "current",
 728                .private = RES_USAGE,
 729                .seq_show = hugetlb_cgroup_read_u64_max,
 730                .flags = CFTYPE_NOT_ON_ROOT,
 731        },
 732        {
 733                .name = "rsvd.current",
 734                .private = RES_RSVD_USAGE,
 735                .seq_show = hugetlb_cgroup_read_u64_max,
 736                .flags = CFTYPE_NOT_ON_ROOT,
 737        },
 738        {
 739                .name = "events",
 740                .seq_show = hugetlb_events_show,
 741                .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]),
 742                .flags = CFTYPE_NOT_ON_ROOT,
 743        },
 744        {
 745                .name = "events.local",
 746                .seq_show = hugetlb_events_local_show,
 747                .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]),
 748                .flags = CFTYPE_NOT_ON_ROOT,
 749        },
 750        {
 751                .name = "numa_stat",
 752                .seq_show = hugetlb_cgroup_read_numa_stat,
 753                .flags = CFTYPE_NOT_ON_ROOT,
 754        },
 755        /* don't need terminator here */
 756};
 757
 758static struct cftype hugetlb_legacy_tmpl[] = {
 759        {
 760                .name = "limit_in_bytes",
 761                .private = RES_LIMIT,
 762                .read_u64 = hugetlb_cgroup_read_u64,
 763                .write = hugetlb_cgroup_write_legacy,
 764        },
 765        {
 766                .name = "rsvd.limit_in_bytes",
 767                .private = RES_RSVD_LIMIT,
 768                .read_u64 = hugetlb_cgroup_read_u64,
 769                .write = hugetlb_cgroup_write_legacy,
 770        },
 771        {
 772                .name = "usage_in_bytes",
 773                .private = RES_USAGE,
 774                .read_u64 = hugetlb_cgroup_read_u64,
 775        },
 776        {
 777                .name = "rsvd.usage_in_bytes",
 778                .private = RES_RSVD_USAGE,
 779                .read_u64 = hugetlb_cgroup_read_u64,
 780        },
 781        {
 782                .name = "max_usage_in_bytes",
 783                .private = RES_MAX_USAGE,
 784                .write = hugetlb_cgroup_reset,
 785                .read_u64 = hugetlb_cgroup_read_u64,
 786        },
 787        {
 788                .name = "rsvd.max_usage_in_bytes",
 789                .private = RES_RSVD_MAX_USAGE,
 790                .write = hugetlb_cgroup_reset,
 791                .read_u64 = hugetlb_cgroup_read_u64,
 792        },
 793        {
 794                .name = "failcnt",
 795                .private = RES_FAILCNT,
 796                .write = hugetlb_cgroup_reset,
 797                .read_u64 = hugetlb_cgroup_read_u64,
 798        },
 799        {
 800                .name = "rsvd.failcnt",
 801                .private = RES_RSVD_FAILCNT,
 802                .write = hugetlb_cgroup_reset,
 803                .read_u64 = hugetlb_cgroup_read_u64,
 804        },
 805        {
 806                .name = "numa_stat",
 807                .seq_show = hugetlb_cgroup_read_numa_stat,
 808        },
 809        /* don't need terminator here */
 810};
 811
 812static void __init
 813hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
 814                             struct cftype *tmpl, int tmpl_size)
 815{
 816        char buf[32];
 817        int i, idx = hstate_index(h);
 818
 819        /* format the size */
 820        mem_fmt(buf, sizeof(buf), huge_page_size(h));
 821
 822        for (i = 0; i < tmpl_size; cft++, tmpl++, i++) {
 823                *cft = *tmpl;
 824                /* rebuild the name */
 825                snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
 826                /* rebuild the private */
 827                cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
 828                /* rebuild the file_offset */
 829                if (tmpl->file_offset) {
 830                        unsigned int offset = tmpl->file_offset;
 831
 832                        cft->file_offset = MEMFILE_OFFSET0(offset) +
 833                                           MEMFILE_FIELD_SIZE(offset) * idx;
 834                }
 835
 836                lockdep_register_key(&cft->lockdep_key);
 837        }
 838}
 839
 840static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
 841{
 842        int idx = hstate_index(h);
 843
 844        hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE,
 845                                     hugetlb_dfl_tmpl, DFL_TMPL_SIZE);
 846}
 847
 848static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
 849{
 850        int idx = hstate_index(h);
 851
 852        hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE,
 853                                     hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE);
 854}
 855
 856static void __init __hugetlb_cgroup_file_init(struct hstate *h)
 857{
 858        __hugetlb_cgroup_file_dfl_init(h);
 859        __hugetlb_cgroup_file_legacy_init(h);
 860}
 861
 862static void __init __hugetlb_cgroup_file_pre_init(void)
 863{
 864        int cft_count;
 865
 866        cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */
 867        dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
 868        BUG_ON(!dfl_files);
 869        cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */
 870        legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
 871        BUG_ON(!legacy_files);
 872}
 873
 874static void __init __hugetlb_cgroup_file_post_init(void)
 875{
 876        WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
 877                                       dfl_files));
 878        WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
 879                                          legacy_files));
 880}
 881
 882void __init hugetlb_cgroup_file_init(void)
 883{
 884        struct hstate *h;
 885
 886        __hugetlb_cgroup_file_pre_init();
 887        for_each_hstate(h)
 888                __hugetlb_cgroup_file_init(h);
 889        __hugetlb_cgroup_file_post_init();
 890}
 891
 892/*
 893 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 894 * when we migrate hugepages
 895 */
 896void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
 897{
 898        struct hugetlb_cgroup *h_cg;
 899        struct hugetlb_cgroup *h_cg_rsvd;
 900        struct hstate *h = folio_hstate(old_folio);
 901
 902        if (hugetlb_cgroup_disabled())
 903                return;
 904
 905        spin_lock_irq(&hugetlb_lock);
 906        h_cg = hugetlb_cgroup_from_folio(old_folio);
 907        h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
 908        set_hugetlb_cgroup(old_folio, NULL);
 909        set_hugetlb_cgroup_rsvd(old_folio, NULL);
 910
 911        /* move the h_cg details to new cgroup */
 912        set_hugetlb_cgroup(new_folio, h_cg);
 913        set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
 914        list_move(&new_folio->lru, &h->hugepage_activelist);
 915        spin_unlock_irq(&hugetlb_lock);
 916}
 917
 918static struct cftype hugetlb_files[] = {
 919        {} /* terminate */
 920};
 921
 922struct cgroup_subsys hugetlb_cgrp_subsys = {
 923        .css_alloc      = hugetlb_cgroup_css_alloc,
 924        .css_offline    = hugetlb_cgroup_css_offline,
 925        .css_free       = hugetlb_cgroup_css_free,
 926        .dfl_cftypes    = hugetlb_files,
 927        .legacy_cftypes = hugetlb_files,
 928};
 929