linux/kernel/cgroup/rstat.c
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup *parent;
        unsigned long flags;

        /* nothing to do for root */
        if (!cgroup_parent(cgrp))
                return;

        /*
         * Speculative already-on-list test. This may race leading to
         * temporary inaccuracies, which is fine.
         *
         * Because @parent's updated_children is terminated with @parent
         * instead of NULL, we can tell whether @cgrp is on the list by
         * testing the next pointer for NULL.
         */
        if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
                return;

        raw_spin_lock_irqsave(cpu_lock, flags);

        /* put @cgrp and all ancestors on the corresponding updated lists */
        for (parent = cgroup_parent(cgrp); parent;
             cgrp = parent, parent = cgroup_parent(cgrp)) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);

                /*
                 * Both additions and removals are bottom-up.  If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;
        }

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}
EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
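
/*
 * Illustrative sketch (not part of the kernel source): a typical write-side
 * caller pairs a cheap per-cpu counter update with cgroup_rstat_updated().
 * The foo_* names below are hypothetical; only cgroup_rstat_updated() is
 * real API.
 */
struct foo_cgroup_pcpu_stat {
        u64 nr_events;
};

static void __maybe_unused foo_account_event(struct cgroup *cgrp,
                                             struct foo_cgroup_pcpu_stat __percpu *pcpu)
{
        int cpu = get_cpu();    /* pin the cpu; disables preemption */

        __this_cpu_inc(pcpu->nr_events);
        /* flag @cgrp as having pending per-cpu state on this cpu */
        cgroup_rstat_updated(cgrp, cpu);
        put_cpu();
}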

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
{
        struct cgroup_rstat_cpu *rstatc;

        if (pos == root)
                return NULL;

        /*
         * We're going to walk down to the first leaf and visit/remove it.
         * We can pick any unvisited node as the starting point.
         */
        if (!pos)
                pos = root;
        else
                pos = cgroup_parent(pos);

        /* walk down to the first leaf */
        while (true) {
                rstatc = cgroup_rstat_cpu(pos, cpu);
                if (rstatc->updated_children == pos)
                        break;
                pos = rstatc->updated_children;
        }

        /*
         * Unlink @pos from the tree.  As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         * However, due to the way we traverse, @pos will be the first
         * child in most cases. The only exception is @root.
         */
        if (rstatc->updated_next) {
                struct cgroup *parent = cgroup_parent(pos);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
                struct cgroup_rstat_cpu *nrstatc;
                struct cgroup **nextp;

                nextp = &prstatc->updated_children;
                while (true) {
                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        if (*nextp == pos)
                                break;

                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }

                *nextp = rstatc->updated_next;
                rstatc->updated_next = NULL;

                return pos;
        }

        /* only happens for @root */
        return NULL;
}
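
/*
 * For example (hypothetical hierarchy): with flush root R, child A and
 * grandchildren of A that were all marked updated on a cpu, repeated calls
 * starting from pos == NULL return the grandchildren first (each unlinked
 * as it is returned), then A, then R itself if it was on its own parent's
 * updated list, and finally NULL -- every child strictly before its parent.
 */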

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
                                                       cpu);
                struct cgroup *pos = NULL;

                raw_spin_lock(cpu_lock);
                while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
                        struct cgroup_subsys_state *css;

                        cgroup_base_stat_flush(pos, cpu);

                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();
                }
                raw_spin_unlock(cpu_lock);

                /* if @may_sleep, play nice and yield if necessary */
                if (may_sleep && (need_resched() ||
                                  spin_needbreak(&cgroup_rstat_lock))) {
                        spin_unlock_irq(&cgroup_rstat_lock);
                        if (!cond_resched())
                                cpu_relax();
                        spin_lock_irq(&cgroup_rstat_lock);
                }
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
        might_sleep();

        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
        spin_unlock_irq(&cgroup_rstat_lock);
}
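
/*
 * Illustrative sketch (not part of the kernel source): the usual read-side
 * pairing.  A hypothetical controller implements ->css_rstat_flush() to fold
 * the per-cpu delta for @cpu into a global counter, and its stat read path
 * calls cgroup_rstat_flush() first so the counter is up to date.  All foo_*
 * names are made up, and propagation to the parent cgroup (cf.
 * cgroup_base_stat_flush() below) is omitted for brevity.
 */
struct foo_cgroup_pcpu {
        u64 nr_events;
        u64 last_nr_events;     /* portion already folded into the total */
};

struct foo_cgroup {
        struct cgroup_subsys_state css;
        struct foo_cgroup_pcpu __percpu *pcpu;
        u64 nr_events;          /* aggregated total */
};

static void __maybe_unused foo_css_rstat_flush(struct cgroup_subsys_state *css,
                                               int cpu)
{
        struct foo_cgroup *foo = container_of(css, struct foo_cgroup, css);
        struct foo_cgroup_pcpu *pc = per_cpu_ptr(foo->pcpu, cpu);

        /* fold only what accumulated since the last flush of this cpu */
        foo->nr_events += pc->nr_events - pc->last_nr_events;
        pc->last_nr_events = pc->nr_events;
}

static u64 __maybe_unused foo_read_events(struct foo_cgroup *foo)
{
        /* fold every pending per-cpu update in @foo's subtree first */
        cgroup_rstat_flush(foo->css.cgroup);
        return foo->nr_events;
}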

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
        unsigned long flags;

        spin_lock_irqsave(&cgroup_rstat_lock, flags);
        cgroup_rstat_flush_locked(cgrp, false);
        spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
        __releases(&cgroup_rstat_lock)
{
        spin_unlock_irq(&cgroup_rstat_lock);
}
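
/*
 * Illustrative sketch (not part of the kernel source): hold/release bracket
 * the read of several flushed counters so they all come from one flush.
 * cgroup_base_stat_cputime_show() below is the in-tree user of this pattern;
 * foo_cputime_snapshot() is hypothetical.
 */
static void __maybe_unused foo_cputime_snapshot(struct cgroup *cgrp,
                                                u64 *runtime, u64 *utime)
{
        cgroup_rstat_flush_hold(cgrp);
        *runtime = cgrp->bstat.cputime.sum_exec_runtime;
        *utime = cgrp->bstat.cputime.utime;
        cgroup_rstat_flush_release();
}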

int cgroup_rstat_init(struct cgroup *cgrp)
{
        int cpu;

        /* the root cgrp has rstat_cpu preallocated */
        if (!cgrp->rstat_cpu) {
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
                if (!cgrp->rstat_cpu)
                        return -ENOMEM;
        }

        /* ->updated_children list is self terminated */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                rstatc->updated_children = cgrp;
                u64_stats_init(&rstatc->bsync);
        }

        return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
                    WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));

        BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup_base_stat cur, delta;
        unsigned seq;

        /* fetch the current per-cpu values */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                cur.cputime = rstatc->bstat.cputime;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate percpu delta to global */
        delta = cur;
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);

        /* propagate global delta to parent */
        if (parent) {
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);
        }
}
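
/*
 * Worked example for the function above (made-up numbers): if this cpu's
 * bstat.cputime.utime reads 700ns and last_bstat recorded 500ns at the
 * previous flush, the 200ns delta is added to @cgrp->bstat and last_bstat
 * becomes 700ns.  The same trick is then replayed one level up: whatever
 * @cgrp->bstat gained beyond @cgrp->last_bstat is added to the parent's
 * bstat, so each tick is counted exactly once at every level.
 */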

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = get_cpu_ptr(cgrp->rstat_cpu);
        u64_stats_update_begin(&rstatc->bsync);
        return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc)
{
        u64_stats_update_end(&rstatc->bsync);
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp);

        switch (index) {
        case CPUTIME_USER:
        case CPUTIME_NICE:
                rstatc->bstat.cputime.utime += delta_exec;
                break;
        case CPUTIME_SYSTEM:
        case CPUTIME_IRQ:
        case CPUTIME_SOFTIRQ:
                rstatc->bstat.cputime.stime += delta_exec;
                break;
        default:
                break;
        }

        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}
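
/*
 * For example, time charged with index == CPUTIME_IRQ is folded into stime
 * and CPUTIME_NICE time lands in utime, while any other category hits the
 * default case above and is not added to either field.
 */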

/*
 * Compute the cputime for the root cgroup by fetching the per-cpu data at
 * the global (kcpustat) level, then categorizing the fields in the same
 * way __cgroup_account_cputime_field() categorizes each bit of cpu time
 * attributed to a cgroup.
 */
static void root_cgroup_cputime(struct task_cputime *cputime)
{
        int i;

        cputime->stime = 0;
        cputime->utime = 0;
        cputime->sum_exec_runtime = 0;
        for_each_possible_cpu(i) {
                struct kernel_cpustat kcpustat;
                u64 *cpustat = kcpustat.cpustat;
                u64 user = 0;
                u64 sys = 0;

                kcpustat_cpu_fetch(&kcpustat, i);

                user += cpustat[CPUTIME_USER];
                user += cpustat[CPUTIME_NICE];
                cputime->utime += user;

                sys += cpustat[CPUTIME_SYSTEM];
                sys += cpustat[CPUTIME_IRQ];
                sys += cpustat[CPUTIME_SOFTIRQ];
                cputime->stime += sys;

                cputime->sum_exec_runtime += user;
                cputime->sum_exec_runtime += sys;
                cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
                cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
                cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
        }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        u64 usage, utime, stime;
        struct task_cputime cputime;

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
                cgroup_rstat_flush_release();
        } else {
                root_cgroup_cputime(&cputime);
                usage = cputime.sum_exec_runtime;
                utime = cputime.utime;
                stime = cputime.stime;
        }

        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);
}
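
/*
 * The resulting cpu.stat output has the form (made-up example values):
 *
 *      usage_usec 527198
 *      user_usec 341618
 *      system_usec 185580
 */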