linux/kernel/cgroup/rstat.c
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        unsigned long flags;

        /*
         * Speculative already-on-list test. This may race leading to
         * temporary inaccuracies, which is fine.
         *
         * Because @parent's updated_children is terminated with @parent
         * instead of NULL, we can tell whether @cgrp is on the list by
         * testing the next pointer for NULL.
         */
        if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
                return;

        raw_spin_lock_irqsave(cpu_lock, flags);

        /* put @cgrp and all ancestors on the corresponding updated lists */
        while (true) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup *parent = cgroup_parent(cgrp);
                struct cgroup_rstat_cpu *prstatc;

                /*
                 * Both additions and removals are bottom-up.  If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                /* Root has no parent to link it to, but mark it busy */
                if (!parent) {
                        rstatc->updated_next = cgrp;
                        break;
                }

                prstatc = cgroup_rstat_cpu(parent, cpu);
                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;

                cgrp = parent;
        }

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}
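
/*
 * Illustrative sketch (not part of the original file): how a caller that
 * maintains a per-cpu statistic would pair the update with
 * cgroup_rstat_updated() so the next flush knows this cpu is dirty.  The
 * counter and the function below are hypothetical; it is kept as a plain
 * per-cpu variable only to keep the sketch self-contained, whereas a real
 * controller stores its counters per cgroup.
 */
#if 0
static DEFINE_PER_CPU(u64, nr_hypothetical_events);

static void hypothetical_count_event(struct cgroup *cgrp)
{
        int cpu = get_cpu();

        /* bump the (hypothetical) per-cpu counter */
        this_cpu_inc(nr_hypothetical_events);
        /* mark @cgrp as having pending per-cpu updates on @cpu */
        cgroup_rstat_updated(cgrp, cpu);
        put_cpu();
}
#endif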

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
{
        struct cgroup_rstat_cpu *rstatc;

        if (pos == root)
                return NULL;

        /*
         * We're going to walk down to the first leaf and visit/remove it.
         * We can pick any unvisited node as the starting point.
         */
        if (!pos)
                pos = root;
        else
                pos = cgroup_parent(pos);

        /* walk down to the first leaf */
        while (true) {
                rstatc = cgroup_rstat_cpu(pos, cpu);
                if (rstatc->updated_children == pos)
                        break;
                pos = rstatc->updated_children;
        }

        /*
         * Unlink @pos from the tree.  As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         * However, due to the way we traverse, @pos will be the first
         * child in most cases. The only exception is @root.
         */
        if (rstatc->updated_next) {
                struct cgroup *parent = cgroup_parent(pos);

                if (parent) {
                        struct cgroup_rstat_cpu *prstatc;
                        struct cgroup **nextp;

                        prstatc = cgroup_rstat_cpu(parent, cpu);
                        nextp = &prstatc->updated_children;
                        while (true) {
                                struct cgroup_rstat_cpu *nrstatc;

                                nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                                if (*nextp == pos)
                                        break;
                                WARN_ON_ONCE(*nextp == parent);
                                nextp = &nrstatc->updated_next;
                        }
                        *nextp = rstatc->updated_next;
                }

                rstatc->updated_next = NULL;
                return pos;
        }

        /* only happens for @root */
        return NULL;
}
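
/*
 * Illustrative note (not from the original source): with an updated tree
 * on @cpu of
 *
 *        root
 *        `- A
 *           `- B
 *
 * where root, A and B all have pending updates, repeated calls starting
 * from pos == NULL pop B, then A, then root.  A child is therefore always
 * visited before its parent, which lets the parent absorb the child's
 * deltas within the same flush pass (see cgroup_base_stat_flush()).
 */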

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
                                                       cpu);
                struct cgroup *pos = NULL;

                raw_spin_lock(cpu_lock);
                while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
                        struct cgroup_subsys_state *css;

                        cgroup_base_stat_flush(pos, cpu);

                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();
                }
                raw_spin_unlock(cpu_lock);

                /* if @may_sleep, play nice and yield if necessary */
                if (may_sleep && (need_resched() ||
                                  spin_needbreak(&cgroup_rstat_lock))) {
                        spin_unlock_irq(&cgroup_rstat_lock);
                        if (!cond_resched())
                                cpu_relax();
                        spin_lock_irq(&cgroup_rstat_lock);
                }
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
        might_sleep();

        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
        spin_unlock_irq(&cgroup_rstat_lock);
}
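
/*
 * Illustrative sketch (hypothetical, not part of this file): a controller
 * that keeps per-cpu counters flushes before reporting the aggregate.  The
 * names below are made up; real users are controllers that register a
 * css_rstat_flush callback in their cgroup_subsys.
 */
#if 0
static u64 hypothetical_aggregate_read(struct cgroup *cgrp)
{
        /*
         * Fold every cpu's pending deltas in @cgrp's subtree into the
         * global counters; each subsystem's css_rstat_flush() is invoked
         * for every (css, cpu) pair with pending updates.
         */
        cgroup_rstat_flush(cgrp);

        /* the aggregated counter would be read and returned here */
        return 0;
}
#endif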

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
        unsigned long flags;

        spin_lock_irqsave(&cgroup_rstat_lock, flags);
        cgroup_rstat_flush_locked(cgrp, false);
        spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
        __releases(&cgroup_rstat_lock)
{
        spin_unlock_irq(&cgroup_rstat_lock);
}
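
/*
 * Illustrative usage (see cgroup_base_stat_cputime_show() below for the
 * in-tree user): the hold/release pair keeps cgroup_rstat_lock held so the
 * caller can read several flushed fields consistently.
 *
 *        cgroup_rstat_flush_hold(cgrp);
 *        ... read cgrp->bstat and other flushed state ...
 *        cgroup_rstat_flush_release();
 */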

int cgroup_rstat_init(struct cgroup *cgrp)
{
        int cpu;

        /* the root cgrp has rstat_cpu preallocated */
        if (!cgrp->rstat_cpu) {
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
                if (!cgrp->rstat_cpu)
                        return -ENOMEM;
        }

        /* ->updated_children list is self-terminated */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                rstatc->updated_children = cgrp;
                u64_stats_init(&rstatc->bsync);
        }

        return 0;
}
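
/*
 * Illustrative note (not from the original source): "self-terminated"
 * means an empty per-cpu updated list is encoded as
 * rstatc->updated_children == cgrp rather than NULL, and a cgroup that is
 * not on any list has rstatc->updated_next == NULL.  This is what enables
 * the lockless already-on-list check in cgroup_rstat_updated() above.
 */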

void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
                    WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_base_stat cur, delta;
        unsigned seq;

        /* Root-level stats are sourced from system-wide CPU stats */
        if (!parent)
                return;

        /* fetch the current per-cpu values */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                cur.cputime = rstatc->bstat.cputime;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate percpu delta to global */
        delta = cur;
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);

        /* propagate global delta to parent (unless that's root) */
        if (cgroup_parent(parent)) {
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);
        }
}
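
/*
 * Worked example with illustrative numbers (not from the original source):
 * if the per-cpu utime for @cgrp on @cpu currently reads 150 and
 * rstatc->last_bstat.cputime.utime is 100, the first step computes a delta
 * of 50 and adds it to both cgrp->bstat and rstatc->last_bstat, so the
 * same 50 is not counted again on the next flush.  The second step repeats
 * the delta trick one level up: cgrp->bstat - cgrp->last_bstat is added to
 * the parent and recorded in cgrp->last_bstat.  Combined with the
 * child-before-parent pop order, one flush pass carries new usage all the
 * way up the subtree.
 */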

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = get_cpu_ptr(cgrp->rstat_cpu);
        *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
        return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc,
                                                 unsigned long flags)
{
        u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

        switch (index) {
        case CPUTIME_USER:
        case CPUTIME_NICE:
                rstatc->bstat.cputime.utime += delta_exec;
                break;
        case CPUTIME_SYSTEM:
        case CPUTIME_IRQ:
        case CPUTIME_SOFTIRQ:
                rstatc->bstat.cputime.stime += delta_exec;
                break;
        default:
                break;
        }

        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
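
/*
 * Illustrative note (restating the switch above): deltas tagged
 * CPUTIME_USER or CPUTIME_NICE are folded into utime, deltas tagged
 * CPUTIME_SYSTEM, CPUTIME_IRQ or CPUTIME_SOFTIRQ into stime, and any other
 * index is ignored by this function.  root_cgroup_cputime() below applies
 * the same classification to the system-wide counters.
 */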

/*
 * Compute the cputime for the root cgroup by fetching the system-wide
 * per-cpu data and categorizing the fields in the same way
 * __cgroup_account_cputime_field() does for cpu time attributed to a
 * cgroup.
 */
static void root_cgroup_cputime(struct task_cputime *cputime)
{
        int i;

        cputime->stime = 0;
        cputime->utime = 0;
        cputime->sum_exec_runtime = 0;
        for_each_possible_cpu(i) {
                struct kernel_cpustat kcpustat;
                u64 *cpustat = kcpustat.cpustat;
                u64 user = 0;
                u64 sys = 0;

                kcpustat_cpu_fetch(&kcpustat, i);

                user += cpustat[CPUTIME_USER];
                user += cpustat[CPUTIME_NICE];
                cputime->utime += user;

                sys += cpustat[CPUTIME_SYSTEM];
                sys += cpustat[CPUTIME_IRQ];
                sys += cpustat[CPUTIME_SOFTIRQ];
                cputime->stime += sys;

                cputime->sum_exec_runtime += user;
                cputime->sum_exec_runtime += sys;
                cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
                cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
                cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
        }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        u64 usage, utime, stime;
        struct task_cputime cputime;

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
                cgroup_rstat_flush_release();
        } else {
                root_cgroup_cputime(&cputime);
                usage = cputime.sum_exec_runtime;
                utime = cputime.utime;
                stime = cputime.stime;
        }

        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);
}
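
/*
 * Example cpu.stat output produced by the function above (values are
 * purely illustrative):
 *
 *        usage_usec 41064231
 *        user_usec 28003244
 *        system_usec 13060987
 *
 * All three values are reported in microseconds after the do_div()
 * conversions from nanoseconds.
 */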