linux/kernel/sched/cputime.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple CPU accounting cgroup controller
   4 */
   5#include "sched.h"
   6
   7#ifdef CONFIG_IRQ_TIME_ACCOUNTING
   8
   9/*
  10 * There are no locks covering the percpu hardirq/softirq time. They
  11 * are only modified in vtime_account, on the corresponding CPU with
  12 * interrupts disabled, so writes are safe. They are read and saved
  13 * off onto struct rq in update_rq_clock(). This may result in another
  14 * CPU reading this CPU's irq time and racing with irq/vtime_account
  15 * on this CPU. We would get either the old or the new value, with the
  16 * side effect of accounting a slice of irq time to the wrong task
  17 * when an irq is in progress while we read rq->clock. That is a worthy
  18 * compromise in place of having locks on each irq in account_system_time.
  19 */
  20DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
  21
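    /* Non-zero when sched_clock() based irq time accounting is enabled. */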
  22static int sched_clock_irqtime;
  23
  24void enable_sched_clock_irqtime(void)
  25{
  26        sched_clock_irqtime = 1;
  27}
  28
  29void disable_sched_clock_irqtime(void)
  30{
  31        sched_clock_irqtime = 0;
  32}
  33
  34static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
  35                                  enum cpu_usage_stat idx)
  36{
  37        u64 *cpustat = kcpustat_this_cpu->cpustat;
  38
  39        u64_stats_update_begin(&irqtime->sync);
  40        cpustat[idx] += delta;
  41        irqtime->total += delta;
  42        irqtime->tick_delta += delta;
  43        u64_stats_update_end(&irqtime->sync);
  44}
  45
  46/*
  47 * Called before incrementing preempt_count on {soft,}irq_enter
  48 * and before decrementing preempt_count on {soft,}irq_exit.
  49 */
  50void irqtime_account_irq(struct task_struct *curr)
  51{
  52        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
  53        s64 delta;
  54        int cpu;
  55
  56        if (!sched_clock_irqtime)
  57                return;
  58
  59        cpu = smp_processor_id();
  60        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
  61        irqtime->irq_start_time += delta;
  62
  63        /*
  64         * We do not account for softirq time from ksoftirqd here.
  65         * We want to continue accounting softirq time to the ksoftirqd
  66         * thread in that case, so as not to confuse the scheduler with a
  67         * special task that does not consume any time but still wants to run.
  68         */
  69        if (hardirq_count())
  70                irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
  71        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
  72                irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
  73}
  74EXPORT_SYMBOL_GPL(irqtime_account_irq);
  75
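    /*
     * Consume up to @maxtime of the irq time accumulated since the last
     * tick and return it, so the caller can subtract it from the tick
     * being accounted.
     */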
  76static u64 irqtime_tick_accounted(u64 maxtime)
  77{
  78        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
  79        u64 delta;
  80
  81        delta = min(irqtime->tick_delta, maxtime);
  82        irqtime->tick_delta -= delta;
  83
  84        return delta;
  85}
  86
  87#else /* CONFIG_IRQ_TIME_ACCOUNTING */
  88
  89#define sched_clock_irqtime     (0)
  90
  91static u64 irqtime_tick_accounted(u64 dummy)
  92{
  93        return 0;
  94}
  95
  96#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
  97
  98static inline void task_group_account_field(struct task_struct *p, int index,
  99                                            u64 tmp)
 100{
 101        /*
 102         * Since all updates are sure to touch the root cgroup, we get
 103         * ahead of things and touch it first. If the root cgroup is
 104         * the only cgroup, then nothing else should be necessary.
 105         *
 106         */
 107        __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 108
 109        cgroup_account_cputime_field(p, index, tmp);
 110}
 111
 112/*
 113 * Account user CPU time to a process.
 114 * @p: the process that the CPU time gets accounted to
 115 * @cputime: the CPU time spent in user space since the last update
 116 */
 117void account_user_time(struct task_struct *p, u64 cputime)
 118{
 119        int index;
 120
 121        /* Add user time to process. */
 122        p->utime += cputime;
 123        account_group_user_time(p, cputime);
 124
 125        index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 126
 127        /* Add user time to cpustat. */
 128        task_group_account_field(p, index, cputime);
 129
 130        /* Account for user time used */
 131        acct_account_cputime(p);
 132}
 133
 134/*
 135 * Account guest CPU time to a process.
 136 * @p: the process that the CPU time gets accounted to
 137 * @cputime: the CPU time spent in a virtual machine since the last update
 138 */
 139void account_guest_time(struct task_struct *p, u64 cputime)
 140{
 141        u64 *cpustat = kcpustat_this_cpu->cpustat;
 142
 143        /* Add guest time to process. */
 144        p->utime += cputime;
 145        account_group_user_time(p, cputime);
 146        p->gtime += cputime;
 147
 148        /* Add guest time to cpustat. */
 149        if (task_nice(p) > 0) {
 150                cpustat[CPUTIME_NICE] += cputime;
 151                cpustat[CPUTIME_GUEST_NICE] += cputime;
 152        } else {
 153                cpustat[CPUTIME_USER] += cputime;
 154                cpustat[CPUTIME_GUEST] += cputime;
 155        }
 156}
 157
 158/*
 159 * Account system CPU time to a process and desired cpustat field
 160 * @p: the process that the CPU time gets accounted to
 161 * @cputime: the CPU time spent in kernel space since the last update
 162 * @index: cpustat field that has to be updated
 163 */
 164void account_system_index_time(struct task_struct *p,
 165                               u64 cputime, enum cpu_usage_stat index)
 166{
 167        /* Add system time to process. */
 168        p->stime += cputime;
 169        account_group_system_time(p, cputime);
 170
 171        /* Add system time to cpustat. */
 172        task_group_account_field(p, index, cputime);
 173
 174        /* Account for system time used */
 175        acct_account_cputime(p);
 176}
 177
 178/*
 179 * Account system CPU time to a process.
 180 * @p: the process that the CPU time gets accounted to
 181 * @hardirq_offset: the offset to subtract from hardirq_count()
 182 * @cputime: the CPU time spent in kernel space since the last update
 183 */
 184void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
 185{
 186        int index;
 187
 188        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 189                account_guest_time(p, cputime);
 190                return;
 191        }
 192
 193        if (hardirq_count() - hardirq_offset)
 194                index = CPUTIME_IRQ;
 195        else if (in_serving_softirq())
 196                index = CPUTIME_SOFTIRQ;
 197        else
 198                index = CPUTIME_SYSTEM;
 199
 200        account_system_index_time(p, cputime, index);
 201}
 202
 203/*
 204 * Account for involuntary wait time.
 205 * @cputime: the CPU time spent in involuntary wait
 206 */
 207void account_steal_time(u64 cputime)
 208{
 209        u64 *cpustat = kcpustat_this_cpu->cpustat;
 210
 211        cpustat[CPUTIME_STEAL] += cputime;
 212}
 213
 214/*
 215 * Account for idle time.
 216 * @cputime: the CPU time spent in idle wait
 217 */
 218void account_idle_time(u64 cputime)
 219{
 220        u64 *cpustat = kcpustat_this_cpu->cpustat;
 221        struct rq *rq = this_rq();
 222
 223        if (atomic_read(&rq->nr_iowait) > 0)
 224                cpustat[CPUTIME_IOWAIT] += cputime;
 225        else
 226                cpustat[CPUTIME_IDLE] += cputime;
 227}
 228
 229/*
 230 * When a guest is interrupted for a long time, missed clock ticks
 231 * are not redelivered later. Due to that, this function may on occasion
 232 * account more time than the calling functions think has elapsed.
 233 */
 234static __always_inline u64 steal_account_process_time(u64 maxtime)
 235{
 236#ifdef CONFIG_PARAVIRT
 237        if (static_key_false(&paravirt_steal_enabled)) {
 238                u64 steal;
 239
 240                steal = paravirt_steal_clock(smp_processor_id());
 241                steal -= this_rq()->prev_steal_time;
 242                steal = min(steal, maxtime);
 243                account_steal_time(steal);
 244                this_rq()->prev_steal_time += steal;
 245
 246                return steal;
 247        }
 248#endif
 249        return 0;
 250}
 251
 252/*
 253 * Account how much elapsed time was spent in steal, irq, or softirq time.
 254 */
 255static inline u64 account_other_time(u64 max)
 256{
 257        u64 accounted;
 258
 259        lockdep_assert_irqs_disabled();
 260
 261        accounted = steal_account_process_time(max);
 262
 263        if (accounted < max)
 264                accounted += irqtime_tick_accounted(max - accounted);
 265
 266        return accounted;
 267}
 268
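    /*
     * On 64-bit architectures a u64 load is atomic, so sum_exec_runtime
     * can be read directly. On 32-bit the runqueue lock is taken to avoid
     * a torn read while the value is updated on another CPU.
     */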
 269#ifdef CONFIG_64BIT
 270static inline u64 read_sum_exec_runtime(struct task_struct *t)
 271{
 272        return t->se.sum_exec_runtime;
 273}
 274#else
 275static u64 read_sum_exec_runtime(struct task_struct *t)
 276{
 277        u64 ns;
 278        struct rq_flags rf;
 279        struct rq *rq;
 280
 281        rq = task_rq_lock(t, &rf);
 282        ns = t->se.sum_exec_runtime;
 283        task_rq_unlock(rq, t, &rf);
 284
 285        return ns;
 286}
 287#endif
 288
 289/*
 290 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 291 * tasks (sum on group iteration) belonging to @tsk's group.
 292 */
 293void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 294{
 295        struct signal_struct *sig = tsk->signal;
 296        u64 utime, stime;
 297        struct task_struct *t;
 298        unsigned int seq, nextseq;
 299        unsigned long flags;
 300
 301        /*
 302         * Update current task runtime to account pending time since last
 303         * scheduler action or thread_group_cputime() call. This thread group
 304         * might have other running tasks on different CPUs, but updating
 305         * their runtime can affect syscall performance, so we skip accounting
 306         * those pending times and rely only on values updated on tick or
 307         * other scheduler action.
 308         */
 309        if (same_thread_group(current, tsk))
 310                (void) task_sched_runtime(current);
 311
 312        rcu_read_lock();
 313        /* Attempt a lockless read on the first round. */
 314        nextseq = 0;
 315        do {
 316                seq = nextseq;
 317                flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 318                times->utime = sig->utime;
 319                times->stime = sig->stime;
 320                times->sum_exec_runtime = sig->sum_sched_runtime;
 321
 322                for_each_thread(tsk, t) {
 323                        task_cputime(t, &utime, &stime);
 324                        times->utime += utime;
 325                        times->stime += stime;
 326                        times->sum_exec_runtime += read_sum_exec_runtime(t);
 327                }
 328                /* If lockless access failed, take the lock. */
 329                nextseq = 1;
 330        } while (need_seqretry(&sig->stats_lock, seq));
 331        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
 332        rcu_read_unlock();
 333}
 334
 335#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 336/*
 337 * Account a tick to a process and cpustat
 338 * @p: the process that the CPU time gets accounted to
 339 * @user_tick: is the tick from userspace
 340 * @rq: the pointer to rq
 341 *
 342 * Tick demultiplexing follows the order
 343 * - pending hardirq update
 344 * - pending softirq update
 345 * - user_time
 346 * - idle_time
 347 * - system time
 348 *   - check for guest_time
 349 *   - else account as system_time
 350 *
 351 * The check for hardirq is done for both system and user time, as there
 352 * is no timer going off while we are in a hardirq and hence we may never
 353 * get an opportunity to update it solely in system time.
 354 * p->stime and friends are only updated on system time, not on irq or
 355 * softirq time, as those no longer count in the task's exec_runtime.
 356 */
 357static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 358                                         struct rq *rq, int ticks)
 359{
 360        u64 other, cputime = TICK_NSEC * ticks;
 361
 362        /*
 363         * When returning from idle, many ticks can get accounted at
 364         * once, including some ticks of steal, irq, and softirq time.
 365         * Subtract those ticks from the amount of time accounted to
 366         * idle, or potentially user or system time. Due to rounding,
 367         * other time can exceed ticks occasionally.
 368         */
 369        other = account_other_time(ULONG_MAX);
 370        if (other >= cputime)
 371                return;
 372
 373        cputime -= other;
 374
 375        if (this_cpu_ksoftirqd() == p) {
 376                /*
 377                 * ksoftirqd time does not get accounted in cpu_softirq_time.
 378                 * So, we have to handle it separately here.
 379                 * Also, p->stime needs to be updated for ksoftirqd.
 380                 */
 381                account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
 382        } else if (user_tick) {
 383                account_user_time(p, cputime);
 384        } else if (p == rq->idle) {
 385                account_idle_time(cputime);
 386        } else if (p->flags & PF_VCPU) { /* System time or guest time */
 387                account_guest_time(p, cputime);
 388        } else {
 389                account_system_index_time(p, cputime, CPUTIME_SYSTEM);
 390        }
 391}
 392
 393static void irqtime_account_idle_ticks(int ticks)
 394{
 395        struct rq *rq = this_rq();
 396
 397        irqtime_account_process_tick(current, 0, rq, ticks);
 398}
 399#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 400static inline void irqtime_account_idle_ticks(int ticks) { }
 401static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 402                                                struct rq *rq, int nr_ticks) { }
 403#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 404
 405/*
 406 * Use precise platform statistics if available:
 407 */
 408#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 409
 410# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
 411void vtime_task_switch(struct task_struct *prev)
 412{
 413        if (is_idle_task(prev))
 414                vtime_account_idle(prev);
 415        else
 416                vtime_account_kernel(prev);
 417
 418        vtime_flush(prev);
 419        arch_vtime_task_switch(prev);
 420}
 421# endif
 422
 423/*
 424 * Archs that account the whole time spent in the idle task
 425 * (outside irq) as idle time can rely on this and just implement
 426 * vtime_account_kernel() and vtime_account_idle(). Archs that
 427 * attach another meaning to idle time (s390 only includes the
 428 * time spent by the CPU when it is in low power mode) must
 429 * override vtime_account_irq_enter().
 430 */
 431#ifndef __ARCH_HAS_VTIME_ACCOUNT
 432void vtime_account_irq_enter(struct task_struct *tsk)
 433{
 434        if (!in_interrupt() && is_idle_task(tsk))
 435                vtime_account_idle(tsk);
 436        else
 437                vtime_account_kernel(tsk);
 438}
 439EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 440#endif /* __ARCH_HAS_VTIME_ACCOUNT */
 441
 442void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 443                    u64 *ut, u64 *st)
 444{
 445        *ut = curr->utime;
 446        *st = curr->stime;
 447}
 448
 449void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 450{
 451        *ut = p->utime;
 452        *st = p->stime;
 453}
 454EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 455
 456void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 457{
 458        struct task_cputime cputime;
 459
 460        thread_group_cputime(p, &cputime);
 461
 462        *ut = cputime.utime;
 463        *st = cputime.stime;
 464}
 465
 466#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
 467
 468/*
 469 * Account a single tick of CPU time.
 470 * @p: the process that the CPU time gets accounted to
 471 * @user_tick: indicates if the tick is a user or a system tick
 472 */
 473void account_process_tick(struct task_struct *p, int user_tick)
 474{
 475        u64 cputime, steal;
 476        struct rq *rq = this_rq();
 477
 478        if (vtime_accounting_enabled_this_cpu())
 479                return;
 480
 481        if (sched_clock_irqtime) {
 482                irqtime_account_process_tick(p, user_tick, rq, 1);
 483                return;
 484        }
 485
 486        cputime = TICK_NSEC;
 487        steal = steal_account_process_time(ULONG_MAX);
 488
 489        if (steal >= cputime)
 490                return;
 491
 492        cputime -= steal;
 493
 494        if (user_tick)
 495                account_user_time(p, cputime);
 496        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 497                account_system_time(p, HARDIRQ_OFFSET, cputime);
 498        else
 499                account_idle_time(cputime);
 500}
 501
 502/*
 503 * Account multiple ticks of idle time.
 504 * @ticks: number of ticks spent idle
 505 */
 506void account_idle_ticks(unsigned long ticks)
 507{
 508        u64 cputime, steal;
 509
 510        if (sched_clock_irqtime) {
 511                irqtime_account_idle_ticks(ticks);
 512                return;
 513        }
 514
 515        cputime = ticks * TICK_NSEC;
 516        steal = steal_account_process_time(ULONG_MAX);
 517
 518        if (steal >= cputime)
 519                return;
 520
 521        cputime -= steal;
 522        account_idle_time(cputime);
 523}
 524
 525/*
 526 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 527 * losing precision when the numbers are big.
 528 */
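    /*
     * Example: stime and rtime both around 2^40 ns (~18 minutes of CPU
     * time) would already overflow a 64-bit multiply; the loop below sheds
     * low-order bits from rtime/total, or rebalances stime against rtime,
     * until the final 32x32 multiply fits, trading a little precision for
     * correctness.
     */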
 529static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 530{
 531        u64 scaled;
 532
 533        for (;;) {
 534                /* Make sure "rtime" is the bigger of stime/rtime */
 535                if (stime > rtime)
 536                        swap(rtime, stime);
 537
 538                /* Make sure 'total' fits in 32 bits */
 539                if (total >> 32)
 540                        goto drop_precision;
 541
 542                /* Does rtime (and thus stime) fit in 32 bits? */
 543                if (!(rtime >> 32))
 544                        break;
 545
 546                /* Can we just balance rtime/stime rather than dropping bits? */
 547                if (stime >> 31)
 548                        goto drop_precision;
 549
 550                /* We can grow stime and shrink rtime and try to make them both fit */
 551                stime <<= 1;
 552                rtime >>= 1;
 553                continue;
 554
 555drop_precision:
 556                /* We drop from rtime, it has more bits than stime */
 557                rtime >>= 1;
 558                total >>= 1;
 559        }
 560
 561        /*
 562         * Make sure gcc understands that this is a 32x32->64 multiply,
 563         * followed by a 64/32->64 divide.
 564         */
 565        scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
 566        return scaled;
 567}
 568
 569/*
 570 * Adjust the imprecise tick based cputime against the scheduler's
 571 * precise runtime accounting.
 572 *
 573 * Tick based cputime accounting depends on whether the timer happens to
 574 * interrupt a task during its timeslices.  Depending on these
 575 * circumstances, the number of such interrupts may over- or
 576 * under-estimate the real user and system cputime, so the split is only
 577 * known with variable precision.
 578 *
 579 * Fix this by scaling these tick based values against the total runtime
 580 * accounted by the CFS scheduler.
 581 *
 582 * This code provides the following guarantees:
 583 *
 584 *   stime + utime == rtime
 585 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 586 *
 587 * Assuming that rtime_i+1 >= rtime_i.
 588 */
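    /*
     * Example of the clamping below: with prev->stime = 6 and prev->utime = 4
     * (rtime was 10), a new rtime of 12 whose scaled stime comes out as 5 is
     * clamped back to stime = 6 and utime becomes 12 - 6 = 6, so neither
     * value is ever reported going backwards.
     */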
 589void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 590                    u64 *ut, u64 *st)
 591{
 592        u64 rtime, stime, utime;
 593        unsigned long flags;
 594
 595        /* Serialize concurrent callers such that we can honour our guarantees */
 596        raw_spin_lock_irqsave(&prev->lock, flags);
 597        rtime = curr->sum_exec_runtime;
 598
 599        /*
 600         * This is possible under two circumstances:
 601         *  - rtime isn't monotonic after all (a bug);
 602         *  - we got reordered by the lock.
 603         *
 604         * In both cases this acts as a filter such that the rest of the code
 605         * can assume it is monotonic regardless of anything else.
 606         */
 607        if (prev->stime + prev->utime >= rtime)
 608                goto out;
 609
 610        stime = curr->stime;
 611        utime = curr->utime;
 612
 613        /*
 614         * If either stime or utime are 0, assume all runtime is userspace.
 615         * Once a task gets some ticks, the monotonicity code at 'update:'
 616         * will ensure things converge to the observed ratio.
 617         */
 618        if (stime == 0) {
 619                utime = rtime;
 620                goto update;
 621        }
 622
 623        if (utime == 0) {
 624                stime = rtime;
 625                goto update;
 626        }
 627
 628        stime = scale_stime(stime, rtime, stime + utime);
 629
 630update:
 631        /*
 632         * Make sure stime doesn't go backwards; this preserves monotonicity
 633         * for utime because rtime is monotonic.
 634         *
 635         *  utime_i+1 = rtime_i+1 - stime_i
 636         *            = rtime_i+1 - (rtime_i - utime_i)
 637         *            = (rtime_i+1 - rtime_i) + utime_i
 638         *            >= utime_i
 639         */
 640        if (stime < prev->stime)
 641                stime = prev->stime;
 642        utime = rtime - stime;
 643
 644        /*
 645         * Make sure utime doesn't go backwards; this still preserves
 646         * monotonicity for stime, analogous argument to above.
 647         */
 648        if (utime < prev->utime) {
 649                utime = prev->utime;
 650                stime = rtime - utime;
 651        }
 652
 653        prev->stime = stime;
 654        prev->utime = utime;
 655out:
 656        *ut = prev->utime;
 657        *st = prev->stime;
 658        raw_spin_unlock_irqrestore(&prev->lock, flags);
 659}
 660
 661void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 662{
 663        struct task_cputime cputime = {
 664                .sum_exec_runtime = p->se.sum_exec_runtime,
 665        };
 666
 667        task_cputime(p, &cputime.utime, &cputime.stime);
 668        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 669}
 670EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 671
 672void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 673{
 674        struct task_cputime cputime;
 675
 676        thread_group_cputime(p, &cputime);
 677        cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 678}
 679#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 680
 681#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
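    /*
     * Nanoseconds elapsed on this task since its last vtime snapshot,
     * or 0 if sched_clock() appears to have gone backwards.
     */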
 682static u64 vtime_delta(struct vtime *vtime)
 683{
 684        unsigned long long clock;
 685
 686        clock = sched_clock();
 687        if (clock < vtime->starttime)
 688                return 0;
 689
 690        return clock - vtime->starttime;
 691}
 692
 693static u64 get_vtime_delta(struct vtime *vtime)
 694{
 695        u64 delta = vtime_delta(vtime);
 696        u64 other;
 697
 698        /*
 699         * Unlike tick based timing, vtime based timing never has lost
 700         * ticks and needs no steal time accounting to make up for
 701         * them. Vtime accounts a rounded version of actual
 702         * elapsed time. Limit account_other_time to prevent rounding
 703         * errors from causing elapsed vtime to go negative.
 704         */
 705        other = account_other_time(delta);
 706        WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
 707        vtime->starttime += delta;
 708
 709        return delta - other;
 710}
 711
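    /*
     * System and guest vtime are accumulated in the task's vtime struct
     * and only flushed into the regular cputime accounting once at least
     * a full tick worth (TICK_NSEC) has built up, keeping the hot path cheap.
     */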
 712static void vtime_account_system(struct task_struct *tsk,
 713                                 struct vtime *vtime)
 714{
 715        vtime->stime += get_vtime_delta(vtime);
 716        if (vtime->stime >= TICK_NSEC) {
 717                account_system_time(tsk, irq_count(), vtime->stime);
 718                vtime->stime = 0;
 719        }
 720}
 721
 722static void vtime_account_guest(struct task_struct *tsk,
 723                                struct vtime *vtime)
 724{
 725        vtime->gtime += get_vtime_delta(vtime);
 726        if (vtime->gtime >= TICK_NSEC) {
 727                account_guest_time(tsk, vtime->gtime);
 728                vtime->gtime = 0;
 729        }
 730}
 731
 732static void __vtime_account_kernel(struct task_struct *tsk,
 733                                   struct vtime *vtime)
 734{
 735        /* We might have scheduled out from guest path */
 736        if (vtime->state == VTIME_GUEST)
 737                vtime_account_guest(tsk, vtime);
 738        else
 739                vtime_account_system(tsk, vtime);
 740}
 741
 742void vtime_account_kernel(struct task_struct *tsk)
 743{
 744        struct vtime *vtime = &tsk->vtime;
 745
 746        if (!vtime_delta(vtime))
 747                return;
 748
 749        write_seqcount_begin(&vtime->seqcount);
 750        __vtime_account_kernel(tsk, vtime);
 751        write_seqcount_end(&vtime->seqcount);
 752}
 753
 754void vtime_user_enter(struct task_struct *tsk)
 755{
 756        struct vtime *vtime = &tsk->vtime;
 757
 758        write_seqcount_begin(&vtime->seqcount);
 759        vtime_account_system(tsk, vtime);
 760        vtime->state = VTIME_USER;
 761        write_seqcount_end(&vtime->seqcount);
 762}
 763
 764void vtime_user_exit(struct task_struct *tsk)
 765{
 766        struct vtime *vtime = &tsk->vtime;
 767
 768        write_seqcount_begin(&vtime->seqcount);
 769        vtime->utime += get_vtime_delta(vtime);
 770        if (vtime->utime >= TICK_NSEC) {
 771                account_user_time(tsk, vtime->utime);
 772                vtime->utime = 0;
 773        }
 774        vtime->state = VTIME_SYS;
 775        write_seqcount_end(&vtime->seqcount);
 776}
 777
 778void vtime_guest_enter(struct task_struct *tsk)
 779{
 780        struct vtime *vtime = &tsk->vtime;
 781        /*
 782         * The flags must be updated inside the seqcount write section,
 783         * together with the vtime starttime flush and update.
 784         * That enforces the right ordering and update sequencing
 785         * against the reader (task_gtime()), which can thus
 786         * safely catch up with a tickless delta.
 787         */
 788        write_seqcount_begin(&vtime->seqcount);
 789        vtime_account_system(tsk, vtime);
 790        tsk->flags |= PF_VCPU;
 791        vtime->state = VTIME_GUEST;
 792        write_seqcount_end(&vtime->seqcount);
 793}
 794EXPORT_SYMBOL_GPL(vtime_guest_enter);
 795
 796void vtime_guest_exit(struct task_struct *tsk)
 797{
 798        struct vtime *vtime = &tsk->vtime;
 799
 800        write_seqcount_begin(&vtime->seqcount);
 801        vtime_account_guest(tsk, vtime);
 802        tsk->flags &= ~PF_VCPU;
 803        vtime->state = VTIME_SYS;
 804        write_seqcount_end(&vtime->seqcount);
 805}
 806EXPORT_SYMBOL_GPL(vtime_guest_exit);
 807
 808void vtime_account_idle(struct task_struct *tsk)
 809{
 810        account_idle_time(get_vtime_delta(&tsk->vtime));
 811}
 812
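    /*
     * Flush the pending vtime of @prev and (re)start vtime tracking for
     * the task now running on this CPU.
     */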
 813void vtime_task_switch_generic(struct task_struct *prev)
 814{
 815        struct vtime *vtime = &prev->vtime;
 816
 817        write_seqcount_begin(&vtime->seqcount);
 818        if (vtime->state == VTIME_IDLE)
 819                vtime_account_idle(prev);
 820        else
 821                __vtime_account_kernel(prev, vtime);
 822        vtime->state = VTIME_INACTIVE;
 823        vtime->cpu = -1;
 824        write_seqcount_end(&vtime->seqcount);
 825
 826        vtime = &current->vtime;
 827
 828        write_seqcount_begin(&vtime->seqcount);
 829        if (is_idle_task(current))
 830                vtime->state = VTIME_IDLE;
 831        else if (current->flags & PF_VCPU)
 832                vtime->state = VTIME_GUEST;
 833        else
 834                vtime->state = VTIME_SYS;
 835        vtime->starttime = sched_clock();
 836        vtime->cpu = smp_processor_id();
 837        write_seqcount_end(&vtime->seqcount);
 838}
 839
 840void vtime_init_idle(struct task_struct *t, int cpu)
 841{
 842        struct vtime *vtime = &t->vtime;
 843        unsigned long flags;
 844
 845        local_irq_save(flags);
 846        write_seqcount_begin(&vtime->seqcount);
 847        vtime->state = VTIME_IDLE;
 848        vtime->starttime = sched_clock();
 849        vtime->cpu = cpu;
 850        write_seqcount_end(&vtime->seqcount);
 851        local_irq_restore(flags);
 852}
 853
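    /*
     * Fetch the guest time of @t, including any tickless delta still
     * pending in its vtime state.
     */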
 854u64 task_gtime(struct task_struct *t)
 855{
 856        struct vtime *vtime = &t->vtime;
 857        unsigned int seq;
 858        u64 gtime;
 859
 860        if (!vtime_accounting_enabled())
 861                return t->gtime;
 862
 863        do {
 864                seq = read_seqcount_begin(&vtime->seqcount);
 865
 866                gtime = t->gtime;
 867                if (vtime->state == VTIME_GUEST)
 868                        gtime += vtime->gtime + vtime_delta(vtime);
 869
 870        } while (read_seqcount_retry(&vtime->seqcount, seq));
 871
 872        return gtime;
 873}
 874
 875/*
 876 * Fetch cputime raw values from fields of task_struct and
 877 * add up the pending nohz execution time since the last
 878 * cputime snapshot.
 879 */
 880void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 881{
 882        struct vtime *vtime = &t->vtime;
 883        unsigned int seq;
 884        u64 delta;
 885
 886        if (!vtime_accounting_enabled()) {
 887                *utime = t->utime;
 888                *stime = t->stime;
 889                return;
 890        }
 891
 892        do {
 893                seq = read_seqcount_begin(&vtime->seqcount);
 894
 895                *utime = t->utime;
 896                *stime = t->stime;
 897
 898                /* Task is sleeping or idle, nothing to add */
 899                if (vtime->state < VTIME_SYS)
 900                        continue;
 901
 902                delta = vtime_delta(vtime);
 903
 904                /*
 905                 * Task runs either in user (including guest) or kernel space,
 906                 * add pending nohz time to the right place.
 907                 */
 908                if (vtime->state == VTIME_SYS)
 909                        *stime += vtime->stime + delta;
 910                else
 911                        *utime += vtime->utime + delta;
 912        } while (read_seqcount_retry(&vtime->seqcount, seq));
 913}
 914
 915static int vtime_state_check(struct vtime *vtime, int cpu)
 916{
 917        /*
 918         * We raced against a context switch; fetch the
 919         * kcpustat task again.
 920         */
 921        if (vtime->cpu != cpu && vtime->cpu != -1)
 922                return -EAGAIN;
 923
 924        /*
 925         * Two possible things here:
 926         * 1) We are seeing the scheduling out task (prev) or any past one.
 927         * 2) We are seeing the scheduling in task (next) but it hasn't
 928         *    passed through vtime_task_switch() yet, so the pending
 929         *    cputime of the prev task may not be flushed yet.
 930         *
 931         * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
 932         */
 933        if (vtime->state == VTIME_INACTIVE)
 934                return -EAGAIN;
 935
 936        return 0;
 937}
 938
 939static u64 kcpustat_user_vtime(struct vtime *vtime)
 940{
 941        if (vtime->state == VTIME_USER)
 942                return vtime->utime + vtime_delta(vtime);
 943        else if (vtime->state == VTIME_GUEST)
 944                return vtime->gtime + vtime_delta(vtime);
 945        return 0;
 946}
 947
 948static int kcpustat_field_vtime(u64 *cpustat,
 949                                struct task_struct *tsk,
 950                                enum cpu_usage_stat usage,
 951                                int cpu, u64 *val)
 952{
 953        struct vtime *vtime = &tsk->vtime;
 954        unsigned int seq;
 955        int err;
 956
 957        do {
 958                seq = read_seqcount_begin(&vtime->seqcount);
 959
 960                err = vtime_state_check(vtime, cpu);
 961                if (err < 0)
 962                        return err;
 963
 964                *val = cpustat[usage];
 965
 966                /*
 967                 * Nice vs. un-niced cputime accounting may be inaccurate if
 968                 * the nice value has changed since the last vtime update.
 969                 * But a proper fix would involve interrupting the target on
 970                 * nice updates, which is a no-go on nohz_full (although the
 971                 * scheduler may still interrupt the target if rescheduling is needed...)
 972                 */
 973                switch (usage) {
 974                case CPUTIME_SYSTEM:
 975                        if (vtime->state == VTIME_SYS)
 976                                *val += vtime->stime + vtime_delta(vtime);
 977                        break;
 978                case CPUTIME_USER:
 979                        if (task_nice(tsk) <= 0)
 980                                *val += kcpustat_user_vtime(vtime);
 981                        break;
 982                case CPUTIME_NICE:
 983                        if (task_nice(tsk) > 0)
 984                                *val += kcpustat_user_vtime(vtime);
 985                        break;
 986                case CPUTIME_GUEST:
 987                        if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
 988                                *val += vtime->gtime + vtime_delta(vtime);
 989                        break;
 990                case CPUTIME_GUEST_NICE:
 991                        if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
 992                                *val += vtime->gtime + vtime_delta(vtime);
 993                        break;
 994                default:
 995                        break;
 996                }
 997        } while (read_seqcount_retry(&vtime->seqcount, seq));
 998
 999        return 0;
1000}
1001
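    /*
     * Return a single field of @cpu's kcpustat. When vtime (nohz_full)
     * accounting is enabled on that CPU, the not yet flushed vtime of the
     * task currently running there is folded into the returned value.
     */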
1002u64 kcpustat_field(struct kernel_cpustat *kcpustat,
1003                   enum cpu_usage_stat usage, int cpu)
1004{
1005        u64 *cpustat = kcpustat->cpustat;
1006        struct rq *rq;
1007        u64 val;
1008        int err;
1009
1010        if (!vtime_accounting_enabled_cpu(cpu))
1011                return cpustat[usage];
1012
1013        rq = cpu_rq(cpu);
1014
1015        for (;;) {
1016                struct task_struct *curr;
1017
1018                rcu_read_lock();
1019                curr = rcu_dereference(rq->curr);
1020                if (WARN_ON_ONCE(!curr)) {
1021                        rcu_read_unlock();
1022                        return cpustat[usage];
1023                }
1024
1025                err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
1026                rcu_read_unlock();
1027
1028                if (!err)
1029                        return val;
1030
1031                cpu_relax();
1032        }
1033}
1034EXPORT_SYMBOL_GPL(kcpustat_field);
1035
1036static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
1037                                    const struct kernel_cpustat *src,
1038                                    struct task_struct *tsk, int cpu)
1039{
1040        struct vtime *vtime = &tsk->vtime;
1041        unsigned int seq;
1042        int err;
1043
1044        do {
1045                u64 *cpustat;
1046                u64 delta;
1047
1048                seq = read_seqcount_begin(&vtime->seqcount);
1049
1050                err = vtime_state_check(vtime, cpu);
1051                if (err < 0)
1052                        return err;
1053
1054                *dst = *src;
1055                cpustat = dst->cpustat;
1056
1057                /* Task is sleeping, dead or idle, nothing to add */
1058                if (vtime->state < VTIME_SYS)
1059                        continue;
1060
1061                delta = vtime_delta(vtime);
1062
1063                /*
1064                 * Task runs either in user (including guest) or kernel space,
1065                 * add pending nohz time to the right place.
1066                 */
1067                if (vtime->state == VTIME_SYS) {
1068                        cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
1069                } else if (vtime->state == VTIME_USER) {
1070                        if (task_nice(tsk) > 0)
1071                                cpustat[CPUTIME_NICE] += vtime->utime + delta;
1072                        else
1073                                cpustat[CPUTIME_USER] += vtime->utime + delta;
1074                } else {
1075                        WARN_ON_ONCE(vtime->state != VTIME_GUEST);
1076                        if (task_nice(tsk) > 0) {
1077                                cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
1078                                cpustat[CPUTIME_NICE] += vtime->gtime + delta;
1079                        } else {
1080                                cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
1081                                cpustat[CPUTIME_USER] += vtime->gtime + delta;
1082                        }
1083                }
1084        } while (read_seqcount_retry(&vtime->seqcount, seq));
1085
1086        return err;
1087}
1088
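    /*
     * Fetch a coherent snapshot of @cpu's kcpustat into @dst, adding the
     * not yet flushed vtime of the task currently running there when
     * vtime accounting is enabled on that CPU.
     */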
1089void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
1090{
1091        const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
1092        struct rq *rq;
1093        int err;
1094
1095        if (!vtime_accounting_enabled_cpu(cpu)) {
1096                *dst = *src;
1097                return;
1098        }
1099
1100        rq = cpu_rq(cpu);
1101
1102        for (;;) {
1103                struct task_struct *curr;
1104
1105                rcu_read_lock();
1106                curr = rcu_dereference(rq->curr);
1107                if (WARN_ON_ONCE(!curr)) {
1108                        rcu_read_unlock();
1109                        *dst = *src;
1110                        return;
1111                }
1112
1113                err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
1114                rcu_read_unlock();
1115
1116                if (!err)
1117                        return;
1118
1119                cpu_relax();
1120        }
1121}
1122EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
1123
1124#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
1125