linux/kernel/sched/cputime.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple CPU accounting cgroup controller
 */
#include "sched.h"

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on the corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's irq time and can
 * race with irq/vtime_account on this CPU. We would either get the old
 * or the new value, with the side effect of accounting a slice of irq time
 * to the wrong task when an irq is in progress while we read rq->clock.
 * That is a worthy compromise in place of having locks on each irq in
 * account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
        sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
        sched_clock_irqtime = 0;
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
                                  enum cpu_usage_stat idx)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        u64_stats_update_begin(&irqtime->sync);
        cpustat[idx] += delta;
        irqtime->total += delta;
        irqtime->tick_delta += delta;
        u64_stats_update_end(&irqtime->sync);
}

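/*
 * For reference, the read side pairs with the u64_stats writer above via the
 * same ->sync seqcount. A minimal sketch of such a reader is shown below; the
 * kernel's own helper is irq_time_read() in kernel/sched/sched.h, whose exact
 * shape varies between versions.
 */
#if 0 /* illustrative sketch, not part of this file */
static inline u64 irq_time_read(int cpu)
{
        struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
        unsigned int seq;
        u64 total;

        do {
                /* Snapshot under the seqcount so 32-bit reads stay consistent */
                seq = __u64_stats_fetch_begin(&irqtime->sync);
                total = irqtime->total;
        } while (__u64_stats_fetch_retry(&irqtime->sync, seq));

        return total;
}
#endif
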
/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
        s64 delta;
        int cpu;

        if (!sched_clock_irqtime)
                return;

        cpu = smp_processor_id();
        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
        irqtime->irq_start_time += delta;

        /*
         * We do not account softirq time from ksoftirqd here.
         * We want to continue accounting softirq time to the ksoftirqd
         * thread in that case, so as not to confuse the scheduler with a
         * special task that does not consume any time but still wants to run.
         */
        if (hardirq_count())
                irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);

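/*
 * For context: irq_enter()/irq_exit() do not call irqtime_account_irq()
 * directly; they go through small wrappers in include/linux/vtime.h, roughly
 * as sketched below. Names and exact contents differ between kernel versions.
 */
#if 0 /* illustrative sketch, not part of this file */
static inline void account_irq_enter_time(struct task_struct *tsk)
{
        vtime_account_irq_enter(tsk);   /* precise (vtime) accounting, if enabled */
        irqtime_account_irq(tsk);       /* the irqtime path implemented above */
}

static inline void account_irq_exit_time(struct task_struct *tsk)
{
        vtime_account_irq_exit(tsk);
        irqtime_account_irq(tsk);
}
#endif
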
static u64 irqtime_tick_accounted(u64 maxtime)
{
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
        u64 delta;

        delta = min(irqtime->tick_delta, maxtime);
        irqtime->tick_delta -= delta;

        return delta;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime     (0)

static u64 irqtime_tick_accounted(u64 dummy)
{
        return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
                                            u64 tmp)
{
        /*
         * Since all updates are sure to touch the root cgroup, we
         * get ahead of ourselves and touch it first. If the root cgroup
         * is the only cgroup, then nothing else should be necessary.
         */
        __this_cpu_add(kernel_cpustat.cpustat[index], tmp);

        cgroup_account_cputime_field(p, index, tmp);
}

/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
        int index;

        /* Add user time to process. */
        p->utime += cputime;
        account_group_user_time(p, cputime);

        index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

        /* Add user time to cpustat. */
        task_group_account_field(p, index, cputime);

        /* Account for user time used */
        acct_account_cputime(p);
}

/*
 * Account guest CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        /* Add guest time to process. */
        p->utime += cputime;
        account_group_user_time(p, cputime);
        p->gtime += cputime;

        /* Add guest time to cpustat. */
        if (task_nice(p) > 0) {
                cpustat[CPUTIME_NICE] += cputime;
                cpustat[CPUTIME_GUEST_NICE] += cputime;
        } else {
                cpustat[CPUTIME_USER] += cputime;
                cpustat[CPUTIME_GUEST] += cputime;
        }
}

/*
 * Account system CPU time to a process and desired cpustat field
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: index of the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
                               u64 cputime, enum cpu_usage_stat index)
{
        /* Add system time to process. */
        p->stime += cputime;
        account_group_system_time(p, cputime);

        /* Add system time to cpustat. */
        task_group_account_field(p, index, cputime);

        /* Account for system time used */
        acct_account_cputime(p);
}

/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
        int index;

        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                account_guest_time(p, cputime);
                return;
        }

        if (hardirq_count() - hardirq_offset)
                index = CPUTIME_IRQ;
        else if (in_serving_softirq())
                index = CPUTIME_SOFTIRQ;
        else
                index = CPUTIME_SYSTEM;

        account_system_index_time(p, cputime, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the CPU time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        struct rq *rq = this_rq();

        if (atomic_read(&rq->nr_iowait) > 0)
                cpustat[CPUTIME_IOWAIT] += cputime;
        else
                cpustat[CPUTIME_IDLE] += cputime;
}

/*
 * When a guest is interrupted for a long time, missed clock ticks are
 * not redelivered later. Because of that, this function may occasionally
 * account more time than the calling functions think elapsed.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
                u64 steal;

                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
                steal = min(steal, maxtime);
                account_steal_time(steal);
                this_rq()->prev_steal_time += steal;

                return steal;
        }
#endif
        return 0;
}

/*
 * Account how much elapsed time was spent in steal, irq, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
        u64 accounted;

        lockdep_assert_irqs_disabled();

        accounted = steal_account_process_time(max);

        if (accounted < max)
                accounted += irqtime_tick_accounted(max - accounted);

        return accounted;
}

#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
        return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
        u64 ns;
        struct rq_flags rf;
        struct rq *rq;

        rq = task_rq_lock(t, &rf);
        ns = t->se.sum_exec_runtime;
        task_rq_unlock(rq, t, &rf);

        return ns;
}
#endif

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
        struct signal_struct *sig = tsk->signal;
        u64 utime, stime;
        struct task_struct *t;
        unsigned int seq, nextseq;
        unsigned long flags;

        /*
         * Update current task runtime to account pending time since last
         * scheduler action or thread_group_cputime() call. This thread group
         * might have other running tasks on different CPUs, but updating
         * their runtime can affect syscall performance, so we skip accounting
         * those pending times and rely only on values updated on tick or
         * other scheduler action.
         */
        if (same_thread_group(current, tsk))
                (void) task_sched_runtime(current);

        rcu_read_lock();
        /* Attempt a lockless read on the first round. */
        nextseq = 0;
        do {
                seq = nextseq;
                flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
                times->utime = sig->utime;
                times->stime = sig->stime;
                times->sum_exec_runtime = sig->sum_sched_runtime;

                for_each_thread(tsk, t) {
                        task_cputime(t, &utime, &stime);
                        times->utime += utime;
                        times->stime += stime;
                        times->sum_exec_runtime += read_sum_exec_runtime(t);
                }
                /* If lockless access failed, take the lock. */
                nextseq = 1;
        } while (need_seqretry(&sig->stats_lock, seq));
        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
        rcu_read_unlock();
}
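
/*
 * A minimal, hypothetical caller, only to illustrate the contract: the
 * function fills a caller-provided struct task_cputime with the raw,
 * unscaled totals for the whole thread group. This helper does not exist
 * in the kernel.
 */
#if 0 /* illustrative sketch, not part of this file */
static u64 thread_group_runtime_ns(struct task_struct *tsk)
{
        struct task_cputime totals;

        thread_group_cputime(tsk, &totals);
        /* utime/stime are filled in as well; here we only use the runtime */
        return totals.sum_exec_runtime;
}
#endif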

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done for both system and user time, as there is
 * no timer going off while we are in a hardirq and hence we may never get
 * an opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time, not on irq or
 * softirq time, as those no longer count in task exec_runtime.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                         struct rq *rq, int ticks)
{
        u64 other, cputime = TICK_NSEC * ticks;

        /*
         * When returning from idle, many ticks can get accounted at
         * once, including some ticks of steal, irq, and softirq time.
         * Subtract those ticks from the amount of time accounted to
         * idle, or potentially user or system time. Due to rounding,
         * other time can exceed ticks occasionally.
         */
        other = account_other_time(ULONG_MAX);
        if (other >= cputime)
                return;

        cputime -= other;

        if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time does not get accounted in cpu_softirq_time,
                 * so we have to handle it separately here.
                 * Also, p->stime needs to be updated for ksoftirqd.
                 */
                account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
        } else if (user_tick) {
                account_user_time(p, cputime);
        } else if (p == rq->idle) {
                account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
                account_guest_time(p, cputime);
        } else {
                account_system_index_time(p, cputime, CPUTIME_SYSTEM);
        }
}

static void irqtime_account_idle_ticks(int ticks)
{
        struct rq *rq = this_rq();

        irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                                struct rq *rq, int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
        if (is_idle_task(prev))
                vtime_account_idle(prev);
        else
                vtime_account_system(prev);

        vtime_flush(prev);
        arch_vtime_task_switch(prev);
}
# endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */


#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * vtime_account_system() and vtime_account_idle(). Archs that
 * assign another meaning to idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account_irq_enter(struct task_struct *tsk)
{
        if (!in_interrupt() && is_idle_task(tsk))
                vtime_account_idle(tsk);
        else
                vtime_account_system(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */

void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
                    u64 *ut, u64 *st)
{
        *ut = curr->utime;
        *st = curr->stime;
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        *ut = p->utime;
        *st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        struct task_cputime cputime;

        thread_group_cputime(p, &cputime);

        *ut = cputime.utime;
        *st = cputime.stime;
}

#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */

/*
 * Account a single tick of CPU time.
 * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
        u64 cputime, steal;
        struct rq *rq = this_rq();

        if (vtime_accounting_cpu_enabled())
                return;

        if (sched_clock_irqtime) {
                irqtime_account_process_tick(p, user_tick, rq, 1);
                return;
        }

        cputime = TICK_NSEC;
        steal = steal_account_process_time(ULONG_MAX);

        if (steal >= cputime)
                return;

        cputime -= steal;

        if (user_tick)
                account_user_time(p, cputime);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
                account_system_time(p, HARDIRQ_OFFSET, cputime);
        else
                account_idle_time(cputime);
}
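
/*
 * For orientation: with tick-based accounting this function is reached from
 * the timer tick, roughly as sketched below. The real caller is
 * update_process_times() in kernel/time/timer.c; details vary by version.
 */
#if 0 /* illustrative sketch, not part of this file */
void update_process_times(int user_tick)
{
        struct task_struct *p = current;

        /* Charge this tick to the current task as user or system time. */
        account_process_tick(p, user_tick);
        /* ... then run timers, RCU callbacks, scheduler_tick(), etc. ... */
}
#endif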

/*
 * Account multiple ticks of idle time.
 * @ticks: number of ticks to account as idle time
 */
void account_idle_ticks(unsigned long ticks)
{
        u64 cputime, steal;

        if (sched_clock_irqtime) {
                irqtime_account_idle_ticks(ticks);
                return;
        }

        cputime = ticks * TICK_NSEC;
        steal = steal_account_process_time(ULONG_MAX);

        if (steal >= cputime)
                return;

        cputime -= steal;
        account_idle_time(cputime);
}

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
 */
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
        u64 scaled;

        for (;;) {
                /* Make sure "rtime" is the bigger of stime/rtime */
                if (stime > rtime)
                        swap(rtime, stime);

                /* Make sure 'total' fits in 32 bits */
                if (total >> 32)
                        goto drop_precision;

                /* Does rtime (and thus stime) fit in 32 bits? */
                if (!(rtime >> 32))
                        break;

                /* Can we just balance rtime/stime rather than dropping bits? */
                if (stime >> 31)
                        goto drop_precision;

                /* We can grow stime and shrink rtime and try to make them both fit */
                stime <<= 1;
                rtime >>= 1;
                continue;

drop_precision:
                /* We drop from rtime, it has more bits than stime */
                rtime >>= 1;
                total >>= 1;
        }

        /*
         * Make sure gcc understands that this is a 32x32->64 multiply,
         * followed by a 64/32->64 divide.
         */
        scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
        return scaled;
}

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on whether a task happens to be
 * running when the timer interrupt fires. Depending on these circumstances,
 * the number of ticks charged to it may over- or under-estimate the real
 * user and system cputime, with variable precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
                    u64 *ut, u64 *st)
{
        u64 rtime, stime, utime;
        unsigned long flags;

        /* Serialize concurrent callers such that we can honour our guarantees */
        raw_spin_lock_irqsave(&prev->lock, flags);
        rtime = curr->sum_exec_runtime;

        /*
         * This is possible under two circumstances:
         *  - rtime isn't monotonic after all (a bug);
         *  - we got reordered by the lock.
         *
         * In both cases this acts as a filter such that the rest of the code
         * can assume it is monotonic regardless of anything else.
         */
        if (prev->stime + prev->utime >= rtime)
                goto out;

        stime = curr->stime;
        utime = curr->utime;

        /*
         * If either stime or utime is 0, assume all runtime is userspace.
         * Once a task gets some ticks, the monotonicity code at 'update:'
         * will ensure things converge to the observed ratio.
         */
        if (stime == 0) {
                utime = rtime;
                goto update;
        }

        if (utime == 0) {
                stime = rtime;
                goto update;
        }

        stime = scale_stime(stime, rtime, stime + utime);

update:
        /*
         * Make sure stime doesn't go backwards; this preserves monotonicity
         * for utime because rtime is monotonic.
         *
         *  utime_i+1 = rtime_i+1 - stime_i
         *            = rtime_i+1 - (rtime_i - utime_i)
         *            = (rtime_i+1 - rtime_i) + utime_i
         *            >= utime_i
         */
        if (stime < prev->stime)
                stime = prev->stime;
        utime = rtime - stime;

        /*
         * Make sure utime doesn't go backwards; this still preserves
         * monotonicity for stime, analogous argument to above.
         */
        if (utime < prev->utime) {
                utime = prev->utime;
                stime = rtime - utime;
        }

        prev->stime = stime;
        prev->utime = utime;
out:
        *ut = prev->utime;
        *st = prev->stime;
        raw_spin_unlock_irqrestore(&prev->lock, flags);
}

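/*
 * Worked example with illustrative numbers: suppose the tick counters say
 * stime = 30s and utime = 10s, while the scheduler reports
 * rtime = curr->sum_exec_runtime = 60s. Then
 *
 *   stime = scale_stime(30, 60, 40) = 60 * 30 / 40 = 45s
 *   utime = rtime - stime           = 60 - 45      = 15s
 *
 * so stime + utime == rtime and the observed 3:1 tick ratio is preserved.
 * If a later call sees rtime = 61s with the tick counters unchanged, the
 * scaled stime is again 45s (61 * 30 / 40, rounded down), prev->stime keeps
 * it from going backwards, and utime becomes 16s: both values only grow.
 */
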
void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        struct task_cputime cputime = {
                .sum_exec_runtime = p->se.sum_exec_runtime,
        };

        task_cputime(p, &cputime.utime, &cputime.stime);
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        struct task_cputime cputime;

        thread_group_cputime(p, &cputime);
        cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct vtime *vtime)
{
        unsigned long long clock;

        clock = sched_clock();
        if (clock < vtime->starttime)
                return 0;

        return clock - vtime->starttime;
}

static u64 get_vtime_delta(struct vtime *vtime)
{
        u64 delta = vtime_delta(vtime);
        u64 other;

        /*
         * Unlike tick based timing, vtime based timing never has lost
         * ticks and needs no steal time accounting to make up for them.
         * Vtime accounts a rounded version of the actual elapsed time.
         * Limit account_other_time to prevent rounding errors from causing
         * elapsed vtime to go negative.
         */
        other = account_other_time(delta);
        WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
        vtime->starttime += delta;

        return delta - other;
}

static void __vtime_account_system(struct task_struct *tsk,
                                   struct vtime *vtime)
{
        vtime->stime += get_vtime_delta(vtime);
        if (vtime->stime >= TICK_NSEC) {
                account_system_time(tsk, irq_count(), vtime->stime);
                vtime->stime = 0;
        }
}

static void vtime_account_guest(struct task_struct *tsk,
                                struct vtime *vtime)
{
        vtime->gtime += get_vtime_delta(vtime);
        if (vtime->gtime >= TICK_NSEC) {
                account_guest_time(tsk, vtime->gtime);
                vtime->gtime = 0;
        }
}

void vtime_account_system(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        if (!vtime_delta(vtime))
                return;

        write_seqcount_begin(&vtime->seqcount);
        /* We might have scheduled out from guest path */
        if (current->flags & PF_VCPU)
                vtime_account_guest(tsk, vtime);
        else
                __vtime_account_system(tsk, vtime);
        write_seqcount_end(&vtime->seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        write_seqcount_begin(&vtime->seqcount);
        __vtime_account_system(tsk, vtime);
        vtime->state = VTIME_USER;
        write_seqcount_end(&vtime->seqcount);
}

void vtime_user_exit(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        write_seqcount_begin(&vtime->seqcount);
        vtime->utime += get_vtime_delta(vtime);
        if (vtime->utime >= TICK_NSEC) {
                account_user_time(tsk, vtime->utime);
                vtime->utime = 0;
        }
        vtime->state = VTIME_SYS;
        write_seqcount_end(&vtime->seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;
        /*
         * The flags must be updated under the seqcount write section,
         * together with the vtime starttime flush and update. That enforces
         * the right ordering and update sequence synchronization against
         * the reader (task_gtime()), which can thus safely catch up with a
         * tickless delta.
         */
        write_seqcount_begin(&vtime->seqcount);
        __vtime_account_system(tsk, vtime);
        current->flags |= PF_VCPU;
        write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        write_seqcount_begin(&vtime->seqcount);
        vtime_account_guest(tsk, vtime);
        current->flags &= ~PF_VCPU;
        write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
        account_idle_time(get_vtime_delta(&tsk->vtime));
}

void arch_vtime_task_switch(struct task_struct *prev)
{
        struct vtime *vtime = &prev->vtime;

        write_seqcount_begin(&vtime->seqcount);
        vtime->state = VTIME_INACTIVE;
        write_seqcount_end(&vtime->seqcount);

        vtime = &current->vtime;

        write_seqcount_begin(&vtime->seqcount);
        vtime->state = VTIME_SYS;
        vtime->starttime = sched_clock();
        write_seqcount_end(&vtime->seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
        struct vtime *vtime = &t->vtime;
        unsigned long flags;

        local_irq_save(flags);
        write_seqcount_begin(&vtime->seqcount);
        vtime->state = VTIME_SYS;
        vtime->starttime = sched_clock();
        write_seqcount_end(&vtime->seqcount);
        local_irq_restore(flags);
}

u64 task_gtime(struct task_struct *t)
{
        struct vtime *vtime = &t->vtime;
        unsigned int seq;
        u64 gtime;

        if (!vtime_accounting_enabled())
                return t->gtime;

        do {
                seq = read_seqcount_begin(&vtime->seqcount);

                gtime = t->gtime;
                if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
                        gtime += vtime->gtime + vtime_delta(vtime);

        } while (read_seqcount_retry(&vtime->seqcount, seq));

        return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
        struct vtime *vtime = &t->vtime;
        unsigned int seq;
        u64 delta;

        if (!vtime_accounting_enabled()) {
                *utime = t->utime;
                *stime = t->stime;
                return;
        }

        do {
                seq = read_seqcount_begin(&vtime->seqcount);

                *utime = t->utime;
                *stime = t->stime;

                /* Task is sleeping, nothing to add */
                if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
                        continue;

                delta = vtime_delta(vtime);

                /*
                 * Task runs either in user or kernel space, add pending nohz time to
                 * the right place.
                 */
                if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
                        *utime += vtime->utime + delta;
                else if (vtime->state == VTIME_SYS)
                        *stime += vtime->stime + delta;
        } while (read_seqcount_retry(&vtime->seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */