linux/kernel/time/tick-sched.c
   1/*
   2 *  linux/kernel/time/tick-sched.c
   3 *
   4 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
   6 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
   7 *
   8 *  No idle tick implementation for low and high resolution timers
   9 *
  10 *  Started by: Thomas Gleixner and Ingo Molnar
  11 *
  12 *  Distribute under GPLv2.
  13 */
  14#include <linux/cpu.h>
  15#include <linux/err.h>
  16#include <linux/hrtimer.h>
  17#include <linux/interrupt.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/percpu.h>
  20#include <linux/profile.h>
  21#include <linux/sched.h>
  22#include <linux/module.h>
  23#include <linux/irq_work.h>
  24#include <linux/posix-timers.h>
  25#include <linux/context_tracking.h>
  26
  27#include <asm/irq_regs.h>
  28
  29#include "tick-internal.h"
  30
  31#include <trace/events/timer.h>
  32
  33/*
  34 * Per-CPU nohz control structure
  35 */
  36static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
  37
  38struct tick_sched *tick_get_tick_sched(int cpu)
  39{
  40        return &per_cpu(tick_cpu_sched, cpu);
  41}
  42
  43#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
  44/*
   45 * The time when the last jiffy update happened. Protected by jiffies_lock.
  46 */
  47static ktime_t last_jiffies_update;
  48
  49/*
  50 * Must be called with interrupts disabled !
  51 */
  52static void tick_do_update_jiffies64(ktime_t now)
  53{
  54        unsigned long ticks = 0;
  55        ktime_t delta;
  56
  57        /*
  58         * Do a quick check without holding jiffies_lock:
  59         */
  60        delta = ktime_sub(now, last_jiffies_update);
  61        if (delta.tv64 < tick_period.tv64)
  62                return;
  63
  64        /* Reevaluate with jiffies_lock held */
  65        write_seqlock(&jiffies_lock);
  66
  67        delta = ktime_sub(now, last_jiffies_update);
  68        if (delta.tv64 >= tick_period.tv64) {
  69
  70                delta = ktime_sub(delta, tick_period);
  71                last_jiffies_update = ktime_add(last_jiffies_update,
  72                                                tick_period);
  73
  74                /* Slow path for long timeouts */
  75                if (unlikely(delta.tv64 >= tick_period.tv64)) {
  76                        s64 incr = ktime_to_ns(tick_period);
  77
  78                        ticks = ktime_divns(delta, incr);
  79
  80                        last_jiffies_update = ktime_add_ns(last_jiffies_update,
  81                                                           incr * ticks);
  82                }
  83                do_timer(++ticks);
  84
  85                /* Keep the tick_next_period variable up to date */
  86                tick_next_period = ktime_add(last_jiffies_update, tick_period);
  87        } else {
  88                write_sequnlock(&jiffies_lock);
  89                return;
  90        }
  91        write_sequnlock(&jiffies_lock);
  92        update_wall_time();
  93}
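
/*
 * Worked example (illustrative): with HZ=250, tick_period is 4 ms. If this
 * CPU calls tick_do_update_jiffies64() 22 ms after last_jiffies_update,
 * one tick_period is subtracted up front (delta = 18 ms), the slow path
 * accounts 4 more ticks (18 ms / 4 ms), and do_timer(++ticks) advances
 * jiffies by 5 in one go, while last_jiffies_update keeps the 2 ms
 * remainder for the next update.
 */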
  94
  95/*
   96 * Initialize and return the time of the last jiffies update.
  97 */
  98static ktime_t tick_init_jiffy_update(void)
  99{
 100        ktime_t period;
 101
 102        write_seqlock(&jiffies_lock);
 103        /* Did we start the jiffies update yet ? */
 104        if (last_jiffies_update.tv64 == 0)
 105                last_jiffies_update = tick_next_period;
 106        period = last_jiffies_update;
 107        write_sequnlock(&jiffies_lock);
 108        return period;
 109}
 110
 111
 112static void tick_sched_do_timer(ktime_t now)
 113{
 114        int cpu = smp_processor_id();
 115
 116#ifdef CONFIG_NO_HZ_COMMON
 117        /*
 118         * Check if the do_timer duty was dropped. We don't care about
 119         * concurrency: This happens only when the CPU in charge went
 120         * into a long sleep. If two CPUs happen to assign themselves to
 121         * this duty, then the jiffies update is still serialized by
 122         * jiffies_lock.
 123         */
 124        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 125            && !tick_nohz_full_cpu(cpu))
 126                tick_do_timer_cpu = cpu;
 127#endif
 128
  129        /* Check if jiffies need an update */
 130        if (tick_do_timer_cpu == cpu)
 131                tick_do_update_jiffies64(now);
 132}
 133
 134static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 135{
 136#ifdef CONFIG_NO_HZ_COMMON
 137        /*
 138         * When we are idle and the tick is stopped, we have to touch
 139         * the watchdog as we might not schedule for a really long
  140         * time. This happens on completely idle SMP systems while
 141         * waiting on the login prompt. We also increment the "start of
 142         * idle" jiffy stamp so the idle accounting adjustment we do
  143         * when we go busy again does not account too many ticks.
 144         */
 145        if (ts->tick_stopped) {
 146                touch_softlockup_watchdog_sched();
 147                if (is_idle_task(current))
 148                        ts->idle_jiffies++;
 149        }
 150#endif
 151        update_process_times(user_mode(regs));
 152        profile_tick(CPU_PROFILING);
 153}
 154#endif
 155
 156#ifdef CONFIG_NO_HZ_FULL
 157cpumask_var_t tick_nohz_full_mask;
 158cpumask_var_t housekeeping_mask;
 159bool tick_nohz_full_running;
 160static atomic_t tick_dep_mask;
 161
 162static bool check_tick_dependency(atomic_t *dep)
 163{
 164        int val = atomic_read(dep);
 165
 166        if (val & TICK_DEP_MASK_POSIX_TIMER) {
 167                trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
 168                return true;
 169        }
 170
 171        if (val & TICK_DEP_MASK_PERF_EVENTS) {
 172                trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
 173                return true;
 174        }
 175
 176        if (val & TICK_DEP_MASK_SCHED) {
 177                trace_tick_stop(0, TICK_DEP_MASK_SCHED);
 178                return true;
 179        }
 180
 181        if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
 182                trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
 183                return true;
 184        }
 185
 186        return false;
 187}
 188
 189static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
 190{
 191        WARN_ON_ONCE(!irqs_disabled());
 192
 193        if (unlikely(!cpu_online(cpu)))
 194                return false;
 195
 196        if (check_tick_dependency(&tick_dep_mask))
 197                return false;
 198
 199        if (check_tick_dependency(&ts->tick_dep_mask))
 200                return false;
 201
 202        if (check_tick_dependency(&current->tick_dep_mask))
 203                return false;
 204
 205        if (check_tick_dependency(&current->signal->tick_dep_mask))
 206                return false;
 207
 208        return true;
 209}
 210
 211static void nohz_full_kick_func(struct irq_work *work)
 212{
 213        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
 214}
 215
 216static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 217        .func = nohz_full_kick_func,
 218};
 219
 220/*
 221 * Kick this CPU if it's full dynticks in order to force it to
 222 * re-evaluate its dependency on the tick and restart it if necessary.
 223 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 224 * is NMI safe.
 225 */
 226static void tick_nohz_full_kick(void)
 227{
 228        if (!tick_nohz_full_cpu(smp_processor_id()))
 229                return;
 230
 231        irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
 232}
 233
 234/*
 235 * Kick the CPU if it's full dynticks in order to force it to
 236 * re-evaluate its dependency on the tick and restart it if necessary.
 237 */
 238void tick_nohz_full_kick_cpu(int cpu)
 239{
 240        if (!tick_nohz_full_cpu(cpu))
 241                return;
 242
 243        irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 244}
 245
 246/*
 247 * Kick all full dynticks CPUs in order to force these to re-evaluate
 248 * their dependency on the tick and restart it if necessary.
 249 */
 250static void tick_nohz_full_kick_all(void)
 251{
 252        int cpu;
 253
 254        if (!tick_nohz_full_running)
 255                return;
 256
 257        preempt_disable();
 258        for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
 259                tick_nohz_full_kick_cpu(cpu);
 260        preempt_enable();
 261}
 262
 263static void tick_nohz_dep_set_all(atomic_t *dep,
 264                                  enum tick_dep_bits bit)
 265{
 266        int prev;
 267
 268        prev = atomic_fetch_or(BIT(bit), dep);
 269        if (!prev)
 270                tick_nohz_full_kick_all();
 271}
 272
 273/*
 274 * Set a global tick dependency. Used by perf events that rely on freq and
 275 * by unstable clock.
 276 */
 277void tick_nohz_dep_set(enum tick_dep_bits bit)
 278{
 279        tick_nohz_dep_set_all(&tick_dep_mask, bit);
 280}
 281
 282void tick_nohz_dep_clear(enum tick_dep_bits bit)
 283{
 284        atomic_andnot(BIT(bit), &tick_dep_mask);
 285}
 286
 287/*
  288 * Set a per-CPU tick dependency. Used by the scheduler and perf events in
  289 * order to manage event throttling.
 290 */
 291void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
 292{
 293        int prev;
 294        struct tick_sched *ts;
 295
 296        ts = per_cpu_ptr(&tick_cpu_sched, cpu);
 297
 298        prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
 299        if (!prev) {
 300                preempt_disable();
 301                /* Perf needs local kick that is NMI safe */
 302                if (cpu == smp_processor_id()) {
 303                        tick_nohz_full_kick();
 304                } else {
 305                        /* Remote irq work not NMI-safe */
 306                        if (!WARN_ON_ONCE(in_nmi()))
 307                                tick_nohz_full_kick_cpu(cpu);
 308                }
 309                preempt_enable();
 310        }
 311}
 312
 313void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
 314{
 315        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
 316
 317        atomic_andnot(BIT(bit), &ts->tick_dep_mask);
 318}
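
/*
 * Illustrative sketch: how a subsystem could pin the tick on a CPU while it
 * needs periodic interrupts and drop the dependency again afterwards.
 * example_pin_tick_on_cpu() is a hypothetical helper, shown only to
 * demonstrate the per-CPU dependency API above.
 */
static void __maybe_unused example_pin_tick_on_cpu(int cpu, bool pin)
{
        if (pin)
                tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_PERF_EVENTS);
        else
                tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_PERF_EVENTS);
}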
 319
 320/*
 321 * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
  322 * per-task timers.
 323 */
 324void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 325{
 326        /*
  327         * We could optimize this by kicking only the CPU that is running the
  328         * task, if that noise matters for nohz_full users.
 329         */
 330        tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
 331}
 332
 333void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
 334{
 335        atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
 336}
 337
 338/*
  339 * Set a per-process tick dependency. Posix CPU timers need this in order to
  340 * elapse per-process timers.
 341 */
 342void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
 343{
 344        tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
 345}
 346
 347void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
 348{
 349        atomic_andnot(BIT(bit), &sig->tick_dep_mask);
 350}
 351
 352/*
 353 * Re-evaluate the need for the tick as we switch the current task.
 354 * It might need the tick due to per task/process properties:
 355 * perf events, posix CPU timers, ...
 356 */
 357void __tick_nohz_task_switch(void)
 358{
 359        unsigned long flags;
 360        struct tick_sched *ts;
 361
 362        local_irq_save(flags);
 363
 364        if (!tick_nohz_full_cpu(smp_processor_id()))
 365                goto out;
 366
 367        ts = this_cpu_ptr(&tick_cpu_sched);
 368
 369        if (ts->tick_stopped) {
 370                if (atomic_read(&current->tick_dep_mask) ||
 371                    atomic_read(&current->signal->tick_dep_mask))
 372                        tick_nohz_full_kick();
 373        }
 374out:
 375        local_irq_restore(flags);
 376}
 377
 378/* Parse the boot-time nohz CPU list from the kernel parameters. */
 379static int __init tick_nohz_full_setup(char *str)
 380{
 381        alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
 382        if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
 383                pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
 384                free_bootmem_cpumask_var(tick_nohz_full_mask);
 385                return 1;
 386        }
 387        tick_nohz_full_running = true;
 388
 389        return 1;
 390}
 391__setup("nohz_full=", tick_nohz_full_setup);
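
/*
 * Example (illustrative): booting with "nohz_full=1-7" marks CPUs 1-7 as
 * full dynticks CPUs. The boot/timekeeping CPU must not end up in the range
 * (tick_nohz_init() below clears it) and stays on the housekeeping side
 * together with every CPU that is not listed.
 */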
 392
 393static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 394                                       unsigned long action,
 395                                       void *hcpu)
 396{
 397        unsigned int cpu = (unsigned long)hcpu;
 398
 399        switch (action & ~CPU_TASKS_FROZEN) {
 400        case CPU_DOWN_PREPARE:
 401                /*
 402                 * The boot CPU handles housekeeping duty (unbound timers,
 403                 * workqueues, timekeeping, ...) on behalf of full dynticks
 404                 * CPUs. It must remain online when nohz full is enabled.
 405                 */
 406                if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
 407                        return NOTIFY_BAD;
 408                break;
 409        }
 410        return NOTIFY_OK;
 411}
 412
 413static int tick_nohz_init_all(void)
 414{
 415        int err = -1;
 416
 417#ifdef CONFIG_NO_HZ_FULL_ALL
 418        if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
 419                WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
 420                return err;
 421        }
 422        err = 0;
 423        cpumask_setall(tick_nohz_full_mask);
 424        tick_nohz_full_running = true;
 425#endif
 426        return err;
 427}
 428
 429void __init tick_nohz_init(void)
 430{
 431        int cpu;
 432
 433        if (!tick_nohz_full_running) {
 434                if (tick_nohz_init_all() < 0)
 435                        return;
 436        }
 437
 438        if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
 439                WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
 440                cpumask_clear(tick_nohz_full_mask);
 441                tick_nohz_full_running = false;
 442                return;
 443        }
 444
 445        /*
  446         * Full dynticks uses irq work to drive the tick rescheduling from safe
  447         * locking contexts. But then irq work needs to be able to raise its own
  448         * interrupts to avoid a circular dependency on the tick.
 449         */
 450        if (!arch_irq_work_has_interrupt()) {
 451                pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
 452                cpumask_clear(tick_nohz_full_mask);
 453                cpumask_copy(housekeeping_mask, cpu_possible_mask);
 454                tick_nohz_full_running = false;
 455                return;
 456        }
 457
 458        cpu = smp_processor_id();
 459
 460        if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
 461                pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
 462                        cpu);
 463                cpumask_clear_cpu(cpu, tick_nohz_full_mask);
 464        }
 465
 466        cpumask_andnot(housekeeping_mask,
 467                       cpu_possible_mask, tick_nohz_full_mask);
 468
 469        for_each_cpu(cpu, tick_nohz_full_mask)
 470                context_tracking_cpu_set(cpu);
 471
 472        cpu_notifier(tick_nohz_cpu_down_callback, 0);
 473        pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
 474                cpumask_pr_args(tick_nohz_full_mask));
 475
 476        /*
 477         * We need at least one CPU to handle housekeeping work such
 478         * as timekeeping, unbound timers, workqueues, ...
 479         */
 480        WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 481}
 482#endif
 483
 484/*
 485 * NOHZ - aka dynamic tick functionality
 486 */
 487#ifdef CONFIG_NO_HZ_COMMON
 488/*
  489 * NO HZ enabled?
 490 */
 491bool tick_nohz_enabled __read_mostly  = true;
 492unsigned long tick_nohz_active  __read_mostly;
 493/*
 494 * Enable / Disable tickless mode
 495 */
 496static int __init setup_tick_nohz(char *str)
 497{
 498        return (kstrtobool(str, &tick_nohz_enabled) == 0);
 499}
 500
 501__setup("nohz=", setup_tick_nohz);
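
/*
 * Example (illustrative): booting with "nohz=off" keeps the periodic tick
 * even on idle CPUs, while "nohz=on" (the default) allows the idle tick to
 * be stopped.
 */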
 502
 503int tick_nohz_tick_stopped(void)
 504{
 505        return __this_cpu_read(tick_cpu_sched.tick_stopped);
 506}
 507
 508/**
 509 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 510 *
 511 * Called from interrupt entry when the CPU was idle
 512 *
 513 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 514 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 515 * value. We do this unconditionally on any CPU, as we don't know whether the
  516 * CPU which has the update task assigned is in a long sleep.
 517 */
 518static void tick_nohz_update_jiffies(ktime_t now)
 519{
 520        unsigned long flags;
 521
 522        __this_cpu_write(tick_cpu_sched.idle_waketime, now);
 523
 524        local_irq_save(flags);
 525        tick_do_update_jiffies64(now);
 526        local_irq_restore(flags);
 527
 528        touch_softlockup_watchdog_sched();
 529}
 530
 531/*
 532 * Updates the per-CPU time idle statistics counters
 533 */
 534static void
 535update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 536{
 537        ktime_t delta;
 538
 539        if (ts->idle_active) {
 540                delta = ktime_sub(now, ts->idle_entrytime);
 541                if (nr_iowait_cpu(cpu) > 0)
 542                        ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 543                else
 544                        ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 545                ts->idle_entrytime = now;
 546        }
 547
 548        if (last_update_time)
 549                *last_update_time = ktime_to_us(now);
 550
 551}
 552
 553static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 554{
 555        update_ts_time_stats(smp_processor_id(), ts, now, NULL);
 556        ts->idle_active = 0;
 557
 558        sched_clock_idle_wakeup_event(0);
 559}
 560
 561static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 562{
 563        ktime_t now = ktime_get();
 564
 565        ts->idle_entrytime = now;
 566        ts->idle_active = 1;
 567        sched_clock_idle_sleep_event();
 568        return now;
 569}
 570
 571/**
 572 * get_cpu_idle_time_us - get the total idle time of a CPU
 573 * @cpu: CPU number to query
 574 * @last_update_time: variable to store update time in. Do not update
 575 * counters if NULL.
 576 *
 577 * Return the cumulative idle time (since boot) for a given
 578 * CPU, in microseconds.
 579 *
 580 * This time is measured via accounting rather than sampling,
 581 * and is as accurate as ktime_get() is.
 582 *
 583 * This function returns -1 if NOHZ is not enabled.
 584 */
 585u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 586{
 587        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 588        ktime_t now, idle;
 589
 590        if (!tick_nohz_active)
 591                return -1;
 592
 593        now = ktime_get();
 594        if (last_update_time) {
 595                update_ts_time_stats(cpu, ts, now, last_update_time);
 596                idle = ts->idle_sleeptime;
 597        } else {
 598                if (ts->idle_active && !nr_iowait_cpu(cpu)) {
 599                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 600
 601                        idle = ktime_add(ts->idle_sleeptime, delta);
 602                } else {
 603                        idle = ts->idle_sleeptime;
 604                }
 605        }
 606
 607        return ktime_to_us(idle);
 608
 609}
 610EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 611
 612/**
 613 * get_cpu_iowait_time_us - get the total iowait time of a CPU
 614 * @cpu: CPU number to query
 615 * @last_update_time: variable to store update time in. Do not update
 616 * counters if NULL.
 617 *
 618 * Return the cumulative iowait time (since boot) for a given
 619 * CPU, in microseconds.
 620 *
 621 * This time is measured via accounting rather than sampling,
 622 * and is as accurate as ktime_get() is.
 623 *
 624 * This function returns -1 if NOHZ is not enabled.
 625 */
 626u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 627{
 628        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 629        ktime_t now, iowait;
 630
 631        if (!tick_nohz_active)
 632                return -1;
 633
 634        now = ktime_get();
 635        if (last_update_time) {
 636                update_ts_time_stats(cpu, ts, now, last_update_time);
 637                iowait = ts->iowait_sleeptime;
 638        } else {
 639                if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
 640                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 641
 642                        iowait = ktime_add(ts->iowait_sleeptime, delta);
 643                } else {
 644                        iowait = ts->iowait_sleeptime;
 645                }
 646        }
 647
 648        return ktime_to_us(iowait);
 649}
 650EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
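
/*
 * Illustrative sketch: how a consumer such as a cpufreq governor might turn
 * the cumulative counters above into the idle time spent during one
 * sampling interval. example_idle_delta_us() and its caller-provided
 * @prev_idle_us are hypothetical.
 */
static u64 __maybe_unused example_idle_delta_us(int cpu, u64 *prev_idle_us)
{
        u64 cur = get_cpu_idle_time_us(cpu, NULL);
        u64 delta;

        /* get_cpu_idle_time_us() returns -1 when NOHZ is not enabled */
        if (cur == (u64)-1)
                return 0;

        delta = cur - *prev_idle_us;
        *prev_idle_us = cur;
        return delta;
}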
 651
 652static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 653{
 654        hrtimer_cancel(&ts->sched_timer);
 655        hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
 656
 657        /* Forward the time to expire in the future */
 658        hrtimer_forward(&ts->sched_timer, now, tick_period);
 659
 660        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 661                hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
 662        else
 663                tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 664}
 665
 666static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 667                                         ktime_t now, int cpu)
 668{
 669        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 670        u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
 671        unsigned long seq, basejiff;
 672        ktime_t tick;
 673
 674        /* Read jiffies and the time when jiffies were updated last */
 675        do {
 676                seq = read_seqbegin(&jiffies_lock);
 677                basemono = last_jiffies_update.tv64;
 678                basejiff = jiffies;
 679        } while (read_seqretry(&jiffies_lock, seq));
 680        ts->last_jiffies = basejiff;
 681
 682        if (rcu_needs_cpu(basemono, &next_rcu) ||
 683            arch_needs_cpu() || irq_work_needs_cpu()) {
 684                next_tick = basemono + TICK_NSEC;
 685        } else {
 686                /*
 687                 * Get the next pending timer. If high resolution
 688                 * timers are enabled this only takes the timer wheel
 689                 * timers into account. If high resolution timers are
 690                 * disabled this also looks at the next expiring
 691                 * hrtimer.
 692                 */
 693                next_tmr = get_next_timer_interrupt(basejiff, basemono);
 694                ts->next_timer = next_tmr;
 695                /* Take the next rcu event into account */
 696                next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
 697        }
 698
 699        /*
  700         * If the tick is due in the next period, keep it ticking or
  701         * prod the timer hardware into firing.
 702         */
 703        delta = next_tick - basemono;
 704        if (delta <= (u64)TICK_NSEC) {
 705                tick.tv64 = 0;
 706
 707                /*
 708                 * Tell the timer code that the base is not idle, i.e. undo
 709                 * the effect of get_next_timer_interrupt():
 710                 */
 711                timer_clear_idle();
 712                /*
 713                 * We've not stopped the tick yet, and there's a timer in the
 714                 * next period, so no point in stopping it either, bail.
 715                 */
 716                if (!ts->tick_stopped)
 717                        goto out;
 718
 719                /*
 720                 * If, OTOH, we did stop it, but there's a pending (expired)
 721                 * timer reprogram the timer hardware to fire now.
 722                 *
 723                 * We will not restart the tick proper, just prod the timer
 724                 * hardware into firing an interrupt to process the pending
 725                 * timers. Just like tick_irq_exit() will not restart the tick
 726                 * for 'normal' interrupts.
 727                 *
 728                 * Only once we exit the idle loop will we re-enable the tick,
 729                 * see tick_nohz_idle_exit().
 730                 */
 731                if (delta == 0) {
 732                        tick_nohz_restart(ts, now);
 733                        goto out;
 734                }
 735        }
 736
 737        /*
 738         * If this CPU is the one which updates jiffies, then give up
 739         * the assignment and let it be taken by the CPU which runs
 740         * the tick timer next, which might be this CPU as well. If we
  741         * don't drop this here, the jiffies might be stale and
  742         * do_timer() might never be invoked. Keep track of the fact that it
 743         * was the one which had the do_timer() duty last. If this CPU
 744         * is the one which had the do_timer() duty last, we limit the
 745         * sleep time to the timekeeping max_deferment value.
 746         * Otherwise we can sleep as long as we want.
 747         */
 748        delta = timekeeping_max_deferment();
 749        if (cpu == tick_do_timer_cpu) {
 750                tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 751                ts->do_timer_last = 1;
 752        } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
 753                delta = KTIME_MAX;
 754                ts->do_timer_last = 0;
 755        } else if (!ts->do_timer_last) {
 756                delta = KTIME_MAX;
 757        }
 758
 759#ifdef CONFIG_NO_HZ_FULL
 760        /* Limit the tick delta to the maximum scheduler deferment */
 761        if (!ts->inidle)
 762                delta = min(delta, scheduler_tick_max_deferment());
 763#endif
 764
 765        /* Calculate the next expiry time */
 766        if (delta < (KTIME_MAX - basemono))
 767                expires = basemono + delta;
 768        else
 769                expires = KTIME_MAX;
 770
 771        expires = min_t(u64, expires, next_tick);
 772        tick.tv64 = expires;
 773
  774        /* Skip reprogramming the event if its expiry hasn't changed */
 775        if (ts->tick_stopped && (expires == dev->next_event.tv64))
 776                goto out;
 777
 778        /*
  779         * tick_nohz_stop_sched_tick() can be called several times before
  780         * tick_nohz_restart_sched_tick() is called. This happens when
 781         * interrupts arrive which do not cause a reschedule. In the
 782         * first call we save the current tick time, so we can restart
  783         * the scheduler tick in tick_nohz_restart_sched_tick().
 784         */
 785        if (!ts->tick_stopped) {
 786                nohz_balance_enter_idle(cpu);
 787                calc_load_enter_idle();
 788                cpu_load_update_nohz_start();
 789
 790                ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 791                ts->tick_stopped = 1;
 792                trace_tick_stop(1, TICK_DEP_MASK_NONE);
 793        }
 794
 795        /*
 796         * If the expiration time == KTIME_MAX, then we simply stop
 797         * the tick timer.
 798         */
 799        if (unlikely(expires == KTIME_MAX)) {
 800                if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 801                        hrtimer_cancel(&ts->sched_timer);
 802                goto out;
 803        }
 804
 805        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 806                hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
 807        else
 808                tick_program_event(tick, 1);
 809out:
 810        /* Update the estimated sleep length */
 811        ts->sleep_length = ktime_sub(dev->next_event, now);
 812        return tick;
 813}
 814
 815static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 816{
 817        /* Update jiffies first */
 818        tick_do_update_jiffies64(now);
 819        cpu_load_update_nohz_stop();
 820
 821        /*
 822         * Clear the timer idle flag, so we avoid IPIs on remote queueing and
 823         * the clock forward checks in the enqueue path:
 824         */
 825        timer_clear_idle();
 826
 827        calc_load_exit_idle();
 828        touch_softlockup_watchdog_sched();
 829        /*
 830         * Cancel the scheduled timer and restore the tick
 831         */
 832        ts->tick_stopped  = 0;
 833        ts->idle_exittime = now;
 834
 835        tick_nohz_restart(ts, now);
 836}
 837
 838static void tick_nohz_full_update_tick(struct tick_sched *ts)
 839{
 840#ifdef CONFIG_NO_HZ_FULL
 841        int cpu = smp_processor_id();
 842
 843        if (!tick_nohz_full_cpu(cpu))
 844                return;
 845
 846        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
 847                return;
 848
 849        if (can_stop_full_tick(cpu, ts))
 850                tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 851        else if (ts->tick_stopped)
 852                tick_nohz_restart_sched_tick(ts, ktime_get());
 853#endif
 854}
 855
 856static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 857{
 858        /*
 859         * If this CPU is offline and it is the one which updates
 860         * jiffies, then give up the assignment and let it be taken by
 861         * the CPU which runs the tick timer next. If we don't drop
 862         * this here the jiffies might be stale and do_timer() never
 863         * invoked.
 864         */
 865        if (unlikely(!cpu_online(cpu))) {
 866                if (cpu == tick_do_timer_cpu)
 867                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 868                return false;
 869        }
 870
 871        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
 872                ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
 873                return false;
 874        }
 875
 876        if (need_resched())
 877                return false;
 878
 879        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
 880                static int ratelimit;
 881
 882                if (ratelimit < 10 &&
 883                    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
 884                        pr_warn("NOHZ: local_softirq_pending %02x\n",
 885                                (unsigned int) local_softirq_pending());
 886                        ratelimit++;
 887                }
 888                return false;
 889        }
 890
 891        if (tick_nohz_full_enabled()) {
 892                /*
 893                 * Keep the tick alive to guarantee timekeeping progression
 894                 * if there are full dynticks CPUs around
 895                 */
 896                if (tick_do_timer_cpu == cpu)
 897                        return false;
 898                /*
 899                 * Boot safety: make sure the timekeeping duty has been
  900                 * assigned before entering dyntick-idle mode.
 901                 */
 902                if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 903                        return false;
 904        }
 905
 906        return true;
 907}
 908
 909static void __tick_nohz_idle_enter(struct tick_sched *ts)
 910{
 911        ktime_t now, expires;
 912        int cpu = smp_processor_id();
 913
 914        now = tick_nohz_start_idle(ts);
 915
 916        if (can_stop_idle_tick(cpu, ts)) {
 917                int was_stopped = ts->tick_stopped;
 918
 919                ts->idle_calls++;
 920
 921                expires = tick_nohz_stop_sched_tick(ts, now, cpu);
 922                if (expires.tv64 > 0LL) {
 923                        ts->idle_sleeps++;
 924                        ts->idle_expires = expires;
 925                }
 926
 927                if (!was_stopped && ts->tick_stopped)
 928                        ts->idle_jiffies = ts->last_jiffies;
 929        }
 930}
 931
 932/**
 933 * tick_nohz_idle_enter - stop the idle tick from the idle task
 934 *
  935 * When the next event is more than a tick into the future, stop the idle tick.
  936 * Called when we start the idle loop.
  937 *
  938 * The arch is responsible for calling:
 939 *
 940 * - rcu_idle_enter() after its last use of RCU before the CPU is put
 941 *  to sleep.
 942 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
 943 */
 944void tick_nohz_idle_enter(void)
 945{
 946        struct tick_sched *ts;
 947
 948        WARN_ON_ONCE(irqs_disabled());
 949
 950        /*
 951         * Update the idle state in the scheduler domain hierarchy
 952         * when tick_nohz_stop_sched_tick() is called from the idle loop.
 953         * State will be updated to busy during the first busy tick after
 954         * exiting idle.
 955         */
 956        set_cpu_sd_state_idle();
 957
 958        local_irq_disable();
 959
 960        ts = this_cpu_ptr(&tick_cpu_sched);
 961        ts->inidle = 1;
 962        __tick_nohz_idle_enter(ts);
 963
 964        local_irq_enable();
 965}
 966
 967/**
 968 * tick_nohz_irq_exit - update next tick event from interrupt exit
 969 *
 970 * When an interrupt fires while we are idle and it doesn't cause
 971 * a reschedule, it may still add, modify or delete a timer, enqueue
 972 * an RCU callback, etc...
 973 * So we need to re-calculate and reprogram the next tick event.
 974 */
 975void tick_nohz_irq_exit(void)
 976{
 977        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 978
 979        if (ts->inidle)
 980                __tick_nohz_idle_enter(ts);
 981        else
 982                tick_nohz_full_update_tick(ts);
 983}
 984
 985/**
 986 * tick_nohz_get_sleep_length - return the length of the current sleep
 987 *
 988 * Called from power state control code with interrupts disabled
 989 */
 990ktime_t tick_nohz_get_sleep_length(void)
 991{
 992        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 993
 994        return ts->sleep_length;
 995}
 996
 997static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 998{
 999#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1000        unsigned long ticks;
1001
1002        if (vtime_accounting_cpu_enabled())
1003                return;
1004        /*
 1005         * We stopped the tick in idle. update_process_times() would miss the
 1006         * time we slept, as it only accounts a single tick per
 1007         * invocation. Enforce that this time is accounted to idle!
1008         */
1009        ticks = jiffies - ts->idle_jiffies;
1010        /*
1011         * We might be one off. Do not randomly account a huge number of ticks!
1012         */
1013        if (ticks && ticks < LONG_MAX)
1014                account_idle_ticks(ticks);
1015#endif
1016}
1017
1018/**
1019 * tick_nohz_idle_exit - restart the idle tick from the idle task
1020 *
 1021 * Restart the idle tick when the CPU is woken up from idle.
 1022 * This also exits the RCU extended quiescent state. The CPU
1023 * can use RCU again after this function is called.
1024 */
1025void tick_nohz_idle_exit(void)
1026{
1027        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1028        ktime_t now;
1029
1030        local_irq_disable();
1031
1032        WARN_ON_ONCE(!ts->inidle);
1033
1034        ts->inidle = 0;
1035
1036        if (ts->idle_active || ts->tick_stopped)
1037                now = ktime_get();
1038
1039        if (ts->idle_active)
1040                tick_nohz_stop_idle(ts, now);
1041
1042        if (ts->tick_stopped) {
1043                tick_nohz_restart_sched_tick(ts, now);
1044                tick_nohz_account_idle_ticks(ts);
1045        }
1046
1047        local_irq_enable();
1048}
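
/*
 * Illustrative sketch: the call ordering the idle loop is expected to
 * follow around tick_nohz_idle_enter() and tick_nohz_idle_exit(), per the
 * kernel-doc above. example_idle_loop() is hypothetical and heavily
 * simplified compared to the real idle loop in kernel/sched/idle.c.
 */
static void __maybe_unused example_idle_loop(void)
{
        tick_nohz_idle_enter();
        while (!need_resched()) {
                rcu_idle_enter();       /* last use of RCU before sleeping */
                arch_cpu_idle();        /* low level idle, wakes on interrupt */
                rcu_idle_exit();        /* RCU is usable again from here on */
        }
        tick_nohz_idle_exit();
}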
1049
1050/*
1051 * The nohz low res interrupt handler
1052 */
1053static void tick_nohz_handler(struct clock_event_device *dev)
1054{
1055        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1056        struct pt_regs *regs = get_irq_regs();
1057        ktime_t now = ktime_get();
1058
1059        dev->next_event.tv64 = KTIME_MAX;
1060
1061        tick_sched_do_timer(now);
1062        tick_sched_handle(ts, regs);
1063
1064        /* No need to reprogram if we are running tickless  */
1065        if (unlikely(ts->tick_stopped))
1066                return;
1067
1068        hrtimer_forward(&ts->sched_timer, now, tick_period);
1069        tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
1070}
1071
1072static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
1073{
1074        if (!tick_nohz_enabled)
1075                return;
1076        ts->nohz_mode = mode;
1077        /* One update is enough */
1078        if (!test_and_set_bit(0, &tick_nohz_active))
1079                timers_update_migration(true);
1080}
1081
1082/**
1083 * tick_nohz_switch_to_nohz - switch to nohz mode
1084 */
1085static void tick_nohz_switch_to_nohz(void)
1086{
1087        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1088        ktime_t next;
1089
1090        if (!tick_nohz_enabled)
1091                return;
1092
1093        if (tick_switch_to_oneshot(tick_nohz_handler))
1094                return;
1095
1096        /*
1097         * Recycle the hrtimer in ts, so we can share the
1098         * hrtimer_forward with the highres code.
1099         */
1100        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1101        /* Get the next period */
1102        next = tick_init_jiffy_update();
1103
1104        hrtimer_set_expires(&ts->sched_timer, next);
1105        hrtimer_forward_now(&ts->sched_timer, tick_period);
1106        tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
1107        tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
1108}
1109
1110static inline void tick_nohz_irq_enter(void)
1111{
1112        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1113        ktime_t now;
1114
1115        if (!ts->idle_active && !ts->tick_stopped)
1116                return;
1117        now = ktime_get();
1118        if (ts->idle_active)
1119                tick_nohz_stop_idle(ts, now);
1120        if (ts->tick_stopped)
1121                tick_nohz_update_jiffies(now);
1122}
1123
1124#else
1125
1126static inline void tick_nohz_switch_to_nohz(void) { }
1127static inline void tick_nohz_irq_enter(void) { }
1128static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
1129
1130#endif /* CONFIG_NO_HZ_COMMON */
1131
1132/*
1133 * Called from irq_enter to notify about the possible interruption of idle()
1134 */
1135void tick_irq_enter(void)
1136{
1137        tick_check_oneshot_broadcast_this_cpu();
1138        tick_nohz_irq_enter();
1139}
1140
1141/*
1142 * High resolution timer specific code
1143 */
1144#ifdef CONFIG_HIGH_RES_TIMERS
1145/*
1146 * We rearm the timer until we get disabled by the idle code.
1147 * Called with interrupts disabled.
1148 */
1149static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1150{
1151        struct tick_sched *ts =
1152                container_of(timer, struct tick_sched, sched_timer);
1153        struct pt_regs *regs = get_irq_regs();
1154        ktime_t now = ktime_get();
1155
1156        tick_sched_do_timer(now);
1157
1158        /*
 1159         * Do not call tick_sched_handle() when we are not in irq context and
 1160         * have no valid regs pointer.
1161         */
1162        if (regs)
1163                tick_sched_handle(ts, regs);
1164
1165        /* No need to reprogram if we are in idle or full dynticks mode */
1166        if (unlikely(ts->tick_stopped))
1167                return HRTIMER_NORESTART;
1168
1169        hrtimer_forward(timer, now, tick_period);
1170
1171        return HRTIMER_RESTART;
1172}
1173
1174static int sched_skew_tick;
1175
1176static int __init skew_tick(char *str)
1177{
1178        get_option(&str, &sched_skew_tick);
1179
1180        return 0;
1181}
1182early_param("skew_tick", skew_tick);
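
/*
 * Example (illustrative): booting with "skew_tick=1" enables the per-CPU
 * tick offset applied in tick_setup_sched_timer() below, staggering the
 * tick timers across CPUs to reduce jiffies_lock contention on large
 * machines.
 */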
1183
1184/**
1185 * tick_setup_sched_timer - setup the tick emulation timer
1186 */
1187void tick_setup_sched_timer(void)
1188{
1189        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1190        ktime_t now = ktime_get();
1191
1192        /*
1193         * Emulate tick processing via per-CPU hrtimers:
1194         */
1195        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1196        ts->sched_timer.function = tick_sched_timer;
1197
1198        /* Get the next period (per-CPU) */
1199        hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1200
1201        /* Offset the tick to avert jiffies_lock contention. */
1202        if (sched_skew_tick) {
1203                u64 offset = ktime_to_ns(tick_period) >> 1;
1204                do_div(offset, num_possible_cpus());
1205                offset *= smp_processor_id();
1206                hrtimer_add_expires_ns(&ts->sched_timer, offset);
1207        }
1208
1209        hrtimer_forward(&ts->sched_timer, now, tick_period);
1210        hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
1211        tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
1212}
1213#endif /* HIGH_RES_TIMERS */
1214
1215#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
1216void tick_cancel_sched_timer(int cpu)
1217{
1218        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1219
1220# ifdef CONFIG_HIGH_RES_TIMERS
1221        if (ts->sched_timer.base)
1222                hrtimer_cancel(&ts->sched_timer);
1223# endif
1224
1225        memset(ts, 0, sizeof(*ts));
1226}
1227#endif
1228
1229/**
1230 * Async notification about clocksource changes
1231 */
1232void tick_clock_notify(void)
1233{
1234        int cpu;
1235
1236        for_each_possible_cpu(cpu)
1237                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
1238}
1239
1240/*
1241 * Async notification about clock event changes
1242 */
1243void tick_oneshot_notify(void)
1244{
1245        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1246
1247        set_bit(0, &ts->check_clocks);
1248}
1249
1250/**
 1251 * Check if a change happened which makes oneshot mode possible.
 1252 *
 1253 * Called cyclically from the hrtimer softirq (driven by the timer
 1254 * softirq). allow_nohz signals that we can switch into low-res nohz
 1255 * mode, because high resolution timers are disabled (either at compile
 1256 * time or at runtime). Called with interrupts disabled.
1257 */
1258int tick_check_oneshot_change(int allow_nohz)
1259{
1260        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1261
1262        if (!test_and_clear_bit(0, &ts->check_clocks))
1263                return 0;
1264
1265        if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
1266                return 0;
1267
1268        if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
1269                return 0;
1270
1271        if (!allow_nohz)
1272                return 1;
1273
1274        tick_nohz_switch_to_nohz();
1275        return 0;
1276}
1277