linux/kernel/time/tick-sched.c
   1/*
   2 *  linux/kernel/time/tick-sched.c
   3 *
   4 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
   6 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
   7 *
   8 *  NO_HZ (no idle tick) implementation for low and high resolution timers
   9 *
  10 *  Started by: Thomas Gleixner and Ingo Molnar
  11 *
  12 *  Distribute under GPLv2.
  13 */
  14#include <linux/cpu.h>
  15#include <linux/err.h>
  16#include <linux/hrtimer.h>
  17#include <linux/interrupt.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/percpu.h>
  20#include <linux/profile.h>
  21#include <linux/sched.h>
  22#include <linux/module.h>
  23#include <linux/irq_work.h>
  24#include <linux/posix-timers.h>
  25#include <linux/perf_event.h>
  26
  27#include <asm/irq_regs.h>
  28
  29#include "tick-internal.h"
  30
  31#include <trace/events/timer.h>
  32
  33/*
  34 * Per cpu nohz control structure
  35 */
  36DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
  37
  38/*
  39 * The time when the last jiffy update happened. Protected by jiffies_lock.
  40 */
  41static ktime_t last_jiffies_update;
  42
  43struct tick_sched *tick_get_tick_sched(int cpu)
  44{
  45        return &per_cpu(tick_cpu_sched, cpu);
  46}
  47
  48/*
  49 * Must be called with interrupts disabled!
  50 */
  51static void tick_do_update_jiffies64(ktime_t now)
  52{
  53        unsigned long ticks = 0;
  54        ktime_t delta;
  55
  56        /*
  57         * Do a quick check without holding jiffies_lock:
  58         */
  59        delta = ktime_sub(now, last_jiffies_update);
  60        if (delta.tv64 < tick_period.tv64)
  61                return;
  62
  63        /* Re-evaluate with jiffies_lock held */
  64        write_seqlock(&jiffies_lock);
  65
  66        delta = ktime_sub(now, last_jiffies_update);
  67        if (delta.tv64 >= tick_period.tv64) {
  68
  69                delta = ktime_sub(delta, tick_period);
  70                last_jiffies_update = ktime_add(last_jiffies_update,
  71                                                tick_period);
  72
  73                /* Slow path for long timeouts */
  74                if (unlikely(delta.tv64 >= tick_period.tv64)) {
  75                        s64 incr = ktime_to_ns(tick_period);
  76
  77                        ticks = ktime_divns(delta, incr);
  78
  79                        last_jiffies_update = ktime_add_ns(last_jiffies_update,
  80                                                           incr * ticks);
  81                }
  82                do_timer(++ticks);
  83
  84                /* Keep the tick_next_period variable up to date */
  85                tick_next_period = ktime_add(last_jiffies_update, tick_period);
  86        }
  87        write_sequnlock(&jiffies_lock);
  88}
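    /*
     * Illustration of the catch-up above (assuming HZ=1000, i.e. a 1 ms
     * tick_period): if now - last_jiffies_update is 5.3 ms, the fast path
     * consumes one period (delta = 4.3 ms), the slow path adds
     * ktime_divns(4.3 ms, 1 ms) = 4 further periods, and do_timer(++ticks)
     * advances jiffies by 5. last_jiffies_update then trails now by 0.3 ms.
     */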
  89
  90/*
  91 * Initialize and return the current jiffies update time.
  92 */
  93static ktime_t tick_init_jiffy_update(void)
  94{
  95        ktime_t period;
  96
  97        write_seqlock(&jiffies_lock);
  98        /* Did we start the jiffies update yet ? */
  99        if (last_jiffies_update.tv64 == 0)
 100                last_jiffies_update = tick_next_period;
 101        period = last_jiffies_update;
 102        write_sequnlock(&jiffies_lock);
 103        return period;
 104}
 105
 106
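    /*
     * Take over the do_timer() duty if it was dropped by a CPU that went
     * into a long sleep, then update jiffies64 when this CPU is in charge.
     */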
 107static void tick_sched_do_timer(ktime_t now)
 108{
 109        int cpu = smp_processor_id();
 110
 111#ifdef CONFIG_NO_HZ_COMMON
 112        /*
 113         * Check if the do_timer duty was dropped. We don't care about
 114         * concurrency: This happens only when the cpu in charge went
 115 * into a long sleep. If two cpus happen to assign themselves to
 116         * this duty, then the jiffies update is still serialized by
 117         * jiffies_lock.
 118         */
 119        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 120            && !tick_nohz_full_cpu(cpu))
 121                tick_do_timer_cpu = cpu;
 122#endif
 123
 124        /* Check, if the jiffies need an update */
 125        if (tick_do_timer_cpu == cpu)
 126                tick_do_update_jiffies64(now);
 127}
 128
 129static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 130{
 131#ifdef CONFIG_NO_HZ_COMMON
 132        /*
 133         * When we are idle and the tick is stopped, we have to touch
 134         * the watchdog as we might not schedule for a really long
 135         * time. This happens on complete idle SMP systems while
 136         * waiting on the login prompt. We also increment the "start of
 137         * idle" jiffy stamp so the idle accounting adjustment we do
 138 * when we go busy again does not account too many ticks.
 139         */
 140        if (ts->tick_stopped) {
 141                touch_softlockup_watchdog();
 142                if (is_idle_task(current))
 143                        ts->idle_jiffies++;
 144        }
 145#endif
 146        update_process_times(user_mode(regs));
 147        profile_tick(CPU_PROFILING);
 148}
 149
 150#ifdef CONFIG_NO_HZ_FULL
 151static cpumask_var_t nohz_full_mask;
 152bool have_nohz_full_mask;
 153
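    /*
     * Check whether a full dynticks CPU is allowed to stop its tick: the
     * scheduler, posix CPU timers, perf events and (on some configurations)
     * an unstable sched clock can each veto it.
     */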
 154static bool can_stop_full_tick(void)
 155{
 156        WARN_ON_ONCE(!irqs_disabled());
 157
 158        if (!sched_can_stop_tick()) {
 159                trace_tick_stop(0, "more than 1 task in runqueue\n");
 160                return false;
 161        }
 162
 163        if (!posix_cpu_timers_can_stop_tick(current)) {
 164                trace_tick_stop(0, "posix timers running\n");
 165                return false;
 166        }
 167
 168        if (!perf_event_can_stop_tick()) {
 169                trace_tick_stop(0, "perf events running\n");
 170                return false;
 171        }
 172
 173        /* sched_clock_tick() needs us? */
 174#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 175        /*
 176         * TODO: kick full dynticks CPUs when
 177         * sched_clock_stable is set.
 178         */
 179        if (!sched_clock_stable) {
 180                trace_tick_stop(0, "unstable sched clock\n");
 181                return false;
 182        }
 183#endif
 184
 185        return true;
 186}
 187
 188static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
 189
 190/*
 191 * Re-evaluate the need for the tick on the current CPU
 192 * and restart it if necessary.
 193 */
 194void tick_nohz_full_check(void)
 195{
 196        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 197
 198        if (tick_nohz_full_cpu(smp_processor_id())) {
 199                if (ts->tick_stopped && !is_idle_task(current)) {
 200                        if (!can_stop_full_tick())
 201                                tick_nohz_restart_sched_tick(ts, ktime_get());
 202                }
 203        }
 204}
 205
 206static void nohz_full_kick_work_func(struct irq_work *work)
 207{
 208        tick_nohz_full_check();
 209}
 210
 211static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 212        .func = nohz_full_kick_work_func,
 213};
 214
 215/*
 216 * Kick the current CPU if it's full dynticks in order to force it to
 217 * re-evaluate its dependency on the tick and restart it if necessary.
 218 */
 219void tick_nohz_full_kick(void)
 220{
 221        if (tick_nohz_full_cpu(smp_processor_id()))
 222                irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
 223}
 224
 225static void nohz_full_kick_ipi(void *info)
 226{
 227        tick_nohz_full_check();
 228}
 229
 230/*
 231 * Kick all full dynticks CPUs in order to force them to re-evaluate
 232 * their dependency on the tick and restart it if necessary.
 233 */
 234void tick_nohz_full_kick_all(void)
 235{
 236        if (!have_nohz_full_mask)
 237                return;
 238
 239        preempt_disable();
 240        smp_call_function_many(nohz_full_mask,
 241                               nohz_full_kick_ipi, NULL, false);
 242        preempt_enable();
 243}
 244
 245/*
 246 * Re-evaluate the need for the tick as we switch the current task.
 247 * It might need the tick due to per task/process properties:
 248 * perf events, posix cpu timers, ...
 249 */
 250void tick_nohz_task_switch(struct task_struct *tsk)
 251{
 252        unsigned long flags;
 253
 254        local_irq_save(flags);
 255
 256        if (!tick_nohz_full_cpu(smp_processor_id()))
 257                goto out;
 258
 259        if (tick_nohz_tick_stopped() && !can_stop_full_tick())
 260                tick_nohz_full_kick();
 261
 262out:
 263        local_irq_restore(flags);
 264}
 265
 266int tick_nohz_full_cpu(int cpu)
 267{
 268        if (!have_nohz_full_mask)
 269                return 0;
 270
 271        return cpumask_test_cpu(cpu, nohz_full_mask);
 272}
 273
 274/* Parse the boot-time nohz CPU list from the kernel parameters. */
 275static int __init tick_nohz_full_setup(char *str)
 276{
 277        int cpu;
 278
 279        alloc_bootmem_cpumask_var(&nohz_full_mask);
 280        if (cpulist_parse(str, nohz_full_mask) < 0) {
 281                pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
 282                return 1;
 283        }
 284
 285        cpu = smp_processor_id();
 286        if (cpumask_test_cpu(cpu, nohz_full_mask)) {
 287                pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
 288                cpumask_clear_cpu(cpu, nohz_full_mask);
 289        }
 290        have_nohz_full_mask = true;
 291
 292        return 1;
 293}
 294__setup("nohz_full=", tick_nohz_full_setup);
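    /*
     * Example: booting with "nohz_full=1-7" requests full dynticks operation
     * on CPUs 1-7; cpulist_parse() accepts the usual list syntax such as
     * "1-3,5". The CPU running this setup code (normally the boot CPU) is
     * cleared from the mask so that one CPU always remains for timekeeping.
     */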
 295
 296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 297                                                 unsigned long action,
 298                                                 void *hcpu)
 299{
 300        unsigned int cpu = (unsigned long)hcpu;
 301
 302        switch (action & ~CPU_TASKS_FROZEN) {
 303        case CPU_DOWN_PREPARE:
 304                /*
 305                 * If we handle the timekeeping duty for full dynticks CPUs,
 306                 * we can't safely shutdown that CPU.
 307                 */
 308                if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
 309                        return NOTIFY_BAD;
 310                break;
 311        }
 312        return NOTIFY_OK;
 313}
 314
 315/*
 316 * The worst case string length comes from a CPU range list with two-step
 317 * separations: 0,2,4,6,...
 318 * That is NR_CPUS characters plus the terminating '\0'.
 319 */
 320static char __initdata nohz_full_buf[NR_CPUS + 1];
 321
 322static int tick_nohz_init_all(void)
 323{
 324        int err = -1;
 325
 326#ifdef CONFIG_NO_HZ_FULL_ALL
 327        if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
 328                pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
 329                return err;
 330        }
 331        err = 0;
 332        cpumask_setall(nohz_full_mask);
 333        cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
 334        have_nohz_full_mask = true;
 335#endif
 336        return err;
 337}
 338
 339void __init tick_nohz_init(void)
 340{
 341        int cpu;
 342
 343        if (!have_nohz_full_mask) {
 344                if (tick_nohz_init_all() < 0)
 345                        return;
 346        }
 347
 348        cpu_notifier(tick_nohz_cpu_down_callback, 0);
 349
 350        /* Make sure full dynticks CPUs are also RCU nocbs */
 351        for_each_cpu(cpu, nohz_full_mask) {
 352                if (!rcu_is_nocb_cpu(cpu)) {
 353                        pr_warning("NO_HZ: CPU %d is not RCU nocb: "
 354                                   "cleared from nohz_full range", cpu);
 355                        cpumask_clear_cpu(cpu, nohz_full_mask);
 356                }
 357        }
 358
 359        cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
 360        pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
 361}
 362#else
 363#define have_nohz_full_mask (0)
 364#endif
 365
 366/*
 367 * NOHZ - aka dynamic tick functionality
 368 */
 369#ifdef CONFIG_NO_HZ_COMMON
 370/*
 371 * NO HZ enabled?
 372 */
 373int tick_nohz_enabled __read_mostly  = 1;
 374
 375/*
 376 * Enable / Disable tickless mode
 377 */
 378static int __init setup_tick_nohz(char *str)
 379{
 380        if (!strcmp(str, "off"))
 381                tick_nohz_enabled = 0;
 382        else if (!strcmp(str, "on"))
 383                tick_nohz_enabled = 1;
 384        else
 385                return 0;
 386        return 1;
 387}
 388
 389__setup("nohz=", setup_tick_nohz);
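    /*
     * Example: "nohz=off" on the kernel command line disables dyntick idle
     * (tickless) mode, "nohz=on" keeps the default behaviour.
     */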
 390
 391/**
 392 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 393 *
 394 * Called from interrupt entry when the CPU was idle
 395 *
 396 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 397 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 398 * value. We do this unconditionally on any cpu, as we don't know whether the
 399 * cpu which has the update task assigned is in a long sleep.
 400 */
 401static void tick_nohz_update_jiffies(ktime_t now)
 402{
 403        int cpu = smp_processor_id();
 404        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 405        unsigned long flags;
 406
 407        ts->idle_waketime = now;
 408
 409        local_irq_save(flags);
 410        tick_do_update_jiffies64(now);
 411        local_irq_restore(flags);
 412
 413        touch_softlockup_watchdog();
 414}
 415
 416/*
 417 * Updates the per cpu time idle statistics counters
 418 */
 419static void
 420update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 421{
 422        ktime_t delta;
 423
 424        if (ts->idle_active) {
 425                delta = ktime_sub(now, ts->idle_entrytime);
 426                if (nr_iowait_cpu(cpu) > 0)
 427                        ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 428                else
 429                        ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 430                ts->idle_entrytime = now;
 431        }
 432
 433        if (last_update_time)
 434                *last_update_time = ktime_to_us(now);
 435
 436}
 437
 438static void tick_nohz_stop_idle(int cpu, ktime_t now)
 439{
 440        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 441
 442        update_ts_time_stats(cpu, ts, now, NULL);
 443        ts->idle_active = 0;
 444
 445        sched_clock_idle_wakeup_event(0);
 446}
 447
 448static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
 449{
 450        ktime_t now = ktime_get();
 451
 452        ts->idle_entrytime = now;
 453        ts->idle_active = 1;
 454        sched_clock_idle_sleep_event();
 455        return now;
 456}
 457
 458/**
 459 * get_cpu_idle_time_us - get the total idle time of a cpu
 460 * @cpu: CPU number to query
 461 * @last_update_time: variable to store update time in. Do not update
 462 * counters if NULL.
 463 *
 464 * Return the cumulative idle time (since boot) for a given
 465 * CPU, in microseconds.
 466 *
 467 * This time is measured via accounting rather than sampling,
 468 * and is as accurate as ktime_get() is.
 469 *
 470 * This function returns -1 if NOHZ is not enabled.
 471 */
 472u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 473{
 474        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 475        ktime_t now, idle;
 476
 477        if (!tick_nohz_enabled)
 478                return -1;
 479
 480        now = ktime_get();
 481        if (last_update_time) {
 482                update_ts_time_stats(cpu, ts, now, last_update_time);
 483                idle = ts->idle_sleeptime;
 484        } else {
 485                if (ts->idle_active && !nr_iowait_cpu(cpu)) {
 486                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 487
 488                        idle = ktime_add(ts->idle_sleeptime, delta);
 489                } else {
 490                        idle = ts->idle_sleeptime;
 491                }
 492        }
 493
 494        return ktime_to_us(idle);
 495
 496}
 497EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
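    /*
     * Illustrative use (callers such as cpufreq governors sample these
     * counters):
     *
     *	u64 wall, idle_us;
     *
     *	idle_us = get_cpu_idle_time_us(cpu, &wall);
     *
     * Passing a non-NULL @last_update_time folds a currently running idle
     * period into the counters before reading them; passing NULL only reads.
     */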
 498
 499/**
 500 * get_cpu_iowait_time_us - get the total iowait time of a cpu
 501 * @cpu: CPU number to query
 502 * @last_update_time: variable to store update time in. Do not update
 503 * counters if NULL.
 504 *
 505 * Return the cumulative iowait time (since boot) for a given
 506 * CPU, in microseconds.
 507 *
 508 * This time is measured via accounting rather than sampling,
 509 * and is as accurate as ktime_get() is.
 510 *
 511 * This function returns -1 if NOHZ is not enabled.
 512 */
 513u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 514{
 515        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 516        ktime_t now, iowait;
 517
 518        if (!tick_nohz_enabled)
 519                return -1;
 520
 521        now = ktime_get();
 522        if (last_update_time) {
 523                update_ts_time_stats(cpu, ts, now, last_update_time);
 524                iowait = ts->iowait_sleeptime;
 525        } else {
 526                if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
 527                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 528
 529                        iowait = ktime_add(ts->iowait_sleeptime, delta);
 530                } else {
 531                        iowait = ts->iowait_sleeptime;
 532                }
 533        }
 534
 535        return ktime_to_us(iowait);
 536}
 537EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 538
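    /*
     * Work out how long the tick can be stopped: take the next timer wheel
     * expiry, RCU, arch and irq_work requirements plus the timekeeping max
     * deferment into account, give up the do_timer() duty if this CPU holds
     * it, and reprogram (or cancel) the tick device. Returns the programmed
     * expiry, or 0 if the tick was left running.
     */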
 539static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 540                                         ktime_t now, int cpu)
 541{
 542        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
 543        ktime_t last_update, expires, ret = { .tv64 = 0 };
 544        unsigned long rcu_delta_jiffies;
 545        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 546        u64 time_delta;
 547
 548        /* Read jiffies and the time when jiffies were updated last */
 549        do {
 550                seq = read_seqbegin(&jiffies_lock);
 551                last_update = last_jiffies_update;
 552                last_jiffies = jiffies;
 553                time_delta = timekeeping_max_deferment();
 554        } while (read_seqretry(&jiffies_lock, seq));
 555
 556        if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
 557            arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
 558                next_jiffies = last_jiffies + 1;
 559                delta_jiffies = 1;
 560        } else {
 561                /* Get the next timer wheel timer */
 562                next_jiffies = get_next_timer_interrupt(last_jiffies);
 563                delta_jiffies = next_jiffies - last_jiffies;
 564                if (rcu_delta_jiffies < delta_jiffies) {
 565                        next_jiffies = last_jiffies + rcu_delta_jiffies;
 566                        delta_jiffies = rcu_delta_jiffies;
 567                }
 568        }
 569
 570        /*
 571         * Do not stop the tick if we are only one jiffy off (or less)
 572         * or if the cpu is required for RCU:
 573         */
 574        if (!ts->tick_stopped && delta_jiffies <= 1)
 575                goto out;
 576
 577        /* Schedule the tick if we are at least one jiffy off */
 578        if ((long)delta_jiffies >= 1) {
 579
 580                /*
 581                 * If this cpu is the one which updates jiffies, then
 582                 * give up the assignment and let it be taken by the
 583                 * cpu which runs the tick timer next, which might be
 584                 * this cpu as well. If we don't drop this here the
 585         * jiffies might be stale and do_timer() might never be
 586                 * invoked. Keep track of the fact that it was the one
 587                 * which had the do_timer() duty last. If this cpu is
 588                 * the one which had the do_timer() duty last, we
 589                 * limit the sleep time to the timekeeping
 590                 * max_deferement value which we retrieved
 591         * max_deferment value which we retrieved
 592                 */
 593                if (cpu == tick_do_timer_cpu) {
 594                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 595                        ts->do_timer_last = 1;
 596                } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
 597                        time_delta = KTIME_MAX;
 598                        ts->do_timer_last = 0;
 599                } else if (!ts->do_timer_last) {
 600                        time_delta = KTIME_MAX;
 601                }
 602
 603#ifdef CONFIG_NO_HZ_FULL
 604                if (!ts->inidle) {
 605                        time_delta = min(time_delta,
 606                                         scheduler_tick_max_deferment());
 607                }
 608#endif
 609
 610                /*
 611                 * calculate the expiry time for the next timer wheel
 612                 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
 613                 * that there is no timer pending or at least extremely
 614                 * far into the future (12 days for HZ=1000). In this
 615                 * case we set the expiry to the end of time.
 616                 */
 617                if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
 618                        /*
 619                         * Calculate the time delta for the next timer event.
 620                         * If the time delta exceeds the maximum time delta
 621                         * permitted by the current clocksource then adjust
 622                         * the time delta accordingly to ensure the
 623                         * clocksource does not wrap.
 624                         */
 625                        time_delta = min_t(u64, time_delta,
 626                                           tick_period.tv64 * delta_jiffies);
 627                }
 628
 629                if (time_delta < KTIME_MAX)
 630                        expires = ktime_add_ns(last_update, time_delta);
 631                else
 632                        expires.tv64 = KTIME_MAX;
 633
 634                /* Skip reprogram of event if it's not changed */
 635                if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
 636                        goto out;
 637
 638                ret = expires;
 639
 640                /*
 641                 * nohz_stop_sched_tick can be called several times before
 642                 * the nohz_restart_sched_tick is called. This happens when
 643                 * interrupts arrive which do not cause a reschedule. In the
 644                 * first call we save the current tick time, so we can restart
 645                 * the scheduler tick in nohz_restart_sched_tick.
 646                 */
 647                if (!ts->tick_stopped) {
 648                        nohz_balance_enter_idle(cpu);
 649                        calc_load_enter_idle();
 650
 651                        ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 652                        ts->tick_stopped = 1;
 653                        trace_tick_stop(1, " ");
 654                }
 655
 656                /*
 657                 * If the expiration time == KTIME_MAX, then we
 658                 * simply stop the tick timer.
 659                 */
 660                 if (unlikely(expires.tv64 == KTIME_MAX)) {
 661                        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 662                                hrtimer_cancel(&ts->sched_timer);
 663                        goto out;
 664                }
 665
 666                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 667                        hrtimer_start(&ts->sched_timer, expires,
 668                                      HRTIMER_MODE_ABS_PINNED);
 669                        /* Check, if the timer was already in the past */
 670                        if (hrtimer_active(&ts->sched_timer))
 671                                goto out;
 672                } else if (!tick_program_event(expires, 0))
 673                        goto out;
 674                /*
 675                 * We are past the event already. So we crossed a
 676                 * jiffy boundary. Update jiffies and raise the
 677                 * softirq.
 678                 */
 679                tick_do_update_jiffies64(ktime_get());
 680        }
 681        raise_softirq_irqoff(TIMER_SOFTIRQ);
 682out:
 683        ts->next_jiffies = next_jiffies;
 684        ts->last_jiffies = last_jiffies;
 685        ts->sleep_length = ktime_sub(dev->next_event, now);
 686
 687        return ret;
 688}
 689
 690static void tick_nohz_full_stop_tick(struct tick_sched *ts)
 691{
 692#ifdef CONFIG_NO_HZ_FULL
 693        int cpu = smp_processor_id();
 694
 695        if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
 696                return;
 697
 698        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
 699                return;
 700
 701        if (!can_stop_full_tick())
 702                return;
 703
 704        tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 705#endif
 706}
 707
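    /*
     * Check whether the idle tick may be stopped right now: the CPU must be
     * online, nohz mode must be active, no reschedule or softirq may be
     * pending, and with full dynticks CPUs around the timekeeping duty must
     * not depend on this CPU.
     */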
 708static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 709{
 710        /*
 711         * If this cpu is offline and it is the one which updates
 712         * jiffies, then give up the assignment and let it be taken by
 713         * the cpu which runs the tick timer next. If we don't drop
 714         * this here the jiffies might be stale and do_timer() never
 715         * invoked.
 716         */
 717        if (unlikely(!cpu_online(cpu))) {
 718                if (cpu == tick_do_timer_cpu)
 719                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 720                return false;
 721        }
 722
 723        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 724                return false;
 725
 726        if (need_resched())
 727                return false;
 728
 729        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
 730                static int ratelimit;
 731
 732                if (ratelimit < 10 &&
 733                    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
 734                        pr_warn("NOHZ: local_softirq_pending %02x\n",
 735                                (unsigned int) local_softirq_pending());
 736                        ratelimit++;
 737                }
 738                return false;
 739        }
 740
 741        if (have_nohz_full_mask) {
 742                /*
 743                 * Keep the tick alive to guarantee timekeeping progression
 744                 * if there are full dynticks CPUs around
 745                 */
 746                if (tick_do_timer_cpu == cpu)
 747                        return false;
 748                /*
 749                 * Boot safety: make sure the timekeeping duty has been
 750                 * assigned before entering dyntick-idle mode.
 751                 */
 752                if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 753                        return false;
 754        }
 755
 756        return true;
 757}
 758
 759static void __tick_nohz_idle_enter(struct tick_sched *ts)
 760{
 761        ktime_t now, expires;
 762        int cpu = smp_processor_id();
 763
 764        now = tick_nohz_start_idle(cpu, ts);
 765
 766        if (can_stop_idle_tick(cpu, ts)) {
 767                int was_stopped = ts->tick_stopped;
 768
 769                ts->idle_calls++;
 770
 771                expires = tick_nohz_stop_sched_tick(ts, now, cpu);
 772                if (expires.tv64 > 0LL) {
 773                        ts->idle_sleeps++;
 774                        ts->idle_expires = expires;
 775                }
 776
 777                if (!was_stopped && ts->tick_stopped)
 778                        ts->idle_jiffies = ts->last_jiffies;
 779        }
 780}
 781
 782/**
 783 * tick_nohz_idle_enter - stop the idle tick from the idle task
 784 *
 785 * When the next event is more than a tick into the future, stop the idle tick.
 786 * Called when we start the idle loop.
 787 *
 788 * The arch is responsible for calling:
 789 *
 790 * - rcu_idle_enter() after its last use of RCU before the CPU is put
 791 *  to sleep.
 792 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
 793 */
 794void tick_nohz_idle_enter(void)
 795{
 796        struct tick_sched *ts;
 797
 798        WARN_ON_ONCE(irqs_disabled());
 799
 800        /*
 801         * Update the idle state in the scheduler domain hierarchy
 802         * when tick_nohz_stop_sched_tick() is called from the idle loop.
 803         * State will be updated to busy during the first busy tick after
 804         * exiting idle.
 805         */
 806        set_cpu_sd_state_idle();
 807
 808        local_irq_disable();
 809
 810        ts = &__get_cpu_var(tick_cpu_sched);
 811        /*
 812         * Set ts->inidle unconditionally. Even if the system did not
 813         * switch to nohz mode the cpu frequency governors rely on the
 814         * update of the idle time accounting in tick_nohz_start_idle().
 815         */
 816        ts->inidle = 1;
 817        __tick_nohz_idle_enter(ts);
 818
 819        local_irq_enable();
 820}
 821EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
 822
 823/**
 824 * tick_nohz_irq_exit - update next tick event from interrupt exit
 825 *
 826 * When an interrupt fires while we are idle and it doesn't cause
 827 * a reschedule, it may still add, modify or delete a timer, enqueue
 828 * an RCU callback, etc...
 829 * So we need to re-calculate and reprogram the next tick event.
 830 */
 831void tick_nohz_irq_exit(void)
 832{
 833        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 834
 835        if (ts->inidle) {
 836                /* Cancel the timer because the CPU has already woken up from the C-states */
 837                menu_hrtimer_cancel();
 838                __tick_nohz_idle_enter(ts);
 839        } else {
 840                tick_nohz_full_stop_tick(ts);
 841        }
 842}
 843
 844/**
 845 * tick_nohz_get_sleep_length - return the length of the current sleep
 846 *
 847 * Called from power state control code with interrupts disabled
 848 */
 849ktime_t tick_nohz_get_sleep_length(void)
 850{
 851        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 852
 853        return ts->sleep_length;
 854}
 855
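    /*
     * Restore the periodic tick relative to the last tick: keep forwarding
     * the sched timer and updating jiffies until the programmed expiry lies
     * in the future.
     */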
 856static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 857{
 858        hrtimer_cancel(&ts->sched_timer);
 859        hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
 860
 861        while (1) {
 862                /* Forward the time to expire in the future */
 863                hrtimer_forward(&ts->sched_timer, now, tick_period);
 864
 865                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 866                        hrtimer_start_expires(&ts->sched_timer,
 867                                              HRTIMER_MODE_ABS_PINNED);
 868                        /* Check, if the timer was already in the past */
 869                        if (hrtimer_active(&ts->sched_timer))
 870                                break;
 871                } else {
 872                        if (!tick_program_event(
 873                                hrtimer_get_expires(&ts->sched_timer), 0))
 874                                break;
 875                }
 876                /* Reread time and update jiffies */
 877                now = ktime_get();
 878                tick_do_update_jiffies64(now);
 879        }
 880}
 881
 882static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 883{
 884        /* Update jiffies first */
 885        tick_do_update_jiffies64(now);
 886        update_cpu_load_nohz();
 887
 888        calc_load_exit_idle();
 889        touch_softlockup_watchdog();
 890        /*
 891         * Cancel the scheduled timer and restore the tick
 892         */
 893        ts->tick_stopped  = 0;
 894        ts->idle_exittime = now;
 895
 896        tick_nohz_restart(ts, now);
 897}
 898
 899static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 900{
 901#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 902        unsigned long ticks;
 903
 904        if (vtime_accounting_enabled())
 905                return;
 906        /*
 907         * We stopped the tick in idle. update_process_times() would miss
 908         * the time we slept, as it only does one tick worth of
 909         * accounting. Enforce that this is accounted to idle!
 910         */
 911        ticks = jiffies - ts->idle_jiffies;
 912        /*
 913         * We might be one off. Do not randomly account a huge number of ticks!
 914         */
 915        if (ticks && ticks < LONG_MAX)
 916                account_idle_ticks(ticks);
 917#endif
 918}
 919
 920/**
 921 * tick_nohz_idle_exit - restart the idle tick from the idle task
 922 *
 923 * Restart the idle tick when the CPU is woken up from idle
 924 * This also exit the RCU extended quiescent state. The CPU
 925 * can use RCU again after this function is called.
 926 */
 927void tick_nohz_idle_exit(void)
 928{
 929        int cpu = smp_processor_id();
 930        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 931        ktime_t now;
 932
 933        local_irq_disable();
 934
 935        WARN_ON_ONCE(!ts->inidle);
 936
 937        ts->inidle = 0;
 938
 939        /* Cancel the timer because the CPU has already woken up from the C-states */
 940        menu_hrtimer_cancel();
 941        if (ts->idle_active || ts->tick_stopped)
 942                now = ktime_get();
 943
 944        if (ts->idle_active)
 945                tick_nohz_stop_idle(cpu, now);
 946
 947        if (ts->tick_stopped) {
 948                tick_nohz_restart_sched_tick(ts, now);
 949                tick_nohz_account_idle_ticks(ts);
 950        }
 951
 952        local_irq_enable();
 953}
 954EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
 955
 956static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
 957{
 958        hrtimer_forward(&ts->sched_timer, now, tick_period);
 959        return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
 960}
 961
 962/*
 963 * The nohz low res interrupt handler
 964 */
 965static void tick_nohz_handler(struct clock_event_device *dev)
 966{
 967        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 968        struct pt_regs *regs = get_irq_regs();
 969        ktime_t now = ktime_get();
 970
 971        dev->next_event.tv64 = KTIME_MAX;
 972
 973        tick_sched_do_timer(now);
 974        tick_sched_handle(ts, regs);
 975
 976        while (tick_nohz_reprogram(ts, now)) {
 977                now = ktime_get();
 978                tick_do_update_jiffies64(now);
 979        }
 980}
 981
 982/**
 983 * tick_nohz_switch_to_nohz - switch to nohz mode
 984 */
 985static void tick_nohz_switch_to_nohz(void)
 986{
 987        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 988        ktime_t next;
 989
 990        if (!tick_nohz_enabled)
 991                return;
 992
 993        local_irq_disable();
 994        if (tick_switch_to_oneshot(tick_nohz_handler)) {
 995                local_irq_enable();
 996                return;
 997        }
 998
 999        ts->nohz_mode = NOHZ_MODE_LOWRES;
1000
1001        /*
1002         * Recycle the hrtimer in ts, so we can share the
1003         * hrtimer_forward with the highres code.
1004         */
1005        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1006        /* Get the next period */
1007        next = tick_init_jiffy_update();
1008
1009        for (;;) {
1010                hrtimer_set_expires(&ts->sched_timer, next);
1011                if (!tick_program_event(next, 0))
1012                        break;
1013                next = ktime_add(next, tick_period);
1014        }
1015        local_irq_enable();
1016}
1017
1018/*
1019 * When NOHZ is enabled and the tick is stopped, we need to kick the
1020 * tick timer from irq_enter() so that the jiffies update is kept
1021 * alive during long running softirqs. That's ugly as hell, but
1022 * correctness is key even if we need to fix the offending softirq in
1023 * the first place.
1024 *
1025 * Note, this is different to tick_nohz_restart. We just kick the
1026 * timer and do not touch the other magic bits which need to be done
1027 * when idle is left.
1028 */
1029static void tick_nohz_kick_tick(int cpu, ktime_t now)
1030{
1031#if 0
1032        /* Switch back to 2.6.27 behaviour */
1033
1034        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1035        ktime_t delta;
1036
1037        /*
1038         * Do not touch the tick device, when the next expiry is either
1039         * already reached or less than or equal to the tick period.
1040         */
1041        delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
1042        if (delta.tv64 <= tick_period.tv64)
1043                return;
1044
1045        tick_nohz_restart(ts, now);
1046#endif
1047}
1048
1049static inline void tick_check_nohz(int cpu)
1050{
1051        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1052        ktime_t now;
1053
1054        if (!ts->idle_active && !ts->tick_stopped)
1055                return;
1056        now = ktime_get();
1057        if (ts->idle_active)
1058                tick_nohz_stop_idle(cpu, now);
1059        if (ts->tick_stopped) {
1060                tick_nohz_update_jiffies(now);
1061                tick_nohz_kick_tick(cpu, now);
1062        }
1063}
1064
1065#else
1066
1067static inline void tick_nohz_switch_to_nohz(void) { }
1068static inline void tick_check_nohz(int cpu) { }
1069
1070#endif /* CONFIG_NO_HZ_COMMON */
1071
1072/*
1073 * Called from irq_enter to notify about the possible interruption of idle()
1074 */
1075void tick_check_idle(int cpu)
1076{
1077        tick_check_oneshot_broadcast(cpu);
1078        tick_check_nohz(cpu);
1079}
1080
1081/*
1082 * High resolution timer specific code
1083 */
1084#ifdef CONFIG_HIGH_RES_TIMERS
1085/*
1086 * We rearm the timer until we get disabled by the idle code.
1087 * Called with interrupts disabled.
1088 */
1089static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1090{
1091        struct tick_sched *ts =
1092                container_of(timer, struct tick_sched, sched_timer);
1093        struct pt_regs *regs = get_irq_regs();
1094        ktime_t now = ktime_get();
1095
1096        tick_sched_do_timer(now);
1097
1098        /*
1099         * Do not call, when we are not in irq context and have
1100         * no valid regs pointer
1101         */
1102        if (regs)
1103                tick_sched_handle(ts, regs);
1104
1105        hrtimer_forward(timer, now, tick_period);
1106
1107        return HRTIMER_RESTART;
1108}
1109
1110static int sched_skew_tick;
1111
1112static int __init skew_tick(char *str)
1113{
1114        get_option(&str, &sched_skew_tick);
1115
1116        return 0;
1117}
1118early_param("skew_tick", skew_tick);
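    /*
     * Example: booting with "skew_tick=1" enables the per-CPU offset applied
     * in tick_setup_sched_timer() below. Assuming HZ=1000 (1 ms tick_period)
     * and 4 possible CPUs, ticks are spread 125 us apart: CPU0 at +0, CPU1
     * at +125 us, CPU2 at +250 us and CPU3 at +375 us within the period.
     */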
1119
1120/**
1121 * tick_setup_sched_timer - setup the tick emulation timer
1122 */
1123void tick_setup_sched_timer(void)
1124{
1125        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1126        ktime_t now = ktime_get();
1127
1128        /*
1129         * Emulate tick processing via per-CPU hrtimers:
1130         */
1131        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1132        ts->sched_timer.function = tick_sched_timer;
1133
1134        /* Get the next period (per cpu) */
1135        hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1136
1137        /* Offset the tick to avert jiffies_lock contention. */
1138        if (sched_skew_tick) {
1139                u64 offset = ktime_to_ns(tick_period) >> 1;
1140                do_div(offset, num_possible_cpus());
1141                offset *= smp_processor_id();
1142                hrtimer_add_expires_ns(&ts->sched_timer, offset);
1143        }
1144
1145        for (;;) {
1146                hrtimer_forward(&ts->sched_timer, now, tick_period);
1147                hrtimer_start_expires(&ts->sched_timer,
1148                                      HRTIMER_MODE_ABS_PINNED);
1149                /* Check, if the timer was already in the past */
1150                if (hrtimer_active(&ts->sched_timer))
1151                        break;
1152                now = ktime_get();
1153        }
1154
1155#ifdef CONFIG_NO_HZ_COMMON
1156        if (tick_nohz_enabled)
1157                ts->nohz_mode = NOHZ_MODE_HIGHRES;
1158#endif
1159}
1160#endif /* HIGH_RES_TIMERS */
1161
1162#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
1163void tick_cancel_sched_timer(int cpu)
1164{
1165        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1166
1167# ifdef CONFIG_HIGH_RES_TIMERS
1168        if (ts->sched_timer.base)
1169                hrtimer_cancel(&ts->sched_timer);
1170# endif
1171
1172        memset(ts, 0, sizeof(*ts));
1173}
1174#endif
1175
1176/**
1177 * Async notification about clocksource changes
1178 */
1179void tick_clock_notify(void)
1180{
1181        int cpu;
1182
1183        for_each_possible_cpu(cpu)
1184                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
1185}
1186
1187/*
1188 * Async notification about clock event changes
1189 */
1190void tick_oneshot_notify(void)
1191{
1192        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1193
1194        set_bit(0, &ts->check_clocks);
1195}
1196
1197/**
1198 * Check if a change happened which makes oneshot possible.
1199 *
1200 * Called cyclically from the hrtimer softirq (driven by the timer
1201 * softirq). allow_nohz signals that we can switch into low-res nohz
1202 * mode, because high resolution timers are disabled (either at compile
1203 * time or at run time).
1204 */
1205int tick_check_oneshot_change(int allow_nohz)
1206{
1207        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1208
1209        if (!test_and_clear_bit(0, &ts->check_clocks))
1210                return 0;
1211
1212        if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
1213                return 0;
1214
1215        if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
1216                return 0;
1217
1218        if (!allow_nohz)
1219                return 1;
1220
1221        tick_nohz_switch_to_nohz();
1222        return 0;
1223}
1224