linux/kernel/time/tick-sched.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/time/tick-sched.c
   3 *
   4 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
   6 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
   7 *
   8 *  No idle tick implementation for low and high resolution timers
   9 *
  10 *  Started by: Thomas Gleixner and Ingo Molnar
  11 *
  12 *  Distribute under GPLv2.
  13 */
  14#include <linux/cpu.h>
  15#include <linux/err.h>
  16#include <linux/hrtimer.h>
  17#include <linux/interrupt.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/percpu.h>
  20#include <linux/profile.h>
  21#include <linux/sched.h>
  22#include <linux/module.h>
  23#include <linux/irq_work.h>
  24#include <linux/posix-timers.h>
  25#include <linux/perf_event.h>
  26
  27#include <asm/irq_regs.h>
  28
  29#include "tick-internal.h"
  30
  31#include <trace/events/timer.h>
  32
  33/*
  34 * Per cpu nohz control structure
  35 */
  36DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
  37
  38/*
  39 * The time, when the last jiffy update happened. Protected by jiffies_lock.
  40 */
  41static ktime_t last_jiffies_update;
  42
  43struct tick_sched *tick_get_tick_sched(int cpu)
  44{
  45        return &per_cpu(tick_cpu_sched, cpu);
  46}
  47
  48/*
  49 * Must be called with interrupts disabled !
  50 */
  51static void tick_do_update_jiffies64(ktime_t now)
  52{
  53        unsigned long ticks = 0;
  54        ktime_t delta;
  55
  56        /*
  57         * Do a quick check without holding jiffies_lock:
  58         */
  59        delta = ktime_sub(now, last_jiffies_update);
  60        if (delta.tv64 < tick_period.tv64)
  61                return;
  62
  63        /* Reevalute with jiffies_lock held */
  64        write_seqlock(&jiffies_lock);
  65
  66        delta = ktime_sub(now, last_jiffies_update);
  67        if (delta.tv64 >= tick_period.tv64) {
  68
  69                delta = ktime_sub(delta, tick_period);
  70                last_jiffies_update = ktime_add(last_jiffies_update,
  71                                                tick_period);
  72
  73                /* Slow path for long timeouts */
  74                if (unlikely(delta.tv64 >= tick_period.tv64)) {
  75                        s64 incr = ktime_to_ns(tick_period);
  76
  77                        ticks = ktime_divns(delta, incr);
  78
  79                        last_jiffies_update = ktime_add_ns(last_jiffies_update,
  80                                                           incr * ticks);
  81                }
  82                do_timer(++ticks);
  83
  84                /* Keep the tick_next_period variable up to date */
  85                tick_next_period = ktime_add(last_jiffies_update, tick_period);
  86        }
  87        write_sequnlock(&jiffies_lock);
  88}
  89
  90/*
  91 * Initialize and return retrieve the jiffies update.
  92 */
  93static ktime_t tick_init_jiffy_update(void)
  94{
  95        ktime_t period;
  96
  97        write_seqlock(&jiffies_lock);
  98        /* Did we start the jiffies update yet ? */
  99        if (last_jiffies_update.tv64 == 0)
 100                last_jiffies_update = tick_next_period;
 101        period = last_jiffies_update;
 102        write_sequnlock(&jiffies_lock);
 103        return period;
 104}
 105
 106
 107static void tick_sched_do_timer(ktime_t now)
 108{
 109        int cpu = smp_processor_id();
 110
 111#ifdef CONFIG_NO_HZ_COMMON
 112        /*
 113         * Check if the do_timer duty was dropped. We don't care about
 114         * concurrency: This happens only when the cpu in charge went
 115         * into a long sleep. If two cpus happen to assign themself to
 116         * this duty, then the jiffies update is still serialized by
 117         * jiffies_lock.
 118         */
 119        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 120            && !tick_nohz_full_cpu(cpu))
 121                tick_do_timer_cpu = cpu;
 122#endif
 123
 124        /* Check, if the jiffies need an update */
 125        if (tick_do_timer_cpu == cpu)
 126                tick_do_update_jiffies64(now);
 127}
 128
 129static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 130{
 131#ifdef CONFIG_NO_HZ_COMMON
 132        /*
 133         * When we are idle and the tick is stopped, we have to touch
 134         * the watchdog as we might not schedule for a really long
 135         * time. This happens on complete idle SMP systems while
 136         * waiting on the login prompt. We also increment the "start of
 137         * idle" jiffy stamp so the idle accounting adjustment we do
 138         * when we go busy again does not account too much ticks.
 139         */
 140        if (ts->tick_stopped) {
 141                touch_softlockup_watchdog();
 142                if (is_idle_task(current))
 143                        ts->idle_jiffies++;
 144        }
 145#endif
 146        update_process_times(user_mode(regs));
 147        profile_tick(CPU_PROFILING);
 148}
 149
 150#ifdef CONFIG_NO_HZ_FULL
 151static cpumask_var_t nohz_full_mask;
 152bool have_nohz_full_mask;
 153
 154static bool can_stop_full_tick(void)
 155{
 156        WARN_ON_ONCE(!irqs_disabled());
 157
 158        if (!sched_can_stop_tick()) {
 159                trace_tick_stop(0, "more than 1 task in runqueue\n");
 160                return false;
 161        }
 162
 163        if (!posix_cpu_timers_can_stop_tick(current)) {
 164                trace_tick_stop(0, "posix timers running\n");
 165                return false;
 166        }
 167
 168        if (!perf_event_can_stop_tick()) {
 169                trace_tick_stop(0, "perf events running\n");
 170                return false;
 171        }
 172
 173        /* sched_clock_tick() needs us? */
 174#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 175        /*
 176         * TODO: kick full dynticks CPUs when
 177         * sched_clock_stable is set.
 178         */
 179        if (!sched_clock_stable) {
 180                trace_tick_stop(0, "unstable sched clock\n");
 181                /*
 182                 * Don't allow the user to think they can get
 183                 * full NO_HZ with this machine.
 184                 */
 185                WARN_ONCE(have_nohz_full_mask,
 186                          "NO_HZ FULL will not work with unstable sched clock");
 187                return false;
 188        }
 189#endif
 190
 191        return true;
 192}
 193
 194static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
 195
 196/*
 197 * Re-evaluate the need for the tick on the current CPU
 198 * and restart it if necessary.
 199 */
 200void tick_nohz_full_check(void)
 201{
 202        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 203
 204        if (tick_nohz_full_cpu(smp_processor_id())) {
 205                if (ts->tick_stopped && !is_idle_task(current)) {
 206                        if (!can_stop_full_tick())
 207                                tick_nohz_restart_sched_tick(ts, ktime_get());
 208                }
 209        }
 210}
 211
 212static void nohz_full_kick_work_func(struct irq_work *work)
 213{
 214        tick_nohz_full_check();
 215}
 216
 217static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 218        .func = nohz_full_kick_work_func,
 219};
 220
 221/*
 222 * Kick the current CPU if it's full dynticks in order to force it to
 223 * re-evaluate its dependency on the tick and restart it if necessary.
 224 */
 225void tick_nohz_full_kick(void)
 226{
 227        if (tick_nohz_full_cpu(smp_processor_id()))
 228                irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
 229}
 230
 231static void nohz_full_kick_ipi(void *info)
 232{
 233        tick_nohz_full_check();
 234}
 235
 236/*
 237 * Kick all full dynticks CPUs in order to force these to re-evaluate
 238 * their dependency on the tick and restart it if necessary.
 239 */
 240void tick_nohz_full_kick_all(void)
 241{
 242        if (!have_nohz_full_mask)
 243                return;
 244
 245        preempt_disable();
 246        smp_call_function_many(nohz_full_mask,
 247                               nohz_full_kick_ipi, NULL, false);
 248        preempt_enable();
 249}
 250
 251/*
 252 * Re-evaluate the need for the tick as we switch the current task.
 253 * It might need the tick due to per task/process properties:
 254 * perf events, posix cpu timers, ...
 255 */
 256void tick_nohz_task_switch(struct task_struct *tsk)
 257{
 258        unsigned long flags;
 259
 260        local_irq_save(flags);
 261
 262        if (!tick_nohz_full_cpu(smp_processor_id()))
 263                goto out;
 264
 265        if (tick_nohz_tick_stopped() && !can_stop_full_tick())
 266                tick_nohz_full_kick();
 267
 268out:
 269        local_irq_restore(flags);
 270}
 271
 272int tick_nohz_full_cpu(int cpu)
 273{
 274        if (!have_nohz_full_mask)
 275                return 0;
 276
 277        return cpumask_test_cpu(cpu, nohz_full_mask);
 278}
 279
 280/* Parse the boot-time nohz CPU list from the kernel parameters. */
 281static int __init tick_nohz_full_setup(char *str)
 282{
 283        int cpu;
 284
 285        alloc_bootmem_cpumask_var(&nohz_full_mask);
 286        if (cpulist_parse(str, nohz_full_mask) < 0) {
 287                pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
 288                return 1;
 289        }
 290
 291        cpu = smp_processor_id();
 292        if (cpumask_test_cpu(cpu, nohz_full_mask)) {
 293                pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
 294                cpumask_clear_cpu(cpu, nohz_full_mask);
 295        }
 296        have_nohz_full_mask = true;
 297
 298        return 1;
 299}
 300__setup("nohz_full=", tick_nohz_full_setup);
 301
 302static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 303                                                 unsigned long action,
 304                                                 void *hcpu)
 305{
 306        unsigned int cpu = (unsigned long)hcpu;
 307
 308        switch (action & ~CPU_TASKS_FROZEN) {
 309        case CPU_DOWN_PREPARE:
 310                /*
 311                 * If we handle the timekeeping duty for full dynticks CPUs,
 312                 * we can't safely shutdown that CPU.
 313                 */
 314                if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
 315                        return NOTIFY_BAD;
 316                break;
 317        }
 318        return NOTIFY_OK;
 319}
 320
 321/*
 322 * Worst case string length in chunks of CPU range seems 2 steps
 323 * separations: 0,2,4,6,...
 324 * This is NR_CPUS + sizeof('\0')
 325 */
 326static char __initdata nohz_full_buf[NR_CPUS + 1];
 327
 328static int tick_nohz_init_all(void)
 329{
 330        int err = -1;
 331
 332#ifdef CONFIG_NO_HZ_FULL_ALL
 333        if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
 334                pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
 335                return err;
 336        }
 337        err = 0;
 338        cpumask_setall(nohz_full_mask);
 339        cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
 340        have_nohz_full_mask = true;
 341#endif
 342        return err;
 343}
 344
 345void __init tick_nohz_init(void)
 346{
 347        if (!have_nohz_full_mask) {
 348                if (tick_nohz_init_all() < 0)
 349                        return;
 350        }
 351
 352        cpu_notifier(tick_nohz_cpu_down_callback, 0);
 353        cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
 354        pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
 355}
 356#else
 357#define have_nohz_full_mask (0)
 358#endif
 359
 360/*
 361 * NOHZ - aka dynamic tick functionality
 362 */
 363#ifdef CONFIG_NO_HZ_COMMON
 364/*
 365 * NO HZ enabled ?
 366 */
 367int tick_nohz_enabled __read_mostly  = 1;
 368
 369/*
 370 * Enable / Disable tickless mode
 371 */
 372static int __init setup_tick_nohz(char *str)
 373{
 374        if (!strcmp(str, "off"))
 375                tick_nohz_enabled = 0;
 376        else if (!strcmp(str, "on"))
 377                tick_nohz_enabled = 1;
 378        else
 379                return 0;
 380        return 1;
 381}
 382
 383__setup("nohz=", setup_tick_nohz);
 384
 385/**
 386 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 387 *
 388 * Called from interrupt entry when the CPU was idle
 389 *
 390 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 391 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 392 * value. We do this unconditionally on any cpu, as we don't know whether the
 393 * cpu, which has the update task assigned is in a long sleep.
 394 */
 395static void tick_nohz_update_jiffies(ktime_t now)
 396{
 397        int cpu = smp_processor_id();
 398        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 399        unsigned long flags;
 400
 401        ts->idle_waketime = now;
 402
 403        local_irq_save(flags);
 404        tick_do_update_jiffies64(now);
 405        local_irq_restore(flags);
 406
 407        touch_softlockup_watchdog();
 408}
 409
 410/*
 411 * Updates the per cpu time idle statistics counters
 412 */
 413static void
 414update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 415{
 416        ktime_t delta;
 417
 418        if (ts->idle_active) {
 419                delta = ktime_sub(now, ts->idle_entrytime);
 420                if (nr_iowait_cpu(cpu) > 0)
 421                        ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 422                else
 423                        ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 424                ts->idle_entrytime = now;
 425        }
 426
 427        if (last_update_time)
 428                *last_update_time = ktime_to_us(now);
 429
 430}
 431
 432static void tick_nohz_stop_idle(int cpu, ktime_t now)
 433{
 434        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 435
 436        update_ts_time_stats(cpu, ts, now, NULL);
 437        ts->idle_active = 0;
 438
 439        sched_clock_idle_wakeup_event(0);
 440}
 441
 442static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
 443{
 444        ktime_t now = ktime_get();
 445
 446        ts->idle_entrytime = now;
 447        ts->idle_active = 1;
 448        sched_clock_idle_sleep_event();
 449        return now;
 450}
 451
 452/**
 453 * get_cpu_idle_time_us - get the total idle time of a cpu
 454 * @cpu: CPU number to query
 455 * @last_update_time: variable to store update time in. Do not update
 456 * counters if NULL.
 457 *
 458 * Return the cummulative idle time (since boot) for a given
 459 * CPU, in microseconds.
 460 *
 461 * This time is measured via accounting rather than sampling,
 462 * and is as accurate as ktime_get() is.
 463 *
 464 * This function returns -1 if NOHZ is not enabled.
 465 */
 466u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 467{
 468        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 469        ktime_t now, idle;
 470
 471        if (!tick_nohz_enabled)
 472                return -1;
 473
 474        now = ktime_get();
 475        if (last_update_time) {
 476                update_ts_time_stats(cpu, ts, now, last_update_time);
 477                idle = ts->idle_sleeptime;
 478        } else {
 479                if (ts->idle_active && !nr_iowait_cpu(cpu)) {
 480                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 481
 482                        idle = ktime_add(ts->idle_sleeptime, delta);
 483                } else {
 484                        idle = ts->idle_sleeptime;
 485                }
 486        }
 487
 488        return ktime_to_us(idle);
 489
 490}
 491EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 492
 493/**
 494 * get_cpu_iowait_time_us - get the total iowait time of a cpu
 495 * @cpu: CPU number to query
 496 * @last_update_time: variable to store update time in. Do not update
 497 * counters if NULL.
 498 *
 499 * Return the cummulative iowait time (since boot) for a given
 500 * CPU, in microseconds.
 501 *
 502 * This time is measured via accounting rather than sampling,
 503 * and is as accurate as ktime_get() is.
 504 *
 505 * This function returns -1 if NOHZ is not enabled.
 506 */
 507u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 508{
 509        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 510        ktime_t now, iowait;
 511
 512        if (!tick_nohz_enabled)
 513                return -1;
 514
 515        now = ktime_get();
 516        if (last_update_time) {
 517                update_ts_time_stats(cpu, ts, now, last_update_time);
 518                iowait = ts->iowait_sleeptime;
 519        } else {
 520                if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
 521                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 522
 523                        iowait = ktime_add(ts->iowait_sleeptime, delta);
 524                } else {
 525                        iowait = ts->iowait_sleeptime;
 526                }
 527        }
 528
 529        return ktime_to_us(iowait);
 530}
 531EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 532
 533static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 534                                         ktime_t now, int cpu)
 535{
 536        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
 537        ktime_t last_update, expires, ret = { .tv64 = 0 };
 538        unsigned long rcu_delta_jiffies;
 539        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 540        u64 time_delta;
 541
 542        /* Read jiffies and the time when jiffies were updated last */
 543        do {
 544                seq = read_seqbegin(&jiffies_lock);
 545                last_update = last_jiffies_update;
 546                last_jiffies = jiffies;
 547                time_delta = timekeeping_max_deferment();
 548        } while (read_seqretry(&jiffies_lock, seq));
 549
 550        if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
 551            arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
 552                next_jiffies = last_jiffies + 1;
 553                delta_jiffies = 1;
 554        } else {
 555                /* Get the next timer wheel timer */
 556                next_jiffies = get_next_timer_interrupt(last_jiffies);
 557                delta_jiffies = next_jiffies - last_jiffies;
 558                if (rcu_delta_jiffies < delta_jiffies) {
 559                        next_jiffies = last_jiffies + rcu_delta_jiffies;
 560                        delta_jiffies = rcu_delta_jiffies;
 561                }
 562        }
 563
 564        /*
 565         * Do not stop the tick, if we are only one off (or less)
 566         * or if the cpu is required for RCU:
 567         */
 568        if (!ts->tick_stopped && delta_jiffies <= 1)
 569                goto out;
 570
 571        /* Schedule the tick, if we are at least one jiffie off */
 572        if ((long)delta_jiffies >= 1) {
 573
 574                /*
 575                 * If this cpu is the one which updates jiffies, then
 576                 * give up the assignment and let it be taken by the
 577                 * cpu which runs the tick timer next, which might be
 578                 * this cpu as well. If we don't drop this here the
 579                 * jiffies might be stale and do_timer() never
 580                 * invoked. Keep track of the fact that it was the one
 581                 * which had the do_timer() duty last. If this cpu is
 582                 * the one which had the do_timer() duty last, we
 583                 * limit the sleep time to the timekeeping
 584                 * max_deferement value which we retrieved
 585                 * above. Otherwise we can sleep as long as we want.
 586                 */
 587                if (cpu == tick_do_timer_cpu) {
 588                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 589                        ts->do_timer_last = 1;
 590                } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
 591                        time_delta = KTIME_MAX;
 592                        ts->do_timer_last = 0;
 593                } else if (!ts->do_timer_last) {
 594                        time_delta = KTIME_MAX;
 595                }
 596
 597#ifdef CONFIG_NO_HZ_FULL
 598                if (!ts->inidle) {
 599                        time_delta = min(time_delta,
 600                                         scheduler_tick_max_deferment());
 601                }
 602#endif
 603
 604                /*
 605                 * calculate the expiry time for the next timer wheel
 606                 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
 607                 * that there is no timer pending or at least extremely
 608                 * far into the future (12 days for HZ=1000). In this
 609                 * case we set the expiry to the end of time.
 610                 */
 611                if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
 612                        /*
 613                         * Calculate the time delta for the next timer event.
 614                         * If the time delta exceeds the maximum time delta
 615                         * permitted by the current clocksource then adjust
 616                         * the time delta accordingly to ensure the
 617                         * clocksource does not wrap.
 618                         */
 619                        time_delta = min_t(u64, time_delta,
 620                                           tick_period.tv64 * delta_jiffies);
 621                }
 622
 623                if (time_delta < KTIME_MAX)
 624                        expires = ktime_add_ns(last_update, time_delta);
 625                else
 626                        expires.tv64 = KTIME_MAX;
 627
 628                /* Skip reprogram of event if its not changed */
 629                if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
 630                        goto out;
 631
 632                ret = expires;
 633
 634                /*
 635                 * nohz_stop_sched_tick can be called several times before
 636                 * the nohz_restart_sched_tick is called. This happens when
 637                 * interrupts arrive which do not cause a reschedule. In the
 638                 * first call we save the current tick time, so we can restart
 639                 * the scheduler tick in nohz_restart_sched_tick.
 640                 */
 641                if (!ts->tick_stopped) {
 642                        nohz_balance_enter_idle(cpu);
 643                        calc_load_enter_idle();
 644
 645                        ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 646                        ts->tick_stopped = 1;
 647                        trace_tick_stop(1, " ");
 648                }
 649
 650                /*
 651                 * If the expiration time == KTIME_MAX, then
 652                 * in this case we simply stop the tick timer.
 653                 */
 654                 if (unlikely(expires.tv64 == KTIME_MAX)) {
 655                        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 656                                hrtimer_cancel(&ts->sched_timer);
 657                        goto out;
 658                }
 659
 660                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 661                        hrtimer_start(&ts->sched_timer, expires,
 662                                      HRTIMER_MODE_ABS_PINNED);
 663                        /* Check, if the timer was already in the past */
 664                        if (hrtimer_active(&ts->sched_timer))
 665                                goto out;
 666                } else if (!tick_program_event(expires, 0))
 667                                goto out;
 668                /*
 669                 * We are past the event already. So we crossed a
 670                 * jiffie boundary. Update jiffies and raise the
 671                 * softirq.
 672                 */
 673                tick_do_update_jiffies64(ktime_get());
 674        }
 675        raise_softirq_irqoff(TIMER_SOFTIRQ);
 676out:
 677        ts->next_jiffies = next_jiffies;
 678        ts->last_jiffies = last_jiffies;
 679        ts->sleep_length = ktime_sub(dev->next_event, now);
 680
 681        return ret;
 682}
 683
 684static void tick_nohz_full_stop_tick(struct tick_sched *ts)
 685{
 686#ifdef CONFIG_NO_HZ_FULL
 687       int cpu = smp_processor_id();
 688
 689       if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
 690               return;
 691
 692       if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
 693               return;
 694
 695       if (!can_stop_full_tick())
 696               return;
 697
 698       tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 699#endif
 700}
 701
 702static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 703{
 704        /*
 705         * If this cpu is offline and it is the one which updates
 706         * jiffies, then give up the assignment and let it be taken by
 707         * the cpu which runs the tick timer next. If we don't drop
 708         * this here the jiffies might be stale and do_timer() never
 709         * invoked.
 710         */
 711        if (unlikely(!cpu_online(cpu))) {
 712                if (cpu == tick_do_timer_cpu)
 713                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 714                return false;
 715        }
 716
 717        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 718                return false;
 719
 720        if (need_resched())
 721                return false;
 722
 723        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
 724                static int ratelimit;
 725
 726                if (ratelimit < 10 &&
 727                    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
 728                        pr_warn("NOHZ: local_softirq_pending %02x\n",
 729                                (unsigned int) local_softirq_pending());
 730                        ratelimit++;
 731                }
 732                return false;
 733        }
 734
 735        if (have_nohz_full_mask) {
 736                /*
 737                 * Keep the tick alive to guarantee timekeeping progression
 738                 * if there are full dynticks CPUs around
 739                 */
 740                if (tick_do_timer_cpu == cpu)
 741                        return false;
 742                /*
 743                 * Boot safety: make sure the timekeeping duty has been
 744                 * assigned before entering dyntick-idle mode,
 745                 */
 746                if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 747                        return false;
 748        }
 749
 750        return true;
 751}
 752
 753static void __tick_nohz_idle_enter(struct tick_sched *ts)
 754{
 755        ktime_t now, expires;
 756        int cpu = smp_processor_id();
 757
 758        now = tick_nohz_start_idle(cpu, ts);
 759
 760        if (can_stop_idle_tick(cpu, ts)) {
 761                int was_stopped = ts->tick_stopped;
 762
 763                ts->idle_calls++;
 764
 765                expires = tick_nohz_stop_sched_tick(ts, now, cpu);
 766                if (expires.tv64 > 0LL) {
 767                        ts->idle_sleeps++;
 768                        ts->idle_expires = expires;
 769                }
 770
 771                if (!was_stopped && ts->tick_stopped)
 772                        ts->idle_jiffies = ts->last_jiffies;
 773        }
 774}
 775
 776/**
 777 * tick_nohz_idle_enter - stop the idle tick from the idle task
 778 *
 779 * When the next event is more than a tick into the future, stop the idle tick
 780 * Called when we start the idle loop.
 781 *
 782 * The arch is responsible of calling:
 783 *
 784 * - rcu_idle_enter() after its last use of RCU before the CPU is put
 785 *  to sleep.
 786 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
 787 */
 788void tick_nohz_idle_enter(void)
 789{
 790        struct tick_sched *ts;
 791
 792        WARN_ON_ONCE(irqs_disabled());
 793
 794        /*
 795         * Update the idle state in the scheduler domain hierarchy
 796         * when tick_nohz_stop_sched_tick() is called from the idle loop.
 797         * State will be updated to busy during the first busy tick after
 798         * exiting idle.
 799         */
 800        set_cpu_sd_state_idle();
 801
 802        local_irq_disable();
 803
 804        ts = &__get_cpu_var(tick_cpu_sched);
 805        /*
 806         * set ts->inidle unconditionally. even if the system did not
 807         * switch to nohz mode the cpu frequency governers rely on the
 808         * update of the idle time accounting in tick_nohz_start_idle().
 809         */
 810        ts->inidle = 1;
 811        __tick_nohz_idle_enter(ts);
 812
 813        local_irq_enable();
 814}
 815EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
 816
 817/**
 818 * tick_nohz_irq_exit - update next tick event from interrupt exit
 819 *
 820 * When an interrupt fires while we are idle and it doesn't cause
 821 * a reschedule, it may still add, modify or delete a timer, enqueue
 822 * an RCU callback, etc...
 823 * So we need to re-calculate and reprogram the next tick event.
 824 */
 825void tick_nohz_irq_exit(void)
 826{
 827        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 828
 829        if (ts->inidle)
 830                __tick_nohz_idle_enter(ts);
 831        else
 832                tick_nohz_full_stop_tick(ts);
 833}
 834
 835/**
 836 * tick_nohz_get_sleep_length - return the length of the current sleep
 837 *
 838 * Called from power state control code with interrupts disabled
 839 */
 840ktime_t tick_nohz_get_sleep_length(void)
 841{
 842        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 843
 844        return ts->sleep_length;
 845}
 846
 847static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 848{
 849        hrtimer_cancel(&ts->sched_timer);
 850        hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
 851
 852        while (1) {
 853                /* Forward the time to expire in the future */
 854                hrtimer_forward(&ts->sched_timer, now, tick_period);
 855
 856                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 857                        hrtimer_start_expires(&ts->sched_timer,
 858                                              HRTIMER_MODE_ABS_PINNED);
 859                        /* Check, if the timer was already in the past */
 860                        if (hrtimer_active(&ts->sched_timer))
 861                                break;
 862                } else {
 863                        if (!tick_program_event(
 864                                hrtimer_get_expires(&ts->sched_timer), 0))
 865                                break;
 866                }
 867                /* Reread time and update jiffies */
 868                now = ktime_get();
 869                tick_do_update_jiffies64(now);
 870        }
 871}
 872
 873static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 874{
 875        /* Update jiffies first */
 876        tick_do_update_jiffies64(now);
 877        update_cpu_load_nohz();
 878
 879        calc_load_exit_idle();
 880        touch_softlockup_watchdog();
 881        /*
 882         * Cancel the scheduled timer and restore the tick
 883         */
 884        ts->tick_stopped  = 0;
 885        ts->idle_exittime = now;
 886
 887        tick_nohz_restart(ts, now);
 888}
 889
 890static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 891{
 892#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 893        unsigned long ticks;
 894
 895        if (vtime_accounting_enabled())
 896                return;
 897        /*
 898         * We stopped the tick in idle. Update process times would miss the
 899         * time we slept as update_process_times does only a 1 tick
 900         * accounting. Enforce that this is accounted to idle !
 901         */
 902        ticks = jiffies - ts->idle_jiffies;
 903        /*
 904         * We might be one off. Do not randomly account a huge number of ticks!
 905         */
 906        if (ticks && ticks < LONG_MAX)
 907                account_idle_ticks(ticks);
 908#endif
 909}
 910
 911/**
 912 * tick_nohz_idle_exit - restart the idle tick from the idle task
 913 *
 914 * Restart the idle tick when the CPU is woken up from idle
 915 * This also exit the RCU extended quiescent state. The CPU
 916 * can use RCU again after this function is called.
 917 */
 918void tick_nohz_idle_exit(void)
 919{
 920        int cpu = smp_processor_id();
 921        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 922        ktime_t now;
 923
 924        local_irq_disable();
 925
 926        WARN_ON_ONCE(!ts->inidle);
 927
 928        ts->inidle = 0;
 929
 930        if (ts->idle_active || ts->tick_stopped)
 931                now = ktime_get();
 932
 933        if (ts->idle_active)
 934                tick_nohz_stop_idle(cpu, now);
 935
 936        if (ts->tick_stopped) {
 937                tick_nohz_restart_sched_tick(ts, now);
 938                tick_nohz_account_idle_ticks(ts);
 939        }
 940
 941        local_irq_enable();
 942}
 943EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
 944
 945static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
 946{
 947        hrtimer_forward(&ts->sched_timer, now, tick_period);
 948        return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
 949}
 950
 951/*
 952 * The nohz low res interrupt handler
 953 */
 954static void tick_nohz_handler(struct clock_event_device *dev)
 955{
 956        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 957        struct pt_regs *regs = get_irq_regs();
 958        ktime_t now = ktime_get();
 959
 960        dev->next_event.tv64 = KTIME_MAX;
 961
 962        tick_sched_do_timer(now);
 963        tick_sched_handle(ts, regs);
 964
 965        while (tick_nohz_reprogram(ts, now)) {
 966                now = ktime_get();
 967                tick_do_update_jiffies64(now);
 968        }
 969}
 970
 971/**
 972 * tick_nohz_switch_to_nohz - switch to nohz mode
 973 */
 974static void tick_nohz_switch_to_nohz(void)
 975{
 976        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 977        ktime_t next;
 978
 979        if (!tick_nohz_enabled)
 980                return;
 981
 982        local_irq_disable();
 983        if (tick_switch_to_oneshot(tick_nohz_handler)) {
 984                local_irq_enable();
 985                return;
 986        }
 987
 988        ts->nohz_mode = NOHZ_MODE_LOWRES;
 989
 990        /*
 991         * Recycle the hrtimer in ts, so we can share the
 992         * hrtimer_forward with the highres code.
 993         */
 994        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 995        /* Get the next period */
 996        next = tick_init_jiffy_update();
 997
 998        for (;;) {
 999                hrtimer_set_expires(&ts->sched_timer, next);
1000                if (!tick_program_event(next, 0))
1001                        break;
1002                next = ktime_add(next, tick_period);
1003        }
1004        local_irq_enable();
1005}
1006
1007/*
1008 * When NOHZ is enabled and the tick is stopped, we need to kick the
1009 * tick timer from irq_enter() so that the jiffies update is kept
1010 * alive during long running softirqs. That's ugly as hell, but
1011 * correctness is key even if we need to fix the offending softirq in
1012 * the first place.
1013 *
1014 * Note, this is different to tick_nohz_restart. We just kick the
1015 * timer and do not touch the other magic bits which need to be done
1016 * when idle is left.
1017 */
1018static void tick_nohz_kick_tick(int cpu, ktime_t now)
1019{
1020#if 0
1021        /* Switch back to 2.6.27 behaviour */
1022
1023        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1024        ktime_t delta;
1025
1026        /*
1027         * Do not touch the tick device, when the next expiry is either
1028         * already reached or less/equal than the tick period.
1029         */
1030        delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
1031        if (delta.tv64 <= tick_period.tv64)
1032                return;
1033
1034        tick_nohz_restart(ts, now);
1035#endif
1036}
1037
1038static inline void tick_check_nohz(int cpu)
1039{
1040        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1041        ktime_t now;
1042
1043        if (!ts->idle_active && !ts->tick_stopped)
1044                return;
1045        now = ktime_get();
1046        if (ts->idle_active)
1047                tick_nohz_stop_idle(cpu, now);
1048        if (ts->tick_stopped) {
1049                tick_nohz_update_jiffies(now);
1050                tick_nohz_kick_tick(cpu, now);
1051        }
1052}
1053
1054#else
1055
1056static inline void tick_nohz_switch_to_nohz(void) { }
1057static inline void tick_check_nohz(int cpu) { }
1058
1059#endif /* CONFIG_NO_HZ_COMMON */
1060
1061/*
1062 * Called from irq_enter to notify about the possible interruption of idle()
1063 */
1064void tick_check_idle(int cpu)
1065{
1066        tick_check_oneshot_broadcast(cpu);
1067        tick_check_nohz(cpu);
1068}
1069
1070/*
1071 * High resolution timer specific code
1072 */
1073#ifdef CONFIG_HIGH_RES_TIMERS
1074/*
1075 * We rearm the timer until we get disabled by the idle code.
1076 * Called with interrupts disabled.
1077 */
1078static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1079{
1080        struct tick_sched *ts =
1081                container_of(timer, struct tick_sched, sched_timer);
1082        struct pt_regs *regs = get_irq_regs();
1083        ktime_t now = ktime_get();
1084
1085        tick_sched_do_timer(now);
1086
1087        /*
1088         * Do not call, when we are not in irq context and have
1089         * no valid regs pointer
1090         */
1091        if (regs)
1092                tick_sched_handle(ts, regs);
1093
1094        hrtimer_forward(timer, now, tick_period);
1095
1096        return HRTIMER_RESTART;
1097}
1098
1099static int sched_skew_tick;
1100
1101static int __init skew_tick(char *str)
1102{
1103        get_option(&str, &sched_skew_tick);
1104
1105        return 0;
1106}
1107early_param("skew_tick", skew_tick);
1108
1109/**
1110 * tick_setup_sched_timer - setup the tick emulation timer
1111 */
1112void tick_setup_sched_timer(void)
1113{
1114        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1115        ktime_t now = ktime_get();
1116
1117        /*
1118         * Emulate tick processing via per-CPU hrtimers:
1119         */
1120        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1121        ts->sched_timer.function = tick_sched_timer;
1122
1123        /* Get the next period (per cpu) */
1124        hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1125
1126        /* Offset the tick to avert jiffies_lock contention. */
1127        if (sched_skew_tick) {
1128                u64 offset = ktime_to_ns(tick_period) >> 1;
1129                do_div(offset, num_possible_cpus());
1130                offset *= smp_processor_id();
1131                hrtimer_add_expires_ns(&ts->sched_timer, offset);
1132        }
1133
1134        for (;;) {
1135                hrtimer_forward(&ts->sched_timer, now, tick_period);
1136                hrtimer_start_expires(&ts->sched_timer,
1137                                      HRTIMER_MODE_ABS_PINNED);
1138                /* Check, if the timer was already in the past */
1139                if (hrtimer_active(&ts->sched_timer))
1140                        break;
1141                now = ktime_get();
1142        }
1143
1144#ifdef CONFIG_NO_HZ_COMMON
1145        if (tick_nohz_enabled)
1146                ts->nohz_mode = NOHZ_MODE_HIGHRES;
1147#endif
1148}
1149#endif /* HIGH_RES_TIMERS */
1150
1151#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
1152void tick_cancel_sched_timer(int cpu)
1153{
1154        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1155
1156# ifdef CONFIG_HIGH_RES_TIMERS
1157        if (ts->sched_timer.base)
1158                hrtimer_cancel(&ts->sched_timer);
1159# endif
1160
1161        memset(ts, 0, sizeof(*ts));
1162}
1163#endif
1164
1165/**
1166 * Async notification about clocksource changes
1167 */
1168void tick_clock_notify(void)
1169{
1170        int cpu;
1171
1172        for_each_possible_cpu(cpu)
1173                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
1174}
1175
1176/*
1177 * Async notification about clock event changes
1178 */
1179void tick_oneshot_notify(void)
1180{
1181        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1182
1183        set_bit(0, &ts->check_clocks);
1184}
1185
1186/**
1187 * Check, if a change happened, which makes oneshot possible.
1188 *
1189 * Called cyclic from the hrtimer softirq (driven by the timer
1190 * softirq) allow_nohz signals, that we can switch into low-res nohz
1191 * mode, because high resolution timers are disabled (either compile
1192 * or runtime).
1193 */
1194int tick_check_oneshot_change(int allow_nohz)
1195{
1196        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1197
1198        if (!test_and_clear_bit(0, &ts->check_clocks))
1199                return 0;
1200
1201        if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
1202                return 0;
1203
1204        if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
1205                return 0;
1206
1207        if (!allow_nohz)
1208                return 1;
1209
1210        tick_nohz_switch_to_nohz();
1211        return 0;
1212}
1213