linux/kernel/time/tick-sched.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/time/tick-sched.c
   3 *
   4 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
   6 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
   7 *
   8 *  No idle tick implementation for low and high resolution timers
   9 *
  10 *  Started by: Thomas Gleixner and Ingo Molnar
  11 *
  12 *  Distribute under GPLv2.
  13 */
  14#include <linux/cpu.h>
  15#include <linux/err.h>
  16#include <linux/hrtimer.h>
  17#include <linux/interrupt.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/percpu.h>
  20#include <linux/profile.h>
  21#include <linux/sched.h>
  22#include <linux/module.h>
  23#include <linux/irq_work.h>
  24#include <linux/posix-timers.h>
  25#include <linux/perf_event.h>
  26#include <linux/context_tracking.h>
  27
  28#include <asm/irq_regs.h>
  29
  30#include "tick-internal.h"
  31
  32#include <trace/events/timer.h>
  33
  34/*
  35 * Per cpu nohz control structure
  36 */
  37static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
  38
  39/*
  40 * The time, when the last jiffy update happened. Protected by jiffies_lock.
  41 */
  42static ktime_t last_jiffies_update;
  43
  44struct tick_sched *tick_get_tick_sched(int cpu)
  45{
  46        return &per_cpu(tick_cpu_sched, cpu);
  47}
  48
  49/*
  50 * Must be called with interrupts disabled !
  51 */
  52static void tick_do_update_jiffies64(ktime_t now)
  53{
  54        unsigned long ticks = 0;
  55        ktime_t delta;
  56
  57        /*
  58         * Do a quick check without holding jiffies_lock:
  59         */
  60        delta = ktime_sub(now, last_jiffies_update);
  61        if (delta.tv64 < tick_period.tv64)
  62                return;
  63
  64        /* Reevalute with jiffies_lock held */
  65        write_seqlock(&jiffies_lock);
  66
  67        delta = ktime_sub(now, last_jiffies_update);
  68        if (delta.tv64 >= tick_period.tv64) {
  69
  70                delta = ktime_sub(delta, tick_period);
  71                last_jiffies_update = ktime_add(last_jiffies_update,
  72                                                tick_period);
  73
  74                /* Slow path for long timeouts */
  75                if (unlikely(delta.tv64 >= tick_period.tv64)) {
  76                        s64 incr = ktime_to_ns(tick_period);
  77
  78                        ticks = ktime_divns(delta, incr);
  79
  80                        last_jiffies_update = ktime_add_ns(last_jiffies_update,
  81                                                           incr * ticks);
  82                }
  83                do_timer(++ticks);
  84
  85                /* Keep the tick_next_period variable up to date */
  86                tick_next_period = ktime_add(last_jiffies_update, tick_period);
  87        } else {
  88                write_sequnlock(&jiffies_lock);
  89                return;
  90        }
  91        write_sequnlock(&jiffies_lock);
  92        update_wall_time();
  93}
  94
  95/*
  96 * Initialize and return retrieve the jiffies update.
  97 */
  98static ktime_t tick_init_jiffy_update(void)
  99{
 100        ktime_t period;
 101
 102        write_seqlock(&jiffies_lock);
 103        /* Did we start the jiffies update yet ? */
 104        if (last_jiffies_update.tv64 == 0)
 105                last_jiffies_update = tick_next_period;
 106        period = last_jiffies_update;
 107        write_sequnlock(&jiffies_lock);
 108        return period;
 109}
 110
 111
 112static void tick_sched_do_timer(ktime_t now)
 113{
 114        int cpu = smp_processor_id();
 115
 116#ifdef CONFIG_NO_HZ_COMMON
 117        /*
 118         * Check if the do_timer duty was dropped. We don't care about
 119         * concurrency: This happens only when the cpu in charge went
 120         * into a long sleep. If two cpus happen to assign themself to
 121         * this duty, then the jiffies update is still serialized by
 122         * jiffies_lock.
 123         */
 124        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 125            && !tick_nohz_full_cpu(cpu))
 126                tick_do_timer_cpu = cpu;
 127#endif
 128
 129        /* Check, if the jiffies need an update */
 130        if (tick_do_timer_cpu == cpu)
 131                tick_do_update_jiffies64(now);
 132}
 133
 134static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 135{
 136#ifdef CONFIG_NO_HZ_COMMON
 137        /*
 138         * When we are idle and the tick is stopped, we have to touch
 139         * the watchdog as we might not schedule for a really long
 140         * time. This happens on complete idle SMP systems while
 141         * waiting on the login prompt. We also increment the "start of
 142         * idle" jiffy stamp so the idle accounting adjustment we do
 143         * when we go busy again does not account too much ticks.
 144         */
 145        if (ts->tick_stopped) {
 146                touch_softlockup_watchdog();
 147                if (is_idle_task(current))
 148                        ts->idle_jiffies++;
 149        }
 150#endif
 151        update_process_times(user_mode(regs));
 152        profile_tick(CPU_PROFILING);
 153}
 154
 155#ifdef CONFIG_NO_HZ_FULL
 156cpumask_var_t tick_nohz_full_mask;
 157cpumask_var_t housekeeping_mask;
 158bool tick_nohz_full_running;
 159
 160static bool can_stop_full_tick(void)
 161{
 162        WARN_ON_ONCE(!irqs_disabled());
 163
 164        if (!sched_can_stop_tick()) {
 165                trace_tick_stop(0, "more than 1 task in runqueue\n");
 166                return false;
 167        }
 168
 169        if (!posix_cpu_timers_can_stop_tick(current)) {
 170                trace_tick_stop(0, "posix timers running\n");
 171                return false;
 172        }
 173
 174        if (!perf_event_can_stop_tick()) {
 175                trace_tick_stop(0, "perf events running\n");
 176                return false;
 177        }
 178
 179        /* sched_clock_tick() needs us? */
 180#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 181        /*
 182         * TODO: kick full dynticks CPUs when
 183         * sched_clock_stable is set.
 184         */
 185        if (!sched_clock_stable()) {
 186                trace_tick_stop(0, "unstable sched clock\n");
 187                /*
 188                 * Don't allow the user to think they can get
 189                 * full NO_HZ with this machine.
 190                 */
 191                WARN_ONCE(tick_nohz_full_running,
 192                          "NO_HZ FULL will not work with unstable sched clock");
 193                return false;
 194        }
 195#endif
 196
 197        return true;
 198}
 199
 200static void nohz_full_kick_work_func(struct irq_work *work)
 201{
 202        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
 203}
 204
 205static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 206        .func = nohz_full_kick_work_func,
 207};
 208
 209/*
 210 * Kick this CPU if it's full dynticks in order to force it to
 211 * re-evaluate its dependency on the tick and restart it if necessary.
 212 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 213 * is NMI safe.
 214 */
 215void tick_nohz_full_kick(void)
 216{
 217        if (!tick_nohz_full_cpu(smp_processor_id()))
 218                return;
 219
 220        irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
 221}
 222
 223/*
 224 * Kick the CPU if it's full dynticks in order to force it to
 225 * re-evaluate its dependency on the tick and restart it if necessary.
 226 */
 227void tick_nohz_full_kick_cpu(int cpu)
 228{
 229        if (!tick_nohz_full_cpu(cpu))
 230                return;
 231
 232        irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 233}
 234
 235static void nohz_full_kick_ipi(void *info)
 236{
 237        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
 238}
 239
 240/*
 241 * Kick all full dynticks CPUs in order to force these to re-evaluate
 242 * their dependency on the tick and restart it if necessary.
 243 */
 244void tick_nohz_full_kick_all(void)
 245{
 246        if (!tick_nohz_full_running)
 247                return;
 248
 249        preempt_disable();
 250        smp_call_function_many(tick_nohz_full_mask,
 251                               nohz_full_kick_ipi, NULL, false);
 252        tick_nohz_full_kick();
 253        preempt_enable();
 254}
 255
 256/*
 257 * Re-evaluate the need for the tick as we switch the current task.
 258 * It might need the tick due to per task/process properties:
 259 * perf events, posix cpu timers, ...
 260 */
 261void __tick_nohz_task_switch(void)
 262{
 263        unsigned long flags;
 264
 265        local_irq_save(flags);
 266
 267        if (!tick_nohz_full_cpu(smp_processor_id()))
 268                goto out;
 269
 270        if (tick_nohz_tick_stopped() && !can_stop_full_tick())
 271                tick_nohz_full_kick();
 272
 273out:
 274        local_irq_restore(flags);
 275}
 276
 277/* Parse the boot-time nohz CPU list from the kernel parameters. */
 278static int __init tick_nohz_full_setup(char *str)
 279{
 280        alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
 281        if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
 282                pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
 283                free_bootmem_cpumask_var(tick_nohz_full_mask);
 284                return 1;
 285        }
 286        tick_nohz_full_running = true;
 287
 288        return 1;
 289}
 290__setup("nohz_full=", tick_nohz_full_setup);
 291
 292static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 293                                       unsigned long action,
 294                                       void *hcpu)
 295{
 296        unsigned int cpu = (unsigned long)hcpu;
 297
 298        switch (action & ~CPU_TASKS_FROZEN) {
 299        case CPU_DOWN_PREPARE:
 300                /*
 301                 * The boot CPU handles housekeeping duty (unbound timers,
 302                 * workqueues, timekeeping, ...) on behalf of full dynticks
 303                 * CPUs. It must remain online when nohz full is enabled.
 304                 */
 305                if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
 306                        return NOTIFY_BAD;
 307                break;
 308        }
 309        return NOTIFY_OK;
 310}
 311
 312static int tick_nohz_init_all(void)
 313{
 314        int err = -1;
 315
 316#ifdef CONFIG_NO_HZ_FULL_ALL
 317        if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
 318                WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
 319                return err;
 320        }
 321        err = 0;
 322        cpumask_setall(tick_nohz_full_mask);
 323        tick_nohz_full_running = true;
 324#endif
 325        return err;
 326}
 327
 328void __init tick_nohz_init(void)
 329{
 330        int cpu;
 331
 332        if (!tick_nohz_full_running) {
 333                if (tick_nohz_init_all() < 0)
 334                        return;
 335        }
 336
 337        if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
 338                WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
 339                cpumask_clear(tick_nohz_full_mask);
 340                tick_nohz_full_running = false;
 341                return;
 342        }
 343
 344        /*
 345         * Full dynticks uses irq work to drive the tick rescheduling on safe
 346         * locking contexts. But then we need irq work to raise its own
 347         * interrupts to avoid circular dependency on the tick
 348         */
 349        if (!arch_irq_work_has_interrupt()) {
 350                pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
 351                           "support irq work self-IPIs\n");
 352                cpumask_clear(tick_nohz_full_mask);
 353                cpumask_copy(housekeeping_mask, cpu_possible_mask);
 354                tick_nohz_full_running = false;
 355                return;
 356        }
 357
 358        cpu = smp_processor_id();
 359
 360        if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
 361                pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
 362                cpumask_clear_cpu(cpu, tick_nohz_full_mask);
 363        }
 364
 365        cpumask_andnot(housekeeping_mask,
 366                       cpu_possible_mask, tick_nohz_full_mask);
 367
 368        for_each_cpu(cpu, tick_nohz_full_mask)
 369                context_tracking_cpu_set(cpu);
 370
 371        cpu_notifier(tick_nohz_cpu_down_callback, 0);
 372        pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
 373                cpumask_pr_args(tick_nohz_full_mask));
 374
 375        /*
 376         * We need at least one CPU to handle housekeeping work such
 377         * as timekeeping, unbound timers, workqueues, ...
 378         */
 379        WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 380}
 381#endif
 382
 383/*
 384 * NOHZ - aka dynamic tick functionality
 385 */
 386#ifdef CONFIG_NO_HZ_COMMON
 387/*
 388 * NO HZ enabled ?
 389 */
 390static int tick_nohz_enabled __read_mostly  = 1;
 391unsigned long tick_nohz_active  __read_mostly;
 392/*
 393 * Enable / Disable tickless mode
 394 */
 395static int __init setup_tick_nohz(char *str)
 396{
 397        if (!strcmp(str, "off"))
 398                tick_nohz_enabled = 0;
 399        else if (!strcmp(str, "on"))
 400                tick_nohz_enabled = 1;
 401        else
 402                return 0;
 403        return 1;
 404}
 405
 406__setup("nohz=", setup_tick_nohz);
 407
 408int tick_nohz_tick_stopped(void)
 409{
 410        return __this_cpu_read(tick_cpu_sched.tick_stopped);
 411}
 412
 413/**
 414 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 415 *
 416 * Called from interrupt entry when the CPU was idle
 417 *
 418 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 419 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 420 * value. We do this unconditionally on any cpu, as we don't know whether the
 421 * cpu, which has the update task assigned is in a long sleep.
 422 */
 423static void tick_nohz_update_jiffies(ktime_t now)
 424{
 425        unsigned long flags;
 426
 427        __this_cpu_write(tick_cpu_sched.idle_waketime, now);
 428
 429        local_irq_save(flags);
 430        tick_do_update_jiffies64(now);
 431        local_irq_restore(flags);
 432
 433        touch_softlockup_watchdog();
 434}
 435
 436/*
 437 * Updates the per cpu time idle statistics counters
 438 */
 439static void
 440update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 441{
 442        ktime_t delta;
 443
 444        if (ts->idle_active) {
 445                delta = ktime_sub(now, ts->idle_entrytime);
 446                if (nr_iowait_cpu(cpu) > 0)
 447                        ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 448                else
 449                        ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 450                ts->idle_entrytime = now;
 451        }
 452
 453        if (last_update_time)
 454                *last_update_time = ktime_to_us(now);
 455
 456}
 457
 458static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 459{
 460        update_ts_time_stats(smp_processor_id(), ts, now, NULL);
 461        ts->idle_active = 0;
 462
 463        sched_clock_idle_wakeup_event(0);
 464}
 465
 466static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 467{
 468        ktime_t now = ktime_get();
 469
 470        ts->idle_entrytime = now;
 471        ts->idle_active = 1;
 472        sched_clock_idle_sleep_event();
 473        return now;
 474}
 475
 476/**
 477 * get_cpu_idle_time_us - get the total idle time of a cpu
 478 * @cpu: CPU number to query
 479 * @last_update_time: variable to store update time in. Do not update
 480 * counters if NULL.
 481 *
 482 * Return the cummulative idle time (since boot) for a given
 483 * CPU, in microseconds.
 484 *
 485 * This time is measured via accounting rather than sampling,
 486 * and is as accurate as ktime_get() is.
 487 *
 488 * This function returns -1 if NOHZ is not enabled.
 489 */
 490u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 491{
 492        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 493        ktime_t now, idle;
 494
 495        if (!tick_nohz_active)
 496                return -1;
 497
 498        now = ktime_get();
 499        if (last_update_time) {
 500                update_ts_time_stats(cpu, ts, now, last_update_time);
 501                idle = ts->idle_sleeptime;
 502        } else {
 503                if (ts->idle_active && !nr_iowait_cpu(cpu)) {
 504                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 505
 506                        idle = ktime_add(ts->idle_sleeptime, delta);
 507                } else {
 508                        idle = ts->idle_sleeptime;
 509                }
 510        }
 511
 512        return ktime_to_us(idle);
 513
 514}
 515EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 516
 517/**
 518 * get_cpu_iowait_time_us - get the total iowait time of a cpu
 519 * @cpu: CPU number to query
 520 * @last_update_time: variable to store update time in. Do not update
 521 * counters if NULL.
 522 *
 523 * Return the cummulative iowait time (since boot) for a given
 524 * CPU, in microseconds.
 525 *
 526 * This time is measured via accounting rather than sampling,
 527 * and is as accurate as ktime_get() is.
 528 *
 529 * This function returns -1 if NOHZ is not enabled.
 530 */
 531u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 532{
 533        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 534        ktime_t now, iowait;
 535
 536        if (!tick_nohz_active)
 537                return -1;
 538
 539        now = ktime_get();
 540        if (last_update_time) {
 541                update_ts_time_stats(cpu, ts, now, last_update_time);
 542                iowait = ts->iowait_sleeptime;
 543        } else {
 544                if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
 545                        ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 546
 547                        iowait = ktime_add(ts->iowait_sleeptime, delta);
 548                } else {
 549                        iowait = ts->iowait_sleeptime;
 550                }
 551        }
 552
 553        return ktime_to_us(iowait);
 554}
 555EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 556
 557static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 558{
 559        hrtimer_cancel(&ts->sched_timer);
 560        hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
 561
 562        /* Forward the time to expire in the future */
 563        hrtimer_forward(&ts->sched_timer, now, tick_period);
 564
 565        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 566                hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
 567        else
 568                tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 569}
 570
 571static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 572                                         ktime_t now, int cpu)
 573{
 574        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 575        u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
 576        unsigned long seq, basejiff;
 577        ktime_t tick;
 578
 579        /* Read jiffies and the time when jiffies were updated last */
 580        do {
 581                seq = read_seqbegin(&jiffies_lock);
 582                basemono = last_jiffies_update.tv64;
 583                basejiff = jiffies;
 584        } while (read_seqretry(&jiffies_lock, seq));
 585        ts->last_jiffies = basejiff;
 586
 587        if (rcu_needs_cpu(basemono, &next_rcu) ||
 588            arch_needs_cpu() || irq_work_needs_cpu()) {
 589                next_tick = basemono + TICK_NSEC;
 590        } else {
 591                /*
 592                 * Get the next pending timer. If high resolution
 593                 * timers are enabled this only takes the timer wheel
 594                 * timers into account. If high resolution timers are
 595                 * disabled this also looks at the next expiring
 596                 * hrtimer.
 597                 */
 598                next_tmr = get_next_timer_interrupt(basejiff, basemono);
 599                ts->next_timer = next_tmr;
 600                /* Take the next rcu event into account */
 601                next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
 602        }
 603
 604        /*
 605         * If the tick is due in the next period, keep it ticking or
 606         * restart it proper.
 607         */
 608        delta = next_tick - basemono;
 609        if (delta <= (u64)TICK_NSEC) {
 610                tick.tv64 = 0;
 611                if (!ts->tick_stopped)
 612                        goto out;
 613                if (delta == 0) {
 614                        /* Tick is stopped, but required now. Enforce it */
 615                        tick_nohz_restart(ts, now);
 616                        goto out;
 617                }
 618        }
 619
 620        /*
 621         * If this cpu is the one which updates jiffies, then give up
 622         * the assignment and let it be taken by the cpu which runs
 623         * the tick timer next, which might be this cpu as well. If we
 624         * don't drop this here the jiffies might be stale and
 625         * do_timer() never invoked. Keep track of the fact that it
 626         * was the one which had the do_timer() duty last. If this cpu
 627         * is the one which had the do_timer() duty last, we limit the
 628         * sleep time to the timekeeping max_deferement value.
 629         * Otherwise we can sleep as long as we want.
 630         */
 631        delta = timekeeping_max_deferment();
 632        if (cpu == tick_do_timer_cpu) {
 633                tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 634                ts->do_timer_last = 1;
 635        } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
 636                delta = KTIME_MAX;
 637                ts->do_timer_last = 0;
 638        } else if (!ts->do_timer_last) {
 639                delta = KTIME_MAX;
 640        }
 641
 642#ifdef CONFIG_NO_HZ_FULL
 643        /* Limit the tick delta to the maximum scheduler deferment */
 644        if (!ts->inidle)
 645                delta = min(delta, scheduler_tick_max_deferment());
 646#endif
 647
 648        /* Calculate the next expiry time */
 649        if (delta < (KTIME_MAX - basemono))
 650                expires = basemono + delta;
 651        else
 652                expires = KTIME_MAX;
 653
 654        expires = min_t(u64, expires, next_tick);
 655        tick.tv64 = expires;
 656
 657        /* Skip reprogram of event if its not changed */
 658        if (ts->tick_stopped && (expires == dev->next_event.tv64))
 659                goto out;
 660
 661        /*
 662         * nohz_stop_sched_tick can be called several times before
 663         * the nohz_restart_sched_tick is called. This happens when
 664         * interrupts arrive which do not cause a reschedule. In the
 665         * first call we save the current tick time, so we can restart
 666         * the scheduler tick in nohz_restart_sched_tick.
 667         */
 668        if (!ts->tick_stopped) {
 669                nohz_balance_enter_idle(cpu);
 670                calc_load_enter_idle();
 671
 672                ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 673                ts->tick_stopped = 1;
 674                trace_tick_stop(1, " ");
 675        }
 676
 677        /*
 678         * If the expiration time == KTIME_MAX, then we simply stop
 679         * the tick timer.
 680         */
 681        if (unlikely(expires == KTIME_MAX)) {
 682                if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 683                        hrtimer_cancel(&ts->sched_timer);
 684                goto out;
 685        }
 686
 687        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 688                hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
 689        else
 690                tick_program_event(tick, 1);
 691out:
 692        /* Update the estimated sleep length */
 693        ts->sleep_length = ktime_sub(dev->next_event, now);
 694        return tick;
 695}
 696
 697static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 698{
 699        /* Update jiffies first */
 700        tick_do_update_jiffies64(now);
 701        update_cpu_load_nohz();
 702
 703        calc_load_exit_idle();
 704        touch_softlockup_watchdog();
 705        /*
 706         * Cancel the scheduled timer and restore the tick
 707         */
 708        ts->tick_stopped  = 0;
 709        ts->idle_exittime = now;
 710
 711        tick_nohz_restart(ts, now);
 712}
 713
 714static void tick_nohz_full_update_tick(struct tick_sched *ts)
 715{
 716#ifdef CONFIG_NO_HZ_FULL
 717        int cpu = smp_processor_id();
 718
 719        if (!tick_nohz_full_cpu(cpu))
 720                return;
 721
 722        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
 723                return;
 724
 725        if (can_stop_full_tick())
 726                tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 727        else if (ts->tick_stopped)
 728                tick_nohz_restart_sched_tick(ts, ktime_get());
 729#endif
 730}
 731
 732static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 733{
 734        /*
 735         * If this cpu is offline and it is the one which updates
 736         * jiffies, then give up the assignment and let it be taken by
 737         * the cpu which runs the tick timer next. If we don't drop
 738         * this here the jiffies might be stale and do_timer() never
 739         * invoked.
 740         */
 741        if (unlikely(!cpu_online(cpu))) {
 742                if (cpu == tick_do_timer_cpu)
 743                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 744                return false;
 745        }
 746
 747        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
 748                ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
 749                return false;
 750        }
 751
 752        if (need_resched())
 753                return false;
 754
 755        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
 756                static int ratelimit;
 757
 758                if (ratelimit < 10 &&
 759                    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
 760                        pr_warn("NOHZ: local_softirq_pending %02x\n",
 761                                (unsigned int) local_softirq_pending());
 762                        ratelimit++;
 763                }
 764                return false;
 765        }
 766
 767        if (tick_nohz_full_enabled()) {
 768                /*
 769                 * Keep the tick alive to guarantee timekeeping progression
 770                 * if there are full dynticks CPUs around
 771                 */
 772                if (tick_do_timer_cpu == cpu)
 773                        return false;
 774                /*
 775                 * Boot safety: make sure the timekeeping duty has been
 776                 * assigned before entering dyntick-idle mode,
 777                 */
 778                if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
 779                        return false;
 780        }
 781
 782        return true;
 783}
 784
 785static void __tick_nohz_idle_enter(struct tick_sched *ts)
 786{
 787        ktime_t now, expires;
 788        int cpu = smp_processor_id();
 789
 790        now = tick_nohz_start_idle(ts);
 791
 792        if (can_stop_idle_tick(cpu, ts)) {
 793                int was_stopped = ts->tick_stopped;
 794
 795                ts->idle_calls++;
 796
 797                expires = tick_nohz_stop_sched_tick(ts, now, cpu);
 798                if (expires.tv64 > 0LL) {
 799                        ts->idle_sleeps++;
 800                        ts->idle_expires = expires;
 801                }
 802
 803                if (!was_stopped && ts->tick_stopped)
 804                        ts->idle_jiffies = ts->last_jiffies;
 805        }
 806}
 807
 808/**
 809 * tick_nohz_idle_enter - stop the idle tick from the idle task
 810 *
 811 * When the next event is more than a tick into the future, stop the idle tick
 812 * Called when we start the idle loop.
 813 *
 814 * The arch is responsible of calling:
 815 *
 816 * - rcu_idle_enter() after its last use of RCU before the CPU is put
 817 *  to sleep.
 818 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
 819 */
 820void tick_nohz_idle_enter(void)
 821{
 822        struct tick_sched *ts;
 823
 824        WARN_ON_ONCE(irqs_disabled());
 825
 826        /*
 827         * Update the idle state in the scheduler domain hierarchy
 828         * when tick_nohz_stop_sched_tick() is called from the idle loop.
 829         * State will be updated to busy during the first busy tick after
 830         * exiting idle.
 831         */
 832        set_cpu_sd_state_idle();
 833
 834        local_irq_disable();
 835
 836        ts = this_cpu_ptr(&tick_cpu_sched);
 837        ts->inidle = 1;
 838        __tick_nohz_idle_enter(ts);
 839
 840        local_irq_enable();
 841}
 842
 843/**
 844 * tick_nohz_irq_exit - update next tick event from interrupt exit
 845 *
 846 * When an interrupt fires while we are idle and it doesn't cause
 847 * a reschedule, it may still add, modify or delete a timer, enqueue
 848 * an RCU callback, etc...
 849 * So we need to re-calculate and reprogram the next tick event.
 850 */
 851void tick_nohz_irq_exit(void)
 852{
 853        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 854
 855        if (ts->inidle)
 856                __tick_nohz_idle_enter(ts);
 857        else
 858                tick_nohz_full_update_tick(ts);
 859}
 860
 861/**
 862 * tick_nohz_get_sleep_length - return the length of the current sleep
 863 *
 864 * Called from power state control code with interrupts disabled
 865 */
 866ktime_t tick_nohz_get_sleep_length(void)
 867{
 868        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 869
 870        return ts->sleep_length;
 871}
 872
 873static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 874{
 875#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 876        unsigned long ticks;
 877
 878        if (vtime_accounting_enabled())
 879                return;
 880        /*
 881         * We stopped the tick in idle. Update process times would miss the
 882         * time we slept as update_process_times does only a 1 tick
 883         * accounting. Enforce that this is accounted to idle !
 884         */
 885        ticks = jiffies - ts->idle_jiffies;
 886        /*
 887         * We might be one off. Do not randomly account a huge number of ticks!
 888         */
 889        if (ticks && ticks < LONG_MAX)
 890                account_idle_ticks(ticks);
 891#endif
 892}
 893
 894/**
 895 * tick_nohz_idle_exit - restart the idle tick from the idle task
 896 *
 897 * Restart the idle tick when the CPU is woken up from idle
 898 * This also exit the RCU extended quiescent state. The CPU
 899 * can use RCU again after this function is called.
 900 */
 901void tick_nohz_idle_exit(void)
 902{
 903        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 904        ktime_t now;
 905
 906        local_irq_disable();
 907
 908        WARN_ON_ONCE(!ts->inidle);
 909
 910        ts->inidle = 0;
 911
 912        if (ts->idle_active || ts->tick_stopped)
 913                now = ktime_get();
 914
 915        if (ts->idle_active)
 916                tick_nohz_stop_idle(ts, now);
 917
 918        if (ts->tick_stopped) {
 919                tick_nohz_restart_sched_tick(ts, now);
 920                tick_nohz_account_idle_ticks(ts);
 921        }
 922
 923        local_irq_enable();
 924}
 925
 926/*
 927 * The nohz low res interrupt handler
 928 */
 929static void tick_nohz_handler(struct clock_event_device *dev)
 930{
 931        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 932        struct pt_regs *regs = get_irq_regs();
 933        ktime_t now = ktime_get();
 934
 935        dev->next_event.tv64 = KTIME_MAX;
 936
 937        tick_sched_do_timer(now);
 938        tick_sched_handle(ts, regs);
 939
 940        /* No need to reprogram if we are running tickless  */
 941        if (unlikely(ts->tick_stopped))
 942                return;
 943
 944        hrtimer_forward(&ts->sched_timer, now, tick_period);
 945        tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 946}
 947
 948static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
 949{
 950        if (!tick_nohz_enabled)
 951                return;
 952        ts->nohz_mode = mode;
 953        /* One update is enough */
 954        if (!test_and_set_bit(0, &tick_nohz_active))
 955                timers_update_migration(true);
 956}
 957
 958/**
 959 * tick_nohz_switch_to_nohz - switch to nohz mode
 960 */
 961static void tick_nohz_switch_to_nohz(void)
 962{
 963        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 964        ktime_t next;
 965
 966        if (!tick_nohz_enabled)
 967                return;
 968
 969        if (tick_switch_to_oneshot(tick_nohz_handler))
 970                return;
 971
 972        /*
 973         * Recycle the hrtimer in ts, so we can share the
 974         * hrtimer_forward with the highres code.
 975         */
 976        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 977        /* Get the next period */
 978        next = tick_init_jiffy_update();
 979
 980        hrtimer_forward_now(&ts->sched_timer, tick_period);
 981        hrtimer_set_expires(&ts->sched_timer, next);
 982        tick_program_event(next, 1);
 983        tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 984}
 985
 986/*
 987 * When NOHZ is enabled and the tick is stopped, we need to kick the
 988 * tick timer from irq_enter() so that the jiffies update is kept
 989 * alive during long running softirqs. That's ugly as hell, but
 990 * correctness is key even if we need to fix the offending softirq in
 991 * the first place.
 992 *
 993 * Note, this is different to tick_nohz_restart. We just kick the
 994 * timer and do not touch the other magic bits which need to be done
 995 * when idle is left.
 996 */
 997static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
 998{
 999#if 0
1000        /* Switch back to 2.6.27 behaviour */
1001        ktime_t delta;
1002
1003        /*
1004         * Do not touch the tick device, when the next expiry is either
1005         * already reached or less/equal than the tick period.
1006         */
1007        delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
1008        if (delta.tv64 <= tick_period.tv64)
1009                return;
1010
1011        tick_nohz_restart(ts, now);
1012#endif
1013}
1014
1015static inline void tick_nohz_irq_enter(void)
1016{
1017        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1018        ktime_t now;
1019
1020        if (!ts->idle_active && !ts->tick_stopped)
1021                return;
1022        now = ktime_get();
1023        if (ts->idle_active)
1024                tick_nohz_stop_idle(ts, now);
1025        if (ts->tick_stopped) {
1026                tick_nohz_update_jiffies(now);
1027                tick_nohz_kick_tick(ts, now);
1028        }
1029}
1030
1031#else
1032
1033static inline void tick_nohz_switch_to_nohz(void) { }
1034static inline void tick_nohz_irq_enter(void) { }
1035static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
1036
1037#endif /* CONFIG_NO_HZ_COMMON */
1038
1039/*
1040 * Called from irq_enter to notify about the possible interruption of idle()
1041 */
1042void tick_irq_enter(void)
1043{
1044        tick_check_oneshot_broadcast_this_cpu();
1045        tick_nohz_irq_enter();
1046}
1047
1048/*
1049 * High resolution timer specific code
1050 */
1051#ifdef CONFIG_HIGH_RES_TIMERS
1052/*
1053 * We rearm the timer until we get disabled by the idle code.
1054 * Called with interrupts disabled.
1055 */
1056static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1057{
1058        struct tick_sched *ts =
1059                container_of(timer, struct tick_sched, sched_timer);
1060        struct pt_regs *regs = get_irq_regs();
1061        ktime_t now = ktime_get();
1062
1063        tick_sched_do_timer(now);
1064
1065        /*
1066         * Do not call, when we are not in irq context and have
1067         * no valid regs pointer
1068         */
1069        if (regs)
1070                tick_sched_handle(ts, regs);
1071
1072        /* No need to reprogram if we are in idle or full dynticks mode */
1073        if (unlikely(ts->tick_stopped))
1074                return HRTIMER_NORESTART;
1075
1076        hrtimer_forward(timer, now, tick_period);
1077
1078        return HRTIMER_RESTART;
1079}
1080
1081static int sched_skew_tick;
1082
1083static int __init skew_tick(char *str)
1084{
1085        get_option(&str, &sched_skew_tick);
1086
1087        return 0;
1088}
1089early_param("skew_tick", skew_tick);
1090
1091/**
1092 * tick_setup_sched_timer - setup the tick emulation timer
1093 */
1094void tick_setup_sched_timer(void)
1095{
1096        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1097        ktime_t now = ktime_get();
1098
1099        /*
1100         * Emulate tick processing via per-CPU hrtimers:
1101         */
1102        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1103        ts->sched_timer.function = tick_sched_timer;
1104
1105        /* Get the next period (per cpu) */
1106        hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1107
1108        /* Offset the tick to avert jiffies_lock contention. */
1109        if (sched_skew_tick) {
1110                u64 offset = ktime_to_ns(tick_period) >> 1;
1111                do_div(offset, num_possible_cpus());
1112                offset *= smp_processor_id();
1113                hrtimer_add_expires_ns(&ts->sched_timer, offset);
1114        }
1115
1116        hrtimer_forward(&ts->sched_timer, now, tick_period);
1117        hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
1118        tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
1119}
1120#endif /* HIGH_RES_TIMERS */
1121
1122#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
1123void tick_cancel_sched_timer(int cpu)
1124{
1125        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1126
1127# ifdef CONFIG_HIGH_RES_TIMERS
1128        if (ts->sched_timer.base)
1129                hrtimer_cancel(&ts->sched_timer);
1130# endif
1131
1132        memset(ts, 0, sizeof(*ts));
1133}
1134#endif
1135
1136/**
1137 * Async notification about clocksource changes
1138 */
1139void tick_clock_notify(void)
1140{
1141        int cpu;
1142
1143        for_each_possible_cpu(cpu)
1144                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
1145}
1146
1147/*
1148 * Async notification about clock event changes
1149 */
1150void tick_oneshot_notify(void)
1151{
1152        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1153
1154        set_bit(0, &ts->check_clocks);
1155}
1156
1157/**
1158 * Check, if a change happened, which makes oneshot possible.
1159 *
1160 * Called cyclic from the hrtimer softirq (driven by the timer
1161 * softirq) allow_nohz signals, that we can switch into low-res nohz
1162 * mode, because high resolution timers are disabled (either compile
1163 * or runtime). Called with interrupts disabled.
1164 */
1165int tick_check_oneshot_change(int allow_nohz)
1166{
1167        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1168
1169        if (!test_and_clear_bit(0, &ts->check_clocks))
1170                return 0;
1171
1172        if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
1173                return 0;
1174
1175        if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
1176                return 0;
1177
1178        if (!allow_nohz)
1179                return 1;
1180
1181        tick_nohz_switch_to_nohz();
1182        return 0;
1183}
1184