linux/drivers/cpuidle/governors/teo.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Timer events oriented CPU idle governor
   4 *
   5 * Copyright (C) 2018 - 2021 Intel Corporation
   6 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
   7 */
   8
   9/**
  10 * DOC: teo-description
  11 *
  12 * The idea of this governor is based on the observation that on many systems
  13 * timer events are two or more orders of magnitude more frequent than any
  14 * other interrupts, so they are likely to be the most significant cause of CPU
  15 * wakeups from idle states.  Moreover, information about what happened in the
  16 * (relatively recent) past can be used to estimate whether or not the deepest
  17 * idle state with target residency within the (known) time till the closest
  18 * timer event, referred to as the sleep length, is likely to be suitable for
  19 * the upcoming CPU idle period and, if not, then which of the shallower idle
  20 * states to choose instead of it.
  21 *
  22 * Of course, non-timer wakeup sources are more important in some use cases
  23 * which can be covered by taking a few most recent idle time intervals of the
  24 * CPU into account.  However, even in that context it is not necessary to
  25 * consider idle duration values greater than the sleep length, because the
  26 * closest timer will ultimately wake up the CPU anyway unless it is woken up
  27 * earlier.
  28 *
  29 * Thus this governor estimates whether or not the prospective idle duration of
  30 * a CPU is likely to be significantly shorter than the sleep length and selects
  31 * an idle state for it accordingly.
  32 *
  33 * The computations carried out by this governor are based on using bins whose
  34 * boundaries are aligned with the target residency parameter values of the CPU
  35 * idle states provided by the %CPUIdle driver in the ascending order.  That is,
  36 * the first bin spans from 0 up to, but not including, the target residency of
  37 * the second idle state (idle state 1), the second bin spans from the target
  38 * residency of idle state 1 up to, but not including, the target residency of
  39 * idle state 2, the third bin spans from the target residency of idle state 2
  40 * up to, but not including, the target residency of idle state 3 and so on.
  41 * The last bin spans from the target residency of the deepest idle state
  42 * supplied by the driver to infinity.
  43 *
  44 * Two metrics called "hits" and "intercepts" are associated with each bin.
  45 * They are updated every time before selecting an idle state for the given CPU
  46 * in accordance with what happened last time.
  47 *
  48 * The "hits" metric reflects the relative frequency of situations in which the
  49 * sleep length and the idle duration measured after CPU wakeup fall into the
  50 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
  51 * length).  In turn, the "intercepts" metric reflects the relative frequency of
  52 * situations in which the measured idle duration is so much shorter than the
  53 * sleep length that the bin it falls into corresponds to an idle state
  54 * shallower than the one whose bin is fallen into by the sleep length (these
  55 * situations are referred to as "intercepts" below).
  56 *
  57 * In addition to the metrics described above, the governor counts recent
  58 * intercepts (that is, intercepts that have occurred during the last
  59 * %NR_RECENT invocations of it for the given CPU) for each bin.
  60 *
  61 * In order to select an idle state for a CPU, the governor takes the following
  62 * steps (modulo the possible latency constraint that must be taken into account
  63 * too):
  64 *
  65 * 1. Find the deepest CPU idle state whose target residency does not exceed
  66 *    the current sleep length (the candidate idle state) and compute 3 sums as
  67 *    follows:
  68 *
  69 *    - The sum of the "hits" and "intercepts" metrics for the candidate state
  70 *      and all of the deeper idle states (it represents the cases in which the
  71 *      CPU was idle long enough to avoid being intercepted if the sleep length
  72 *      had been equal to the current one).
  73 *
  74 *    - The sum of the "intercepts" metrics for all of the idle states shallower
  75 *      than the candidate one (it represents the cases in which the CPU was not
  76 *      idle long enough to avoid being intercepted if the sleep length had been
  77 *      equal to the current one).
  78 *
  79 *    - The sum of the numbers of recent intercepts for all of the idle states
  80 *      shallower than the candidate one.
  81 *
  82 * 2. If the second sum is greater than the first one or the third sum is
  83 *    greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look
  84 *    for an alternative idle state to select.
  85 *
  86 *    - Traverse the idle states shallower than the candidate one in the
  87 *      descending order.
  88 *
  89 *    - For each of them compute the sum of the "intercepts" metrics and the sum
  90 *      of the numbers of recent intercepts over all of the idle states between
  91 *      it and the candidate one (including the former and excluding the
  92 *      latter).
  93 *
  94 *    - If each of these sums that needs to be taken into account (because the
  95 *      check related to it has indicated that the CPU is likely to wake up
  96 *      early) is greater than a half of the corresponding sum computed in step
  97 *      1 (which means that the target residency of the state in question had
  98 *      not exceeded the idle duration in over a half of the relevant cases),
  99 *      select the given idle state instead of the candidate one.
 100 *
 101 * 3. By default, select the candidate state.
 102 */
 103
 104#include <linux/cpuidle.h>
 105#include <linux/jiffies.h>
 106#include <linux/kernel.h>
 107#include <linux/sched/clock.h>
 108#include <linux/tick.h>
 109
 110/*
 111 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
 112 * is used for decreasing metrics on a regular basis.
 113 */
 114#define PULSE           1024
 115#define DECAY_SHIFT     3
 116
 117/*
 118 * Number of the most recent idle duration values to take into consideration for
 119 * the detection of recent early wakeup patterns.
 120 */
 121#define NR_RECENT       9
 122
 123/**
 124 * struct teo_bin - Metrics used by the TEO cpuidle governor.
 125 * @intercepts: The "intercepts" metric.
 126 * @hits: The "hits" metric.
 127 * @recent: The number of recent "intercepts".
 128 */
 129struct teo_bin {
 130        unsigned int intercepts;
 131        unsigned int hits;
 132        unsigned int recent;
 133};
 134
 135/**
 136 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 137 * @time_span_ns: Time between idle state selection and post-wakeup update.
 138 * @sleep_length_ns: Time till the closest timer event (at the selection time).
 139 * @state_bins: Idle state data bins for this CPU.
 140 * @total: Grand total of the "intercepts" and "hits" mertics for all bins.
 141 * @next_recent_idx: Index of the next @recent_idx entry to update.
 142 * @recent_idx: Indices of bins corresponding to recent "intercepts".
 143 */
 144struct teo_cpu {
 145        s64 time_span_ns;
 146        s64 sleep_length_ns;
 147        struct teo_bin state_bins[CPUIDLE_STATE_MAX];
 148        unsigned int total;
 149        int next_recent_idx;
 150        int recent_idx[NR_RECENT];
 151};
 152
 153static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
 154
 155/**
 156 * teo_update - Update CPU metrics after wakeup.
 157 * @drv: cpuidle driver containing state data.
 158 * @dev: Target CPU.
 159 */
 160static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 161{
 162        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 163        int i, idx_timer = 0, idx_duration = 0;
 164        u64 measured_ns;
 165
 166        if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
 167                /*
 168                 * One of the safety nets has triggered or the wakeup was close
 169                 * enough to the closest timer event expected at the idle state
 170                 * selection time to be discarded.
 171                 */
 172                measured_ns = U64_MAX;
 173        } else {
 174                u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
 175
 176                /*
 177                 * The computations below are to determine whether or not the
 178                 * (saved) time till the next timer event and the measured idle
 179                 * duration fall into the same "bin", so use last_residency_ns
 180                 * for that instead of time_span_ns which includes the cpuidle
 181                 * overhead.
 182                 */
 183                measured_ns = dev->last_residency_ns;
 184                /*
 185                 * The delay between the wakeup and the first instruction
 186                 * executed by the CPU is not likely to be worst-case every
 187                 * time, so take 1/2 of the exit latency as a very rough
 188                 * approximation of the average of it.
 189                 */
 190                if (measured_ns >= lat_ns)
 191                        measured_ns -= lat_ns / 2;
 192                else
 193                        measured_ns /= 2;
 194        }
 195
 196        cpu_data->total = 0;
 197
 198        /*
 199         * Decay the "hits" and "intercepts" metrics for all of the bins and
 200         * find the bins that the sleep length and the measured idle duration
 201         * fall into.
 202         */
 203        for (i = 0; i < drv->state_count; i++) {
 204                s64 target_residency_ns = drv->states[i].target_residency_ns;
 205                struct teo_bin *bin = &cpu_data->state_bins[i];
 206
 207                bin->hits -= bin->hits >> DECAY_SHIFT;
 208                bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
 209
 210                cpu_data->total += bin->hits + bin->intercepts;
 211
 212                if (target_residency_ns <= cpu_data->sleep_length_ns) {
 213                        idx_timer = i;
 214                        if (target_residency_ns <= measured_ns)
 215                                idx_duration = i;
 216                }
 217        }
 218
 219        i = cpu_data->next_recent_idx++;
 220        if (cpu_data->next_recent_idx >= NR_RECENT)
 221                cpu_data->next_recent_idx = 0;
 222
 223        if (cpu_data->recent_idx[i] >= 0)
 224                cpu_data->state_bins[cpu_data->recent_idx[i]].recent--;
 225
 226        /*
 227         * If the measured idle duration falls into the same bin as the sleep
 228         * length, this is a "hit", so update the "hits" metric for that bin.
 229         * Otherwise, update the "intercepts" metric for the bin fallen into by
 230         * the measured idle duration.
 231         */
 232        if (idx_timer == idx_duration) {
 233                cpu_data->state_bins[idx_timer].hits += PULSE;
 234                cpu_data->recent_idx[i] = -1;
 235        } else {
 236                cpu_data->state_bins[idx_duration].intercepts += PULSE;
 237                cpu_data->state_bins[idx_duration].recent++;
 238                cpu_data->recent_idx[i] = idx_duration;
 239        }
 240
 241        cpu_data->total += PULSE;
 242}
 243
 244static bool teo_time_ok(u64 interval_ns)
 245{
 246        return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC;
 247}
 248
 249static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
 250{
 251        return (drv->states[idx].target_residency_ns +
 252                drv->states[idx+1].target_residency_ns) / 2;
 253}
 254
 255/**
 256 * teo_find_shallower_state - Find shallower idle state matching given duration.
 257 * @drv: cpuidle driver containing state data.
 258 * @dev: Target CPU.
 259 * @state_idx: Index of the capping idle state.
 260 * @duration_ns: Idle duration value to match.
 261 */
 262static int teo_find_shallower_state(struct cpuidle_driver *drv,
 263                                    struct cpuidle_device *dev, int state_idx,
 264                                    s64 duration_ns)
 265{
 266        int i;
 267
 268        for (i = state_idx - 1; i >= 0; i--) {
 269                if (dev->states_usage[i].disable)
 270                        continue;
 271
 272                state_idx = i;
 273                if (drv->states[i].target_residency_ns <= duration_ns)
 274                        break;
 275        }
 276        return state_idx;
 277}
 278
 279/**
 280 * teo_select - Selects the next idle state to enter.
 281 * @drv: cpuidle driver containing state data.
 282 * @dev: Target CPU.
 283 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 284 */
 285static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 286                      bool *stop_tick)
 287{
 288        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 289        s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
 290        unsigned int idx_intercept_sum = 0;
 291        unsigned int intercept_sum = 0;
 292        unsigned int idx_recent_sum = 0;
 293        unsigned int recent_sum = 0;
 294        unsigned int idx_hit_sum = 0;
 295        unsigned int hit_sum = 0;
 296        int constraint_idx = 0;
 297        int idx0 = 0, idx = -1;
 298        bool alt_intercepts, alt_recent;
 299        ktime_t delta_tick;
 300        s64 duration_ns;
 301        int i;
 302
 303        if (dev->last_state_idx >= 0) {
 304                teo_update(drv, dev);
 305                dev->last_state_idx = -1;
 306        }
 307
 308        cpu_data->time_span_ns = local_clock();
 309
 310        duration_ns = tick_nohz_get_sleep_length(&delta_tick);
 311        cpu_data->sleep_length_ns = duration_ns;
 312
 313        /* Check if there is any choice in the first place. */
 314        if (drv->state_count < 2) {
 315                idx = 0;
 316                goto end;
 317        }
 318        if (!dev->states_usage[0].disable) {
 319                idx = 0;
 320                if (drv->states[1].target_residency_ns > duration_ns)
 321                        goto end;
 322        }
 323
 324        /*
 325         * Find the deepest idle state whose target residency does not exceed
 326         * the current sleep length and the deepest idle state not deeper than
 327         * the former whose exit latency does not exceed the current latency
 328         * constraint.  Compute the sums of metrics for early wakeup pattern
 329         * detection.
 330         */
 331        for (i = 1; i < drv->state_count; i++) {
 332                struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
 333                struct cpuidle_state *s = &drv->states[i];
 334
 335                /*
 336                 * Update the sums of idle state mertics for all of the states
 337                 * shallower than the current one.
 338                 */
 339                intercept_sum += prev_bin->intercepts;
 340                hit_sum += prev_bin->hits;
 341                recent_sum += prev_bin->recent;
 342
 343                if (dev->states_usage[i].disable)
 344                        continue;
 345
 346                if (idx < 0) {
 347                        idx = i; /* first enabled state */
 348                        idx0 = i;
 349                }
 350
 351                if (s->target_residency_ns > duration_ns)
 352                        break;
 353
 354                idx = i;
 355
 356                if (s->exit_latency_ns <= latency_req)
 357                        constraint_idx = i;
 358
 359                idx_intercept_sum = intercept_sum;
 360                idx_hit_sum = hit_sum;
 361                idx_recent_sum = recent_sum;
 362        }
 363
 364        /* Avoid unnecessary overhead. */
 365        if (idx < 0) {
 366                idx = 0; /* No states enabled, must use 0. */
 367                goto end;
 368        } else if (idx == idx0) {
 369                goto end;
 370        }
 371
 372        /*
 373         * If the sum of the intercepts metric for all of the idle states
 374         * shallower than the current candidate one (idx) is greater than the
 375         * sum of the intercepts and hits metrics for the candidate state and
 376         * all of the deeper states, or the sum of the numbers of recent
 377         * intercepts over all of the states shallower than the candidate one
 378         * is greater than a half of the number of recent events taken into
 379         * account, the CPU is likely to wake up early, so find an alternative
 380         * idle state to select.
 381         */
 382        alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum;
 383        alt_recent = idx_recent_sum > NR_RECENT / 2;
 384        if (alt_recent || alt_intercepts) {
 385                s64 first_suitable_span_ns = duration_ns;
 386                int first_suitable_idx = idx;
 387
 388                /*
 389                 * Look for the deepest idle state whose target residency had
 390                 * not exceeded the idle duration in over a half of the relevant
 391                 * cases (both with respect to intercepts overall and with
 392                 * respect to the recent intercepts only) in the past.
 393                 *
 394                 * Take the possible latency constraint and duration limitation
 395                 * present if the tick has been stopped already into account.
 396                 */
 397                intercept_sum = 0;
 398                recent_sum = 0;
 399
 400                for (i = idx - 1; i >= 0; i--) {
 401                        struct teo_bin *bin = &cpu_data->state_bins[i];
 402                        s64 span_ns;
 403
 404                        intercept_sum += bin->intercepts;
 405                        recent_sum += bin->recent;
 406
 407                        span_ns = teo_middle_of_bin(i, drv);
 408
 409                        if ((!alt_recent || 2 * recent_sum > idx_recent_sum) &&
 410                            (!alt_intercepts ||
 411                             2 * intercept_sum > idx_intercept_sum)) {
 412                                if (teo_time_ok(span_ns) &&
 413                                    !dev->states_usage[i].disable) {
 414                                        idx = i;
 415                                        duration_ns = span_ns;
 416                                } else {
 417                                        /*
 418                                         * The current state is too shallow or
 419                                         * disabled, so take the first enabled
 420                                         * deeper state with suitable time span.
 421                                         */
 422                                        idx = first_suitable_idx;
 423                                        duration_ns = first_suitable_span_ns;
 424                                }
 425                                break;
 426                        }
 427
 428                        if (dev->states_usage[i].disable)
 429                                continue;
 430
 431                        if (!teo_time_ok(span_ns)) {
 432                                /*
 433                                 * The current state is too shallow, but if an
 434                                 * alternative candidate state has been found,
 435                                 * it may still turn out to be a better choice.
 436                                 */
 437                                if (first_suitable_idx != idx)
 438                                        continue;
 439
 440                                break;
 441                        }
 442
 443                        first_suitable_span_ns = span_ns;
 444                        first_suitable_idx = i;
 445                }
 446        }
 447
 448        /*
 449         * If there is a latency constraint, it may be necessary to select an
 450         * idle state shallower than the current candidate one.
 451         */
 452        if (idx > constraint_idx)
 453                idx = constraint_idx;
 454
 455end:
 456        /*
 457         * Don't stop the tick if the selected state is a polling one or if the
 458         * expected idle duration is shorter than the tick period length.
 459         */
 460        if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
 461            duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
 462                *stop_tick = false;
 463
 464                /*
 465                 * The tick is not going to be stopped, so if the target
 466                 * residency of the state to be returned is not within the time
 467                 * till the closest timer including the tick, try to correct
 468                 * that.
 469                 */
 470                if (idx > idx0 &&
 471                    drv->states[idx].target_residency_ns > delta_tick)
 472                        idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
 473        }
 474
 475        return idx;
 476}
 477
 478/**
 479 * teo_reflect - Note that governor data for the CPU need to be updated.
 480 * @dev: Target CPU.
 481 * @state: Entered state.
 482 */
 483static void teo_reflect(struct cpuidle_device *dev, int state)
 484{
 485        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 486
 487        dev->last_state_idx = state;
 488        /*
 489         * If the wakeup was not "natural", but triggered by one of the safety
 490         * nets, assume that the CPU might have been idle for the entire sleep
 491         * length time.
 492         */
 493        if (dev->poll_time_limit ||
 494            (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
 495                dev->poll_time_limit = false;
 496                cpu_data->time_span_ns = cpu_data->sleep_length_ns;
 497        } else {
 498                cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
 499        }
 500}
 501
 502/**
 503 * teo_enable_device - Initialize the governor's data for the target CPU.
 504 * @drv: cpuidle driver (not used).
 505 * @dev: Target CPU.
 506 */
 507static int teo_enable_device(struct cpuidle_driver *drv,
 508                             struct cpuidle_device *dev)
 509{
 510        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 511        int i;
 512
 513        memset(cpu_data, 0, sizeof(*cpu_data));
 514
 515        for (i = 0; i < NR_RECENT; i++)
 516                cpu_data->recent_idx[i] = -1;
 517
 518        return 0;
 519}
 520
 521static struct cpuidle_governor teo_governor = {
 522        .name =         "teo",
 523        .rating =       19,
 524        .enable =       teo_enable_device,
 525        .select =       teo_select,
 526        .reflect =      teo_reflect,
 527};
 528
 529static int __init teo_governor_init(void)
 530{
 531        return cpuidle_register_governor(&teo_governor);
 532}
 533
 534postcore_initcall(teo_governor_init);
 535