linux/drivers/cpuidle/governors/menu.c
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 *
 * This code is licensed under the GPL version 2 as described
 * in the COPYING file that accompanies the Linux Kernel.
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/pm_qos.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/stat.h>
#include <linux/math64.h>
#include <linux/cpu.h>

/*
 * Please note when changing the tuning values:
 * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the result of
 * the scaling multiplication may overflow on 32 bit platforms.
 * In that case, #define RESOLUTION as ULL to get a 64 bit result:
 * #define RESOLUTION 1024ULL
 *
 * The default values do not overflow.
 */
#define BUCKETS 12
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING 50000

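/*
 * Illustrative check of the note above (defaults as currently defined):
 * the largest scaled product is bounded by
 *
 *	(MAX_INTERESTING - 1) * RESOLUTION = 49999 * 1024 = 51,198,976
 *
 * which is far below UINT_MAX (about 4.29e9), so the default tuning is
 * safe in 32 bit arithmetic and the 1024ULL variant is not needed.
 */
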
/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (from pmqos infrastructure)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor based on historic behavior
 * is applied to the estimate. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor; however, it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time. A second independent factor that has a big
 * impact on the actual factor is whether there is (disk) IO outstanding
 * or not.
 * (as a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this)
 *
 * For these two reasons we keep an array of 12 independent factors that gets
 * indexed based on the magnitude of the expected duration as well as the
 * "is IO outstanding" property (a worked numeric example follows this
 * comment block).
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable predictor:
 * Those cases where the interval is fixed, for example due to hardware
 * interrupt mitigation, but also due to fixed transfer rate devices such as
 * mice.
 * For this, we use a different predictor: We track the duration of the last 8
 * intervals and if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as prediction.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a very
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less C state impact is acceptable
 *
 * This rule-of-thumb is implemented using a performance-multiplier:
 * If the exit latency times the performance multiplier is longer than
 * the predicted duration, the C state is not considered a candidate
 * for selection due to a too high performance impact. So the higher
 * this multiplier is, the longer we need to be idle to pick a deep C
 * state, and thus the less likely a busy CPU will hit such a deep
 * C state.
 *
 * Two factors are used in determining this multiplier:
 * a value of roughly 20 is added for each unit of "per cpu load average"
 * we have.
 * a value of 10 is added for each process that is waiting for
 * IO on this CPU.
 * (these values are experimentally determined and mirror the arithmetic
 * in performance_multiplier() below)
 *
 * The load average factor gives a longer term (few seconds) input to the
 * decision, while the iowait value gives a cpu local instantaneous input.
 * The iowait factor may look low, but realize that this is also already
 * represented in the system load average.
 *
 */
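
/*
 * Worked example of the correction factor described above (hypothetical
 * numbers): suppose the next timer is 1000 us away and the bucket's stored
 * factor is 4096. With RESOLUTION 1024 and DECAY 8, unity is 1024 * 8 =
 * 8192, so 4096 encodes "historically we slept for about half of the
 * predicted time" and menu_select() scales the estimate to
 *
 *	predicted_us = 1000 * 4096 / (1024 * 8) = 500
 */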

struct menu_device {
        int             last_state_idx;
        int             needs_update;

        unsigned int    next_timer_us;
        unsigned int    predicted_us;
        unsigned int    bucket;
        unsigned int    correction_factor[BUCKETS];
        unsigned int    intervals[INTERVALS];
        int             interval_ptr;
};


#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

static inline int get_loadavg(unsigned long load)
{
        return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;
}
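
/*
 * Illustrative example (hypothetical load value): the load average is a
 * FIXED_1 fixed-point number (FSHIFT is 11, so FIXED_1 is 2048). A load
 * of 2.25 is stored as 4608, giving
 *
 *	LOAD_INT(4608)  = 2
 *	LOAD_FRAC(4608) = ((4608 & 2047) * 100) >> 11 = 25
 *	get_loadavg()   = 2 * 10 + 25 / 10 = 22
 *
 * i.e. roughly ten "points" per unit of load average.
 */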

static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters)
{
        int bucket = 0;

        /*
         * We keep two groups of stats; one with IO
         * pending, one without.
         * This allows us to calculate
         * E(duration)|iowait
         */
        if (nr_iowaiters)
                bucket = BUCKETS/2;

        if (duration < 10)
                return bucket;
        if (duration < 100)
                return bucket + 1;
        if (duration < 1000)
                return bucket + 2;
        if (duration < 10000)
                return bucket + 3;
        if (duration < 100000)
                return bucket + 4;
        return bucket + 5;
}
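
/*
 * Illustrative example (hypothetical inputs): a predicted sleep of 250 us
 * with no tasks in iowait lands in bucket 2; the same 250 us with at least
 * one iowaiter lands in bucket BUCKETS/2 + 2 = 8. The two halves of
 * correction_factor[] therefore learn separate ratios for the "IO pending"
 * and "no IO pending" cases.
 */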

/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned long load)
{
        int mult = 1;

        /* for higher loadavg, we are more reluctant */

        mult += 2 * get_loadavg(load);

        /* for IO wait tasks (per cpu!) we add 10 each */
        mult += 10 * nr_iowaiters;

        return mult;
}
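
/*
 * Worked example (hypothetical numbers): with a per-cpu load average of
 * 1.00 (get_loadavg() == 10) and two tasks waiting for IO on this CPU,
 *
 *	mult = 1 + 2 * 10 + 10 * 2 = 41
 *
 * so, absent a tighter PM QoS limit, menu_select() below only tolerates
 * an exit latency of up to predicted_us / 41, e.g. 100 us for a predicted
 * 4100 us of idle time.
 */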

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
static unsigned int get_typical_interval(struct menu_device *data)
{
        int i, divisor;
        unsigned int max, thresh, avg;
        uint64_t sum, variance;

        thresh = UINT_MAX; /* Discard outliers above this value */

again:

        /* First calculate the average of past intervals */
        max = 0;
        sum = 0;
        divisor = 0;
        for (i = 0; i < INTERVALS; i++) {
                unsigned int value = data->intervals[i];
                if (value <= thresh) {
                        sum += value;
                        divisor++;
                        if (value > max)
                                max = value;
                }
        }
        if (divisor == INTERVALS)
                avg = sum >> INTERVAL_SHIFT;
        else
                avg = div_u64(sum, divisor);

        /* Then try to determine variance */
        variance = 0;
        for (i = 0; i < INTERVALS; i++) {
                unsigned int value = data->intervals[i];
                if (value <= thresh) {
                        int64_t diff = (int64_t)value - avg;
                        variance += diff * diff;
                }
        }
        if (divisor == INTERVALS)
                variance >>= INTERVAL_SHIFT;
        else
                do_div(variance, divisor);

        /*
         * The typical interval is obtained when standard deviation is
         * small (stddev <= 20 us, variance <= 400 us^2) or standard
         * deviation is small compared to the average interval (avg >
         * 6*stddev, avg^2 > 36*variance). The average is smaller than
         * UINT_MAX aka U32_MAX, so computing its square does not
         * overflow a u64. We simply reject this candidate average if
         * the standard deviation is greater than 715 s (which is
         * rather unlikely).
         *
         * Use this result only if there is no timer to wake us up sooner.
         */
        if (likely(variance <= U64_MAX/36)) {
                if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
                                                        || variance <= 400) {
                        return avg;
                }
        }

        /*
         * If we have outliers to the upside in our distribution, discard
         * those by setting the threshold to exclude these outliers, then
         * calculate the average and standard deviation again. Once we get
         * down to the bottom 3/4 of our samples, stop excluding samples.
         *
         * This can deal with workloads that have long pauses interspersed
         * with sporadic activity with a bunch of short pauses.
         */
        if ((divisor * 4) <= INTERVALS * 3)
                return UINT_MAX;

        thresh = max - 1;
        goto again;
}
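
/*
 * Worked example (hypothetical samples): with the eight recorded intervals
 * { 100, 102, 98, 101, 99, 100, 103, 97 } us,
 *
 *	avg      = 800 >> INTERVAL_SHIFT = 100
 *	variance = (0 + 4 + 4 + 1 + 1 + 0 + 9 + 9) >> INTERVAL_SHIFT = 3
 *
 * and 3 <= 400 us^2 (stddev below 20 us), so the function returns 100 us.
 * Replacing one sample with a 10000 us outlier would fail both variance
 * tests and instead take the "discard outliers" path at the bottom.
 */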

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        struct device *device = get_cpu_device(dev->cpu);
        int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
        int i;
        int first_idx;
        int idx;
        unsigned int interactivity_req;
        unsigned int expected_interval;
        unsigned long nr_iowaiters, cpu_load;
        int resume_latency = dev_pm_qos_raw_read_value(device);

        if (data->needs_update) {
                menu_update(drv, dev);
                data->needs_update = 0;
        }

        if (resume_latency < latency_req &&
            resume_latency != PM_QOS_RESUME_LATENCY_NO_CONSTRAINT)
                latency_req = resume_latency;

        /* Special case when user has set very strict latency requirement */
        if (unlikely(latency_req == 0))
                return 0;

        /* determine the expected residency time, round up */
        data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());

        get_iowait_load(&nr_iowaiters, &cpu_load);
        data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);

        /*
         * Force the result of multiplication to be 64 bits even if both
         * operands are 32 bits.
         * Make sure to round up for half microseconds.
         */
        data->predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us *
                                         data->correction_factor[data->bucket],
                                         RESOLUTION * DECAY);

        expected_interval = get_typical_interval(data);
        expected_interval = min(expected_interval, data->next_timer_us);

        first_idx = 0;
        if (drv->states[0].flags & CPUIDLE_FLAG_POLLING) {
                struct cpuidle_state *s = &drv->states[1];
                unsigned int polling_threshold;

                /*
                 * We want to default to C1 (hlt), not to busy polling
                 * unless the timer is happening really really soon, or
                 * C1's exit latency exceeds the user configured limit.
                 */
                polling_threshold = max_t(unsigned int, 20, s->target_residency);
                if (data->next_timer_us > polling_threshold &&
                    latency_req > s->exit_latency && !s->disabled &&
                    !dev->states_usage[1].disable)
                        first_idx = 1;
        }

        /*
         * Use the lowest expected idle interval to pick the idle state.
         */
        data->predicted_us = min(data->predicted_us, expected_interval);

        /*
         * Use the performance multiplier and the user-configurable
         * latency_req to determine the maximum exit latency.
         */
        interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
        if (latency_req > interactivity_req)
                latency_req = interactivity_req;

        /*
         * Find the idle state with the lowest power while satisfying
         * our constraints.
         */
        idx = -1;
        for (i = first_idx; i < drv->state_count; i++) {
                struct cpuidle_state *s = &drv->states[i];
                struct cpuidle_state_usage *su = &dev->states_usage[i];

                if (s->disabled || su->disable)
                        continue;
                if (idx == -1)
                        idx = i; /* first enabled state */
                if (s->target_residency > data->predicted_us)
                        break;
                if (s->exit_latency > latency_req)
                        break;

                idx = i;
        }

        if (idx == -1)
                idx = 0; /* No states enabled. Must use 0. */

        data->last_state_idx = idx;

        return data->last_state_idx;
}
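
/*
 * Illustrative selection walk-through (hypothetical idle-state table, not
 * taken from any particular driver): C1 (target_residency 2 us, exit
 * latency 2 us), C3 (100 us, 50 us), C6 (400 us, 133 us). If predicted_us
 * ends up at 150 and latency_req stays above 50, the loop above advances
 * past C1 and C3 and breaks at C6, whose target_residency exceeds the
 * prediction, leaving idx at C3. If PM QoS or the interactivity cap pushed
 * latency_req down to 30 us, the loop would break at C3 instead and C1
 * would be chosen.
 */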

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 *       the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);

        data->last_state_idx = index;
        data->needs_update = 1;
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        int last_idx = data->last_state_idx;
        struct cpuidle_state *target = &drv->states[last_idx];
        unsigned int measured_us;
        unsigned int new_factor;

        /*
         * Try to figure out how much time passed between entry to low
         * power state and occurrence of the wakeup event.
         *
         * If the entered idle state didn't support residency measurements,
         * we use the measured time anyway if it is short, and truncate it
         * to the whole expected time if it is long.
         *
         * Any measured amount of time will include the exit latency.
         * Since we are interested in when the wakeup began, not when it
         * was completed, we must subtract the exit latency. However, if
         * the measured amount of time is less than the exit latency,
         * assume the state was never reached and the exit latency is 0.
         */

        /* measured value */
        measured_us = cpuidle_get_last_residency(dev);

        /* Deduct exit latency */
        if (measured_us > 2 * target->exit_latency)
                measured_us -= target->exit_latency;
        else
                measured_us /= 2;

        /* Make sure our coefficients do not exceed unity */
        if (measured_us > data->next_timer_us)
                measured_us = data->next_timer_us;

        /* Update our correction ratio */
        new_factor = data->correction_factor[data->bucket];
        new_factor -= new_factor / DECAY;

        if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
                new_factor += RESOLUTION * measured_us / data->next_timer_us;
        else
                /*
                 * we were idle so long that we count it as a perfect
                 * prediction
                 */
                new_factor += RESOLUTION;

        /*
         * We don't want 0 as factor; we always want at least
         * a tiny bit of estimated time. Fortunately, due to rounding,
         * new_factor will stay nonzero regardless of measured_us values
         * and the compiler can eliminate this test as long as DECAY > 1.
         */
        if (DECAY == 1 && unlikely(new_factor == 0))
                new_factor = 1;

        data->correction_factor[data->bucket] = new_factor;

        /* update the repeating-pattern data */
        data->intervals[data->interval_ptr++] = measured_us;
        if (data->interval_ptr >= INTERVALS)
                data->interval_ptr = 0;
}
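
/*
 * Worked example of the update above (hypothetical numbers): with a bucket
 * factor of 8192 (unity), next_timer_us = 1000 and measured_us = 300 after
 * the exit latency deduction,
 *
 *	new_factor = 8192 - 8192 / 8 + 1024 * 300 / 1000
 *	           = 7168 + 307 = 7475
 *
 * so over repeated identical wakeups the factor decays towards roughly
 * 30% of unity, matching the observed sleep ratio.
 */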

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
                                struct cpuidle_device *dev)
{
        struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
        int i;

        memset(data, 0, sizeof(struct menu_device));

        /*
         * if the correction factor is 0 (eg first time init or cpu hotplug
         * etc), we actually want to start out with a unity factor.
         */
        for (i = 0; i < BUCKETS; i++)
                data->correction_factor[i] = RESOLUTION * DECAY;

        return 0;
}
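
/*
 * Note on the initial value (follows from the defaults above):
 * RESOLUTION * DECAY = 1024 * 8 = 8192 is "unity", so a freshly enabled
 * CPU starts out trusting the next-timer estimate completely:
 *
 *	predicted_us = next_timer_us * 8192 / 8192 = next_timer_us
 *
 * and only subsequent menu_update() calls bend the factor towards the
 * observed sleep lengths.
 */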

static struct cpuidle_governor menu_governor = {
        .name =         "menu",
        .rating =       20,
        .enable =       menu_enable_device,
        .select =       menu_select,
        .reflect =      menu_reflect,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
        return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);