linux/drivers/cpuidle/governors/menu.c
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 *
 * This code is licensed under the GPL version 2 as described
 * in the COPYING file that accompanies the Linux Kernel.
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/pm_qos.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/module.h>

/*
 * Please note when changing the tuning values:
 * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the result of
 * a scaling operation multiplication may overflow on 32 bit platforms.
 * In that case, #define RESOLUTION as ULL to get 64 bit result:
 * #define RESOLUTION 1024ULL
 *
 * The default values do not overflow.
 */
#define BUCKETS 12
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING 50000
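
/*
 * Illustrative note (hypothetical numbers): the per-bucket correction
 * factors kept by this governor are fixed-point values in which
 * RESOLUTION * DECAY (1024 * 8 = 8192) stands for 1.0; menu_enable_device()
 * initialises every bucket to that unity value and menu_select() divides
 * it back out.  A stored factor of 4096 therefore means "expect roughly
 * half of the next timer interval":
 *
 *     predicted_us = 1000 * 4096 / (1024 * 8) = 500
 */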


/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (from pmqos infrastructure)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us with this duration in the "target_residency" field. So all that
 * we need is a good prediction of how long we'll be idle. Like the
 * traditional menu governor, we start with the actual known "next timer
 * event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor based on historic behavior
 * is applied to the estimate. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor; however, it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time, the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time. A second independent factor that has a big
 * impact on the actual factor is whether there is (disk) IO outstanding
 * or not.
 * (As a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this.)
 *
 * For these two reasons we keep an array of 12 independent factors, which
 * gets indexed based on the magnitude of the expected duration as well as
 * the "is IO outstanding" property.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable predictor:
 * Those cases where the interval is fixed, for example due to hardware
 * interrupt mitigation, but also due to fixed transfer rate devices such as
 * mice.
 * For this, we use a different predictor: We track the duration of the last 8
 * intervals and if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as prediction.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a really
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less impact from C states is acceptable.
 *
 * This rule-of-thumb is implemented using a performance-multiplier:
 * If the exit latency times the performance multiplier is longer than
 * the predicted duration, the C state is not considered a candidate
 * for selection because its performance impact would be too high. So the
 * higher this multiplier is, the longer we need to be idle to pick a deep C
 * state, and thus the less likely a busy CPU will hit such a deep
 * C state.
 *
 * Two factors are used in determining this multiplier:
 * a value of 20 is added for each point of "per cpu load average" we have.
 * a value of 10 is added for each process that is waiting for
 * IO on this CPU.
 * (these values are experimentally determined)
 *
 * The load average factor gives a longer term (few seconds) input to the
 * decision, while the iowait value gives a CPU-local, instantaneous input.
 * The iowait factor may look low, but realize that this is also already
 * represented in the system load average.
 *
 */
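
/*
 * Worked example (hypothetical numbers only): suppose the next timer is
 * 1000 us away and the bucket's correction factor corresponds to 0.5, so
 * predicted_us is about 500 us; PM QoS allows 100 us of exit latency and
 * the performance multiplier works out to 10.  The latency limit is then
 * clamped to min(100, 500 / 10) = 50 us, and the selection loop in
 * menu_select() picks the deepest state whose target_residency is at most
 * 500 us and whose exit_latency is at most 50 us.
 */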

struct menu_device {
        int             last_state_idx;
        int             needs_update;

        unsigned int    next_timer_us;
        unsigned int    predicted_us;
        unsigned int    bucket;
        unsigned int    correction_factor[BUCKETS];
        unsigned int    intervals[INTERVALS];
        int             interval_ptr;
};


#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

static inline int get_loadavg(unsigned long load)
{
        return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;
}
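
/*
 * Illustrative example (hypothetical load value): for a load average of
 * 2.5 expressed in FIXED_1 units, LOAD_INT() yields 2 and LOAD_FRAC()
 * yields 50, so get_loadavg() returns 2 * 10 + 50 / 10 = 25, i.e. roughly
 * ten per point of load average.
 */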

static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters)
{
        int bucket = 0;

        /*
         * We keep two groups of stats; one with IO pending,
         * one without.
         * This allows us to calculate
         * E(duration)|iowait
         */
        if (nr_iowaiters)
                bucket = BUCKETS/2;

        if (duration < 10)
                return bucket;
        if (duration < 100)
                return bucket + 1;
        if (duration < 1000)
                return bucket + 2;
        if (duration < 10000)
                return bucket + 3;
        if (duration < 100000)
                return bucket + 4;
        return bucket + 5;
}
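
/*
 * Illustrative example (hypothetical numbers): with at least one task in
 * iowait the bucket base is BUCKETS/2 = 6, so an expected sleep of 250 us
 * lands in bucket 6 + 2 = 8, while the same sleep with no IO pending
 * lands in bucket 2.
 */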

/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned long load)
{
        int mult = 1;

        /* for higher loadavg, we are more reluctant */

        mult += 2 * get_loadavg(load);

        /* for each task waiting for IO on this CPU we add 10 */
        mult += 10 * nr_iowaiters;

        return mult;
}
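
/*
 * Illustrative example (hypothetical numbers): with a load average of
 * about 1.0 (get_loadavg() ~= 10) and two tasks waiting for IO on this
 * CPU, the multiplier is 1 + 2 * 10 + 10 * 2 = 41, so, absent a stricter
 * PM QoS limit, menu_select() only considers states whose exit latency
 * is at most predicted_us / 41.
 */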

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */
static u64 div_round64(u64 dividend, u32 divisor)
{
        return div_u64(dividend + (divisor / 2), divisor);
}

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
static void get_typical_interval(struct menu_device *data)
{
        int i, divisor;
        unsigned int max, thresh;
        uint64_t avg, stddev;

        thresh = UINT_MAX; /* Discard outliers above this value */

again:

        /* First calculate the average of past intervals */
        max = 0;
        avg = 0;
        divisor = 0;
        for (i = 0; i < INTERVALS; i++) {
                unsigned int value = data->intervals[i];
                if (value <= thresh) {
                        avg += value;
                        divisor++;
                        if (value > max)
                                max = value;
                }
        }
        if (divisor == INTERVALS)
                avg >>= INTERVAL_SHIFT;
        else
                do_div(avg, divisor);

        /* Then try to determine standard deviation */
        stddev = 0;
        for (i = 0; i < INTERVALS; i++) {
                unsigned int value = data->intervals[i];
                if (value <= thresh) {
                        int64_t diff = value - avg;
                        stddev += diff * diff;
                }
        }
        if (divisor == INTERVALS)
                stddev >>= INTERVAL_SHIFT;
        else
                do_div(stddev, divisor);

        /*
         * The typical interval is obtained when standard deviation is small
         * or standard deviation is small compared to the average interval.
         *
         * int_sqrt() formal parameter type is unsigned long. When the
         * greatest difference to an outlier exceeds ~65 ms * sqrt(divisor)
         * the resulting squared standard deviation exceeds the input domain
         * of int_sqrt on platforms where unsigned long is 32 bits in size.
         * In such a case, reject the candidate average.
         *
         * Use this result only if there is no timer to wake us up sooner.
         */
        if (likely(stddev <= ULONG_MAX)) {
                stddev = int_sqrt(stddev);
                if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
                                                        || stddev <= 20) {
                        if (data->next_timer_us > avg)
                                data->predicted_us = avg;
                        return;
                }
        }

        /*
         * If we have outliers to the upside in our distribution, discard
         * those by setting the threshold to exclude these outliers, then
         * calculate the average and standard deviation again. Once we get
         * down to the bottom 3/4 of our samples, stop excluding samples.
         *
         * This can deal with workloads that have long pauses interspersed
         * with sporadic activity with a bunch of short pauses.
         */
        if ((divisor * 4) <= INTERVALS * 3)
                return;

        thresh = max - 1;
        goto again;
}
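
/*
 * Illustrative example (hypothetical samples): if the last eight intervals
 * are 100, 102, 98, 101, 99, 100, 103 and 5000 us, the first pass sees a
 * huge standard deviation because of the 5000 us outlier.  Setting thresh
 * to 4999 discards it; the second pass over the remaining seven samples
 * (7 * 4 >= 8 * 3, so still enough of them) gives an average of about
 * 100 us with a tiny deviation, which becomes predicted_us provided no
 * timer fires sooner.
 */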

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
        int i;
        unsigned int interactivity_req;
        unsigned long nr_iowaiters, cpu_load;

        if (data->needs_update) {
                menu_update(drv, dev);
                data->needs_update = 0;
        }

        data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;

        /* Special case when user has set very strict latency requirement */
        if (unlikely(latency_req == 0))
                return 0;

        /* determine the expected residency time, round up */
        data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());

        get_iowait_load(&nr_iowaiters, &cpu_load);
        data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);

        /*
         * Force the result of multiplication to be 64 bits even if both
         * operands are 32 bits.
         * Make sure to round up for half microseconds.
         */
        data->predicted_us = div_round64((uint64_t)data->next_timer_us *
                                         data->correction_factor[data->bucket],
                                         RESOLUTION * DECAY);

        get_typical_interval(data);

        /*
         * Performance multiplier defines a minimum predicted idle
         * duration / latency ratio. Adjust the latency limit if
         * necessary.
         */
        interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
        if (latency_req > interactivity_req)
                latency_req = interactivity_req;

        /*
         * We want to default to C1 (hlt), not to busy polling
         * unless the timer is happening really really soon.
         */
        if (data->next_timer_us > 5 &&
            !drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
            dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
                data->last_state_idx = CPUIDLE_DRIVER_STATE_START;

        /*
         * Find the idle state with the lowest power while satisfying
         * our constraints.
         */
        for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
                struct cpuidle_state *s = &drv->states[i];
                struct cpuidle_state_usage *su = &dev->states_usage[i];

                if (s->disabled || su->disable)
                        continue;
                if (s->target_residency > data->predicted_us)
                        continue;
                if (s->exit_latency > latency_req)
                        continue;

                data->last_state_idx = i;
        }

        return data->last_state_idx;
}

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 *       the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        data->last_state_idx = index;
        if (index >= 0)
                data->needs_update = 1;
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        int last_idx = data->last_state_idx;
        struct cpuidle_state *target = &drv->states[last_idx];
        unsigned int measured_us;
        unsigned int new_factor;

        /*
         * Try to figure out how much time passed between entry to low
         * power state and occurrence of the wakeup event.
         *
         * If the entered idle state didn't support residency measurements,
         * we use them anyway if they are short, and if long,
         * truncate to the whole expected time.
         *
         * Any measured amount of time will include the exit latency.
         * Since we are interested in when the wakeup began, not when it
         * was completed, we must subtract the exit latency. However, if
         * the measured amount of time is less than the exit latency,
         * assume the state was never reached and the exit latency is 0.
         */

        /* measured value */
        measured_us = cpuidle_get_last_residency(dev);

        /* Deduct exit latency */
        if (measured_us > target->exit_latency)
                measured_us -= target->exit_latency;

        /* Make sure our coefficients do not exceed unity */
        if (measured_us > data->next_timer_us)
                measured_us = data->next_timer_us;

        /* Update our correction ratio */
        new_factor = data->correction_factor[data->bucket];
        new_factor -= new_factor / DECAY;

        if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
                new_factor += RESOLUTION * measured_us / data->next_timer_us;
        else
                /*
                 * we were idle so long that we count it as a perfect
                 * prediction
                 */
                new_factor += RESOLUTION;

        /*
         * We don't want 0 as factor; we always want at least
         * a tiny bit of estimated time. Fortunately, due to rounding,
         * new_factor will stay nonzero regardless of measured_us values
         * and the compiler can eliminate this test as long as DECAY > 1.
         */
        if (DECAY == 1 && unlikely(new_factor == 0))
                new_factor = 1;

        data->correction_factor[data->bucket] = new_factor;

        /* update the repeating-pattern data */
        data->intervals[data->interval_ptr++] = measured_us;
        if (data->interval_ptr >= INTERVALS)
                data->interval_ptr = 0;
}
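
/*
 * Illustrative example (hypothetical numbers): starting from the unity
 * factor of 8192, if the CPU keeps waking up after half of the predicted
 * timer interval, each update computes
 *
 *     new_factor = 8192 - 8192 / 8 + 1024 * measured_us / next_timer_us
 *                = 8192 - 1024 + 512 = 7680
 *
 * and repeated updates converge on 4096, i.e. a correction factor of 0.5.
 */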

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
                                struct cpuidle_device *dev)
{
        struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
        int i;

        memset(data, 0, sizeof(struct menu_device));

        /*
         * if the correction factor is 0 (e.g. first-time init or cpu hotplug
         * etc.), we actually want to start out with a unity factor.
         */
        for (i = 0; i < BUCKETS; i++)
                data->correction_factor[i] = RESOLUTION * DECAY;

        return 0;
}

static struct cpuidle_governor menu_governor = {
        .name =         "menu",
        .rating =       20,
        .enable =       menu_enable_device,
        .select =       menu_select,
        .reflect =      menu_reflect,
        .owner =        THIS_MODULE,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
        return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);