linux/drivers/cpuidle/governors/menu.c
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 *
 * This code is licensed under the GPL version 2 as described
 * in the COPYING file that accompanies the Linux Kernel.
 */
  12
  13#include <linux/kernel.h>
  14#include <linux/cpuidle.h>
  15#include <linux/pm_qos.h>
  16#include <linux/time.h>
  17#include <linux/ktime.h>
  18#include <linux/hrtimer.h>
  19#include <linux/tick.h>
  20#include <linux/sched.h>
  21#include <linux/math64.h>
  22#include <linux/module.h>
  23
/*
 * Please note when changing the tuning values:
 * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the multiplication in
 * a scaling operation may overflow on 32-bit platforms.
 * In that case, #define RESOLUTION as ULL to get a 64-bit result:
 * #define RESOLUTION 1024ULL
 *
 * The default values do not overflow.
 */
#define BUCKETS 12
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING 50000


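/*
 * Sanity check on the defaults (illustrative arithmetic, not from the
 * original source): (MAX_INTERESTING - 1) * RESOLUTION = 49999 * 1024,
 * roughly 5.1e7, which is well below UINT_MAX (about 4.29e9), so the plain
 * 32-bit RESOLUTION is safe with these values.
 */
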
/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (from PM QoS infrastructure)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor based on historic behavior
 * is applied to the estimate. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, but it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time, the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time. A second independent factor with a big impact
 * on the actual ratio is whether there is (disk) IO outstanding or not.
 * (As a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this.)
 *
 * For these two reasons we keep an array of 12 independent factors, which is
 * indexed based on the magnitude of the expected duration as well as the
 * "is IO outstanding" property.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable predictor:
 * those cases where the interval is fixed, for example due to hardware
 * interrupt mitigation, but also due to fixed transfer rate devices such as
 * mice.
 * For this, we use a different predictor: we track the duration of the last 8
 * intervals and, if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as the prediction.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less C-state impact is acceptable
 *
 * This rule of thumb is implemented using a performance multiplier:
 * if the exit latency times the performance multiplier is longer than
 * the predicted duration, the C state is not considered a candidate
 * for selection due to a too high performance impact. So the higher
 * this multiplier is, the longer we need to be idle to pick a deep C
 * state, and thus the less likely a busy CPU will hit such a deep
 * C state.
 *
 * Two factors are used in determining this multiplier:
 * 20 points are added for each point of "per cpu load average" we have,
 * and 10 points are added for each process that is waiting for
 * IO on this CPU.
 * (These values are experimentally determined; see get_loadavg() and
 * performance_multiplier() below for the exact arithmetic.)
 *
 * The load average factor gives a longer term (a few seconds) input to the
 * decision, while the iowait value gives a CPU-local, instantaneous input.
 * The iowait factor may look low, but realize that this is also already
 * represented in the system load average.
 *
 */
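
/*
 * Illustration of the prediction (hypothetical numbers): suppose the next
 * timer is 1000 us away, but past wakeups in the current bucket arrived
 * after roughly half that time.  The stored correction factor then hovers
 * around 0.5 in fixed point, i.e. about (RESOLUTION * DECAY) / 2 = 4096,
 * and the prediction becomes 1000 * 4096 / (1024 * 8) = 500 us, which is
 * what gets compared against each state's target_residency.
 */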

struct menu_device {
        int             last_state_idx;         /* state selected/entered last time */
        int             needs_update;           /* run menu_update() before next select */

        unsigned int    next_timer_us;          /* next timer distance seen at select time */
        unsigned int    predicted_us;           /* predicted idle duration */
        unsigned int    bucket;                 /* correction bucket used for the prediction */
        unsigned int    correction_factor[BUCKETS];     /* per-bucket ratio; RESOLUTION * DECAY == 1.0 */
        unsigned int    intervals[INTERVALS];   /* last measured idle durations, in us */
        int             interval_ptr;           /* write index into intervals[] */
};


#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

static inline int get_loadavg(unsigned long load)
{
        return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;
}
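
/*
 * Worked example (hypothetical load value): a per-cpu load of 1.50 stored in
 * FIXED_1 fixed point gives LOAD_INT() = 1 and LOAD_FRAC() = 50, so
 * get_loadavg() returns 1 * 10 + 50 / 10 = 15, i.e. the load expressed in
 * tenths.
 */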

static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters)
{
        int bucket = 0;

        /*
         * We keep two groups of stats; one with IO pending,
         * one without.
         * This allows us to calculate
         * E(duration)|iowait
         */
        if (nr_iowaiters)
                bucket = BUCKETS/2;

        if (duration < 10)
                return bucket;
        if (duration < 100)
                return bucket + 1;
        if (duration < 1000)
                return bucket + 2;
        if (duration < 10000)
                return bucket + 3;
        if (duration < 100000)
                return bucket + 4;
        return bucket + 5;
}
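
/*
 * For example (hypothetical inputs): an expected duration of 250 us with at
 * least one iowaiter lands in bucket BUCKETS/2 + 2 = 8, while the same
 * duration with no IO pending lands in bucket 2.  Each bucket keeps its own
 * correction factor, so the two situations are tracked independently.
 */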

/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned long load)
{
        int mult = 1;

        /* for higher loadavg, we are more reluctant */

        mult += 2 * get_loadavg(load);

        /* for IO wait tasks (per cpu!) we add 10 points each */
        mult += 10 * nr_iowaiters;

        return mult;
}
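
/*
 * Rough illustration (made-up numbers): with a per-cpu load of 1.50 and two
 * tasks in iowait, mult = 1 + 2 * 15 + 10 * 2 = 51.  A predicted idle time
 * of 1000 us then tolerates an exit latency of only 1000 / 51 = 19 us, so
 * deeper C states are skipped while the CPU looks busy.
 */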

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
static unsigned int get_typical_interval(struct menu_device *data)
{
        int i, divisor;
        unsigned int max, thresh, avg;
        uint64_t sum, variance;

        thresh = UINT_MAX; /* Discard outliers above this value */

again:

        /* First calculate the average of past intervals */
        max = 0;
        sum = 0;
        divisor = 0;
        for (i = 0; i < INTERVALS; i++) {
                unsigned int value = data->intervals[i];
                if (value <= thresh) {
                        sum += value;
                        divisor++;
                        if (value > max)
                                max = value;
                }
        }
        if (divisor == INTERVALS)
                avg = sum >> INTERVAL_SHIFT;
        else
                avg = div_u64(sum, divisor);

        /* Then try to determine variance */
        variance = 0;
        for (i = 0; i < INTERVALS; i++) {
                unsigned int value = data->intervals[i];
                if (value <= thresh) {
                        int64_t diff = (int64_t)value - avg;
                        variance += diff * diff;
                }
        }
        if (divisor == INTERVALS)
                variance >>= INTERVAL_SHIFT;
        else
                do_div(variance, divisor);

        /*
         * The typical interval is obtained when standard deviation is
         * small (stddev <= 20 us, variance <= 400 us^2) or standard
         * deviation is small compared to the average interval (avg >
         * 6*stddev, avg^2 > 36*variance). The average is smaller than
         * UINT_MAX aka U32_MAX, so computing its square does not
         * overflow a u64. We simply reject this candidate average if
         * the standard deviation is greater than 715 s (which is
         * rather unlikely).
         *
         * Use this result only if there is no timer to wake us up sooner.
         */
        if (likely(variance <= U64_MAX/36)) {
                if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
                                                        || variance <= 400) {
                        return avg;
                }
        }

        /*
         * If we have outliers to the upside in our distribution, discard
         * those by setting the threshold to exclude these outliers, then
         * calculate the average and standard deviation again. Once we get
         * down to the bottom 3/4 of our samples, stop excluding samples.
         *
         * This can deal with workloads that have long pauses interspersed
         * with sporadic activity with a bunch of short pauses.
         */
        if ((divisor * 4) <= INTERVALS * 3)
                return UINT_MAX;

        thresh = max - 1;
        goto again;
}
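
/*
 * Hypothetical walk-through: with intervals of {100, 98, 103, 101, 99, 102,
 * 100, 9000} us, the 9000 us outlier makes the variance far too large on the
 * first pass.  The threshold is then lowered to 8999, the outlier is dropped,
 * and the remaining 7 samples (still at least 3/4 of INTERVALS) average to
 * about 100 us with a tiny variance, so ~100 us is returned as the typical
 * interval.
 */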

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
        int i;
        unsigned int interactivity_req;
        unsigned int expected_interval;
        unsigned long nr_iowaiters, cpu_load;

        if (data->needs_update) {
                menu_update(drv, dev);
                data->needs_update = 0;
        }

        /* Special case when user has set very strict latency requirement */
        if (unlikely(latency_req == 0))
                return 0;

        /* determine the expected residency time, round up */
        data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());

        get_iowait_load(&nr_iowaiters, &cpu_load);
        data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);

        /*
         * Force the result of multiplication to be 64 bits even if both
         * operands are 32 bits.
         * Make sure to round up for half microseconds.
         */
        data->predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us *
                                         data->correction_factor[data->bucket],
                                         RESOLUTION * DECAY);

        expected_interval = get_typical_interval(data);
        expected_interval = min(expected_interval, data->next_timer_us);

        if (CPUIDLE_DRIVER_STATE_START > 0) {
                struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START];
                unsigned int polling_threshold;

                /*
                 * We want to default to C1 (hlt), not to busy polling
                 * unless the timer is happening really really soon, or
                 * C1's exit latency exceeds the user configured limit.
                 */
                polling_threshold = max_t(unsigned int, 20, s->target_residency);
                if (data->next_timer_us > polling_threshold &&
                    latency_req > s->exit_latency && !s->disabled &&
                    !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable)
                        data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
                else
                        data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
        } else {
                data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
        }

        /*
         * Use the lowest expected idle interval to pick the idle state.
         */
        data->predicted_us = min(data->predicted_us, expected_interval);

        /*
         * Use the performance multiplier and the user-configurable
         * latency_req to determine the maximum exit latency.
         */
        interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
        if (latency_req > interactivity_req)
                latency_req = interactivity_req;

        /*
         * Find the idle state with the lowest power while satisfying
         * our constraints.
         */
        for (i = data->last_state_idx + 1; i < drv->state_count; i++) {
                struct cpuidle_state *s = &drv->states[i];
                struct cpuidle_state_usage *su = &dev->states_usage[i];

                if (s->disabled || su->disable)
                        continue;
                if (s->target_residency > data->predicted_us)
                        continue;
                if (s->exit_latency > latency_req)
                        continue;

                data->last_state_idx = i;
        }

        return data->last_state_idx;
}
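
/*
 * Selection example (made-up state table, for illustration only): assume
 * predicted_us ends up at 500 and latency_req at 30 us.  A state with
 * target_residency = 100 and exit_latency = 10 qualifies, one with
 * target_residency = 800 is skipped on residency, and one with
 * exit_latency = 80 is skipped on latency; the loop keeps the deepest
 * state that passed all the checks.
 */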

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 *       the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);

        data->last_state_idx = index;
        data->needs_update = 1;
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        int last_idx = data->last_state_idx;
        struct cpuidle_state *target = &drv->states[last_idx];
        unsigned int measured_us;
        unsigned int new_factor;

        /*
         * Try to figure out how much time passed between entry to low
         * power state and occurrence of the wakeup event.
         *
         * If the entered idle state didn't support residency measurements,
         * we use them anyway if they are short, and if long,
         * truncate to the whole expected time.
         *
         * Any measured amount of time will include the exit latency.
         * Since we are interested in when the wakeup began, not when it
         * was completed, we must subtract the exit latency. However, if
         * the measured amount of time is less than the exit latency,
         * assume the state was never reached and the exit latency is 0.
         */

        /* measured value */
        measured_us = cpuidle_get_last_residency(dev);

        /* Deduct exit latency */
        if (measured_us > 2 * target->exit_latency)
                measured_us -= target->exit_latency;
        else
                measured_us /= 2;

        /* Make sure our coefficients do not exceed unity */
        if (measured_us > data->next_timer_us)
                measured_us = data->next_timer_us;

        /* Update our correction ratio */
        new_factor = data->correction_factor[data->bucket];
        new_factor -= new_factor / DECAY;

        if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
                new_factor += RESOLUTION * measured_us / data->next_timer_us;
        else
                /*
                 * we were idle so long that we count it as a perfect
                 * prediction
                 */
                new_factor += RESOLUTION;

        /*
         * We don't want 0 as factor; we always want at least
         * a tiny bit of estimated time. Fortunately, due to rounding,
         * new_factor will stay nonzero regardless of measured_us values
         * and the compiler can eliminate this test as long as DECAY > 1.
         */
        if (DECAY == 1 && unlikely(new_factor == 0))
                new_factor = 1;

        data->correction_factor[data->bucket] = new_factor;

        /* update the repeating-pattern data */
        data->intervals[data->interval_ptr++] = measured_us;
        if (data->interval_ptr >= INTERVALS)
                data->interval_ptr = 0;
}
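
/*
 * Numerical sketch (invented values): starting from the unity factor of
 * RESOLUTION * DECAY = 8192, a sleep that was predicted by a 1000 us timer
 * but measured at 300 us gives
 * new_factor = 8192 - 8192 / 8 + 1024 * 300 / 1000 = 7475,
 * so the next prediction in this bucket is scaled to roughly 91% of the
 * timer distance (7475 / 8192).
 */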

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
                                struct cpuidle_device *dev)
{
        struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
        int i;

        memset(data, 0, sizeof(struct menu_device));

        /*
         * If the correction factor is 0 (e.g. first-time init or CPU
         * hotplug), we actually want to start out with a unity factor.
         */
        for (i = 0; i < BUCKETS; i++)
                data->correction_factor[i] = RESOLUTION * DECAY;

        return 0;
}

static struct cpuidle_governor menu_governor = {
        .name =         "menu",
        .rating =       20,
        .enable =       menu_enable_device,
        .select =       menu_select,
        .reflect =      menu_reflect,
        .owner =        THIS_MODULE,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
        return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);