linux/drivers/cpufreq/cpufreq_conservative.c
/*
 *  drivers/cpufreq/cpufreq_conservative.c
 *
 *  Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *            (C)  2009 Alexander Clouter <alex@digriz.org.uk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/*
 * dbs is used in this file as shorthand for demand-based switching.
 * It helps to keep variable names smaller and simpler.
 */

#define DEF_FREQUENCY_UP_THRESHOLD              (80)
#define DEF_FREQUENCY_DOWN_THRESHOLD            (20)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. Default polling frequency is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with CPUFREQ_ETERNAL)
 * this governor will not work.
 * All times here are in us.
 */
#define MIN_SAMPLING_RATE_RATIO                 (2)

static unsigned int min_sampling_rate;

#define LATENCY_MULTIPLIER                      (1000)
#define MIN_LATENCY_MULTIPLIER                  (100)
#define DEF_SAMPLING_DOWN_FACTOR                (1)
#define MAX_SAMPLING_DOWN_FACTOR                (10)
#define TRANSITION_LATENCY_LIMIT                (10 * 1000 * 1000)

static void do_dbs_timer(struct work_struct *work);

struct cpu_dbs_info_s {
        cputime64_t prev_cpu_idle;
        cputime64_t prev_cpu_wall;
        cputime64_t prev_cpu_nice;
        struct cpufreq_policy *cur_policy;
        struct delayed_work work;
        unsigned int down_skip;
        unsigned int requested_freq;
        int cpu;
        unsigned int enable:1;
        /*
         * per-CPU mutex that serializes governor limit changes with
         * do_dbs_timer invocation. We do not want do_dbs_timer to run
         * when the user is changing the governor or limits.
         */
        struct mutex timer_mutex;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);

static unsigned int dbs_enable; /* number of CPUs using this policy */

/*
 * dbs_mutex protects dbs_enable in governor start/stop.
 */
static DEFINE_MUTEX(dbs_mutex);

static struct dbs_tuners {
        unsigned int sampling_rate;
        unsigned int sampling_down_factor;
        unsigned int up_threshold;
        unsigned int down_threshold;
        unsigned int ignore_nice;
        unsigned int freq_step;
} dbs_tuners_ins = {
        .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
        .down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD,
        .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR,
        .ignore_nice = 0,
        .freq_step = 5,
};
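
/*
 * Note: sampling_rate is deliberately left unset here; it is derived from
 * the CPU's transition latency in cpufreq_governor_dbs() the first time
 * the governor is started.  The remaining defaults mean: scale up when
 * load exceeds up_threshold (80%), scale down when load falls well below
 * down_threshold (20%), moving in steps of freq_step (5%) of the maximum
 * frequency.
 */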

static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
                                                        cputime64_t *wall)
{
        cputime64_t idle_time;
        cputime64_t cur_wall_time;
        cputime64_t busy_time;

        cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
        busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
                        kstat_cpu(cpu).cpustat.system);

        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
        busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);

        idle_time = cputime64_sub(cur_wall_time, busy_time);
        if (wall)
                *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);

        return (cputime64_t)jiffies_to_usecs(idle_time);
}

static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
        u64 idle_time = get_cpu_idle_time_us(cpu, wall);

        if (idle_time == -1ULL)
                return get_cpu_idle_time_jiffy(cpu, wall);

        return idle_time;
}
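
/*
 * get_cpu_idle_time_us() only reports idle time when the kernel keeps
 * fine-grained NO_HZ idle statistics; it returns -1ULL otherwise, in
 * which case we fall back to the coarser jiffy/cpustat accounting above.
 */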

/* keep track of frequency transitions */
static int
dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                     void *data)
{
        struct cpufreq_freqs *freq = data;
        struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
                                                        freq->cpu);

        struct cpufreq_policy *policy;

        if (!this_dbs_info->enable)
                return 0;

        policy = this_dbs_info->cur_policy;

        /*
         * we only care if our internally tracked freq moves outside
         * the 'valid' range of frequencies available to us, otherwise
         * we do not change it
         */
        if (this_dbs_info->requested_freq > policy->max
                        || this_dbs_info->requested_freq < policy->min)
                this_dbs_info->requested_freq = freq->new;

        return 0;
}
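
/*
 * Example: if something outside this governor (say, a lowered policy->max)
 * leaves requested_freq outside the valid range, the next transition
 * notification snaps requested_freq back to the frequency the driver
 * actually set, so later +/- freq_step adjustments start from reality.
 */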

static struct notifier_block dbs_cpufreq_notifier_block = {
        .notifier_call = dbs_cpufreq_notifier
};

/************************** sysfs interface ************************/
static ssize_t show_sampling_rate_min(struct kobject *kobj,
                                      struct attribute *attr, char *buf)
{
        return sprintf(buf, "%u\n", min_sampling_rate);
}

define_one_global_ro(sampling_rate_min);

/* cpufreq_conservative Governor Tunables */
#define show_one(file_name, object)                                     \
static ssize_t show_##file_name                                         \
(struct kobject *kobj, struct attribute *attr, char *buf)               \
{                                                                       \
        return sprintf(buf, "%u\n", dbs_tuners_ins.object);             \
}
show_one(sampling_rate, sampling_rate);
show_one(sampling_down_factor, sampling_down_factor);
show_one(up_threshold, up_threshold);
show_one(down_threshold, down_threshold);
show_one(ignore_nice_load, ignore_nice);
show_one(freq_step, freq_step);

static ssize_t store_sampling_down_factor(struct kobject *a,
                                          struct attribute *b,
                                          const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
                return -EINVAL;

        dbs_tuners_ins.sampling_down_factor = input;
        return count;
}

static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
                                   const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1)
                return -EINVAL;

        dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
        return count;
}

static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
                                  const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1 || input > 100 ||
                        input <= dbs_tuners_ins.down_threshold)
                return -EINVAL;

        dbs_tuners_ins.up_threshold = input;
        return count;
}

static ssize_t store_down_threshold(struct kobject *a, struct attribute *b,
                                    const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        /*
         * cannot be lower than 11, otherwise the frequency will never fall:
         * dbs_check_cpu() only scales down when load < down_threshold - 10
         */
        if (ret != 1 || input < 11 || input > 100 ||
                        input >= dbs_tuners_ins.up_threshold)
                return -EINVAL;

        dbs_tuners_ins.down_threshold = input;
        return count;
}

static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
                                      const char *buf, size_t count)
{
        unsigned int input;
        int ret;

        unsigned int j;

        ret = sscanf(buf, "%u", &input);
        if (ret != 1)
                return -EINVAL;

        if (input > 1)
                input = 1;

        if (input == dbs_tuners_ins.ignore_nice) /* nothing to do */
                return count;

        dbs_tuners_ins.ignore_nice = input;

        /* we need to re-evaluate prev_cpu_idle */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
                dbs_info = &per_cpu(cs_cpu_dbs_info, j);
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
                        dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
        }
        return count;
}

static ssize_t store_freq_step(struct kobject *a, struct attribute *b,
                               const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1)
                return -EINVAL;

        if (input > 100)
                input = 100;

        /*
         * no need to reject zero here: the user might actually want a
         * freq_step of 0 to pin the current frequency, odd as that is
         */
        dbs_tuners_ins.freq_step = input;
        return count;
}

define_one_global_rw(sampling_rate);
define_one_global_rw(sampling_down_factor);
define_one_global_rw(up_threshold);
define_one_global_rw(down_threshold);
define_one_global_rw(ignore_nice_load);
define_one_global_rw(freq_step);

static struct attribute *dbs_attributes[] = {
        &sampling_rate_min.attr,
        &sampling_rate.attr,
        &sampling_down_factor.attr,
        &up_threshold.attr,
        &down_threshold.attr,
        &ignore_nice_load.attr,
        &freq_step.attr,
        NULL
};

static struct attribute_group dbs_attr_group = {
        .attrs = dbs_attributes,
        .name = "conservative",
};

/************************** sysfs end ************************/
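
/*
 * With this governor active on at least one CPU, the tunables above are
 * exposed under /sys/devices/system/cpu/cpufreq/conservative/.  For
 * example, writing 20 to freq_step makes each scaling step 20% of
 * policy->max instead of the default 5%, and writing 1 to
 * ignore_nice_load makes time spent in niced tasks count as idle.
 */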

static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
{
        unsigned int load = 0;
        unsigned int max_load = 0;
        unsigned int freq_target;

        struct cpufreq_policy *policy;
        unsigned int j;

        policy = this_dbs_info->cur_policy;

        /*
         * Every sampling_rate we look at the load of each CPU in the
         * policy.  If the highest load is above up_threshold (80% by
         * default) we try to increase the frequency; if it drops below
         * down_threshold - 10 (i.e. below 10% with the default of 20)
         * we try to decrease it.
         *
         * Unlike ondemand, both increases and decreases happen in steps
         * of freq_step (5% by default) of the maximum frequency.
         */

        /* Get Absolute Load */
        for_each_cpu(j, policy->cpus) {
                struct cpu_dbs_info_s *j_dbs_info;
                cputime64_t cur_wall_time, cur_idle_time;
                unsigned int idle_time, wall_time;

                j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);

                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);

                wall_time = (unsigned int) cputime64_sub(cur_wall_time,
                                j_dbs_info->prev_cpu_wall);
                j_dbs_info->prev_cpu_wall = cur_wall_time;

                idle_time = (unsigned int) cputime64_sub(cur_idle_time,
                                j_dbs_info->prev_cpu_idle);
                j_dbs_info->prev_cpu_idle = cur_idle_time;

                if (dbs_tuners_ins.ignore_nice) {
                        cputime64_t cur_nice;
                        unsigned long cur_nice_jiffies;

                        cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
                                         j_dbs_info->prev_cpu_nice);
                        /*
                         * Assumption: nice time between sampling periods will
                         * be less than 2^32 jiffies on a 32-bit system
                         */
                        cur_nice_jiffies = (unsigned long)
                                        cputime64_to_jiffies64(cur_nice);

                        j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
                        idle_time += jiffies_to_usecs(cur_nice_jiffies);
                }

                if (unlikely(!wall_time || wall_time < idle_time))
                        continue;

                load = 100 * (wall_time - idle_time) / wall_time;

                if (load > max_load)
                        max_load = load;
        }
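
        /*
         * Worked example (illustrative numbers): if 100000 us of wall time
         * passed since the last sample and the CPU was idle for 15000 us
         * of it, load = 100 * (100000 - 15000) / 100000 = 85.  With the
         * default up_threshold of 80, that CPU alone pushes the policy up
         * one step.
         */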

        /*
         * break out if we 'cannot' reduce the speed as the user might
         * want freq_step to be zero
         */
        if (dbs_tuners_ins.freq_step == 0)
                return;

        /* Check for frequency increase */
        if (max_load > dbs_tuners_ins.up_threshold) {
                this_dbs_info->down_skip = 0;

                /* if we are already at full speed then break out early */
                if (this_dbs_info->requested_freq == policy->max)
                        return;

                freq_target = (dbs_tuners_ins.freq_step * policy->max) / 100;

                /* max freq cannot be less than 100. But who knows.... */
                if (unlikely(freq_target == 0))
                        freq_target = 5;

                this_dbs_info->requested_freq += freq_target;
                if (this_dbs_info->requested_freq > policy->max)
                        this_dbs_info->requested_freq = policy->max;

                __cpufreq_driver_target(policy, this_dbs_info->requested_freq,
                        CPUFREQ_RELATION_H);
                return;
        }
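
        /*
         * Example with illustrative numbers: for policy->max = 2000000 kHz
         * and the default freq_step of 5, freq_target is 100000 kHz, so a
         * busy CPU climbs towards policy->max in 100 MHz increments, one
         * per sample, rather than jumping straight to the maximum.
         */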

        /*
         * The optimal frequency is the lowest frequency that can support
         * the current CPU usage without triggering the up policy. To be
         * safe, we stay 10 points under the threshold.
         */
        if (max_load < (dbs_tuners_ins.down_threshold - 10)) {
                freq_target = (dbs_tuners_ins.freq_step * policy->max) / 100;

                this_dbs_info->requested_freq -= freq_target;
                if (this_dbs_info->requested_freq < policy->min)
                        this_dbs_info->requested_freq = policy->min;

                /*
                 * if we cannot reduce the frequency anymore, break out early
                 */
                if (policy->cur == policy->min)
                        return;

                __cpufreq_driver_target(policy, this_dbs_info->requested_freq,
                                CPUFREQ_RELATION_H);
                return;
        }
}

static void do_dbs_timer(struct work_struct *work)
{
        struct cpu_dbs_info_s *dbs_info =
                container_of(work, struct cpu_dbs_info_s, work.work);
        unsigned int cpu = dbs_info->cpu;

        /* We want all CPUs to do sampling nearly on the same jiffy */
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

        delay -= jiffies % delay;

        mutex_lock(&dbs_info->timer_mutex);

        dbs_check_cpu(dbs_info);

        schedule_delayed_work_on(cpu, &dbs_info->work, delay);
        mutex_unlock(&dbs_info->timer_mutex);
}
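
/*
 * The delay arithmetic above aligns sampling across CPUs: with, say,
 * HZ = 1000 and sampling_rate = 20000 us, delay starts at 20 jiffies,
 * and subtracting jiffies % delay makes each CPU's next expiry land
 * near the same 20-jiffy boundary.
 */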

static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
{
        /* We want all CPUs to do sampling nearly on the same jiffy */
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
        delay -= jiffies % delay;

        dbs_info->enable = 1;
        INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
        schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, delay);
}

static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
        dbs_info->enable = 0;
        cancel_delayed_work_sync(&dbs_info->work);
}

static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                                   unsigned int event)
{
        unsigned int cpu = policy->cpu;
        struct cpu_dbs_info_s *this_dbs_info;
        unsigned int j;
        int rc;

        this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);

        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) || (!policy->cur))
                        return -EINVAL;

                mutex_lock(&dbs_mutex);

                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
                        j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;

                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &j_dbs_info->prev_cpu_wall);
                        if (dbs_tuners_ins.ignore_nice) {
                                j_dbs_info->prev_cpu_nice =
                                                kstat_cpu(j).cpustat.nice;
                        }
                }
                this_dbs_info->down_skip = 0;
                this_dbs_info->requested_freq = policy->cur;

                mutex_init(&this_dbs_info->timer_mutex);
                dbs_enable++;
                /*
                 * One-time setup (sysfs group, sampling rates, transition
                 * notifier) when this governor is started for the first time
                 */
                if (dbs_enable == 1) {
                        unsigned int latency;
                        /* policy latency is in ns. Convert it to us first */
                        latency = policy->cpuinfo.transition_latency / 1000;
                        if (latency == 0)
                                latency = 1;

                        rc = sysfs_create_group(cpufreq_global_kobject,
                                                &dbs_attr_group);
                        if (rc) {
                                mutex_unlock(&dbs_mutex);
                                return rc;
                        }

                        /*
                         * conservative does not implement micro-accounting of
                         * idle time like the ondemand governor, thus we are
                         * bound to jiffies/HZ
                         */
                        min_sampling_rate =
                                MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10);
                        /* Bring kernel and HW constraints together */
                        min_sampling_rate = max(min_sampling_rate,
                                        MIN_LATENCY_MULTIPLIER * latency);
                        dbs_tuners_ins.sampling_rate =
                                max(min_sampling_rate,
                                    latency * LATENCY_MULTIPLIER);
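
                        /*
                         * Illustrative numbers: with HZ = 250 and a driver
                         * transition latency of 20000 ns, latency = 20 us,
                         * min_sampling_rate = 2 * jiffies_to_usecs(10)
                         * = 80000 us, and the default sampling_rate becomes
                         * max(80000, 20 * 1000) = 80000 us, i.e. 80 ms
                         * between samples.
                         */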

                        cpufreq_register_notifier(
                                        &dbs_cpufreq_notifier_block,
                                        CPUFREQ_TRANSITION_NOTIFIER);
                }
                mutex_unlock(&dbs_mutex);

                dbs_timer_init(this_dbs_info);

                break;

        case CPUFREQ_GOV_STOP:
                dbs_timer_exit(this_dbs_info);

                mutex_lock(&dbs_mutex);
                dbs_enable--;
                mutex_destroy(&this_dbs_info->timer_mutex);

                /*
                 * Unregister the transition notifier when the last CPU
                 * using this governor is stopped
                 */
                if (dbs_enable == 0)
                        cpufreq_unregister_notifier(
                                        &dbs_cpufreq_notifier_block,
                                        CPUFREQ_TRANSITION_NOTIFIER);

                mutex_unlock(&dbs_mutex);
                if (!dbs_enable)
                        sysfs_remove_group(cpufreq_global_kobject,
                                           &dbs_attr_group);

                break;

        case CPUFREQ_GOV_LIMITS:
                mutex_lock(&this_dbs_info->timer_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                                        policy->max, CPUFREQ_RELATION_H);
                else if (policy->min > this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                                        policy->min, CPUFREQ_RELATION_L);
                mutex_unlock(&this_dbs_info->timer_mutex);

                break;
        }
        return 0;
}

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
static
#endif
struct cpufreq_governor cpufreq_gov_conservative = {
        .name                   = "conservative",
        .governor               = cpufreq_governor_dbs,
        .max_transition_latency = TRANSITION_LATENCY_LIMIT,
        .owner                  = THIS_MODULE,
};

static int __init cpufreq_gov_dbs_init(void)
{
        return cpufreq_register_governor(&cpufreq_gov_conservative);
}

static void __exit cpufreq_gov_dbs_exit(void)
{
        cpufreq_unregister_governor(&cpufreq_gov_conservative);
}


MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>");
MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for "
                "Low Latency Frequency Transition capable processors "
                "optimised for use in a battery environment");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
#endif
module_exit(cpufreq_gov_dbs_exit);
