linux/kernel/sched/cpufreq_schedutil.c
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cpufreq.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>

#include "sched.h"

#define SUGOV_KTHREAD_PRIORITY  50

struct sugov_tunables {
        struct gov_attr_set attr_set;
        unsigned int rate_limit_us;
};

struct sugov_policy {
        struct cpufreq_policy *policy;

        struct sugov_tunables *tunables;
        struct list_head tunables_hook;

        raw_spinlock_t update_lock;  /* For shared policies */
        u64 last_freq_update_time;
        s64 freq_update_delay_ns;
        unsigned int next_freq;
        unsigned int cached_raw_freq;

        /* The next fields are only needed if fast switch cannot be used. */
        struct irq_work irq_work;
        struct kthread_work work;
        struct mutex work_lock;
        struct kthread_worker worker;
        struct task_struct *thread;
        bool work_in_progress;

        bool need_freq_update;
};

struct sugov_cpu {
        struct update_util_data update_util;
        struct sugov_policy *sg_policy;

        unsigned long iowait_boost;
        unsigned long iowait_boost_max;
        u64 last_update;

        /* The fields below are only needed when sharing a policy. */
        unsigned long util;
        unsigned long max;
        unsigned int flags;

        /* The field below is for single-CPU policies only. */
#ifdef CONFIG_NO_HZ_COMMON
        unsigned long saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
        s64 delta_ns;

        if (sg_policy->work_in_progress)
                return false;

        if (unlikely(sg_policy->need_freq_update)) {
                sg_policy->need_freq_update = false;
                /*
                 * This happens when limits change, so forget the previous
                 * next_freq value and force an update.
                 */
                sg_policy->next_freq = UINT_MAX;
                return true;
        }

        delta_ns = time - sg_policy->last_freq_update_time;
        return delta_ns >= sg_policy->freq_update_delay_ns;
}

static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
                                unsigned int next_freq)
{
        struct cpufreq_policy *policy = sg_policy->policy;

        if (sg_policy->next_freq == next_freq)
                return;

        sg_policy->next_freq = next_freq;
        sg_policy->last_freq_update_time = time;

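        /*
         * With fast switching the frequency is changed directly from this
         * (scheduler) context; otherwise the change is deferred to the
         * governor kthread via irq_work.
         */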
        if (policy->fast_switch_enabled) {
                next_freq = cpufreq_driver_fast_switch(policy, next_freq);
                if (next_freq == CPUFREQ_ENTRY_INVALID)
                        return;

                policy->cur = next_freq;
                trace_cpu_frequency(next_freq, smp_processor_id());
        } else {
                sg_policy->work_in_progress = true;
                irq_work_queue(&sg_policy->irq_work);
        }
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than the
 * raw next_freq (as calculated above) is returned, subject to policy min/max
 * and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
                                  unsigned long util, unsigned long max)
{
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned int freq = arch_scale_freq_invariant() ?
                                policy->cpuinfo.max_freq : policy->cur;

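        /*
         * freq + (freq >> 2) == 1.25 * freq, i.e. C = 1.25 as described in the
         * comment above.  Illustrative numbers only: with freq = 2000000,
         * util = 512 and max = 1024 this gives 2500000 * 512 / 1024 = 1250000
         * (in the driver's frequency units) before being resolved below.
         */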
        freq = (freq + (freq >> 2)) * util / max;

        if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
                return sg_policy->next_freq;
        sg_policy->cached_raw_freq = freq;
        return cpufreq_driver_resolve_freq(policy, freq);
}

static void sugov_get_util(unsigned long *util, unsigned long *max)
{
        struct rq *rq = this_rq();
        unsigned long cfs_max;

        cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());

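        /* CFS utilization of the local runqueue, clamped to the CPU's capacity. */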
        *util = min(rq->cfs.avg.util_avg, cfs_max);
        *max = cfs_max;
}

static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
                                   unsigned int flags)
{
        if (flags & SCHED_CPUFREQ_IOWAIT) {
                sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
        } else if (sg_cpu->iowait_boost) {
                s64 delta_ns = time - sg_cpu->last_update;

                /* Clear iowait_boost if the CPU appears to have been idle. */
                if (delta_ns > TICK_NSEC)
                        sg_cpu->iowait_boost = 0;
        }
}

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
                               unsigned long *max)
{
        unsigned long boost_util = sg_cpu->iowait_boost;
        unsigned long boost_max = sg_cpu->iowait_boost_max;

        if (!boost_util)
                return;

        if (*util * boost_max < *max * boost_util) {
                *util = boost_util;
                *max = boost_max;
        }
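        /*
         * Decay the boost: it is halved on each use, so it dies out after a
         * few updates unless refreshed by another SCHED_CPUFREQ_IOWAIT flag.
         */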
        sg_cpu->iowait_boost >>= 1;
}

#ifdef CONFIG_NO_HZ_COMMON
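/*
 * The CPU is considered busy if the NOHZ idle-calls counter has not advanced
 * since the previous invocation, i.e. it has not entered the idle loop in
 * between.
 */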
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
        unsigned long idle_calls = tick_nohz_get_idle_calls();
        bool ret = idle_calls == sg_cpu->saved_idle_calls;

        sg_cpu->saved_idle_calls = idle_calls;
        return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

static void sugov_update_single(struct update_util_data *hook, u64 time,
                                unsigned int flags)
{
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned long util, max;
        unsigned int next_f;
        bool busy;

        sugov_set_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;

        if (!sugov_should_update_freq(sg_policy, time))
                return;

        busy = sugov_cpu_is_busy(sg_cpu);

        if (flags & SCHED_CPUFREQ_RT_DL) {
                next_f = policy->cpuinfo.max_freq;
        } else {
                sugov_get_util(&util, &max);
                sugov_iowait_boost(sg_cpu, &util, &max);
                next_f = get_next_freq(sg_policy, util, max);
                /*
                 * Do not reduce the frequency if the CPU has not been idle
                 * recently, as the reduction is likely to be premature then.
                 */
                if (busy && next_f < sg_policy->next_freq)
                        next_f = sg_policy->next_freq;
        }
        sugov_update_commit(sg_policy, time, next_f);
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned long util = 0, max = 1;
        unsigned int j;

        for_each_cpu(j, policy->cpus) {
                struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
                unsigned long j_util, j_max;
                s64 delta_ns;

                /*
                 * If this CPU's utilization was last updated more than
                 * TICK_NSEC ago, the CPU is probably idle now, so don't take
                 * it into account (and clear its iowait_boost).
                 */
                delta_ns = time - j_sg_cpu->last_update;
                if (delta_ns > TICK_NSEC) {
                        j_sg_cpu->iowait_boost = 0;
                        continue;
                }
                if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
                        return policy->cpuinfo.max_freq;

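                /*
                 * Track the CPU with the highest util/max ratio; the
                 * cross-multiplication avoids a division here.
                 */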
                j_util = j_sg_cpu->util;
                j_max = j_sg_cpu->max;
                if (j_util * max > j_max * util) {
                        util = j_util;
                        max = j_max;
                }

                sugov_iowait_boost(j_sg_cpu, &util, &max);
        }

        return get_next_freq(sg_policy, util, max);
}

static void sugov_update_shared(struct update_util_data *hook, u64 time,
                                unsigned int flags)
{
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        unsigned long util, max;
        unsigned int next_f;

        sugov_get_util(&util, &max);

        raw_spin_lock(&sg_policy->update_lock);

        sg_cpu->util = util;
        sg_cpu->max = max;
        sg_cpu->flags = flags;

        sugov_set_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;

        if (sugov_should_update_freq(sg_policy, time)) {
                if (flags & SCHED_CPUFREQ_RT_DL)
                        next_f = sg_policy->policy->cpuinfo.max_freq;
                else
                        next_f = sugov_next_freq_shared(sg_cpu, time);

                sugov_update_commit(sg_policy, time, next_f);
        }

        raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
        struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);

        mutex_lock(&sg_policy->work_lock);
        __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
                                CPUFREQ_RELATION_L);
        mutex_unlock(&sg_policy->work_lock);

        sg_policy->work_in_progress = false;
}

static void sugov_irq_work(struct irq_work *irq_work)
{
        struct sugov_policy *sg_policy;

        sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

        /*
         * For RT and deadline tasks, the schedutil governor shoots the
         * frequency to maximum. Special care must be taken to ensure that this
         * kthread doesn't result in the same behavior.
         *
         * This is (mostly) guaranteed by the work_in_progress flag. The flag is
         * updated only at the end of the sugov_work() function and before that
         * the schedutil governor rejects all other frequency scaling requests.
         *
         * There is a very rare case though, where the RT thread yields right
         * after the work_in_progress flag is cleared. The effects of that are
         * neglected for now.
         */
        kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
        return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

        return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
                                   size_t count)
{
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
        struct sugov_policy *sg_policy;
        unsigned int rate_limit_us;

        if (kstrtouint(buf, 10, &rate_limit_us))
                return -EINVAL;

        tunables->rate_limit_us = rate_limit_us;

        list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
                sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

        return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attributes[] = {
        &rate_limit_us.attr,
        NULL
};

static struct kobj_type sugov_tunables_ktype = {
        .default_attrs = sugov_attributes,
        .sysfs_ops = &governor_sysfs_ops,
};

/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy;

        sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
        if (!sg_policy)
                return NULL;

        sg_policy->policy = policy;
        raw_spin_lock_init(&sg_policy->update_lock);
        return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
        kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
        struct task_struct *thread;
        struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
        struct cpufreq_policy *policy = sg_policy->policy;
        int ret;

        /* kthread only required for slow path */
        if (policy->fast_switch_enabled)
                return 0;

        kthread_init_work(&sg_policy->work, sugov_work);
        kthread_init_worker(&sg_policy->worker);
        thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
                                "sugov:%d",
                                cpumask_first(policy->related_cpus));
        if (IS_ERR(thread)) {
                pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
                return PTR_ERR(thread);
        }

        ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
        if (ret) {
                kthread_stop(thread);
                pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
                return ret;
        }

        sg_policy->thread = thread;
        kthread_bind_mask(thread, policy->related_cpus);
        init_irq_work(&sg_policy->irq_work, sugov_irq_work);
        mutex_init(&sg_policy->work_lock);

        wake_up_process(thread);

        return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
        /* kthread only required for slow path */
        if (sg_policy->policy->fast_switch_enabled)
                return;

        kthread_flush_worker(&sg_policy->worker);
        kthread_stop(sg_policy->thread);
        mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
        struct sugov_tunables *tunables;

        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
        if (tunables) {
                gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
                if (!have_governor_per_policy())
                        global_tunables = tunables;
        }
        return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
        if (!have_governor_per_policy())
                global_tunables = NULL;

        kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy;
        struct sugov_tunables *tunables;
        int ret = 0;

        /* State should be equivalent to EXIT */
        if (policy->governor_data)
                return -EBUSY;

        cpufreq_enable_fast_switch(policy);

        sg_policy = sugov_policy_alloc(policy);
        if (!sg_policy) {
                ret = -ENOMEM;
                goto disable_fast_switch;
        }

        ret = sugov_kthread_create(sg_policy);
        if (ret)
                goto free_sg_policy;

        mutex_lock(&global_tunables_lock);

        if (global_tunables) {
                if (WARN_ON(have_governor_per_policy())) {
                        ret = -EINVAL;
                        goto stop_kthread;
                }
                policy->governor_data = sg_policy;
                sg_policy->tunables = global_tunables;

                gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
                goto out;
        }

        tunables = sugov_tunables_alloc(sg_policy);
        if (!tunables) {
                ret = -ENOMEM;
                goto stop_kthread;
        }

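        /*
         * Default rate limit: use the driver-provided transition delay if
         * available; otherwise derive it from the transition latency scaled
         * by LATENCY_MULTIPLIER.
         */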
        if (policy->transition_delay_us) {
                tunables->rate_limit_us = policy->transition_delay_us;
        } else {
                unsigned int lat;

                tunables->rate_limit_us = LATENCY_MULTIPLIER;
                lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
                if (lat)
                        tunables->rate_limit_us *= lat;
        }

        policy->governor_data = sg_policy;
        sg_policy->tunables = tunables;

        ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
                                   get_governor_parent_kobj(policy), "%s",
                                   schedutil_gov.name);
        if (ret)
                goto fail;

out:
        mutex_unlock(&global_tunables_lock);
        return 0;

fail:
        policy->governor_data = NULL;
        sugov_tunables_free(tunables);

stop_kthread:
        sugov_kthread_stop(sg_policy);

free_sg_policy:
        mutex_unlock(&global_tunables_lock);

        sugov_policy_free(sg_policy);

disable_fast_switch:
        cpufreq_disable_fast_switch(policy);

        pr_err("initialization failed (error %d)\n", ret);
        return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        struct sugov_tunables *tunables = sg_policy->tunables;
        unsigned int count;

        mutex_lock(&global_tunables_lock);

        count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
        policy->governor_data = NULL;
        if (!count)
                sugov_tunables_free(tunables);

        mutex_unlock(&global_tunables_lock);

        sugov_kthread_stop(sg_policy);
        sugov_policy_free(sg_policy);
        cpufreq_disable_fast_switch(policy);
}

static int sugov_start(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        unsigned int cpu;

        sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
        sg_policy->last_freq_update_time = 0;
        sg_policy->next_freq = UINT_MAX;
        sg_policy->work_in_progress = false;
        sg_policy->need_freq_update = false;
        sg_policy->cached_raw_freq = 0;

        for_each_cpu(cpu, policy->cpus) {
                struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

                memset(sg_cpu, 0, sizeof(*sg_cpu));
                sg_cpu->sg_policy = sg_policy;
                sg_cpu->flags = SCHED_CPUFREQ_RT;
                sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
                cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
                                             policy_is_shared(policy) ?
                                                        sugov_update_shared :
                                                        sugov_update_single);
        }
        return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        unsigned int cpu;

        for_each_cpu(cpu, policy->cpus)
                cpufreq_remove_update_util_hook(cpu);

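        /*
         * Wait for any in-flight update_util callbacks to finish before
         * tearing down the irq_work and kthread below.
         */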
        synchronize_sched();

        if (!policy->fast_switch_enabled) {
                irq_work_sync(&sg_policy->irq_work);
                kthread_cancel_work_sync(&sg_policy->work);
        }
}

static void sugov_limits(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;

        if (!policy->fast_switch_enabled) {
                mutex_lock(&sg_policy->work_lock);
                cpufreq_policy_apply_limits(policy);
                mutex_unlock(&sg_policy->work_lock);
        }

        sg_policy->need_freq_update = true;
}

static struct cpufreq_governor schedutil_gov = {
        .name = "schedutil",
        .owner = THIS_MODULE,
        .init = sugov_init,
        .exit = sugov_exit,
        .start = sugov_start,
        .stop = sugov_stop,
        .limits = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
        return &schedutil_gov;
}
#endif

static int __init sugov_register(void)
{
        return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);