linux/kernel/sched/cpufreq_schedutil.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * CPUFreq governor based on scheduler-provided CPU utilization data.
   4 *
   5 * Copyright (C) 2016, Intel Corporation
   6 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
   7 */
   8
   9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  10
  11#include "sched.h"
  12
  13#include <linux/sched/cpufreq.h>
  14#include <trace/events/power.h>
  15
  16#define IOWAIT_BOOST_MIN        (SCHED_CAPACITY_SCALE / 8)
  17
  18struct sugov_tunables {
  19        struct gov_attr_set     attr_set;
  20        unsigned int            rate_limit_us;
  21};
  22
  23struct sugov_policy {
  24        struct cpufreq_policy   *policy;
  25
  26        struct sugov_tunables   *tunables;
  27        struct list_head        tunables_hook;
  28
  29        raw_spinlock_t          update_lock;
  30        u64                     last_freq_update_time;
  31        s64                     freq_update_delay_ns;
  32        unsigned int            next_freq;
  33        unsigned int            cached_raw_freq;
  34
  35        /* The next fields are only needed if fast switch cannot be used: */
  36        struct                  irq_work irq_work;
  37        struct                  kthread_work work;
  38        struct                  mutex work_lock;
  39        struct                  kthread_worker worker;
  40        struct task_struct      *thread;
  41        bool                    work_in_progress;
  42
  43        bool                    limits_changed;
  44        bool                    need_freq_update;
  45};
  46
  47struct sugov_cpu {
  48        struct update_util_data update_util;
  49        struct sugov_policy     *sg_policy;
  50        unsigned int            cpu;
  51
  52        bool                    iowait_boost_pending;
  53        unsigned int            iowait_boost;
  54        u64                     last_update;
  55
  56        unsigned long           util;
  57        unsigned long           bw_dl;
  58        unsigned long           max;
  59
  60        /* The field below is for single-CPU policies only: */
  61#ifdef CONFIG_NO_HZ_COMMON
  62        unsigned long           saved_idle_calls;
  63#endif
  64};
  65
  66static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
  67
  68/************************ Governor internals ***********************/
  69
  70static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
  71{
  72        s64 delta_ns;
  73
  74        /*
  75         * Since cpufreq_update_util() is called with rq->lock held for
  76         * the @target_cpu, our per-CPU data is fully serialized.
  77         *
  78         * However, drivers cannot in general deal with cross-CPU
  79         * requests, so while get_next_freq() will work, our
   80         * frequency update may not work on the fast switching platforms.
  81         *
  82         * Hence stop here for remote requests if they aren't supported
  83         * by the hardware, as calculating the frequency is pointless if
  84         * we cannot in fact act on it.
  85         *
  86         * This is needed on the slow switching platforms too to prevent CPUs
  87         * going offline from leaving stale IRQ work items behind.
  88         */
  89        if (!cpufreq_this_cpu_can_update(sg_policy->policy))
  90                return false;
  91
  92        if (unlikely(sg_policy->limits_changed)) {
  93                sg_policy->limits_changed = false;
  94                sg_policy->need_freq_update = true;
  95                return true;
  96        }
  97
  98        delta_ns = time - sg_policy->last_freq_update_time;
  99
 100        return delta_ns >= sg_policy->freq_update_delay_ns;
 101}
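/*
 * Rate limit arithmetic, with a value chosen purely for illustration: if
 * rate_limit_us were 500, freq_update_delay_ns would be
 * 500 * NSEC_PER_USEC = 500000 ns, so the frequency is re-evaluated at most
 * once every 0.5 ms per policy unless limits_changed forces an update.
 */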
 102
 103static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
 104                                   unsigned int next_freq)
 105{
 106        if (sg_policy->need_freq_update)
 107                sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
 108        else if (sg_policy->next_freq == next_freq)
 109                return false;
 110
 111        sg_policy->next_freq = next_freq;
 112        sg_policy->last_freq_update_time = time;
 113
 114        return true;
 115}
 116
 117static void sugov_deferred_update(struct sugov_policy *sg_policy)
 118{
 119        if (!sg_policy->work_in_progress) {
 120                sg_policy->work_in_progress = true;
 121                irq_work_queue(&sg_policy->irq_work);
 122        }
 123}
 124
 125/**
 126 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 127 * @sg_policy: schedutil policy object to compute the new frequency for.
 128 * @util: Current CPU utilization.
 129 * @max: CPU capacity.
 130 *
 131 * If the utilization is frequency-invariant, choose the new frequency to be
 132 * proportional to it, that is
 133 *
 134 * next_freq = C * max_freq * util / max
 135 *
 136 * Otherwise, approximate the would-be frequency-invariant utilization by
 137 * util_raw * (curr_freq / max_freq) which leads to
 138 *
 139 * next_freq = C * curr_freq * util_raw / max
 140 *
 141 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 142 *
  143 * The lowest driver-supported frequency which is equal to or greater than the raw
 144 * next_freq (as calculated above) is returned, subject to policy min/max and
 145 * cpufreq driver limitations.
 146 */
 147static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 148                                  unsigned long util, unsigned long max)
 149{
 150        struct cpufreq_policy *policy = sg_policy->policy;
 151        unsigned int freq = arch_scale_freq_invariant() ?
 152                                policy->cpuinfo.max_freq : policy->cur;
 153
 154        util = map_util_perf(util);
 155        freq = map_util_freq(util, freq, max);
 156
 157        if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
 158                return sg_policy->next_freq;
 159
 160        sg_policy->cached_raw_freq = freq;
 161        return cpufreq_driver_resolve_freq(policy, freq);
 162}
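/*
 * Worked example for the formula above (all values chosen for illustration):
 * on a frequency-invariant system with util = 512, max = 1024 and
 * cpuinfo.max_freq = 2000000 kHz, map_util_perf() applies the C = 1.25
 * headroom (512 + 512/4 = 640) and map_util_freq() gives
 * 2000000 * 640 / 1024 = 1250000 kHz, which cpufreq_driver_resolve_freq()
 * then maps to the lowest driver-supported frequency at or above that value,
 * subject to the policy min/max limits.
 */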
 163
 164static void sugov_get_util(struct sugov_cpu *sg_cpu)
 165{
 166        struct rq *rq = cpu_rq(sg_cpu->cpu);
 167        unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
 168
 169        sg_cpu->max = max;
 170        sg_cpu->bw_dl = cpu_bw_dl(rq);
 171        sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
 172                                          FREQUENCY_UTIL, NULL);
 173}
 174
 175/**
 176 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 177 * @sg_cpu: the sugov data for the CPU to boost
 178 * @time: the update time from the caller
 179 * @set_iowait_boost: true if an IO boost has been requested
 180 *
 181 * The IO wait boost of a task is disabled after a tick since the last update
  182 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 183 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 184 * efficiency by ignoring sporadic wakeups from IO.
 185 */
 186static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
 187                               bool set_iowait_boost)
 188{
 189        s64 delta_ns = time - sg_cpu->last_update;
 190
 191        /* Reset boost only if a tick has elapsed since last request */
 192        if (delta_ns <= TICK_NSEC)
 193                return false;
 194
 195        sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
 196        sg_cpu->iowait_boost_pending = set_iowait_boost;
 197
 198        return true;
 199}
 200
 201/**
 202 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 203 * @sg_cpu: the sugov data for the CPU to boost
 204 * @time: the update time from the caller
 205 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 206 *
 207 * Each time a task wakes up after an IO operation, the CPU utilization can be
 208 * boosted to a certain utilization which doubles at each "frequent and
 209 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 210 * of the maximum OPP.
 211 *
 212 * To keep doubling, an IO boost has to be requested at least once per tick,
 213 * otherwise we restart from the utilization of the minimum OPP.
 214 */
 215static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
 216                               unsigned int flags)
 217{
 218        bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
 219
 220        /* Reset boost if the CPU appears to have been idle enough */
 221        if (sg_cpu->iowait_boost &&
 222            sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
 223                return;
 224
 225        /* Boost only tasks waking up after IO */
 226        if (!set_iowait_boost)
 227                return;
 228
 229        /* Ensure boost doubles only one time at each request */
 230        if (sg_cpu->iowait_boost_pending)
 231                return;
 232        sg_cpu->iowait_boost_pending = true;
 233
 234        /* Double the boost at each request */
 235        if (sg_cpu->iowait_boost) {
 236                sg_cpu->iowait_boost =
 237                        min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
 238                return;
 239        }
 240
 241        /* First wakeup after IO: start with minimum boost */
 242        sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
 243}
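/*
 * Example of the doubling (with SCHED_CAPACITY_SCALE = 1024, so
 * IOWAIT_BOOST_MIN = 128): back-to-back in-tick IO wakeups move the boost
 * through 128 -> 256 -> 512 -> 1024, after which it saturates at the
 * capacity of the maximum OPP.
 */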
 244
 245/**
 246 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 247 * @sg_cpu: the sugov data for the cpu to boost
 248 * @time: the update time from the caller
 249 *
  250 * A CPU running a task that woke up after an IO operation can have its
  251 * utilization boosted to speed up the completion of those IO operations.
  252 * The IO boost value is increased each time a task wakes up from IO, in
  253 * sugov_iowait_boost(), and it is instead decreased by this function
  254 * each time an increase has not been requested (!iowait_boost_pending).
  255 *
  256 * A CPU which appears to have been idle for at least one tick also has
  257 * its IO boost utilization reset.
  258 *
  259 * This mechanism is designed to boost tasks that frequently wait on IO, while
  260 * being more conservative about tasks that do sporadic IO operations.
 261 */
 262static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
 263{
 264        unsigned long boost;
 265
 266        /* No boost currently required */
 267        if (!sg_cpu->iowait_boost)
 268                return;
 269
 270        /* Reset boost if the CPU appears to have been idle enough */
 271        if (sugov_iowait_reset(sg_cpu, time, false))
 272                return;
 273
 274        if (!sg_cpu->iowait_boost_pending) {
 275                /*
 276                 * No boost pending; reduce the boost value.
 277                 */
 278                sg_cpu->iowait_boost >>= 1;
 279                if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
 280                        sg_cpu->iowait_boost = 0;
 281                        return;
 282                }
 283        }
 284
 285        sg_cpu->iowait_boost_pending = false;
 286
 287        /*
 288         * sg_cpu->util is already in capacity scale; convert iowait_boost
 289         * into the same scale so we can compare.
 290         */
 291        boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
 292        if (sg_cpu->util < boost)
 293                sg_cpu->util = boost;
 294}
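/*
 * Example of the decay and scaling above (capacity value chosen for
 * illustration): with no pending IO wakeup, a boost of 512 is halved to 256;
 * once a halving drops it below IOWAIT_BOOST_MIN (128) it is cleared.
 * A surviving boost is scaled by the CPU capacity before the comparison,
 * e.g. 256 * 768 >> SCHED_CAPACITY_SHIFT = 192 when sg_cpu->max is 768.
 */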
 295
 296#ifdef CONFIG_NO_HZ_COMMON
 297static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
 298{
 299        unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
 300        bool ret = idle_calls == sg_cpu->saved_idle_calls;
 301
 302        sg_cpu->saved_idle_calls = idle_calls;
 303        return ret;
 304}
 305#else
 306static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
 307#endif /* CONFIG_NO_HZ_COMMON */
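/*
 * sugov_cpu_is_busy() above is a NO_HZ heuristic: if the per-CPU idle-calls
 * counter has not moved since the previous schedutil update, the CPU has not
 * entered idle in between and is considered busy, so the callers below avoid
 * lowering its frequency or performance level prematurely.
 */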
 308
 309/*
 310 * Make sugov_should_update_freq() ignore the rate limit when DL
 311 * has increased the utilization.
 312 */
 313static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
 314{
 315        if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
 316                sg_cpu->sg_policy->limits_changed = true;
 317}
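/*
 * The check above marks limits_changed when the deadline bandwidth reserved
 * on this CPU has grown since the last sample stored in sg_cpu->bw_dl, so the
 * next sugov_should_update_freq() call bypasses the rate limit and
 * re-evaluates the frequency immediately.
 */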
 318
 319static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
 320                                              u64 time, unsigned int flags)
 321{
 322        sugov_iowait_boost(sg_cpu, time, flags);
 323        sg_cpu->last_update = time;
 324
 325        ignore_dl_rate_limit(sg_cpu);
 326
 327        if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
 328                return false;
 329
 330        sugov_get_util(sg_cpu);
 331        sugov_iowait_apply(sg_cpu, time);
 332
 333        return true;
 334}
 335
 336static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 337                                     unsigned int flags)
 338{
 339        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 340        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 341        unsigned int cached_freq = sg_policy->cached_raw_freq;
 342        unsigned int next_f;
 343
 344        if (!sugov_update_single_common(sg_cpu, time, flags))
 345                return;
 346
 347        next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
 348        /*
 349         * Do not reduce the frequency if the CPU has not been idle
 350         * recently, as the reduction is likely to be premature then.
 351         */
 352        if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
 353                next_f = sg_policy->next_freq;
 354
 355                /* Restore cached freq as next_freq has changed */
 356                sg_policy->cached_raw_freq = cached_freq;
 357        }
 358
 359        if (!sugov_update_next_freq(sg_policy, time, next_f))
 360                return;
 361
 362        /*
 363         * This code runs under rq->lock for the target CPU, so it won't run
 364         * concurrently on two different CPUs for the same target and it is not
 365         * necessary to acquire the lock in the fast switch case.
 366         */
 367        if (sg_policy->policy->fast_switch_enabled) {
 368                cpufreq_driver_fast_switch(sg_policy->policy, next_f);
 369        } else {
 370                raw_spin_lock(&sg_policy->update_lock);
 371                sugov_deferred_update(sg_policy);
 372                raw_spin_unlock(&sg_policy->update_lock);
 373        }
 374}
 375
 376static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 377                                     unsigned int flags)
 378{
 379        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 380        unsigned long prev_util = sg_cpu->util;
 381
 382        /*
 383         * Fall back to the "frequency" path if frequency invariance is not
 384         * supported, because the direct mapping between the utilization and
 385         * the performance levels depends on the frequency invariance.
 386         */
 387        if (!arch_scale_freq_invariant()) {
 388                sugov_update_single_freq(hook, time, flags);
 389                return;
 390        }
 391
 392        if (!sugov_update_single_common(sg_cpu, time, flags))
 393                return;
 394
 395        /*
 396         * Do not reduce the target performance level if the CPU has not been
 397         * idle recently, as the reduction is likely to be premature then.
 398         */
 399        if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
 400                sg_cpu->util = prev_util;
 401
 402        cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
 403                                   map_util_perf(sg_cpu->util), sg_cpu->max);
 404
 405        sg_cpu->sg_policy->last_freq_update_time = time;
 406}
 407
 408static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 409{
 410        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 411        struct cpufreq_policy *policy = sg_policy->policy;
 412        unsigned long util = 0, max = 1;
 413        unsigned int j;
 414
 415        for_each_cpu(j, policy->cpus) {
 416                struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
 417                unsigned long j_util, j_max;
 418
 419                sugov_get_util(j_sg_cpu);
 420                sugov_iowait_apply(j_sg_cpu, time);
 421                j_util = j_sg_cpu->util;
 422                j_max = j_sg_cpu->max;
 423
 424                if (j_util * max > j_max * util) {
 425                        util = j_util;
 426                        max = j_max;
 427                }
 428        }
 429
 430        return get_next_freq(sg_policy, util, max);
 431}
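/*
 * The j_util * max > j_max * util comparison above is a division-free way of
 * selecting the CPU with the highest util/max ratio. For instance (values for
 * illustration), util = 400 on a max = 1024 CPU is superseded by j_util = 350
 * on a j_max = 512 CPU, because 350 * 1024 = 358400 exceeds
 * 512 * 400 = 204800, i.e. 350/512 > 400/1024.
 */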
 432
 433static void
 434sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 435{
 436        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 437        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 438        unsigned int next_f;
 439
 440        raw_spin_lock(&sg_policy->update_lock);
 441
 442        sugov_iowait_boost(sg_cpu, time, flags);
 443        sg_cpu->last_update = time;
 444
 445        ignore_dl_rate_limit(sg_cpu);
 446
 447        if (sugov_should_update_freq(sg_policy, time)) {
 448                next_f = sugov_next_freq_shared(sg_cpu, time);
 449
 450                if (!sugov_update_next_freq(sg_policy, time, next_f))
 451                        goto unlock;
 452
 453                if (sg_policy->policy->fast_switch_enabled)
 454                        cpufreq_driver_fast_switch(sg_policy->policy, next_f);
 455                else
 456                        sugov_deferred_update(sg_policy);
 457        }
 458unlock:
 459        raw_spin_unlock(&sg_policy->update_lock);
 460}
 461
 462static void sugov_work(struct kthread_work *work)
 463{
 464        struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
 465        unsigned int freq;
 466        unsigned long flags;
 467
 468        /*
  469         * Hold sg_policy->update_lock briefly to handle the case where
  470         * sg_policy->next_freq is read here and then updated by
  471         * sugov_deferred_update() just before work_in_progress is set to false
  472         * here; in that case we may miss queueing the new update.
  473         *
  474         * Note: If a work item was queued after the update_lock is released,
  475         * sugov_work() will just be called again by the kthread_work code; the
  476         * request will be processed before the sugov thread sleeps.
 477         */
 478        raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
 479        freq = sg_policy->next_freq;
 480        sg_policy->work_in_progress = false;
 481        raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
 482
 483        mutex_lock(&sg_policy->work_lock);
 484        __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
 485        mutex_unlock(&sg_policy->work_lock);
 486}
 487
 488static void sugov_irq_work(struct irq_work *irq_work)
 489{
 490        struct sugov_policy *sg_policy;
 491
 492        sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
 493
 494        kthread_queue_work(&sg_policy->worker, &sg_policy->work);
 495}
 496
 497/************************** sysfs interface ************************/
 498
 499static struct sugov_tunables *global_tunables;
 500static DEFINE_MUTEX(global_tunables_lock);
 501
 502static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
 503{
 504        return container_of(attr_set, struct sugov_tunables, attr_set);
 505}
 506
 507static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
 508{
 509        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 510
 511        return sprintf(buf, "%u\n", tunables->rate_limit_us);
 512}
 513
 514static ssize_t
 515rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
 516{
 517        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 518        struct sugov_policy *sg_policy;
 519        unsigned int rate_limit_us;
 520
 521        if (kstrtouint(buf, 10, &rate_limit_us))
 522                return -EINVAL;
 523
 524        tunables->rate_limit_us = rate_limit_us;
 525
 526        list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
 527                sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
 528
 529        return count;
 530}
 531
 532static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
 533
 534static struct attribute *sugov_attrs[] = {
 535        &rate_limit_us.attr,
 536        NULL
 537};
 538ATTRIBUTE_GROUPS(sugov);
 539
 540static void sugov_tunables_free(struct kobject *kobj)
 541{
 542        struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
 543
 544        kfree(to_sugov_tunables(attr_set));
 545}
 546
 547static struct kobj_type sugov_tunables_ktype = {
 548        .default_groups = sugov_groups,
 549        .sysfs_ops = &governor_sysfs_ops,
 550        .release = &sugov_tunables_free,
 551};
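/*
 * The rate_limit_us attribute defined above is what shows up in sysfs for
 * this governor; with per-policy tunables the path is typically
 * /sys/devices/system/cpu/cpufreq/policy<N>/schedutil/rate_limit_us (the
 * exact location depends on have_governor_per_policy()). Writing a decimal
 * value in microseconds goes through rate_limit_us_store() and updates
 * freq_update_delay_ns for every policy attached to the tunables set.
 */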
 552
 553/********************** cpufreq governor interface *********************/
 554
 555struct cpufreq_governor schedutil_gov;
 556
 557static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 558{
 559        struct sugov_policy *sg_policy;
 560
 561        sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
 562        if (!sg_policy)
 563                return NULL;
 564
 565        sg_policy->policy = policy;
 566        raw_spin_lock_init(&sg_policy->update_lock);
 567        return sg_policy;
 568}
 569
 570static void sugov_policy_free(struct sugov_policy *sg_policy)
 571{
 572        kfree(sg_policy);
 573}
 574
 575static int sugov_kthread_create(struct sugov_policy *sg_policy)
 576{
 577        struct task_struct *thread;
 578        struct sched_attr attr = {
 579                .size           = sizeof(struct sched_attr),
 580                .sched_policy   = SCHED_DEADLINE,
 581                .sched_flags    = SCHED_FLAG_SUGOV,
 582                .sched_nice     = 0,
 583                .sched_priority = 0,
 584                /*
 585                 * Fake (unused) bandwidth; workaround to "fix"
 586                 * priority inheritance.
 587                 */
 588                .sched_runtime  =  1000000,
 589                .sched_deadline = 10000000,
 590                .sched_period   = 10000000,
 591        };
 592        struct cpufreq_policy *policy = sg_policy->policy;
 593        int ret;
 594
 595        /* kthread only required for slow path */
 596        if (policy->fast_switch_enabled)
 597                return 0;
 598
 599        kthread_init_work(&sg_policy->work, sugov_work);
 600        kthread_init_worker(&sg_policy->worker);
 601        thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
 602                                "sugov:%d",
 603                                cpumask_first(policy->related_cpus));
 604        if (IS_ERR(thread)) {
 605                pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
 606                return PTR_ERR(thread);
 607        }
 608
 609        ret = sched_setattr_nocheck(thread, &attr);
 610        if (ret) {
 611                kthread_stop(thread);
 612                pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
 613                return ret;
 614        }
 615
 616        sg_policy->thread = thread;
 617        kthread_bind_mask(thread, policy->related_cpus);
 618        init_irq_work(&sg_policy->irq_work, sugov_irq_work);
 619        mutex_init(&sg_policy->work_lock);
 620
 621        wake_up_process(thread);
 622
 623        return 0;
 624}
 625
 626static void sugov_kthread_stop(struct sugov_policy *sg_policy)
 627{
 628        /* kthread only required for slow path */
 629        if (sg_policy->policy->fast_switch_enabled)
 630                return;
 631
 632        kthread_flush_worker(&sg_policy->worker);
 633        kthread_stop(sg_policy->thread);
 634        mutex_destroy(&sg_policy->work_lock);
 635}
 636
 637static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
 638{
 639        struct sugov_tunables *tunables;
 640
 641        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
 642        if (tunables) {
 643                gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
 644                if (!have_governor_per_policy())
 645                        global_tunables = tunables;
 646        }
 647        return tunables;
 648}
 649
 650static void sugov_clear_global_tunables(void)
 651{
 652        if (!have_governor_per_policy())
 653                global_tunables = NULL;
 654}
 655
 656static int sugov_init(struct cpufreq_policy *policy)
 657{
 658        struct sugov_policy *sg_policy;
 659        struct sugov_tunables *tunables;
 660        int ret = 0;
 661
 662        /* State should be equivalent to EXIT */
 663        if (policy->governor_data)
 664                return -EBUSY;
 665
 666        cpufreq_enable_fast_switch(policy);
 667
 668        sg_policy = sugov_policy_alloc(policy);
 669        if (!sg_policy) {
 670                ret = -ENOMEM;
 671                goto disable_fast_switch;
 672        }
 673
 674        ret = sugov_kthread_create(sg_policy);
 675        if (ret)
 676                goto free_sg_policy;
 677
 678        mutex_lock(&global_tunables_lock);
 679
 680        if (global_tunables) {
 681                if (WARN_ON(have_governor_per_policy())) {
 682                        ret = -EINVAL;
 683                        goto stop_kthread;
 684                }
 685                policy->governor_data = sg_policy;
 686                sg_policy->tunables = global_tunables;
 687
 688                gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
 689                goto out;
 690        }
 691
 692        tunables = sugov_tunables_alloc(sg_policy);
 693        if (!tunables) {
 694                ret = -ENOMEM;
 695                goto stop_kthread;
 696        }
 697
 698        tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
 699
 700        policy->governor_data = sg_policy;
 701        sg_policy->tunables = tunables;
 702
 703        ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
 704                                   get_governor_parent_kobj(policy), "%s",
 705                                   schedutil_gov.name);
 706        if (ret)
 707                goto fail;
 708
 709out:
 710        mutex_unlock(&global_tunables_lock);
 711        return 0;
 712
 713fail:
 714        kobject_put(&tunables->attr_set.kobj);
 715        policy->governor_data = NULL;
 716        sugov_clear_global_tunables();
 717
 718stop_kthread:
 719        sugov_kthread_stop(sg_policy);
 720        mutex_unlock(&global_tunables_lock);
 721
 722free_sg_policy:
 723        sugov_policy_free(sg_policy);
 724
 725disable_fast_switch:
 726        cpufreq_disable_fast_switch(policy);
 727
 728        pr_err("initialization failed (error %d)\n", ret);
 729        return ret;
 730}
 731
 732static void sugov_exit(struct cpufreq_policy *policy)
 733{
 734        struct sugov_policy *sg_policy = policy->governor_data;
 735        struct sugov_tunables *tunables = sg_policy->tunables;
 736        unsigned int count;
 737
 738        mutex_lock(&global_tunables_lock);
 739
 740        count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
 741        policy->governor_data = NULL;
 742        if (!count)
 743                sugov_clear_global_tunables();
 744
 745        mutex_unlock(&global_tunables_lock);
 746
 747        sugov_kthread_stop(sg_policy);
 748        sugov_policy_free(sg_policy);
 749        cpufreq_disable_fast_switch(policy);
 750}
 751
 752static int sugov_start(struct cpufreq_policy *policy)
 753{
 754        struct sugov_policy *sg_policy = policy->governor_data;
 755        void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
 756        unsigned int cpu;
 757
 758        sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
 759        sg_policy->last_freq_update_time        = 0;
 760        sg_policy->next_freq                    = 0;
 761        sg_policy->work_in_progress             = false;
 762        sg_policy->limits_changed               = false;
 763        sg_policy->cached_raw_freq              = 0;
 764
 765        sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
 766
 767        for_each_cpu(cpu, policy->cpus) {
 768                struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 769
 770                memset(sg_cpu, 0, sizeof(*sg_cpu));
 771                sg_cpu->cpu                     = cpu;
 772                sg_cpu->sg_policy               = sg_policy;
 773        }
 774
 775        if (policy_is_shared(policy))
 776                uu = sugov_update_shared;
 777        else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
 778                uu = sugov_update_single_perf;
 779        else
 780                uu = sugov_update_single_freq;
 781
 782        for_each_cpu(cpu, policy->cpus) {
 783                struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 784
 785                cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
 786        }
 787        return 0;
 788}
 789
 790static void sugov_stop(struct cpufreq_policy *policy)
 791{
 792        struct sugov_policy *sg_policy = policy->governor_data;
 793        unsigned int cpu;
 794
 795        for_each_cpu(cpu, policy->cpus)
 796                cpufreq_remove_update_util_hook(cpu);
 797
 798        synchronize_rcu();
 799
 800        if (!policy->fast_switch_enabled) {
 801                irq_work_sync(&sg_policy->irq_work);
 802                kthread_cancel_work_sync(&sg_policy->work);
 803        }
 804}
 805
 806static void sugov_limits(struct cpufreq_policy *policy)
 807{
 808        struct sugov_policy *sg_policy = policy->governor_data;
 809
 810        if (!policy->fast_switch_enabled) {
 811                mutex_lock(&sg_policy->work_lock);
 812                cpufreq_policy_apply_limits(policy);
 813                mutex_unlock(&sg_policy->work_lock);
 814        }
 815
 816        sg_policy->limits_changed = true;
 817}
 818
 819struct cpufreq_governor schedutil_gov = {
 820        .name                   = "schedutil",
 821        .owner                  = THIS_MODULE,
 822        .flags                  = CPUFREQ_GOV_DYNAMIC_SWITCHING,
 823        .init                   = sugov_init,
 824        .exit                   = sugov_exit,
 825        .start                  = sugov_start,
 826        .stop                   = sugov_stop,
 827        .limits                 = sugov_limits,
 828};
 829
 830#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
 831struct cpufreq_governor *cpufreq_default_governor(void)
 832{
 833        return &schedutil_gov;
 834}
 835#endif
 836
 837cpufreq_governor_init(schedutil_gov);
 838
 839#ifdef CONFIG_ENERGY_MODEL
 840static void rebuild_sd_workfn(struct work_struct *work)
 841{
 842        rebuild_sched_domains_energy();
 843}
 844static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
 845
 846/*
 847 * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 848 * on governor changes to make sure the scheduler knows about it.
 849 */
 850void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
 851                                  struct cpufreq_governor *old_gov)
 852{
 853        if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
 854                /*
 855                 * When called from the cpufreq_register_driver() path, the
 856                 * cpu_hotplug_lock is already held, so use a work item to
 857                 * avoid nested locking in rebuild_sched_domains().
 858                 */
 859                schedule_work(&rebuild_sd_work);
 860        }
 861
 862}
 863#endif
 864