linux/kernel/sched/cpufreq_schedutil.c
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "sched.h"

#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>

struct sugov_tunables {
	struct gov_attr_set	attr_set;
	unsigned int		rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy	*policy;

	struct sugov_tunables	*tunables;
	struct list_head	tunables_hook;

	raw_spinlock_t		update_lock;	/* For shared policies */
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;
	unsigned int		cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct			irq_work irq_work;
	struct			kthread_work work;
	struct			mutex work_lock;
	struct			kthread_worker worker;
	struct task_struct	*thread;
	bool			work_in_progress;

	bool			limits_changed;
	bool			need_freq_update;
};

struct sugov_cpu {
	struct update_util_data	update_util;
	struct sugov_policy	*sg_policy;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	u64			last_update;

	unsigned long		bw_dl;
	unsigned long		min;
	unsigned long		max;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long		saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_fast_switch() call may not on fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * For slow switching platforms, the kthread is always scheduled on
	 * the right set of CPUs and any CPU can find the next frequency and
	 * schedule the kthread.
	 */
	if (sg_policy->policy->fast_switch_enabled &&
	    !cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(sg_policy->limits_changed)) {
		sg_policy->limits_changed = false;
		sg_policy->need_freq_update = true;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}
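
/*
 * Illustrative example (the rate_limit_us value is hypothetical): rate_limit_us
 * is converted to freq_update_delay_ns in sugov_start(), so with
 * rate_limit_us = 2000, freq_update_delay_ns = 2000 * NSEC_PER_USEC =
 * 2,000,000 ns, and an update arriving 1.5 ms after the last committed
 * frequency change is ignored above unless limits_changed was set.
 */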

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->next_freq == next_freq)
		return false;

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
			      unsigned int next_freq)
{
	struct cpufreq_policy *policy = sg_policy->policy;

	if (!sugov_update_next_freq(sg_policy, time, next_freq))
		return;

	next_freq = cpufreq_driver_fast_switch(policy, next_freq);
	if (!next_freq)
		return;

	policy->cur = next_freq;
	trace_cpu_frequency(next_freq, smp_processor_id());
}

static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
				  unsigned int next_freq)
{
	if (!sugov_update_next_freq(sg_policy, time, next_freq))
		return;

	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
		return sg_policy->next_freq;

	sg_policy->need_freq_update = false;
	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
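
/*
 * Worked example of the formula above (the values are hypothetical): with
 * frequency-invariant utilization, C = 1.25, cpuinfo.max_freq = 2000000 kHz,
 * util = 768 and max = 1024, the raw frequency is
 *
 *   1.25 * 2000000 * 768 / 1024 = 1875000 kHz,
 *
 * which cpufreq_driver_resolve_freq() then maps to the lowest driver-supported
 * frequency at or above that value.
 */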

/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * Where the cfs,rt and dl util numbers are tracked with the same metric and
 * synchronized windows and are thus directly comparable.
 *
 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
 * which excludes things like IRQ and steal-time. These latter are then accrued
 * in the irq utilization.
 *
 * The DL bandwidth number, on the other hand, is not a measured metric but a
 * value computed based on the task model parameters and gives the minimal
 * utilization required to meet deadlines.
 */
unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
				  unsigned long max, enum schedutil_type type)
{
	unsigned long dl_util, util, irq;
	struct rq *rq = cpu_rq(cpu);

	if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
		return max;

	/*
	 * Early check to see if IRQ/steal time saturates the CPU; this can
	 * happen because of inaccuracies in how we track these -- see
	 * update_irq_load_avg().
	 */
	irq = cpu_util_irq(rq);
	if (unlikely(irq >= max))
		return max;

	/*
	 * Because the time spent on RT/DL tasks is visible as 'lost' time to
	 * CFS tasks and we use the same metric to track the effective
	 * utilization (PELT windows are synchronized) we can directly add them
	 * to obtain the CPU's actual utilization.
	 */
	util = util_cfs;
	util += cpu_util_rt(rq);

	dl_util = cpu_util_dl(rq);

	/*
	 * For frequency selection we do not make cpu_util_dl() a permanent part
	 * of this sum because we want to use cpu_bw_dl() later on, but we need
	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
	 * that we select f_max when there is no idle time.
	 *
	 * NOTE: numerical errors or stop class might cause us to not quite hit
	 * saturation when we should -- something for later.
	 */
	if (util + dl_util >= max)
		return max;

	/*
	 * On the other hand, for energy computation we need the estimated
	 * running time, so include util_dl and ignore dl_bw.
	 */
	if (type == ENERGY_UTIL)
		util += dl_util;

	/*
	 * There is still idle time; further improve the number by using the
	 * irq metric. Because IRQ/steal time is hidden from the task clock we
	 * need to scale the task numbers:
	 *
	 *              max - irq
	 *   U' = irq + --------- * U
	 *                 max
	 */
	util = scale_irq_capacity(util, irq, max);
	util += irq;

	/*
	 * Bandwidth required by DEADLINE must always be granted while, for
	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
	 * to gracefully reduce the frequency when no tasks show up for longer
	 * periods of time.
	 *
	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
	 * an interface. So, we only do the latter for now.
	 */
	if (type == FREQUENCY_UTIL)
		util += cpu_bw_dl(rq);

	return min(max, util);
}
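
/*
 * Worked example of the IRQ scaling above (the values are hypothetical): with
 * util = 400, irq = 128 and max = 1024,
 *
 *   U' = 128 + (1024 - 128) / 1024 * 400 = 128 + 350 = 478,
 *
 * and for FREQUENCY_UTIL the DEADLINE bandwidth cpu_bw_dl() is then added on
 * top before the result is clamped to max.
 */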

static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
	struct rq *rq = cpu_rq(sg_cpu->cpu);
	unsigned long util = cpu_util_cfs(rq);
	unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);

	sg_cpu->max = max;
	sg_cpu->bw_dl = cpu_bw_dl(rq);

	return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a CPU is disabled once a tick has elapsed since its
 * last update. If a new IO wait boost is requested after more than a tick,
 * the boost is restarted from the minimum frequency, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from the utilization of the minimum
 * OPP to the utilization of the maximum OPP.
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = sg_cpu->min;
}
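
/*
 * Illustrative progression (the starting value is hypothetical): with
 * sg_cpu->min = 128, IO wakeups arriving at least once per tick raise
 * iowait_boost through 128, 256, 512 and finally SCHED_CAPACITY_SCALE (1024),
 * where it is clamped by the min_t() in sugov_iowait_boost().
 */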

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @util: the utilization to (eventually) boost
 * @max: the maximum value the utilization can be boosted to
 *
 * A CPU running a task which woke up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which also appears to have been idle for at least one tick has its
 * IO boost utilization reset as well.
 *
 * This mechanism is designed to boost tasks which do IO waits frequently,
 * while being more conservative on tasks which do only sporadic IO
 * operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long util, unsigned long max)
{
	unsigned long boost;

	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return util;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return util;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < sg_cpu->min) {
			sg_cpu->iowait_boost = 0;
			return util;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * @util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
	return max(boost, util);
}
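
/*
 * Worked example of the conversion above (the values are hypothetical): with
 * iowait_boost = 512 and max = 820,
 *
 *   boost = (512 * 820) >> SCHED_CAPACITY_SHIFT = 410,
 *
 * so a CPU whose current utilization is below 410 is reported as 410 while the
 * boost is active; without a new pending request the boost then halves on each
 * update and is dropped once it falls below sg_cpu->min.
 */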

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	bool ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
		sg_policy->limits_changed = true;
}

static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long util, max;
	unsigned int next_f;
	bool busy;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	/* Limits may have changed, don't skip frequency update */
	busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);

	util = sugov_get_util(sg_cpu);
	max = sg_cpu->max;
	util = sugov_iowait_apply(sg_cpu, time, util, max);
	next_f = get_next_freq(sg_policy, util, max);
	/*
	 * Do not reduce the frequency if the CPU has not been idle
	 * recently, as the reduction is likely to be premature then.
	 */
	if (busy && next_f < sg_policy->next_freq) {
		next_f = sg_policy->next_freq;

		/* Reset cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = 0;
	}

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		sugov_fast_switch(sg_policy, time, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy, time, next_f);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max = 1;
	unsigned int j;

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long j_util, j_max;

		j_util = sugov_get_util(j_sg_cpu);
		j_max = j_sg_cpu->max;
		j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);

		if (j_util * max > j_max * util) {
			util = j_util;
			max = j_max;
		}
	}

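	/*
	 * Note: the j_util * max > j_max * util comparison above is equivalent
	 * to j_util / j_max > util / max, i.e. the loop picks the CPU with the
	 * highest utilization relative to its capacity without doing an
	 * integer division.
	 */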
	return get_next_freq(sg_policy, util, max);
}

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu, sg_policy);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (sg_policy->policy->fast_switch_enabled)
			sugov_fast_switch(sg_policy, time, next_f);
		else
			sugov_deferred_update(sg_policy, time, next_f);
	}

	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock briefly to handle the case where
	 * sg_policy->next_freq is read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to false
	 * here; without the lock we could miss queueing the new update.
	 *
	 * Note: If a work item was queued after the update_lock is released,
	 * sugov_work() will just be called again by the kthread_work code; the
	 * request will be processed before the sugov thread sleeps.
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}
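
/*
 * Usage sketch (the exact path depends on whether tunables are per-policy):
 * with per-policy tunables the attribute typically appears as
 *
 *   # echo 2000 > /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *
 * which sets freq_update_delay_ns to 2,000,000 ns for every policy attached to
 * that tunables set.
 */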

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attributes[] = {
	&rate_limit_us.attr,
	NULL
};

static struct kobj_type sugov_tunables_ktype = {
	.default_attrs = sugov_attributes,
	.sysfs_ops = &governor_sysfs_ops,
};

/********************** cpufreq governor interface *********************/

struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size		= sizeof(struct sched_attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_flags	= SCHED_FLAG_SUGOV,
		.sched_nice	= 0,
		.sched_priority	= 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
		 */
		.sched_runtime	=  1000000,
		.sched_deadline = 10000000,
		.sched_period	= 10000000,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	kthread_bind_mask(thread, policy->related_cpus);
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time	= 0;
	sg_policy->next_freq			= 0;
	sg_policy->work_in_progress		= false;
	sg_policy->limits_changed		= false;
	sg_policy->need_freq_update		= false;
	sg_policy->cached_raw_freq		= 0;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu			= cpu;
		sg_cpu->sg_policy		= sg_policy;
		sg_cpu->min			=
			(SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) /
			policy->cpuinfo.max_freq;
	}

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
					     policy_is_shared(policy) ?
							sugov_update_shared :
							sugov_update_single);
	}
	return 0;
}
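
/*
 * Worked example of the sg_cpu->min computation above (the frequencies are
 * hypothetical): with cpuinfo.min_freq = 500000 kHz and
 * cpuinfo.max_freq = 2000000 kHz,
 *
 *   min = (1024 * 500000) / 2000000 = 256,
 *
 * i.e. the smallest IO wait boost corresponds to a quarter of the CPU's
 * capacity, matching the ratio between the lowest and highest OPP.
 */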

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->limits_changed = true;
}

struct cpufreq_governor schedutil_gov = {
	.name			= "schedutil",
	.owner			= THIS_MODULE,
	.dynamic_switching	= true,
	.init			= sugov_init,
	.exit			= sugov_exit,
	.start			= sugov_start,
	.stop			= sugov_stop,
	.limits			= sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

static int __init sugov_register(void)
{
	return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);

#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
extern struct mutex sched_energy_mutex;

static void rebuild_sd_workfn(struct work_struct *work)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

/*
 * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 * on governor changes to make sure the scheduler knows about it.
 */
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
				  struct cpufreq_governor *old_gov)
{
	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
		/*
		 * When called from the cpufreq_register_driver() path, the
		 * cpu_hotplug_lock is already held, so use a work item to
		 * avoid nested locking in rebuild_sched_domains().
		 */
		schedule_work(&rebuild_sd_work);
	}
}
#endif