linux/kernel/sched/cpufreq_schedutil.c
// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "sched.h"

#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>

#define IOWAIT_BOOST_MIN        (SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
        struct gov_attr_set     attr_set;
        unsigned int            rate_limit_us;
};

struct sugov_policy {
        struct cpufreq_policy   *policy;

        struct sugov_tunables   *tunables;
        struct list_head        tunables_hook;

        raw_spinlock_t          update_lock;    /* For shared policies */
        u64                     last_freq_update_time;
        s64                     freq_update_delay_ns;
        unsigned int            next_freq;
        unsigned int            cached_raw_freq;

        /* The next fields are only needed if fast switch cannot be used: */
        struct                  irq_work irq_work;
        struct                  kthread_work work;
        struct                  mutex work_lock;
        struct                  kthread_worker worker;
        struct task_struct      *thread;
        bool                    work_in_progress;

        bool                    limits_changed;
        bool                    need_freq_update;
};

struct sugov_cpu {
        struct update_util_data update_util;
        struct sugov_policy     *sg_policy;
        unsigned int            cpu;

        bool                    iowait_boost_pending;
        unsigned int            iowait_boost;
        u64                     last_update;

        unsigned long           bw_dl;
        unsigned long           max;

        /* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
        unsigned long           saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
        s64 delta_ns;

        /*
         * Since cpufreq_update_util() is called with rq->lock held for
         * the @target_cpu, our per-CPU data is fully serialized.
         *
         * However, drivers cannot in general deal with cross-CPU
         * requests, so while get_next_freq() will work, the
         * sugov_fast_switch() call may not on fast switching platforms.
         *
         * Hence stop here for remote requests if they aren't supported
         * by the hardware, as calculating the frequency is pointless if
         * we cannot in fact act on it.
         *
         * For the slow switching platforms, the kthread is always scheduled on
         * the right set of CPUs and any CPU can find the next frequency and
         * schedule the kthread.
         */
        if (sg_policy->policy->fast_switch_enabled &&
            !cpufreq_this_cpu_can_update(sg_policy->policy))
                return false;

        if (unlikely(sg_policy->limits_changed)) {
                sg_policy->limits_changed = false;
                sg_policy->need_freq_update = true;
                return true;
        }

        delta_ns = time - sg_policy->last_freq_update_time;

        return delta_ns >= sg_policy->freq_update_delay_ns;
}
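
/*
 * Illustrative sketch, not part of the original file: how the rate limit
 * above plays out numerically. The helper name and the sample value of
 * 2000 us are hypothetical; freq_update_delay_ns is simply
 * rate_limit_us * NSEC_PER_USEC, so with rate_limit_us = 2000 any update
 * arriving less than 2,000,000 ns after the last frequency change is
 * dropped (unless limits_changed forces one through).
 */
static inline bool example_rate_limited(u64 now, u64 last_freq_update_time,
                                        unsigned int rate_limit_us)
{
        s64 delay_ns = (s64)rate_limit_us * NSEC_PER_USEC;

        /* true means sugov_should_update_freq() would skip this update */
        return (s64)(now - last_freq_update_time) < delay_ns;
}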

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
                                   unsigned int next_freq)
{
        if (sg_policy->next_freq == next_freq)
                return false;

        sg_policy->next_freq = next_freq;
        sg_policy->last_freq_update_time = time;

        return true;
}

static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
                              unsigned int next_freq)
{
        struct cpufreq_policy *policy = sg_policy->policy;

        if (!sugov_update_next_freq(sg_policy, time, next_freq))
                return;

        next_freq = cpufreq_driver_fast_switch(policy, next_freq);
        if (!next_freq)
                return;

        policy->cur = next_freq;
        trace_cpu_frequency(next_freq, smp_processor_id());
}

static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
                                  unsigned int next_freq)
{
        if (!sugov_update_next_freq(sg_policy, time, next_freq))
                return;

        if (!sg_policy->work_in_progress) {
                sg_policy->work_in_progress = true;
                irq_work_queue(&sg_policy->irq_work);
        }
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than the
 * raw next_freq (as calculated above) is returned, subject to policy min/max
 * and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
                                  unsigned long util, unsigned long max)
{
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned int freq = arch_scale_freq_invariant() ?
                                policy->cpuinfo.max_freq : policy->cur;

        freq = map_util_freq(util, freq, max);

        if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
                return sg_policy->next_freq;

        sg_policy->need_freq_update = false;
        sg_policy->cached_raw_freq = freq;
        return cpufreq_driver_resolve_freq(policy, freq);
}
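
/*
 * Worked example, illustrative only and not part of the original file: the
 * hypothetical helper below mirrors the mapping performed above, assuming
 * map_util_freq() computes (freq + freq/4) * util / max, i.e.
 * 1.25 * freq * util / max. With max_freq = 2000000 kHz, util = 614 and
 * max = 1024 it yields (2000000 + 500000) * 614 / 1024 = 1499023 kHz, which
 * cpufreq_driver_resolve_freq() would then round up to the next OPP the
 * driver actually supports.
 */
static inline unsigned int example_raw_freq(unsigned int max_freq,
                                            unsigned long util,
                                            unsigned long max)
{
        return (max_freq + (max_freq >> 2)) * util / max;
}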

/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * The cfs, rt and dl util numbers are tracked with the same metric over
 * synchronized windows and are thus directly comparable.
 *
 * The cfs, rt and dl utilizations are the running times measured with
 * rq->clock_task, which excludes things like IRQ and steal-time. Those are
 * accrued in the irq utilization instead.
 *
 * The DL bandwidth number, on the other hand, is not a measured metric but a
 * value computed from the task model parameters; it gives the minimal
 * utilization required to meet deadlines.
 */
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
                                 unsigned long max, enum schedutil_type type,
                                 struct task_struct *p)
{
        unsigned long dl_util, util, irq;
        struct rq *rq = cpu_rq(cpu);

        if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
            type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
                return max;
        }

        /*
         * Early check to see if IRQ/steal time saturates the CPU; that can
         * happen because of inaccuracies in how we track these -- see
         * update_irq_load_avg().
         */
        irq = cpu_util_irq(rq);
        if (unlikely(irq >= max))
                return max;

        /*
         * Because the time spent on RT/DL tasks is visible as 'lost' time to
         * CFS tasks and we use the same metric to track the effective
         * utilization (PELT windows are synchronized) we can directly add them
         * to obtain the CPU's actual utilization.
         *
         * CFS and RT utilization can be boosted or capped, depending on
         * utilization clamp constraints requested by currently RUNNABLE
         * tasks.
         * When there are no CFS RUNNABLE tasks, clamps are released and
         * frequency will be gracefully reduced with the utilization decay.
         */
        util = util_cfs + cpu_util_rt(rq);
        if (type == FREQUENCY_UTIL)
                util = uclamp_util_with(rq, util, p);

        dl_util = cpu_util_dl(rq);

        /*
         * For frequency selection we do not make cpu_util_dl() a permanent part
         * of this sum because we want to use cpu_bw_dl() later on, but we need
         * to check if the CFS+RT+DL sum is saturated (i.e. no idle time) such
         * that we select f_max when there is no idle time.
         *
         * NOTE: numerical errors or stop class might cause us to not quite hit
         * saturation when we should -- something for later.
         */
        if (util + dl_util >= max)
                return max;

        /*
         * OTOH, for energy computation we need the estimated running time, so
         * include util_dl and ignore dl_bw.
         */
        if (type == ENERGY_UTIL)
                util += dl_util;

        /*
         * There is still idle time; further improve the number by using the
         * irq metric. Because IRQ/steal time is hidden from the task clock we
         * need to scale the task numbers:
         *
         *              max - irq
         *   U' = irq + --------- * U
         *                 max
         */
        util = scale_irq_capacity(util, irq, max);
        util += irq;

        /*
         * Bandwidth required by DEADLINE must always be granted while, for
         * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
         * to gracefully reduce the frequency when no tasks show up for longer
         * periods of time.
         *
         * Ideally we would like to set bw_dl as min/guaranteed freq and util +
         * bw_dl as requested freq. However, cpufreq is not yet ready for such
         * an interface. So, we only do the latter for now.
         */
        if (type == FREQUENCY_UTIL)
                util += cpu_bw_dl(rq);

        return min(max, util);
}
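
/*
 * Worked example, illustrative only and not part of the original file: the
 * hypothetical helper below mirrors the IRQ scaling step above. With
 * max = 1024, util = 512 (cfs + rt) and irq = 128 it returns
 * 128 + (1024 - 128) * 512 / 1024 = 576; for FREQUENCY_UTIL a hypothetical
 * cpu_bw_dl() of 102 (~10% of capacity) would then be added on top, so
 * get_next_freq() would see 678.
 */
static inline unsigned long example_scale_irq(unsigned long util,
                                              unsigned long irq,
                                              unsigned long max)
{
        /* U' = irq + U * (max - irq) / max */
        return irq + util * (max - irq) / max;
}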

static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
        struct rq *rq = cpu_rq(sg_cpu->cpu);
        unsigned long util = cpu_util_cfs(rq);
        unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);

        sg_cpu->max = max;
        sg_cpu->bw_dl = cpu_bw_dl(rq);

        return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
                               bool set_iowait_boost)
{
        s64 delta_ns = time - sg_cpu->last_update;

        /* Reset boost only if a tick has elapsed since last request */
        if (delta_ns <= TICK_NSEC)
                return false;

        sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
        sg_cpu->iowait_boost_pending = set_iowait_boost;

        return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
                               unsigned int flags)
{
        bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

        /* Reset boost if the CPU appears to have been idle enough */
        if (sg_cpu->iowait_boost &&
            sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
                return;

        /* Boost only tasks waking up after IO */
        if (!set_iowait_boost)
                return;

        /* Ensure boost doubles only one time at each request */
        if (sg_cpu->iowait_boost_pending)
                return;
        sg_cpu->iowait_boost_pending = true;

        /* Double the boost at each request */
        if (sg_cpu->iowait_boost) {
                sg_cpu->iowait_boost =
                        min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
                return;
        }

        /* First wakeup after IO: start with minimum boost */
        sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @util: the utilization to (eventually) boost
 * @max: the maximum value the utilization can be boosted to
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which appears to have been idle for at least one tick also has its
 * IO boost utilization reset.
 *
 * This mechanism is designed to boost tasks that frequently wait on IO, while
 * being more conservative on tasks that do only sporadic IO operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
                                        unsigned long util, unsigned long max)
{
        unsigned long boost;

        /* No boost currently required */
        if (!sg_cpu->iowait_boost)
                return util;

        /* Reset boost if the CPU appears to have been idle enough */
        if (sugov_iowait_reset(sg_cpu, time, false))
                return util;

        if (!sg_cpu->iowait_boost_pending) {
                /*
                 * No boost pending; reduce the boost value.
                 */
                sg_cpu->iowait_boost >>= 1;
                if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
                        sg_cpu->iowait_boost = 0;
                        return util;
                }
        }

        sg_cpu->iowait_boost_pending = false;

        /*
         * @util is already in capacity scale; convert iowait_boost
         * into the same scale so we can compare.
         */
        boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
        return max(boost, util);
}
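
/*
 * Illustrative sketch, not part of the original file: a simplified model of
 * how the boost evolves (the helper name is hypothetical and it ignores the
 * iowait_boost_pending bookkeeping). With SCHED_CAPACITY_SCALE = 1024,
 * IOWAIT_BOOST_MIN is 128, so back-to-back IO wakeups double the boost
 * 128 -> 256 -> 512 -> 1024, while updates without a new IO wakeup halve it
 * 1024 -> 512 -> 256 -> 128 -> 0.
 */
static inline unsigned int example_boost_step(unsigned int boost, bool io_wakeup)
{
        if (io_wakeup) {
                if (!boost)
                        return IOWAIT_BOOST_MIN;
                return min_t(unsigned int, boost << 1, SCHED_CAPACITY_SCALE);
        }

        boost >>= 1;
        return boost < IOWAIT_BOOST_MIN ? 0 : boost;
}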

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
        unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
        bool ret = idle_calls == sg_cpu->saved_idle_calls;

        sg_cpu->saved_idle_calls = idle_calls;
        return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
        if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
                sg_policy->limits_changed = true;
}

static void sugov_update_single(struct update_util_data *hook, u64 time,
                                unsigned int flags)
{
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        unsigned long util, max;
        unsigned int next_f;
        bool busy;

        sugov_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;

        ignore_dl_rate_limit(sg_cpu, sg_policy);

        if (!sugov_should_update_freq(sg_policy, time))
                return;

        /* Limits may have changed, don't skip frequency update */
        busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);

        util = sugov_get_util(sg_cpu);
        max = sg_cpu->max;
        util = sugov_iowait_apply(sg_cpu, time, util, max);
        next_f = get_next_freq(sg_policy, util, max);
        /*
         * Do not reduce the frequency if the CPU has not been idle
         * recently, as the reduction is likely to be premature then.
         */
        if (busy && next_f < sg_policy->next_freq) {
                next_f = sg_policy->next_freq;

                /* Reset cached freq as next_freq has changed */
                sg_policy->cached_raw_freq = 0;
        }

        /*
         * This code runs under rq->lock for the target CPU, so it won't run
         * concurrently on two different CPUs for the same target and it is not
         * necessary to acquire the lock in the fast switch case.
         */
        if (sg_policy->policy->fast_switch_enabled) {
                sugov_fast_switch(sg_policy, time, next_f);
        } else {
                raw_spin_lock(&sg_policy->update_lock);
                sugov_deferred_update(sg_policy, time, next_f);
                raw_spin_unlock(&sg_policy->update_lock);
        }
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned long util = 0, max = 1;
        unsigned int j;

        for_each_cpu(j, policy->cpus) {
                struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
                unsigned long j_util, j_max;

                j_util = sugov_get_util(j_sg_cpu);
                j_max = j_sg_cpu->max;
                j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);

                if (j_util * max > j_max * util) {
                        util = j_util;
                        max = j_max;
                }
        }

        return get_next_freq(sg_policy, util, max);
}
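
/*
 * Illustrative note, not part of the original file: the comparison above
 * picks the CPU with the highest util/max ratio without dividing, by
 * cross-multiplying. The hypothetical helper below makes that explicit:
 * a CPU with util = 300 on max = 512 (~0.59) wins over util = 400 on
 * max = 1024 (~0.39) because 300 * 1024 > 400 * 512.
 */
static inline bool example_ratio_gt(unsigned long a_util, unsigned long a_max,
                                    unsigned long b_util, unsigned long b_max)
{
        /* a_util / a_max > b_util / b_max, avoiding integer division */
        return a_util * b_max > b_util * a_max;
}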

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
        struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        unsigned int next_f;

        raw_spin_lock(&sg_policy->update_lock);

        sugov_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;

        ignore_dl_rate_limit(sg_cpu, sg_policy);

        if (sugov_should_update_freq(sg_policy, time)) {
                next_f = sugov_next_freq_shared(sg_cpu, time);

                if (sg_policy->policy->fast_switch_enabled)
                        sugov_fast_switch(sg_policy, time, next_f);
                else
                        sugov_deferred_update(sg_policy, time, next_f);
        }

        raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
        struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
        unsigned int freq;
        unsigned long flags;

        /*
         * Hold sg_policy->update_lock briefly to handle the case where:
         * if sg_policy->next_freq is read here and then updated by
         * sugov_deferred_update() just before work_in_progress is set to false
         * here, we could miss queueing the new update.
         *
         * Note: If a work item was queued after the update_lock is released,
         * sugov_work() will just be called again by the kthread_work code, and
         * the request will be processed before the sugov thread sleeps.
         */
        raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
        freq = sg_policy->next_freq;
        sg_policy->work_in_progress = false;
        raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

        mutex_lock(&sg_policy->work_lock);
        __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
        mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
        struct sugov_policy *sg_policy;

        sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

        kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
        return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

        return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
        struct sugov_policy *sg_policy;
        unsigned int rate_limit_us;

        if (kstrtouint(buf, 10, &rate_limit_us))
                return -EINVAL;

        tunables->rate_limit_us = rate_limit_us;

        list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
                sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

        return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
        &rate_limit_us.attr,
        NULL
};
ATTRIBUTE_GROUPS(sugov);

static struct kobj_type sugov_tunables_ktype = {
        .default_groups = sugov_groups,
        .sysfs_ops = &governor_sysfs_ops,
};
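
/*
 * Example of adjusting the tunable from user space (illustrative only, not
 * part of the original file). The exact sysfs path depends on
 * have_governor_per_policy(); it is typically
 * /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us for
 * per-policy tunables, or .../cpufreq/schedutil/rate_limit_us for a single
 * global set. Guarded by #if 0 because it is a userspace sketch, not kernel
 * code.
 */
#if 0
#include <stdio.h>

int main(void)
{
        const char *path =
                "/sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us";
        FILE *f = fopen(path, "w");

        if (!f)
                return 1;
        /* allow at most one frequency change every 2 ms on this policy */
        fprintf(f, "2000\n");
        return fclose(f) ? 1 : 0;
}
#endif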

/********************** cpufreq governor interface *********************/

struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy;

        sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
        if (!sg_policy)
                return NULL;

        sg_policy->policy = policy;
        raw_spin_lock_init(&sg_policy->update_lock);
        return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
        kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
        struct task_struct *thread;
        struct sched_attr attr = {
                .size           = sizeof(struct sched_attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_flags    = SCHED_FLAG_SUGOV,
                .sched_nice     = 0,
                .sched_priority = 0,
                /*
                 * Fake (unused) bandwidth; workaround to "fix"
                 * priority inheritance.
                 */
                .sched_runtime  =  1000000,
                .sched_deadline = 10000000,
                .sched_period   = 10000000,
        };
        struct cpufreq_policy *policy = sg_policy->policy;
        int ret;

        /* kthread only required for slow path */
        if (policy->fast_switch_enabled)
                return 0;

        kthread_init_work(&sg_policy->work, sugov_work);
        kthread_init_worker(&sg_policy->worker);
        thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
                                "sugov:%d",
                                cpumask_first(policy->related_cpus));
        if (IS_ERR(thread)) {
                pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
                return PTR_ERR(thread);
        }

        ret = sched_setattr_nocheck(thread, &attr);
        if (ret) {
                kthread_stop(thread);
                pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
                return ret;
        }

        sg_policy->thread = thread;
        kthread_bind_mask(thread, policy->related_cpus);
        init_irq_work(&sg_policy->irq_work, sugov_irq_work);
        mutex_init(&sg_policy->work_lock);

        wake_up_process(thread);

        return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
        /* kthread only required for slow path */
        if (sg_policy->policy->fast_switch_enabled)
                return;

        kthread_flush_worker(&sg_policy->worker);
        kthread_stop(sg_policy->thread);
        mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
        struct sugov_tunables *tunables;

        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
        if (tunables) {
                gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
                if (!have_governor_per_policy())
                        global_tunables = tunables;
        }
        return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
        if (!have_governor_per_policy())
                global_tunables = NULL;

        kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy;
        struct sugov_tunables *tunables;
        int ret = 0;

        /* State should be equivalent to EXIT */
        if (policy->governor_data)
                return -EBUSY;

        cpufreq_enable_fast_switch(policy);

        sg_policy = sugov_policy_alloc(policy);
        if (!sg_policy) {
                ret = -ENOMEM;
                goto disable_fast_switch;
        }

        ret = sugov_kthread_create(sg_policy);
        if (ret)
                goto free_sg_policy;

        mutex_lock(&global_tunables_lock);

        if (global_tunables) {
                if (WARN_ON(have_governor_per_policy())) {
                        ret = -EINVAL;
                        goto stop_kthread;
                }
                policy->governor_data = sg_policy;
                sg_policy->tunables = global_tunables;

                gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
                goto out;
        }

        tunables = sugov_tunables_alloc(sg_policy);
        if (!tunables) {
                ret = -ENOMEM;
                goto stop_kthread;
        }

        tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

        policy->governor_data = sg_policy;
        sg_policy->tunables = tunables;

        ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
                                   get_governor_parent_kobj(policy), "%s",
                                   schedutil_gov.name);
        if (ret)
                goto fail;

out:
        mutex_unlock(&global_tunables_lock);
        return 0;

fail:
        kobject_put(&tunables->attr_set.kobj);
        policy->governor_data = NULL;
        sugov_tunables_free(tunables);

stop_kthread:
        sugov_kthread_stop(sg_policy);
        mutex_unlock(&global_tunables_lock);

free_sg_policy:
        sugov_policy_free(sg_policy);

disable_fast_switch:
        cpufreq_disable_fast_switch(policy);

        pr_err("initialization failed (error %d)\n", ret);
        return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        struct sugov_tunables *tunables = sg_policy->tunables;
        unsigned int count;

        mutex_lock(&global_tunables_lock);

        count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
        policy->governor_data = NULL;
        if (!count)
                sugov_tunables_free(tunables);

        mutex_unlock(&global_tunables_lock);

        sugov_kthread_stop(sg_policy);
        sugov_policy_free(sg_policy);
        cpufreq_disable_fast_switch(policy);
}

static int sugov_start(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        unsigned int cpu;

        sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
        sg_policy->last_freq_update_time        = 0;
        sg_policy->next_freq                    = 0;
        sg_policy->work_in_progress             = false;
        sg_policy->limits_changed               = false;
        sg_policy->need_freq_update             = false;
        sg_policy->cached_raw_freq              = 0;

        for_each_cpu(cpu, policy->cpus) {
                struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

                memset(sg_cpu, 0, sizeof(*sg_cpu));
                sg_cpu->cpu                     = cpu;
                sg_cpu->sg_policy               = sg_policy;
        }

        for_each_cpu(cpu, policy->cpus) {
                struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

                cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
                                             policy_is_shared(policy) ?
                                                        sugov_update_shared :
                                                        sugov_update_single);
        }
        return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;
        unsigned int cpu;

        for_each_cpu(cpu, policy->cpus)
                cpufreq_remove_update_util_hook(cpu);

        synchronize_rcu();

        if (!policy->fast_switch_enabled) {
                irq_work_sync(&sg_policy->irq_work);
                kthread_cancel_work_sync(&sg_policy->work);
        }
}

static void sugov_limits(struct cpufreq_policy *policy)
{
        struct sugov_policy *sg_policy = policy->governor_data;

        if (!policy->fast_switch_enabled) {
                mutex_lock(&sg_policy->work_lock);
                cpufreq_policy_apply_limits(policy);
                mutex_unlock(&sg_policy->work_lock);
        }

        sg_policy->limits_changed = true;
}

struct cpufreq_governor schedutil_gov = {
        .name                   = "schedutil",
        .owner                  = THIS_MODULE,
        .dynamic_switching      = true,
        .init                   = sugov_init,
        .exit                   = sugov_exit,
        .start                  = sugov_start,
        .stop                   = sugov_stop,
        .limits                 = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
        return &schedutil_gov;
}
#endif

static int __init sugov_register(void)
{
        return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);

#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
extern struct mutex sched_energy_mutex;

static void rebuild_sd_workfn(struct work_struct *work)
{
        mutex_lock(&sched_energy_mutex);
        sched_energy_update = true;
        rebuild_sched_domains();
        sched_energy_update = false;
        mutex_unlock(&sched_energy_mutex);
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

/*
 * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
 * on governor changes to make sure the scheduler knows about it.
 */
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
                                  struct cpufreq_governor *old_gov)
{
        if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
                /*
                 * When called from the cpufreq_register_driver() path, the
                 * cpu_hotplug_lock is already held, so use a work item to
                 * avoid nested locking in rebuild_sched_domains().
                 */
                schedule_work(&rebuild_sd_work);
        }
}
#endif