linux/kernel/sched/sched.h
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0 */
   2/*
   3 * Scheduler internal types and methods:
   4 */
   5#include <linux/sched.h>
   6
   7#include <linux/sched/autogroup.h>
   8#include <linux/sched/clock.h>
   9#include <linux/sched/coredump.h>
  10#include <linux/sched/cpufreq.h>
  11#include <linux/sched/cputime.h>
  12#include <linux/sched/deadline.h>
  13#include <linux/sched/debug.h>
  14#include <linux/sched/hotplug.h>
  15#include <linux/sched/idle.h>
  16#include <linux/sched/init.h>
  17#include <linux/sched/isolation.h>
  18#include <linux/sched/jobctl.h>
  19#include <linux/sched/loadavg.h>
  20#include <linux/sched/mm.h>
  21#include <linux/sched/nohz.h>
  22#include <linux/sched/numa_balancing.h>
  23#include <linux/sched/prio.h>
  24#include <linux/sched/rt.h>
  25#include <linux/sched/signal.h>
  26#include <linux/sched/smt.h>
  27#include <linux/sched/stat.h>
  28#include <linux/sched/sysctl.h>
  29#include <linux/sched/task.h>
  30#include <linux/sched/task_stack.h>
  31#include <linux/sched/topology.h>
  32#include <linux/sched/user.h>
  33#include <linux/sched/wake_q.h>
  34#include <linux/sched/xacct.h>
  35
  36#include <uapi/linux/sched/types.h>
  37
  38#include <linux/binfmts.h>
  39#include <linux/blkdev.h>
  40#include <linux/compat.h>
  41#include <linux/context_tracking.h>
  42#include <linux/cpufreq.h>
  43#include <linux/cpuidle.h>
  44#include <linux/cpuset.h>
  45#include <linux/ctype.h>
  46#include <linux/debugfs.h>
  47#include <linux/delayacct.h>
  48#include <linux/energy_model.h>
  49#include <linux/init_task.h>
  50#include <linux/kprobes.h>
  51#include <linux/kthread.h>
  52#include <linux/membarrier.h>
  53#include <linux/migrate.h>
  54#include <linux/mmu_context.h>
  55#include <linux/nmi.h>
  56#include <linux/proc_fs.h>
  57#include <linux/prefetch.h>
  58#include <linux/profile.h>
  59#include <linux/psi.h>
  60#include <linux/rcupdate_wait.h>
  61#include <linux/security.h>
  62#include <linux/stop_machine.h>
  63#include <linux/suspend.h>
  64#include <linux/swait.h>
  65#include <linux/syscalls.h>
  66#include <linux/task_work.h>
  67#include <linux/tsacct_kern.h>
  68
  69#include <asm/tlb.h>
  70
  71#ifdef CONFIG_PARAVIRT
  72# include <asm/paravirt.h>
  73#endif
  74
  75#include "cpupri.h"
  76#include "cpudeadline.h"
  77
  78#ifdef CONFIG_SCHED_DEBUG
  79# define SCHED_WARN_ON(x)       WARN_ONCE(x, #x)
  80#else
  81# define SCHED_WARN_ON(x)       ({ (void)(x), 0; })
  82#endif
  83
  84struct rq;
  85struct cpuidle_state;
  86
  87/* task_struct::on_rq states: */
  88#define TASK_ON_RQ_QUEUED       1
  89#define TASK_ON_RQ_MIGRATING    2
  90
  91extern __read_mostly int scheduler_running;
  92
  93extern unsigned long calc_load_update;
  94extern atomic_long_t calc_load_tasks;
  95
  96extern void calc_global_load_tick(struct rq *this_rq);
  97extern long calc_load_fold_active(struct rq *this_rq, long adjust);
  98
  99/*
 100 * Helpers for converting nanosecond timing to jiffy resolution
 101 */
 102#define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 103
 104/*
 105 * Increase resolution of nice-level calculations for 64-bit architectures.
 106 * The extra resolution improves shares distribution and load balancing of
 107 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
 108 * hierarchies, especially on larger systems. This is not a user-visible change
 109 * and does not change the user-interface for setting shares/weights.
 110 *
 111 * We increase resolution only if we have enough bits to allow this increased
 112 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
 113 * are pretty high and the returns do not justify the increased costs.
 114 *
 115 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
 116 * increase coverage and consistency always enable it on 64-bit platforms.
 117 */
 118#ifdef CONFIG_64BIT
 119# define NICE_0_LOAD_SHIFT      (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
 120# define scale_load(w)          ((w) << SCHED_FIXEDPOINT_SHIFT)
 121# define scale_load_down(w) \
 122({ \
 123        unsigned long __w = (w); \
 124        if (__w) \
 125                __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
 126        __w; \
 127})
 128#else
 129# define NICE_0_LOAD_SHIFT      (SCHED_FIXEDPOINT_SHIFT)
 130# define scale_load(w)          (w)
 131# define scale_load_down(w)     (w)
 132#endif
 133
 134/*
 135 * Task weight (visible to users) and its load (invisible to users) have
 136 * independent resolution, but they should be well calibrated. We use
 137 * scale_load() and scale_load_down(w) to convert between them. The
 138 * following must be true:
 139 *
 140 *  scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
 141 *
 142 */
 143#define NICE_0_LOAD             (1L << NICE_0_LOAD_SHIFT)
 144
 145/*
 146 * Single value that decides SCHED_DEADLINE internal math precision.
 147 * 10 -> just above 1us
 148 * 9  -> just above 0.5us
 149 */
 150#define DL_SCALE                10
 151
 152/*
 153 * Single value that denotes runtime == period, ie unlimited time.
 154 */
 155#define RUNTIME_INF             ((u64)~0ULL)
 156
 157static inline int idle_policy(int policy)
 158{
 159        return policy == SCHED_IDLE;
 160}
 161static inline int fair_policy(int policy)
 162{
 163        return policy == SCHED_NORMAL || policy == SCHED_BATCH;
 164}
 165
 166static inline int rt_policy(int policy)
 167{
 168        return policy == SCHED_FIFO || policy == SCHED_RR;
 169}
 170
 171static inline int dl_policy(int policy)
 172{
 173        return policy == SCHED_DEADLINE;
 174}
 175static inline bool valid_policy(int policy)
 176{
 177        return idle_policy(policy) || fair_policy(policy) ||
 178                rt_policy(policy) || dl_policy(policy);
 179}
 180
 181static inline int task_has_idle_policy(struct task_struct *p)
 182{
 183        return idle_policy(p->policy);
 184}
 185
 186static inline int task_has_rt_policy(struct task_struct *p)
 187{
 188        return rt_policy(p->policy);
 189}
 190
 191static inline int task_has_dl_policy(struct task_struct *p)
 192{
 193        return dl_policy(p->policy);
 194}
 195
 196#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 197
 198static inline void update_avg(u64 *avg, u64 sample)
 199{
 200        s64 diff = sample - *avg;
 201        *avg += diff / 8;
 202}
 203
 204/*
 205 * !! For sched_setattr_nocheck() (kernel) only !!
 206 *
 207 * This is actually gross. :(
 208 *
 209 * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
 210 * tasks, but still be able to sleep. We need this on platforms that cannot
 211 * atomically change clock frequency. Remove once fast switching will be
 212 * available on such platforms.
 213 *
 214 * SUGOV stands for SchedUtil GOVernor.
 215 */
 216#define SCHED_FLAG_SUGOV        0x10000000
 217
 218static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
 219{
 220#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
 221        return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
 222#else
 223        return false;
 224#endif
 225}
 226
 227/*
 228 * Tells if entity @a should preempt entity @b.
 229 */
 230static inline bool
 231dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
 232{
 233        return dl_entity_is_special(a) ||
 234               dl_time_before(a->deadline, b->deadline);
 235}
 236
 237/*
 238 * This is the priority-queue data structure of the RT scheduling class:
 239 */
 240struct rt_prio_array {
 241        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 242        struct list_head queue[MAX_RT_PRIO];
 243};
 244
 245struct rt_bandwidth {
 246        /* nests inside the rq lock: */
 247        raw_spinlock_t          rt_runtime_lock;
 248        ktime_t                 rt_period;
 249        u64                     rt_runtime;
 250        struct hrtimer          rt_period_timer;
 251        unsigned int            rt_period_active;
 252};
 253
 254void __dl_clear_params(struct task_struct *p);
 255
 256/*
 257 * To keep the bandwidth of -deadline tasks and groups under control
 258 * we need some place where:
 259 *  - store the maximum -deadline bandwidth of the system (the group);
 260 *  - cache the fraction of that bandwidth that is currently allocated.
 261 *
 262 * This is all done in the data structure below. It is similar to the
 263 * one used for RT-throttling (rt_bandwidth), with the main difference
 264 * that, since here we are only interested in admission control, we
 265 * do not decrease any runtime while the group "executes", neither we
 266 * need a timer to replenish it.
 267 *
 268 * With respect to SMP, the bandwidth is given on a per-CPU basis,
 269 * meaning that:
 270 *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
 271 *  - dl_total_bw array contains, in the i-eth element, the currently
 272 *    allocated bandwidth on the i-eth CPU.
 273 * Moreover, groups consume bandwidth on each CPU, while tasks only
 274 * consume bandwidth on the CPU they're running on.
 275 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
 276 * that will be shown the next time the proc or cgroup controls will
 277 * be red. It on its turn can be changed by writing on its own
 278 * control.
 279 */
 280struct dl_bandwidth {
 281        raw_spinlock_t          dl_runtime_lock;
 282        u64                     dl_runtime;
 283        u64                     dl_period;
 284};
 285
 286static inline int dl_bandwidth_enabled(void)
 287{
 288        return sysctl_sched_rt_runtime >= 0;
 289}
 290
 291struct dl_bw {
 292        raw_spinlock_t          lock;
 293        u64                     bw;
 294        u64                     total_bw;
 295};
 296
 297static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
 298
 299static inline
 300void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 301{
 302        dl_b->total_bw -= tsk_bw;
 303        __dl_update(dl_b, (s32)tsk_bw / cpus);
 304}
 305
 306static inline
 307void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 308{
 309        dl_b->total_bw += tsk_bw;
 310        __dl_update(dl_b, -((s32)tsk_bw / cpus));
 311}
 312
 313static inline
 314bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 315{
 316        return dl_b->bw != -1 &&
 317               dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 318}
 319
 320extern void init_dl_bw(struct dl_bw *dl_b);
 321extern int  sched_dl_global_validate(void);
 322extern void sched_dl_do_global(void);
 323extern int  sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
 324extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 325extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 326extern bool __checkparam_dl(const struct sched_attr *attr);
 327extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 328extern int  dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
 329extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
 330extern bool dl_cpu_busy(unsigned int cpu);
 331
 332#ifdef CONFIG_CGROUP_SCHED
 333
 334#include <linux/cgroup.h>
 335#include <linux/psi.h>
 336
 337struct cfs_rq;
 338struct rt_rq;
 339
 340extern struct list_head task_groups;
 341
 342struct cfs_bandwidth {
 343#ifdef CONFIG_CFS_BANDWIDTH
 344        raw_spinlock_t          lock;
 345        ktime_t                 period;
 346        u64                     quota;
 347        u64                     runtime;
 348        s64                     hierarchical_quota;
 349
 350        u8                      idle;
 351        u8                      period_active;
 352        u8                      distribute_running;
 353        u8                      slack_started;
 354        struct hrtimer          period_timer;
 355        struct hrtimer          slack_timer;
 356        struct list_head        throttled_cfs_rq;
 357
 358        /* Statistics: */
 359        int                     nr_periods;
 360        int                     nr_throttled;
 361        u64                     throttled_time;
 362#endif
 363};
 364
 365/* Task group related information */
 366struct task_group {
 367        struct cgroup_subsys_state css;
 368
 369#ifdef CONFIG_FAIR_GROUP_SCHED
 370        /* schedulable entities of this group on each CPU */
 371        struct sched_entity     **se;
 372        /* runqueue "owned" by this group on each CPU */
 373        struct cfs_rq           **cfs_rq;
 374        unsigned long           shares;
 375
 376#ifdef  CONFIG_SMP
 377        /*
 378         * load_avg can be heavily contended at clock tick time, so put
 379         * it in its own cacheline separated from the fields above which
 380         * will also be accessed at each tick.
 381         */
 382        atomic_long_t           load_avg ____cacheline_aligned;
 383#endif
 384#endif
 385
 386#ifdef CONFIG_RT_GROUP_SCHED
 387        struct sched_rt_entity  **rt_se;
 388        struct rt_rq            **rt_rq;
 389
 390        struct rt_bandwidth     rt_bandwidth;
 391#endif
 392
 393        struct rcu_head         rcu;
 394        struct list_head        list;
 395
 396        struct task_group       *parent;
 397        struct list_head        siblings;
 398        struct list_head        children;
 399
 400#ifdef CONFIG_SCHED_AUTOGROUP
 401        struct autogroup        *autogroup;
 402#endif
 403
 404        struct cfs_bandwidth    cfs_bandwidth;
 405
 406#ifdef CONFIG_UCLAMP_TASK_GROUP
 407        /* The two decimal precision [%] value requested from user-space */
 408        unsigned int            uclamp_pct[UCLAMP_CNT];
 409        /* Clamp values requested for a task group */
 410        struct uclamp_se        uclamp_req[UCLAMP_CNT];
 411        /* Effective clamp values used for a task group */
 412        struct uclamp_se        uclamp[UCLAMP_CNT];
 413#endif
 414
 415};
 416
 417#ifdef CONFIG_FAIR_GROUP_SCHED
 418#define ROOT_TASK_GROUP_LOAD    NICE_0_LOAD
 419
 420/*
 421 * A weight of 0 or 1 can cause arithmetics problems.
 422 * A weight of a cfs_rq is the sum of weights of which entities
 423 * are queued on this cfs_rq, so a weight of a entity should not be
 424 * too large, so as the shares value of a task group.
 425 * (The default weight is 1024 - so there's no practical
 426 *  limitation from this.)
 427 */
 428#define MIN_SHARES              (1UL <<  1)
 429#define MAX_SHARES              (1UL << 18)
 430#endif
 431
 432typedef int (*tg_visitor)(struct task_group *, void *);
 433
 434extern int walk_tg_tree_from(struct task_group *from,
 435                             tg_visitor down, tg_visitor up, void *data);
 436
 437/*
 438 * Iterate the full tree, calling @down when first entering a node and @up when
 439 * leaving it for the final time.
 440 *
 441 * Caller must hold rcu_lock or sufficient equivalent.
 442 */
 443static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 444{
 445        return walk_tg_tree_from(&root_task_group, down, up, data);
 446}
 447
 448extern int tg_nop(struct task_group *tg, void *data);
 449
 450extern void free_fair_sched_group(struct task_group *tg);
 451extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
 452extern void online_fair_sched_group(struct task_group *tg);
 453extern void unregister_fair_sched_group(struct task_group *tg);
 454extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 455                        struct sched_entity *se, int cpu,
 456                        struct sched_entity *parent);
 457extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 458
 459extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 460extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 461extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 462
 463extern void free_rt_sched_group(struct task_group *tg);
 464extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
 465extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 466                struct sched_rt_entity *rt_se, int cpu,
 467                struct sched_rt_entity *parent);
 468extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
 469extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
 470extern long sched_group_rt_runtime(struct task_group *tg);
 471extern long sched_group_rt_period(struct task_group *tg);
 472extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 473
 474extern struct task_group *sched_create_group(struct task_group *parent);
 475extern void sched_online_group(struct task_group *tg,
 476                               struct task_group *parent);
 477extern void sched_destroy_group(struct task_group *tg);
 478extern void sched_offline_group(struct task_group *tg);
 479
 480extern void sched_move_task(struct task_struct *tsk);
 481
 482#ifdef CONFIG_FAIR_GROUP_SCHED
 483extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 484
 485#ifdef CONFIG_SMP
 486extern void set_task_rq_fair(struct sched_entity *se,
 487                             struct cfs_rq *prev, struct cfs_rq *next);
 488#else /* !CONFIG_SMP */
 489static inline void set_task_rq_fair(struct sched_entity *se,
 490                             struct cfs_rq *prev, struct cfs_rq *next) { }
 491#endif /* CONFIG_SMP */
 492#endif /* CONFIG_FAIR_GROUP_SCHED */
 493
 494#else /* CONFIG_CGROUP_SCHED */
 495
 496struct cfs_bandwidth { };
 497
 498#endif  /* CONFIG_CGROUP_SCHED */
 499
 500/* CFS-related fields in a runqueue */
 501struct cfs_rq {
 502        struct load_weight      load;
 503        unsigned int            nr_running;
 504        unsigned int            h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
 505        unsigned int            idle_h_nr_running; /* SCHED_IDLE */
 506
 507        u64                     exec_clock;
 508        u64                     min_vruntime;
 509#ifndef CONFIG_64BIT
 510        u64                     min_vruntime_copy;
 511#endif
 512
 513        struct rb_root_cached   tasks_timeline;
 514
 515        /*
 516         * 'curr' points to currently running entity on this cfs_rq.
 517         * It is set to NULL otherwise (i.e when none are currently running).
 518         */
 519        struct sched_entity     *curr;
 520        struct sched_entity     *next;
 521        struct sched_entity     *last;
 522        struct sched_entity     *skip;
 523
 524#ifdef  CONFIG_SCHED_DEBUG
 525        unsigned int            nr_spread_over;
 526#endif
 527
 528#ifdef CONFIG_SMP
 529        /*
 530         * CFS load tracking
 531         */
 532        struct sched_avg        avg;
 533#ifndef CONFIG_64BIT
 534        u64                     load_last_update_time_copy;
 535#endif
 536        struct {
 537                raw_spinlock_t  lock ____cacheline_aligned;
 538                int             nr;
 539                unsigned long   load_avg;
 540                unsigned long   util_avg;
 541                unsigned long   runnable_avg;
 542        } removed;
 543
 544#ifdef CONFIG_FAIR_GROUP_SCHED
 545        unsigned long           tg_load_avg_contrib;
 546        long                    propagate;
 547        long                    prop_runnable_sum;
 548
 549        /*
 550         *   h_load = weight * f(tg)
 551         *
 552         * Where f(tg) is the recursive weight fraction assigned to
 553         * this group.
 554         */
 555        unsigned long           h_load;
 556        u64                     last_h_load_update;
 557        struct sched_entity     *h_load_next;
 558#endif /* CONFIG_FAIR_GROUP_SCHED */
 559#endif /* CONFIG_SMP */
 560
 561#ifdef CONFIG_FAIR_GROUP_SCHED
 562        struct rq               *rq;    /* CPU runqueue to which this cfs_rq is attached */
 563
 564        /*
 565         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 566         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 567         * (like users, containers etc.)
 568         *
 569         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
 570         * This list is used during load balance.
 571         */
 572        int                     on_list;
 573        struct list_head        leaf_cfs_rq_list;
 574        struct task_group       *tg;    /* group that "owns" this runqueue */
 575
 576#ifdef CONFIG_CFS_BANDWIDTH
 577        int                     runtime_enabled;
 578        s64                     runtime_remaining;
 579
 580        u64                     throttled_clock;
 581        u64                     throttled_clock_task;
 582        u64                     throttled_clock_task_time;
 583        int                     throttled;
 584        int                     throttle_count;
 585        struct list_head        throttled_list;
 586#endif /* CONFIG_CFS_BANDWIDTH */
 587#endif /* CONFIG_FAIR_GROUP_SCHED */
 588};
 589
 590static inline int rt_bandwidth_enabled(void)
 591{
 592        return sysctl_sched_rt_runtime >= 0;
 593}
 594
 595/* RT IPI pull logic requires IRQ_WORK */
 596#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
 597# define HAVE_RT_PUSH_IPI
 598#endif
 599
 600/* Real-Time classes' related field in a runqueue: */
 601struct rt_rq {
 602        struct rt_prio_array    active;
 603        unsigned int            rt_nr_running;
 604        unsigned int            rr_nr_running;
 605#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 606        struct {
 607                int             curr; /* highest queued rt task prio */
 608#ifdef CONFIG_SMP
 609                int             next; /* next highest */
 610#endif
 611        } highest_prio;
 612#endif
 613#ifdef CONFIG_SMP
 614        unsigned long           rt_nr_migratory;
 615        unsigned long           rt_nr_total;
 616        int                     overloaded;
 617        struct plist_head       pushable_tasks;
 618
 619#endif /* CONFIG_SMP */
 620        int                     rt_queued;
 621
 622        int                     rt_throttled;
 623        u64                     rt_time;
 624        u64                     rt_runtime;
 625        /* Nests inside the rq lock: */
 626        raw_spinlock_t          rt_runtime_lock;
 627
 628#ifdef CONFIG_RT_GROUP_SCHED
 629        unsigned long           rt_nr_boosted;
 630
 631        struct rq               *rq;
 632        struct task_group       *tg;
 633#endif
 634};
 635
 636static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
 637{
 638        return rt_rq->rt_queued && rt_rq->rt_nr_running;
 639}
 640
 641/* Deadline class' related fields in a runqueue */
 642struct dl_rq {
 643        /* runqueue is an rbtree, ordered by deadline */
 644        struct rb_root_cached   root;
 645
 646        unsigned long           dl_nr_running;
 647
 648#ifdef CONFIG_SMP
 649        /*
 650         * Deadline values of the currently executing and the
 651         * earliest ready task on this rq. Caching these facilitates
 652         * the decision whether or not a ready but not running task
 653         * should migrate somewhere else.
 654         */
 655        struct {
 656                u64             curr;
 657                u64             next;
 658        } earliest_dl;
 659
 660        unsigned long           dl_nr_migratory;
 661        int                     overloaded;
 662
 663        /*
 664         * Tasks on this rq that can be pushed away. They are kept in
 665         * an rb-tree, ordered by tasks' deadlines, with caching
 666         * of the leftmost (earliest deadline) element.
 667         */
 668        struct rb_root_cached   pushable_dl_tasks_root;
 669#else
 670        struct dl_bw            dl_bw;
 671#endif
 672        /*
 673         * "Active utilization" for this runqueue: increased when a
 674         * task wakes up (becomes TASK_RUNNING) and decreased when a
 675         * task blocks
 676         */
 677        u64                     running_bw;
 678
 679        /*
 680         * Utilization of the tasks "assigned" to this runqueue (including
 681         * the tasks that are in runqueue and the tasks that executed on this
 682         * CPU and blocked). Increased when a task moves to this runqueue, and
 683         * decreased when the task moves away (migrates, changes scheduling
 684         * policy, or terminates).
 685         * This is needed to compute the "inactive utilization" for the
 686         * runqueue (inactive utilization = this_bw - running_bw).
 687         */
 688        u64                     this_bw;
 689        u64                     extra_bw;
 690
 691        /*
 692         * Inverse of the fraction of CPU utilization that can be reclaimed
 693         * by the GRUB algorithm.
 694         */
 695        u64                     bw_ratio;
 696};
 697
 698#ifdef CONFIG_FAIR_GROUP_SCHED
 699/* An entity is a task if it doesn't "own" a runqueue */
 700#define entity_is_task(se)      (!se->my_q)
 701
 702static inline void se_update_runnable(struct sched_entity *se)
 703{
 704        if (!entity_is_task(se))
 705                se->runnable_weight = se->my_q->h_nr_running;
 706}
 707
 708static inline long se_runnable(struct sched_entity *se)
 709{
 710        if (entity_is_task(se))
 711                return !!se->on_rq;
 712        else
 713                return se->runnable_weight;
 714}
 715
 716#else
 717#define entity_is_task(se)      1
 718
 719static inline void se_update_runnable(struct sched_entity *se) {}
 720
 721static inline long se_runnable(struct sched_entity *se)
 722{
 723        return !!se->on_rq;
 724}
 725#endif
 726
 727#ifdef CONFIG_SMP
 728/*
 729 * XXX we want to get rid of these helpers and use the full load resolution.
 730 */
 731static inline long se_weight(struct sched_entity *se)
 732{
 733        return scale_load_down(se->load.weight);
 734}
 735
 736
 737static inline bool sched_asym_prefer(int a, int b)
 738{
 739        return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
 740}
 741
 742struct perf_domain {
 743        struct em_perf_domain *em_pd;
 744        struct perf_domain *next;
 745        struct rcu_head rcu;
 746};
 747
 748/* Scheduling group status flags */
 749#define SG_OVERLOAD             0x1 /* More than one runnable task on a CPU. */
 750#define SG_OVERUTILIZED         0x2 /* One or more CPUs are over-utilized. */
 751
 752/*
 753 * We add the notion of a root-domain which will be used to define per-domain
 754 * variables. Each exclusive cpuset essentially defines an island domain by
 755 * fully partitioning the member CPUs from any other cpuset. Whenever a new
 756 * exclusive cpuset is created, we also create and attach a new root-domain
 757 * object.
 758 *
 759 */
 760struct root_domain {
 761        atomic_t                refcount;
 762        atomic_t                rto_count;
 763        struct rcu_head         rcu;
 764        cpumask_var_t           span;
 765        cpumask_var_t           online;
 766
 767        /*
 768         * Indicate pullable load on at least one CPU, e.g:
 769         * - More than one runnable task
 770         * - Running task is misfit
 771         */
 772        int                     overload;
 773
 774        /* Indicate one or more cpus over-utilized (tipping point) */
 775        int                     overutilized;
 776
 777        /*
 778         * The bit corresponding to a CPU gets set here if such CPU has more
 779         * than one runnable -deadline task (as it is below for RT tasks).
 780         */
 781        cpumask_var_t           dlo_mask;
 782        atomic_t                dlo_count;
 783        struct dl_bw            dl_bw;
 784        struct cpudl            cpudl;
 785
 786#ifdef HAVE_RT_PUSH_IPI
 787        /*
 788         * For IPI pull requests, loop across the rto_mask.
 789         */
 790        struct irq_work         rto_push_work;
 791        raw_spinlock_t          rto_lock;
 792        /* These are only updated and read within rto_lock */
 793        int                     rto_loop;
 794        int                     rto_cpu;
 795        /* These atomics are updated outside of a lock */
 796        atomic_t                rto_loop_next;
 797        atomic_t                rto_loop_start;
 798#endif
 799        /*
 800         * The "RT overload" flag: it gets set if a CPU has more than
 801         * one runnable RT task.
 802         */
 803        cpumask_var_t           rto_mask;
 804        struct cpupri           cpupri;
 805
 806        unsigned long           max_cpu_capacity;
 807
 808        /*
 809         * NULL-terminated list of performance domains intersecting with the
 810         * CPUs of the rd. Protected by RCU.
 811         */
 812        struct perf_domain __rcu *pd;
 813};
 814
 815extern void init_defrootdomain(void);
 816extern int sched_init_domains(const struct cpumask *cpu_map);
 817extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 818extern void sched_get_rd(struct root_domain *rd);
 819extern void sched_put_rd(struct root_domain *rd);
 820
 821#ifdef HAVE_RT_PUSH_IPI
 822extern void rto_push_irq_work_func(struct irq_work *work);
 823#endif
 824#endif /* CONFIG_SMP */
 825
 826#ifdef CONFIG_UCLAMP_TASK
 827/*
 828 * struct uclamp_bucket - Utilization clamp bucket
 829 * @value: utilization clamp value for tasks on this clamp bucket
 830 * @tasks: number of RUNNABLE tasks on this clamp bucket
 831 *
 832 * Keep track of how many tasks are RUNNABLE for a given utilization
 833 * clamp value.
 834 */
 835struct uclamp_bucket {
 836        unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
 837        unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
 838};
 839
 840/*
 841 * struct uclamp_rq - rq's utilization clamp
 842 * @value: currently active clamp values for a rq
 843 * @bucket: utilization clamp buckets affecting a rq
 844 *
 845 * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
 846 * A clamp value is affecting a rq when there is at least one task RUNNABLE
 847 * (or actually running) with that value.
 848 *
 849 * There are up to UCLAMP_CNT possible different clamp values, currently there
 850 * are only two: minimum utilization and maximum utilization.
 851 *
 852 * All utilization clamping values are MAX aggregated, since:
 853 * - for util_min: we want to run the CPU at least at the max of the minimum
 854 *   utilization required by its currently RUNNABLE tasks.
 855 * - for util_max: we want to allow the CPU to run up to the max of the
 856 *   maximum utilization allowed by its currently RUNNABLE tasks.
 857 *
 858 * Since on each system we expect only a limited number of different
 859 * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
 860 * the metrics required to compute all the per-rq utilization clamp values.
 861 */
 862struct uclamp_rq {
 863        unsigned int value;
 864        struct uclamp_bucket bucket[UCLAMP_BUCKETS];
 865};
 866#endif /* CONFIG_UCLAMP_TASK */
 867
 868/*
 869 * This is the main, per-CPU runqueue data structure.
 870 *
 871 * Locking rule: those places that want to lock multiple runqueues
 872 * (such as the load balancing or the thread migration code), lock
 873 * acquire operations must be ordered by ascending &runqueue.
 874 */
 875struct rq {
 876        /* runqueue lock: */
 877        raw_spinlock_t          lock;
 878
 879        /*
 880         * nr_running and cpu_load should be in the same cacheline because
 881         * remote CPUs use both these fields when doing load calculation.
 882         */
 883        unsigned int            nr_running;
 884#ifdef CONFIG_NUMA_BALANCING
 885        unsigned int            nr_numa_running;
 886        unsigned int            nr_preferred_running;
 887        unsigned int            numa_migrate_on;
 888#endif
 889#ifdef CONFIG_NO_HZ_COMMON
 890#ifdef CONFIG_SMP
 891        unsigned long           last_blocked_load_update_tick;
 892        unsigned int            has_blocked_load;
 893#endif /* CONFIG_SMP */
 894        unsigned int            nohz_tick_stopped;
 895        atomic_t nohz_flags;
 896#endif /* CONFIG_NO_HZ_COMMON */
 897
 898        unsigned long           nr_load_updates;
 899        u64                     nr_switches;
 900
 901#ifdef CONFIG_UCLAMP_TASK
 902        /* Utilization clamp values based on CPU's RUNNABLE tasks */
 903        struct uclamp_rq        uclamp[UCLAMP_CNT] ____cacheline_aligned;
 904        unsigned int            uclamp_flags;
 905#define UCLAMP_FLAG_IDLE 0x01
 906#endif
 907
 908        struct cfs_rq           cfs;
 909        struct rt_rq            rt;
 910        struct dl_rq            dl;
 911
 912#ifdef CONFIG_FAIR_GROUP_SCHED
 913        /* list of leaf cfs_rq on this CPU: */
 914        struct list_head        leaf_cfs_rq_list;
 915        struct list_head        *tmp_alone_branch;
 916#endif /* CONFIG_FAIR_GROUP_SCHED */
 917
 918        /*
 919         * This is part of a global counter where only the total sum
 920         * over all CPUs matters. A task can increase this counter on
 921         * one CPU and if it got migrated afterwards it may decrease
 922         * it on another CPU. Always updated under the runqueue lock:
 923         */
 924        unsigned long           nr_uninterruptible;
 925
 926        struct task_struct __rcu        *curr;
 927        struct task_struct      *idle;
 928        struct task_struct      *stop;
 929        unsigned long           next_balance;
 930        struct mm_struct        *prev_mm;
 931
 932        unsigned int            clock_update_flags;
 933        u64                     clock;
 934        /* Ensure that all clocks are in the same cache line */
 935        u64                     clock_task ____cacheline_aligned;
 936        u64                     clock_pelt;
 937        unsigned long           lost_idle_time;
 938
 939        atomic_t                nr_iowait;
 940
 941#ifdef CONFIG_MEMBARRIER
 942        int membarrier_state;
 943#endif
 944
 945#ifdef CONFIG_SMP
 946        struct root_domain              *rd;
 947        struct sched_domain __rcu       *sd;
 948
 949        unsigned long           cpu_capacity;
 950        unsigned long           cpu_capacity_orig;
 951
 952        struct callback_head    *balance_callback;
 953
 954        unsigned char           idle_balance;
 955
 956        unsigned long           misfit_task_load;
 957
 958        /* For active balancing */
 959        int                     active_balance;
 960        int                     push_cpu;
 961        struct cpu_stop_work    active_balance_work;
 962
 963        /* CPU of this runqueue: */
 964        int                     cpu;
 965        int                     online;
 966
 967        struct list_head cfs_tasks;
 968
 969        struct sched_avg        avg_rt;
 970        struct sched_avg        avg_dl;
 971#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 972        struct sched_avg        avg_irq;
 973#endif
 974#ifdef CONFIG_SCHED_THERMAL_PRESSURE
 975        struct sched_avg        avg_thermal;
 976#endif
 977        u64                     idle_stamp;
 978        u64                     avg_idle;
 979
 980        /* This is used to determine avg_idle's max value */
 981        u64                     max_idle_balance_cost;
 982#endif
 983
 984#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 985        u64                     prev_irq_time;
 986#endif
 987#ifdef CONFIG_PARAVIRT
 988        u64                     prev_steal_time;
 989#endif
 990#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 991        u64                     prev_steal_time_rq;
 992#endif
 993
 994        /* calc_load related fields */
 995        unsigned long           calc_load_update;
 996        long                    calc_load_active;
 997
 998#ifdef CONFIG_SCHED_HRTICK
 999#ifdef CONFIG_SMP
1000        call_single_data_t      hrtick_csd;
1001#endif
1002        struct hrtimer          hrtick_timer;
1003#endif
1004
1005#ifdef CONFIG_SCHEDSTATS
1006        /* latency stats */
1007        struct sched_info       rq_sched_info;
1008        unsigned long long      rq_cpu_time;
1009        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
1010
1011        /* sys_sched_yield() stats */
1012        unsigned int            yld_count;
1013
1014        /* schedule() stats */
1015        unsigned int            sched_count;
1016        unsigned int            sched_goidle;
1017
1018        /* try_to_wake_up() stats */
1019        unsigned int            ttwu_count;
1020        unsigned int            ttwu_local;
1021#endif
1022
1023#ifdef CONFIG_SMP
1024        struct llist_head       wake_list;
1025#endif
1026
1027#ifdef CONFIG_CPU_IDLE
1028        /* Must be inspected within a rcu lock section */
1029        struct cpuidle_state    *idle_state;
1030#endif
1031};
1032
1033#ifdef CONFIG_FAIR_GROUP_SCHED
1034
1035/* CPU runqueue to which this cfs_rq is attached */
1036static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
1037{
1038        return cfs_rq->rq;
1039}
1040
1041#else
1042
1043static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
1044{
1045        return container_of(cfs_rq, struct rq, cfs);
1046}
1047#endif
1048
1049static inline int cpu_of(struct rq *rq)
1050{
1051#ifdef CONFIG_SMP
1052        return rq->cpu;
1053#else
1054        return 0;
1055#endif
1056}
1057
1058
1059#ifdef CONFIG_SCHED_SMT
1060extern void __update_idle_core(struct rq *rq);
1061
1062static inline void update_idle_core(struct rq *rq)
1063{
1064        if (static_branch_unlikely(&sched_smt_present))
1065                __update_idle_core(rq);
1066}
1067
1068#else
1069static inline void update_idle_core(struct rq *rq) { }
1070#endif
1071
1072DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1073
1074#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
1075#define this_rq()               this_cpu_ptr(&runqueues)
1076#define task_rq(p)              cpu_rq(task_cpu(p))
1077#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
1078#define raw_rq()                raw_cpu_ptr(&runqueues)
1079
1080extern void update_rq_clock(struct rq *rq);
1081
1082static inline u64 __rq_clock_broken(struct rq *rq)
1083{
1084        return READ_ONCE(rq->clock);
1085}
1086
1087/*
1088 * rq::clock_update_flags bits
1089 *
1090 * %RQCF_REQ_SKIP - will request skipping of clock update on the next
1091 *  call to __schedule(). This is an optimisation to avoid
1092 *  neighbouring rq clock updates.
1093 *
1094 * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
1095 *  in effect and calls to update_rq_clock() are being ignored.
1096 *
1097 * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
1098 *  made to update_rq_clock() since the last time rq::lock was pinned.
1099 *
1100 * If inside of __schedule(), clock_update_flags will have been
1101 * shifted left (a left shift is a cheap operation for the fast path
1102 * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
1103 *
1104 *      if (rq-clock_update_flags >= RQCF_UPDATED)
1105 *
1106 * to check if %RQCF_UPADTED is set. It'll never be shifted more than
1107 * one position though, because the next rq_unpin_lock() will shift it
1108 * back.
1109 */
1110#define RQCF_REQ_SKIP           0x01
1111#define RQCF_ACT_SKIP           0x02
1112#define RQCF_UPDATED            0x04
1113
1114static inline void assert_clock_updated(struct rq *rq)
1115{
1116        /*
1117         * The only reason for not seeing a clock update since the
1118         * last rq_pin_lock() is if we're currently skipping updates.
1119         */
1120        SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
1121}
1122
1123static inline u64 rq_clock(struct rq *rq)
1124{
1125        lockdep_assert_held(&rq->lock);
1126        assert_clock_updated(rq);
1127
1128        return rq->clock;
1129}
1130
1131static inline u64 rq_clock_task(struct rq *rq)
1132{
1133        lockdep_assert_held(&rq->lock);
1134        assert_clock_updated(rq);
1135
1136        return rq->clock_task;
1137}
1138
1139/**
1140 * By default the decay is the default pelt decay period.
1141 * The decay shift can change the decay period in
1142 * multiples of 32.
1143 *  Decay shift         Decay period(ms)
1144 *      0                       32
1145 *      1                       64
1146 *      2                       128
1147 *      3                       256
1148 *      4                       512
1149 */
1150extern int sched_thermal_decay_shift;
1151
1152static inline u64 rq_clock_thermal(struct rq *rq)
1153{
1154        return rq_clock_task(rq) >> sched_thermal_decay_shift;
1155}
1156
1157static inline void rq_clock_skip_update(struct rq *rq)
1158{
1159        lockdep_assert_held(&rq->lock);
1160        rq->clock_update_flags |= RQCF_REQ_SKIP;
1161}
1162
1163/*
1164 * See rt task throttling, which is the only time a skip
1165 * request is cancelled.
1166 */
1167static inline void rq_clock_cancel_skipupdate(struct rq *rq)
1168{
1169        lockdep_assert_held(&rq->lock);
1170        rq->clock_update_flags &= ~RQCF_REQ_SKIP;
1171}
1172
1173struct rq_flags {
1174        unsigned long flags;
1175        struct pin_cookie cookie;
1176#ifdef CONFIG_SCHED_DEBUG
1177        /*
1178         * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
1179         * current pin context is stashed here in case it needs to be
1180         * restored in rq_repin_lock().
1181         */
1182        unsigned int clock_update_flags;
1183#endif
1184};
1185
1186static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
1187{
1188        rf->cookie = lockdep_pin_lock(&rq->lock);
1189
1190#ifdef CONFIG_SCHED_DEBUG
1191        rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
1192        rf->clock_update_flags = 0;
1193#endif
1194}
1195
1196static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
1197{
1198#ifdef CONFIG_SCHED_DEBUG
1199        if (rq->clock_update_flags > RQCF_ACT_SKIP)
1200                rf->clock_update_flags = RQCF_UPDATED;
1201#endif
1202
1203        lockdep_unpin_lock(&rq->lock, rf->cookie);
1204}
1205
1206static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
1207{
1208        lockdep_repin_lock(&rq->lock, rf->cookie);
1209
1210#ifdef CONFIG_SCHED_DEBUG
1211        /*
1212         * Restore the value we stashed in @rf for this pin context.
1213         */
1214        rq->clock_update_flags |= rf->clock_update_flags;
1215#endif
1216}
1217
1218struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1219        __acquires(rq->lock);
1220
1221struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1222        __acquires(p->pi_lock)
1223        __acquires(rq->lock);
1224
1225static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
1226        __releases(rq->lock)
1227{
1228        rq_unpin_lock(rq, rf);
1229        raw_spin_unlock(&rq->lock);
1230}
1231
1232static inline void
1233task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1234        __releases(rq->lock)
1235        __releases(p->pi_lock)
1236{
1237        rq_unpin_lock(rq, rf);
1238        raw_spin_unlock(&rq->lock);
1239        raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1240}
1241
1242static inline void
1243rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1244        __acquires(rq->lock)
1245{
1246        raw_spin_lock_irqsave(&rq->lock, rf->flags);
1247        rq_pin_lock(rq, rf);
1248}
1249
1250static inline void
1251rq_lock_irq(struct rq *rq, struct rq_flags *rf)
1252        __acquires(rq->lock)
1253{
1254        raw_spin_lock_irq(&rq->lock);
1255        rq_pin_lock(rq, rf);
1256}
1257
1258static inline void
1259rq_lock(struct rq *rq, struct rq_flags *rf)
1260        __acquires(rq->lock)
1261{
1262        raw_spin_lock(&rq->lock);
1263        rq_pin_lock(rq, rf);
1264}
1265
1266static inline void
1267rq_relock(struct rq *rq, struct rq_flags *rf)
1268        __acquires(rq->lock)
1269{
1270        raw_spin_lock(&rq->lock);
1271        rq_repin_lock(rq, rf);
1272}
1273
1274static inline void
1275rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1276        __releases(rq->lock)
1277{
1278        rq_unpin_lock(rq, rf);
1279        raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1280}
1281
1282static inline void
1283rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
1284        __releases(rq->lock)
1285{
1286        rq_unpin_lock(rq, rf);
1287        raw_spin_unlock_irq(&rq->lock);
1288}
1289
1290static inline void
1291rq_unlock(struct rq *rq, struct rq_flags *rf)
1292        __releases(rq->lock)
1293{
1294        rq_unpin_lock(rq, rf);
1295        raw_spin_unlock(&rq->lock);
1296}
1297
1298static inline struct rq *
1299this_rq_lock_irq(struct rq_flags *rf)
1300        __acquires(rq->lock)
1301{
1302        struct rq *rq;
1303
1304        local_irq_disable();
1305        rq = this_rq();
1306        rq_lock(rq, rf);
1307        return rq;
1308}
1309
1310#ifdef CONFIG_NUMA
1311enum numa_topology_type {
1312        NUMA_DIRECT,
1313        NUMA_GLUELESS_MESH,
1314        NUMA_BACKPLANE,
1315};
1316extern enum numa_topology_type sched_numa_topology_type;
1317extern int sched_max_numa_distance;
1318extern bool find_numa_distance(int distance);
1319extern void sched_init_numa(void);
1320extern void sched_domains_numa_masks_set(unsigned int cpu);
1321extern void sched_domains_numa_masks_clear(unsigned int cpu);
1322extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
1323#else
1324static inline void sched_init_numa(void) { }
1325static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
1326static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
1327static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1328{
1329        return nr_cpu_ids;
1330}
1331#endif
1332
1333#ifdef CONFIG_NUMA_BALANCING
1334/* The regions in numa_faults array from task_struct */
1335enum numa_faults_stats {
1336        NUMA_MEM = 0,
1337        NUMA_CPU,
1338        NUMA_MEMBUF,
1339        NUMA_CPUBUF
1340};
1341extern void sched_setnuma(struct task_struct *p, int node);
1342extern int migrate_task_to(struct task_struct *p, int cpu);
1343extern int migrate_swap(struct task_struct *p, struct task_struct *t,
1344                        int cpu, int scpu);
1345extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
1346#else
1347static inline void
1348init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1349{
1350}
1351#endif /* CONFIG_NUMA_BALANCING */
1352
1353#ifdef CONFIG_SMP
1354
1355static inline void
1356queue_balance_callback(struct rq *rq,
1357                       struct callback_head *head,
1358                       void (*func)(struct rq *rq))
1359{
1360        lockdep_assert_held(&rq->lock);
1361
1362        if (unlikely(head->next))
1363                return;
1364
1365        head->func = (void (*)(struct callback_head *))func;
1366        head->next = rq->balance_callback;
1367        rq->balance_callback = head;
1368}
1369
1370extern void sched_ttwu_pending(void);
1371
1372#define rcu_dereference_check_sched_domain(p) \
1373        rcu_dereference_check((p), \
1374                              lockdep_is_held(&sched_domains_mutex))
1375
1376/*
1377 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1378 * See destroy_sched_domains: call_rcu for details.
1379 *
1380 * The domain tree of any CPU may only be accessed from within
1381 * preempt-disabled sections.
1382 */
1383#define for_each_domain(cpu, __sd) \
1384        for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
1385                        __sd; __sd = __sd->parent)
1386
1387/**
1388 * highest_flag_domain - Return highest sched_domain containing flag.
1389 * @cpu:        The CPU whose highest level of sched domain is to
1390 *              be returned.
1391 * @flag:       The flag to check for the highest sched_domain
1392 *              for the given CPU.
1393 *
1394 * Returns the highest sched_domain of a CPU which contains the given flag.
1395 */
1396static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1397{
1398        struct sched_domain *sd, *hsd = NULL;
1399
1400        for_each_domain(cpu, sd) {
1401                if (!(sd->flags & flag))
1402                        break;
1403                hsd = sd;
1404        }
1405
1406        return hsd;
1407}
1408
1409static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
1410{
1411        struct sched_domain *sd;
1412
1413        for_each_domain(cpu, sd) {
1414                if (sd->flags & flag)
1415                        break;
1416        }
1417
1418        return sd;
1419}
1420
1421DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
1422DECLARE_PER_CPU(int, sd_llc_size);
1423DECLARE_PER_CPU(int, sd_llc_id);
1424DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
1425DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
1426DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
1427DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
1428extern struct static_key_false sched_asym_cpucapacity;
1429
1430struct sched_group_capacity {
1431        atomic_t                ref;
1432        /*
1433         * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
1434         * for a single CPU.
1435         */
1436        unsigned long           capacity;
1437        unsigned long           min_capacity;           /* Min per-CPU capacity in group */
1438        unsigned long           max_capacity;           /* Max per-CPU capacity in group */
1439        unsigned long           next_update;
1440        int                     imbalance;              /* XXX unrelated to capacity but shared group state */
1441
1442#ifdef CONFIG_SCHED_DEBUG
1443        int                     id;
1444#endif
1445
1446        unsigned long           cpumask[0];             /* Balance mask */
1447};
1448
1449struct sched_group {
1450        struct sched_group      *next;                  /* Must be a circular list */
1451        atomic_t                ref;
1452
1453        unsigned int            group_weight;
1454        struct sched_group_capacity *sgc;
1455        int                     asym_prefer_cpu;        /* CPU of highest priority in group */
1456
1457        /*
1458         * The CPUs this group covers.
1459         *
1460         * NOTE: this field is variable length. (Allocated dynamically
1461         * by attaching extra space to the end of the structure,
1462         * depending on how many CPUs the kernel has booted up with)
1463         */
1464        unsigned long           cpumask[0];
1465};
1466
1467static inline struct cpumask *sched_group_span(struct sched_group *sg)
1468{
1469        return to_cpumask(sg->cpumask);
1470}
1471
1472/*
1473 * See build_balance_mask().
1474 */
1475static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1476{
1477        return to_cpumask(sg->sgc->cpumask);
1478}
1479
1480/**
1481 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1482 * @group: The group whose first CPU is to be returned.
1483 */
1484static inline unsigned int group_first_cpu(struct sched_group *group)
1485{
1486        return cpumask_first(sched_group_span(group));
1487}
1488
1489extern int group_balance_cpu(struct sched_group *sg);
1490
1491#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
1492void register_sched_domain_sysctl(void);
1493void dirty_sched_domain_sysctl(int cpu);
1494void unregister_sched_domain_sysctl(void);
1495#else
1496static inline void register_sched_domain_sysctl(void)
1497{
1498}
1499static inline void dirty_sched_domain_sysctl(int cpu)
1500{
1501}
1502static inline void unregister_sched_domain_sysctl(void)
1503{
1504}
1505#endif
1506
1507extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
1508
1509#else
1510
1511static inline void sched_ttwu_pending(void) { }
1512
1513static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
1514
1515#endif /* CONFIG_SMP */
1516
1517#include "stats.h"
1518#include "autogroup.h"
1519
1520#ifdef CONFIG_CGROUP_SCHED
1521
1522/*
1523 * Return the group to which this tasks belongs.
1524 *
1525 * We cannot use task_css() and friends because the cgroup subsystem
1526 * changes that value before the cgroup_subsys::attach() method is called,
1527 * therefore we cannot pin it and might observe the wrong value.
1528 *
1529 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
1530 * core changes this before calling sched_move_task().
1531 *
1532 * Instead we use a 'copy' which is updated from sched_move_task() while
1533 * holding both task_struct::pi_lock and rq::lock.
1534 */
1535static inline struct task_group *task_group(struct task_struct *p)
1536{
1537        return p->sched_task_group;
1538}
1539
1540/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
1541static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
1542{
1543#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
1544        struct task_group *tg = task_group(p);
1545#endif
1546
1547#ifdef CONFIG_FAIR_GROUP_SCHED
1548        set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
1549        p->se.cfs_rq = tg->cfs_rq[cpu];
1550        p->se.parent = tg->se[cpu];
1551#endif
1552
1553#ifdef CONFIG_RT_GROUP_SCHED
1554        p->rt.rt_rq  = tg->rt_rq[cpu];
1555        p->rt.parent = tg->rt_se[cpu];
1556#endif
1557}
1558
1559#else /* CONFIG_CGROUP_SCHED */
1560
1561static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
1562static inline struct task_group *task_group(struct task_struct *p)
1563{
1564        return NULL;
1565}
1566
1567#endif /* CONFIG_CGROUP_SCHED */
1568
1569static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1570{
1571        set_task_rq(p, cpu);
1572#ifdef CONFIG_SMP
1573        /*
1574         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1575         * successfully executed on another CPU. We must ensure that updates of
1576         * per-task data have been completed by this moment.
1577         */
1578        smp_wmb();
1579#ifdef CONFIG_THREAD_INFO_IN_TASK
1580        WRITE_ONCE(p->cpu, cpu);
1581#else
1582        WRITE_ONCE(task_thread_info(p)->cpu, cpu);
1583#endif
1584        p->wake_cpu = cpu;
1585#endif
1586}
1587
1588/*
1589 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
1590 */
1591#ifdef CONFIG_SCHED_DEBUG
1592# include <linux/static_key.h>
1593# define const_debug __read_mostly
1594#else
1595# define const_debug const
1596#endif
1597
1598#define SCHED_FEAT(name, enabled)       \
1599        __SCHED_FEAT_##name ,
1600
1601enum {
1602#include "features.h"
1603        __SCHED_FEAT_NR,
1604};
1605
1606#undef SCHED_FEAT
1607
1608#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
1609
1610/*
1611 * To support run-time toggling of sched features, all the translation units
1612 * (but core.c) reference the sysctl_sched_features defined in core.c.
1613 */
1614extern const_debug unsigned int sysctl_sched_features;
1615
1616#define SCHED_FEAT(name, enabled)                                       \
1617static __always_inline bool static_branch_##name(struct static_key *key) \
1618{                                                                       \
1619        return static_key_##enabled(key);                               \
1620}
1621
1622#include "features.h"
1623#undef SCHED_FEAT
1624
1625extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
1626#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
1627
1628#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
1629
1630/*
1631 * Each translation unit has its own copy of sysctl_sched_features to allow
1632 * constants propagation at compile time and compiler optimization based on
1633 * features default.
1634 */
1635#define SCHED_FEAT(name, enabled)       \
1636        (1UL << __SCHED_FEAT_##name) * enabled |
1637static const_debug __maybe_unused unsigned int sysctl_sched_features =
1638#include "features.h"
1639        0;
1640#undef SCHED_FEAT
1641
1642#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1643
1644#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
1645
1646extern struct static_key_false sched_numa_balancing;
1647extern struct static_key_false sched_schedstats;
1648
1649static inline u64 global_rt_period(void)
1650{
1651        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1652}
1653
1654static inline u64 global_rt_runtime(void)
1655{
1656        if (sysctl_sched_rt_runtime < 0)
1657                return RUNTIME_INF;
1658
1659        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1660}
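
/*
 * Usage sketch (illustrative, 'bw' is a hypothetical local): callers must
 * treat RUNTIME_INF as "unlimited" before doing arithmetic with the runtime,
 * e.g. when turning the global limits into a fixed-point ratio:
 *
 *	u64 runtime = global_rt_runtime();
 *	u64 bw = RUNTIME_INF;
 *
 *	if (runtime != RUNTIME_INF)
 *		bw = to_ratio(global_rt_period(), runtime);
 *
 * With the default sysctls (period 1s, runtime 0.95s) this caps RT/DL
 * bandwidth at 95% of each CPU.
 */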
1661
1662static inline int task_current(struct rq *rq, struct task_struct *p)
1663{
1664        return rq->curr == p;
1665}
1666
1667static inline int task_running(struct rq *rq, struct task_struct *p)
1668{
1669#ifdef CONFIG_SMP
1670        return p->on_cpu;
1671#else
1672        return task_current(rq, p);
1673#endif
1674}
1675
1676static inline int task_on_rq_queued(struct task_struct *p)
1677{
1678        return p->on_rq == TASK_ON_RQ_QUEUED;
1679}
1680
1681static inline int task_on_rq_migrating(struct task_struct *p)
1682{
1683        return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
1684}
1685
1686/*
1687 * wake flags
1688 */
1689#define WF_SYNC                 0x01            /* Waker goes to sleep after wakeup */
1690#define WF_FORK                 0x02            /* Child wakeup after fork */
1691#define WF_MIGRATED             0x04            /* Internal use, task got migrated */
1692
1693/*
1694 * To avoid subverting "niceness" when tasks with unusual "nice" values are
1695 * distributed unevenly across CPUs, the contribution that each task makes to
1696 * its run queue's load is weighted according to its scheduling class and
1697 * "nice" value. For SCHED_NORMAL tasks this is just a scaled version of the
1698 * new time slice allocation that they receive on time slice expiry and
1699 * similar events.
1700 */
1701
1702#define WEIGHT_IDLEPRIO         3
1703#define WMULT_IDLEPRIO          1431655765
1704
1705extern const int                sched_prio_to_weight[40];
1706extern const u32                sched_prio_to_wmult[40];
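
/*
 * Worked example (values from sched_prio_to_weight[] in core.c): nice 0 maps
 * to a weight of 1024 and each nice level is roughly a 1.25x step, so nice 19
 * maps to 15 and nice -20 to 88761. Two runnable SCHED_NORMAL tasks at nice 0
 * and nice 5 (weight 335) therefore split CPU time roughly as:
 *
 *	1024 / (1024 + 335) ~= 75%  vs.  335 / (1024 + 335) ~= 25%
 *
 * sched_prio_to_wmult[] caches 2^32 / weight for the same entries, which is
 * also why WMULT_IDLEPRIO == 1431655765 ~= 2^32 / WEIGHT_IDLEPRIO.
 */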
1707
1708/*
1709 * {de,en}queue flags:
1710 *
1711 * DEQUEUE_SLEEP  - task is no longer runnable
1712 * ENQUEUE_WAKEUP - task just became runnable
1713 *
1714 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
1715 *                are in a known state which allows modification. Such pairs
1716 *                should preserve as much state as possible.
1717 *
1718 * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
1719 *        in the runqueue.
1720 *
1721 * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
1722 * ENQUEUE_REPLENISH - CBS (Constant Bandwidth Server): replenish runtime and postpone deadline
1723 * ENQUEUE_MIGRATED  - the task was migrated during wakeup
1724 *
1725 */
1726
1727#define DEQUEUE_SLEEP           0x01
1728#define DEQUEUE_SAVE            0x02 /* Matches ENQUEUE_RESTORE */
1729#define DEQUEUE_MOVE            0x04 /* Matches ENQUEUE_MOVE */
1730#define DEQUEUE_NOCLOCK         0x08 /* Matches ENQUEUE_NOCLOCK */
1731
1732#define ENQUEUE_WAKEUP          0x01
1733#define ENQUEUE_RESTORE         0x02
1734#define ENQUEUE_MOVE            0x04
1735#define ENQUEUE_NOCLOCK         0x08
1736
1737#define ENQUEUE_HEAD            0x10
1738#define ENQUEUE_REPLENISH       0x20
1739#ifdef CONFIG_SMP
1740#define ENQUEUE_MIGRATED        0x40
1741#else
1742#define ENQUEUE_MIGRATED        0x00
1743#endif
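
/*
 * Pairing sketch (illustrative): code changing a property that affects where
 * a queued task belongs (priority, group, ...) typically brackets the change
 * with a matched dequeue/enqueue, preserving as much state as it can:
 *
 *	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 *
 *	dequeue_task(rq, p, queue_flags);
 *	... modify p ...
 *	enqueue_task(rq, p, queue_flags);
 *
 * The numeric values are chosen so the DEQUEUE_ and ENQUEUE_ SAVE/RESTORE,
 * MOVE and NOCLOCK pairs match bit for bit, which is what allows reusing the
 * same flags value for both calls.
 */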
1744
1745#define RETRY_TASK              ((void *)-1UL)
1746
1747struct sched_class {
1748        const struct sched_class *next;
1749
1750#ifdef CONFIG_UCLAMP_TASK
1751        int uclamp_enabled;
1752#endif
1753
1754        void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1755        void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1756        void (*yield_task)   (struct rq *rq);
1757        bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
1758
1759        void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1760
1761        struct task_struct *(*pick_next_task)(struct rq *rq);
1762
1763        void (*put_prev_task)(struct rq *rq, struct task_struct *p);
1764        void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
1765
1766#ifdef CONFIG_SMP
1767        int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
1768        int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1769        void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
1770
1771        void (*task_woken)(struct rq *this_rq, struct task_struct *task);
1772
1773        void (*set_cpus_allowed)(struct task_struct *p,
1774                                 const struct cpumask *newmask);
1775
1776        void (*rq_online)(struct rq *rq);
1777        void (*rq_offline)(struct rq *rq);
1778#endif
1779
1780        void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1781        void (*task_fork)(struct task_struct *p);
1782        void (*task_dead)(struct task_struct *p);
1783
1784        /*
1785         * The switched_from() call is allowed to drop rq->lock, therefore we
1786         * cannot assume the switched_from/switched_to pair is serialized by
1787         * rq->lock. They are however serialized by p->pi_lock.
1788         */
1789        void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1790        void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
1791        void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1792                              int oldprio);
1793
1794        unsigned int (*get_rr_interval)(struct rq *rq,
1795                                        struct task_struct *task);
1796
1797        void (*update_curr)(struct rq *rq);
1798
1799#define TASK_SET_GROUP          0
1800#define TASK_MOVE_GROUP         1
1801
1802#ifdef CONFIG_FAIR_GROUP_SCHED
1803        void (*task_change_group)(struct task_struct *p, int type);
1804#endif
1805};
1806
1807static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1808{
1809        WARN_ON_ONCE(rq->curr != prev);
1810        prev->sched_class->put_prev_task(rq, prev);
1811}
1812
1813static inline void set_next_task(struct rq *rq, struct task_struct *next)
1814{
1815        WARN_ON_ONCE(rq->curr != next);
1816        next->sched_class->set_next_task(rq, next, false);
1817}
1818
1819#ifdef CONFIG_SMP
1820#define sched_class_highest (&stop_sched_class)
1821#else
1822#define sched_class_highest (&dl_sched_class)
1823#endif
1824
1825#define for_class_range(class, _from, _to) \
1826        for (class = (_from); class != (_to); class = class->next)
1827
1828#define for_each_class(class) \
1829        for_class_range(class, sched_class_highest, NULL)
1830
1831extern const struct sched_class stop_sched_class;
1832extern const struct sched_class dl_sched_class;
1833extern const struct sched_class rt_sched_class;
1834extern const struct sched_class fair_sched_class;
1835extern const struct sched_class idle_sched_class;
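
/*
 * Usage sketch (simplified from the core pick path): the class objects above
 * are chained via ->next in priority order, so the core scheduler can walk
 * them with for_each_class() until one of them offers a runnable task:
 *
 *	const struct sched_class *class;
 *	struct task_struct *p;
 *
 *	for_each_class(class) {
 *		p = class->pick_next_task(rq);
 *		if (p)
 *			return p;
 *	}
 *
 * The idle class always has a task to return, so the loop is expected to
 * terminate before falling off the end of the list.
 */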
1836
1837static inline bool sched_stop_runnable(struct rq *rq)
1838{
1839        return rq->stop && task_on_rq_queued(rq->stop);
1840}
1841
1842static inline bool sched_dl_runnable(struct rq *rq)
1843{
1844        return rq->dl.dl_nr_running > 0;
1845}
1846
1847static inline bool sched_rt_runnable(struct rq *rq)
1848{
1849        return rq->rt.rt_queued > 0;
1850}
1851
1852static inline bool sched_fair_runnable(struct rq *rq)
1853{
1854        return rq->cfs.nr_running > 0;
1855}
1856
1857extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
1858extern struct task_struct *pick_next_task_idle(struct rq *rq);
1859
1860#ifdef CONFIG_SMP
1861
1862extern void update_group_capacity(struct sched_domain *sd, int cpu);
1863
1864extern void trigger_load_balance(struct rq *rq);
1865
1866extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
1867
1868#endif
1869
1870#ifdef CONFIG_CPU_IDLE
1871static inline void idle_set_state(struct rq *rq,
1872                                  struct cpuidle_state *idle_state)
1873{
1874        rq->idle_state = idle_state;
1875}
1876
1877static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1878{
1879        SCHED_WARN_ON(!rcu_read_lock_held());
1880
1881        return rq->idle_state;
1882}
1883#else
1884static inline void idle_set_state(struct rq *rq,
1885                                  struct cpuidle_state *idle_state)
1886{
1887}
1888
1889static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1890{
1891        return NULL;
1892}
1893#endif
1894
1895extern void schedule_idle(void);
1896
1897extern void sysrq_sched_debug_show(void);
1898extern void sched_init_granularity(void);
1899extern void update_max_interval(void);
1900
1901extern void init_sched_dl_class(void);
1902extern void init_sched_rt_class(void);
1903extern void init_sched_fair_class(void);
1904
1905extern void reweight_task(struct task_struct *p, int prio);
1906
1907extern void resched_curr(struct rq *rq);
1908extern void resched_cpu(int cpu);
1909
1910extern struct rt_bandwidth def_rt_bandwidth;
1911extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1912
1913extern struct dl_bandwidth def_dl_bandwidth;
1914extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1915extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1916extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1917
1918#define BW_SHIFT                20
1919#define BW_UNIT                 (1 << BW_SHIFT)
1920#define RATIO_SHIFT             8
1921unsigned long to_ratio(u64 period, u64 runtime);
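
/*
 * to_ratio() turns a runtime/period pair into a BW_SHIFT fixed-point
 * fraction, roughly (runtime << BW_SHIFT) / period. Worked example with
 * illustrative numbers, runtime = 95ms and period = 100ms:
 *
 *	to_ratio(100 * NSEC_PER_MSEC, 95 * NSEC_PER_MSEC)
 *		~= 0.95 * BW_UNIT = 996147
 *
 * which is the representation used for deadline bandwidth accounting.
 */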
1922
1923extern void init_entity_runnable_average(struct sched_entity *se);
1924extern void post_init_entity_util_avg(struct task_struct *p);
1925
1926#ifdef CONFIG_NO_HZ_FULL
1927extern bool sched_can_stop_tick(struct rq *rq);
1928extern int __init sched_tick_offload_init(void);
1929
1930/*
1931 * The tick may be needed by tasks in the runqueue, depending on their policy
1932 * and requirements. If the tick is needed, send the target CPU an IPI to kick
1933 * it out of nohz mode if necessary.
1934 */
1935static inline void sched_update_tick_dependency(struct rq *rq)
1936{
1937        int cpu;
1938
1939        if (!tick_nohz_full_enabled())
1940                return;
1941
1942        cpu = cpu_of(rq);
1943
1944        if (!tick_nohz_full_cpu(cpu))
1945                return;
1946
1947        if (sched_can_stop_tick(rq))
1948                tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
1949        else
1950                tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1951}
1952#else
1953static inline int sched_tick_offload_init(void) { return 0; }
1954static inline void sched_update_tick_dependency(struct rq *rq) { }
1955#endif
1956
1957static inline void add_nr_running(struct rq *rq, unsigned count)
1958{
1959        unsigned prev_nr = rq->nr_running;
1960
1961        rq->nr_running = prev_nr + count;
1962
1963#ifdef CONFIG_SMP
1964        if (prev_nr < 2 && rq->nr_running >= 2) {
1965                if (!READ_ONCE(rq->rd->overload))
1966                        WRITE_ONCE(rq->rd->overload, 1);
1967        }
1968#endif
1969
1970        sched_update_tick_dependency(rq);
1971}
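
/*
 * The rq->rd->overload hint set above is consumed on the pull side; a newly
 * idle CPU can use it to skip a balance pass when no runqueue in the root
 * domain has more than one runnable task, roughly (sketch, the real check
 * also weighs avg_idle against the migration cost):
 *
 *	if (!READ_ONCE(this_rq->rd->overload))
 *		return;
 */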
1972
1973static inline void sub_nr_running(struct rq *rq, unsigned count)
1974{
1975        rq->nr_running -= count;
1976        /* Check if we still need preemption */
1977        sched_update_tick_dependency(rq);
1978}
1979
1980extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
1981extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
1982
1983extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
1984
1985extern const_debug unsigned int sysctl_sched_nr_migrate;
1986extern const_debug unsigned int sysctl_sched_migration_cost;
1987
1988#ifdef CONFIG_SCHED_HRTICK
1989
1990/*
1991 * Use hrtick when:
1992 *  - enabled by features
1993 *  - hrtimer is actually high res
1994 */
1995static inline int hrtick_enabled(struct rq *rq)
1996{
1997        if (!sched_feat(HRTICK))
1998                return 0;
1999        if (!cpu_active(cpu_of(rq)))
2000                return 0;
2001        return hrtimer_is_hres_active(&rq->hrtick_timer);
2002}
2003
2004void hrtick_start(struct rq *rq, u64 delay);
2005
2006#else
2007
2008static inline int hrtick_enabled(struct rq *rq)
2009{
2010        return 0;
2011}
2012
2013#endif /* CONFIG_SCHED_HRTICK */
2014
2015#ifndef arch_scale_freq_tick
2016static __always_inline
2017void arch_scale_freq_tick(void)
2018{
2019}
2020#endif
2021
2022#ifndef arch_scale_freq_capacity
2023static __always_inline
2024unsigned long arch_scale_freq_capacity(int cpu)
2025{
2026        return SCHED_CAPACITY_SCALE;
2027}
2028#endif
2029
2030#ifdef CONFIG_SMP
2031#ifdef CONFIG_PREEMPTION
2032
2033static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
2034
2035/*
2036 * fair double_lock_balance: Safely acquires both rq->locks in a fair
2037 * way at the expense of forcing extra atomic operations in all
2038 * invocations.  This assures that the double_lock is acquired using the
2039 * same underlying policy as the spinlock_t on this architecture, which
2040 * reduces latency compared to the unfair variant below.  However, it
2041 * also adds more overhead and therefore may reduce throughput.
2042 */
2043static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
2044        __releases(this_rq->lock)
2045        __acquires(busiest->lock)
2046        __acquires(this_rq->lock)
2047{
2048        raw_spin_unlock(&this_rq->lock);
2049        double_rq_lock(this_rq, busiest);
2050
2051        return 1;
2052}
2053
2054#else
2055/*
2056 * Unfair double_lock_balance: Optimizes throughput at the expense of
2057 * latency by eliminating extra atomic operations when the locks are
2058 * already in proper order on entry.  This favors lower CPU-ids and will
2059 * grant the double lock to lower CPUs over higher ids under contention,
2060 * regardless of entry order into the function.
2061 */
2062static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
2063        __releases(this_rq->lock)
2064        __acquires(busiest->lock)
2065        __acquires(this_rq->lock)
2066{
2067        int ret = 0;
2068
2069        if (unlikely(!raw_spin_trylock(&busiest->lock))) {
2070                if (busiest < this_rq) {
2071                        raw_spin_unlock(&this_rq->lock);
2072                        raw_spin_lock(&busiest->lock);
2073                        raw_spin_lock_nested(&this_rq->lock,
2074                                              SINGLE_DEPTH_NESTING);
2075                        ret = 1;
2076                } else
2077                        raw_spin_lock_nested(&busiest->lock,
2078                                              SINGLE_DEPTH_NESTING);
2079        }
2080        return ret;
2081}
2082
2083#endif /* CONFIG_PREEMPTION */
2084
2085/*
2086 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2087 */
2088static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2089{
2090        if (unlikely(!irqs_disabled())) {
2091                /* printk() doesn't work well under rq->lock */
2092                raw_spin_unlock(&this_rq->lock);
2093                BUG_ON(1);
2094        }
2095
2096        return _double_lock_balance(this_rq, busiest);
2097}
2098
2099static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2100        __releases(busiest->lock)
2101{
2102        raw_spin_unlock(&busiest->lock);
2103        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2104}
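
/*
 * Usage sketch: a pull-side caller already holds this_rq->lock and wants
 * busiest->lock as well. Because _double_lock_balance() may drop and re-take
 * this_rq->lock to preserve lock ordering, a non-zero return value means any
 * state read under this_rq->lock must be revalidated:
 *
 *	if (double_lock_balance(this_rq, busiest))
 *		revalidate this_rq/task state here;
 *	... pull tasks from busiest ...
 *	double_unlock_balance(this_rq, busiest);
 */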
2105
2106static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
2107{
2108        if (l1 > l2)
2109                swap(l1, l2);
2110
2111        spin_lock(l1);
2112        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
2113}
2114
2115static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
2116{
2117        if (l1 > l2)
2118                swap(l1, l2);
2119
2120        spin_lock_irq(l1);
2121        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
2122}
2123
2124static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
2125{
2126        if (l1 > l2)
2127                swap(l1, l2);
2128
2129        raw_spin_lock(l1);
2130        raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
2131}
2132
2133/*
2134 * double_rq_lock - safely lock two runqueues
2135 *
2136 * Note this does not disable interrupts like task_rq_lock,
2137 * you need to do so manually before calling.
2138 */
2139static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
2140        __acquires(rq1->lock)
2141        __acquires(rq2->lock)
2142{
2143        BUG_ON(!irqs_disabled());
2144        if (rq1 == rq2) {
2145                raw_spin_lock(&rq1->lock);
2146                __acquire(rq2->lock);   /* Fake it out ;) */
2147        } else {
2148                if (rq1 < rq2) {
2149                        raw_spin_lock(&rq1->lock);
2150                        raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2151                } else {
2152                        raw_spin_lock(&rq2->lock);
2153                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2154                }
2155        }
2156}
2157
2158/*
2159 * double_rq_unlock - safely unlock two runqueues
2160 *
2161 * Note this does not restore interrupts like task_rq_unlock,
2162 * you need to do so manually after calling.
2163 */
2164static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2165        __releases(rq1->lock)
2166        __releases(rq2->lock)
2167{
2168        raw_spin_unlock(&rq1->lock);
2169        if (rq1 != rq2)
2170                raw_spin_unlock(&rq2->lock);
2171        else
2172                __release(rq2->lock);
2173}
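
/*
 * Usage sketch, per the comments above: interrupts must be disabled by the
 * caller around the lock/unlock pair, e.g.:
 *
 *	unsigned long flags;
 *
 *	local_irq_save(flags);
 *	double_rq_lock(rq1, rq2);
 *	... operate on both runqueues ...
 *	double_rq_unlock(rq1, rq2);
 *	local_irq_restore(flags);
 */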
2174
2175extern void set_rq_online (struct rq *rq);
2176extern void set_rq_offline(struct rq *rq);
2177extern bool sched_smp_initialized;
2178
2179#else /* CONFIG_SMP */
2180
2181/*
2182 * double_rq_lock - safely lock two runqueues
2183 *
2184 * Note this does not disable interrupts like task_rq_lock,
2185 * you need to do so manually before calling.
2186 */
2187static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
2188        __acquires(rq1->lock)
2189        __acquires(rq2->lock)
2190{
2191        BUG_ON(!irqs_disabled());
2192        BUG_ON(rq1 != rq2);
2193        raw_spin_lock(&rq1->lock);
2194        __acquire(rq2->lock);   /* Fake it out ;) */
2195}
2196
2197/*
2198 * double_rq_unlock - safely unlock two runqueues
2199 *
2200 * Note this does not restore interrupts like task_rq_unlock,
2201 * you need to do so manually after calling.
2202 */
2203static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2204        __releases(rq1->lock)
2205        __releases(rq2->lock)
2206{
2207        BUG_ON(rq1 != rq2);
2208        raw_spin_unlock(&rq1->lock);
2209        __release(rq2->lock);
2210}
2211
2212#endif
2213
2214extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
2215extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
2216
2217#ifdef  CONFIG_SCHED_DEBUG
2218extern bool sched_debug_enabled;
2219
2220extern void print_cfs_stats(struct seq_file *m, int cpu);
2221extern void print_rt_stats(struct seq_file *m, int cpu);
2222extern void print_dl_stats(struct seq_file *m, int cpu);
2223extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
2224extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2225extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
2226#ifdef CONFIG_NUMA_BALANCING
2227extern void
2228show_numa_stats(struct task_struct *p, struct seq_file *m);
2229extern void
2230print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
2231        unsigned long tpf, unsigned long gsf, unsigned long gpf);
2232#endif /* CONFIG_NUMA_BALANCING */
2233#endif /* CONFIG_SCHED_DEBUG */
2234
2235extern void init_cfs_rq(struct cfs_rq *cfs_rq);
2236extern void init_rt_rq(struct rt_rq *rt_rq);
2237extern void init_dl_rq(struct dl_rq *dl_rq);
2238
2239extern void cfs_bandwidth_usage_inc(void);
2240extern void cfs_bandwidth_usage_dec(void);
2241
2242#ifdef CONFIG_NO_HZ_COMMON
2243#define NOHZ_BALANCE_KICK_BIT   0
2244#define NOHZ_STATS_KICK_BIT     1
2245
2246#define NOHZ_BALANCE_KICK       BIT(NOHZ_BALANCE_KICK_BIT)
2247#define NOHZ_STATS_KICK         BIT(NOHZ_STATS_KICK_BIT)
2248
2249#define NOHZ_KICK_MASK  (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
2250
2251#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
2252
2253extern void nohz_balance_exit_idle(struct rq *rq);
2254#else
2255static inline void nohz_balance_exit_idle(struct rq *rq) { }
2256#endif
2257
2258
2259#ifdef CONFIG_SMP
2260static inline
2261void __dl_update(struct dl_bw *dl_b, s64 bw)
2262{
2263        struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
2264        int i;
2265
2266        RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2267                         "sched RCU must be held");
2268        for_each_cpu_and(i, rd->span, cpu_active_mask) {
2269                struct rq *rq = cpu_rq(i);
2270
2271                rq->dl.extra_bw += bw;
2272        }
2273}
2274#else
2275static inline
2276void __dl_update(struct dl_bw *dl_b, s64 bw)
2277{
2278        struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
2279
2280        dl->extra_bw += bw;
2281}
2282#endif
2283
2284
2285#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2286struct irqtime {
2287        u64                     total;
2288        u64                     tick_delta;
2289        u64                     irq_start_time;
2290        struct u64_stats_sync   sync;
2291};
2292
2293DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
2294
2295/*
2296 * Returns the irqtime minus the softirq time computed by ksoftirqd.
2297 * Otherwise ksoftirqd's own runtime would be subtracted from its
2298 * sum_exec_runtime, which would then never move forward.
2299 */
2300static inline u64 irq_time_read(int cpu)
2301{
2302        struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
2303        unsigned int seq;
2304        u64 total;
2305
2306        do {
2307                seq = __u64_stats_fetch_begin(&irqtime->sync);
2308                total = irqtime->total;
2309        } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
2310
2311        return total;
2312}
2313#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
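
/*
 * Writer-side sketch: the accounting code (cputime.c) updates these fields on
 * the local CPU under u64_stats_update_begin()/end(), roughly:
 *
 *	u64_stats_update_begin(&irqtime->sync);
 *	irqtime->total += delta;
 *	irqtime->tick_delta += delta;
 *	u64_stats_update_end(&irqtime->sync);
 *
 * which is what makes the lockless, seqcount-style read in irq_time_read()
 * above safe on 32-bit as well as 64-bit.
 */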
2314
2315#ifdef CONFIG_CPU_FREQ
2316DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
2317
2318/**
2319 * cpufreq_update_util - Take a note about CPU utilization changes.
2320 * @rq: Runqueue to carry out the update for.
2321 * @flags: Update reason flags.
2322 *
2323 * This function is called by the scheduler on the CPU whose utilization is
2324 * being updated.
2325 *
2326 * It can only be called from RCU-sched read-side critical sections.
2327 *
2328 * The way cpufreq is currently arranged requires it to evaluate the CPU
2329 * performance state (frequency/voltage) on a regular basis to prevent it from
2330 * being stuck in a completely inadequate performance level for too long.
2331 * That is not guaranteed to happen if the updates are only triggered from CFS
2332 * and DL, though, because those updates may stop coming in when RT tasks are
2333 * the only ones active for extended periods of time.
2334 *
2335 * As a workaround for that issue, this function is called periodically by the
2336 * RT sched class to trigger extra cpufreq updates and so prevent cpufreq from
2337 * stalling, but that really is a band-aid.  Going forward it should be
2338 * replaced with solutions targeted more specifically at RT tasks.
2339 */
2340static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
2341{
2342        struct update_util_data *data;
2343
2344        data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
2345                                                  cpu_of(rq)));
2346        if (data)
2347                data->func(data, rq_clock(rq), flags);
2348}
2349#else
2350static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2351#endif /* CONFIG_CPU_FREQ */
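
/*
 * Usage sketch: the scheduler calls this from paths that change the local
 * CPU's utilization, passing a flag only when there is extra information for
 * the governor (assuming the SCHED_CPUFREQ_IOWAIT flag from
 * <linux/sched/cpufreq.h>):
 *
 *	cpufreq_update_util(rq, 0);			regular util update
 *	cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);	wakeup from IO wait
 *
 * The registered callback (e.g. schedutil's) runs in scheduler context, so it
 * must not sleep.
 */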
2352
2353#ifdef CONFIG_UCLAMP_TASK
2354unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
2355
2356static __always_inline
2357unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
2358                                  struct task_struct *p)
2359{
2360        unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
2361        unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
2362
2363        if (p) {
2364                min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
2365                max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
2366        }
2367
2368        /*
2369         * Since a CPU's {min,max}_util clamps are MAX aggregated across the
2370         * RUNNABLE tasks with _different_ clamps, we can end up with an
2371         * inversion. Fix it now, when the clamps are applied.
2372         */
2373        if (unlikely(min_util >= max_util))
2374                return min_util;
2375
2376        return clamp(util, min_util, max_util);
2377}
2378#else /* CONFIG_UCLAMP_TASK */
2379static inline
2380unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
2381                                  struct task_struct *p)
2382{
2383        return util;
2384}
2385#endif /* CONFIG_UCLAMP_TASK */
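
/*
 * Usage sketch (illustrative): frequency selection clamps a raw utilization
 * value (such as the one cpu_util_cfs() below returns) before turning it into
 * a frequency request, so a boosted task raises the floor and a capped task
 * lowers the ceiling:
 *
 *	util = uclamp_rq_util_with(rq, util, NULL);
 *
 * With p == NULL only the rq-wide aggregate clamps apply; passing a task also
 * factors in that task's effective clamps, which is useful when evaluating
 * the effect of waking or placing p on this rq.
 */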
2386
2387#ifdef arch_scale_freq_capacity
2388# ifndef arch_scale_freq_invariant
2389#  define arch_scale_freq_invariant()   true
2390# endif
2391#else
2392# define arch_scale_freq_invariant()    false
2393#endif
2394
2395#ifdef CONFIG_SMP
2396static inline unsigned long capacity_orig_of(int cpu)
2397{
2398        return cpu_rq(cpu)->cpu_capacity_orig;
2399}
2400#endif
2401
2402/**
2403 * enum schedutil_type - CPU utilization type
2404 * @FREQUENCY_UTIL:     Utilization used to select frequency
2405 * @ENERGY_UTIL:        Utilization used during energy calculation
2406 *
2407 * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
2408 * need to be aggregated differently depending on the usage made of them. This
2409 * enum is used within schedutil_freq_util() to differentiate the types of
2410 * utilization expected by the callers, and adjust the aggregation accordingly.
2411 */
2412enum schedutil_type {
2413        FREQUENCY_UTIL,
2414        ENERGY_UTIL,
2415};
2416
2417#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2418
2419unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2420                                 unsigned long max, enum schedutil_type type,
2421                                 struct task_struct *p);
2422
2423static inline unsigned long cpu_bw_dl(struct rq *rq)
2424{
2425        return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
2426}
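
/*
 * Worked example for cpu_bw_dl() (illustrative numbers): running_bw is kept
 * in BW_SHIFT fixed point, so a deadline load of 50% of the CPU is stored as
 * 0.5 * BW_UNIT = 524288 and converts to capacity units as
 *
 *	(524288 * SCHED_CAPACITY_SCALE) >> BW_SHIFT == 512
 *
 * i.e. half of SCHED_CAPACITY_SCALE (1024).
 */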
2427
2428static inline unsigned long cpu_util_dl(struct rq *rq)
2429{
2430        return READ_ONCE(rq->avg_dl.util_avg);
2431}
2432
2433static inline unsigned long cpu_util_cfs(struct rq *rq)
2434{
2435        unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
2436
2437        if (sched_feat(UTIL_EST)) {
2438                util = max_t(unsigned long, util,
2439                             READ_ONCE(rq->cfs.avg.util_est.enqueued));
2440        }
2441
2442        return util;
2443}
2444
2445static inline unsigned long cpu_util_rt(struct rq *rq)
2446{
2447        return READ_ONCE(rq->avg_rt.util_avg);
2448}
2449#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2450static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2451                                 unsigned long max, enum schedutil_type type,
2452                                 struct task_struct *p)
2453{
2454        return 0;
2455}
2456#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2457
2458#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
2459static inline unsigned long cpu_util_irq(struct rq *rq)
2460{
2461        return rq->avg_irq.util_avg;
2462}
2463
2464static inline
2465unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2466{
2467        util *= (max - irq);
2468        util /= max;
2469
2470        return util;
2472}
2473#else
2474static inline unsigned long cpu_util_irq(struct rq *rq)
2475{
2476        return 0;
2477}
2478
2479static inline
2480unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2481{
2482        return util;
2483}
2484#endif
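
/*
 * Aggregation sketch (simplified from schedutil_cpu_util(), with 'max' the
 * CPU's capacity): CFS and RT utilization add up, while IRQ time is folded in
 * by first scaling the task-clock contribution down by the fraction of time
 * stolen by interrupts and then adding the IRQ average on top:
 *
 *	unsigned long irq  = cpu_util_irq(rq);
 *	unsigned long util = cpu_util_cfs(rq) + cpu_util_rt(rq);
 *
 *	util = scale_irq_capacity(util, irq, max);
 *	util += irq;
 *
 * DL is factored in as well, roughly cpu_util_dl() for ENERGY_UTIL and
 * cpu_bw_dl() for FREQUENCY_UTIL; the real code also applies uclamp and caps
 * everything at max.
 */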
2485
2486#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2487
2488#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
2489
2490DECLARE_STATIC_KEY_FALSE(sched_energy_present);
2491
2492static inline bool sched_energy_enabled(void)
2493{
2494        return static_branch_unlikely(&sched_energy_present);
2495}
2496
2497#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
2498
2499#define perf_domain_span(pd) NULL
2500static inline bool sched_energy_enabled(void) { return false; }
2501
2502#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2503
2504#ifdef CONFIG_MEMBARRIER
2505/*
2506 * The scheduler provides memory barriers required by membarrier between:
2507 * - prior user-space memory accesses and store to rq->membarrier_state,
2508 * - store to rq->membarrier_state and following user-space memory accesses.
2509 * In the same way it provides those guarantees around store to rq->curr.
2510 */
2511static inline void membarrier_switch_mm(struct rq *rq,
2512                                        struct mm_struct *prev_mm,
2513                                        struct mm_struct *next_mm)
2514{
2515        int membarrier_state;
2516
2517        if (prev_mm == next_mm)
2518                return;
2519
2520        membarrier_state = atomic_read(&next_mm->membarrier_state);
2521        if (READ_ONCE(rq->membarrier_state) == membarrier_state)
2522                return;
2523
2524        WRITE_ONCE(rq->membarrier_state, membarrier_state);
2525}
2526#else
2527static inline void membarrier_switch_mm(struct rq *rq,
2528                                        struct mm_struct *prev_mm,
2529                                        struct mm_struct *next_mm)
2530{
2531}
2532#endif
2533
2534#ifdef CONFIG_SMP
2535static inline bool is_per_cpu_kthread(struct task_struct *p)
2536{
2537        if (!(p->flags & PF_KTHREAD))
2538                return false;
2539
2540        if (p->nr_cpus_allowed != 1)
2541                return false;
2542
2543        return true;
2544}
2545#endif
2546
2547void swake_up_all_locked(struct swait_queue_head *q);
2548void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
2549