linux/kernel/sched/sched.h
   1
   2#include <linux/sched.h>
   3#include <linux/sched/sysctl.h>
   4#include <linux/sched/rt.h>
   5#include <linux/sched/deadline.h>
   6#include <linux/mutex.h>
   7#include <linux/spinlock.h>
   8#include <linux/stop_machine.h>
   9#include <linux/irq_work.h>
  10#include <linux/tick.h>
  11#include <linux/slab.h>
  12
  13#include "cpupri.h"
  14#include "cpudeadline.h"
  15#include "cpuacct.h"
  16
  17struct rq;
  18struct cpuidle_state;
  19
  20/* task_struct::on_rq states: */
  21#define TASK_ON_RQ_QUEUED       1
  22#define TASK_ON_RQ_MIGRATING    2
  23
  24extern __read_mostly int scheduler_running;
  25
  26extern unsigned long calc_load_update;
  27extern atomic_long_t calc_load_tasks;
  28
  29extern void calc_global_load_tick(struct rq *this_rq);
  30extern long calc_load_fold_active(struct rq *this_rq);
  31
  32#ifdef CONFIG_SMP
  33extern void update_cpu_load_active(struct rq *this_rq);
  34#else
  35static inline void update_cpu_load_active(struct rq *this_rq) { }
  36#endif
  37
  38/*
  39 * Helpers for converting nanosecond timing to jiffy resolution
  40 */
  41#define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
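
/*
 * Worked example (illustrative): with HZ == 250 one jiffy is
 * NSEC_PER_SEC / HZ == 4,000,000 ns, so NS_TO_JIFFIES(10000000) == 2,
 * i.e. a 10 ms interval rounds down to two jiffies.
 */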
  42
  43/*
  44 * Increase resolution of nice-level calculations for 64-bit architectures.
  45 * The extra resolution improves shares distribution and load balancing of
   46 * low-weight task groups (e.g. nice +19 on an autogroup) and deeper taskgroup
  47 * hierarchies, especially on larger systems. This is not a user-visible change
  48 * and does not change the user-interface for setting shares/weights.
  49 *
  50 * We increase resolution only if we have enough bits to allow this increased
  51 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
  52 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
  53 * increased costs.
  54 */
  55#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
  56# define SCHED_LOAD_RESOLUTION  10
  57# define scale_load(w)          ((w) << SCHED_LOAD_RESOLUTION)
  58# define scale_load_down(w)     ((w) >> SCHED_LOAD_RESOLUTION)
  59#else
  60# define SCHED_LOAD_RESOLUTION  0
  61# define scale_load(w)          (w)
  62# define scale_load_down(w)     (w)
  63#endif
  64
  65#define SCHED_LOAD_SHIFT        (10 + SCHED_LOAD_RESOLUTION)
  66#define SCHED_LOAD_SCALE        (1L << SCHED_LOAD_SHIFT)
  67
  68#define NICE_0_LOAD             SCHED_LOAD_SCALE
  69#define NICE_0_SHIFT            SCHED_LOAD_SHIFT
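
/*
 * Worked example (illustrative): with the extra resolution disabled,
 * SCHED_LOAD_SHIFT == 10, NICE_0_LOAD == 1 << 10 == 1024 and
 * scale_load()/scale_load_down() are no-ops.  If SCHED_LOAD_RESOLUTION
 * were 10 instead, a nice-0 weight would be carried internally as
 * scale_load(1024) == 1024 << 10 == 1048576 and converted back with
 * scale_load_down() before being exposed to user space.
 */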
  70
  71/*
  72 * Single value that decides SCHED_DEADLINE internal math precision.
  73 * 10 -> just above 1us
  74 * 9  -> just above 0.5us
  75 */
  76#define DL_SCALE (10)
  77
  78/*
  79 * These are the 'tuning knobs' of the scheduler:
  80 */
  81
  82/*
   83 * Single value that denotes runtime == period, i.e. unlimited time.
  84 */
  85#define RUNTIME_INF     ((u64)~0ULL)
  86
  87static inline int idle_policy(int policy)
  88{
  89        return policy == SCHED_IDLE;
  90}
  91static inline int fair_policy(int policy)
  92{
  93        return policy == SCHED_NORMAL || policy == SCHED_BATCH;
  94}
  95
  96static inline int rt_policy(int policy)
  97{
  98        return policy == SCHED_FIFO || policy == SCHED_RR;
  99}
 100
 101static inline int dl_policy(int policy)
 102{
 103        return policy == SCHED_DEADLINE;
 104}
 105static inline bool valid_policy(int policy)
 106{
 107        return idle_policy(policy) || fair_policy(policy) ||
 108                rt_policy(policy) || dl_policy(policy);
 109}
 110
 111static inline int task_has_rt_policy(struct task_struct *p)
 112{
 113        return rt_policy(p->policy);
 114}
 115
 116static inline int task_has_dl_policy(struct task_struct *p)
 117{
 118        return dl_policy(p->policy);
 119}
 120
 121/*
 122 * Tells if entity @a should preempt entity @b.
 123 */
 124static inline bool
 125dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
 126{
 127        return dl_time_before(a->deadline, b->deadline);
 128}
 129
 130/*
 131 * This is the priority-queue data structure of the RT scheduling class:
 132 */
 133struct rt_prio_array {
 134        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 135        struct list_head queue[MAX_RT_PRIO];
 136};
 137
 138struct rt_bandwidth {
 139        /* nests inside the rq lock: */
 140        raw_spinlock_t          rt_runtime_lock;
 141        ktime_t                 rt_period;
 142        u64                     rt_runtime;
 143        struct hrtimer          rt_period_timer;
 144        unsigned int            rt_period_active;
 145};
 146
 147void __dl_clear_params(struct task_struct *p);
 148
 149/*
 150 * To keep the bandwidth of -deadline tasks and groups under control
 151 * we need some place where:
 152 *  - store the maximum -deadline bandwidth of the system (the group);
 153 *  - cache the fraction of that bandwidth that is currently allocated.
 154 *
 155 * This is all done in the data structure below. It is similar to the
 156 * one used for RT-throttling (rt_bandwidth), with the main difference
 157 * that, since here we are only interested in admission control, we
  158 * do not decrease any runtime while the group "executes", nor do we
  159 * need a timer to replenish it.
 160 *
 161 * With respect to SMP, the bandwidth is given on a per-CPU basis,
 162 * meaning that:
 163 *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
  164 *  - dl_total_bw array contains, in its i-th element, the currently
  165 *    allocated bandwidth on the i-th CPU.
 166 * Moreover, groups consume bandwidth on each CPU, while tasks only
 167 * consume bandwidth on the CPU they're running on.
 168 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
  169 * that will be shown the next time the proc or cgroup controls are
  170 * read. It can, in turn, be changed by writing to its own
  171 * control.
 172 */
 173struct dl_bandwidth {
 174        raw_spinlock_t dl_runtime_lock;
 175        u64 dl_runtime;
 176        u64 dl_period;
 177};
 178
 179static inline int dl_bandwidth_enabled(void)
 180{
 181        return sysctl_sched_rt_runtime >= 0;
 182}
 183
 184extern struct dl_bw *dl_bw_of(int i);
 185
 186struct dl_bw {
 187        raw_spinlock_t lock;
 188        u64 bw, total_bw;
 189};
 190
 191static inline
 192void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
 193{
 194        dl_b->total_bw -= tsk_bw;
 195}
 196
 197static inline
 198void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
 199{
 200        dl_b->total_bw += tsk_bw;
 201}
 202
 203static inline
 204bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 205{
 206        return dl_b->bw != -1 &&
 207               dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 208}
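
/*
 * Admission-control sketch (illustrative only, not a caller in this
 * file): with dl_b->bw corresponding to 50% of a CPU on a 4-CPU root
 * domain, a request of new_bw for a fresh task (old_bw == 0) is
 * accepted only while the summed allocation stays within 4 * 50%:
 *
 *	raw_spin_lock(&dl_b->lock);
 *	if (__dl_overflow(dl_b, cpus, 0, new_bw)) {
 *		raw_spin_unlock(&dl_b->lock);
 *		return -EBUSY;
 *	}
 *	__dl_add(dl_b, new_bw);
 *	raw_spin_unlock(&dl_b->lock);
 */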
 209
 210extern struct mutex sched_domains_mutex;
 211
 212#ifdef CONFIG_CGROUP_SCHED
 213
 214#include <linux/cgroup.h>
 215
 216struct cfs_rq;
 217struct rt_rq;
 218
 219extern struct list_head task_groups;
 220
 221struct cfs_bandwidth {
 222#ifdef CONFIG_CFS_BANDWIDTH
 223        raw_spinlock_t lock;
 224        ktime_t period;
 225        u64 quota, runtime;
 226        s64 hierarchical_quota;
 227        u64 runtime_expires;
 228
 229        int idle, period_active;
 230        struct hrtimer period_timer, slack_timer;
 231        struct list_head throttled_cfs_rq;
 232
 233        /* statistics */
 234        int nr_periods, nr_throttled;
 235        u64 throttled_time;
 236#endif
 237};
 238
 239/* task group related information */
 240struct task_group {
 241        struct cgroup_subsys_state css;
 242
 243#ifdef CONFIG_FAIR_GROUP_SCHED
 244        /* schedulable entities of this group on each cpu */
 245        struct sched_entity **se;
 246        /* runqueue "owned" by this group on each cpu */
 247        struct cfs_rq **cfs_rq;
 248        unsigned long shares;
 249
 250#ifdef  CONFIG_SMP
 251        atomic_long_t load_avg;
 252#endif
 253#endif
 254
 255#ifdef CONFIG_RT_GROUP_SCHED
 256        struct sched_rt_entity **rt_se;
 257        struct rt_rq **rt_rq;
 258
 259        struct rt_bandwidth rt_bandwidth;
 260#endif
 261
 262        struct rcu_head rcu;
 263        struct list_head list;
 264
 265        struct task_group *parent;
 266        struct list_head siblings;
 267        struct list_head children;
 268
 269#ifdef CONFIG_SCHED_AUTOGROUP
 270        struct autogroup *autogroup;
 271#endif
 272
 273        struct cfs_bandwidth cfs_bandwidth;
 274};
 275
 276#ifdef CONFIG_FAIR_GROUP_SCHED
 277#define ROOT_TASK_GROUP_LOAD    NICE_0_LOAD
 278
 279/*
  280 * A weight of 0 or 1 can cause arithmetic problems.
  281 * The weight of a cfs_rq is the sum of the weights of the entities
  282 * queued on it, so the weight of an entity should not be too large,
  283 * and neither should the shares value of a task group.
 284 * (The default weight is 1024 - so there's no practical
 285 *  limitation from this.)
 286 */
 287#define MIN_SHARES      (1UL <<  1)
 288#define MAX_SHARES      (1UL << 18)
 289#endif
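
/*
 * Example (illustrative): the default group weight is NICE_0_LOAD
 * (1024); a requested shares value is expected to be clamped into
 * [MIN_SHARES, MAX_SHARES] == [2, 262144] before use, along the lines
 * of:
 *
 *	shares = clamp(shares, MIN_SHARES, MAX_SHARES);
 */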
 290
 291typedef int (*tg_visitor)(struct task_group *, void *);
 292
 293extern int walk_tg_tree_from(struct task_group *from,
 294                             tg_visitor down, tg_visitor up, void *data);
 295
 296/*
 297 * Iterate the full tree, calling @down when first entering a node and @up when
 298 * leaving it for the final time.
 299 *
 300 * Caller must hold rcu_lock or sufficient equivalent.
 301 */
 302static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 303{
 304        return walk_tg_tree_from(&root_task_group, down, up, data);
 305}
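
/*
 * Usage sketch (illustrative; my_down_visitor/my_data are made up):
 * walk the whole hierarchy top-down only, passing tg_nop() (declared
 * below) for the "up" direction:
 *
 *	rcu_read_lock();
 *	walk_tg_tree(my_down_visitor, tg_nop, &my_data);
 *	rcu_read_unlock();
 *
 * A visitor returns 0 to continue the walk and non-zero to abort it.
 */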
 306
 307extern int tg_nop(struct task_group *tg, void *data);
 308
 309extern void free_fair_sched_group(struct task_group *tg);
 310extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
 311extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
 312extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 313                        struct sched_entity *se, int cpu,
 314                        struct sched_entity *parent);
 315extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 316extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 317
 318extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 319extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 320extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 321
 322extern void free_rt_sched_group(struct task_group *tg);
 323extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
 324extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 325                struct sched_rt_entity *rt_se, int cpu,
 326                struct sched_rt_entity *parent);
 327
 328extern struct task_group *sched_create_group(struct task_group *parent);
 329extern void sched_online_group(struct task_group *tg,
 330                               struct task_group *parent);
 331extern void sched_destroy_group(struct task_group *tg);
 332extern void sched_offline_group(struct task_group *tg);
 333
 334extern void sched_move_task(struct task_struct *tsk);
 335
 336#ifdef CONFIG_FAIR_GROUP_SCHED
 337extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 338#endif
 339
 340#else /* CONFIG_CGROUP_SCHED */
 341
 342struct cfs_bandwidth { };
 343
 344#endif  /* CONFIG_CGROUP_SCHED */
 345
 346/* CFS-related fields in a runqueue */
 347struct cfs_rq {
 348        struct load_weight load;
 349        unsigned int nr_running, h_nr_running;
 350
 351        u64 exec_clock;
 352        u64 min_vruntime;
 353#ifndef CONFIG_64BIT
 354        u64 min_vruntime_copy;
 355#endif
 356
 357        struct rb_root tasks_timeline;
 358        struct rb_node *rb_leftmost;
 359
 360        /*
  361         * 'curr' points to the currently running entity on this cfs_rq.
  362         * It is set to NULL otherwise (i.e. when none are currently running).
 363         */
 364        struct sched_entity *curr, *next, *last, *skip;
 365
 366#ifdef  CONFIG_SCHED_DEBUG
 367        unsigned int nr_spread_over;
 368#endif
 369
 370#ifdef CONFIG_SMP
 371        /*
 372         * CFS load tracking
 373         */
 374        struct sched_avg avg;
 375        u64 runnable_load_sum;
 376        unsigned long runnable_load_avg;
 377#ifdef CONFIG_FAIR_GROUP_SCHED
 378        unsigned long tg_load_avg_contrib;
 379#endif
 380        atomic_long_t removed_load_avg, removed_util_avg;
 381#ifndef CONFIG_64BIT
 382        u64 load_last_update_time_copy;
 383#endif
 384
 385#ifdef CONFIG_FAIR_GROUP_SCHED
 386        /*
 387         *   h_load = weight * f(tg)
 388         *
 389         * Where f(tg) is the recursive weight fraction assigned to
 390         * this group.
 391         */
 392        unsigned long h_load;
 393        u64 last_h_load_update;
 394        struct sched_entity *h_load_next;
 395#endif /* CONFIG_FAIR_GROUP_SCHED */
 396#endif /* CONFIG_SMP */
 397
 398#ifdef CONFIG_FAIR_GROUP_SCHED
 399        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 400
 401        /*
 402         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
  403         * a hierarchy). Non-leaf cfs_rqs hold other, higher schedulable entities
  404         * (like users, containers etc.)
  405         *
  406         * leaf_cfs_rq_list ties together the list of leaf cfs_rqs on a cpu. This
 407         * list is used during load balance.
 408         */
 409        int on_list;
 410        struct list_head leaf_cfs_rq_list;
 411        struct task_group *tg;  /* group that "owns" this runqueue */
 412
 413#ifdef CONFIG_CFS_BANDWIDTH
 414        int runtime_enabled;
 415        u64 runtime_expires;
 416        s64 runtime_remaining;
 417
 418        u64 throttled_clock, throttled_clock_task;
 419        u64 throttled_clock_task_time;
 420        int throttled, throttle_count;
 421        struct list_head throttled_list;
 422#endif /* CONFIG_CFS_BANDWIDTH */
 423#endif /* CONFIG_FAIR_GROUP_SCHED */
 424};
 425
 426static inline int rt_bandwidth_enabled(void)
 427{
 428        return sysctl_sched_rt_runtime >= 0;
 429}
 430
 431/* RT IPI pull logic requires IRQ_WORK */
 432#ifdef CONFIG_IRQ_WORK
 433# define HAVE_RT_PUSH_IPI
 434#endif
 435
 436/* Real-Time classes' related field in a runqueue: */
 437struct rt_rq {
 438        struct rt_prio_array active;
 439        unsigned int rt_nr_running;
 440#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 441        struct {
 442                int curr; /* highest queued rt task prio */
 443#ifdef CONFIG_SMP
 444                int next; /* next highest */
 445#endif
 446        } highest_prio;
 447#endif
 448#ifdef CONFIG_SMP
 449        unsigned long rt_nr_migratory;
 450        unsigned long rt_nr_total;
 451        int overloaded;
 452        struct plist_head pushable_tasks;
 453#ifdef HAVE_RT_PUSH_IPI
 454        int push_flags;
 455        int push_cpu;
 456        struct irq_work push_work;
 457        raw_spinlock_t push_lock;
 458#endif
 459#endif /* CONFIG_SMP */
 460        int rt_queued;
 461
 462        int rt_throttled;
 463        u64 rt_time;
 464        u64 rt_runtime;
 465        /* Nests inside the rq lock: */
 466        raw_spinlock_t rt_runtime_lock;
 467
 468#ifdef CONFIG_RT_GROUP_SCHED
 469        unsigned long rt_nr_boosted;
 470
 471        struct rq *rq;
 472        struct task_group *tg;
 473#endif
 474};
 475
 476/* Deadline class' related fields in a runqueue */
 477struct dl_rq {
 478        /* runqueue is an rbtree, ordered by deadline */
 479        struct rb_root rb_root;
 480        struct rb_node *rb_leftmost;
 481
 482        unsigned long dl_nr_running;
 483
 484#ifdef CONFIG_SMP
 485        /*
 486         * Deadline values of the currently executing and the
 487         * earliest ready task on this rq. Caching these facilitates
  488         * the decision whether or not a ready but not running task
 489         * should migrate somewhere else.
 490         */
 491        struct {
 492                u64 curr;
 493                u64 next;
 494        } earliest_dl;
 495
 496        unsigned long dl_nr_migratory;
 497        int overloaded;
 498
 499        /*
 500         * Tasks on this rq that can be pushed away. They are kept in
 501         * an rb-tree, ordered by tasks' deadlines, with caching
 502         * of the leftmost (earliest deadline) element.
 503         */
 504        struct rb_root pushable_dl_tasks_root;
 505        struct rb_node *pushable_dl_tasks_leftmost;
 506#else
 507        struct dl_bw dl_bw;
 508#endif
 509};
 510
 511#ifdef CONFIG_SMP
 512
 513/*
 514 * We add the notion of a root-domain which will be used to define per-domain
 515 * variables. Each exclusive cpuset essentially defines an island domain by
 516 * fully partitioning the member cpus from any other cpuset. Whenever a new
 517 * exclusive cpuset is created, we also create and attach a new root-domain
 518 * object.
 519 *
 520 */
 521struct root_domain {
 522        atomic_t refcount;
 523        atomic_t rto_count;
 524        struct rcu_head rcu;
 525        cpumask_var_t span;
 526        cpumask_var_t online;
 527
 528        /* Indicate more than one runnable task for any CPU */
 529        bool overload;
 530
 531        /*
 532         * The bit corresponding to a CPU gets set here if such CPU has more
 533         * than one runnable -deadline task (as it is below for RT tasks).
 534         */
 535        cpumask_var_t dlo_mask;
 536        atomic_t dlo_count;
 537        struct dl_bw dl_bw;
 538        struct cpudl cpudl;
 539
 540        /*
 541         * The "RT overload" flag: it gets set if a CPU has more than
 542         * one runnable RT task.
 543         */
 544        cpumask_var_t rto_mask;
 545        struct cpupri cpupri;
 546};
 547
 548extern struct root_domain def_root_domain;
 549
 550#endif /* CONFIG_SMP */
 551
 552/*
 553 * This is the main, per-CPU runqueue data structure.
 554 *
  555 * Locking rule: code that needs to lock multiple runqueues (such as
  556 * the load balancing or the thread migration code) must acquire the
  557 * locks in ascending &runqueue order.
 558 */
 559struct rq {
 560        /* runqueue lock: */
 561        raw_spinlock_t lock;
 562
 563        /*
 564         * nr_running and cpu_load should be in the same cacheline because
 565         * remote CPUs use both these fields when doing load calculation.
 566         */
 567        unsigned int nr_running;
 568#ifdef CONFIG_NUMA_BALANCING
 569        unsigned int nr_numa_running;
 570        unsigned int nr_preferred_running;
 571#endif
 572        #define CPU_LOAD_IDX_MAX 5
 573        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 574        unsigned long last_load_update_tick;
 575#ifdef CONFIG_NO_HZ_COMMON
 576        u64 nohz_stamp;
 577        unsigned long nohz_flags;
 578#endif
 579#ifdef CONFIG_NO_HZ_FULL
 580        unsigned long last_sched_tick;
 581#endif
 582        /* capture load from *all* tasks on this cpu: */
 583        struct load_weight load;
 584        unsigned long nr_load_updates;
 585        u64 nr_switches;
 586
 587        struct cfs_rq cfs;
 588        struct rt_rq rt;
 589        struct dl_rq dl;
 590
 591#ifdef CONFIG_FAIR_GROUP_SCHED
 592        /* list of leaf cfs_rq on this cpu: */
 593        struct list_head leaf_cfs_rq_list;
 594#endif /* CONFIG_FAIR_GROUP_SCHED */
 595
 596        /*
 597         * This is part of a global counter where only the total sum
 598         * over all CPUs matters. A task can increase this counter on
 599         * one CPU and if it got migrated afterwards it may decrease
 600         * it on another CPU. Always updated under the runqueue lock:
 601         */
 602        unsigned long nr_uninterruptible;
 603
 604        struct task_struct *curr, *idle, *stop;
 605        unsigned long next_balance;
 606        struct mm_struct *prev_mm;
 607
 608        unsigned int clock_skip_update;
 609        u64 clock;
 610        u64 clock_task;
 611
 612        atomic_t nr_iowait;
 613
 614#ifdef CONFIG_SMP
 615        struct root_domain *rd;
 616        struct sched_domain *sd;
 617
 618        unsigned long cpu_capacity;
 619        unsigned long cpu_capacity_orig;
 620
 621        struct callback_head *balance_callback;
 622
 623        unsigned char idle_balance;
 624        /* For active balancing */
 625        int active_balance;
 626        int push_cpu;
 627        struct cpu_stop_work active_balance_work;
 628        /* cpu of this runqueue: */
 629        int cpu;
 630        int online;
 631
 632        struct list_head cfs_tasks;
 633
 634        u64 rt_avg;
 635        u64 age_stamp;
 636        u64 idle_stamp;
 637        u64 avg_idle;
 638
 639        /* This is used to determine avg_idle's max value */
 640        u64 max_idle_balance_cost;
 641#endif
 642
 643#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 644        u64 prev_irq_time;
 645#endif
 646#ifdef CONFIG_PARAVIRT
 647        u64 prev_steal_time;
 648#endif
 649#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 650        u64 prev_steal_time_rq;
 651#endif
 652
 653        /* calc_load related fields */
 654        unsigned long calc_load_update;
 655        long calc_load_active;
 656
 657#ifdef CONFIG_SCHED_HRTICK
 658#ifdef CONFIG_SMP
 659        int hrtick_csd_pending;
 660        struct call_single_data hrtick_csd;
 661#endif
 662        struct hrtimer hrtick_timer;
 663#endif
 664
 665#ifdef CONFIG_SCHEDSTATS
 666        /* latency stats */
 667        struct sched_info rq_sched_info;
 668        unsigned long long rq_cpu_time;
 669        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 670
 671        /* sys_sched_yield() stats */
 672        unsigned int yld_count;
 673
 674        /* schedule() stats */
 675        unsigned int sched_count;
 676        unsigned int sched_goidle;
 677
 678        /* try_to_wake_up() stats */
 679        unsigned int ttwu_count;
 680        unsigned int ttwu_local;
 681#endif
 682
 683#ifdef CONFIG_SMP
 684        struct llist_head wake_list;
 685#endif
 686
 687#ifdef CONFIG_CPU_IDLE
  688        /* Must be inspected within an RCU read-side critical section */
 689        struct cpuidle_state *idle_state;
 690#endif
 691};
 692
 693static inline int cpu_of(struct rq *rq)
 694{
 695#ifdef CONFIG_SMP
 696        return rq->cpu;
 697#else
 698        return 0;
 699#endif
 700}
 701
 702DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 703
 704#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 705#define this_rq()               this_cpu_ptr(&runqueues)
 706#define task_rq(p)              cpu_rq(task_cpu(p))
 707#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 708#define raw_rq()                raw_cpu_ptr(&runqueues)
 709
 710static inline u64 __rq_clock_broken(struct rq *rq)
 711{
 712        return READ_ONCE(rq->clock);
 713}
 714
 715static inline u64 rq_clock(struct rq *rq)
 716{
 717        lockdep_assert_held(&rq->lock);
 718        return rq->clock;
 719}
 720
 721static inline u64 rq_clock_task(struct rq *rq)
 722{
 723        lockdep_assert_held(&rq->lock);
 724        return rq->clock_task;
 725}
 726
 727#define RQCF_REQ_SKIP   0x01
 728#define RQCF_ACT_SKIP   0x02
 729
 730static inline void rq_clock_skip_update(struct rq *rq, bool skip)
 731{
 732        lockdep_assert_held(&rq->lock);
 733        if (skip)
 734                rq->clock_skip_update |= RQCF_REQ_SKIP;
 735        else
 736                rq->clock_skip_update &= ~RQCF_REQ_SKIP;
 737}
 738
 739#ifdef CONFIG_NUMA
 740enum numa_topology_type {
 741        NUMA_DIRECT,
 742        NUMA_GLUELESS_MESH,
 743        NUMA_BACKPLANE,
 744};
 745extern enum numa_topology_type sched_numa_topology_type;
 746extern int sched_max_numa_distance;
 747extern bool find_numa_distance(int distance);
 748#endif
 749
 750#ifdef CONFIG_NUMA_BALANCING
 751/* The regions in numa_faults array from task_struct */
 752enum numa_faults_stats {
 753        NUMA_MEM = 0,
 754        NUMA_CPU,
 755        NUMA_MEMBUF,
 756        NUMA_CPUBUF
 757};
 758extern void sched_setnuma(struct task_struct *p, int node);
 759extern int migrate_task_to(struct task_struct *p, int cpu);
 760extern int migrate_swap(struct task_struct *, struct task_struct *);
 761#endif /* CONFIG_NUMA_BALANCING */
 762
 763#ifdef CONFIG_SMP
 764
 765static inline void
 766queue_balance_callback(struct rq *rq,
 767                       struct callback_head *head,
 768                       void (*func)(struct rq *rq))
 769{
 770        lockdep_assert_held(&rq->lock);
 771
 772        if (unlikely(head->next))
 773                return;
 774
 775        head->func = (void (*)(struct callback_head *))func;
 776        head->next = rq->balance_callback;
 777        rq->balance_callback = head;
 778}
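
/*
 * Usage sketch (illustrative, with made-up names): a scheduling class
 * typically keeps a per-cpu callback head and queues balance work that
 * must run only after rq->lock is dropped:
 *
 *	queue_balance_callback(rq, &per_cpu(my_balance_head, rq->cpu),
 *			       my_balance_fn);
 *
 * Queueing the same head twice is harmless: once queued, head->next is
 * non-NULL and the second call returns early.
 */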
 779
 780extern void sched_ttwu_pending(void);
 781
 782#define rcu_dereference_check_sched_domain(p) \
 783        rcu_dereference_check((p), \
 784                              lockdep_is_held(&sched_domains_mutex))
 785
 786/*
 787 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 788 * See detach_destroy_domains: synchronize_sched for details.
 789 *
 790 * The domain tree of any CPU may only be accessed from within
 791 * preempt-disabled sections.
 792 */
 793#define for_each_domain(cpu, __sd) \
 794        for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
 795                        __sd; __sd = __sd->parent)
 796
 797#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
 798
 799/**
 800 * highest_flag_domain - Return highest sched_domain containing flag.
 801 * @cpu:        The cpu whose highest level of sched domain is to
 802 *              be returned.
 803 * @flag:       The flag to check for the highest sched_domain
 804 *              for the given cpu.
 805 *
 806 * Returns the highest sched_domain of a cpu which contains the given flag.
 807 */
 808static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 809{
 810        struct sched_domain *sd, *hsd = NULL;
 811
 812        for_each_domain(cpu, sd) {
 813                if (!(sd->flags & flag))
 814                        break;
 815                hsd = sd;
 816        }
 817
 818        return hsd;
 819}
 820
 821static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 822{
 823        struct sched_domain *sd;
 824
 825        for_each_domain(cpu, sd) {
 826                if (sd->flags & flag)
 827                        break;
 828        }
 829
 830        return sd;
 831}
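
/*
 * Usage sketch (illustrative): the per-cpu pointers declared below are
 * the typical consumers of these helpers, e.g. something like
 *
 *	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 *
 * to find the widest domain whose CPUs still share a last-level cache.
 */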
 832
 833DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 834DECLARE_PER_CPU(int, sd_llc_size);
 835DECLARE_PER_CPU(int, sd_llc_id);
 836DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 837DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 838DECLARE_PER_CPU(struct sched_domain *, sd_asym);
 839
 840struct sched_group_capacity {
 841        atomic_t ref;
 842        /*
 843         * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
 844         * for a single CPU.
 845         */
 846        unsigned int capacity;
 847        unsigned long next_update;
 848        int imbalance; /* XXX unrelated to capacity but shared group state */
 849        /*
 850         * Number of busy cpus in this group.
 851         */
 852        atomic_t nr_busy_cpus;
 853
 854        unsigned long cpumask[0]; /* iteration mask */
 855};
 856
 857struct sched_group {
 858        struct sched_group *next;       /* Must be a circular list */
 859        atomic_t ref;
 860
 861        unsigned int group_weight;
 862        struct sched_group_capacity *sgc;
 863
 864        /*
 865         * The CPUs this group covers.
 866         *
 867         * NOTE: this field is variable length. (Allocated dynamically
 868         * by attaching extra space to the end of the structure,
 869         * depending on how many CPUs the kernel has booted up with)
 870         */
 871        unsigned long cpumask[0];
 872};
 873
 874static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
 875{
 876        return to_cpumask(sg->cpumask);
 877}
 878
 879/*
 880 * cpumask masking which cpus in the group are allowed to iterate up the domain
 881 * tree.
 882 */
 883static inline struct cpumask *sched_group_mask(struct sched_group *sg)
 884{
 885        return to_cpumask(sg->sgc->cpumask);
 886}
 887
 888/**
 889 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 890 * @group: The group whose first cpu is to be returned.
 891 */
 892static inline unsigned int group_first_cpu(struct sched_group *group)
 893{
 894        return cpumask_first(sched_group_cpus(group));
 895}
 896
 897extern int group_balance_cpu(struct sched_group *sg);
 898
 899#else
 900
 901static inline void sched_ttwu_pending(void) { }
 902
 903#endif /* CONFIG_SMP */
 904
 905#include "stats.h"
 906#include "auto_group.h"
 907
 908#ifdef CONFIG_CGROUP_SCHED
 909
 910/*
  911 * Return the group to which this task belongs.
 912 *
 913 * We cannot use task_css() and friends because the cgroup subsystem
 914 * changes that value before the cgroup_subsys::attach() method is called,
 915 * therefore we cannot pin it and might observe the wrong value.
 916 *
 917 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
 918 * core changes this before calling sched_move_task().
 919 *
 920 * Instead we use a 'copy' which is updated from sched_move_task() while
 921 * holding both task_struct::pi_lock and rq::lock.
 922 */
 923static inline struct task_group *task_group(struct task_struct *p)
 924{
 925        return p->sched_task_group;
 926}
 927
 928/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 929static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 930{
 931#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
 932        struct task_group *tg = task_group(p);
 933#endif
 934
 935#ifdef CONFIG_FAIR_GROUP_SCHED
 936        p->se.cfs_rq = tg->cfs_rq[cpu];
 937        p->se.parent = tg->se[cpu];
 938#endif
 939
 940#ifdef CONFIG_RT_GROUP_SCHED
 941        p->rt.rt_rq  = tg->rt_rq[cpu];
 942        p->rt.parent = tg->rt_se[cpu];
 943#endif
 944}
 945
 946#else /* CONFIG_CGROUP_SCHED */
 947
 948static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 949static inline struct task_group *task_group(struct task_struct *p)
 950{
 951        return NULL;
 952}
 953
 954#endif /* CONFIG_CGROUP_SCHED */
 955
 956static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 957{
 958        set_task_rq(p, cpu);
 959#ifdef CONFIG_SMP
 960        /*
 961         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
  962         * successfully executed on another CPU. We must ensure that updates of
 963         * per-task data have been completed by this moment.
 964         */
 965        smp_wmb();
 966        task_thread_info(p)->cpu = cpu;
 967        p->wake_cpu = cpu;
 968#endif
 969}
 970
 971/*
 972 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 973 */
 974#ifdef CONFIG_SCHED_DEBUG
 975# include <linux/static_key.h>
 976# define const_debug __read_mostly
 977#else
 978# define const_debug const
 979#endif
 980
 981extern const_debug unsigned int sysctl_sched_features;
 982
 983#define SCHED_FEAT(name, enabled)       \
 984        __SCHED_FEAT_##name ,
 985
 986enum {
 987#include "features.h"
 988        __SCHED_FEAT_NR,
 989};
 990
 991#undef SCHED_FEAT
 992
 993#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
 994#define SCHED_FEAT(name, enabled)                                       \
 995static __always_inline bool static_branch_##name(struct static_key *key) \
 996{                                                                       \
 997        return static_key_##enabled(key);                               \
 998}
 999
1000#include "features.h"
1001
1002#undef SCHED_FEAT
1003
1004extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
1005#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
1006#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
1007#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1008#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
1009
1010extern struct static_key_false sched_numa_balancing;
1011
1012static inline u64 global_rt_period(void)
1013{
1014        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1015}
1016
1017static inline u64 global_rt_runtime(void)
1018{
1019        if (sysctl_sched_rt_runtime < 0)
1020                return RUNTIME_INF;
1021
1022        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1023}
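
/*
 * Example (illustrative): with the usual defaults of a 1,000,000 us
 * period and a 950,000 us runtime, RT tasks may consume at most 95% of
 * each period; writing -1 to the runtime sysctl turns this into
 * RUNTIME_INF, i.e. no RT throttling at all.
 */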
1024
1025static inline int task_current(struct rq *rq, struct task_struct *p)
1026{
1027        return rq->curr == p;
1028}
1029
1030static inline int task_running(struct rq *rq, struct task_struct *p)
1031{
1032#ifdef CONFIG_SMP
1033        return p->on_cpu;
1034#else
1035        return task_current(rq, p);
1036#endif
1037}
1038
1039static inline int task_on_rq_queued(struct task_struct *p)
1040{
1041        return p->on_rq == TASK_ON_RQ_QUEUED;
1042}
1043
1044static inline int task_on_rq_migrating(struct task_struct *p)
1045{
1046        return p->on_rq == TASK_ON_RQ_MIGRATING;
1047}
1048
1049#ifndef prepare_arch_switch
1050# define prepare_arch_switch(next)      do { } while (0)
1051#endif
1052#ifndef finish_arch_post_lock_switch
1053# define finish_arch_post_lock_switch() do { } while (0)
1054#endif
1055
1056static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1057{
1058#ifdef CONFIG_SMP
1059        /*
1060         * We can optimise this out completely for !SMP, because the
1061         * SMP rebalancing from interrupt is the only thing that cares
1062         * here.
1063         */
1064        next->on_cpu = 1;
1065#endif
1066}
1067
1068static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1069{
1070#ifdef CONFIG_SMP
1071        /*
1072         * After ->on_cpu is cleared, the task can be moved to a different CPU.
1073         * We must ensure this doesn't happen until the switch is completely
1074         * finished.
1075         *
1076         * In particular, the load of prev->state in finish_task_switch() must
1077         * happen before this.
1078         *
1079         * Pairs with the control dependency and rmb in try_to_wake_up().
1080         */
1081        smp_store_release(&prev->on_cpu, 0);
1082#endif
1083#ifdef CONFIG_DEBUG_SPINLOCK
1084        /* this is a valid case when another task releases the spinlock */
1085        rq->lock.owner = current;
1086#endif
1087        /*
1088         * If we are tracking spinlock dependencies then we have to
1089         * fix up the runqueue lock - which gets 'carried over' from
1090         * prev into current:
1091         */
1092        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1093
1094        raw_spin_unlock_irq(&rq->lock);
1095}
1096
1097/*
1098 * wake flags
1099 */
1100#define WF_SYNC         0x01            /* waker goes to sleep after wakeup */
1101#define WF_FORK         0x02            /* child wakeup after fork */
1102#define WF_MIGRATED     0x4             /* internal use, task got migrated */
1103
1104/*
1105 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1106 * of tasks with abnormal "nice" values across CPUs the contribution that
1107 * each task makes to its run queue's load is weighted according to its
1108 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1109 * scaled version of the new time slice allocation that they receive on time
1110 * slice expiry etc.
1111 */
1112
1113#define WEIGHT_IDLEPRIO                3
1114#define WMULT_IDLEPRIO         1431655765
1115
1116/*
1117 * Nice levels are multiplicative, with a gentle 10% change for every
1118 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1119 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1120 * that remained on nice 0.
1121 *
1122 * The "10% effect" is relative and cumulative: from _any_ nice level,
1123 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 1124 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
1125 * If a task goes up by ~10% and another task goes down by ~10% then
1126 * the relative distance between them is ~25%.)
1127 */
1128static const int prio_to_weight[40] = {
1129 /* -20 */     88761,     71755,     56483,     46273,     36291,
1130 /* -15 */     29154,     23254,     18705,     14949,     11916,
1131 /* -10 */      9548,      7620,      6100,      4904,      3906,
1132 /*  -5 */      3121,      2501,      1991,      1586,      1277,
1133 /*   0 */      1024,       820,       655,       526,       423,
1134 /*   5 */       335,       272,       215,       172,       137,
1135 /*  10 */       110,        87,        70,        56,        45,
1136 /*  15 */        36,        29,        23,        18,        15,
1137};
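
/*
 * Worked example (illustrative): nice 0 maps to 1024 and each nice
 * level is roughly a factor of 1.25: 1024 * 1.25 ~= 1277 (nice -1) and
 * 1024 / 1.25 ~= 820 (nice +1), which is where the ~10% relative
 * CPU-time change per nice level comes from.
 */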
1138
1139/*
1140 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1141 *
1142 * In cases where the weight does not change often, we can use the
1143 * precalculated inverse to speed up arithmetics by turning divisions
1144 * into multiplications:
1145 */
1146static const u32 prio_to_wmult[40] = {
1147 /* -20 */     48388,     59856,     76040,     92818,    118348,
1148 /* -15 */    147320,    184698,    229616,    287308,    360437,
1149 /* -10 */    449829,    563644,    704093,    875809,   1099582,
1150 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
1151 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
1152 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
1153 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
1154 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1155};
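
/*
 * Worked example (illustrative): for nice 0, 2^32 / 1024 == 4194304,
 * the value above.  A division such as delta / weight can then be
 * computed as (delta * 4194304) >> 32, i.e. a multiply and a shift
 * instead of a divide.
 */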
1156
1157#define ENQUEUE_WAKEUP          0x01
1158#define ENQUEUE_HEAD            0x02
1159#ifdef CONFIG_SMP
1160#define ENQUEUE_WAKING          0x04    /* sched_class::task_waking was called */
1161#else
1162#define ENQUEUE_WAKING          0x00
1163#endif
1164#define ENQUEUE_REPLENISH       0x08
1165#define ENQUEUE_RESTORE 0x10
1166
1167#define DEQUEUE_SLEEP           0x01
1168#define DEQUEUE_SAVE            0x02
1169
1170#define RETRY_TASK              ((void *)-1UL)
1171
1172struct sched_class {
1173        const struct sched_class *next;
1174
1175        void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1176        void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1177        void (*yield_task) (struct rq *rq);
1178        bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
1179
1180        void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1181
1182        /*
1183         * It is the responsibility of the pick_next_task() method that will
1184         * return the next task to call put_prev_task() on the @prev task or
1185         * something equivalent.
1186         *
1187         * May return RETRY_TASK when it finds a higher prio class has runnable
1188         * tasks.
1189         */
1190        struct task_struct * (*pick_next_task) (struct rq *rq,
1191                                                struct task_struct *prev);
1192        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1193
1194#ifdef CONFIG_SMP
1195        int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1196        void (*migrate_task_rq)(struct task_struct *p);
1197
1198        void (*task_waking) (struct task_struct *task);
1199        void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1200
1201        void (*set_cpus_allowed)(struct task_struct *p,
1202                                 const struct cpumask *newmask);
1203
1204        void (*rq_online)(struct rq *rq);
1205        void (*rq_offline)(struct rq *rq);
1206#endif
1207
1208        void (*set_curr_task) (struct rq *rq);
1209        void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1210        void (*task_fork) (struct task_struct *p);
1211        void (*task_dead) (struct task_struct *p);
1212
1213        /*
1214         * The switched_from() call is allowed to drop rq->lock, therefore we
 1215         * cannot assume the switched_from/switched_to pair is serialized by
1216         * rq->lock. They are however serialized by p->pi_lock.
1217         */
1218        void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1219        void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1220        void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1221                             int oldprio);
1222
1223        unsigned int (*get_rr_interval) (struct rq *rq,
1224                                         struct task_struct *task);
1225
1226        void (*update_curr) (struct rq *rq);
1227
1228#ifdef CONFIG_FAIR_GROUP_SCHED
1229        void (*task_move_group) (struct task_struct *p);
1230#endif
1231};
1232
1233static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1234{
1235        prev->sched_class->put_prev_task(rq, prev);
1236}
1237
1238#define sched_class_highest (&stop_sched_class)
1239#define for_each_class(class) \
1240   for (class = sched_class_highest; class; class = class->next)
1241
1242extern const struct sched_class stop_sched_class;
1243extern const struct sched_class dl_sched_class;
1244extern const struct sched_class rt_sched_class;
1245extern const struct sched_class fair_sched_class;
1246extern const struct sched_class idle_sched_class;
1247
1248
1249#ifdef CONFIG_SMP
1250
1251extern void update_group_capacity(struct sched_domain *sd, int cpu);
1252
1253extern void trigger_load_balance(struct rq *rq);
1254
1255extern void idle_enter_fair(struct rq *this_rq);
1256extern void idle_exit_fair(struct rq *this_rq);
1257
1258extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
1259
1260#else
1261
1262static inline void idle_enter_fair(struct rq *rq) { }
1263static inline void idle_exit_fair(struct rq *rq) { }
1264
1265#endif
1266
1267#ifdef CONFIG_CPU_IDLE
1268static inline void idle_set_state(struct rq *rq,
1269                                  struct cpuidle_state *idle_state)
1270{
1271        rq->idle_state = idle_state;
1272}
1273
1274static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1275{
1276        WARN_ON(!rcu_read_lock_held());
1277        return rq->idle_state;
1278}
1279#else
1280static inline void idle_set_state(struct rq *rq,
1281                                  struct cpuidle_state *idle_state)
1282{
1283}
1284
1285static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1286{
1287        return NULL;
1288}
1289#endif
1290
1291extern void sysrq_sched_debug_show(void);
1292extern void sched_init_granularity(void);
1293extern void update_max_interval(void);
1294
1295extern void init_sched_dl_class(void);
1296extern void init_sched_rt_class(void);
1297extern void init_sched_fair_class(void);
1298
1299extern void resched_curr(struct rq *rq);
1300extern void resched_cpu(int cpu);
1301
1302extern struct rt_bandwidth def_rt_bandwidth;
1303extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1304
1305extern struct dl_bandwidth def_dl_bandwidth;
1306extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1307extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1308
1309unsigned long to_ratio(u64 period, u64 runtime);
1310
1311extern void init_entity_runnable_average(struct sched_entity *se);
1312
1313static inline void add_nr_running(struct rq *rq, unsigned count)
1314{
1315        unsigned prev_nr = rq->nr_running;
1316
1317        rq->nr_running = prev_nr + count;
1318
1319        if (prev_nr < 2 && rq->nr_running >= 2) {
1320#ifdef CONFIG_SMP
1321                if (!rq->rd->overload)
1322                        rq->rd->overload = true;
1323#endif
1324
1325#ifdef CONFIG_NO_HZ_FULL
1326                if (tick_nohz_full_cpu(rq->cpu)) {
1327                        /*
1328                         * Tick is needed if more than one task runs on a CPU.
1329                         * Send the target an IPI to kick it out of nohz mode.
1330                         *
1331                         * We assume that IPI implies full memory barrier and the
1332                         * new value of rq->nr_running is visible on reception
1333                         * from the target.
1334                         */
1335                        tick_nohz_full_kick_cpu(rq->cpu);
1336                }
1337#endif
1338        }
1339}
1340
1341static inline void sub_nr_running(struct rq *rq, unsigned count)
1342{
1343        rq->nr_running -= count;
1344}
1345
1346static inline void rq_last_tick_reset(struct rq *rq)
1347{
1348#ifdef CONFIG_NO_HZ_FULL
1349        rq->last_sched_tick = jiffies;
1350#endif
1351}
1352
1353extern void update_rq_clock(struct rq *rq);
1354
1355extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
1356extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
1357
1358extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
1359
1360extern const_debug unsigned int sysctl_sched_time_avg;
1361extern const_debug unsigned int sysctl_sched_nr_migrate;
1362extern const_debug unsigned int sysctl_sched_migration_cost;
1363
1364static inline u64 sched_avg_period(void)
1365{
1366        return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1367}
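
/*
 * Example (illustrative): assuming the default averaging window of
 * 1000 ms for sysctl_sched_time_avg, sched_avg_period() comes to
 * 500 ms expressed in nanoseconds.
 */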
1368
1369#ifdef CONFIG_SCHED_HRTICK
1370
1371/*
1372 * Use hrtick when:
1373 *  - enabled by features
1374 *  - hrtimer is actually high res
1375 */
1376static inline int hrtick_enabled(struct rq *rq)
1377{
1378        if (!sched_feat(HRTICK))
1379                return 0;
1380        if (!cpu_active(cpu_of(rq)))
1381                return 0;
1382        return hrtimer_is_hres_active(&rq->hrtick_timer);
1383}
1384
1385void hrtick_start(struct rq *rq, u64 delay);
1386
1387#else
1388
1389static inline int hrtick_enabled(struct rq *rq)
1390{
1391        return 0;
1392}
1393
1394#endif /* CONFIG_SCHED_HRTICK */
1395
1396#ifdef CONFIG_SMP
1397extern void sched_avg_update(struct rq *rq);
1398
1399#ifndef arch_scale_freq_capacity
1400static __always_inline
1401unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
1402{
1403        return SCHED_CAPACITY_SCALE;
1404}
1405#endif
1406
1407#ifndef arch_scale_cpu_capacity
1408static __always_inline
1409unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
1410{
1411        if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
1412                return sd->smt_gain / sd->span_weight;
1413
1414        return SCHED_CAPACITY_SCALE;
1415}
1416#endif
1417
1418static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1419{
1420        rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
1421        sched_avg_update(rq);
1422}
1423#else
1424static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1425static inline void sched_avg_update(struct rq *rq) { }
1426#endif
1427
1428/*
1429 * __task_rq_lock - lock the rq @p resides on.
1430 */
1431static inline struct rq *__task_rq_lock(struct task_struct *p)
1432        __acquires(rq->lock)
1433{
1434        struct rq *rq;
1435
1436        lockdep_assert_held(&p->pi_lock);
1437
1438        for (;;) {
1439                rq = task_rq(p);
1440                raw_spin_lock(&rq->lock);
1441                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
1442                        lockdep_pin_lock(&rq->lock);
1443                        return rq;
1444                }
1445                raw_spin_unlock(&rq->lock);
1446
1447                while (unlikely(task_on_rq_migrating(p)))
1448                        cpu_relax();
1449        }
1450}
1451
1452/*
1453 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
1454 */
1455static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1456        __acquires(p->pi_lock)
1457        __acquires(rq->lock)
1458{
1459        struct rq *rq;
1460
1461        for (;;) {
1462                raw_spin_lock_irqsave(&p->pi_lock, *flags);
1463                rq = task_rq(p);
1464                raw_spin_lock(&rq->lock);
1465                /*
1466                 *      move_queued_task()              task_rq_lock()
1467                 *
1468                 *      ACQUIRE (rq->lock)
1469                 *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
1470                 *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
1471                 *      [S] ->cpu = new_cpu             [L] task_rq()
1472                 *                                      [L] ->on_rq
1473                 *      RELEASE (rq->lock)
1474                 *
1475                 * If we observe the old cpu in task_rq_lock, the acquire of
1476                 * the old rq->lock will fully serialize against the stores.
1477                 *
1478                 * If we observe the new cpu in task_rq_lock, the acquire will
1479                 * pair with the WMB to ensure we must then also see migrating.
1480                 */
1481                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
1482                        lockdep_pin_lock(&rq->lock);
1483                        return rq;
1484                }
1485                raw_spin_unlock(&rq->lock);
1486                raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1487
1488                while (unlikely(task_on_rq_migrating(p)))
1489                        cpu_relax();
1490        }
1491}
1492
1493static inline void __task_rq_unlock(struct rq *rq)
1494        __releases(rq->lock)
1495{
1496        lockdep_unpin_lock(&rq->lock);
1497        raw_spin_unlock(&rq->lock);
1498}
1499
1500static inline void
1501task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
1502        __releases(rq->lock)
1503        __releases(p->pi_lock)
1504{
1505        lockdep_unpin_lock(&rq->lock);
1506        raw_spin_unlock(&rq->lock);
1507        raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1508}
1509
1510#ifdef CONFIG_SMP
1511#ifdef CONFIG_PREEMPT
1512
1513static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1514
1515/*
1516 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1517 * way at the expense of forcing extra atomic operations in all
1518 * invocations.  This assures that the double_lock is acquired using the
1519 * same underlying policy as the spinlock_t on this architecture, which
1520 * reduces latency compared to the unfair variant below.  However, it
1521 * also adds more overhead and therefore may reduce throughput.
1522 */
1523static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1524        __releases(this_rq->lock)
1525        __acquires(busiest->lock)
1526        __acquires(this_rq->lock)
1527{
1528        raw_spin_unlock(&this_rq->lock);
1529        double_rq_lock(this_rq, busiest);
1530
1531        return 1;
1532}
1533
1534#else
1535/*
1536 * Unfair double_lock_balance: Optimizes throughput at the expense of
1537 * latency by eliminating extra atomic operations when the locks are
1538 * already in proper order on entry.  This favors lower cpu-ids and will
1539 * grant the double lock to lower cpus over higher ids under contention,
1540 * regardless of entry order into the function.
1541 */
1542static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1543        __releases(this_rq->lock)
1544        __acquires(busiest->lock)
1545        __acquires(this_rq->lock)
1546{
1547        int ret = 0;
1548
1549        if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1550                if (busiest < this_rq) {
1551                        raw_spin_unlock(&this_rq->lock);
1552                        raw_spin_lock(&busiest->lock);
1553                        raw_spin_lock_nested(&this_rq->lock,
1554                                              SINGLE_DEPTH_NESTING);
1555                        ret = 1;
1556                } else
1557                        raw_spin_lock_nested(&busiest->lock,
1558                                              SINGLE_DEPTH_NESTING);
1559        }
1560        return ret;
1561}
1562
1563#endif /* CONFIG_PREEMPT */
1564
1565/*
1566 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1567 */
1568static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1569{
1570        if (unlikely(!irqs_disabled())) {
 1571                /* printk() doesn't work well under rq->lock */
1572                raw_spin_unlock(&this_rq->lock);
1573                BUG_ON(1);
1574        }
1575
1576        return _double_lock_balance(this_rq, busiest);
1577}
1578
1579static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1580        __releases(busiest->lock)
1581{
1582        raw_spin_unlock(&busiest->lock);
1583        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1584}
1585
1586static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1587{
1588        if (l1 > l2)
1589                swap(l1, l2);
1590
1591        spin_lock(l1);
1592        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1593}
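
/*
 * Example (illustrative): if one CPU calls double_lock(l1, l2) while
 * another calls double_lock(l2, l1), both take the lock with the lower
 * address first, so the classic ABBA deadlock cannot occur.
 */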
1594
1595static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
1596{
1597        if (l1 > l2)
1598                swap(l1, l2);
1599
1600        spin_lock_irq(l1);
1601        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1602}
1603
1604static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1605{
1606        if (l1 > l2)
1607                swap(l1, l2);
1608
1609        raw_spin_lock(l1);
1610        raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1611}
1612
1613/*
1614 * double_rq_lock - safely lock two runqueues
1615 *
1616 * Note this does not disable interrupts like task_rq_lock,
1617 * you need to do so manually before calling.
1618 */
1619static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1620        __acquires(rq1->lock)
1621        __acquires(rq2->lock)
1622{
1623        BUG_ON(!irqs_disabled());
1624        if (rq1 == rq2) {
1625                raw_spin_lock(&rq1->lock);
1626                __acquire(rq2->lock);   /* Fake it out ;) */
1627        } else {
1628                if (rq1 < rq2) {
1629                        raw_spin_lock(&rq1->lock);
1630                        raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1631                } else {
1632                        raw_spin_lock(&rq2->lock);
1633                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1634                }
1635        }
1636}
1637
1638/*
1639 * double_rq_unlock - safely unlock two runqueues
1640 *
1641 * Note this does not restore interrupts like task_rq_unlock,
1642 * you need to do so manually after calling.
1643 */
1644static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1645        __releases(rq1->lock)
1646        __releases(rq2->lock)
1647{
1648        raw_spin_unlock(&rq1->lock);
1649        if (rq1 != rq2)
1650                raw_spin_unlock(&rq2->lock);
1651        else
1652                __release(rq2->lock);
1653}
1654
1655#else /* CONFIG_SMP */
1656
1657/*
1658 * double_rq_lock - safely lock two runqueues
1659 *
1660 * Note this does not disable interrupts like task_rq_lock,
1661 * you need to do so manually before calling.
1662 */
1663static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1664        __acquires(rq1->lock)
1665        __acquires(rq2->lock)
1666{
1667        BUG_ON(!irqs_disabled());
1668        BUG_ON(rq1 != rq2);
1669        raw_spin_lock(&rq1->lock);
1670        __acquire(rq2->lock);   /* Fake it out ;) */
1671}
1672
1673/*
1674 * double_rq_unlock - safely unlock two runqueues
1675 *
1676 * Note this does not restore interrupts like task_rq_unlock,
1677 * you need to do so manually after calling.
1678 */
1679static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1680        __releases(rq1->lock)
1681        __releases(rq2->lock)
1682{
1683        BUG_ON(rq1 != rq2);
1684        raw_spin_unlock(&rq1->lock);
1685        __release(rq2->lock);
1686}
1687
1688#endif
1689
1690extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1691extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1692
1693#ifdef  CONFIG_SCHED_DEBUG
1694extern void print_cfs_stats(struct seq_file *m, int cpu);
1695extern void print_rt_stats(struct seq_file *m, int cpu);
1696extern void print_dl_stats(struct seq_file *m, int cpu);
1697extern void
1698print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
1699
1700#ifdef CONFIG_NUMA_BALANCING
1701extern void
1702show_numa_stats(struct task_struct *p, struct seq_file *m);
1703extern void
1704print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
1705        unsigned long tpf, unsigned long gsf, unsigned long gpf);
1706#endif /* CONFIG_NUMA_BALANCING */
1707#endif /* CONFIG_SCHED_DEBUG */
1708
1709extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1710extern void init_rt_rq(struct rt_rq *rt_rq);
1711extern void init_dl_rq(struct dl_rq *dl_rq);
1712
1713extern void cfs_bandwidth_usage_inc(void);
1714extern void cfs_bandwidth_usage_dec(void);
1715
1716#ifdef CONFIG_NO_HZ_COMMON
1717enum rq_nohz_flag_bits {
1718        NOHZ_TICK_STOPPED,
1719        NOHZ_BALANCE_KICK,
1720};
1721
1722#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1723#endif
1724
1725#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1726
1727DECLARE_PER_CPU(u64, cpu_hardirq_time);
1728DECLARE_PER_CPU(u64, cpu_softirq_time);
1729
1730#ifndef CONFIG_64BIT
1731DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1732
1733static inline void irq_time_write_begin(void)
1734{
1735        __this_cpu_inc(irq_time_seq.sequence);
1736        smp_wmb();
1737}
1738
1739static inline void irq_time_write_end(void)
1740{
1741        smp_wmb();
1742        __this_cpu_inc(irq_time_seq.sequence);
1743}
1744
1745static inline u64 irq_time_read(int cpu)
1746{
1747        u64 irq_time;
1748        unsigned seq;
1749
1750        do {
1751                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1752                irq_time = per_cpu(cpu_softirq_time, cpu) +
1753                           per_cpu(cpu_hardirq_time, cpu);
1754        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1755
1756        return irq_time;
1757}
1758#else /* CONFIG_64BIT */
1759static inline void irq_time_write_begin(void)
1760{
1761}
1762
1763static inline void irq_time_write_end(void)
1764{
1765}
1766
1767static inline u64 irq_time_read(int cpu)
1768{
1769        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1770}
1771#endif /* CONFIG_64BIT */
1772#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1773