linux/kernel/sched/rt.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   4 * policies)
   5 */
   6#include "sched.h"
   7
   8#include "pelt.h"
   9
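/*
 * Default SCHED_RR timeslice: sched_rr_timeslice is kept in jiffies
 * (RR_TIMESLICE), while sysctl_sched_rr_timeslice mirrors the same
 * value in milliseconds for the corresponding sysctl.
 */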
  10int sched_rr_timeslice = RR_TIMESLICE;
  11int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
  12
  13static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  14
  15struct rt_bandwidth def_rt_bandwidth;
  16
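/*
 * Replenishment timer for an rt_bandwidth: fires every rt_period, calls
 * do_sched_rt_period_timer() to refresh runtime on the covered runqueues,
 * and keeps rearming itself until they all report idle.
 */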
  17static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  18{
  19        struct rt_bandwidth *rt_b =
  20                container_of(timer, struct rt_bandwidth, rt_period_timer);
  21        int idle = 0;
  22        int overrun;
  23
  24        raw_spin_lock(&rt_b->rt_runtime_lock);
  25        for (;;) {
  26                overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  27                if (!overrun)
  28                        break;
  29
  30                raw_spin_unlock(&rt_b->rt_runtime_lock);
  31                idle = do_sched_rt_period_timer(rt_b, overrun);
  32                raw_spin_lock(&rt_b->rt_runtime_lock);
  33        }
  34        if (idle)
  35                rt_b->rt_period_active = 0;
  36        raw_spin_unlock(&rt_b->rt_runtime_lock);
  37
  38        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  39}
  40
  41void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  42{
  43        rt_b->rt_period = ns_to_ktime(period);
  44        rt_b->rt_runtime = runtime;
  45
  46        raw_spin_lock_init(&rt_b->rt_runtime_lock);
  47
  48        hrtimer_init(&rt_b->rt_period_timer,
  49                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  50        rt_b->rt_period_timer.function = sched_rt_period_timer;
  51}
  52
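/*
 * Arm the replenishment timer for this bandwidth, unless throttling is
 * disabled or the runtime is infinite. Called when RT entities are
 * accounted on an rt_rq (see inc_rt_group()).
 */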
  53static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  54{
  55        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  56                return;
  57
  58        raw_spin_lock(&rt_b->rt_runtime_lock);
  59        if (!rt_b->rt_period_active) {
  60                rt_b->rt_period_active = 1;
  61                /*
   62                 * SCHED_DEADLINE updates the bandwidth, as a runaway
  63                 * RT task with a DL task could hog a CPU. But DL does
  64                 * not reset the period. If a deadline task was running
  65                 * without an RT task running, it can cause RT tasks to
  66                 * throttle when they start up. Kick the timer right away
  67                 * to update the period.
  68                 */
  69                hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
  70                hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
  71        }
  72        raw_spin_unlock(&rt_b->rt_runtime_lock);
  73}
  74
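/*
 * Initialize an rt_rq: empty per-priority run lists, the bitmap delimiter
 * bit, and zeroed throttling/bandwidth state.
 */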
  75void init_rt_rq(struct rt_rq *rt_rq)
  76{
  77        struct rt_prio_array *array;
  78        int i;
  79
  80        array = &rt_rq->active;
  81        for (i = 0; i < MAX_RT_PRIO; i++) {
  82                INIT_LIST_HEAD(array->queue + i);
  83                __clear_bit(i, array->bitmap);
  84        }
  85        /* delimiter for bitsearch: */
  86        __set_bit(MAX_RT_PRIO, array->bitmap);
  87
  88#if defined CONFIG_SMP
  89        rt_rq->highest_prio.curr = MAX_RT_PRIO;
  90        rt_rq->highest_prio.next = MAX_RT_PRIO;
  91        rt_rq->rt_nr_migratory = 0;
  92        rt_rq->overloaded = 0;
  93        plist_head_init(&rt_rq->pushable_tasks);
  94#endif /* CONFIG_SMP */
   95        /* We start in a dequeued state, because no RT tasks are queued */
  96        rt_rq->rt_queued = 0;
  97
  98        rt_rq->rt_time = 0;
  99        rt_rq->rt_throttled = 0;
 100        rt_rq->rt_runtime = 0;
 101        raw_spin_lock_init(&rt_rq->rt_runtime_lock);
 102}
 103
 104#ifdef CONFIG_RT_GROUP_SCHED
 105static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 106{
 107        hrtimer_cancel(&rt_b->rt_period_timer);
 108}
 109
 110#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 111
 112static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 113{
 114#ifdef CONFIG_SCHED_DEBUG
 115        WARN_ON_ONCE(!rt_entity_is_task(rt_se));
 116#endif
 117        return container_of(rt_se, struct task_struct, rt);
 118}
 119
 120static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 121{
 122        return rt_rq->rq;
 123}
 124
 125static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 126{
 127        return rt_se->rt_rq;
 128}
 129
 130static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 131{
 132        struct rt_rq *rt_rq = rt_se->rt_rq;
 133
 134        return rt_rq->rq;
 135}
 136
 137void free_rt_sched_group(struct task_group *tg)
 138{
 139        int i;
 140
 141        if (tg->rt_se)
 142                destroy_rt_bandwidth(&tg->rt_bandwidth);
 143
 144        for_each_possible_cpu(i) {
 145                if (tg->rt_rq)
 146                        kfree(tg->rt_rq[i]);
 147                if (tg->rt_se)
 148                        kfree(tg->rt_se[i]);
 149        }
 150
 151        kfree(tg->rt_rq);
 152        kfree(tg->rt_se);
 153}
 154
 155void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 156                struct sched_rt_entity *rt_se, int cpu,
 157                struct sched_rt_entity *parent)
 158{
 159        struct rq *rq = cpu_rq(cpu);
 160
 161        rt_rq->highest_prio.curr = MAX_RT_PRIO;
 162        rt_rq->rt_nr_boosted = 0;
 163        rt_rq->rq = rq;
 164        rt_rq->tg = tg;
 165
 166        tg->rt_rq[cpu] = rt_rq;
 167        tg->rt_se[cpu] = rt_se;
 168
 169        if (!rt_se)
 170                return;
 171
 172        if (!parent)
 173                rt_se->rt_rq = &rq->rt;
 174        else
 175                rt_se->rt_rq = parent->my_q;
 176
 177        rt_se->my_q = rt_rq;
 178        rt_se->parent = parent;
 179        INIT_LIST_HEAD(&rt_se->run_list);
 180}
 181
 182int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 183{
 184        struct rt_rq *rt_rq;
 185        struct sched_rt_entity *rt_se;
 186        int i;
 187
 188        tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
 189        if (!tg->rt_rq)
 190                goto err;
 191        tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
 192        if (!tg->rt_se)
 193                goto err;
 194
 195        init_rt_bandwidth(&tg->rt_bandwidth,
 196                        ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 197
 198        for_each_possible_cpu(i) {
 199                rt_rq = kzalloc_node(sizeof(struct rt_rq),
 200                                     GFP_KERNEL, cpu_to_node(i));
 201                if (!rt_rq)
 202                        goto err;
 203
 204                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 205                                     GFP_KERNEL, cpu_to_node(i));
 206                if (!rt_se)
 207                        goto err_free_rq;
 208
 209                init_rt_rq(rt_rq);
 210                rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 211                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 212        }
 213
 214        return 1;
 215
 216err_free_rq:
 217        kfree(rt_rq);
 218err:
 219        return 0;
 220}
 221
 222#else /* CONFIG_RT_GROUP_SCHED */
 223
 224#define rt_entity_is_task(rt_se) (1)
 225
 226static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 227{
 228        return container_of(rt_se, struct task_struct, rt);
 229}
 230
 231static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 232{
 233        return container_of(rt_rq, struct rq, rt);
 234}
 235
 236static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 237{
 238        struct task_struct *p = rt_task_of(rt_se);
 239
 240        return task_rq(p);
 241}
 242
 243static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 244{
 245        struct rq *rq = rq_of_rt_se(rt_se);
 246
 247        return &rq->rt;
 248}
 249
 250void free_rt_sched_group(struct task_group *tg) { }
 251
 252int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 253{
 254        return 1;
 255}
 256#endif /* CONFIG_RT_GROUP_SCHED */
 257
 258#ifdef CONFIG_SMP
 259
 260static void pull_rt_task(struct rq *this_rq);
 261
 262static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 263{
 264        /* Try to pull RT tasks here if we lower this rq's prio */
 265        return rq->rt.highest_prio.curr > prev->prio;
 266}
 267
 268static inline int rt_overloaded(struct rq *rq)
 269{
 270        return atomic_read(&rq->rd->rto_count);
 271}
 272
 273static inline void rt_set_overload(struct rq *rq)
 274{
 275        if (!rq->online)
 276                return;
 277
 278        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 279        /*
 280         * Make sure the mask is visible before we set
 281         * the overload count. That is checked to determine
 282         * if we should look at the mask. It would be a shame
 283         * if we looked at the mask, but the mask was not
 284         * updated yet.
 285         *
 286         * Matched by the barrier in pull_rt_task().
 287         */
 288        smp_wmb();
 289        atomic_inc(&rq->rd->rto_count);
 290}
 291
 292static inline void rt_clear_overload(struct rq *rq)
 293{
 294        if (!rq->online)
 295                return;
 296
 297        /* the order here really doesn't matter */
 298        atomic_dec(&rq->rd->rto_count);
 299        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 300}
 301
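/*
 * An rq counts as "RT overloaded" when it has more than one runnable RT
 * task and at least one of them can migrate; rto_mask/rto_count in the
 * root domain advertise such rqs to pull_rt_task() on other CPUs.
 */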
 302static void update_rt_migration(struct rt_rq *rt_rq)
 303{
 304        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
 305                if (!rt_rq->overloaded) {
 306                        rt_set_overload(rq_of_rt_rq(rt_rq));
 307                        rt_rq->overloaded = 1;
 308                }
 309        } else if (rt_rq->overloaded) {
 310                rt_clear_overload(rq_of_rt_rq(rt_rq));
 311                rt_rq->overloaded = 0;
 312        }
 313}
 314
 315static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 316{
 317        struct task_struct *p;
 318
 319        if (!rt_entity_is_task(rt_se))
 320                return;
 321
 322        p = rt_task_of(rt_se);
 323        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 324
 325        rt_rq->rt_nr_total++;
 326        if (p->nr_cpus_allowed > 1)
 327                rt_rq->rt_nr_migratory++;
 328
 329        update_rt_migration(rt_rq);
 330}
 331
 332static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 333{
 334        struct task_struct *p;
 335
 336        if (!rt_entity_is_task(rt_se))
 337                return;
 338
 339        p = rt_task_of(rt_se);
 340        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 341
 342        rt_rq->rt_nr_total--;
 343        if (p->nr_cpus_allowed > 1)
 344                rt_rq->rt_nr_migratory--;
 345
 346        update_rt_migration(rt_rq);
 347}
 348
 349static inline int has_pushable_tasks(struct rq *rq)
 350{
 351        return !plist_head_empty(&rq->rt.pushable_tasks);
 352}
 353
 354static DEFINE_PER_CPU(struct callback_head, rt_push_head);
 355static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 356
 357static void push_rt_tasks(struct rq *);
 358static void pull_rt_task(struct rq *);
 359
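/*
 * Push/pull balancing is not done inline; it is queued as a per-CPU
 * balance callback, which the core scheduler invokes once it has finished
 * the current scheduling operation on this rq.
 */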
 360static inline void rt_queue_push_tasks(struct rq *rq)
 361{
 362        if (!has_pushable_tasks(rq))
 363                return;
 364
 365        queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 366}
 367
 368static inline void rt_queue_pull_task(struct rq *rq)
 369{
 370        queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 371}
 372
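/*
 * pushable_tasks is a plist ordered by priority, so its first entry is
 * always the highest-priority task that could be pushed elsewhere;
 * highest_prio.next caches that priority.
 */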
 373static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 374{
 375        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 376        plist_node_init(&p->pushable_tasks, p->prio);
 377        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 378
 379        /* Update the highest prio pushable task */
 380        if (p->prio < rq->rt.highest_prio.next)
 381                rq->rt.highest_prio.next = p->prio;
 382}
 383
 384static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 385{
 386        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 387
 388        /* Update the new highest prio pushable task */
 389        if (has_pushable_tasks(rq)) {
 390                p = plist_first_entry(&rq->rt.pushable_tasks,
 391                                      struct task_struct, pushable_tasks);
 392                rq->rt.highest_prio.next = p->prio;
 393        } else
 394                rq->rt.highest_prio.next = MAX_RT_PRIO;
 395}
 396
 397#else
 398
 399static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 400{
 401}
 402
 403static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 404{
 405}
 406
 407static inline
 408void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 409{
 410}
 411
 412static inline
 413void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 414{
 415}
 416
 417static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 418{
 419        return false;
 420}
 421
 422static inline void pull_rt_task(struct rq *this_rq)
 423{
 424}
 425
 426static inline void rt_queue_push_tasks(struct rq *rq)
 427{
 428}
 429#endif /* CONFIG_SMP */
 430
 431static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
 432static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 433
 434static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 435{
 436        return rt_se->on_rq;
 437}
 438
 439#ifdef CONFIG_RT_GROUP_SCHED
 440
 441static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 442{
 443        if (!rt_rq->tg)
 444                return RUNTIME_INF;
 445
 446        return rt_rq->rt_runtime;
 447}
 448
 449static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 450{
 451        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 452}
 453
 454typedef struct task_group *rt_rq_iter_t;
 455
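/*
 * Advance to the next task_group on the global task_groups list, skipping
 * autogroup entries; returns NULL once the end of the list is reached.
 */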
 456static inline struct task_group *next_task_group(struct task_group *tg)
 457{
 458        do {
 459                tg = list_entry_rcu(tg->list.next,
 460                        typeof(struct task_group), list);
 461        } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 462
 463        if (&tg->list == &task_groups)
 464                tg = NULL;
 465
 466        return tg;
 467}
 468
 469#define for_each_rt_rq(rt_rq, iter, rq)                                 \
 470        for (iter = container_of(&task_groups, typeof(*iter), list);    \
 471                (iter = next_task_group(iter)) &&                       \
 472                (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 473
 474#define for_each_sched_rt_entity(rt_se) \
 475        for (; rt_se; rt_se = rt_se->parent)
 476
 477static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 478{
 479        return rt_se->my_q;
 480}
 481
 482static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 483static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 484
 485static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 486{
 487        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 488        struct rq *rq = rq_of_rt_rq(rt_rq);
 489        struct sched_rt_entity *rt_se;
 490
 491        int cpu = cpu_of(rq);
 492
 493        rt_se = rt_rq->tg->rt_se[cpu];
 494
 495        if (rt_rq->rt_nr_running) {
 496                if (!rt_se)
 497                        enqueue_top_rt_rq(rt_rq);
 498                else if (!on_rt_rq(rt_se))
 499                        enqueue_rt_entity(rt_se, 0);
 500
 501                if (rt_rq->highest_prio.curr < curr->prio)
 502                        resched_curr(rq);
 503        }
 504}
 505
 506static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 507{
 508        struct sched_rt_entity *rt_se;
 509        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 510
 511        rt_se = rt_rq->tg->rt_se[cpu];
 512
 513        if (!rt_se) {
 514                dequeue_top_rt_rq(rt_rq);
 515                /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 516                cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
 517        }
 518        else if (on_rt_rq(rt_se))
 519                dequeue_rt_entity(rt_se, 0);
 520}
 521
 522static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 523{
 524        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 525}
 526
 527static int rt_se_boosted(struct sched_rt_entity *rt_se)
 528{
 529        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 530        struct task_struct *p;
 531
 532        if (rt_rq)
 533                return !!rt_rq->rt_nr_boosted;
 534
 535        p = rt_task_of(rt_se);
 536        return p->prio != p->normal_prio;
 537}
 538
 539#ifdef CONFIG_SMP
 540static inline const struct cpumask *sched_rt_period_mask(void)
 541{
 542        return this_rq()->rd->span;
 543}
 544#else
 545static inline const struct cpumask *sched_rt_period_mask(void)
 546{
 547        return cpu_online_mask;
 548}
 549#endif
 550
 551static inline
 552struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 553{
 554        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 555}
 556
 557static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 558{
 559        return &rt_rq->tg->rt_bandwidth;
 560}
 561
 562#else /* !CONFIG_RT_GROUP_SCHED */
 563
 564static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 565{
 566        return rt_rq->rt_runtime;
 567}
 568
 569static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 570{
 571        return ktime_to_ns(def_rt_bandwidth.rt_period);
 572}
 573
 574typedef struct rt_rq *rt_rq_iter_t;
 575
 576#define for_each_rt_rq(rt_rq, iter, rq) \
 577        for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 578
 579#define for_each_sched_rt_entity(rt_se) \
 580        for (; rt_se; rt_se = NULL)
 581
 582static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 583{
 584        return NULL;
 585}
 586
 587static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 588{
 589        struct rq *rq = rq_of_rt_rq(rt_rq);
 590
 591        if (!rt_rq->rt_nr_running)
 592                return;
 593
 594        enqueue_top_rt_rq(rt_rq);
 595        resched_curr(rq);
 596}
 597
 598static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 599{
 600        dequeue_top_rt_rq(rt_rq);
 601}
 602
 603static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 604{
 605        return rt_rq->rt_throttled;
 606}
 607
 608static inline const struct cpumask *sched_rt_period_mask(void)
 609{
 610        return cpu_online_mask;
 611}
 612
 613static inline
 614struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 615{
 616        return &cpu_rq(cpu)->rt;
 617}
 618
 619static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 620{
 621        return &def_rt_bandwidth;
 622}
 623
 624#endif /* CONFIG_RT_GROUP_SCHED */
 625
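/*
 * Whether runtime should be charged against this rt_rq's bandwidth:
 * true while the replenishment timer is active or while rt_time is
 * still below the configured runtime.
 */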
 626bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 627{
 628        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 629
 630        return (hrtimer_active(&rt_b->rt_period_timer) ||
 631                rt_rq->rt_time < rt_b->rt_runtime);
 632}
 633
 634#ifdef CONFIG_SMP
 635/*
 636 * We ran out of runtime, see if we can borrow some from our neighbours.
 637 */
 638static void do_balance_runtime(struct rt_rq *rt_rq)
 639{
 640        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 641        struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 642        int i, weight;
 643        u64 rt_period;
 644
 645        weight = cpumask_weight(rd->span);
 646
 647        raw_spin_lock(&rt_b->rt_runtime_lock);
 648        rt_period = ktime_to_ns(rt_b->rt_period);
 649        for_each_cpu(i, rd->span) {
 650                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 651                s64 diff;
 652
 653                if (iter == rt_rq)
 654                        continue;
 655
 656                raw_spin_lock(&iter->rt_runtime_lock);
 657                /*
 658                 * Either all rqs have inf runtime and there's nothing to steal
 659                 * or __disable_runtime() below sets a specific rq to inf to
  660                 * indicate it's been disabled and disallow stealing.
 661                 */
 662                if (iter->rt_runtime == RUNTIME_INF)
 663                        goto next;
 664
 665                /*
 666                 * From runqueues with spare time, take 1/n part of their
 667                 * spare time, but no more than our period.
 668                 */
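                /*
                 * Illustrative numbers (not from the source): with a 4-CPU
                 * root domain and a neighbour holding 40ms of unused
                 * runtime, we would try to move 40ms/4 = 10ms, clamped so
                 * that our total never exceeds rt_period.
                 */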
 669                diff = iter->rt_runtime - iter->rt_time;
 670                if (diff > 0) {
 671                        diff = div_u64((u64)diff, weight);
 672                        if (rt_rq->rt_runtime + diff > rt_period)
 673                                diff = rt_period - rt_rq->rt_runtime;
 674                        iter->rt_runtime -= diff;
 675                        rt_rq->rt_runtime += diff;
 676                        if (rt_rq->rt_runtime == rt_period) {
 677                                raw_spin_unlock(&iter->rt_runtime_lock);
 678                                break;
 679                        }
 680                }
 681next:
 682                raw_spin_unlock(&iter->rt_runtime_lock);
 683        }
 684        raw_spin_unlock(&rt_b->rt_runtime_lock);
 685}
 686
 687/*
  688 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 689 */
 690static void __disable_runtime(struct rq *rq)
 691{
 692        struct root_domain *rd = rq->rd;
 693        rt_rq_iter_t iter;
 694        struct rt_rq *rt_rq;
 695
 696        if (unlikely(!scheduler_running))
 697                return;
 698
 699        for_each_rt_rq(rt_rq, iter, rq) {
 700                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 701                s64 want;
 702                int i;
 703
 704                raw_spin_lock(&rt_b->rt_runtime_lock);
 705                raw_spin_lock(&rt_rq->rt_runtime_lock);
 706                /*
 707                 * Either we're all inf and nobody needs to borrow, or we're
 708                 * already disabled and thus have nothing to do, or we have
 709                 * exactly the right amount of runtime to take out.
 710                 */
 711                if (rt_rq->rt_runtime == RUNTIME_INF ||
 712                                rt_rq->rt_runtime == rt_b->rt_runtime)
 713                        goto balanced;
 714                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 715
 716                /*
 717                 * Calculate the difference between what we started out with
  718                 * and what we currently have; that's the amount of runtime
  719                 * we lent out and now have to reclaim.
 720                 */
 721                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 722
 723                /*
 724                 * Greedy reclaim, take back as much as we can.
 725                 */
 726                for_each_cpu(i, rd->span) {
 727                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 728                        s64 diff;
 729
 730                        /*
 731                         * Can't reclaim from ourselves or disabled runqueues.
 732                         */
 733                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 734                                continue;
 735
 736                        raw_spin_lock(&iter->rt_runtime_lock);
 737                        if (want > 0) {
 738                                diff = min_t(s64, iter->rt_runtime, want);
 739                                iter->rt_runtime -= diff;
 740                                want -= diff;
 741                        } else {
 742                                iter->rt_runtime -= want;
 743                                want -= want;
 744                        }
 745                        raw_spin_unlock(&iter->rt_runtime_lock);
 746
 747                        if (!want)
 748                                break;
 749                }
 750
 751                raw_spin_lock(&rt_rq->rt_runtime_lock);
 752                /*
 753                 * We cannot be left wanting - that would mean some runtime
 754                 * leaked out of the system.
 755                 */
 756                BUG_ON(want);
 757balanced:
 758                /*
 759                 * Disable all the borrow logic by pretending we have inf
 760                 * runtime - in which case borrowing doesn't make sense.
 761                 */
 762                rt_rq->rt_runtime = RUNTIME_INF;
 763                rt_rq->rt_throttled = 0;
 764                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 765                raw_spin_unlock(&rt_b->rt_runtime_lock);
 766
 767                /* Make rt_rq available for pick_next_task() */
 768                sched_rt_rq_enqueue(rt_rq);
 769        }
 770}
 771
 772static void __enable_runtime(struct rq *rq)
 773{
 774        rt_rq_iter_t iter;
 775        struct rt_rq *rt_rq;
 776
 777        if (unlikely(!scheduler_running))
 778                return;
 779
 780        /*
 781         * Reset each runqueue's bandwidth settings
 782         */
 783        for_each_rt_rq(rt_rq, iter, rq) {
 784                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 785
 786                raw_spin_lock(&rt_b->rt_runtime_lock);
 787                raw_spin_lock(&rt_rq->rt_runtime_lock);
 788                rt_rq->rt_runtime = rt_b->rt_runtime;
 789                rt_rq->rt_time = 0;
 790                rt_rq->rt_throttled = 0;
 791                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 792                raw_spin_unlock(&rt_b->rt_runtime_lock);
 793        }
 794}
 795
 796static void balance_runtime(struct rt_rq *rt_rq)
 797{
 798        if (!sched_feat(RT_RUNTIME_SHARE))
 799                return;
 800
 801        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 802                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 803                do_balance_runtime(rt_rq);
 804                raw_spin_lock(&rt_rq->rt_runtime_lock);
 805        }
 806}
 807#else /* !CONFIG_SMP */
 808static inline void balance_runtime(struct rt_rq *rt_rq) {}
 809#endif /* CONFIG_SMP */
 810
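/*
 * Periodic replenishment: for every CPU in the bandwidth's span, credit
 * back up to 'overrun' periods' worth of runtime, unthrottle rt_rqs that
 * are below their budget again, and report whether everything has gone
 * idle so the hrtimer can stop.
 */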
 811static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 812{
 813        int i, idle = 1, throttled = 0;
 814        const struct cpumask *span;
 815
 816        span = sched_rt_period_mask();
 817#ifdef CONFIG_RT_GROUP_SCHED
 818        /*
 819         * FIXME: isolated CPUs should really leave the root task group,
 820         * whether they are isolcpus or were isolated via cpusets, lest
 821         * the timer run on a CPU which does not service all runqueues,
 822         * potentially leaving other CPUs indefinitely throttled.  If
 823         * isolation is really required, the user will turn the throttle
 824         * off to kill the perturbations it causes anyway.  Meanwhile,
 825         * this maintains functionality for boot and/or troubleshooting.
 826         */
 827        if (rt_b == &root_task_group.rt_bandwidth)
 828                span = cpu_online_mask;
 829#endif
 830        for_each_cpu(i, span) {
 831                int enqueue = 0;
 832                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 833                struct rq *rq = rq_of_rt_rq(rt_rq);
 834                int skip;
 835
 836                /*
 837                 * When span == cpu_online_mask, taking each rq->lock
 838                 * can be time-consuming. Try to avoid it when possible.
 839                 */
 840                raw_spin_lock(&rt_rq->rt_runtime_lock);
 841                if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
 842                        rt_rq->rt_runtime = rt_b->rt_runtime;
 843                skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
 844                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 845                if (skip)
 846                        continue;
 847
 848                raw_spin_lock(&rq->lock);
 849                update_rq_clock(rq);
 850
 851                if (rt_rq->rt_time) {
 852                        u64 runtime;
 853
 854                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 855                        if (rt_rq->rt_throttled)
 856                                balance_runtime(rt_rq);
 857                        runtime = rt_rq->rt_runtime;
 858                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 859                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 860                                rt_rq->rt_throttled = 0;
 861                                enqueue = 1;
 862
 863                                /*
 864                                 * When we're idle and a woken (rt) task is
  865                                 * throttled, check_preempt_curr() will set
 866                                 * skip_update and the time between the wakeup
 867                                 * and this unthrottle will get accounted as
 868                                 * 'runtime'.
 869                                 */
 870                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 871                                        rq_clock_cancel_skipupdate(rq);
 872                        }
 873                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 874                                idle = 0;
 875                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 876                } else if (rt_rq->rt_nr_running) {
 877                        idle = 0;
 878                        if (!rt_rq_throttled(rt_rq))
 879                                enqueue = 1;
 880                }
 881                if (rt_rq->rt_throttled)
 882                        throttled = 1;
 883
 884                if (enqueue)
 885                        sched_rt_rq_enqueue(rt_rq);
 886                raw_spin_unlock(&rq->lock);
 887        }
 888
 889        if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 890                return 1;
 891
 892        return idle;
 893}
 894
 895static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 896{
 897#ifdef CONFIG_RT_GROUP_SCHED
 898        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 899
 900        if (rt_rq)
 901                return rt_rq->highest_prio.curr;
 902#endif
 903
 904        return rt_task_of(rt_se)->prio;
 905}
 906
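/*
 * Called after rt_time has been charged: decide whether this rt_rq has
 * exceeded its budget and must be throttled (dequeued) until the period
 * timer replenishes it. Returns 1 if the caller should reschedule.
 */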
 907static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 908{
 909        u64 runtime = sched_rt_runtime(rt_rq);
 910
 911        if (rt_rq->rt_throttled)
 912                return rt_rq_throttled(rt_rq);
 913
 914        if (runtime >= sched_rt_period(rt_rq))
 915                return 0;
 916
 917        balance_runtime(rt_rq);
 918        runtime = sched_rt_runtime(rt_rq);
 919        if (runtime == RUNTIME_INF)
 920                return 0;
 921
 922        if (rt_rq->rt_time > runtime) {
 923                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 924
 925                /*
 926                 * Don't actually throttle groups that have no runtime assigned
 927                 * but accrue some time due to boosting.
 928                 */
 929                if (likely(rt_b->rt_runtime)) {
 930                        rt_rq->rt_throttled = 1;
 931                        printk_deferred_once("sched: RT throttling activated\n");
 932                } else {
 933                        /*
  934                         * In case we did anyway, make it go away;
  935                         * replenishment is pointless, since it would
  936                         * replenish us with exactly 0 ns.
 937                         */
 938                        rt_rq->rt_time = 0;
 939                }
 940
 941                if (rt_rq_throttled(rt_rq)) {
 942                        sched_rt_rq_dequeue(rt_rq);
 943                        return 1;
 944                }
 945        }
 946
 947        return 0;
 948}
 949
 950/*
 951 * Update the current task's runtime statistics. Skip current tasks that
 952 * are not in our scheduling class.
 953 */
 954static void update_curr_rt(struct rq *rq)
 955{
 956        struct task_struct *curr = rq->curr;
 957        struct sched_rt_entity *rt_se = &curr->rt;
 958        u64 delta_exec;
 959        u64 now;
 960
 961        if (curr->sched_class != &rt_sched_class)
 962                return;
 963
 964        now = rq_clock_task(rq);
 965        delta_exec = now - curr->se.exec_start;
 966        if (unlikely((s64)delta_exec <= 0))
 967                return;
 968
 969        schedstat_set(curr->se.statistics.exec_max,
 970                      max(curr->se.statistics.exec_max, delta_exec));
 971
 972        curr->se.sum_exec_runtime += delta_exec;
 973        account_group_exec_runtime(curr, delta_exec);
 974
 975        curr->se.exec_start = now;
 976        cgroup_account_cputime(curr, delta_exec);
 977
 978        if (!rt_bandwidth_enabled())
 979                return;
 980
 981        for_each_sched_rt_entity(rt_se) {
 982                struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 983
 984                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 985                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 986                        rt_rq->rt_time += delta_exec;
 987                        if (sched_rt_runtime_exceeded(rt_rq))
 988                                resched_curr(rq);
 989                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 990                }
 991        }
 992}
 993
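/*
 * {en,de}queue_top_rt_rq() propagate the root rt_rq's task count into
 * rq->nr_running; only the top-level rt_rq is made visible to the core
 * scheduler this way (rt_queued tracks whether it currently is).
 */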
 994static void
 995dequeue_top_rt_rq(struct rt_rq *rt_rq)
 996{
 997        struct rq *rq = rq_of_rt_rq(rt_rq);
 998
 999        BUG_ON(&rq->rt != rt_rq);
1000
1001        if (!rt_rq->rt_queued)
1002                return;
1003
1004        BUG_ON(!rq->nr_running);
1005
1006        sub_nr_running(rq, rt_rq->rt_nr_running);
1007        rt_rq->rt_queued = 0;
1008
1009}
1010
1011static void
1012enqueue_top_rt_rq(struct rt_rq *rt_rq)
1013{
1014        struct rq *rq = rq_of_rt_rq(rt_rq);
1015
1016        BUG_ON(&rq->rt != rt_rq);
1017
1018        if (rt_rq->rt_queued)
1019                return;
1020
1021        if (rt_rq_throttled(rt_rq))
1022                return;
1023
1024        if (rt_rq->rt_nr_running) {
1025                add_nr_running(rq, rt_rq->rt_nr_running);
1026                rt_rq->rt_queued = 1;
1027        }
1028
1029        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1030        cpufreq_update_util(rq, 0);
1031}
1032
1033#if defined CONFIG_SMP
1034
1035static void
1036inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1037{
1038        struct rq *rq = rq_of_rt_rq(rt_rq);
1039
1040#ifdef CONFIG_RT_GROUP_SCHED
1041        /*
1042         * Change rq's cpupri only if rt_rq is the top queue.
1043         */
1044        if (&rq->rt != rt_rq)
1045                return;
1046#endif
1047        if (rq->online && prio < prev_prio)
1048                cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1049}
1050
1051static void
1052dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1053{
1054        struct rq *rq = rq_of_rt_rq(rt_rq);
1055
1056#ifdef CONFIG_RT_GROUP_SCHED
1057        /*
1058         * Change rq's cpupri only if rt_rq is the top queue.
1059         */
1060        if (&rq->rt != rt_rq)
1061                return;
1062#endif
1063        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1064                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1065}
1066
1067#else /* CONFIG_SMP */
1068
1069static inline
1070void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1071static inline
1072void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1073
1074#endif /* CONFIG_SMP */
1075
1076#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1077static void
1078inc_rt_prio(struct rt_rq *rt_rq, int prio)
1079{
1080        int prev_prio = rt_rq->highest_prio.curr;
1081
1082        if (prio < prev_prio)
1083                rt_rq->highest_prio.curr = prio;
1084
1085        inc_rt_prio_smp(rt_rq, prio, prev_prio);
1086}
1087
1088static void
1089dec_rt_prio(struct rt_rq *rt_rq, int prio)
1090{
1091        int prev_prio = rt_rq->highest_prio.curr;
1092
1093        if (rt_rq->rt_nr_running) {
1094
1095                WARN_ON(prio < prev_prio);
1096
1097                /*
1098                 * This may have been our highest task, and therefore
1099                 * we may have some recomputation to do
1100                 */
1101                if (prio == prev_prio) {
1102                        struct rt_prio_array *array = &rt_rq->active;
1103
1104                        rt_rq->highest_prio.curr =
1105                                sched_find_first_bit(array->bitmap);
1106                }
1107
1108        } else
1109                rt_rq->highest_prio.curr = MAX_RT_PRIO;
1110
1111        dec_rt_prio_smp(rt_rq, prio, prev_prio);
1112}
1113
1114#else
1115
1116static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1117static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1118
1119#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1120
1121#ifdef CONFIG_RT_GROUP_SCHED
1122
1123static void
1124inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1125{
1126        if (rt_se_boosted(rt_se))
1127                rt_rq->rt_nr_boosted++;
1128
1129        if (rt_rq->tg)
1130                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1131}
1132
1133static void
1134dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1135{
1136        if (rt_se_boosted(rt_se))
1137                rt_rq->rt_nr_boosted--;
1138
1139        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1140}
1141
1142#else /* CONFIG_RT_GROUP_SCHED */
1143
1144static void
1145inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1146{
1147        start_rt_bandwidth(&def_rt_bandwidth);
1148}
1149
1150static inline
1151void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1152
1153#endif /* CONFIG_RT_GROUP_SCHED */
1154
1155static inline
1156unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1157{
1158        struct rt_rq *group_rq = group_rt_rq(rt_se);
1159
1160        if (group_rq)
1161                return group_rq->rt_nr_running;
1162        else
1163                return 1;
1164}
1165
1166static inline
1167unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1168{
1169        struct rt_rq *group_rq = group_rt_rq(rt_se);
1170        struct task_struct *tsk;
1171
1172        if (group_rq)
1173                return group_rq->rr_nr_running;
1174
1175        tsk = rt_task_of(rt_se);
1176
1177        return (tsk->policy == SCHED_RR) ? 1 : 0;
1178}
1179
1180static inline
1181void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1182{
1183        int prio = rt_se_prio(rt_se);
1184
1185        WARN_ON(!rt_prio(prio));
1186        rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1187        rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1188
1189        inc_rt_prio(rt_rq, prio);
1190        inc_rt_migration(rt_se, rt_rq);
1191        inc_rt_group(rt_se, rt_rq);
1192}
1193
1194static inline
1195void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1196{
1197        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1198        WARN_ON(!rt_rq->rt_nr_running);
1199        rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1200        rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1201
1202        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1203        dec_rt_migration(rt_se, rt_rq);
1204        dec_rt_group(rt_se, rt_rq);
1205}
1206
1207/*
1208 * Change rt_se->run_list location unless SAVE && !MOVE
1209 *
1210 * assumes ENQUEUE/DEQUEUE flags match
1211 */
1212static inline bool move_entity(unsigned int flags)
1213{
1214        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1215                return false;
1216
1217        return true;
1218}
1219
1220static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1221{
1222        list_del_init(&rt_se->run_list);
1223
1224        if (list_empty(array->queue + rt_se_prio(rt_se)))
1225                __clear_bit(rt_se_prio(rt_se), array->bitmap);
1226
1227        rt_se->on_list = 0;
1228}
1229
1230static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1231{
1232        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1233        struct rt_prio_array *array = &rt_rq->active;
1234        struct rt_rq *group_rq = group_rt_rq(rt_se);
1235        struct list_head *queue = array->queue + rt_se_prio(rt_se);
1236
1237        /*
 1238         * Don't enqueue the group if it's throttled, or when empty.
 1239         * The latter is a consequence of the former when a child group
 1240         * gets throttled and the current group doesn't have any other
1241         * active members.
1242         */
1243        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1244                if (rt_se->on_list)
1245                        __delist_rt_entity(rt_se, array);
1246                return;
1247        }
1248
1249        if (move_entity(flags)) {
1250                WARN_ON_ONCE(rt_se->on_list);
1251                if (flags & ENQUEUE_HEAD)
1252                        list_add(&rt_se->run_list, queue);
1253                else
1254                        list_add_tail(&rt_se->run_list, queue);
1255
1256                __set_bit(rt_se_prio(rt_se), array->bitmap);
1257                rt_se->on_list = 1;
1258        }
1259        rt_se->on_rq = 1;
1260
1261        inc_rt_tasks(rt_se, rt_rq);
1262}
1263
1264static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1265{
1266        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1267        struct rt_prio_array *array = &rt_rq->active;
1268
1269        if (move_entity(flags)) {
1270                WARN_ON_ONCE(!rt_se->on_list);
1271                __delist_rt_entity(rt_se, array);
1272        }
1273        rt_se->on_rq = 0;
1274
1275        dec_rt_tasks(rt_se, rt_rq);
1276}
1277
1278/*
1279 * Because the prio of an upper entry depends on the lower
 1280 * entries, we must remove entries top-down.
1281 */
1282static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1283{
1284        struct sched_rt_entity *back = NULL;
1285
1286        for_each_sched_rt_entity(rt_se) {
1287                rt_se->back = back;
1288                back = rt_se;
1289        }
1290
1291        dequeue_top_rt_rq(rt_rq_of_se(back));
1292
1293        for (rt_se = back; rt_se; rt_se = rt_se->back) {
1294                if (on_rt_rq(rt_se))
1295                        __dequeue_rt_entity(rt_se, flags);
1296        }
1297}
1298
1299static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1300{
1301        struct rq *rq = rq_of_rt_se(rt_se);
1302
1303        dequeue_rt_stack(rt_se, flags);
1304        for_each_sched_rt_entity(rt_se)
1305                __enqueue_rt_entity(rt_se, flags);
1306        enqueue_top_rt_rq(&rq->rt);
1307}
1308
1309static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1310{
1311        struct rq *rq = rq_of_rt_se(rt_se);
1312
1313        dequeue_rt_stack(rt_se, flags);
1314
1315        for_each_sched_rt_entity(rt_se) {
1316                struct rt_rq *rt_rq = group_rt_rq(rt_se);
1317
1318                if (rt_rq && rt_rq->rt_nr_running)
1319                        __enqueue_rt_entity(rt_se, flags);
1320        }
1321        enqueue_top_rt_rq(&rq->rt);
1322}
1323
1324/*
1325 * Adding/removing a task to/from a priority array:
1326 */
1327static void
1328enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1329{
1330        struct sched_rt_entity *rt_se = &p->rt;
1331
1332        if (flags & ENQUEUE_WAKEUP)
1333                rt_se->timeout = 0;
1334
1335        enqueue_rt_entity(rt_se, flags);
1336
1337        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1338                enqueue_pushable_task(rq, p);
1339}
1340
1341static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1342{
1343        struct sched_rt_entity *rt_se = &p->rt;
1344
1345        update_curr_rt(rq);
1346        dequeue_rt_entity(rt_se, flags);
1347
1348        dequeue_pushable_task(rq, p);
1349}
1350
1351/*
 1352 * Put the task at the head or the end of the run list without the overhead of
1353 * dequeue followed by enqueue.
1354 */
1355static void
1356requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1357{
1358        if (on_rt_rq(rt_se)) {
1359                struct rt_prio_array *array = &rt_rq->active;
1360                struct list_head *queue = array->queue + rt_se_prio(rt_se);
1361
1362                if (head)
1363                        list_move(&rt_se->run_list, queue);
1364                else
1365                        list_move_tail(&rt_se->run_list, queue);
1366        }
1367}
1368
1369static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1370{
1371        struct sched_rt_entity *rt_se = &p->rt;
1372        struct rt_rq *rt_rq;
1373
1374        for_each_sched_rt_entity(rt_se) {
1375                rt_rq = rt_rq_of_se(rt_se);
1376                requeue_rt_entity(rt_rq, rt_se, head);
1377        }
1378}
1379
1380static void yield_task_rt(struct rq *rq)
1381{
1382        requeue_task_rt(rq, rq->curr, 0);
1383}
1384
1385#ifdef CONFIG_SMP
1386static int find_lowest_rq(struct task_struct *task);
1387
1388static int
1389select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1390{
1391        struct task_struct *curr;
1392        struct rq *rq;
1393
1394        /* For anything but wake ups, just return the task_cpu */
1395        if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1396                goto out;
1397
1398        rq = cpu_rq(cpu);
1399
1400        rcu_read_lock();
1401        curr = READ_ONCE(rq->curr); /* unlocked access */
1402
1403        /*
1404         * If the current task on @p's runqueue is an RT task, then
1405         * try to see if we can wake this RT task up on another
1406         * runqueue. Otherwise simply start this RT task
1407         * on its current runqueue.
1408         *
1409         * We want to avoid overloading runqueues. If the woken
 1410         * task is of higher priority, then it will stay on this CPU
 1411         * and the lower prio task should be moved to another CPU.
 1412         * Even though this will probably make the lower prio task
 1413         * lose its cache, we do not want to bounce a higher-priority task
1414         * around just because it gave up its CPU, perhaps for a
1415         * lock?
1416         *
1417         * For equal prio tasks, we just let the scheduler sort it out.
1418         *
1419         * Otherwise, just let it ride on the affined RQ and the
1420         * post-schedule router will push the preempted task away
1421         *
1422         * This test is optimistic, if we get it wrong the load-balancer
1423         * will have to sort it out.
1424         */
1425        if (curr && unlikely(rt_task(curr)) &&
1426            (curr->nr_cpus_allowed < 2 ||
1427             curr->prio <= p->prio)) {
1428                int target = find_lowest_rq(p);
1429
1430                /*
1431                 * Don't bother moving it if the destination CPU is
1432                 * not running a lower priority task.
1433                 */
1434                if (target != -1 &&
1435                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
1436                        cpu = target;
1437        }
1438        rcu_read_unlock();
1439
1440out:
1441        return cpu;
1442}
1443
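/*
 * Equal-priority wakeup: if current could run on another CPU but the
 * newly woken task cannot, requeue the woken task at the head and
 * reschedule so the push logic gets a chance to move current away.
 */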
1444static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1445{
1446        /*
1447         * Current can't be migrated, useless to reschedule,
1448         * let's hope p can move out.
1449         */
1450        if (rq->curr->nr_cpus_allowed == 1 ||
1451            !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1452                return;
1453
1454        /*
1455         * p is migratable, so let's not schedule it and
1456         * see if it is pushed or pulled somewhere else.
1457         */
1458        if (p->nr_cpus_allowed != 1
1459            && cpupri_find(&rq->rd->cpupri, p, NULL))
1460                return;
1461
1462        /*
1463         * There appear to be other CPUs that can accept
 1464         * the current task but none can run 'p', so let's reschedule
1465         * to try and push the current task away:
1466         */
1467        requeue_task_rt(rq, p, 1);
1468        resched_curr(rq);
1469}
1470
1471#endif /* CONFIG_SMP */
1472
1473/*
1474 * Preempt the current task with a newly woken task if needed:
1475 */
1476static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1477{
1478        if (p->prio < rq->curr->prio) {
1479                resched_curr(rq);
1480                return;
1481        }
1482
1483#ifdef CONFIG_SMP
1484        /*
1485         * If:
1486         *
1487         * - the newly woken task is of equal priority to the current task
1488         * - the newly woken task is non-migratable while current is migratable
1489         * - current will be preempted on the next reschedule
1490         *
1491         * we should check to see if current can readily move to a different
1492         * cpu.  If so, we will reschedule to allow the push logic to try
1493         * to move current somewhere else, making room for our non-migratable
1494         * task.
1495         */
1496        if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1497                check_preempt_equal_prio(rq, p);
1498#endif
1499}
1500
1501static inline void set_next_task(struct rq *rq, struct task_struct *p)
1502{
1503        p->se.exec_start = rq_clock_task(rq);
1504
1505        /* The running task is never eligible for pushing */
1506        dequeue_pushable_task(rq, p);
1507}
1508
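/*
 * Pick the first entity queued at the highest set priority in rt_rq's
 * bitmap. With group scheduling this may be a group entity; the caller
 * (_pick_next_task_rt()) descends until it reaches a task.
 */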
1509static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1510                                                   struct rt_rq *rt_rq)
1511{
1512        struct rt_prio_array *array = &rt_rq->active;
1513        struct sched_rt_entity *next = NULL;
1514        struct list_head *queue;
1515        int idx;
1516
1517        idx = sched_find_first_bit(array->bitmap);
1518        BUG_ON(idx >= MAX_RT_PRIO);
1519
1520        queue = array->queue + idx;
1521        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1522
1523        return next;
1524}
1525
1526static struct task_struct *_pick_next_task_rt(struct rq *rq)
1527{
1528        struct sched_rt_entity *rt_se;
1529        struct rt_rq *rt_rq  = &rq->rt;
1530
1531        do {
1532                rt_se = pick_next_rt_entity(rq, rt_rq);
1533                BUG_ON(!rt_se);
1534                rt_rq = group_rt_rq(rt_se);
1535        } while (rt_rq);
1536
1537        return rt_task_of(rt_se);
1538}
1539
1540static struct task_struct *
1541pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1542{
1543        struct task_struct *p;
1544        struct rt_rq *rt_rq = &rq->rt;
1545
1546        if (need_pull_rt_task(rq, prev)) {
1547                /*
1548                 * This is OK, because current is on_cpu, which avoids it being
1549                 * picked for load-balance and preemption/IRQs are still
1550                 * disabled avoiding further scheduler activity on it and we're
1551                 * being very careful to re-start the picking loop.
1552                 */
1553                rq_unpin_lock(rq, rf);
1554                pull_rt_task(rq);
1555                rq_repin_lock(rq, rf);
1556                /*
1557                 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1558                 * means a dl or stop task can slip in, in which case we need
1559                 * to re-start task selection.
1560                 */
1561                if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1562                             rq->dl.dl_nr_running))
1563                        return RETRY_TASK;
1564        }
1565
1566        /*
1567         * We may dequeue prev's rt_rq in put_prev_task().
 1568         * So, we update the time before the rt_queued check.
1569         */
1570        if (prev->sched_class == &rt_sched_class)
1571                update_curr_rt(rq);
1572
1573        if (!rt_rq->rt_queued)
1574                return NULL;
1575
1576        put_prev_task(rq, prev);
1577
1578        p = _pick_next_task_rt(rq);
1579
1580        set_next_task(rq, p);
1581
1582        rt_queue_push_tasks(rq);
1583
1584        /*
1585         * If prev task was rt, put_prev_task() has already updated the
 1586         * utilization. We only care about the case where we start to schedule
 1587         * an RT task.
1588         */
1589        if (rq->curr->sched_class != &rt_sched_class)
1590                update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1591
1592        return p;
1593}
1594
1595static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1596{
1597        update_curr_rt(rq);
1598
1599        update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1600
1601        /*
1602         * The previous task needs to be made eligible for pushing
1603         * if it is still active
1604         */
1605        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1606                enqueue_pushable_task(rq, p);
1607}
1608
1609#ifdef CONFIG_SMP
1610
1611/* Only try algorithms three times */
1612#define RT_MAX_TRIES 3
1613
1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1615{
1616        if (!task_running(rq, p) &&
1617            cpumask_test_cpu(cpu, p->cpus_ptr))
1618                return 1;
1619
1620        return 0;
1621}
1622
1623/*
 1624 * Return the highest-priority pushable task on this rq that is suitable to
 1625 * run on the given CPU, or NULL if there is none.
1626 */
1627static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1628{
1629        struct plist_head *head = &rq->rt.pushable_tasks;
1630        struct task_struct *p;
1631
1632        if (!has_pushable_tasks(rq))
1633                return NULL;
1634
1635        plist_for_each_entry(p, head, pushable_tasks) {
1636                if (pick_rt_task(rq, p, cpu))
1637                        return p;
1638        }
1639
1640        return NULL;
1641}
1642
1643static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1644
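/*
 * Find a CPU whose runqueue runs at a lower priority than 'task', based on
 * the cpupri lowest_mask. Preference order: the task's current CPU, then
 * CPUs topologically close to it (walking its SD_WAKE_AFFINE domains and
 * preferring this_cpu when it falls in the same span), then any CPU in the
 * mask. Returns -1 if nothing suitable is found.
 */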
1645static int find_lowest_rq(struct task_struct *task)
1646{
1647        struct sched_domain *sd;
1648        struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1649        int this_cpu = smp_processor_id();
1650        int cpu      = task_cpu(task);
1651
1652        /* Make sure the mask is initialized first */
1653        if (unlikely(!lowest_mask))
1654                return -1;
1655
1656        if (task->nr_cpus_allowed == 1)
1657                return -1; /* No other targets possible */
1658
1659        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1660                return -1; /* No targets found */
1661
1662        /*
1663         * At this point we have built a mask of CPUs representing the
1664         * lowest priority tasks in the system.  Now we want to elect
1665         * the best one based on our affinity and topology.
1666         *
1667         * We prioritize the last CPU that the task executed on since
1668         * it is most likely cache-hot in that location.
1669         */
1670        if (cpumask_test_cpu(cpu, lowest_mask))
1671                return cpu;
1672
1673        /*
1674         * Otherwise, we consult the sched_domains span maps to figure
1675         * out which CPU is logically closest to our hot cache data.
1676         */
1677        if (!cpumask_test_cpu(this_cpu, lowest_mask))
1678                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1679
1680        rcu_read_lock();
1681        for_each_domain(cpu, sd) {
1682                if (sd->flags & SD_WAKE_AFFINE) {
1683                        int best_cpu;
1684
1685                        /*
1686                         * "this_cpu" is cheaper to preempt than a
1687                         * remote processor.
1688                         */
1689                        if (this_cpu != -1 &&
1690                            cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1691                                rcu_read_unlock();
1692                                return this_cpu;
1693                        }
1694
1695                        best_cpu = cpumask_first_and(lowest_mask,
1696                                                     sched_domain_span(sd));
1697                        if (best_cpu < nr_cpu_ids) {
1698                                rcu_read_unlock();
1699                                return best_cpu;
1700                        }
1701                }
1702        }
1703        rcu_read_unlock();
1704
1705        /*
1706         * And finally, if there were no matches within the domains
1707         * just give the caller *something* to work with from the compatible
1708         * locations.
1709         */
1710        if (this_cpu != -1)
1711                return this_cpu;
1712
1713        cpu = cpumask_any(lowest_mask);
1714        if (cpu < nr_cpu_ids)
1715                return cpu;
1716
1717        return -1;
1718}
1719
1720/* Will lock the rq it finds */
1721static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1722{
1723        struct rq *lowest_rq = NULL;
1724        int tries;
1725        int cpu;
1726
1727        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1728                cpu = find_lowest_rq(task);
1729
1730                if ((cpu == -1) || (cpu == rq->cpu))
1731                        break;
1732
1733                lowest_rq = cpu_rq(cpu);
1734
1735                if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1736                        /*
1737                         * Target rq has tasks of equal or higher priority,
1738                         * retrying does not release any lock and is unlikely
1739                         * to yield a different result.
1740                         */
1741                        lowest_rq = NULL;
1742                        break;
1743                }
1744
1745                /* if the prio of this runqueue changed, try again */
1746                if (double_lock_balance(rq, lowest_rq)) {
1747                        /*
1748                         * We had to unlock the run queue. In
1749                         * the meantime, the task could have
1750                         * migrated already or had its affinity changed.
1751                         * Also make sure that it wasn't scheduled on its rq.
1752                         */
1753                        if (unlikely(task_rq(task) != rq ||
1754                                     !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
1755                                     task_running(rq, task) ||
1756                                     !rt_task(task) ||
1757                                     !task_on_rq_queued(task))) {
1758
1759                                double_unlock_balance(rq, lowest_rq);
1760                                lowest_rq = NULL;
1761                                break;
1762                        }
1763                }
1764
1765                /* If this rq is still suitable, use it. */
1766                if (lowest_rq->rt.highest_prio.curr > task->prio)
1767                        break;
1768
1769                /* try again */
1770                double_unlock_balance(rq, lowest_rq);
1771                lowest_rq = NULL;
1772        }
1773
1774        return lowest_rq;
1775}
1776
1777static struct task_struct *pick_next_pushable_task(struct rq *rq)
1778{
1779        struct task_struct *p;
1780
1781        if (!has_pushable_tasks(rq))
1782                return NULL;
1783
1784        p = plist_first_entry(&rq->rt.pushable_tasks,
1785                              struct task_struct, pushable_tasks);
1786
1787        BUG_ON(rq->cpu != task_cpu(p));
1788        BUG_ON(task_current(rq, p));
1789        BUG_ON(p->nr_cpus_allowed <= 1);
1790
1791        BUG_ON(!task_on_rq_queued(p));
1792        BUG_ON(!rt_task(p));
1793
1794        return p;
1795}
1796
1797/*
1798 * If the current CPU has more than one RT task, see if the
1799 * non-running task can migrate over to a CPU that is running a task
1800 * of lesser priority.
1801 */
1802static int push_rt_task(struct rq *rq)
1803{
1804        struct task_struct *next_task;
1805        struct rq *lowest_rq;
1806        int ret = 0;
1807
1808        if (!rq->rt.overloaded)
1809                return 0;
1810
1811        next_task = pick_next_pushable_task(rq);
1812        if (!next_task)
1813                return 0;
1814
1815retry:
1816        if (WARN_ON(next_task == rq->curr))
1817                return 0;
1818
1819        /*
1820         * It's possible that the next_task slipped in with a
1821         * higher priority than current. If that's the case,
1822         * just reschedule current.
1823         */
1824        if (unlikely(next_task->prio < rq->curr->prio)) {
1825                resched_curr(rq);
1826                return 0;
1827        }
1828
1829        /* We might release rq lock */
1830        get_task_struct(next_task);
1831
1832        /* find_lock_lowest_rq locks the rq if found */
1833        lowest_rq = find_lock_lowest_rq(next_task, rq);
1834        if (!lowest_rq) {
1835                struct task_struct *task;
1836                /*
1837                 * find_lock_lowest_rq releases rq->lock
1838                 * so it is possible that next_task has migrated.
1839                 *
1840                 * We need to make sure that the task is still on the same
1841                 * run-queue and is also still the next task eligible for
1842                 * pushing.
1843                 */
1844                task = pick_next_pushable_task(rq);
1845                if (task == next_task) {
1846                        /*
1847                         * The task hasn't migrated, and is still the next
1848                         * eligible task, but we failed to find a run-queue
1849                         * to push it to.  Do not retry in this case, since
1850                         * other CPUs will pull from us when ready.
1851                         */
1852                        goto out;
1853                }
1854
1855                if (!task)
1856                        /* No more tasks, just exit */
1857                        goto out;
1858
1859                /*
1860                 * Something has shifted, try again.
1861                 */
1862                put_task_struct(next_task);
1863                next_task = task;
1864                goto retry;
1865        }
1866
1867        deactivate_task(rq, next_task, 0);
1868        set_task_cpu(next_task, lowest_rq->cpu);
1869        activate_task(lowest_rq, next_task, 0);
1870        ret = 1;
1871
1872        resched_curr(lowest_rq);
1873
1874        double_unlock_balance(rq, lowest_rq);
1875
1876out:
1877        put_task_struct(next_task);
1878
1879        return ret;
1880}
1881
1882static void push_rt_tasks(struct rq *rq)
1883{
1884        /* push_rt_task will return true if it moved an RT task */
1885        while (push_rt_task(rq))
1886                ;
1887}
1888
1889#ifdef HAVE_RT_PUSH_IPI
1890
1891/*
1892 * When a high priority task schedules out from a CPU and a lower priority
1893 * task is scheduled in, a check is made to see if there are any RT tasks
1894 * on other CPUs that are waiting to run because a higher priority RT task
1895 * is currently running on its CPU. In this case, the CPU with multiple RT
1896 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1897 * up that may be able to run one of its non-running queued RT tasks.
1898 *
1899 * All CPUs with overloaded RT tasks need to be notified as there is currently
1900 * no way to know which of these CPUs have the highest priority task waiting
1901 * to run. Instead of trying to take a spinlock on each of these CPUs,
1902 * which has been shown to cause large latencies when done on machines with
1903 * many CPUs, an IPI is sent to the CPUs to have them push off the overloaded
1904 * RT tasks waiting to run.
1905 *
1906 * Just sending an IPI to each of the CPUs is also an issue, as on large
1907 * count CPU machines, this can cause an IPI storm on a CPU, especially
1908 * if it's the only CPU with multiple RT tasks queued, and a large number
1909 * of CPUs scheduling a lower priority task at the same time.
1910 *
1911 * Each root domain has its own irq work function that can iterate over
1912 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1913 * tasks must be checked whether one or many CPUs are lowering their
1914 * priority, there's a single irq work iterator that will try to
1915 * push off RT tasks that are waiting to run.
1916 *
1917 * When a CPU schedules a lower priority task, it will kick off the
1918 * irq work iterator that will jump to each CPU with overloaded RT tasks.
1919 * As it only takes the first CPU that schedules a lower priority task
1920 * to start the process, the rto_start variable is incremented and if
1921 * the atomic result is one, then that CPU will try to take the rto_lock.
1922 * This prevents high contention on the lock as the process handles all
1923 * CPUs scheduling lower priority tasks.
1924 *
1925 * All CPUs that are scheduling a lower priority task will increment the
1926 * rto_loop_next variable. This will make sure that the irq work iterator
1927 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
1928 * priority task, even if the iterator is in the middle of a scan. Incrementing
1929 * the rt_loop_next will cause the iterator to perform another scan.
1930 *
1931 */
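
/*
 * A minimal userspace model of the rto_loop/rto_loop_next handshake
 * described above -- an illustrative sketch only, not kernel code; names
 * are simplified and the rto_mask walk is replaced by a hypothetical
 * find_next_overloaded_cpu() stub:
 *
 *	#include <stdatomic.h>
 *
 *	int find_next_overloaded_cpu(void);	// hypothetical rto_mask walk
 *
 *	static atomic_int loop_next;		// models rto_loop_next
 *	static int loop;			// models rto_loop (under rto_lock)
 *
 *	// Called with the model's rto_lock held; -1 means the scan is done.
 *	static int next_cpu_model(void)
 *	{
 *		for (;;) {
 *			int cpu = find_next_overloaded_cpu();
 *
 *			if (cpu >= 0)
 *				return cpu;
 *
 *			// Mask exhausted: rescan only if someone bumped
 *			// loop_next since we last looked; otherwise stop.
 *			int next = atomic_load_explicit(&loop_next,
 *							memory_order_acquire);
 *			if (loop == next)
 *				return -1;
 *			loop = next;
 *		}
 *	}
 */
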
1932static int rto_next_cpu(struct root_domain *rd)
1933{
1934        int next;
1935        int cpu;
1936
1937        /*
1938         * When starting the IPI RT pushing, the rto_cpu is set to -1, and
1939         * rto_next_cpu() will simply return the first CPU found in
1940         * the rto_mask.
1941         *
1942         * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
1943         * will return the next CPU found in the rto_mask.
1944         *
1945         * If there are no more CPUs left in the rto_mask, then a check is made
1946         * against rto_loop and rto_loop_next. rto_loop is only updated with
1947         * the rto_lock held, but any CPU may increment the rto_loop_next
1948         * without any locking.
1949         */
1950        for (;;) {
1951
1952                /* When rto_cpu is -1 this acts like cpumask_first() */
1953                cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
1954
1955                rd->rto_cpu = cpu;
1956
1957                if (cpu < nr_cpu_ids)
1958                        return cpu;
1959
1960                rd->rto_cpu = -1;
1961
1962                /*
1963                 * ACQUIRE ensures we see the @rto_mask changes
1964                 * made prior to the @next value observed.
1965                 *
1966                 * Matches WMB in rt_set_overload().
1967                 */
1968                next = atomic_read_acquire(&rd->rto_loop_next);
1969
1970                if (rd->rto_loop == next)
1971                        break;
1972
1973                rd->rto_loop = next;
1974        }
1975
1976        return -1;
1977}
1978
1979static inline bool rto_start_trylock(atomic_t *v)
1980{
1981        return !atomic_cmpxchg_acquire(v, 0, 1);
1982}
1983
1984static inline void rto_start_unlock(atomic_t *v)
1985{
1986        atomic_set_release(v, 0);
1987}
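
/*
 * The start/finish gate above, expressed with C11 atomics for reference.
 * This is an illustrative equivalent of the pattern, not the kernel
 * implementation:
 *
 *	#include <stdatomic.h>
 *	#include <stdbool.h>
 *
 *	static atomic_int loop_start;		// models rto_loop_start
 *
 *	static bool start_trylock(void)
 *	{
 *		int expected = 0;
 *
 *		return atomic_compare_exchange_strong_explicit(&loop_start,
 *				&expected, 1, memory_order_acquire,
 *				memory_order_relaxed);
 *	}
 *
 *	static void start_unlock(void)
 *	{
 *		atomic_store_explicit(&loop_start, 0, memory_order_release);
 *	}
 */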
1988
1989static void tell_cpu_to_push(struct rq *rq)
1990{
1991        int cpu = -1;
1992
1993        /* Keep the loop going if the IPI is currently active */
1994        atomic_inc(&rq->rd->rto_loop_next);
1995
1996        /* Only one CPU can initiate a loop at a time */
1997        if (!rto_start_trylock(&rq->rd->rto_loop_start))
1998                return;
1999
2000        raw_spin_lock(&rq->rd->rto_lock);
2001
2002        /*
2003         * The rto_cpu is updated under the lock; if it holds a valid CPU
2004         * then the IPI is still running and will continue due to the
2005         * update to loop_next, and nothing needs to be done here.
2006         * Otherwise it is finishing up and an IPI needs to be sent.
2007         */
2008        if (rq->rd->rto_cpu < 0)
2009                cpu = rto_next_cpu(rq->rd);
2010
2011        raw_spin_unlock(&rq->rd->rto_lock);
2012
2013        rto_start_unlock(&rq->rd->rto_loop_start);
2014
2015        if (cpu >= 0) {
2016                /* Make sure the rd does not get freed while pushing */
2017                sched_get_rd(rq->rd);
2018                irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2019        }
2020}
2021
2022/* Called from hardirq context */
2023void rto_push_irq_work_func(struct irq_work *work)
2024{
2025        struct root_domain *rd =
2026                container_of(work, struct root_domain, rto_push_work);
2027        struct rq *rq;
2028        int cpu;
2029
2030        rq = this_rq();
2031
2032        /*
2033         * We do not need to grab the lock to check for has_pushable_tasks.
2034         * When it gets updated, a check is made to see if a push is possible.
2035         */
2036        if (has_pushable_tasks(rq)) {
2037                raw_spin_lock(&rq->lock);
2038                push_rt_tasks(rq);
2039                raw_spin_unlock(&rq->lock);
2040        }
2041
2042        raw_spin_lock(&rd->rto_lock);
2043
2044        /* Pass the IPI to the next rt overloaded queue */
2045        cpu = rto_next_cpu(rd);
2046
2047        raw_spin_unlock(&rd->rto_lock);
2048
2049        if (cpu < 0) {
2050                sched_put_rd(rd);
2051                return;
2052        }
2053
2054        /* Try the next RT overloaded CPU */
2055        irq_work_queue_on(&rd->rto_push_work, cpu);
2056}
2057#endif /* HAVE_RT_PUSH_IPI */
2058
2059static void pull_rt_task(struct rq *this_rq)
2060{
2061        int this_cpu = this_rq->cpu, cpu;
2062        bool resched = false;
2063        struct task_struct *p;
2064        struct rq *src_rq;
2065        int rt_overload_count = rt_overloaded(this_rq);
2066
2067        if (likely(!rt_overload_count))
2068                return;
2069
2070        /*
2071         * Match the barrier from rt_set_overload(); this guarantees that if we
2072         * see overloaded we must also see the rto_mask bit.
2073         */
2074        smp_rmb();
2075
2076        /* If we are the only overloaded CPU do nothing */
2077        if (rt_overload_count == 1 &&
2078            cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2079                return;
2080
2081#ifdef HAVE_RT_PUSH_IPI
2082        if (sched_feat(RT_PUSH_IPI)) {
2083                tell_cpu_to_push(this_rq);
2084                return;
2085        }
2086#endif
2087
2088        for_each_cpu(cpu, this_rq->rd->rto_mask) {
2089                if (this_cpu == cpu)
2090                        continue;
2091
2092                src_rq = cpu_rq(cpu);
2093
2094                /*
2095                 * Don't bother taking the src_rq->lock if the next highest
2096                 * task is known to be lower-priority than our current task.
2097                 * This may look racy, but if this value is about to go
2098                 * logically higher, the src_rq will push this task away.
2099                 * And if it's going logically lower, we do not care.
2100                 */
2101                if (src_rq->rt.highest_prio.next >=
2102                    this_rq->rt.highest_prio.curr)
2103                        continue;
2104
2105                /*
2106                 * We can potentially drop this_rq's lock in
2107                 * double_lock_balance, and another CPU could
2108                 * alter this_rq
2109                 */
2110                double_lock_balance(this_rq, src_rq);
2111
2112                /*
2113                 * We can only pull a task that is pushable
2114                 * on its rq, and no others.
2115                 */
2116                p = pick_highest_pushable_task(src_rq, this_cpu);
2117
2118                /*
2119                 * Do we have an RT task that preempts
2120                 * the to-be-scheduled task?
2121                 */
2122                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2123                        WARN_ON(p == src_rq->curr);
2124                        WARN_ON(!task_on_rq_queued(p));
2125
2126                        /*
2127                         * There's a chance that p is higher in priority
2128                         * than what's currently running on its CPU.
2129                         * This is just that p is waking up and hasn't
2130                         * had a chance to schedule. We only pull
2131                         * p if it is lower in priority than the
2132                         * current task on the run queue.
2133                         */
2134                        if (p->prio < src_rq->curr->prio)
2135                                goto skip;
2136
2137                        resched = true;
2138
2139                        deactivate_task(src_rq, p, 0);
2140                        set_task_cpu(p, this_cpu);
2141                        activate_task(this_rq, p, 0);
2142                        /*
2143                         * We continue with the search, just in
2144                         * case there's an even higher prio task
2145                         * in another runqueue. (low likelihood
2146                         * but possible)
2147                         */
2148                }
2149skip:
2150                double_unlock_balance(this_rq, src_rq);
2151        }
2152
2153        if (resched)
2154                resched_curr(this_rq);
2155}
2156
2157/*
2158 * If we are not running and we are not going to reschedule soon, we should
2159 * try to push tasks away now
2160 */
2161static void task_woken_rt(struct rq *rq, struct task_struct *p)
2162{
2163        if (!task_running(rq, p) &&
2164            !test_tsk_need_resched(rq->curr) &&
2165            p->nr_cpus_allowed > 1 &&
2166            (dl_task(rq->curr) || rt_task(rq->curr)) &&
2167            (rq->curr->nr_cpus_allowed < 2 ||
2168             rq->curr->prio <= p->prio))
2169                push_rt_tasks(rq);
2170}
2171
2172/* Assumes rq->lock is held */
2173static void rq_online_rt(struct rq *rq)
2174{
2175        if (rq->rt.overloaded)
2176                rt_set_overload(rq);
2177
2178        __enable_runtime(rq);
2179
2180        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2181}
2182
2183/* Assumes rq->lock is held */
2184static void rq_offline_rt(struct rq *rq)
2185{
2186        if (rq->rt.overloaded)
2187                rt_clear_overload(rq);
2188
2189        __disable_runtime(rq);
2190
2191        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2192}
2193
2194/*
2195 * When switching from the rt queue, we bring ourselves to a position
2196 * where we might want to pull RT tasks from other runqueues.
2197 */
2198static void switched_from_rt(struct rq *rq, struct task_struct *p)
2199{
2200        /*
2201         * If there are other RT tasks then we will reschedule
2202         * and the scheduling of the other RT tasks will handle
2203         * the balancing. But if we are the last RT task
2204         * we may need to handle the pulling of RT tasks
2205         * now.
2206         */
2207        if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2208                return;
2209
2210        rt_queue_pull_task(rq);
2211}
2212
2213void __init init_sched_rt_class(void)
2214{
2215        unsigned int i;
2216
2217        for_each_possible_cpu(i) {
2218                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2219                                        GFP_KERNEL, cpu_to_node(i));
2220        }
2221}
2222#endif /* CONFIG_SMP */
2223
2224/*
2225 * When switching a task to RT, we may overload the runqueue
2226 * with RT tasks. In this case we try to push them off to
2227 * other runqueues.
2228 */
2229static void switched_to_rt(struct rq *rq, struct task_struct *p)
2230{
2231        /*
2232         * If we are already running, then there's nothing
2233         * that needs to be done. But if we are not running
2234         * we may need to preempt the current running task.
2235         * If that current running task is also an RT task
2236         * then see if we can move to another run queue.
2237         */
2238        if (task_on_rq_queued(p) && rq->curr != p) {
2239#ifdef CONFIG_SMP
2240                if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2241                        rt_queue_push_tasks(rq);
2242#endif /* CONFIG_SMP */
2243                if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2244                        resched_curr(rq);
2245        }
2246}
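
/*
 * switched_to_rt() is invoked by the scheduler core when a task enters
 * the RT class, for example after a policy change. A minimal userspace
 * example of triggering such a change (illustrative only):
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct sched_param sp = { .sched_priority = 10 };
 *
 *		// 0 == calling thread; needs CAP_SYS_NICE or a nonzero
 *		// RLIMIT_RTPRIO to succeed.
 *		if (sched_setscheduler(0, SCHED_FIFO, &sp))
 *			perror("sched_setscheduler");
 *		return 0;
 *	}
 */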
2247
2248/*
2249 * Priority of the task has changed. This may cause
2250 * us to initiate a push or pull.
2251 */
2252static void
2253prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2254{
2255        if (!task_on_rq_queued(p))
2256                return;
2257
2258        if (rq->curr == p) {
2259#ifdef CONFIG_SMP
2260                /*
2261                 * If our priority decreases while running, we
2262                 * may need to pull tasks to this runqueue.
2263                 */
2264                if (oldprio < p->prio)
2265                        rt_queue_pull_task(rq);
2266
2267                /*
2268                 * If there's a higher priority task waiting to run
2269                 * then reschedule.
2270                 */
2271                if (p->prio > rq->rt.highest_prio.curr)
2272                        resched_curr(rq);
2273#else
2274                /* For UP simply resched on drop of prio */
2275                if (oldprio < p->prio)
2276                        resched_curr(rq);
2277#endif /* CONFIG_SMP */
2278        } else {
2279                /*
2280                 * This task is not running, but if its priority
2281                 * is higher than that of the current running task,
2282                 * then reschedule.
2283                 */
2284                if (p->prio < rq->curr->prio)
2285                        resched_curr(rq);
2286        }
2287}
2288
2289#ifdef CONFIG_POSIX_TIMERS
2290static void watchdog(struct rq *rq, struct task_struct *p)
2291{
2292        unsigned long soft, hard;
2293
2294        /* max may change after cur was read, this will be fixed next tick */
2295        soft = task_rlimit(p, RLIMIT_RTTIME);
2296        hard = task_rlimit_max(p, RLIMIT_RTTIME);
2297
2298        if (soft != RLIM_INFINITY) {
2299                unsigned long next;
2300
2301                if (p->rt.watchdog_stamp != jiffies) {
2302                        p->rt.timeout++;
2303                        p->rt.watchdog_stamp = jiffies;
2304                }
2305
2306                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2307                if (p->rt.timeout > next)
2308                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2309        }
2310}
2311#else
2312static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2313#endif
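
/*
 * The RLIMIT_RTTIME budget checked above is configured from userspace
 * with setrlimit(); an illustrative fragment (values are microseconds of
 * RT CPU time consumed without blocking):
 *
 *	#include <sys/resource.h>
 *
 *	struct rlimit rl = {
 *		.rlim_cur = 500000,	// soft limit: SIGXCPU after ~0.5 s
 *		.rlim_max = 1000000,	// hard limit: SIGKILL after ~1 s
 *	};
 *	setrlimit(RLIMIT_RTTIME, &rl);
 */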
2314
2315/*
2316 * scheduler tick hitting a task of our scheduling class.
2317 *
2318 * NOTE: This function can be called remotely by the tick offload that
2319 * goes along full dynticks. Therefore no local assumption can be made
2320 * and everything must be accessed through the @rq and @curr passed in
2321 * parameters.
2322 */
2323static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2324{
2325        struct sched_rt_entity *rt_se = &p->rt;
2326
2327        update_curr_rt(rq);
2328        update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2329
2330        watchdog(rq, p);
2331
2332        /*
2333         * RR tasks need a special form of timeslice management.
2334         * FIFO tasks have no timeslices.
2335         */
2336        if (p->policy != SCHED_RR)
2337                return;
2338
2339        if (--p->rt.time_slice)
2340                return;
2341
2342        p->rt.time_slice = sched_rr_timeslice;
2343
2344        /*
2345         * Requeue to the end of the queue if we (and all of our ancestors) are not
2346         * the only element on the queue
2347         */
2348        for_each_sched_rt_entity(rt_se) {
2349                if (rt_se->run_list.prev != rt_se->run_list.next) {
2350                        requeue_task_rt(rq, p, 0);
2351                        resched_curr(rq);
2352                        return;
2353                }
2354        }
2355}
2356
2357static void set_curr_task_rt(struct rq *rq)
2358{
2359        set_next_task(rq, rq->curr);
2360}
2361
2362static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2363{
2364        /*
2365         * Time slice is 0 for SCHED_FIFO tasks
2366         */
2367        if (task->policy == SCHED_RR)
2368                return sched_rr_timeslice;
2369        else
2370                return 0;
2371}
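
/*
 * The value returned here backs the sched_rr_get_interval(2) system
 * call. A minimal userspace query (illustrative only); a SCHED_FIFO
 * task reports a timeslice of 0, matching the code above:
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		if (sched_rr_get_interval(0, &ts) == 0)	// 0 == calling thread
 *			printf("timeslice: %ld.%09ld s\n",
 *			       (long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 */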
2372
2373const struct sched_class rt_sched_class = {
2374        .next                   = &fair_sched_class,
2375        .enqueue_task           = enqueue_task_rt,
2376        .dequeue_task           = dequeue_task_rt,
2377        .yield_task             = yield_task_rt,
2378
2379        .check_preempt_curr     = check_preempt_curr_rt,
2380
2381        .pick_next_task         = pick_next_task_rt,
2382        .put_prev_task          = put_prev_task_rt,
2383
2384#ifdef CONFIG_SMP
2385        .select_task_rq         = select_task_rq_rt,
2386
2387        .set_cpus_allowed       = set_cpus_allowed_common,
2388        .rq_online              = rq_online_rt,
2389        .rq_offline             = rq_offline_rt,
2390        .task_woken             = task_woken_rt,
2391        .switched_from          = switched_from_rt,
2392#endif
2393
2394        .set_curr_task          = set_curr_task_rt,
2395        .task_tick              = task_tick_rt,
2396
2397        .get_rr_interval        = get_rr_interval_rt,
2398
2399        .prio_changed           = prio_changed_rt,
2400        .switched_to            = switched_to_rt,
2401
2402        .update_curr            = update_curr_rt,
2403
2404#ifdef CONFIG_UCLAMP_TASK
2405        .uclamp_enabled         = 1,
2406#endif
2407};
2408
2409#ifdef CONFIG_RT_GROUP_SCHED
2410/*
2411 * Ensure that the real time constraints are schedulable.
2412 */
2413static DEFINE_MUTEX(rt_constraints_mutex);
2414
2415/* Must be called with tasklist_lock held */
2416static inline int tg_has_rt_tasks(struct task_group *tg)
2417{
2418        struct task_struct *g, *p;
2419
2420        /*
2421         * Autogroups do not have RT tasks; see autogroup_create().
2422         */
2423        if (task_group_is_autogroup(tg))
2424                return 0;
2425
2426        for_each_process_thread(g, p) {
2427                if (rt_task(p) && task_group(p) == tg)
2428                        return 1;
2429        }
2430
2431        return 0;
2432}
2433
2434struct rt_schedulable_data {
2435        struct task_group *tg;
2436        u64 rt_period;
2437        u64 rt_runtime;
2438};
2439
2440static int tg_rt_schedulable(struct task_group *tg, void *data)
2441{
2442        struct rt_schedulable_data *d = data;
2443        struct task_group *child;
2444        unsigned long total, sum = 0;
2445        u64 period, runtime;
2446
2447        period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2448        runtime = tg->rt_bandwidth.rt_runtime;
2449
2450        if (tg == d->tg) {
2451                period = d->rt_period;
2452                runtime = d->rt_runtime;
2453        }
2454
2455        /*
2456         * Cannot have more runtime than the period.
2457         */
2458        if (runtime > period && runtime != RUNTIME_INF)
2459                return -EINVAL;
2460
2461        /*
2462         * Ensure we don't starve existing RT tasks.
2463         */
2464        if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2465                return -EBUSY;
2466
2467        total = to_ratio(period, runtime);
2468
2469        /*
2470         * Nobody can have more than the global setting allows.
2471         */
2472        if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2473                return -EINVAL;
2474
2475        /*
2476         * The sum of our children's runtime should not exceed our own.
2477         */
2478        list_for_each_entry_rcu(child, &tg->children, siblings) {
2479                period = ktime_to_ns(child->rt_bandwidth.rt_period);
2480                runtime = child->rt_bandwidth.rt_runtime;
2481
2482                if (child == d->tg) {
2483                        period = d->rt_period;
2484                        runtime = d->rt_runtime;
2485                }
2486
2487                sum += to_ratio(period, runtime);
2488        }
2489
2490        if (sum > total)
2491                return -EINVAL;
2492
2493        return 0;
2494}
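
/*
 * A worked example of the ratio checks above, modelled on the kernel's
 * to_ratio() arithmetic (assumed here to scale runtime by 2^20 and
 * divide by the period):
 *
 *	global default: runtime = 950000 us, period = 1000000 us
 *		-> (950000 << 20) / 1000000 ~= 996147	(~0.95)
 *
 *	a group requesting 400000 us / 1000000 us:
 *		-> (400000 << 20) / 1000000 ~= 419430	(~0.40)
 *
 * 419430 <= 996147, so the group fits under the global limit; its
 * children's ratios must in turn sum to no more than 419430.
 */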
2495
2496static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2497{
2498        int ret;
2499
2500        struct rt_schedulable_data data = {
2501                .tg = tg,
2502                .rt_period = period,
2503                .rt_runtime = runtime,
2504        };
2505
2506        rcu_read_lock();
2507        ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2508        rcu_read_unlock();
2509
2510        return ret;
2511}
2512
2513static int tg_set_rt_bandwidth(struct task_group *tg,
2514                u64 rt_period, u64 rt_runtime)
2515{
2516        int i, err = 0;
2517
2518        /*
2519         * Disallowing the root group RT runtime is BAD, it would prevent the
2520         * kernel from creating (and/or operating) RT threads.
2521         */
2522        if (tg == &root_task_group && rt_runtime == 0)
2523                return -EINVAL;
2524
2525        /* A zero period doesn't make any sense. */
2526        if (rt_period == 0)
2527                return -EINVAL;
2528
2529        mutex_lock(&rt_constraints_mutex);
2530        read_lock(&tasklist_lock);
2531        err = __rt_schedulable(tg, rt_period, rt_runtime);
2532        if (err)
2533                goto unlock;
2534
2535        raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2536        tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2537        tg->rt_bandwidth.rt_runtime = rt_runtime;
2538
2539        for_each_possible_cpu(i) {
2540                struct rt_rq *rt_rq = tg->rt_rq[i];
2541
2542                raw_spin_lock(&rt_rq->rt_runtime_lock);
2543                rt_rq->rt_runtime = rt_runtime;
2544                raw_spin_unlock(&rt_rq->rt_runtime_lock);
2545        }
2546        raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2547unlock:
2548        read_unlock(&tasklist_lock);
2549        mutex_unlock(&rt_constraints_mutex);
2550
2551        return err;
2552}
2553
2554int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2555{
2556        u64 rt_runtime, rt_period;
2557
2558        rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2559        rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2560        if (rt_runtime_us < 0)
2561                rt_runtime = RUNTIME_INF;
2562        else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2563                return -EINVAL;
2564
2565        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2566}
2567
2568long sched_group_rt_runtime(struct task_group *tg)
2569{
2570        u64 rt_runtime_us;
2571
2572        if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2573                return -1;
2574
2575        rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2576        do_div(rt_runtime_us, NSEC_PER_USEC);
2577        return rt_runtime_us;
2578}
2579
2580int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2581{
2582        u64 rt_runtime, rt_period;
2583
2584        if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2585                return -EINVAL;
2586
2587        rt_period = rt_period_us * NSEC_PER_USEC;
2588        rt_runtime = tg->rt_bandwidth.rt_runtime;
2589
2590        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2591}
2592
2593long sched_group_rt_period(struct task_group *tg)
2594{
2595        u64 rt_period_us;
2596
2597        rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2598        do_div(rt_period_us, NSEC_PER_USEC);
2599        return rt_period_us;
2600}
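
/*
 * With CONFIG_RT_GROUP_SCHED these setters back the cgroup-v1 cpu
 * controller files cpu.rt_runtime_us and cpu.rt_period_us. A hedged
 * userspace example (the mount point and group name are assumptions
 * about the local setup):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// grant the group 300 ms of RT runtime per (default) 1 s period
 *		FILE *f = fopen("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", "w");
 *
 *		if (!f)
 *			return 1;
 *		fprintf(f, "300000\n");
 *		return fclose(f) ? 1 : 0;
 *	}
 */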
2601
2602static int sched_rt_global_constraints(void)
2603{
2604        int ret = 0;
2605
2606        mutex_lock(&rt_constraints_mutex);
2607        read_lock(&tasklist_lock);
2608        ret = __rt_schedulable(NULL, 0, 0);
2609        read_unlock(&tasklist_lock);
2610        mutex_unlock(&rt_constraints_mutex);
2611
2612        return ret;
2613}
2614
2615int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2616{
2617        /* Don't accept realtime tasks when there is no way for them to run */
2618        if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2619                return 0;
2620
2621        return 1;
2622}
2623
2624#else /* !CONFIG_RT_GROUP_SCHED */
2625static int sched_rt_global_constraints(void)
2626{
2627        unsigned long flags;
2628        int i;
2629
2630        raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2631        for_each_possible_cpu(i) {
2632                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2633
2634                raw_spin_lock(&rt_rq->rt_runtime_lock);
2635                rt_rq->rt_runtime = global_rt_runtime();
2636                raw_spin_unlock(&rt_rq->rt_runtime_lock);
2637        }
2638        raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2639
2640        return 0;
2641}
2642#endif /* CONFIG_RT_GROUP_SCHED */
2643
2644static int sched_rt_global_validate(void)
2645{
2646        if (sysctl_sched_rt_period <= 0)
2647                return -EINVAL;
2648
2649        if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2650                (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2651                return -EINVAL;
2652
2653        return 0;
2654}
2655
2656static void sched_rt_do_global(void)
2657{
2658        def_rt_bandwidth.rt_runtime = global_rt_runtime();
2659        def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2660}
2661
2662int sched_rt_handler(struct ctl_table *table, int write,
2663                void __user *buffer, size_t *lenp,
2664                loff_t *ppos)
2665{
2666        int old_period, old_runtime;
2667        static DEFINE_MUTEX(mutex);
2668        int ret;
2669
2670        mutex_lock(&mutex);
2671        old_period = sysctl_sched_rt_period;
2672        old_runtime = sysctl_sched_rt_runtime;
2673
2674        ret = proc_dointvec(table, write, buffer, lenp, ppos);
2675
2676        if (!ret && write) {
2677                ret = sched_rt_global_validate();
2678                if (ret)
2679                        goto undo;
2680
2681                ret = sched_dl_global_validate();
2682                if (ret)
2683                        goto undo;
2684
2685                ret = sched_rt_global_constraints();
2686                if (ret)
2687                        goto undo;
2688
2689                sched_rt_do_global();
2690                sched_dl_do_global();
2691        }
2692        if (0) {
2693undo:
2694                sysctl_sched_rt_period = old_period;
2695                sysctl_sched_rt_runtime = old_runtime;
2696        }
2697        mutex_unlock(&mutex);
2698
2699        return ret;
2700}
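
/*
 * The entries served by this handler are exposed as
 * /proc/sys/kernel/sched_rt_period_us and
 * /proc/sys/kernel/sched_rt_runtime_us. A rough illustration of the
 * validation performed above, using the usual defaults:
 *
 *	period = 1000000 us, runtime = 950000 us   -> accepted (default)
 *	period = 1000000 us, runtime = 2000000 us  -> rejected, runtime > period
 *	runtime = -1                               -> accepted, treated as
 *						      RUNTIME_INF (no throttling)
 */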
2701
2702int sched_rr_handler(struct ctl_table *table, int write,
2703                void __user *buffer, size_t *lenp,
2704                loff_t *ppos)
2705{
2706        int ret;
2707        static DEFINE_MUTEX(mutex);
2708
2709        mutex_lock(&mutex);
2710        ret = proc_dointvec(table, write, buffer, lenp, ppos);
2711        /*
2712         * Make sure that internally we keep jiffies.
2713         * Also, writing zero resets the timeslice to default:
2714         */
2715        if (!ret && write) {
2716                sched_rr_timeslice =
2717                        sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2718                        msecs_to_jiffies(sysctl_sched_rr_timeslice);
2719        }
2720        mutex_unlock(&mutex);
2721
2722        return ret;
2723}
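
/*
 * The entry behind this handler is /proc/sys/kernel/sched_rr_timeslice_ms.
 * A rough illustration of the conversion above, assuming HZ=250 (one
 * jiffy is 4 ms):
 *
 *	write 100 -> sched_rr_timeslice = msecs_to_jiffies(100) = 25 jiffies
 *	write 0   -> sched_rr_timeslice = RR_TIMESLICE (the 100 ms default)
 */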
2724
2725#ifdef CONFIG_SCHED_DEBUG
2726void print_rt_stats(struct seq_file *m, int cpu)
2727{
2728        rt_rq_iter_t iter;
2729        struct rt_rq *rt_rq;
2730
2731        rcu_read_lock();
2732        for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2733                print_rt_rq(m, cpu, rt_rq);
2734        rcu_read_unlock();
2735}
2736#endif /* CONFIG_SCHED_DEBUG */
2737