linux/kernel/sched/rt.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   4 * policies)
   5 */
   6#include "sched.h"
   7
   8int sched_rr_timeslice = RR_TIMESLICE;
   9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
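/*
 * Editor's worked example (a sketch, assuming the usual definition
 * RR_TIMESLICE = 100 * HZ / 1000, i.e. 100 msec expressed in jiffies):
 *
 *   HZ = 250:  sched_rr_timeslice        = 100 * 250 / 1000  = 25 jiffies
 *              sysctl_sched_rr_timeslice = (1000 / 250) * 25 = 100 msec
 *
 * The scheduler keeps the RR quantum in jiffies for the tick path, while
 * the sysctl exposes the same value to user space in milliseconds.
 */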
  10
  11static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  12
  13struct rt_bandwidth def_rt_bandwidth;
  14
  15static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  16{
  17        struct rt_bandwidth *rt_b =
  18                container_of(timer, struct rt_bandwidth, rt_period_timer);
  19        int idle = 0;
  20        int overrun;
  21
  22        raw_spin_lock(&rt_b->rt_runtime_lock);
  23        for (;;) {
  24                overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  25                if (!overrun)
  26                        break;
  27
  28                raw_spin_unlock(&rt_b->rt_runtime_lock);
  29                idle = do_sched_rt_period_timer(rt_b, overrun);
  30                raw_spin_lock(&rt_b->rt_runtime_lock);
  31        }
  32        if (idle)
  33                rt_b->rt_period_active = 0;
  34        raw_spin_unlock(&rt_b->rt_runtime_lock);
  35
  36        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  37}
  38
  39void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  40{
  41        rt_b->rt_period = ns_to_ktime(period);
  42        rt_b->rt_runtime = runtime;
  43
  44        raw_spin_lock_init(&rt_b->rt_runtime_lock);
  45
  46        hrtimer_init(&rt_b->rt_period_timer,
  47                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  48        rt_b->rt_period_timer.function = sched_rt_period_timer;
  49}
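/*
 * Editor's usage sketch (the call site lives in kernel/sched/core.c, not in
 * this file, so treat the exact form as an assumption): the default
 * bandwidth object declared above is initialized once at boot from the
 * global RT throttling sysctls, roughly:
 *
 *   init_rt_bandwidth(&def_rt_bandwidth,
 *                     global_rt_period(), global_rt_runtime());
 *
 * which with the default sched_rt_period_us = 1000000 and
 * sched_rt_runtime_us = 950000 allows RT tasks at most 950 ms of every
 * 1000 ms period.
 */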
  50
  51static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  52{
  53        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  54                return;
  55
  56        raw_spin_lock(&rt_b->rt_runtime_lock);
  57        if (!rt_b->rt_period_active) {
  58                rt_b->rt_period_active = 1;
  59                /*
   60                 * SCHED_DEADLINE updates the bandwidth, as a runaway
   61                 * RT task alongside a DL task could hog a CPU. But DL does
  62                 * not reset the period. If a deadline task was running
  63                 * without an RT task running, it can cause RT tasks to
  64                 * throttle when they start up. Kick the timer right away
  65                 * to update the period.
  66                 */
  67                hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
  68                hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
  69        }
  70        raw_spin_unlock(&rt_b->rt_runtime_lock);
  71}
  72
  73void init_rt_rq(struct rt_rq *rt_rq)
  74{
  75        struct rt_prio_array *array;
  76        int i;
  77
  78        array = &rt_rq->active;
  79        for (i = 0; i < MAX_RT_PRIO; i++) {
  80                INIT_LIST_HEAD(array->queue + i);
  81                __clear_bit(i, array->bitmap);
  82        }
  83        /* delimiter for bitsearch: */
  84        __set_bit(MAX_RT_PRIO, array->bitmap);
  85
  86#if defined CONFIG_SMP
  87        rt_rq->highest_prio.curr = MAX_RT_PRIO;
  88        rt_rq->highest_prio.next = MAX_RT_PRIO;
  89        rt_rq->rt_nr_migratory = 0;
  90        rt_rq->overloaded = 0;
  91        plist_head_init(&rt_rq->pushable_tasks);
  92#endif /* CONFIG_SMP */
   93        /* We start in dequeued state, because no RT tasks are queued */
  94        rt_rq->rt_queued = 0;
  95
  96        rt_rq->rt_time = 0;
  97        rt_rq->rt_throttled = 0;
  98        rt_rq->rt_runtime = 0;
  99        raw_spin_lock_init(&rt_rq->rt_runtime_lock);
 100}
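/*
 * Editor's note on the bitsearch delimiter set above (a sketch of the
 * lookup pattern used later in this file): the priority bitmap is
 * MAX_RT_PRIO + 1 bits wide and bit MAX_RT_PRIO stays permanently set, so
 * a search over an empty array terminates at the delimiter instead of
 * running off the end:
 *
 *   int idx = sched_find_first_bit(array->bitmap);
 *
 *   if (idx < MAX_RT_PRIO)
 *           ;  // array->queue[idx] holds the highest-priority entities
 *   else
 *           ;  // only the delimiter bit is set: nothing is queued
 */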
 101
 102#ifdef CONFIG_RT_GROUP_SCHED
 103static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 104{
 105        hrtimer_cancel(&rt_b->rt_period_timer);
 106}
 107
 108#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 109
 110static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 111{
 112#ifdef CONFIG_SCHED_DEBUG
 113        WARN_ON_ONCE(!rt_entity_is_task(rt_se));
 114#endif
 115        return container_of(rt_se, struct task_struct, rt);
 116}
 117
 118static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 119{
 120        return rt_rq->rq;
 121}
 122
 123static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 124{
 125        return rt_se->rt_rq;
 126}
 127
 128static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 129{
 130        struct rt_rq *rt_rq = rt_se->rt_rq;
 131
 132        return rt_rq->rq;
 133}
 134
 135void free_rt_sched_group(struct task_group *tg)
 136{
 137        int i;
 138
 139        if (tg->rt_se)
 140                destroy_rt_bandwidth(&tg->rt_bandwidth);
 141
 142        for_each_possible_cpu(i) {
 143                if (tg->rt_rq)
 144                        kfree(tg->rt_rq[i]);
 145                if (tg->rt_se)
 146                        kfree(tg->rt_se[i]);
 147        }
 148
 149        kfree(tg->rt_rq);
 150        kfree(tg->rt_se);
 151}
 152
 153void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 154                struct sched_rt_entity *rt_se, int cpu,
 155                struct sched_rt_entity *parent)
 156{
 157        struct rq *rq = cpu_rq(cpu);
 158
 159        rt_rq->highest_prio.curr = MAX_RT_PRIO;
 160        rt_rq->rt_nr_boosted = 0;
 161        rt_rq->rq = rq;
 162        rt_rq->tg = tg;
 163
 164        tg->rt_rq[cpu] = rt_rq;
 165        tg->rt_se[cpu] = rt_se;
 166
 167        if (!rt_se)
 168                return;
 169
 170        if (!parent)
 171                rt_se->rt_rq = &rq->rt;
 172        else
 173                rt_se->rt_rq = parent->my_q;
 174
 175        rt_se->my_q = rt_rq;
 176        rt_se->parent = parent;
 177        INIT_LIST_HEAD(&rt_se->run_list);
 178}
 179
 180int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 181{
 182        struct rt_rq *rt_rq;
 183        struct sched_rt_entity *rt_se;
 184        int i;
 185
 186        tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
 187        if (!tg->rt_rq)
 188                goto err;
 189        tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
 190        if (!tg->rt_se)
 191                goto err;
 192
 193        init_rt_bandwidth(&tg->rt_bandwidth,
 194                        ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 195
 196        for_each_possible_cpu(i) {
 197                rt_rq = kzalloc_node(sizeof(struct rt_rq),
 198                                     GFP_KERNEL, cpu_to_node(i));
 199                if (!rt_rq)
 200                        goto err;
 201
 202                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 203                                     GFP_KERNEL, cpu_to_node(i));
 204                if (!rt_se)
 205                        goto err_free_rq;
 206
 207                init_rt_rq(rt_rq);
 208                rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 209                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 210        }
 211
 212        return 1;
 213
 214err_free_rq:
 215        kfree(rt_rq);
 216err:
 217        return 0;
 218}
 219
 220#else /* CONFIG_RT_GROUP_SCHED */
 221
 222#define rt_entity_is_task(rt_se) (1)
 223
 224static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 225{
 226        return container_of(rt_se, struct task_struct, rt);
 227}
 228
 229static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 230{
 231        return container_of(rt_rq, struct rq, rt);
 232}
 233
 234static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 235{
 236        struct task_struct *p = rt_task_of(rt_se);
 237
 238        return task_rq(p);
 239}
 240
 241static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 242{
 243        struct rq *rq = rq_of_rt_se(rt_se);
 244
 245        return &rq->rt;
 246}
 247
 248void free_rt_sched_group(struct task_group *tg) { }
 249
 250int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 251{
 252        return 1;
 253}
 254#endif /* CONFIG_RT_GROUP_SCHED */
 255
 256#ifdef CONFIG_SMP
 257
 258static void pull_rt_task(struct rq *this_rq);
 259
 260static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 261{
 262        /* Try to pull RT tasks here if we lower this rq's prio */
 263        return rq->rt.highest_prio.curr > prev->prio;
 264}
 265
 266static inline int rt_overloaded(struct rq *rq)
 267{
 268        return atomic_read(&rq->rd->rto_count);
 269}
 270
 271static inline void rt_set_overload(struct rq *rq)
 272{
 273        if (!rq->online)
 274                return;
 275
 276        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 277        /*
 278         * Make sure the mask is visible before we set
 279         * the overload count. That is checked to determine
 280         * if we should look at the mask. It would be a shame
 281         * if we looked at the mask, but the mask was not
 282         * updated yet.
 283         *
 284         * Matched by the barrier in pull_rt_task().
 285         */
 286        smp_wmb();
 287        atomic_inc(&rq->rd->rto_count);
 288}
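/*
 * Editor's sketch of the read side that the smp_wmb() above pairs with
 * (pull_rt_task() appears later in the file; this is an approximation of
 * its opening checks, not a verbatim copy):
 *
 *   if (likely(!rt_overloaded(this_rq)))
 *           return;
 *
 *   smp_rmb();      // pairs with the smp_wmb() in rt_set_overload()
 *
 *   for_each_cpu(cpu, this_rq->rd->rto_mask) {
 *           ...
 *   }
 *
 * Seeing a non-zero rto_count therefore guarantees the corresponding
 * rto_mask bit is already visible.
 */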
 289
 290static inline void rt_clear_overload(struct rq *rq)
 291{
 292        if (!rq->online)
 293                return;
 294
 295        /* the order here really doesn't matter */
 296        atomic_dec(&rq->rd->rto_count);
 297        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 298}
 299
 300static void update_rt_migration(struct rt_rq *rt_rq)
 301{
 302        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
 303                if (!rt_rq->overloaded) {
 304                        rt_set_overload(rq_of_rt_rq(rt_rq));
 305                        rt_rq->overloaded = 1;
 306                }
 307        } else if (rt_rq->overloaded) {
 308                rt_clear_overload(rq_of_rt_rq(rt_rq));
 309                rt_rq->overloaded = 0;
 310        }
 311}
 312
 313static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 314{
 315        struct task_struct *p;
 316
 317        if (!rt_entity_is_task(rt_se))
 318                return;
 319
 320        p = rt_task_of(rt_se);
 321        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 322
 323        rt_rq->rt_nr_total++;
 324        if (p->nr_cpus_allowed > 1)
 325                rt_rq->rt_nr_migratory++;
 326
 327        update_rt_migration(rt_rq);
 328}
 329
 330static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 331{
 332        struct task_struct *p;
 333
 334        if (!rt_entity_is_task(rt_se))
 335                return;
 336
 337        p = rt_task_of(rt_se);
 338        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 339
 340        rt_rq->rt_nr_total--;
 341        if (p->nr_cpus_allowed > 1)
 342                rt_rq->rt_nr_migratory--;
 343
 344        update_rt_migration(rt_rq);
 345}
 346
 347static inline int has_pushable_tasks(struct rq *rq)
 348{
 349        return !plist_head_empty(&rq->rt.pushable_tasks);
 350}
 351
 352static DEFINE_PER_CPU(struct callback_head, rt_push_head);
 353static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 354
 355static void push_rt_tasks(struct rq *);
 356static void pull_rt_task(struct rq *);
 357
 358static inline void rt_queue_push_tasks(struct rq *rq)
 359{
 360        if (!has_pushable_tasks(rq))
 361                return;
 362
 363        queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 364}
 365
 366static inline void rt_queue_pull_task(struct rq *rq)
 367{
 368        queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 369}
 370
 371static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 372{
 373        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 374        plist_node_init(&p->pushable_tasks, p->prio);
 375        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 376
 377        /* Update the highest prio pushable task */
 378        if (p->prio < rq->rt.highest_prio.next)
 379                rq->rt.highest_prio.next = p->prio;
 380}
 381
 382static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 383{
 384        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 385
 386        /* Update the new highest prio pushable task */
 387        if (has_pushable_tasks(rq)) {
 388                p = plist_first_entry(&rq->rt.pushable_tasks,
 389                                      struct task_struct, pushable_tasks);
 390                rq->rt.highest_prio.next = p->prio;
 391        } else
 392                rq->rt.highest_prio.next = MAX_RT_PRIO;
 393}
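/*
 * Editor's note on the two helpers above: pushable_tasks is a
 * priority-sorted plist keyed on p->prio (lower value = higher priority),
 * so plist_first_entry() always yields the highest-priority pushable task.
 * enqueue_pushable_task() deletes and re-initializes the node before
 * adding it, so a task whose priority changed while queued lands in the
 * right position:
 *
 *   plist_node_init(&p->pushable_tasks, p->prio);  // key = current prio
 *   plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 */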
 394
 395#else
 396
 397static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 398{
 399}
 400
 401static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 402{
 403}
 404
 405static inline
 406void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 407{
 408}
 409
 410static inline
 411void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 412{
 413}
 414
 415static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 416{
 417        return false;
 418}
 419
 420static inline void pull_rt_task(struct rq *this_rq)
 421{
 422}
 423
 424static inline void rt_queue_push_tasks(struct rq *rq)
 425{
 426}
 427#endif /* CONFIG_SMP */
 428
 429static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
 430static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 431
 432static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 433{
 434        return rt_se->on_rq;
 435}
 436
 437#ifdef CONFIG_RT_GROUP_SCHED
 438
 439static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 440{
 441        if (!rt_rq->tg)
 442                return RUNTIME_INF;
 443
 444        return rt_rq->rt_runtime;
 445}
 446
 447static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 448{
 449        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 450}
 451
 452typedef struct task_group *rt_rq_iter_t;
 453
 454static inline struct task_group *next_task_group(struct task_group *tg)
 455{
 456        do {
 457                tg = list_entry_rcu(tg->list.next,
 458                        typeof(struct task_group), list);
 459        } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 460
 461        if (&tg->list == &task_groups)
 462                tg = NULL;
 463
 464        return tg;
 465}
 466
 467#define for_each_rt_rq(rt_rq, iter, rq)                                 \
 468        for (iter = container_of(&task_groups, typeof(*iter), list);    \
 469                (iter = next_task_group(iter)) &&                       \
 470                (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 471
 472#define for_each_sched_rt_entity(rt_se) \
 473        for (; rt_se; rt_se = rt_se->parent)
 474
 475static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 476{
 477        return rt_se->my_q;
 478}
 479
 480static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 481static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 482
 483static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 484{
 485        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 486        struct rq *rq = rq_of_rt_rq(rt_rq);
 487        struct sched_rt_entity *rt_se;
 488
 489        int cpu = cpu_of(rq);
 490
 491        rt_se = rt_rq->tg->rt_se[cpu];
 492
 493        if (rt_rq->rt_nr_running) {
 494                if (!rt_se)
 495                        enqueue_top_rt_rq(rt_rq);
 496                else if (!on_rt_rq(rt_se))
 497                        enqueue_rt_entity(rt_se, 0);
 498
 499                if (rt_rq->highest_prio.curr < curr->prio)
 500                        resched_curr(rq);
 501        }
 502}
 503
 504static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 505{
 506        struct sched_rt_entity *rt_se;
 507        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 508
 509        rt_se = rt_rq->tg->rt_se[cpu];
 510
 511        if (!rt_se) {
 512                dequeue_top_rt_rq(rt_rq);
 513                /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 514                cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
 515        }
 516        else if (on_rt_rq(rt_se))
 517                dequeue_rt_entity(rt_se, 0);
 518}
 519
 520static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 521{
 522        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 523}
 524
 525static int rt_se_boosted(struct sched_rt_entity *rt_se)
 526{
 527        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 528        struct task_struct *p;
 529
 530        if (rt_rq)
 531                return !!rt_rq->rt_nr_boosted;
 532
 533        p = rt_task_of(rt_se);
 534        return p->prio != p->normal_prio;
 535}
 536
 537#ifdef CONFIG_SMP
 538static inline const struct cpumask *sched_rt_period_mask(void)
 539{
 540        return this_rq()->rd->span;
 541}
 542#else
 543static inline const struct cpumask *sched_rt_period_mask(void)
 544{
 545        return cpu_online_mask;
 546}
 547#endif
 548
 549static inline
 550struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 551{
 552        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 553}
 554
 555static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 556{
 557        return &rt_rq->tg->rt_bandwidth;
 558}
 559
 560#else /* !CONFIG_RT_GROUP_SCHED */
 561
 562static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 563{
 564        return rt_rq->rt_runtime;
 565}
 566
 567static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 568{
 569        return ktime_to_ns(def_rt_bandwidth.rt_period);
 570}
 571
 572typedef struct rt_rq *rt_rq_iter_t;
 573
 574#define for_each_rt_rq(rt_rq, iter, rq) \
 575        for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 576
 577#define for_each_sched_rt_entity(rt_se) \
 578        for (; rt_se; rt_se = NULL)
 579
 580static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 581{
 582        return NULL;
 583}
 584
 585static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 586{
 587        struct rq *rq = rq_of_rt_rq(rt_rq);
 588
 589        if (!rt_rq->rt_nr_running)
 590                return;
 591
 592        enqueue_top_rt_rq(rt_rq);
 593        resched_curr(rq);
 594}
 595
 596static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 597{
 598        dequeue_top_rt_rq(rt_rq);
 599}
 600
 601static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 602{
 603        return rt_rq->rt_throttled;
 604}
 605
 606static inline const struct cpumask *sched_rt_period_mask(void)
 607{
 608        return cpu_online_mask;
 609}
 610
 611static inline
 612struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 613{
 614        return &cpu_rq(cpu)->rt;
 615}
 616
 617static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 618{
 619        return &def_rt_bandwidth;
 620}
 621
 622#endif /* CONFIG_RT_GROUP_SCHED */
 623
 624bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 625{
 626        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 627
 628        return (hrtimer_active(&rt_b->rt_period_timer) ||
 629                rt_rq->rt_time < rt_b->rt_runtime);
 630}
 631
 632#ifdef CONFIG_SMP
 633/*
 634 * We ran out of runtime, see if we can borrow some from our neighbours.
 635 */
 636static void do_balance_runtime(struct rt_rq *rt_rq)
 637{
 638        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 639        struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 640        int i, weight;
 641        u64 rt_period;
 642
 643        weight = cpumask_weight(rd->span);
 644
 645        raw_spin_lock(&rt_b->rt_runtime_lock);
 646        rt_period = ktime_to_ns(rt_b->rt_period);
 647        for_each_cpu(i, rd->span) {
 648                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 649                s64 diff;
 650
 651                if (iter == rt_rq)
 652                        continue;
 653
 654                raw_spin_lock(&iter->rt_runtime_lock);
 655                /*
 656                 * Either all rqs have inf runtime and there's nothing to steal
 657                 * or __disable_runtime() below sets a specific rq to inf to
  658                 * indicate it's been disabled and disallow stealing.
 659                 */
 660                if (iter->rt_runtime == RUNTIME_INF)
 661                        goto next;
 662
 663                /*
 664                 * From runqueues with spare time, take 1/n part of their
 665                 * spare time, but no more than our period.
 666                 */
 667                diff = iter->rt_runtime - iter->rt_time;
 668                if (diff > 0) {
 669                        diff = div_u64((u64)diff, weight);
 670                        if (rt_rq->rt_runtime + diff > rt_period)
 671                                diff = rt_period - rt_rq->rt_runtime;
 672                        iter->rt_runtime -= diff;
 673                        rt_rq->rt_runtime += diff;
 674                        if (rt_rq->rt_runtime == rt_period) {
 675                                raw_spin_unlock(&iter->rt_runtime_lock);
 676                                break;
 677                        }
 678                }
 679next:
 680                raw_spin_unlock(&iter->rt_runtime_lock);
 681        }
 682        raw_spin_unlock(&rt_b->rt_runtime_lock);
 683}
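/*
 * Editor's worked example of the loop above, assuming the default knobs
 * (rt_period = 1000 ms, rt_runtime = 950 ms) and a 4-CPU root domain
 * (weight = 4).  Suppose this rt_rq exhausted its 950 ms and one neighbour
 * has consumed only 150 ms:
 *
 *   diff = 950 ms - 150 ms   = 800 ms   (neighbour's spare time)
 *   diff = 800 ms / 4        = 200 ms   (take 1/weight of it)
 *   950 ms + 200 ms          = 1150 ms  > rt_period, so clamp:
 *   diff = 1000 ms - 950 ms  = 50 ms
 *
 * leaving the neighbour with 900 ms, this rt_rq with a full 1000 ms, and
 * since rt_runtime now equals rt_period the loop stops borrowing.
 */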
 684
 685/*
  686 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 687 */
 688static void __disable_runtime(struct rq *rq)
 689{
 690        struct root_domain *rd = rq->rd;
 691        rt_rq_iter_t iter;
 692        struct rt_rq *rt_rq;
 693
 694        if (unlikely(!scheduler_running))
 695                return;
 696
 697        for_each_rt_rq(rt_rq, iter, rq) {
 698                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 699                s64 want;
 700                int i;
 701
 702                raw_spin_lock(&rt_b->rt_runtime_lock);
 703                raw_spin_lock(&rt_rq->rt_runtime_lock);
 704                /*
 705                 * Either we're all inf and nobody needs to borrow, or we're
 706                 * already disabled and thus have nothing to do, or we have
 707                 * exactly the right amount of runtime to take out.
 708                 */
 709                if (rt_rq->rt_runtime == RUNTIME_INF ||
 710                                rt_rq->rt_runtime == rt_b->rt_runtime)
 711                        goto balanced;
 712                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 713
 714                /*
 715                 * Calculate the difference between what we started out with
  716                 * and what we currently have; that's the amount of runtime
  717                 * we lent out and now have to reclaim.
 718                 */
 719                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 720
 721                /*
 722                 * Greedy reclaim, take back as much as we can.
 723                 */
 724                for_each_cpu(i, rd->span) {
 725                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 726                        s64 diff;
 727
 728                        /*
 729                         * Can't reclaim from ourselves or disabled runqueues.
 730                         */
 731                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 732                                continue;
 733
 734                        raw_spin_lock(&iter->rt_runtime_lock);
 735                        if (want > 0) {
 736                                diff = min_t(s64, iter->rt_runtime, want);
 737                                iter->rt_runtime -= diff;
 738                                want -= diff;
 739                        } else {
 740                                iter->rt_runtime -= want;
 741                                want -= want;
 742                        }
 743                        raw_spin_unlock(&iter->rt_runtime_lock);
 744
 745                        if (!want)
 746                                break;
 747                }
 748
 749                raw_spin_lock(&rt_rq->rt_runtime_lock);
 750                /*
 751                 * We cannot be left wanting - that would mean some runtime
 752                 * leaked out of the system.
 753                 */
 754                BUG_ON(want);
 755balanced:
 756                /*
 757                 * Disable all the borrow logic by pretending we have inf
 758                 * runtime - in which case borrowing doesn't make sense.
 759                 */
 760                rt_rq->rt_runtime = RUNTIME_INF;
 761                rt_rq->rt_throttled = 0;
 762                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 763                raw_spin_unlock(&rt_b->rt_runtime_lock);
 764
 765                /* Make rt_rq available for pick_next_task() */
 766                sched_rt_rq_enqueue(rt_rq);
 767        }
 768}
 769
 770static void __enable_runtime(struct rq *rq)
 771{
 772        rt_rq_iter_t iter;
 773        struct rt_rq *rt_rq;
 774
 775        if (unlikely(!scheduler_running))
 776                return;
 777
 778        /*
 779         * Reset each runqueue's bandwidth settings
 780         */
 781        for_each_rt_rq(rt_rq, iter, rq) {
 782                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 783
 784                raw_spin_lock(&rt_b->rt_runtime_lock);
 785                raw_spin_lock(&rt_rq->rt_runtime_lock);
 786                rt_rq->rt_runtime = rt_b->rt_runtime;
 787                rt_rq->rt_time = 0;
 788                rt_rq->rt_throttled = 0;
 789                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 790                raw_spin_unlock(&rt_b->rt_runtime_lock);
 791        }
 792}
 793
 794static void balance_runtime(struct rt_rq *rt_rq)
 795{
 796        if (!sched_feat(RT_RUNTIME_SHARE))
 797                return;
 798
 799        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 800                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 801                do_balance_runtime(rt_rq);
 802                raw_spin_lock(&rt_rq->rt_runtime_lock);
 803        }
 804}
 805#else /* !CONFIG_SMP */
 806static inline void balance_runtime(struct rt_rq *rt_rq) {}
 807#endif /* CONFIG_SMP */
 808
 809static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 810{
 811        int i, idle = 1, throttled = 0;
 812        const struct cpumask *span;
 813
 814        span = sched_rt_period_mask();
 815#ifdef CONFIG_RT_GROUP_SCHED
 816        /*
 817         * FIXME: isolated CPUs should really leave the root task group,
 818         * whether they are isolcpus or were isolated via cpusets, lest
 819         * the timer run on a CPU which does not service all runqueues,
 820         * potentially leaving other CPUs indefinitely throttled.  If
 821         * isolation is really required, the user will turn the throttle
 822         * off to kill the perturbations it causes anyway.  Meanwhile,
 823         * this maintains functionality for boot and/or troubleshooting.
 824         */
 825        if (rt_b == &root_task_group.rt_bandwidth)
 826                span = cpu_online_mask;
 827#endif
 828        for_each_cpu(i, span) {
 829                int enqueue = 0;
 830                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 831                struct rq *rq = rq_of_rt_rq(rt_rq);
 832                int skip;
 833
 834                /*
 835                 * When span == cpu_online_mask, taking each rq->lock
 836                 * can be time-consuming. Try to avoid it when possible.
 837                 */
 838                raw_spin_lock(&rt_rq->rt_runtime_lock);
 839                if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
 840                        rt_rq->rt_runtime = rt_b->rt_runtime;
 841                skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
 842                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 843                if (skip)
 844                        continue;
 845
 846                raw_spin_lock(&rq->lock);
 847                update_rq_clock(rq);
 848
 849                if (rt_rq->rt_time) {
 850                        u64 runtime;
 851
 852                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 853                        if (rt_rq->rt_throttled)
 854                                balance_runtime(rt_rq);
 855                        runtime = rt_rq->rt_runtime;
 856                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 857                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 858                                rt_rq->rt_throttled = 0;
 859                                enqueue = 1;
 860
 861                                /*
 862                                 * When we're idle and a woken (rt) task is
  863                                 * throttled, check_preempt_curr() will set
  864                                 * skip_update, and the time between the wakeup
 865                                 * and this unthrottle will get accounted as
 866                                 * 'runtime'.
 867                                 */
 868                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 869                                        rq_clock_cancel_skipupdate(rq);
 870                        }
 871                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 872                                idle = 0;
 873                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 874                } else if (rt_rq->rt_nr_running) {
 875                        idle = 0;
 876                        if (!rt_rq_throttled(rt_rq))
 877                                enqueue = 1;
 878                }
 879                if (rt_rq->rt_throttled)
 880                        throttled = 1;
 881
 882                if (enqueue)
 883                        sched_rt_rq_enqueue(rt_rq);
 884                raw_spin_unlock(&rq->lock);
 885        }
 886
 887        if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 888                return 1;
 889
 890        return idle;
 891}
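/*
 * Editor's worked example of the replenishment above, assuming the default
 * 1000 ms period / 950 ms runtime: if a throttled rt_rq carries
 * rt_time = 1200 ms when the period timer fires with overrun = 1, then
 *
 *   rt_time -= min(1200 ms, 1 * 950 ms)   ->   rt_time = 250 ms
 *
 * and since 250 ms < 950 ms the rq is unthrottled and re-enqueued; had
 * rt_time still exceeded the runtime, it would stay throttled for at least
 * one more period.
 */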
 892
 893static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 894{
 895#ifdef CONFIG_RT_GROUP_SCHED
 896        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 897
 898        if (rt_rq)
 899                return rt_rq->highest_prio.curr;
 900#endif
 901
 902        return rt_task_of(rt_se)->prio;
 903}
 904
 905static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 906{
 907        u64 runtime = sched_rt_runtime(rt_rq);
 908
 909        if (rt_rq->rt_throttled)
 910                return rt_rq_throttled(rt_rq);
 911
 912        if (runtime >= sched_rt_period(rt_rq))
 913                return 0;
 914
 915        balance_runtime(rt_rq);
 916        runtime = sched_rt_runtime(rt_rq);
 917        if (runtime == RUNTIME_INF)
 918                return 0;
 919
 920        if (rt_rq->rt_time > runtime) {
 921                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 922
 923                /*
 924                 * Don't actually throttle groups that have no runtime assigned
 925                 * but accrue some time due to boosting.
 926                 */
 927                if (likely(rt_b->rt_runtime)) {
 928                        rt_rq->rt_throttled = 1;
 929                        printk_deferred_once("sched: RT throttling activated\n");
 930                } else {
 931                        /*
  932                         * In case we did anyway, make it go away;
  933                         * replenishment is a joke, since it would replenish us
  934                         * with exactly 0 ns.
 935                         */
 936                        rt_rq->rt_time = 0;
 937                }
 938
 939                if (rt_rq_throttled(rt_rq)) {
 940                        sched_rt_rq_dequeue(rt_rq);
 941                        return 1;
 942                }
 943        }
 944
 945        return 0;
 946}
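/*
 * Editor's example of the throttle trigger above, assuming the default
 * sysctls (sched_rt_period_us = 1000000, sched_rt_runtime_us = 950000):
 * once an rt_rq accumulates more than 950 ms of RT execution within one
 * 1000 ms period and borrowing cannot raise its budget further, it is
 * marked throttled and dequeued, leaving the remaining ~50 ms to non-RT
 * tasks until do_sched_rt_period_timer() replenishes it.  Writing -1 to
 * sched_rt_runtime_us disables throttling (RUNTIME_INF).
 */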
 947
 948/*
 949 * Update the current task's runtime statistics. Skip current tasks that
 950 * are not in our scheduling class.
 951 */
 952static void update_curr_rt(struct rq *rq)
 953{
 954        struct task_struct *curr = rq->curr;
 955        struct sched_rt_entity *rt_se = &curr->rt;
 956        u64 delta_exec;
 957        u64 now;
 958
 959        if (curr->sched_class != &rt_sched_class)
 960                return;
 961
 962        now = rq_clock_task(rq);
 963        delta_exec = now - curr->se.exec_start;
 964        if (unlikely((s64)delta_exec <= 0))
 965                return;
 966
 967        schedstat_set(curr->se.statistics.exec_max,
 968                      max(curr->se.statistics.exec_max, delta_exec));
 969
 970        curr->se.sum_exec_runtime += delta_exec;
 971        account_group_exec_runtime(curr, delta_exec);
 972
 973        curr->se.exec_start = now;
 974        cgroup_account_cputime(curr, delta_exec);
 975
 976        sched_rt_avg_update(rq, delta_exec);
 977
 978        if (!rt_bandwidth_enabled())
 979                return;
 980
 981        for_each_sched_rt_entity(rt_se) {
 982                struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 983
 984                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 985                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 986                        rt_rq->rt_time += delta_exec;
 987                        if (sched_rt_runtime_exceeded(rt_rq))
 988                                resched_curr(rq);
 989                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 990                }
 991        }
 992}
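/*
 * Editor's note on the loop above: with CONFIG_RT_GROUP_SCHED the same
 * delta_exec is charged to every rt_rq on the path from the task's group
 * up to the root, so a child's execution counts against its own budget and
 * against each ancestor's budget; whichever level exceeds its runtime
 * first gets throttled and forces a reschedule.
 */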
 993
 994static void
 995dequeue_top_rt_rq(struct rt_rq *rt_rq)
 996{
 997        struct rq *rq = rq_of_rt_rq(rt_rq);
 998
 999        BUG_ON(&rq->rt != rt_rq);
1000
1001        if (!rt_rq->rt_queued)
1002                return;
1003
1004        BUG_ON(!rq->nr_running);
1005
1006        sub_nr_running(rq, rt_rq->rt_nr_running);
1007        rt_rq->rt_queued = 0;
1008
1009}
1010
1011static void
1012enqueue_top_rt_rq(struct rt_rq *rt_rq)
1013{
1014        struct rq *rq = rq_of_rt_rq(rt_rq);
1015
1016        BUG_ON(&rq->rt != rt_rq);
1017
1018        if (rt_rq->rt_queued)
1019                return;
1020
1021        if (rt_rq_throttled(rt_rq))
1022                return;
1023
1024        if (rt_rq->rt_nr_running) {
1025                add_nr_running(rq, rt_rq->rt_nr_running);
1026                rt_rq->rt_queued = 1;
1027        }
1028
1029        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1030        cpufreq_update_util(rq, 0);
1031}
1032
1033#if defined CONFIG_SMP
1034
1035static void
1036inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1037{
1038        struct rq *rq = rq_of_rt_rq(rt_rq);
1039
1040#ifdef CONFIG_RT_GROUP_SCHED
1041        /*
1042         * Change rq's cpupri only if rt_rq is the top queue.
1043         */
1044        if (&rq->rt != rt_rq)
1045                return;
1046#endif
1047        if (rq->online && prio < prev_prio)
1048                cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1049}
1050
1051static void
1052dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1053{
1054        struct rq *rq = rq_of_rt_rq(rt_rq);
1055
1056#ifdef CONFIG_RT_GROUP_SCHED
1057        /*
1058         * Change rq's cpupri only if rt_rq is the top queue.
1059         */
1060        if (&rq->rt != rt_rq)
1061                return;
1062#endif
1063        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1064                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1065}
1066
1067#else /* CONFIG_SMP */
1068
1069static inline
1070void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1071static inline
1072void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1073
1074#endif /* CONFIG_SMP */
1075
1076#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1077static void
1078inc_rt_prio(struct rt_rq *rt_rq, int prio)
1079{
1080        int prev_prio = rt_rq->highest_prio.curr;
1081
1082        if (prio < prev_prio)
1083                rt_rq->highest_prio.curr = prio;
1084
1085        inc_rt_prio_smp(rt_rq, prio, prev_prio);
1086}
1087
1088static void
1089dec_rt_prio(struct rt_rq *rt_rq, int prio)
1090{
1091        int prev_prio = rt_rq->highest_prio.curr;
1092
1093        if (rt_rq->rt_nr_running) {
1094
1095                WARN_ON(prio < prev_prio);
1096
1097                /*
1098                 * This may have been our highest task, and therefore
1099                 * we may have some recomputation to do
1100                 */
1101                if (prio == prev_prio) {
1102                        struct rt_prio_array *array = &rt_rq->active;
1103
1104                        rt_rq->highest_prio.curr =
1105                                sched_find_first_bit(array->bitmap);
1106                }
1107
1108        } else
1109                rt_rq->highest_prio.curr = MAX_RT_PRIO;
1110
1111        dec_rt_prio_smp(rt_rq, prio, prev_prio);
1112}
1113
1114#else
1115
1116static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1117static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1118
1119#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1120
1121#ifdef CONFIG_RT_GROUP_SCHED
1122
1123static void
1124inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1125{
1126        if (rt_se_boosted(rt_se))
1127                rt_rq->rt_nr_boosted++;
1128
1129        if (rt_rq->tg)
1130                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1131}
1132
1133static void
1134dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1135{
1136        if (rt_se_boosted(rt_se))
1137                rt_rq->rt_nr_boosted--;
1138
1139        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1140}
1141
1142#else /* CONFIG_RT_GROUP_SCHED */
1143
1144static void
1145inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1146{
1147        start_rt_bandwidth(&def_rt_bandwidth);
1148}
1149
1150static inline
1151void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1152
1153#endif /* CONFIG_RT_GROUP_SCHED */
1154
1155static inline
1156unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1157{
1158        struct rt_rq *group_rq = group_rt_rq(rt_se);
1159
1160        if (group_rq)
1161                return group_rq->rt_nr_running;
1162        else
1163                return 1;
1164}
1165
1166static inline
1167unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1168{
1169        struct rt_rq *group_rq = group_rt_rq(rt_se);
1170        struct task_struct *tsk;
1171
1172        if (group_rq)
1173                return group_rq->rr_nr_running;
1174
1175        tsk = rt_task_of(rt_se);
1176
1177        return (tsk->policy == SCHED_RR) ? 1 : 0;
1178}
1179
1180static inline
1181void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1182{
1183        int prio = rt_se_prio(rt_se);
1184
1185        WARN_ON(!rt_prio(prio));
1186        rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1187        rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1188
1189        inc_rt_prio(rt_rq, prio);
1190        inc_rt_migration(rt_se, rt_rq);
1191        inc_rt_group(rt_se, rt_rq);
1192}
1193
1194static inline
1195void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1196{
1197        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1198        WARN_ON(!rt_rq->rt_nr_running);
1199        rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1200        rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1201
1202        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1203        dec_rt_migration(rt_se, rt_rq);
1204        dec_rt_group(rt_se, rt_rq);
1205}
1206
1207/*
1208 * Change rt_se->run_list location unless SAVE && !MOVE
1209 *
1210 * assumes ENQUEUE/DEQUEUE flags match
1211 */
1212static inline bool move_entity(unsigned int flags)
1213{
1214        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1215                return false;
1216
1217        return true;
1218}
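/*
 * Editor's sketch of the flag combinations handled by move_entity()
 * (DEQUEUE_SAVE/DEQUEUE_MOVE share their bit values with
 * ENQUEUE_RESTORE/ENQUEUE_MOVE, which is what the comment above relies on):
 *
 *   (flags & (SAVE | MOVE)) == 0            -> plain (de)queue: move it
 *   (flags & (SAVE | MOVE)) == SAVE         -> attribute change in
 *                                              progress: keep list position
 *   (flags & (SAVE | MOVE)) == SAVE | MOVE  -> e.g. migration: move it
 *
 * Only the SAVE-without-MOVE case leaves rt_se->run_list untouched.
 */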
1219
1220static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1221{
1222        list_del_init(&rt_se->run_list);
1223
1224        if (list_empty(array->queue + rt_se_prio(rt_se)))
1225                __clear_bit(rt_se_prio(rt_se), array->bitmap);
1226
1227        rt_se->on_list = 0;
1228}
1229
1230static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1231{
1232        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1233        struct rt_prio_array *array = &rt_rq->active;
1234        struct rt_rq *group_rq = group_rt_rq(rt_se);
1235        struct list_head *queue = array->queue + rt_se_prio(rt_se);
1236
1237        /*
 1238         * Don't enqueue the group if it's throttled, or when it is empty.
 1239         * The latter is a consequence of the former when a child group
 1240         * gets throttled and the current group doesn't have any other
1241         * active members.
1242         */
1243        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1244                if (rt_se->on_list)
1245                        __delist_rt_entity(rt_se, array);
1246                return;
1247        }
1248
1249        if (move_entity(flags)) {
1250                WARN_ON_ONCE(rt_se->on_list);
1251                if (flags & ENQUEUE_HEAD)
1252                        list_add(&rt_se->run_list, queue);
1253                else
1254                        list_add_tail(&rt_se->run_list, queue);
1255
1256                __set_bit(rt_se_prio(rt_se), array->bitmap);
1257                rt_se->on_list = 1;
1258        }
1259        rt_se->on_rq = 1;
1260
1261        inc_rt_tasks(rt_se, rt_rq);
1262}
1263
1264static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1265{
1266        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1267        struct rt_prio_array *array = &rt_rq->active;
1268
1269        if (move_entity(flags)) {
1270                WARN_ON_ONCE(!rt_se->on_list);
1271                __delist_rt_entity(rt_se, array);
1272        }
1273        rt_se->on_rq = 0;
1274
1275        dec_rt_tasks(rt_se, rt_rq);
1276}
1277
1278/*
1279 * Because the prio of an upper entry depends on the lower
 1280 * entries, we must remove entries top-down.
1281 */
1282static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1283{
1284        struct sched_rt_entity *back = NULL;
1285
1286        for_each_sched_rt_entity(rt_se) {
1287                rt_se->back = back;
1288                back = rt_se;
1289        }
1290
1291        dequeue_top_rt_rq(rt_rq_of_se(back));
1292
1293        for (rt_se = back; rt_se; rt_se = rt_se->back) {
1294                if (on_rt_rq(rt_se))
1295                        __dequeue_rt_entity(rt_se, flags);
1296        }
1297}
1298
1299static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1300{
1301        struct rq *rq = rq_of_rt_se(rt_se);
1302
1303        dequeue_rt_stack(rt_se, flags);
1304        for_each_sched_rt_entity(rt_se)
1305                __enqueue_rt_entity(rt_se, flags);
1306        enqueue_top_rt_rq(&rq->rt);
1307}
1308
1309static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1310{
1311        struct rq *rq = rq_of_rt_se(rt_se);
1312
1313        dequeue_rt_stack(rt_se, flags);
1314
1315        for_each_sched_rt_entity(rt_se) {
1316                struct rt_rq *rt_rq = group_rt_rq(rt_se);
1317
1318                if (rt_rq && rt_rq->rt_nr_running)
1319                        __enqueue_rt_entity(rt_se, flags);
1320        }
1321        enqueue_top_rt_rq(&rq->rt);
1322}
1323
1324/*
1325 * Adding/removing a task to/from a priority array:
1326 */
1327static void
1328enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1329{
1330        struct sched_rt_entity *rt_se = &p->rt;
1331
1332        if (flags & ENQUEUE_WAKEUP)
1333                rt_se->timeout = 0;
1334
1335        enqueue_rt_entity(rt_se, flags);
1336
1337        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1338                enqueue_pushable_task(rq, p);
1339}
1340
1341static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1342{
1343        struct sched_rt_entity *rt_se = &p->rt;
1344
1345        update_curr_rt(rq);
1346        dequeue_rt_entity(rt_se, flags);
1347
1348        dequeue_pushable_task(rq, p);
1349}
1350
1351/*
 1352 * Put the task at the head or the tail of the run list without the overhead of
1353 * dequeue followed by enqueue.
1354 */
1355static void
1356requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1357{
1358        if (on_rt_rq(rt_se)) {
1359                struct rt_prio_array *array = &rt_rq->active;
1360                struct list_head *queue = array->queue + rt_se_prio(rt_se);
1361
1362                if (head)
1363                        list_move(&rt_se->run_list, queue);
1364                else
1365                        list_move_tail(&rt_se->run_list, queue);
1366        }
1367}
1368
1369static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1370{
1371        struct sched_rt_entity *rt_se = &p->rt;
1372        struct rt_rq *rt_rq;
1373
1374        for_each_sched_rt_entity(rt_se) {
1375                rt_rq = rt_rq_of_se(rt_se);
1376                requeue_rt_entity(rt_rq, rt_se, head);
1377        }
1378}
1379
1380static void yield_task_rt(struct rq *rq)
1381{
1382        requeue_task_rt(rq, rq->curr, 0);
1383}
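/*
 * Editor's user-space illustration of the policies this class implements
 * (standard POSIX API, not part of the kernel source; requires
 * CAP_SYS_NICE or suitable rlimits):
 *
 *   #include <sched.h>
 *
 *   struct sched_param sp = { .sched_priority = 10 };
 *
 *   sched_setscheduler(0, SCHED_RR, &sp);  // round-robin at RT prio 10
 *   ...
 *   sched_yield();   // ends up in yield_task_rt() above, which requeues
 *                    // the task at the tail of its priority list
 *
 * SCHED_FIFO tasks go through the same enqueue/dequeue paths but have no
 * timeslice, so they only rotate via explicit yields or blocking.
 */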
1384
1385#ifdef CONFIG_SMP
1386static int find_lowest_rq(struct task_struct *task);
1387
1388static int
1389select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1390{
1391        struct task_struct *curr;
1392        struct rq *rq;
1393
1394        /* For anything but wake ups, just return the task_cpu */
1395        if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1396                goto out;
1397
1398        rq = cpu_rq(cpu);
1399
1400        rcu_read_lock();
1401        curr = READ_ONCE(rq->curr); /* unlocked access */
1402
1403        /*
1404         * If the current task on @p's runqueue is an RT task, then
1405         * try to see if we can wake this RT task up on another
1406         * runqueue. Otherwise simply start this RT task
1407         * on its current runqueue.
1408         *
1409         * We want to avoid overloading runqueues. If the woken
1410         * task is a higher priority, then it will stay on this CPU
1411         * and the lower prio task should be moved to another CPU.
1412         * Even though this will probably make the lower prio task
1413         * lose its cache, we do not want to bounce a higher task
1414         * around just because it gave up its CPU, perhaps for a
1415         * lock?
1416         *
1417         * For equal prio tasks, we just let the scheduler sort it out.
1418         *
1419         * Otherwise, just let it ride on the affined RQ and the
1420         * post-schedule router will push the preempted task away
1421         *
1422         * This test is optimistic, if we get it wrong the load-balancer
1423         * will have to sort it out.
1424         */
1425        if (curr && unlikely(rt_task(curr)) &&
1426            (curr->nr_cpus_allowed < 2 ||
1427             curr->prio <= p->prio)) {
1428                int target = find_lowest_rq(p);
1429
1430                /*
1431                 * Don't bother moving it if the destination CPU is
1432                 * not running a lower priority task.
1433                 */
1434                if (target != -1 &&
1435                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
1436                        cpu = target;
1437        }
1438        rcu_read_unlock();
1439
1440out:
1441        return cpu;
1442}
1443
1444static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1445{
1446        /*
1447         * Current can't be migrated, useless to reschedule,
1448         * let's hope p can move out.
1449         */
1450        if (rq->curr->nr_cpus_allowed == 1 ||
1451            !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1452                return;
1453
1454        /*
1455         * p is migratable, so let's not schedule it and
1456         * see if it is pushed or pulled somewhere else.
1457         */
1458        if (p->nr_cpus_allowed != 1
1459            && cpupri_find(&rq->rd->cpupri, p, NULL))
1460                return;
1461
1462        /*
1463         * There appear to be other CPUs that can accept
 1464         * the current task but none can run 'p', so let's reschedule
 1465         * to try to push the current task away:
1466         */
1467        requeue_task_rt(rq, p, 1);
1468        resched_curr(rq);
1469}
1470
1471#endif /* CONFIG_SMP */
1472
1473/*
1474 * Preempt the current task with a newly woken task if needed:
1475 */
1476static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1477{
1478        if (p->prio < rq->curr->prio) {
1479                resched_curr(rq);
1480                return;
1481        }
1482
1483#ifdef CONFIG_SMP
1484        /*
1485         * If:
1486         *
1487         * - the newly woken task is of equal priority to the current task
1488         * - the newly woken task is non-migratable while current is migratable
1489         * - current will be preempted on the next reschedule
1490         *
1491         * we should check to see if current can readily move to a different
1492         * cpu.  If so, we will reschedule to allow the push logic to try
1493         * to move current somewhere else, making room for our non-migratable
1494         * task.
1495         */
1496        if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1497                check_preempt_equal_prio(rq, p);
1498#endif
1499}
1500
1501static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1502                                                   struct rt_rq *rt_rq)
1503{
1504        struct rt_prio_array *array = &rt_rq->active;
1505        struct sched_rt_entity *next = NULL;
1506        struct list_head *queue;
1507        int idx;
1508
1509        idx = sched_find_first_bit(array->bitmap);
1510        BUG_ON(idx >= MAX_RT_PRIO);
1511
1512        queue = array->queue + idx;
1513        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1514
1515        return next;
1516}
1517
1518static struct task_struct *_pick_next_task_rt(struct rq *rq)
1519{
1520        struct sched_rt_entity *rt_se;
1521        struct task_struct *p;
1522        struct rt_rq *rt_rq  = &rq->rt;
1523
1524        do {
1525                rt_se = pick_next_rt_entity(rq, rt_rq);
1526                BUG_ON(!rt_se);
1527                rt_rq = group_rt_rq(rt_se);
1528        } while (rt_rq);
1529
1530        p = rt_task_of(rt_se);
1531        p->se.exec_start = rq_clock_task(rq);
1532
1533        return p;
1534}
1535
1536static struct task_struct *
1537pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1538{
1539        struct task_struct *p;
1540        struct rt_rq *rt_rq = &rq->rt;
1541
1542        if (need_pull_rt_task(rq, prev)) {
1543                /*
 1544                 * This is OK because current is on_cpu, which keeps it from being
 1545                 * picked for load balancing; preemption/IRQs are still
 1546                 * disabled, avoiding further scheduler activity on it; and we're
 1547                 * being very careful to re-start the picking loop.
1548                 */
1549                rq_unpin_lock(rq, rf);
1550                pull_rt_task(rq);
1551                rq_repin_lock(rq, rf);
1552                /*
1553                 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1554                 * means a dl or stop task can slip in, in which case we need
1555                 * to re-start task selection.
1556                 */
1557                if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1558                             rq->dl.dl_nr_running))
1559                        return RETRY_TASK;
1560        }
1561
1562        /*
1563         * We may dequeue prev's rt_rq in put_prev_task().
1564         * So, we update time before rt_nr_running check.
1565         */
1566        if (prev->sched_class == &rt_sched_class)
1567                update_curr_rt(rq);
1568
1569        if (!rt_rq->rt_queued)
1570                return NULL;
1571
1572        put_prev_task(rq, prev);
1573
1574        p = _pick_next_task_rt(rq);
1575
1576        /* The running task is never eligible for pushing */
1577        dequeue_pushable_task(rq, p);
1578
1579        rt_queue_push_tasks(rq);
1580
1581        return p;
1582}
1583
1584static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1585{
1586        update_curr_rt(rq);
1587
1588        /*
1589         * The previous task needs to be made eligible for pushing
1590         * if it is still active
1591         */
1592        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1593                enqueue_pushable_task(rq, p);
1594}
1595
1596#ifdef CONFIG_SMP
1597
1598/* Only try algorithms three times */
1599#define RT_MAX_TRIES 3
1600
1601static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1602{
1603        if (!task_running(rq, p) &&
1604            cpumask_test_cpu(cpu, &p->cpus_allowed))
1605                return 1;
1606
1607        return 0;
1608}
1609
1610/*
 1611 * Return the highest-priority pushable task on this rq that can run on
 1612 * the given CPU, or NULL if there is none.
1613 */
1614static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1615{
1616        struct plist_head *head = &rq->rt.pushable_tasks;
1617        struct task_struct *p;
1618
1619        if (!has_pushable_tasks(rq))
1620                return NULL;
1621
1622        plist_for_each_entry(p, head, pushable_tasks) {
1623                if (pick_rt_task(rq, p, cpu))
1624                        return p;
1625        }
1626
1627        return NULL;
1628}
1629
1630static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1631
1632static int find_lowest_rq(struct task_struct *task)
1633{
1634        struct sched_domain *sd;
1635        struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1636        int this_cpu = smp_processor_id();
1637        int cpu      = task_cpu(task);
1638
1639        /* Make sure the mask is initialized first */
1640        if (unlikely(!lowest_mask))
1641                return -1;
1642
1643        if (task->nr_cpus_allowed == 1)
1644                return -1; /* No other targets possible */
1645
1646        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1647                return -1; /* No targets found */
1648
1649        /*
1650         * At this point we have built a mask of CPUs representing the
1651         * lowest priority tasks in the system.  Now we want to elect
1652         * the best one based on our affinity and topology.
1653         *
1654         * We prioritize the last CPU that the task executed on since
1655         * it is most likely cache-hot in that location.
1656         */
1657        if (cpumask_test_cpu(cpu, lowest_mask))
1658                return cpu;
1659
1660        /*
1661         * Otherwise, we consult the sched_domains span maps to figure
1662         * out which CPU is logically closest to our hot cache data.
1663         */
1664        if (!cpumask_test_cpu(this_cpu, lowest_mask))
1665                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1666
1667        rcu_read_lock();
1668        for_each_domain(cpu, sd) {
1669                if (sd->flags & SD_WAKE_AFFINE) {
1670                        int best_cpu;
1671
1672                        /*
1673                         * "this_cpu" is cheaper to preempt than a
1674                         * remote processor.
1675                         */
1676                        if (this_cpu != -1 &&
1677                            cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1678                                rcu_read_unlock();
1679                                return this_cpu;
1680                        }
1681
1682                        best_cpu = cpumask_first_and(lowest_mask,
1683                                                     sched_domain_span(sd));
1684                        if (best_cpu < nr_cpu_ids) {
1685                                rcu_read_unlock();
1686                                return best_cpu;
1687                        }
1688                }
1689        }
1690        rcu_read_unlock();
1691
1692        /*
1693         * And finally, if there were no matches within the domains
1694         * just give the caller *something* to work with from the compatible
1695         * locations.
1696         */
1697        if (this_cpu != -1)
1698                return this_cpu;
1699
1700        cpu = cpumask_any(lowest_mask);
1701        if (cpu < nr_cpu_ids)
1702                return cpu;
1703
1704        return -1;
1705}
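    /*
     * A rough walk-through of the selection above (illustrative numbers,
     * not taken from any real topology): on a 4-CPU system where
     * cpupri_find() reports CPUs 1 and 3 as running the lowest-priority
     * work, lowest_mask = {1, 3}.  If the task last ran on CPU 3 we return
     * 3 right away (likely cache-hot).  Otherwise, if this_cpu is in the
     * mask and shares an SD_WAKE_AFFINE domain with the task's CPU, we
     * prefer this_cpu; failing that, the first mask bit inside the domain
     * span wins, and only then do we fall back to cpumask_any().
     */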
1706
1707/* Will lock the rq it finds */
1708static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1709{
1710        struct rq *lowest_rq = NULL;
1711        int tries;
1712        int cpu;
1713
1714        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1715                cpu = find_lowest_rq(task);
1716
1717                if ((cpu == -1) || (cpu == rq->cpu))
1718                        break;
1719
1720                lowest_rq = cpu_rq(cpu);
1721
1722                if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1723                        /*
1724                         * Target rq has tasks of equal or higher priority,
1725                         * retrying does not release any lock and is unlikely
1726                         * to yield a different result.
1727                         */
1728                        lowest_rq = NULL;
1729                        break;
1730                }
1731
1732                /* if the prio of this runqueue changed, try again */
1733                if (double_lock_balance(rq, lowest_rq)) {
1734                        /*
1735                         * We had to unlock the run queue. In the
1736                         * meantime, the task could have migrated or
1737                         * had its affinity changed.
1738                         * Also make sure that it wasn't scheduled on its rq.
1739                         */
1740                        if (unlikely(task_rq(task) != rq ||
1741                                     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
1742                                     task_running(rq, task) ||
1743                                     !rt_task(task) ||
1744                                     !task_on_rq_queued(task))) {
1745
1746                                double_unlock_balance(rq, lowest_rq);
1747                                lowest_rq = NULL;
1748                                break;
1749                        }
1750                }
1751
1752                /* If this rq is still suitable use it. */
1753                if (lowest_rq->rt.highest_prio.curr > task->prio)
1754                        break;
1755
1756                /* try again */
1757                double_unlock_balance(rq, lowest_rq);
1758                lowest_rq = NULL;
1759        }
1760
1761        return lowest_rq;
1762}
1763
1764static struct task_struct *pick_next_pushable_task(struct rq *rq)
1765{
1766        struct task_struct *p;
1767
1768        if (!has_pushable_tasks(rq))
1769                return NULL;
1770
1771        p = plist_first_entry(&rq->rt.pushable_tasks,
1772                              struct task_struct, pushable_tasks);
1773
1774        BUG_ON(rq->cpu != task_cpu(p));
1775        BUG_ON(task_current(rq, p));
1776        BUG_ON(p->nr_cpus_allowed <= 1);
1777
1778        BUG_ON(!task_on_rq_queued(p));
1779        BUG_ON(!rt_task(p));
1780
1781        return p;
1782}
1783
1784/*
1785 * If the current CPU has more than one RT task, see if the non-running
1786 * task can migrate over to a CPU that is running a task of lesser
1787 * priority.
1788 */
1789static int push_rt_task(struct rq *rq)
1790{
1791        struct task_struct *next_task;
1792        struct rq *lowest_rq;
1793        int ret = 0;
1794
1795        if (!rq->rt.overloaded)
1796                return 0;
1797
1798        next_task = pick_next_pushable_task(rq);
1799        if (!next_task)
1800                return 0;
1801
1802retry:
1803        if (unlikely(next_task == rq->curr)) {
1804                WARN_ON(1);
1805                return 0;
1806        }
1807
1808        /*
1809         * It's possible that the next_task slipped in with a
1810         * higher priority than current. If that's the case,
1811         * just reschedule current.
1812         */
1813        if (unlikely(next_task->prio < rq->curr->prio)) {
1814                resched_curr(rq);
1815                return 0;
1816        }
1817
1818        /* We might release rq lock */
1819        get_task_struct(next_task);
1820
1821        /* find_lock_lowest_rq locks the rq if found */
1822        lowest_rq = find_lock_lowest_rq(next_task, rq);
1823        if (!lowest_rq) {
1824                struct task_struct *task;
1825                /*
1826                 * find_lock_lowest_rq releases rq->lock
1827                 * so it is possible that next_task has migrated.
1828                 *
1829                 * We need to make sure that the task is still on the same
1830                 * run-queue and is also still the next task eligible for
1831                 * pushing.
1832                 */
1833                task = pick_next_pushable_task(rq);
1834                if (task == next_task) {
1835                        /*
1836                         * The task hasn't migrated, and is still the next
1837                         * eligible task, but we failed to find a run-queue
1838                         * to push it to.  Do not retry in this case, since
1839                         * other CPUs will pull from us when ready.
1840                         */
1841                        goto out;
1842                }
1843
1844                if (!task)
1845                        /* No more tasks, just exit */
1846                        goto out;
1847
1848                /*
1849                 * Something has shifted, try again.
1850                 */
1851                put_task_struct(next_task);
1852                next_task = task;
1853                goto retry;
1854        }
1855
1856        deactivate_task(rq, next_task, 0);
1857        set_task_cpu(next_task, lowest_rq->cpu);
1858        activate_task(lowest_rq, next_task, 0);
1859        ret = 1;
1860
1861        resched_curr(lowest_rq);
1862
1863        double_unlock_balance(rq, lowest_rq);
1864
1865out:
1866        put_task_struct(next_task);
1867
1868        return ret;
1869}
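    /*
     * The migration above is the usual three-step move: deactivate_task()
     * on the source rq, set_task_cpu() to repoint the task, then
     * activate_task() on the target rq, all while holding both rq locks
     * via double_lock_balance().  pull_rt_task() further down performs the
     * same sequence in the opposite direction.
     */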
1870
1871static void push_rt_tasks(struct rq *rq)
1872{
1873        /* push_rt_task() will return true if it moved an RT task */
1874        while (push_rt_task(rq))
1875                ;
1876}
1877
1878#ifdef HAVE_RT_PUSH_IPI
1879
1880/*
1881 * When a high priority task schedules out from a CPU and a lower priority
1882 * task is scheduled in, a check is made to see if there's any RT tasks
1883 * on other CPUs that are waiting to run because a higher priority RT task
1884 * is currently running on its CPU. In this case, the CPU with multiple RT
1885 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1886 * up that may be able to run one of its non-running queued RT tasks.
1887 *
1888 * All CPUs with overloaded RT tasks need to be notified as there is currently
1889 * no way to know which of these CPUs have the highest priority task waiting
1890 * to run. Instead of trying to take a spinlock on each of these CPUs,
1891 * which has been shown to cause large latency when done on machines with
1892 * many CPUs, an IPI is sent to the CPUs to have them push off the overloaded
1893 * RT tasks waiting to run.
1894 *
1895 * Just sending an IPI to each of the CPUs is also an issue, as on machines
1896 * with a large CPU count this can cause an IPI storm on a CPU, especially
1897 * if it's the only CPU with multiple RT tasks queued and a large number of
1898 * CPUs are scheduling a lower priority task at the same time.
1899 *
1900 * Each root domain has its own irq work function that can iterate over
1901 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1902 * tasks must be checked whether one or many CPUs are lowering their
1903 * priority, there's a single irq work iterator that will try to
1904 * push off RT tasks that are waiting to run.
1905 *
1906 * When a CPU schedules a lower priority task, it will kick off the
1907 * irq work iterator that will jump to each CPU with overloaded RT tasks.
1908 * As it only takes the first CPU that schedules a lower priority task
1909 * to start the process, the rto_loop_start variable is claimed atomically,
1910 * and only the CPU that takes it from zero will try to take the rto_lock.
1911 * This prevents high contention on the lock as the process handles all
1912 * CPUs scheduling lower priority tasks.
1913 *
1914 * All CPUs that are scheduling a lower priority task will increment the
1915 * rto_loop_next variable. This will make sure that the irq work iterator
1916 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
1917 * priority task, even if the iterator is in the middle of a scan. Incrementing
1918 * rto_loop_next will cause the iterator to perform another scan.
1919 *
1920 */
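    /*
     * Putting the pieces below together, a push via IPI looks roughly like
     * this (a sketch of the common case, not a trace):
     *
     *   CPU X schedules a lower priority task
     *     -> pull_rt_task() -> tell_cpu_to_push()
     *          atomic_inc(rto_loop_next); win rto_loop_start;
     *          cpu = rto_next_cpu(); irq_work_queue_on(rto_push_work, cpu)
     *   CPU A (first CPU in rto_mask) runs rto_push_irq_work_func()
     *          push_rt_tasks(); cpu = rto_next_cpu();
     *          forward the irq work to the next overloaded CPU, or stop
     *          and sched_put_rd() once rto_next_cpu() returns -1.
     */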
1921static int rto_next_cpu(struct root_domain *rd)
1922{
1923        int next;
1924        int cpu;
1925
1926        /*
1927         * When starting the IPI RT pushing, the rto_cpu is set to -1,
1928         * and rto_next_cpu() will simply return the first CPU found in
1929         * the rto_mask.
1930         *
1931         * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
1932         * will return the next CPU found in the rto_mask.
1933         *
1934         * If there are no more CPUs left in the rto_mask, then a check is made
1935         * against rto_loop and rto_loop_next. rto_loop is only updated with
1936         * the rto_lock held, but any CPU may increment the rto_loop_next
1937         * without any locking.
1938         */
1939        for (;;) {
1940
1941                /* When rto_cpu is -1 this acts like cpumask_first() */
1942                cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
1943
1944                rd->rto_cpu = cpu;
1945
1946                if (cpu < nr_cpu_ids)
1947                        return cpu;
1948
1949                rd->rto_cpu = -1;
1950
1951                /*
1952                 * ACQUIRE ensures we see the @rto_mask changes
1953                 * made prior to the @next value observed.
1954                 *
1955                 * Matches WMB in rt_set_overload().
1956                 */
1957                next = atomic_read_acquire(&rd->rto_loop_next);
1958
1959                if (rd->rto_loop == next)
1960                        break;
1961
1962                rd->rto_loop = next;
1963        }
1964
1965        return -1;
1966}
1967
1968static inline bool rto_start_trylock(atomic_t *v)
1969{
1970        return !atomic_cmpxchg_acquire(v, 0, 1);
1971}
1972
1973static inline void rto_start_unlock(atomic_t *v)
1974{
1975        atomic_set_release(v, 0);
1976}
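    /*
     * These two helpers form a simple 0/1 "start lock": the cmpxchg takes
     * rto_loop_start from 0 to 1 for exactly one caller and everyone else
     * backs off.  tell_cpu_to_push() below uses them as:
     *
     *        if (!rto_start_trylock(&rq->rd->rto_loop_start))
     *                return;
     *        ...
     *        rto_start_unlock(&rq->rd->rto_loop_start);
     */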
1977
1978static void tell_cpu_to_push(struct rq *rq)
1979{
1980        int cpu = -1;
1981
1982        /* Keep the loop going if the IPI is currently active */
1983        atomic_inc(&rq->rd->rto_loop_next);
1984
1985        /* Only one CPU can initiate a loop at a time */
1986        if (!rto_start_trylock(&rq->rd->rto_loop_start))
1987                return;
1988
1989        raw_spin_lock(&rq->rd->rto_lock);
1990
1991        /*
1992         * The rto_cpu is updated under the lock; if it has a valid CPU
1993         * then the IPI is still running and will continue due to the
1994         * update to loop_next, and nothing needs to be done here.
1995         * Otherwise it is finishing up and an IPI needs to be sent.
1996         */
1997        if (rq->rd->rto_cpu < 0)
1998                cpu = rto_next_cpu(rq->rd);
1999
2000        raw_spin_unlock(&rq->rd->rto_lock);
2001
2002        rto_start_unlock(&rq->rd->rto_loop_start);
2003
2004        if (cpu >= 0) {
2005                /* Make sure the rd does not get freed while pushing */
2006                sched_get_rd(rq->rd);
2007                irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2008        }
2009}
2010
2011/* Called from hardirq context */
2012void rto_push_irq_work_func(struct irq_work *work)
2013{
2014        struct root_domain *rd =
2015                container_of(work, struct root_domain, rto_push_work);
2016        struct rq *rq;
2017        int cpu;
2018
2019        rq = this_rq();
2020
2021        /*
2022         * We do not need to grab the lock to check for has_pushable_tasks.
2023         * When it gets updated, a check is made to see if a push is possible.
2024         */
2025        if (has_pushable_tasks(rq)) {
2026                raw_spin_lock(&rq->lock);
2027                push_rt_tasks(rq);
2028                raw_spin_unlock(&rq->lock);
2029        }
2030
2031        raw_spin_lock(&rd->rto_lock);
2032
2033        /* Pass the IPI to the next rt overloaded queue */
2034        cpu = rto_next_cpu(rd);
2035
2036        raw_spin_unlock(&rd->rto_lock);
2037
2038        if (cpu < 0) {
2039                sched_put_rd(rd);
2040                return;
2041        }
2042
2043        /* Try the next RT overloaded CPU */
2044        irq_work_queue_on(&rd->rto_push_work, cpu);
2045}
2046#endif /* HAVE_RT_PUSH_IPI */
2047
2048static void pull_rt_task(struct rq *this_rq)
2049{
2050        int this_cpu = this_rq->cpu, cpu;
2051        bool resched = false;
2052        struct task_struct *p;
2053        struct rq *src_rq;
2054        int rt_overload_count = rt_overloaded(this_rq);
2055
2056        if (likely(!rt_overload_count))
2057                return;
2058
2059        /*
2060         * Match the barrier from rt_set_overload(); this guarantees that if we
2061         * see overloaded we must also see the rto_mask bit.
2062         */
2063        smp_rmb();
2064
2065        /* If we are the only overloaded CPU do nothing */
2066        if (rt_overload_count == 1 &&
2067            cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2068                return;
2069
2070#ifdef HAVE_RT_PUSH_IPI
2071        if (sched_feat(RT_PUSH_IPI)) {
2072                tell_cpu_to_push(this_rq);
2073                return;
2074        }
2075#endif
2076
2077        for_each_cpu(cpu, this_rq->rd->rto_mask) {
2078                if (this_cpu == cpu)
2079                        continue;
2080
2081                src_rq = cpu_rq(cpu);
2082
2083                /*
2084                 * Don't bother taking the src_rq->lock if the next highest
2085                 * task is known to be lower-priority than our current task.
2086                 * This may look racy, but if this value is about to go
2087                 * logically higher, the src_rq will push this task away.
2088                 * And if it's going logically lower, we do not care.
2089                 */
2090                if (src_rq->rt.highest_prio.next >=
2091                    this_rq->rt.highest_prio.curr)
2092                        continue;
2093
2094                /*
2095                 * We can potentially drop this_rq's lock in
2096                 * double_lock_balance, and another CPU could
2097                 * alter this_rq
2098                 */
2099                double_lock_balance(this_rq, src_rq);
2100
2101                /*
2102                 * We can only pull a task that is pushable
2103                 * on its rq, and no others.
2104                 */
2105                p = pick_highest_pushable_task(src_rq, this_cpu);
2106
2107                /*
2108                 * Do we have an RT task that preempts
2109                 * the to-be-scheduled task?
2110                 */
2111                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2112                        WARN_ON(p == src_rq->curr);
2113                        WARN_ON(!task_on_rq_queued(p));
2114
2115                        /*
2116                         * There's a chance that p is higher in priority
2117                         * than what's currently running on its CPU.
2118                         * This is just that p is waking up and hasn't
2119                         * had a chance to schedule. We only pull
2120                         * p if it is lower in priority than the
2121                         * current task on the run queue.
2122                         */
2123                        if (p->prio < src_rq->curr->prio)
2124                                goto skip;
2125
2126                        resched = true;
2127
2128                        deactivate_task(src_rq, p, 0);
2129                        set_task_cpu(p, this_cpu);
2130                        activate_task(this_rq, p, 0);
2131                        /*
2132                         * We continue with the search, just in
2133                         * case there's an even higher prio task
2134                         * in another runqueue. (low likelihood
2135                         * but possible)
2136                         */
2137                }
2138skip:
2139                double_unlock_balance(this_rq, src_rq);
2140        }
2141
2142        if (resched)
2143                resched_curr(this_rq);
2144}
2145
2146/*
2147 * If we are not running and we are not going to reschedule soon, we should
2148 * try to push tasks away now
2149 */
2150static void task_woken_rt(struct rq *rq, struct task_struct *p)
2151{
2152        if (!task_running(rq, p) &&
2153            !test_tsk_need_resched(rq->curr) &&
2154            p->nr_cpus_allowed > 1 &&
2155            (dl_task(rq->curr) || rt_task(rq->curr)) &&
2156            (rq->curr->nr_cpus_allowed < 2 ||
2157             rq->curr->prio <= p->prio))
2158                push_rt_tasks(rq);
2159}
2160
2161/* Assumes rq->lock is held */
2162static void rq_online_rt(struct rq *rq)
2163{
2164        if (rq->rt.overloaded)
2165                rt_set_overload(rq);
2166
2167        __enable_runtime(rq);
2168
2169        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2170}
2171
2172/* Assumes rq->lock is held */
2173static void rq_offline_rt(struct rq *rq)
2174{
2175        if (rq->rt.overloaded)
2176                rt_clear_overload(rq);
2177
2178        __disable_runtime(rq);
2179
2180        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2181}
2182
2183/*
2184 * When switching from the rt queue, we may be in a position where we
2185 * want to pull RT tasks from other runqueues.
2186 */
2187static void switched_from_rt(struct rq *rq, struct task_struct *p)
2188{
2189        /*
2190         * If there are other RT tasks then we will reschedule
2191         * and the scheduling of the other RT tasks will handle
2192         * the balancing. But if we are the last RT task
2193         * we may need to handle the pulling of RT tasks
2194         * now.
2195         */
2196        if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2197                return;
2198
2199        rt_queue_pull_task(rq);
2200}
2201
2202void __init init_sched_rt_class(void)
2203{
2204        unsigned int i;
2205
2206        for_each_possible_cpu(i) {
2207                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2208                                        GFP_KERNEL, cpu_to_node(i));
2209        }
2210}
2211#endif /* CONFIG_SMP */
2212
2213/*
2214 * When switching a task to RT, we may overload the runqueue
2215 * with RT tasks. In this case we try to push them off to
2216 * other runqueues.
2217 */
2218static void switched_to_rt(struct rq *rq, struct task_struct *p)
2219{
2220        /*
2221         * If we are already running, then there's nothing
2222         * that needs to be done. But if we are not running
2223         * we may need to preempt the current running task.
2224         * If that current running task is also an RT task
2225         * then see if we can move to another run queue.
2226         */
2227        if (task_on_rq_queued(p) && rq->curr != p) {
2228#ifdef CONFIG_SMP
2229                if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2230                        rt_queue_push_tasks(rq);
2231#endif /* CONFIG_SMP */
2232                if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2233                        resched_curr(rq);
2234        }
2235}
2236
2237/*
2238 * Priority of the task has changed. This may cause
2239 * us to initiate a push or pull.
2240 */
2241static void
2242prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2243{
2244        if (!task_on_rq_queued(p))
2245                return;
2246
2247        if (rq->curr == p) {
2248#ifdef CONFIG_SMP
2249                /*
2250                 * If our priority decreases while running, we
2251                 * may need to pull tasks to this runqueue.
2252                 */
2253                if (oldprio < p->prio)
2254                        rt_queue_pull_task(rq);
2255
2256                /*
2257                 * If there's a higher priority task waiting to run
2258                 * then reschedule.
2259                 */
2260                if (p->prio > rq->rt.highest_prio.curr)
2261                        resched_curr(rq);
2262#else
2263                /* For UP simply resched on drop of prio */
2264                if (oldprio < p->prio)
2265                        resched_curr(rq);
2266#endif /* CONFIG_SMP */
2267        } else {
2268                /*
2269                 * This task is not running, but if its priority
2270                 * is higher than that of the current running
2271                 * task, then reschedule.
2272                 */
2273                if (p->prio < rq->curr->prio)
2274                        resched_curr(rq);
2275        }
2276}
2277
2278#ifdef CONFIG_POSIX_TIMERS
2279static void watchdog(struct rq *rq, struct task_struct *p)
2280{
2281        unsigned long soft, hard;
2282
2283        /* max may change after cur was read, this will be fixed next tick */
2284        soft = task_rlimit(p, RLIMIT_RTTIME);
2285        hard = task_rlimit_max(p, RLIMIT_RTTIME);
2286
2287        if (soft != RLIM_INFINITY) {
2288                unsigned long next;
2289
2290                if (p->rt.watchdog_stamp != jiffies) {
2291                        p->rt.timeout++;
2292                        p->rt.watchdog_stamp = jiffies;
2293                }
2294
2295                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2296                if (p->rt.timeout > next)
2297                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2298        }
2299}
2300#else
2301static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2302#endif
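    /*
     * The RLIMIT_RTTIME limits are in microseconds while p->rt.timeout
     * counts scheduler ticks, hence the DIV_ROUND_UP() by USEC_PER_SEC/HZ
     * above.  As an illustrative example: with HZ=250 one tick is 4000us,
     * so a soft limit of 950000us becomes DIV_ROUND_UP(950000, 4000) = 238
     * ticks before sched_exp is set so the POSIX CPU timer code can act
     * on it.
     */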
2303
2304/*
2305 * scheduler tick hitting a task of our scheduling class.
2306 *
2307 * NOTE: This function can be called remotely by the tick offload that
2308 * goes along full dynticks. Therefore no local assumption can be made
2309 * and everything must be accessed through the @rq and @curr passed in
2310 * parameters.
2311 */
2312static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2313{
2314        struct sched_rt_entity *rt_se = &p->rt;
2315
2316        update_curr_rt(rq);
2317
2318        watchdog(rq, p);
2319
2320        /*
2321         * RR tasks need a special form of timeslice management.
2322         * FIFO tasks have no timeslices.
2323         */
2324        if (p->policy != SCHED_RR)
2325                return;
2326
2327        if (--p->rt.time_slice)
2328                return;
2329
2330        p->rt.time_slice = sched_rr_timeslice;
2331
2332        /*
2333         * Requeue to the end of the queue if we (and all of our ancestors) are
2334         * not the only element on the queue.
2335         */
2336        for_each_sched_rt_entity(rt_se) {
2337                if (rt_se->run_list.prev != rt_se->run_list.next) {
2338                        requeue_task_rt(rq, p, 0);
2339                        resched_curr(rq);
2340                        return;
2341                }
2342        }
2343}
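    /*
     * Concretely for SCHED_RR (assuming the usual RR_TIMESLICE of 100ms
     * worth of ticks): time_slice is decremented once per tick, and when
     * it reaches zero it is refilled with sched_rr_timeslice and the task
     * is requeued to the tail of its priority list, but only if it (or one
     * of its ancestors in the group hierarchy) shares that list with
     * another entity; a lone RR task just keeps running.
     */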
2344
2345static void set_curr_task_rt(struct rq *rq)
2346{
2347        struct task_struct *p = rq->curr;
2348
2349        p->se.exec_start = rq_clock_task(rq);
2350
2351        /* The running task is never eligible for pushing */
2352        dequeue_pushable_task(rq, p);
2353}
2354
2355static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2356{
2357        /*
2358         * Time slice is 0 for SCHED_FIFO tasks
2359         */
2360        if (task->policy == SCHED_RR)
2361                return sched_rr_timeslice;
2362        else
2363                return 0;
2364}
2365
2366const struct sched_class rt_sched_class = {
2367        .next                   = &fair_sched_class,
2368        .enqueue_task           = enqueue_task_rt,
2369        .dequeue_task           = dequeue_task_rt,
2370        .yield_task             = yield_task_rt,
2371
2372        .check_preempt_curr     = check_preempt_curr_rt,
2373
2374        .pick_next_task         = pick_next_task_rt,
2375        .put_prev_task          = put_prev_task_rt,
2376
2377#ifdef CONFIG_SMP
2378        .select_task_rq         = select_task_rq_rt,
2379
2380        .set_cpus_allowed       = set_cpus_allowed_common,
2381        .rq_online              = rq_online_rt,
2382        .rq_offline             = rq_offline_rt,
2383        .task_woken             = task_woken_rt,
2384        .switched_from          = switched_from_rt,
2385#endif
2386
2387        .set_curr_task          = set_curr_task_rt,
2388        .task_tick              = task_tick_rt,
2389
2390        .get_rr_interval        = get_rr_interval_rt,
2391
2392        .prio_changed           = prio_changed_rt,
2393        .switched_to            = switched_to_rt,
2394
2395        .update_curr            = update_curr_rt,
2396};
2397
2398#ifdef CONFIG_RT_GROUP_SCHED
2399/*
2400 * Ensure that the real time constraints are schedulable.
2401 */
2402static DEFINE_MUTEX(rt_constraints_mutex);
2403
2404/* Must be called with tasklist_lock held */
2405static inline int tg_has_rt_tasks(struct task_group *tg)
2406{
2407        struct task_struct *g, *p;
2408
2409        /*
2410         * Autogroups do not have RT tasks; see autogroup_create().
2411         */
2412        if (task_group_is_autogroup(tg))
2413                return 0;
2414
2415        for_each_process_thread(g, p) {
2416                if (rt_task(p) && task_group(p) == tg)
2417                        return 1;
2418        }
2419
2420        return 0;
2421}
2422
2423struct rt_schedulable_data {
2424        struct task_group *tg;
2425        u64 rt_period;
2426        u64 rt_runtime;
2427};
2428
2429static int tg_rt_schedulable(struct task_group *tg, void *data)
2430{
2431        struct rt_schedulable_data *d = data;
2432        struct task_group *child;
2433        unsigned long total, sum = 0;
2434        u64 period, runtime;
2435
2436        period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2437        runtime = tg->rt_bandwidth.rt_runtime;
2438
2439        if (tg == d->tg) {
2440                period = d->rt_period;
2441                runtime = d->rt_runtime;
2442        }
2443
2444        /*
2445         * Cannot have more runtime than the period.
2446         */
2447        if (runtime > period && runtime != RUNTIME_INF)
2448                return -EINVAL;
2449
2450        /*
2451         * Ensure we don't starve existing RT tasks.
2452         */
2453        if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2454                return -EBUSY;
2455
2456        total = to_ratio(period, runtime);
2457
2458        /*
2459         * Nobody can have more than the global setting allows.
2460         */
2461        if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2462                return -EINVAL;
2463
2464        /*
2465         * The sum of our children's runtime should not exceed our own.
2466         */
2467        list_for_each_entry_rcu(child, &tg->children, siblings) {
2468                period = ktime_to_ns(child->rt_bandwidth.rt_period);
2469                runtime = child->rt_bandwidth.rt_runtime;
2470
2471                if (child == d->tg) {
2472                        period = d->rt_period;
2473                        runtime = d->rt_runtime;
2474                }
2475
2476                sum += to_ratio(period, runtime);
2477        }
2478
2479        if (sum > total)
2480                return -EINVAL;
2481
2482        return 0;
2483}
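    /*
     * A worked example of the checks above (made-up group values): with
     * the global defaults of a 1s period and 0.95s runtime, the global
     * ratio is 0.95.  A group asking for runtime=500ms over period=1s has
     * ratio 0.5, which fits.  If that same group then has two children
     * each requesting 300ms over 1s, their sum 0.6 exceeds the parent's
     * 0.5 and the write is rejected with -EINVAL.
     */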
2484
2485static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2486{
2487        int ret;
2488
2489        struct rt_schedulable_data data = {
2490                .tg = tg,
2491                .rt_period = period,
2492                .rt_runtime = runtime,
2493        };
2494
2495        rcu_read_lock();
2496        ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2497        rcu_read_unlock();
2498
2499        return ret;
2500}
2501
2502static int tg_set_rt_bandwidth(struct task_group *tg,
2503                u64 rt_period, u64 rt_runtime)
2504{
2505        int i, err = 0;
2506
2507        /*
2508         * Disallowing RT runtime for the root group is BAD; it would prevent
2509         * the kernel from creating (and/or operating) RT threads.
2510         */
2511        if (tg == &root_task_group && rt_runtime == 0)
2512                return -EINVAL;
2513
2514        /* No period doesn't make any sense. */
2515        /* A zero period doesn't make any sense. */
2516                return -EINVAL;
2517
2518        mutex_lock(&rt_constraints_mutex);
2519        read_lock(&tasklist_lock);
2520        err = __rt_schedulable(tg, rt_period, rt_runtime);
2521        if (err)
2522                goto unlock;
2523
2524        raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2525        tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2526        tg->rt_bandwidth.rt_runtime = rt_runtime;
2527
2528        for_each_possible_cpu(i) {
2529                struct rt_rq *rt_rq = tg->rt_rq[i];
2530
2531                raw_spin_lock(&rt_rq->rt_runtime_lock);
2532                rt_rq->rt_runtime = rt_runtime;
2533                raw_spin_unlock(&rt_rq->rt_runtime_lock);
2534        }
2535        raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2536unlock:
2537        read_unlock(&tasklist_lock);
2538        mutex_unlock(&rt_constraints_mutex);
2539
2540        return err;
2541}
2542
2543int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2544{
2545        u64 rt_runtime, rt_period;
2546
2547        rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2548        rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2549        if (rt_runtime_us < 0)
2550                rt_runtime = RUNTIME_INF;
2551
2552        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2553}
2554
2555long sched_group_rt_runtime(struct task_group *tg)
2556{
2557        u64 rt_runtime_us;
2558
2559        if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2560                return -1;
2561
2562        rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2563        do_div(rt_runtime_us, NSEC_PER_USEC);
2564        return rt_runtime_us;
2565}
2566
2567int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2568{
2569        u64 rt_runtime, rt_period;
2570
2571        rt_period = rt_period_us * NSEC_PER_USEC;
2572        rt_runtime = tg->rt_bandwidth.rt_runtime;
2573
2574        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2575}
2576
2577long sched_group_rt_period(struct task_group *tg)
2578{
2579        u64 rt_period_us;
2580
2581        rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2582        do_div(rt_period_us, NSEC_PER_USEC);
2583        return rt_period_us;
2584}
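    /*
     * The four helpers above back the cpu cgroup controller's
     * cpu.rt_runtime_us and cpu.rt_period_us files when
     * CONFIG_RT_GROUP_SCHED is enabled.  For example (paths depend on the
     * cgroup mount):
     *
     *   # echo 300000 > /sys/fs/cgroup/cpu/mygroup/cpu.rt_runtime_us
     *
     * ends up in sched_group_set_rt_runtime(tg, 300000), i.e. 300ms of RT
     * runtime per period for that group, while writing -1 maps to
     * RUNTIME_INF (no limit).
     */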
2585
2586static int sched_rt_global_constraints(void)
2587{
2588        int ret = 0;
2589
2590        mutex_lock(&rt_constraints_mutex);
2591        read_lock(&tasklist_lock);
2592        ret = __rt_schedulable(NULL, 0, 0);
2593        read_unlock(&tasklist_lock);
2594        mutex_unlock(&rt_constraints_mutex);
2595
2596        return ret;
2597}
2598
2599int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2600{
2601        /* Don't accept realtime tasks when there is no way for them to run */
2602        if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2603                return 0;
2604
2605        return 1;
2606}
2607
2608#else /* !CONFIG_RT_GROUP_SCHED */
2609static int sched_rt_global_constraints(void)
2610{
2611        unsigned long flags;
2612        int i;
2613
2614        raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2615        for_each_possible_cpu(i) {
2616                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2617
2618                raw_spin_lock(&rt_rq->rt_runtime_lock);
2619                rt_rq->rt_runtime = global_rt_runtime();
2620                raw_spin_unlock(&rt_rq->rt_runtime_lock);
2621        }
2622        raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2623
2624        return 0;
2625}
2626#endif /* CONFIG_RT_GROUP_SCHED */
2627
2628static int sched_rt_global_validate(void)
2629{
2630        if (sysctl_sched_rt_period <= 0)
2631                return -EINVAL;
2632
2633        if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2634                (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2635                return -EINVAL;
2636
2637        return 0;
2638}
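    /*
     * In sysctl terms (values in microseconds): the defaults of
     * sched_rt_period_us=1000000 and sched_rt_runtime_us=950000 pass the
     * check above, sched_rt_runtime_us=-1 disables throttling
     * (RUNTIME_INF), and any runtime larger than the period is rejected
     * with -EINVAL.
     */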
2639
2640static void sched_rt_do_global(void)
2641{
2642        def_rt_bandwidth.rt_runtime = global_rt_runtime();
2643        def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2644}
2645
2646int sched_rt_handler(struct ctl_table *table, int write,
2647                void __user *buffer, size_t *lenp,
2648                loff_t *ppos)
2649{
2650        int old_period, old_runtime;
2651        static DEFINE_MUTEX(mutex);
2652        int ret;
2653
2654        mutex_lock(&mutex);
2655        old_period = sysctl_sched_rt_period;
2656        old_runtime = sysctl_sched_rt_runtime;
2657
2658        ret = proc_dointvec(table, write, buffer, lenp, ppos);
2659
2660        if (!ret && write) {
2661                ret = sched_rt_global_validate();
2662                if (ret)
2663                        goto undo;
2664
2665                ret = sched_dl_global_validate();
2666                if (ret)
2667                        goto undo;
2668
2669                ret = sched_rt_global_constraints();
2670                if (ret)
2671                        goto undo;
2672
2673                sched_rt_do_global();
2674                sched_dl_do_global();
2675        }
2676        if (0) {
2677undo:
2678                sysctl_sched_rt_period = old_period;
2679                sysctl_sched_rt_runtime = old_runtime;
2680        }
2681        mutex_unlock(&mutex);
2682
2683        return ret;
2684}
2685
2686int sched_rr_handler(struct ctl_table *table, int write,
2687                void __user *buffer, size_t *lenp,
2688                loff_t *ppos)
2689{
2690        int ret;
2691        static DEFINE_MUTEX(mutex);
2692
2693        mutex_lock(&mutex);
2694        ret = proc_dointvec(table, write, buffer, lenp, ppos);
2695        /*
2696         * Make sure that internally we keep jiffies.
2697         * Also, writing zero or a negative value resets the timeslice to the default:
2698         */
2699        if (!ret && write) {
2700                sched_rr_timeslice =
2701                        sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2702                        msecs_to_jiffies(sysctl_sched_rr_timeslice);
2703        }
2704        mutex_unlock(&mutex);
2705
2706        return ret;
2707}
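    /*
     * Example of the conversion above (assuming the usual
     * sched_rr_timeslice_ms sysctl is what lands here): writing 30 stores
     * msecs_to_jiffies(30) in sched_rr_timeslice (30 ticks at HZ=1000,
     * 8 ticks at HZ=250 after rounding up), while writing 0 or a negative
     * value restores the RR_TIMESLICE default.
     */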
2708
2709#ifdef CONFIG_SCHED_DEBUG
2710void print_rt_stats(struct seq_file *m, int cpu)
2711{
2712        rt_rq_iter_t iter;
2713        struct rt_rq *rt_rq;
2714
2715        rcu_read_lock();
2716        for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2717                print_rt_rq(m, cpu, rt_rq);
2718        rcu_read_unlock();
2719}
2720#endif /* CONFIG_SCHED_DEBUG */
2721