/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include "sched.h"

#include <linux/slab.h>
#include <linux/irq_work.h>

int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
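
/*
 * sched_rr_timeslice is kept in jiffies and is what the scheduler actually
 * uses when refilling a SCHED_RR task's time slice; sysctl_sched_rr_timeslice
 * is the same value exposed in milliseconds via the sched_rr_timeslice_ms
 * sysctl.  For example, with HZ=250 the default RR_TIMESLICE (roughly 100ms
 * worth of ticks) is 25 jiffies, and the sysctl reports (1000/250) * 25 =
 * 100ms.
 */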
14
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun)
			break;

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle)
		rt_b->rt_period_active = 0;
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
42
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}
54
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		/*
		 * SCHED_DEADLINE updates the bandwidth, as a run away
		 * RT task with a DL task could hog a CPU. But DL does
		 * not reset the period. If a deadline task was running
		 * without an RT task running, it can cause RT tasks to
		 * throttle when they start up. Kick the timer right away
		 * to update the period.
		 */
		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}
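
/*
 * RT bandwidth control: each rt_bandwidth pairs a replenishment period
 * (rt_period) with a budget (rt_runtime).  RT tasks charged to that
 * bandwidth may consume at most rt_runtime of CPU time per rt_period;
 * once the budget is spent the rt_rq is throttled until the period timer
 * above refills it.  With the usual defaults of sched_rt_period_us=1000000
 * and sched_rt_runtime_us=950000, a runaway SCHED_FIFO spinner is stopped
 * after 950ms of every second, leaving 50ms for non-RT work.  A runtime of
 * RUNTIME_INF disables the enforcement entirely.
 */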
76
77#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
78static void push_irq_work_func(struct irq_work *work);
79#endif
80
81void init_rt_rq(struct rt_rq *rt_rq)
82{
83 struct rt_prio_array *array;
84 int i;
85
86 array = &rt_rq->active;
87 for (i = 0; i < MAX_RT_PRIO; i++) {
88 INIT_LIST_HEAD(array->queue + i);
89 __clear_bit(i, array->bitmap);
90 }
91
92 __set_bit(MAX_RT_PRIO, array->bitmap);
93
94#if defined CONFIG_SMP
95 rt_rq->highest_prio.curr = MAX_RT_PRIO;
96 rt_rq->highest_prio.next = MAX_RT_PRIO;
97 rt_rq->rt_nr_migratory = 0;
98 rt_rq->overloaded = 0;
99 plist_head_init(&rt_rq->pushable_tasks);
100
101#ifdef HAVE_RT_PUSH_IPI
102 rt_rq->push_flags = 0;
103 rt_rq->push_cpu = nr_cpu_ids;
104 raw_spin_lock_init(&rt_rq->push_lock);
105 init_irq_work(&rt_rq->push_work, push_irq_work_func);
106#endif
107#endif
108
109 rt_rq->rt_queued = 0;
110
111 rt_rq->rt_time = 0;
112 rt_rq->rt_throttled = 0;
113 rt_rq->rt_runtime = 0;
114 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
115}
116
117#ifdef CONFIG_RT_GROUP_SCHED
118static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
119{
120 hrtimer_cancel(&rt_b->rt_period_timer);
121}
122
123#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
124
125static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
126{
127#ifdef CONFIG_SCHED_DEBUG
128 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
129#endif
130 return container_of(rt_se, struct task_struct, rt);
131}
132
133static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
134{
135 return rt_rq->rq;
136}
137
138static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
139{
140 return rt_se->rt_rq;
141}
142
143static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
144{
145 struct rt_rq *rt_rq = rt_se->rt_rq;
146
147 return rt_rq->rq;
148}
149
150void free_rt_sched_group(struct task_group *tg)
151{
152 int i;
153
154 if (tg->rt_se)
155 destroy_rt_bandwidth(&tg->rt_bandwidth);
156
157 for_each_possible_cpu(i) {
158 if (tg->rt_rq)
159 kfree(tg->rt_rq[i]);
160 if (tg->rt_se)
161 kfree(tg->rt_se[i]);
162 }
163
164 kfree(tg->rt_rq);
165 kfree(tg->rt_se);
166}
167
168void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
169 struct sched_rt_entity *rt_se, int cpu,
170 struct sched_rt_entity *parent)
171{
172 struct rq *rq = cpu_rq(cpu);
173
174 rt_rq->highest_prio.curr = MAX_RT_PRIO;
175 rt_rq->rt_nr_boosted = 0;
176 rt_rq->rq = rq;
177 rt_rq->tg = tg;
178
179 tg->rt_rq[cpu] = rt_rq;
180 tg->rt_se[cpu] = rt_se;
181
182 if (!rt_se)
183 return;
184
185 if (!parent)
186 rt_se->rt_rq = &rq->rt;
187 else
188 rt_se->rt_rq = parent->my_q;
189
190 rt_se->my_q = rt_rq;
191 rt_se->parent = parent;
192 INIT_LIST_HEAD(&rt_se->run_list);
193}
194
195int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
196{
197 struct rt_rq *rt_rq;
198 struct sched_rt_entity *rt_se;
199 int i;
200
201 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
202 if (!tg->rt_rq)
203 goto err;
204 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
205 if (!tg->rt_se)
206 goto err;
207
208 init_rt_bandwidth(&tg->rt_bandwidth,
209 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
210
211 for_each_possible_cpu(i) {
212 rt_rq = kzalloc_node(sizeof(struct rt_rq),
213 GFP_KERNEL, cpu_to_node(i));
214 if (!rt_rq)
215 goto err;
216
217 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
218 GFP_KERNEL, cpu_to_node(i));
219 if (!rt_se)
220 goto err_free_rq;
221
222 init_rt_rq(rt_rq);
223 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
224 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
225 }
226
227 return 1;
228
229err_free_rq:
230 kfree(rt_rq);
231err:
232 return 0;
233}
234
235#else
236
237#define rt_entity_is_task(rt_se) (1)
238
239static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
240{
241 return container_of(rt_se, struct task_struct, rt);
242}
243
244static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
245{
246 return container_of(rt_rq, struct rq, rt);
247}
248
249static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
250{
251 struct task_struct *p = rt_task_of(rt_se);
252
253 return task_rq(p);
254}
255
256static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
257{
258 struct rq *rq = rq_of_rt_se(rt_se);
259
260 return &rq->rt;
261}
262
263void free_rt_sched_group(struct task_group *tg) { }
264
265int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
266{
267 return 1;
268}
269#endif
270
271#ifdef CONFIG_SMP
272
273static void pull_rt_task(struct rq *this_rq);
274
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	return rq->rt.highest_prio.curr > prev->prio;
}
280
281static inline int rt_overloaded(struct rq *rq)
282{
283 return atomic_read(&rq->rd->rto_count);
284}
285
static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
314
315static void update_rt_migration(struct rt_rq *rt_rq)
316{
317 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
318 if (!rt_rq->overloaded) {
319 rt_set_overload(rq_of_rt_rq(rt_rq));
320 rt_rq->overloaded = 1;
321 }
322 } else if (rt_rq->overloaded) {
323 rt_clear_overload(rq_of_rt_rq(rt_rq));
324 rt_rq->overloaded = 0;
325 }
326}
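
/*
 * A runqueue is considered "RT overloaded" when it has more than one
 * RT task queued and at least one of them may migrate to another CPU.
 * Overloaded runqueues advertise themselves in the root domain's
 * rto_mask/rto_count so that CPUs which drop to a lower priority can
 * find them and pull a waiting RT task over (see pull_rt_task()).
 */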
327
328static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
329{
330 struct task_struct *p;
331
332 if (!rt_entity_is_task(rt_se))
333 return;
334
335 p = rt_task_of(rt_se);
336 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
337
338 rt_rq->rt_nr_total++;
339 if (p->nr_cpus_allowed > 1)
340 rt_rq->rt_nr_migratory++;
341
342 update_rt_migration(rt_rq);
343}
344
345static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
346{
347 struct task_struct *p;
348
349 if (!rt_entity_is_task(rt_se))
350 return;
351
352 p = rt_task_of(rt_se);
353 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
354
355 rt_rq->rt_nr_total--;
356 if (p->nr_cpus_allowed > 1)
357 rt_rq->rt_nr_migratory--;
358
359 update_rt_migration(rt_rq);
360}
361
362static inline int has_pushable_tasks(struct rq *rq)
363{
364 return !plist_head_empty(&rq->rt.pushable_tasks);
365}
366
367static DEFINE_PER_CPU(struct callback_head, rt_push_head);
368static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
369
370static void push_rt_tasks(struct rq *);
371static void pull_rt_task(struct rq *);
372
373static inline void queue_push_tasks(struct rq *rq)
374{
375 if (!has_pushable_tasks(rq))
376 return;
377
378 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
379}
380
381static inline void queue_pull_task(struct rq *rq)
382{
383 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
384}
385
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}
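
/*
 * pushable_tasks is a priority-sorted plist of the queued RT tasks on
 * this runqueue that are allowed to run on more than one CPU.  The head
 * of the list is therefore the best candidate for push_rt_tasks(), and
 * highest_prio.next caches its priority so that remote CPUs can compare
 * against it without taking this runqueue's lock.
 */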
409
410#else
411
412static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
413{
414}
415
416static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
417{
418}
419
420static inline
421void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
422{
423}
424
425static inline
426void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
427{
428}
429
430static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
431{
432 return false;
433}
434
435static inline void pull_rt_task(struct rq *this_rq)
436{
437}
438
439static inline void queue_push_tasks(struct rq *rq)
440{
441}
442#endif
443
444static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
445static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
446
447static inline int on_rt_rq(struct sched_rt_entity *rt_se)
448{
449 return rt_se->on_rq;
450}
451
452#ifdef CONFIG_RT_GROUP_SCHED
453
454static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
455{
456 if (!rt_rq->tg)
457 return RUNTIME_INF;
458
459 return rt_rq->rt_runtime;
460}
461
462static inline u64 sched_rt_period(struct rt_rq *rt_rq)
463{
464 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
465}
466
467typedef struct task_group *rt_rq_iter_t;
468
469static inline struct task_group *next_task_group(struct task_group *tg)
470{
471 do {
472 tg = list_entry_rcu(tg->list.next,
473 typeof(struct task_group), list);
474 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
475
476 if (&tg->list == &task_groups)
477 tg = NULL;
478
479 return tg;
480}
481
482#define for_each_rt_rq(rt_rq, iter, rq) \
483 for (iter = container_of(&task_groups, typeof(*iter), list); \
484 (iter = next_task_group(iter)) && \
485 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
486
487#define for_each_sched_rt_entity(rt_se) \
488 for (; rt_se; rt_se = rt_se->parent)
489
490static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
491{
492 return rt_se->my_q;
493}
494
495static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
496static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
497
498static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
499{
500 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
501 struct rq *rq = rq_of_rt_rq(rt_rq);
502 struct sched_rt_entity *rt_se;
503
504 int cpu = cpu_of(rq);
505
506 rt_se = rt_rq->tg->rt_se[cpu];
507
508 if (rt_rq->rt_nr_running) {
509 if (!rt_se)
510 enqueue_top_rt_rq(rt_rq);
511 else if (!on_rt_rq(rt_se))
512 enqueue_rt_entity(rt_se, 0);
513
514 if (rt_rq->highest_prio.curr < curr->prio)
515 resched_curr(rq);
516 }
517}
518
519static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
520{
521 struct sched_rt_entity *rt_se;
522 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
523
524 rt_se = rt_rq->tg->rt_se[cpu];
525
526 if (!rt_se)
527 dequeue_top_rt_rq(rt_rq);
528 else if (on_rt_rq(rt_se))
529 dequeue_rt_entity(rt_se, 0);
530}
531
532static inline int rt_rq_throttled(struct rt_rq *rt_rq)
533{
534 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
535}
536
537static int rt_se_boosted(struct sched_rt_entity *rt_se)
538{
539 struct rt_rq *rt_rq = group_rt_rq(rt_se);
540 struct task_struct *p;
541
542 if (rt_rq)
543 return !!rt_rq->rt_nr_boosted;
544
545 p = rt_task_of(rt_se);
546 return p->prio != p->normal_prio;
547}
548
549#ifdef CONFIG_SMP
550static inline const struct cpumask *sched_rt_period_mask(void)
551{
552 return this_rq()->rd->span;
553}
554#else
555static inline const struct cpumask *sched_rt_period_mask(void)
556{
557 return cpu_online_mask;
558}
559#endif
560
561static inline
562struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
563{
564 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
565}
566
567static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
568{
569 return &rt_rq->tg->rt_bandwidth;
570}
571
572#else
573
574static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
575{
576 return rt_rq->rt_runtime;
577}
578
579static inline u64 sched_rt_period(struct rt_rq *rt_rq)
580{
581 return ktime_to_ns(def_rt_bandwidth.rt_period);
582}
583
584typedef struct rt_rq *rt_rq_iter_t;
585
586#define for_each_rt_rq(rt_rq, iter, rq) \
587 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
588
589#define for_each_sched_rt_entity(rt_se) \
590 for (; rt_se; rt_se = NULL)
591
592static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
593{
594 return NULL;
595}
596
597static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
598{
599 struct rq *rq = rq_of_rt_rq(rt_rq);
600
601 if (!rt_rq->rt_nr_running)
602 return;
603
604 enqueue_top_rt_rq(rt_rq);
605 resched_curr(rq);
606}
607
608static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
609{
610 dequeue_top_rt_rq(rt_rq);
611}
612
613static inline int rt_rq_throttled(struct rt_rq *rt_rq)
614{
615 return rt_rq->rt_throttled;
616}
617
618static inline const struct cpumask *sched_rt_period_mask(void)
619{
620 return cpu_online_mask;
621}
622
623static inline
624struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
625{
626 return &cpu_rq(cpu)->rt;
627}
628
629static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
630{
631 return &def_rt_bandwidth;
632}
633
634#endif
635
636bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
637{
638 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
639
640 return (hrtimer_active(&rt_b->rt_period_timer) ||
641 rt_rq->rt_time < rt_b->rt_runtime);
642}
643
#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it has been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}
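
/*
 * Example of the borrowing above: with a 4-CPU root domain and the default
 * 950ms/1s budget, an rt_rq that has exhausted its runtime asks each
 * neighbour for (spare / 4), where spare is that neighbour's unused
 * rt_runtime - rt_time.  A neighbour with 400ms to spare would donate up
 * to 100ms, and borrowing stops once the local rt_runtime reaches the full
 * period, so a single rt_rq can never hold more than rt_period of budget.
 */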
696
/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
701{
702 struct root_domain *rd = rq->rd;
703 rt_rq_iter_t iter;
704 struct rt_rq *rt_rq;
705
706 if (unlikely(!scheduler_running))
707 return;
708
709 for_each_rt_rq(rt_rq, iter, rq) {
710 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
711 s64 want;
712 int i;
713
714 raw_spin_lock(&rt_b->rt_runtime_lock);
715 raw_spin_lock(&rt_rq->rt_runtime_lock);
716
717
718
719
720
721 if (rt_rq->rt_runtime == RUNTIME_INF ||
722 rt_rq->rt_runtime == rt_b->rt_runtime)
723 goto balanced;
724 raw_spin_unlock(&rt_rq->rt_runtime_lock);
725
726
727
728
729
730
731 want = rt_b->rt_runtime - rt_rq->rt_runtime;
732
733
734
735
736 for_each_cpu(i, rd->span) {
737 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
738 s64 diff;
739
740
741
742
743 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
744 continue;
745
746 raw_spin_lock(&iter->rt_runtime_lock);
747 if (want > 0) {
748 diff = min_t(s64, iter->rt_runtime, want);
749 iter->rt_runtime -= diff;
750 want -= diff;
751 } else {
752 iter->rt_runtime -= want;
753 want -= want;
754 }
755 raw_spin_unlock(&iter->rt_runtime_lock);
756
757 if (!want)
758 break;
759 }
760
761 raw_spin_lock(&rt_rq->rt_runtime_lock);
762
763
764
765
766 BUG_ON(want);
767balanced:
768
769
770
771
772 rt_rq->rt_runtime = RUNTIME_INF;
773 rt_rq->rt_throttled = 0;
774 raw_spin_unlock(&rt_rq->rt_runtime_lock);
775 raw_spin_unlock(&rt_b->rt_runtime_lock);
776
777
778 sched_rt_rq_enqueue(rt_rq);
779 }
780}
781
782static void __enable_runtime(struct rq *rq)
783{
784 rt_rq_iter_t iter;
785 struct rt_rq *rt_rq;
786
787 if (unlikely(!scheduler_running))
788 return;
789
790
791
792
793 for_each_rt_rq(rt_rq, iter, rq) {
794 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
795
796 raw_spin_lock(&rt_b->rt_runtime_lock);
797 raw_spin_lock(&rt_rq->rt_runtime_lock);
798 rt_rq->rt_runtime = rt_b->rt_runtime;
799 rt_rq->rt_time = 0;
800 rt_rq->rt_throttled = 0;
801 raw_spin_unlock(&rt_rq->rt_runtime_lock);
802 raw_spin_unlock(&rt_b->rt_runtime_lock);
803 }
804}
805
806static void balance_runtime(struct rt_rq *rt_rq)
807{
808 if (!sched_feat(RT_RUNTIME_SHARE))
809 return;
810
811 if (rt_rq->rt_time > rt_rq->rt_runtime) {
812 raw_spin_unlock(&rt_rq->rt_runtime_lock);
813 do_balance_runtime(rt_rq);
814 raw_spin_lock(&rt_rq->rt_runtime_lock);
815 }
816}
817#else
818static inline void balance_runtime(struct rt_rq *rt_rq) {}
819#endif
820
821static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
822{
823 int i, idle = 1, throttled = 0;
824 const struct cpumask *span;
825
826 span = sched_rt_period_mask();
827#ifdef CONFIG_RT_GROUP_SCHED
828
829
830
831
832
833
834
835
836
837 if (rt_b == &root_task_group.rt_bandwidth)
838 span = cpu_online_mask;
839#endif
840 for_each_cpu(i, span) {
841 int enqueue = 0;
842 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
843 struct rq *rq = rq_of_rt_rq(rt_rq);
844 int skip;
845
		/*
		 * When span == cpu_online_mask, taking each rq->lock
		 * can be time-consuming. Try to avoid it when possible.
		 */
850 raw_spin_lock(&rt_rq->rt_runtime_lock);
851 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
852 raw_spin_unlock(&rt_rq->rt_runtime_lock);
853 if (skip)
854 continue;
855
856 raw_spin_lock(&rq->lock);
857 if (rt_rq->rt_time) {
858 u64 runtime;
859
860 raw_spin_lock(&rt_rq->rt_runtime_lock);
861 if (rt_rq->rt_throttled)
862 balance_runtime(rt_rq);
863 runtime = rt_rq->rt_runtime;
864 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
865 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
866 rt_rq->rt_throttled = 0;
867 enqueue = 1;
868
869
870
871
872
873
874
875
876 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
877 rq_clock_skip_update(rq, false);
878 }
879 if (rt_rq->rt_time || rt_rq->rt_nr_running)
880 idle = 0;
881 raw_spin_unlock(&rt_rq->rt_runtime_lock);
882 } else if (rt_rq->rt_nr_running) {
883 idle = 0;
884 if (!rt_rq_throttled(rt_rq))
885 enqueue = 1;
886 }
887 if (rt_rq->rt_throttled)
888 throttled = 1;
889
890 if (enqueue)
891 sched_rt_rq_enqueue(rt_rq);
892 raw_spin_unlock(&rq->lock);
893 }
894
895 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
896 return 1;
897
898 return idle;
899}
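
/*
 * do_sched_rt_period_timer() is the per-period replenishment: for every
 * rt_rq covered by this bandwidth it retires up to overrun periods worth
 * of accumulated rt_time, unthrottles and re-enqueues queues whose debt
 * has been paid, and reports whether any queue still needs servicing.
 * A return value of 1 lets sched_rt_period_timer() stop the hrtimer until
 * start_rt_bandwidth() arms it again.
 */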
900
901static inline int rt_se_prio(struct sched_rt_entity *rt_se)
902{
903#ifdef CONFIG_RT_GROUP_SCHED
904 struct rt_rq *rt_rq = group_rt_rq(rt_se);
905
906 if (rt_rq)
907 return rt_rq->highest_prio.curr;
908#endif
909
910 return rt_task_of(rt_se)->prio;
911}
912
913static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
914{
915 u64 runtime = sched_rt_runtime(rt_rq);
916
917 if (rt_rq->rt_throttled)
918 return rt_rq_throttled(rt_rq);
919
920 if (runtime >= sched_rt_period(rt_rq))
921 return 0;
922
923 balance_runtime(rt_rq);
924 runtime = sched_rt_runtime(rt_rq);
925 if (runtime == RUNTIME_INF)
926 return 0;
927
928 if (rt_rq->rt_time > runtime) {
929 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
930
931
932
933
934
935 if (likely(rt_b->rt_runtime)) {
936 rt_rq->rt_throttled = 1;
937 printk_deferred_once("sched: RT throttling activated\n");
938 } else {
939
940
941
942
943
944 rt_rq->rt_time = 0;
945 }
946
947 if (rt_rq_throttled(rt_rq)) {
948 sched_rt_rq_dequeue(rt_rq);
949 return 1;
950 }
951 }
952
953 return 0;
954}
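
/*
 * Example of the check above with the default 950000us runtime in a
 * 1000000us period: once a queue's rt_time crosses 950ms within the
 * current window (and balance_runtime() could not borrow any more), the
 * queue is marked throttled and dequeued, and its tasks do not run again
 * until do_sched_rt_period_timer() clears the debt.  Setting
 * sched_rt_runtime_us to -1 (RUNTIME_INF) disables the check.
 */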
955
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
960static void update_curr_rt(struct rq *rq)
961{
962 struct task_struct *curr = rq->curr;
963 struct sched_rt_entity *rt_se = &curr->rt;
964 u64 delta_exec;
965
966 if (curr->sched_class != &rt_sched_class)
967 return;
968
969 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
970 if (unlikely((s64)delta_exec <= 0))
971 return;
972
973
974 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
975
976 schedstat_set(curr->se.statistics.exec_max,
977 max(curr->se.statistics.exec_max, delta_exec));
978
979 curr->se.sum_exec_runtime += delta_exec;
980 account_group_exec_runtime(curr, delta_exec);
981
982 curr->se.exec_start = rq_clock_task(rq);
983 cpuacct_charge(curr, delta_exec);
984
985 sched_rt_avg_update(rq, delta_exec);
986
987 if (!rt_bandwidth_enabled())
988 return;
989
990 for_each_sched_rt_entity(rt_se) {
991 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
992
993 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
994 raw_spin_lock(&rt_rq->rt_runtime_lock);
995 rt_rq->rt_time += delta_exec;
996 if (sched_rt_runtime_exceeded(rt_rq))
997 resched_curr(rq);
998 raw_spin_unlock(&rt_rq->rt_runtime_lock);
999 }
1000 }
1001}
1002
1003static void
1004dequeue_top_rt_rq(struct rt_rq *rt_rq)
1005{
1006 struct rq *rq = rq_of_rt_rq(rt_rq);
1007
1008 BUG_ON(&rq->rt != rt_rq);
1009
1010 if (!rt_rq->rt_queued)
1011 return;
1012
1013 BUG_ON(!rq->nr_running);
1014
1015 sub_nr_running(rq, rt_rq->rt_nr_running);
1016 rt_rq->rt_queued = 0;
1017}
1018
1019static void
1020enqueue_top_rt_rq(struct rt_rq *rt_rq)
1021{
1022 struct rq *rq = rq_of_rt_rq(rt_rq);
1023
1024 BUG_ON(&rq->rt != rt_rq);
1025
1026 if (rt_rq->rt_queued)
1027 return;
1028 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1029 return;
1030
1031 add_nr_running(rq, rt_rq->rt_nr_running);
1032 rt_rq->rt_queued = 1;
1033}
1034
1035#if defined CONFIG_SMP
1036
1037static void
1038inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1039{
1040 struct rq *rq = rq_of_rt_rq(rt_rq);
1041
1042#ifdef CONFIG_RT_GROUP_SCHED
1043
1044
1045
1046 if (&rq->rt != rt_rq)
1047 return;
1048#endif
1049 if (rq->online && prio < prev_prio)
1050 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1051}
1052
1053static void
1054dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1055{
1056 struct rq *rq = rq_of_rt_rq(rt_rq);
1057
1058#ifdef CONFIG_RT_GROUP_SCHED
1059
1060
1061
1062 if (&rq->rt != rt_rq)
1063 return;
1064#endif
1065 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1066 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1067}
1068
1069#else
1070
1071static inline
1072void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1073static inline
1074void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1075
1076#endif
1077
1078#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1079static void
1080inc_rt_prio(struct rt_rq *rt_rq, int prio)
1081{
1082 int prev_prio = rt_rq->highest_prio.curr;
1083
1084 if (prio < prev_prio)
1085 rt_rq->highest_prio.curr = prio;
1086
1087 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1088}
1089
1090static void
1091dec_rt_prio(struct rt_rq *rt_rq, int prio)
1092{
1093 int prev_prio = rt_rq->highest_prio.curr;
1094
1095 if (rt_rq->rt_nr_running) {
1096
1097 WARN_ON(prio < prev_prio);
1098
1099
1100
1101
1102
1103 if (prio == prev_prio) {
1104 struct rt_prio_array *array = &rt_rq->active;
1105
1106 rt_rq->highest_prio.curr =
1107 sched_find_first_bit(array->bitmap);
1108 }
1109
1110 } else
1111 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1112
1113 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1114}
1115
1116#else
1117
1118static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1119static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1120
1121#endif
1122
1123#ifdef CONFIG_RT_GROUP_SCHED
1124
1125static void
1126inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1127{
1128 if (rt_se_boosted(rt_se))
1129 rt_rq->rt_nr_boosted++;
1130
1131 if (rt_rq->tg)
1132 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1133}
1134
1135static void
1136dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1137{
1138 if (rt_se_boosted(rt_se))
1139 rt_rq->rt_nr_boosted--;
1140
1141 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1142}
1143
1144#else
1145
1146static void
1147inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1148{
1149 start_rt_bandwidth(&def_rt_bandwidth);
1150}
1151
1152static inline
1153void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1154
1155#endif
1156
1157static inline
1158unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1159{
1160 struct rt_rq *group_rq = group_rt_rq(rt_se);
1161
1162 if (group_rq)
1163 return group_rq->rt_nr_running;
1164 else
1165 return 1;
1166}
1167
1168static inline
1169unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1170{
1171 struct rt_rq *group_rq = group_rt_rq(rt_se);
1172 struct task_struct *tsk;
1173
1174 if (group_rq)
1175 return group_rq->rr_nr_running;
1176
1177 tsk = rt_task_of(rt_se);
1178
1179 return (tsk->policy == SCHED_RR) ? 1 : 0;
1180}
1181
1182static inline
1183void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1184{
1185 int prio = rt_se_prio(rt_se);
1186
1187 WARN_ON(!rt_prio(prio));
1188 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1189 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1190
1191 inc_rt_prio(rt_rq, prio);
1192 inc_rt_migration(rt_se, rt_rq);
1193 inc_rt_group(rt_se, rt_rq);
1194}
1195
1196static inline
1197void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1198{
1199 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1200 WARN_ON(!rt_rq->rt_nr_running);
1201 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1202 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1203
1204 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1205 dec_rt_migration(rt_se, rt_rq);
1206 dec_rt_group(rt_se, rt_rq);
1207}
1208
/*
 * Change rt_se->run_list location unless SAVE && !MOVE,
 *
 * assumes ENQUEUE/DEQUEUE flags match
 */
static inline bool move_entity(unsigned int flags)
{
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
		return false;

	return true;
}
1221
1222static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1223{
1224 list_del_init(&rt_se->run_list);
1225
1226 if (list_empty(array->queue + rt_se_prio(rt_se)))
1227 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1228
1229 rt_se->on_list = 0;
1230}
1231
1232static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1233{
1234 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1235 struct rt_prio_array *array = &rt_rq->active;
1236 struct rt_rq *group_rq = group_rt_rq(rt_se);
1237 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1238
1239
1240
1241
1242
1243
1244
1245 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1246 if (rt_se->on_list)
1247 __delist_rt_entity(rt_se, array);
1248 return;
1249 }
1250
1251 if (move_entity(flags)) {
1252 WARN_ON_ONCE(rt_se->on_list);
1253 if (flags & ENQUEUE_HEAD)
1254 list_add(&rt_se->run_list, queue);
1255 else
1256 list_add_tail(&rt_se->run_list, queue);
1257
1258 __set_bit(rt_se_prio(rt_se), array->bitmap);
1259 rt_se->on_list = 1;
1260 }
1261 rt_se->on_rq = 1;
1262
1263 inc_rt_tasks(rt_se, rt_rq);
1264}
1265
1266static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1267{
1268 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1269 struct rt_prio_array *array = &rt_rq->active;
1270
1271 if (move_entity(flags)) {
1272 WARN_ON_ONCE(!rt_se->on_list);
1273 __delist_rt_entity(rt_se, array);
1274 }
1275 rt_se->on_rq = 0;
1276
1277 dec_rt_tasks(rt_se, rt_rq);
1278}
1279
1280
1281
1282
1283
1284static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1285{
1286 struct sched_rt_entity *back = NULL;
1287
1288 for_each_sched_rt_entity(rt_se) {
1289 rt_se->back = back;
1290 back = rt_se;
1291 }
1292
1293 dequeue_top_rt_rq(rt_rq_of_se(back));
1294
1295 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1296 if (on_rt_rq(rt_se))
1297 __dequeue_rt_entity(rt_se, flags);
1298 }
1299}
1300
1301static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1302{
1303 struct rq *rq = rq_of_rt_se(rt_se);
1304
1305 dequeue_rt_stack(rt_se, flags);
1306 for_each_sched_rt_entity(rt_se)
1307 __enqueue_rt_entity(rt_se, flags);
1308 enqueue_top_rt_rq(&rq->rt);
1309}
1310
1311static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1312{
1313 struct rq *rq = rq_of_rt_se(rt_se);
1314
1315 dequeue_rt_stack(rt_se, flags);
1316
1317 for_each_sched_rt_entity(rt_se) {
1318 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1319
1320 if (rt_rq && rt_rq->rt_nr_running)
1321 __enqueue_rt_entity(rt_se, flags);
1322 }
1323 enqueue_top_rt_rq(&rq->rt);
1324}
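
/*
 * With group scheduling, enqueueing or dequeueing an entity may affect
 * every level of its hierarchy, so both paths above first detach the
 * whole stack top-down (dequeue_rt_stack()) and then rebuild it:
 * enqueue_rt_entity() re-adds every level, while dequeue_rt_entity()
 * only re-adds group entities that still have runnable children.
 */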
1325
/*
 * Adding/removing a task to/from a priority array:
 */
1329static void
1330enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1331{
1332 struct sched_rt_entity *rt_se = &p->rt;
1333
1334 if (flags & ENQUEUE_WAKEUP)
1335 rt_se->timeout = 0;
1336
1337 enqueue_rt_entity(rt_se, flags);
1338
1339 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1340 enqueue_pushable_task(rq, p);
1341}
1342
1343static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1344{
1345 struct sched_rt_entity *rt_se = &p->rt;
1346
1347 update_curr_rt(rq);
1348 dequeue_rt_entity(rt_se, flags);
1349
1350 dequeue_pushable_task(rq, p);
1351}
1352
1353
1354
1355
1356
1357static void
1358requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1359{
1360 if (on_rt_rq(rt_se)) {
1361 struct rt_prio_array *array = &rt_rq->active;
1362 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1363
1364 if (head)
1365 list_move(&rt_se->run_list, queue);
1366 else
1367 list_move_tail(&rt_se->run_list, queue);
1368 }
1369}
1370
1371static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1372{
1373 struct sched_rt_entity *rt_se = &p->rt;
1374 struct rt_rq *rt_rq;
1375
1376 for_each_sched_rt_entity(rt_se) {
1377 rt_rq = rt_rq_of_se(rt_se);
1378 requeue_rt_entity(rt_rq, rt_se, head);
1379 }
1380}
1381
1382static void yield_task_rt(struct rq *rq)
1383{
1384 requeue_task_rt(rq, rq->curr, 0);
1385}
1386
1387#ifdef CONFIG_SMP
1388static int find_lowest_rq(struct task_struct *task);
1389
1390static int
1391select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1392{
1393 struct task_struct *curr;
1394 struct rq *rq;
1395
1396
1397 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1398 goto out;
1399
1400 rq = cpu_rq(cpu);
1401
1402 rcu_read_lock();
1403 curr = READ_ONCE(rq->curr);
	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps
	 * the next time it wakes up, it won't be pushed off again.
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away.
	 *
	 * This test is optimistic; if we get it wrong the load-balancer
	 * will have to sort it out.
	 */
1427 if (curr && unlikely(rt_task(curr)) &&
1428 (curr->nr_cpus_allowed < 2 ||
1429 curr->prio <= p->prio)) {
1430 int target = find_lowest_rq(p);
1431
		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
1436 if (target != -1 &&
1437 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1438 cpu = target;
1439 }
1440 rcu_read_unlock();
1441
1442out:
1443 return cpu;
1444}
1445
1446static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1447{
1448
1449
1450
1451
1452 if (rq->curr->nr_cpus_allowed == 1 ||
1453 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1454 return;
1455
1456
1457
1458
1459
1460 if (p->nr_cpus_allowed != 1
1461 && cpupri_find(&rq->rd->cpupri, p, NULL))
1462 return;
1463
1464
1465
1466
1467
1468
1469 requeue_task_rt(rq, p, 1);
1470 resched_curr(rq);
1471}
1472
1473#endif
1474
1475
1476
1477
1478static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1479{
1480 if (p->prio < rq->curr->prio) {
1481 resched_curr(rq);
1482 return;
1483 }
1484
1485#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu.  If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
1498 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1499 check_preempt_equal_prio(rq, p);
1500#endif
1501}
1502
1503static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1504 struct rt_rq *rt_rq)
1505{
1506 struct rt_prio_array *array = &rt_rq->active;
1507 struct sched_rt_entity *next = NULL;
1508 struct list_head *queue;
1509 int idx;
1510
1511 idx = sched_find_first_bit(array->bitmap);
1512 BUG_ON(idx >= MAX_RT_PRIO);
1513
1514 queue = array->queue + idx;
1515 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1516
1517 return next;
1518}
1519
1520static struct task_struct *_pick_next_task_rt(struct rq *rq)
1521{
1522 struct sched_rt_entity *rt_se;
1523 struct task_struct *p;
1524 struct rt_rq *rt_rq = &rq->rt;
1525
1526 do {
1527 rt_se = pick_next_rt_entity(rq, rt_rq);
1528 BUG_ON(!rt_se);
1529 rt_rq = group_rt_rq(rt_se);
1530 } while (rt_rq);
1531
1532 p = rt_task_of(rt_se);
1533 p->se.exec_start = rq_clock_task(rq);
1534
1535 return p;
1536}
1537
1538static struct task_struct *
1539pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1540{
1541 struct task_struct *p;
1542 struct rt_rq *rt_rq = &rq->rt;
1543
1544 if (need_pull_rt_task(rq, prev)) {
1545
1546
1547
1548
1549
1550
1551 rq_unpin_lock(rq, rf);
1552 pull_rt_task(rq);
1553 rq_repin_lock(rq, rf);
1554
1555
1556
1557
1558
1559 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1560 rq->dl.dl_nr_running))
1561 return RETRY_TASK;
1562 }
1563
1564
1565
1566
1567
1568 if (prev->sched_class == &rt_sched_class)
1569 update_curr_rt(rq);
1570
1571 if (!rt_rq->rt_queued)
1572 return NULL;
1573
1574 put_prev_task(rq, prev);
1575
1576 p = _pick_next_task_rt(rq);
1577
1578
1579 dequeue_pushable_task(rq, p);
1580
1581 queue_push_tasks(rq);
1582
1583 return p;
1584}
1585
1586static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1587{
1588 update_curr_rt(rq);
1589
1590
1591
1592
1593
1594 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1595 enqueue_pushable_task(rq, p);
1596}
1597
1598#ifdef CONFIG_SMP
1599
1600
1601#define RT_MAX_TRIES 3
1602
1603static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1604{
1605 if (!task_running(rq, p) &&
1606 cpumask_test_cpu(cpu, &p->cpus_allowed))
1607 return 1;
1608 return 0;
1609}
1610
1611
1612
1613
1614
1615static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1616{
1617 struct plist_head *head = &rq->rt.pushable_tasks;
1618 struct task_struct *p;
1619
1620 if (!has_pushable_tasks(rq))
1621 return NULL;
1622
1623 plist_for_each_entry(p, head, pushable_tasks) {
1624 if (pick_rt_task(rq, p, cpu))
1625 return p;
1626 }
1627
1628 return NULL;
1629}
1630
1631static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1632
1633static int find_lowest_rq(struct task_struct *task)
1634{
1635 struct sched_domain *sd;
1636 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1637 int this_cpu = smp_processor_id();
1638 int cpu = task_cpu(task);
1639
1640
1641 if (unlikely(!lowest_mask))
1642 return -1;
1643
1644 if (task->nr_cpus_allowed == 1)
1645 return -1;
1646
1647 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1648 return -1;
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658 if (cpumask_test_cpu(cpu, lowest_mask))
1659 return cpu;
1660
1661
1662
1663
1664
1665 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1666 this_cpu = -1;
1667
1668 rcu_read_lock();
1669 for_each_domain(cpu, sd) {
1670 if (sd->flags & SD_WAKE_AFFINE) {
1671 int best_cpu;
1672
1673
1674
1675
1676
1677 if (this_cpu != -1 &&
1678 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1679 rcu_read_unlock();
1680 return this_cpu;
1681 }
1682
1683 best_cpu = cpumask_first_and(lowest_mask,
1684 sched_domain_span(sd));
1685 if (best_cpu < nr_cpu_ids) {
1686 rcu_read_unlock();
1687 return best_cpu;
1688 }
1689 }
1690 }
1691 rcu_read_unlock();
1692
1693
1694
1695
1696
1697
1698 if (this_cpu != -1)
1699 return this_cpu;
1700
1701 cpu = cpumask_any(lowest_mask);
1702 if (cpu < nr_cpu_ids)
1703 return cpu;
1704 return -1;
1705}
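
/*
 * Preference order used above: keep the task on task_cpu() if that CPU is
 * already in the lowest-priority mask; otherwise prefer the waking CPU
 * (this_cpu) or the first mask CPU that shares an SD_WAKE_AFFINE domain
 * with task_cpu(), to limit cache and NUMA penalties; failing that,
 * this_cpu itself, then any CPU from the mask, and finally -1 so the
 * caller leaves the task where it is.
 */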
1706
/* Will lock the rq it finds */
1708static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1709{
1710 struct rq *lowest_rq = NULL;
1711 int tries;
1712 int cpu;
1713
1714 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1715 cpu = find_lowest_rq(task);
1716
1717 if ((cpu == -1) || (cpu == rq->cpu))
1718 break;
1719
1720 lowest_rq = cpu_rq(cpu);
1721
1722 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1723
1724
1725
1726
1727
1728 lowest_rq = NULL;
1729 break;
1730 }
1731
1732
1733 if (double_lock_balance(rq, lowest_rq)) {
1734
1735
1736
1737
1738
1739
1740 if (unlikely(task_rq(task) != rq ||
1741 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
1742 task_running(rq, task) ||
1743 !rt_task(task) ||
1744 !task_on_rq_queued(task))) {
1745
1746 double_unlock_balance(rq, lowest_rq);
1747 lowest_rq = NULL;
1748 break;
1749 }
1750 }
1751
1752
1753 if (lowest_rq->rt.highest_prio.curr > task->prio)
1754 break;
1755
1756
1757 double_unlock_balance(rq, lowest_rq);
1758 lowest_rq = NULL;
1759 }
1760
1761 return lowest_rq;
1762}
1763
1764static struct task_struct *pick_next_pushable_task(struct rq *rq)
1765{
1766 struct task_struct *p;
1767
1768 if (!has_pushable_tasks(rq))
1769 return NULL;
1770
1771 p = plist_first_entry(&rq->rt.pushable_tasks,
1772 struct task_struct, pushable_tasks);
1773
1774 BUG_ON(rq->cpu != task_cpu(p));
1775 BUG_ON(task_current(rq, p));
1776 BUG_ON(p->nr_cpus_allowed <= 1);
1777
1778 BUG_ON(!task_on_rq_queued(p));
1779 BUG_ON(!rt_task(p));
1780
1781 return p;
1782}
1783
1784
1785
1786
1787
1788
1789static int push_rt_task(struct rq *rq)
1790{
1791 struct task_struct *next_task;
1792 struct rq *lowest_rq;
1793 int ret = 0;
1794
1795 if (!rq->rt.overloaded)
1796 return 0;
1797
1798 next_task = pick_next_pushable_task(rq);
1799 if (!next_task)
1800 return 0;
1801
1802retry:
1803 if (unlikely(next_task == rq->curr)) {
1804 WARN_ON(1);
1805 return 0;
1806 }
1807
1808
1809
1810
1811
1812
1813 if (unlikely(next_task->prio < rq->curr->prio)) {
1814 resched_curr(rq);
1815 return 0;
1816 }
1817
1818
1819 get_task_struct(next_task);
1820
1821
1822 lowest_rq = find_lock_lowest_rq(next_task, rq);
1823 if (!lowest_rq) {
1824 struct task_struct *task;
1825
1826
1827
1828
1829
1830
1831
1832
1833 task = pick_next_pushable_task(rq);
1834 if (task == next_task) {
1835
1836
1837
1838
1839
1840
1841 goto out;
1842 }
1843
1844 if (!task)
1845
1846 goto out;
1847
1848
1849
1850
1851 put_task_struct(next_task);
1852 next_task = task;
1853 goto retry;
1854 }
1855
1856 deactivate_task(rq, next_task, 0);
1857 set_task_cpu(next_task, lowest_rq->cpu);
1858 activate_task(lowest_rq, next_task, 0);
1859 ret = 1;
1860
1861 resched_curr(lowest_rq);
1862
1863 double_unlock_balance(rq, lowest_rq);
1864
1865out:
1866 put_task_struct(next_task);
1867
1868 return ret;
1869}
1870
1871static void push_rt_tasks(struct rq *rq)
1872{
1873
1874 while (push_rt_task(rq))
1875 ;
1876}
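
/*
 * push_rt_task() implements the "push" side of RT balancing: take the
 * highest-priority task from this runqueue's pushable list and move it
 * to a runqueue whose current highest RT priority is lower, so the task
 * can run there right away (found and locked by find_lock_lowest_rq()).
 * push_rt_tasks() simply repeats this until no further push succeeds.
 */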
1877
1878#ifdef HAVE_RT_PUSH_IPI
/*
 * The search for the next cpu always starts at rq->cpu and ends
 * when we reach rq->cpu again. It will never return rq->cpu.
 * This returns the next cpu to check, or nr_cpu_ids if the loop
 * is complete.
 *
 * rq->rt.push_cpu holds the last cpu returned by this function,
 * or if this is the first instance, it must hold rq->cpu.
 */
1888static int rto_next_cpu(struct rq *rq)
1889{
1890 int prev_cpu = rq->rt.push_cpu;
1891 int cpu;
1892
1893 cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
1894
1895
1896
1897
1898
1899
1900 if (prev_cpu < rq->cpu) {
1901 if (cpu >= rq->cpu)
1902 return nr_cpu_ids;
1903
1904 } else if (cpu >= nr_cpu_ids) {
1905
1906
1907
1908
1909
1910 cpu = cpumask_first(rq->rd->rto_mask);
1911 if (cpu >= rq->cpu)
1912 return nr_cpu_ids;
1913 }
1914 rq->rt.push_cpu = cpu;
1915
1916
1917 return cpu;
1918}
1919
1920static int find_next_push_cpu(struct rq *rq)
1921{
1922 struct rq *next_rq;
1923 int cpu;
1924
1925 while (1) {
1926 cpu = rto_next_cpu(rq);
1927 if (cpu >= nr_cpu_ids)
1928 break;
1929 next_rq = cpu_rq(cpu);
1930
1931
1932 if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
1933 break;
1934 }
1935
1936 return cpu;
1937}
1938
1939#define RT_PUSH_IPI_EXECUTING 1
1940#define RT_PUSH_IPI_RESTART 2
1941
/*
 * When a CPU lowers its priority it would normally scan all overloaded
 * runqueues and try to pull an RT task from each of them.  On machines
 * with many CPUs that scan can cause heavy contention on the overloaded
 * CPUs' rq->lock.  With RT_PUSH_IPI the puller instead sends an irq_work
 * IPI to the first overloaded CPU; the IPI handler pushes tasks from that
 * CPU and, if more overloaded CPUs remain, forwards the IPI to the next
 * one, walking rto_mask one CPU at a time until the chain wraps back
 * around to the originating CPU.
 *
 * push_flags on the originating rq tracks the state of the chain:
 * RT_PUSH_IPI_EXECUTING means an IPI chain is already in flight, and
 * RT_PUSH_IPI_RESTART asks the in-flight chain to start over from the
 * source CPU because another pull request arrived while the chain was
 * running.  push_cpu records how far around rto_mask the chain has
 * travelled.
 */
2023static void tell_cpu_to_push(struct rq *rq)
2024{
2025 int cpu;
2026
2027 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
2028 raw_spin_lock(&rq->rt.push_lock);
2029
2030 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
2031
2032
2033
2034
2035 rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
2036 raw_spin_unlock(&rq->rt.push_lock);
2037 return;
2038 }
2039 raw_spin_unlock(&rq->rt.push_lock);
2040 }
2041
2042
2043
2044 rq->rt.push_cpu = rq->cpu;
2045 cpu = find_next_push_cpu(rq);
2046 if (cpu >= nr_cpu_ids)
2047 return;
2048
2049 rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
2050
2051 irq_work_queue_on(&rq->rt.push_work, cpu);
2052}
2053
2054
2055static void try_to_push_tasks(void *arg)
2056{
2057 struct rt_rq *rt_rq = arg;
2058 struct rq *rq, *src_rq;
2059 int this_cpu;
2060 int cpu;
2061
2062 this_cpu = rt_rq->push_cpu;
2063
2064
2065 BUG_ON(this_cpu != smp_processor_id());
2066
2067 rq = cpu_rq(this_cpu);
2068 src_rq = rq_of_rt_rq(rt_rq);
2069
2070again:
2071 if (has_pushable_tasks(rq)) {
2072 raw_spin_lock(&rq->lock);
2073 push_rt_task(rq);
2074 raw_spin_unlock(&rq->lock);
2075 }
2076
2077
2078 raw_spin_lock(&rt_rq->push_lock);
2079
2080
2081
2082
2083 if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
2084 rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
2085 rt_rq->push_cpu = src_rq->cpu;
2086 }
2087
2088 cpu = find_next_push_cpu(src_rq);
2089
2090 if (cpu >= nr_cpu_ids)
2091 rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
2092 raw_spin_unlock(&rt_rq->push_lock);
2093
2094 if (cpu >= nr_cpu_ids)
2095 return;
2096
2097
2098
2099
2100
2101
2102 if (unlikely(cpu == rq->cpu))
2103 goto again;
2104
2105
2106 irq_work_queue_on(&rt_rq->push_work, cpu);
2107}
2108
2109static void push_irq_work_func(struct irq_work *work)
2110{
2111 struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
2112
2113 try_to_push_tasks(rt_rq);
2114}
2115#endif
2116
2117static void pull_rt_task(struct rq *this_rq)
2118{
2119 int this_cpu = this_rq->cpu, cpu;
2120 bool resched = false;
2121 struct task_struct *p;
2122 struct rq *src_rq;
2123
2124 if (likely(!rt_overloaded(this_rq)))
2125 return;
2126
2127
2128
2129
2130
2131 smp_rmb();
2132
2133#ifdef HAVE_RT_PUSH_IPI
2134 if (sched_feat(RT_PUSH_IPI)) {
2135 tell_cpu_to_push(this_rq);
2136 return;
2137 }
2138#endif
2139
2140 for_each_cpu(cpu, this_rq->rd->rto_mask) {
2141 if (this_cpu == cpu)
2142 continue;
2143
2144 src_rq = cpu_rq(cpu);
2145
2146
2147
2148
2149
2150
2151
2152
2153 if (src_rq->rt.highest_prio.next >=
2154 this_rq->rt.highest_prio.curr)
2155 continue;
2156
2157
2158
2159
2160
2161
2162 double_lock_balance(this_rq, src_rq);
2163
2164
2165
2166
2167
2168 p = pick_highest_pushable_task(src_rq, this_cpu);
2169
2170
2171
2172
2173
2174 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2175 WARN_ON(p == src_rq->curr);
2176 WARN_ON(!task_on_rq_queued(p));
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186 if (p->prio < src_rq->curr->prio)
2187 goto skip;
2188
2189 resched = true;
2190
2191 deactivate_task(src_rq, p, 0);
2192 set_task_cpu(p, this_cpu);
2193 activate_task(this_rq, p, 0);
2194
2195
2196
2197
2198
2199
2200 }
2201skip:
2202 double_unlock_balance(this_rq, src_rq);
2203 }
2204
2205 if (resched)
2206 resched_curr(this_rq);
2207}
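
/*
 * pull_rt_task() is the "pull" side: it runs when this runqueue is about
 * to schedule something of lower priority, scans the root domain's
 * overloaded CPUs and steals the best pushable task that beats our next
 * priority.  When RT_PUSH_IPI is enabled the scan is replaced by the IPI
 * chain above, which asks the overloaded CPUs to push instead and avoids
 * a storm of remote rq->lock acquisitions on large machines.
 */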
2208
/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
2213static void task_woken_rt(struct rq *rq, struct task_struct *p)
2214{
2215 if (!task_running(rq, p) &&
2216 !test_tsk_need_resched(rq->curr) &&
2217 p->nr_cpus_allowed > 1 &&
2218 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2219 (rq->curr->nr_cpus_allowed < 2 ||
2220 rq->curr->prio <= p->prio))
2221 push_rt_tasks(rq);
2222}
2223
2224
2225static void rq_online_rt(struct rq *rq)
2226{
2227 if (rq->rt.overloaded)
2228 rt_set_overload(rq);
2229
2230 __enable_runtime(rq);
2231
2232 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2233}
2234
2235
2236static void rq_offline_rt(struct rq *rq)
2237{
2238 if (rq->rt.overloaded)
2239 rt_clear_overload(rq);
2240
2241 __disable_runtime(rq);
2242
2243 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2244}
2245
2246
2247
2248
2249
2250static void switched_from_rt(struct rq *rq, struct task_struct *p)
2251{
2252
2253
2254
2255
2256
2257
2258
2259 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2260 return;
2261
2262 queue_pull_task(rq);
2263}
2264
2265void __init init_sched_rt_class(void)
2266{
2267 unsigned int i;
2268
2269 for_each_possible_cpu(i) {
2270 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2271 GFP_KERNEL, cpu_to_node(i));
2272 }
2273}
2274#endif
2275
2276
2277
2278
2279
2280
2281static void switched_to_rt(struct rq *rq, struct task_struct *p)
2282{
2283
2284
2285
2286
2287
2288
2289
2290 if (task_on_rq_queued(p) && rq->curr != p) {
2291#ifdef CONFIG_SMP
2292 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2293 queue_push_tasks(rq);
2294#endif
2295 if (p->prio < rq->curr->prio)
2296 resched_curr(rq);
2297 }
2298}
2299
2300
2301
2302
2303
2304static void
2305prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2306{
2307 if (!task_on_rq_queued(p))
2308 return;
2309
2310 if (rq->curr == p) {
2311#ifdef CONFIG_SMP
2312
2313
2314
2315
2316 if (oldprio < p->prio)
2317 queue_pull_task(rq);
2318
2319
2320
2321
2322
2323 if (p->prio > rq->rt.highest_prio.curr)
2324 resched_curr(rq);
2325#else
2326
2327 if (oldprio < p->prio)
2328 resched_curr(rq);
2329#endif
2330 } else {
2331
2332
2333
2334
2335
2336 if (p->prio < rq->curr->prio)
2337 resched_curr(rq);
2338 }
2339}
2340
2341#ifdef CONFIG_POSIX_TIMERS
2342static void watchdog(struct rq *rq, struct task_struct *p)
2343{
2344 unsigned long soft, hard;
2345
2346
2347 soft = task_rlimit(p, RLIMIT_RTTIME);
2348 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2349
2350 if (soft != RLIM_INFINITY) {
2351 unsigned long next;
2352
2353 if (p->rt.watchdog_stamp != jiffies) {
2354 p->rt.timeout++;
2355 p->rt.watchdog_stamp = jiffies;
2356 }
2357
2358 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2359 if (p->rt.timeout > next)
2360 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2361 }
2362}
2363#else
2364static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2365#endif
2366
2367static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2368{
2369 struct sched_rt_entity *rt_se = &p->rt;
2370
2371 update_curr_rt(rq);
2372
2373 watchdog(rq, p);
2374
2375
2376
2377
2378
2379 if (p->policy != SCHED_RR)
2380 return;
2381
2382 if (--p->rt.time_slice)
2383 return;
2384
2385 p->rt.time_slice = sched_rr_timeslice;
2386
2387
2388
2389
2390
2391 for_each_sched_rt_entity(rt_se) {
2392 if (rt_se->run_list.prev != rt_se->run_list.next) {
2393 requeue_task_rt(rq, p, 0);
2394 resched_curr(rq);
2395 return;
2396 }
2397 }
2398}
2399
2400static void set_curr_task_rt(struct rq *rq)
2401{
2402 struct task_struct *p = rq->curr;
2403
2404 p->se.exec_start = rq_clock_task(rq);
2405
2406
2407 dequeue_pushable_task(rq, p);
2408}
2409
2410static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2411{
2412
2413
2414
2415 if (task->policy == SCHED_RR)
2416 return sched_rr_timeslice;
2417 else
2418 return 0;
2419}
2420
2421const struct sched_class rt_sched_class = {
2422 .next = &fair_sched_class,
2423 .enqueue_task = enqueue_task_rt,
2424 .dequeue_task = dequeue_task_rt,
2425 .yield_task = yield_task_rt,
2426
2427 .check_preempt_curr = check_preempt_curr_rt,
2428
2429 .pick_next_task = pick_next_task_rt,
2430 .put_prev_task = put_prev_task_rt,
2431
2432#ifdef CONFIG_SMP
2433 .select_task_rq = select_task_rq_rt,
2434
2435 .set_cpus_allowed = set_cpus_allowed_common,
2436 .rq_online = rq_online_rt,
2437 .rq_offline = rq_offline_rt,
2438 .task_woken = task_woken_rt,
2439 .switched_from = switched_from_rt,
2440#endif
2441
2442 .set_curr_task = set_curr_task_rt,
2443 .task_tick = task_tick_rt,
2444
2445 .get_rr_interval = get_rr_interval_rt,
2446
2447 .prio_changed = prio_changed_rt,
2448 .switched_to = switched_to_rt,
2449
2450 .update_curr = update_curr_rt,
2451};
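
/*
 * Note the class ordering: rt_sched_class sits above fair_sched_class in
 * the sched_class list (via .next), so a runnable RT task is always picked
 * before any CFS task on the same runqueue; only the stop and deadline
 * classes take precedence over it.
 */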
2452
2453#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
static DEFINE_MUTEX(rt_constraints_mutex);

/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
{
	struct task_struct *g, *p;

	/*
	 * Autogroups do not have RT tasks; see autogroup_create().
	 */
	if (task_group_is_autogroup(tg))
		return 0;
2469
2470 for_each_process_thread(g, p) {
2471 if (rt_task(p) && task_group(p) == tg)
2472 return 1;
2473 }
2474
2475 return 0;
2476}
2477
2478struct rt_schedulable_data {
2479 struct task_group *tg;
2480 u64 rt_period;
2481 u64 rt_runtime;
2482};
2483
2484static int tg_rt_schedulable(struct task_group *tg, void *data)
2485{
2486 struct rt_schedulable_data *d = data;
2487 struct task_group *child;
2488 unsigned long total, sum = 0;
2489 u64 period, runtime;
2490
2491 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2492 runtime = tg->rt_bandwidth.rt_runtime;
2493
2494 if (tg == d->tg) {
2495 period = d->rt_period;
2496 runtime = d->rt_runtime;
2497 }
2498
2499
2500
2501
2502 if (runtime > period && runtime != RUNTIME_INF)
2503 return -EINVAL;
2504
2505
2506
2507
2508 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2509 return -EBUSY;
2510
2511 total = to_ratio(period, runtime);
2512
2513
2514
2515
2516 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2517 return -EINVAL;
2518
2519
2520
2521
2522 list_for_each_entry_rcu(child, &tg->children, siblings) {
2523 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2524 runtime = child->rt_bandwidth.rt_runtime;
2525
2526 if (child == d->tg) {
2527 period = d->rt_period;
2528 runtime = d->rt_runtime;
2529 }
2530
2531 sum += to_ratio(period, runtime);
2532 }
2533
2534 if (sum > total)
2535 return -EINVAL;
2536
2537 return 0;
2538}
2539
2540static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2541{
2542 int ret;
2543
2544 struct rt_schedulable_data data = {
2545 .tg = tg,
2546 .rt_period = period,
2547 .rt_runtime = runtime,
2548 };
2549
2550 rcu_read_lock();
2551 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2552 rcu_read_unlock();
2553
2554 return ret;
2555}
2556
2557static int tg_set_rt_bandwidth(struct task_group *tg,
2558 u64 rt_period, u64 rt_runtime)
2559{
2560 int i, err = 0;
2561
2562
2563
2564
2565
2566 if (tg == &root_task_group && rt_runtime == 0)
2567 return -EINVAL;
2568
2569
2570 if (rt_period == 0)
2571 return -EINVAL;
2572
2573 mutex_lock(&rt_constraints_mutex);
2574 read_lock(&tasklist_lock);
2575 err = __rt_schedulable(tg, rt_period, rt_runtime);
2576 if (err)
2577 goto unlock;
2578
2579 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2580 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2581 tg->rt_bandwidth.rt_runtime = rt_runtime;
2582
2583 for_each_possible_cpu(i) {
2584 struct rt_rq *rt_rq = tg->rt_rq[i];
2585
2586 raw_spin_lock(&rt_rq->rt_runtime_lock);
2587 rt_rq->rt_runtime = rt_runtime;
2588 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2589 }
2590 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2591unlock:
2592 read_unlock(&tasklist_lock);
2593 mutex_unlock(&rt_constraints_mutex);
2594
2595 return err;
2596}
2597
2598int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2599{
2600 u64 rt_runtime, rt_period;
2601
2602 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2603 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2604 if (rt_runtime_us < 0)
2605 rt_runtime = RUNTIME_INF;
2606
2607 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2608}
2609
2610long sched_group_rt_runtime(struct task_group *tg)
2611{
2612 u64 rt_runtime_us;
2613
2614 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2615 return -1;
2616
2617 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2618 do_div(rt_runtime_us, NSEC_PER_USEC);
2619 return rt_runtime_us;
2620}
2621
2622int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2623{
2624 u64 rt_runtime, rt_period;
2625
2626 rt_period = rt_period_us * NSEC_PER_USEC;
2627 rt_runtime = tg->rt_bandwidth.rt_runtime;
2628
2629 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2630}
2631
2632long sched_group_rt_period(struct task_group *tg)
2633{
2634 u64 rt_period_us;
2635
2636 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2637 do_div(rt_period_us, NSEC_PER_USEC);
2638 return rt_period_us;
2639}
2640
2641static int sched_rt_global_constraints(void)
2642{
2643 int ret = 0;
2644
2645 mutex_lock(&rt_constraints_mutex);
2646 read_lock(&tasklist_lock);
2647 ret = __rt_schedulable(NULL, 0, 0);
2648 read_unlock(&tasklist_lock);
2649 mutex_unlock(&rt_constraints_mutex);
2650
2651 return ret;
2652}
2653
2654int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2655{
2656
2657 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2658 return 0;
2659
2660 return 1;
2661}
2662
2663#else
2664static int sched_rt_global_constraints(void)
2665{
2666 unsigned long flags;
2667 int i;
2668
2669 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2670 for_each_possible_cpu(i) {
2671 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2672
2673 raw_spin_lock(&rt_rq->rt_runtime_lock);
2674 rt_rq->rt_runtime = global_rt_runtime();
2675 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2676 }
2677 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2678
2679 return 0;
2680}
2681#endif
2682
2683static int sched_rt_global_validate(void)
2684{
2685 if (sysctl_sched_rt_period <= 0)
2686 return -EINVAL;
2687
2688 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2689 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2690 return -EINVAL;
2691
2692 return 0;
2693}
2694
2695static void sched_rt_do_global(void)
2696{
2697 def_rt_bandwidth.rt_runtime = global_rt_runtime();
2698 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2699}
2700
2701int sched_rt_handler(struct ctl_table *table, int write,
2702 void __user *buffer, size_t *lenp,
2703 loff_t *ppos)
2704{
2705 int old_period, old_runtime;
2706 static DEFINE_MUTEX(mutex);
2707 int ret;
2708
2709 mutex_lock(&mutex);
2710 old_period = sysctl_sched_rt_period;
2711 old_runtime = sysctl_sched_rt_runtime;
2712
2713 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2714
2715 if (!ret && write) {
2716 ret = sched_rt_global_validate();
2717 if (ret)
2718 goto undo;
2719
2720 ret = sched_dl_global_validate();
2721 if (ret)
2722 goto undo;
2723
2724 ret = sched_rt_global_constraints();
2725 if (ret)
2726 goto undo;
2727
2728 sched_rt_do_global();
2729 sched_dl_do_global();
2730 }
2731 if (0) {
2732undo:
2733 sysctl_sched_rt_period = old_period;
2734 sysctl_sched_rt_runtime = old_runtime;
2735 }
2736 mutex_unlock(&mutex);
2737
2738 return ret;
2739}
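
/*
 * sched_rt_handler() above guards the sched_rt_period_us and
 * sched_rt_runtime_us sysctls: the new values are validated (period > 0,
 * runtime <= period or -1), checked against both the RT and deadline
 * bandwidth invariants, and only then propagated to def_rt_bandwidth; on
 * any failure the old values are restored so a rejected write leaves the
 * system untouched.
 */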
2740
2741int sched_rr_handler(struct ctl_table *table, int write,
2742 void __user *buffer, size_t *lenp,
2743 loff_t *ppos)
2744{
2745 int ret;
2746 static DEFINE_MUTEX(mutex);
2747
2748 mutex_lock(&mutex);
2749 ret = proc_dointvec(table, write, buffer, lenp, ppos);
	/*
	 * Make sure that internally we keep jiffies.
	 * Also, writing zero resets the timeslice to default:
	 */
2754 if (!ret && write) {
2755 sched_rr_timeslice =
2756 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2757 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2758 }
2759 mutex_unlock(&mutex);
2760 return ret;
2761}
2762
2763#ifdef CONFIG_SCHED_DEBUG
2764extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2765
2766void print_rt_stats(struct seq_file *m, int cpu)
2767{
2768 rt_rq_iter_t iter;
2769 struct rt_rq *rt_rq;
2770
2771 rcu_read_lock();
2772 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2773 print_rt_rq(m, cpu, rt_rq);
2774 rcu_read_unlock();
2775}
2776#endif
2777