linux/kernel/sched/rt.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   4 * policies)
   5 */
   6#include "sched.h"
   7
   8#include "pelt.h"
   9
  10int sched_rr_timeslice = RR_TIMESLICE;
  11int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
  12/* More than 4 hours if BW_SHIFT equals 20. */
  13static const u64 max_rt_runtime = MAX_BW;
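/*
 * Unit note (an editorial sketch, not new kernel code): sched_rr_timeslice is
 * kept in jiffies, while sysctl_sched_rr_timeslice mirrors it in milliseconds
 * for the sched_rr_timeslice_ms sysctl. Assuming the default RR_TIMESLICE of
 * 100 ms worth of jiffies (100 * HZ / 1000):
 *
 *   HZ == 250  ->  RR_TIMESLICE == 25 jiffies
 *              ->  sysctl value == (1000 / 250) * 25 == 100 ms
 */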
  14
  15static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  16
  17struct rt_bandwidth def_rt_bandwidth;
  18
  19static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  20{
  21        struct rt_bandwidth *rt_b =
  22                container_of(timer, struct rt_bandwidth, rt_period_timer);
  23        int idle = 0;
  24        int overrun;
  25
  26        raw_spin_lock(&rt_b->rt_runtime_lock);
  27        for (;;) {
  28                overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  29                if (!overrun)
  30                        break;
  31
  32                raw_spin_unlock(&rt_b->rt_runtime_lock);
  33                idle = do_sched_rt_period_timer(rt_b, overrun);
  34                raw_spin_lock(&rt_b->rt_runtime_lock);
  35        }
  36        if (idle)
  37                rt_b->rt_period_active = 0;
  38        raw_spin_unlock(&rt_b->rt_runtime_lock);
  39
  40        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  41}
  42
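/*
 * init_rt_bandwidth - set up an rt_bandwidth control structure.
 *
 * @period and @runtime are in nanoseconds. The replenishment hrtimer is only
 * armed later, from start_rt_bandwidth(), and is declared with
 * HRTIMER_MODE_REL_HARD so it expires in hard-irq context. See
 * alloc_rt_sched_group() below for a caller:
 *
 *   init_rt_bandwidth(&tg->rt_bandwidth,
 *                     ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 */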
  43void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  44{
  45        rt_b->rt_period = ns_to_ktime(period);
  46        rt_b->rt_runtime = runtime;
  47
  48        raw_spin_lock_init(&rt_b->rt_runtime_lock);
  49
  50        hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
  51                     HRTIMER_MODE_REL_HARD);
  52        rt_b->rt_period_timer.function = sched_rt_period_timer;
  53}
  54
  55static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  56{
  57        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  58                return;
  59
  60        raw_spin_lock(&rt_b->rt_runtime_lock);
  61        if (!rt_b->rt_period_active) {
  62                rt_b->rt_period_active = 1;
  63                /*
   64                 * SCHED_DEADLINE updates the bandwidth, as a runaway
  65                 * RT task with a DL task could hog a CPU. But DL does
  66                 * not reset the period. If a deadline task was running
  67                 * without an RT task running, it can cause RT tasks to
  68                 * throttle when they start up. Kick the timer right away
  69                 * to update the period.
  70                 */
  71                hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
  72                hrtimer_start_expires(&rt_b->rt_period_timer,
  73                                      HRTIMER_MODE_ABS_PINNED_HARD);
  74        }
  75        raw_spin_unlock(&rt_b->rt_runtime_lock);
  76}
  77
  78void init_rt_rq(struct rt_rq *rt_rq)
  79{
  80        struct rt_prio_array *array;
  81        int i;
  82
  83        array = &rt_rq->active;
  84        for (i = 0; i < MAX_RT_PRIO; i++) {
  85                INIT_LIST_HEAD(array->queue + i);
  86                __clear_bit(i, array->bitmap);
  87        }
  88        /* delimiter for bitsearch: */
  89        __set_bit(MAX_RT_PRIO, array->bitmap);
  90
  91#if defined CONFIG_SMP
  92        rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
  93        rt_rq->highest_prio.next = MAX_RT_PRIO-1;
  94        rt_rq->rt_nr_migratory = 0;
  95        rt_rq->overloaded = 0;
  96        plist_head_init(&rt_rq->pushable_tasks);
  97#endif /* CONFIG_SMP */
   98        /* We start in dequeued state, because no RT tasks are queued */
  99        rt_rq->rt_queued = 0;
 100
 101        rt_rq->rt_time = 0;
 102        rt_rq->rt_throttled = 0;
 103        rt_rq->rt_runtime = 0;
 104        raw_spin_lock_init(&rt_rq->rt_runtime_lock);
 105}
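/*
 * Layout sketch of the per-rq RT priority array initialized above (an
 * illustration only):
 *
 *   active.queue[0 .. MAX_RT_PRIO-1]   one FIFO list per priority level
 *   active.bitmap                      bit p set <=> queue[p] is non-empty;
 *                                      bit MAX_RT_PRIO stays set as the
 *                                      sentinel that sched_find_first_bit()
 *                                      stops on
 */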
 106
 107#ifdef CONFIG_RT_GROUP_SCHED
 108static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 109{
 110        hrtimer_cancel(&rt_b->rt_period_timer);
 111}
 112
 113#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 114
 115static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 116{
 117#ifdef CONFIG_SCHED_DEBUG
 118        WARN_ON_ONCE(!rt_entity_is_task(rt_se));
 119#endif
 120        return container_of(rt_se, struct task_struct, rt);
 121}
 122
 123static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 124{
 125        return rt_rq->rq;
 126}
 127
 128static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 129{
 130        return rt_se->rt_rq;
 131}
 132
 133static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 134{
 135        struct rt_rq *rt_rq = rt_se->rt_rq;
 136
 137        return rt_rq->rq;
 138}
 139
 140void free_rt_sched_group(struct task_group *tg)
 141{
 142        int i;
 143
 144        if (tg->rt_se)
 145                destroy_rt_bandwidth(&tg->rt_bandwidth);
 146
 147        for_each_possible_cpu(i) {
 148                if (tg->rt_rq)
 149                        kfree(tg->rt_rq[i]);
 150                if (tg->rt_se)
 151                        kfree(tg->rt_se[i]);
 152        }
 153
 154        kfree(tg->rt_rq);
 155        kfree(tg->rt_se);
 156}
 157
 158void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 159                struct sched_rt_entity *rt_se, int cpu,
 160                struct sched_rt_entity *parent)
 161{
 162        struct rq *rq = cpu_rq(cpu);
 163
 164        rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
 165        rt_rq->rt_nr_boosted = 0;
 166        rt_rq->rq = rq;
 167        rt_rq->tg = tg;
 168
 169        tg->rt_rq[cpu] = rt_rq;
 170        tg->rt_se[cpu] = rt_se;
 171
 172        if (!rt_se)
 173                return;
 174
 175        if (!parent)
 176                rt_se->rt_rq = &rq->rt;
 177        else
 178                rt_se->rt_rq = parent->my_q;
 179
 180        rt_se->my_q = rt_rq;
 181        rt_se->parent = parent;
 182        INIT_LIST_HEAD(&rt_se->run_list);
 183}
 184
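/*
 * Allocate the per-CPU rt_rq/rt_se arrays for @tg and wire each CPU's entity
 * into @parent's hierarchy via init_tg_rt_entry(). Returns 1 on success and 0
 * on failure; on failure the caller is expected to clean up through
 * free_rt_sched_group(), which tolerates partially populated arrays (note the
 * NULL checks there).
 */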
 185int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 186{
 187        struct rt_rq *rt_rq;
 188        struct sched_rt_entity *rt_se;
 189        int i;
 190
 191        tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
 192        if (!tg->rt_rq)
 193                goto err;
 194        tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
 195        if (!tg->rt_se)
 196                goto err;
 197
 198        init_rt_bandwidth(&tg->rt_bandwidth,
 199                        ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 200
 201        for_each_possible_cpu(i) {
 202                rt_rq = kzalloc_node(sizeof(struct rt_rq),
 203                                     GFP_KERNEL, cpu_to_node(i));
 204                if (!rt_rq)
 205                        goto err;
 206
 207                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 208                                     GFP_KERNEL, cpu_to_node(i));
 209                if (!rt_se)
 210                        goto err_free_rq;
 211
 212                init_rt_rq(rt_rq);
 213                rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 214                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 215        }
 216
 217        return 1;
 218
 219err_free_rq:
 220        kfree(rt_rq);
 221err:
 222        return 0;
 223}
 224
 225#else /* CONFIG_RT_GROUP_SCHED */
 226
 227#define rt_entity_is_task(rt_se) (1)
 228
 229static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 230{
 231        return container_of(rt_se, struct task_struct, rt);
 232}
 233
 234static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 235{
 236        return container_of(rt_rq, struct rq, rt);
 237}
 238
 239static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 240{
 241        struct task_struct *p = rt_task_of(rt_se);
 242
 243        return task_rq(p);
 244}
 245
 246static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 247{
 248        struct rq *rq = rq_of_rt_se(rt_se);
 249
 250        return &rq->rt;
 251}
 252
 253void free_rt_sched_group(struct task_group *tg) { }
 254
 255int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 256{
 257        return 1;
 258}
 259#endif /* CONFIG_RT_GROUP_SCHED */
 260
 261#ifdef CONFIG_SMP
 262
 263static void pull_rt_task(struct rq *this_rq);
 264
 265static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 266{
 267        /* Try to pull RT tasks here if we lower this rq's prio */
 268        return rq->online && rq->rt.highest_prio.curr > prev->prio;
 269}
 270
 271static inline int rt_overloaded(struct rq *rq)
 272{
 273        return atomic_read(&rq->rd->rto_count);
 274}
 275
 276static inline void rt_set_overload(struct rq *rq)
 277{
 278        if (!rq->online)
 279                return;
 280
 281        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 282        /*
 283         * Make sure the mask is visible before we set
 284         * the overload count. That is checked to determine
 285         * if we should look at the mask. It would be a shame
 286         * if we looked at the mask, but the mask was not
 287         * updated yet.
 288         *
 289         * Matched by the barrier in pull_rt_task().
 290         */
 291        smp_wmb();
 292        atomic_inc(&rq->rd->rto_count);
 293}
 294
 295static inline void rt_clear_overload(struct rq *rq)
 296{
 297        if (!rq->online)
 298                return;
 299
 300        /* the order here really doesn't matter */
 301        atomic_dec(&rq->rd->rto_count);
 302        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 303}
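/*
 * A sketch of the overload-tracking protocol (the reader side lives in
 * pull_rt_task() further down):
 *
 *   writer (rt_set_overload)             reader (pull side)
 *   -------------------------            ------------------------------
 *   cpumask_set_cpu(.., rto_mask)        if (!atomic_read(&rto_count))
 *   smp_wmb()                                    return;
 *   atomic_inc(&rto_count)               smp_rmb();
 *                                        scan rto_mask for source CPUs
 */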
 304
 305static void update_rt_migration(struct rt_rq *rt_rq)
 306{
 307        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
 308                if (!rt_rq->overloaded) {
 309                        rt_set_overload(rq_of_rt_rq(rt_rq));
 310                        rt_rq->overloaded = 1;
 311                }
 312        } else if (rt_rq->overloaded) {
 313                rt_clear_overload(rq_of_rt_rq(rt_rq));
 314                rt_rq->overloaded = 0;
 315        }
 316}
 317
 318static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 319{
 320        struct task_struct *p;
 321
 322        if (!rt_entity_is_task(rt_se))
 323                return;
 324
 325        p = rt_task_of(rt_se);
 326        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 327
 328        rt_rq->rt_nr_total++;
 329        if (p->nr_cpus_allowed > 1)
 330                rt_rq->rt_nr_migratory++;
 331
 332        update_rt_migration(rt_rq);
 333}
 334
 335static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 336{
 337        struct task_struct *p;
 338
 339        if (!rt_entity_is_task(rt_se))
 340                return;
 341
 342        p = rt_task_of(rt_se);
 343        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 344
 345        rt_rq->rt_nr_total--;
 346        if (p->nr_cpus_allowed > 1)
 347                rt_rq->rt_nr_migratory--;
 348
 349        update_rt_migration(rt_rq);
 350}
 351
 352static inline int has_pushable_tasks(struct rq *rq)
 353{
 354        return !plist_head_empty(&rq->rt.pushable_tasks);
 355}
 356
 357static DEFINE_PER_CPU(struct callback_head, rt_push_head);
 358static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 359
 360static void push_rt_tasks(struct rq *);
 361static void pull_rt_task(struct rq *);
 362
 363static inline void rt_queue_push_tasks(struct rq *rq)
 364{
 365        if (!has_pushable_tasks(rq))
 366                return;
 367
 368        queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 369}
 370
 371static inline void rt_queue_pull_task(struct rq *rq)
 372{
 373        queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 374}
 375
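/*
 * pushable_tasks is a priority-sorted plist keyed on p->prio (lower value
 * means higher priority), so plist_first_entry() always yields the best
 * candidate to push; rq->rt.highest_prio.next caches that candidate's
 * priority.
 */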
 376static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 377{
 378        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 379        plist_node_init(&p->pushable_tasks, p->prio);
 380        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 381
 382        /* Update the highest prio pushable task */
 383        if (p->prio < rq->rt.highest_prio.next)
 384                rq->rt.highest_prio.next = p->prio;
 385}
 386
 387static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 388{
 389        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 390
 391        /* Update the new highest prio pushable task */
 392        if (has_pushable_tasks(rq)) {
 393                p = plist_first_entry(&rq->rt.pushable_tasks,
 394                                      struct task_struct, pushable_tasks);
 395                rq->rt.highest_prio.next = p->prio;
 396        } else {
 397                rq->rt.highest_prio.next = MAX_RT_PRIO-1;
 398        }
 399}
 400
 401#else
 402
 403static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 404{
 405}
 406
 407static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 408{
 409}
 410
 411static inline
 412void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 413{
 414}
 415
 416static inline
 417void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 418{
 419}
 420
 421static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 422{
 423        return false;
 424}
 425
 426static inline void pull_rt_task(struct rq *this_rq)
 427{
 428}
 429
 430static inline void rt_queue_push_tasks(struct rq *rq)
 431{
 432}
 433#endif /* CONFIG_SMP */
 434
 435static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
 436static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 437
 438static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 439{
 440        return rt_se->on_rq;
 441}
 442
 443#ifdef CONFIG_UCLAMP_TASK
 444/*
 445 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 446 * settings.
 447 *
  448 * This check only matters on heterogeneous systems, where the uclamp_min value
  449 * can be higher than the capacity of a @cpu. On a non-heterogeneous system this
  450 * function will always return true.
 451 *
 452 * The function will return true if the capacity of the @cpu is >= the
 453 * uclamp_min and false otherwise.
 454 *
 455 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 456 * > uclamp_max.
 457 */
 458static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
 459{
 460        unsigned int min_cap;
 461        unsigned int max_cap;
 462        unsigned int cpu_cap;
 463
 464        /* Only heterogeneous systems can benefit from this check */
 465        if (!static_branch_unlikely(&sched_asym_cpucapacity))
 466                return true;
 467
 468        min_cap = uclamp_eff_value(p, UCLAMP_MIN);
 469        max_cap = uclamp_eff_value(p, UCLAMP_MAX);
 470
 471        cpu_cap = capacity_orig_of(cpu);
 472
 473        return cpu_cap >= min(min_cap, max_cap);
 474}
 475#else
 476static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
 477{
 478        return true;
 479}
 480#endif
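/*
 * Worked example for rt_task_fits_capacity() above, with hypothetical numbers
 * (capacities on the usual 0..1024 scale):
 *
 *   capacity_orig_of(little CPU) == 512
 *   uclamp_min == 768, uclamp_max == 1024  ->  min(768, 1024) = 768 > 512,
 *                                              the task does not fit
 *   uclamp_min == 768, uclamp_max == 256   ->  min(768, 256)  = 256 <= 512,
 *                                              uclamp_min is effectively
 *                                              clamped and the task fits
 */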
 481
 482#ifdef CONFIG_RT_GROUP_SCHED
 483
 484static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 485{
 486        if (!rt_rq->tg)
 487                return RUNTIME_INF;
 488
 489        return rt_rq->rt_runtime;
 490}
 491
 492static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 493{
 494        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 495}
 496
 497typedef struct task_group *rt_rq_iter_t;
 498
 499static inline struct task_group *next_task_group(struct task_group *tg)
 500{
 501        do {
 502                tg = list_entry_rcu(tg->list.next,
 503                        typeof(struct task_group), list);
 504        } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 505
 506        if (&tg->list == &task_groups)
 507                tg = NULL;
 508
 509        return tg;
 510}
 511
 512#define for_each_rt_rq(rt_rq, iter, rq)                                 \
 513        for (iter = container_of(&task_groups, typeof(*iter), list);    \
 514                (iter = next_task_group(iter)) &&                       \
 515                (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 516
 517#define for_each_sched_rt_entity(rt_se) \
 518        for (; rt_se; rt_se = rt_se->parent)
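/*
 * With RT_GROUP_SCHED the scheduling entities form a hierarchy: a task's
 * rt_se hangs off its group's rt_rq, whose owning group entity hangs off the
 * parent group, and so on up to the root. for_each_sched_rt_entity() simply
 * walks that parent chain from the given entity to the top.
 */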
 519
 520static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 521{
 522        return rt_se->my_q;
 523}
 524
 525static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 526static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 527
 528static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 529{
 530        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 531        struct rq *rq = rq_of_rt_rq(rt_rq);
 532        struct sched_rt_entity *rt_se;
 533
 534        int cpu = cpu_of(rq);
 535
 536        rt_se = rt_rq->tg->rt_se[cpu];
 537
 538        if (rt_rq->rt_nr_running) {
 539                if (!rt_se)
 540                        enqueue_top_rt_rq(rt_rq);
 541                else if (!on_rt_rq(rt_se))
 542                        enqueue_rt_entity(rt_se, 0);
 543
 544                if (rt_rq->highest_prio.curr < curr->prio)
 545                        resched_curr(rq);
 546        }
 547}
 548
 549static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 550{
 551        struct sched_rt_entity *rt_se;
 552        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 553
 554        rt_se = rt_rq->tg->rt_se[cpu];
 555
 556        if (!rt_se) {
 557                dequeue_top_rt_rq(rt_rq);
 558                /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 559                cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
  560        } else if (on_rt_rq(rt_se))
 562                dequeue_rt_entity(rt_se, 0);
 563}
 564
 565static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 566{
 567        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 568}
 569
 570static int rt_se_boosted(struct sched_rt_entity *rt_se)
 571{
 572        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 573        struct task_struct *p;
 574
 575        if (rt_rq)
 576                return !!rt_rq->rt_nr_boosted;
 577
 578        p = rt_task_of(rt_se);
 579        return p->prio != p->normal_prio;
 580}
 581
 582#ifdef CONFIG_SMP
 583static inline const struct cpumask *sched_rt_period_mask(void)
 584{
 585        return this_rq()->rd->span;
 586}
 587#else
 588static inline const struct cpumask *sched_rt_period_mask(void)
 589{
 590        return cpu_online_mask;
 591}
 592#endif
 593
 594static inline
 595struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 596{
 597        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 598}
 599
 600static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 601{
 602        return &rt_rq->tg->rt_bandwidth;
 603}
 604
 605#else /* !CONFIG_RT_GROUP_SCHED */
 606
 607static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 608{
 609        return rt_rq->rt_runtime;
 610}
 611
 612static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 613{
 614        return ktime_to_ns(def_rt_bandwidth.rt_period);
 615}
 616
 617typedef struct rt_rq *rt_rq_iter_t;
 618
 619#define for_each_rt_rq(rt_rq, iter, rq) \
 620        for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 621
 622#define for_each_sched_rt_entity(rt_se) \
 623        for (; rt_se; rt_se = NULL)
 624
 625static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 626{
 627        return NULL;
 628}
 629
 630static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 631{
 632        struct rq *rq = rq_of_rt_rq(rt_rq);
 633
 634        if (!rt_rq->rt_nr_running)
 635                return;
 636
 637        enqueue_top_rt_rq(rt_rq);
 638        resched_curr(rq);
 639}
 640
 641static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 642{
 643        dequeue_top_rt_rq(rt_rq);
 644}
 645
 646static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 647{
 648        return rt_rq->rt_throttled;
 649}
 650
 651static inline const struct cpumask *sched_rt_period_mask(void)
 652{
 653        return cpu_online_mask;
 654}
 655
 656static inline
 657struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 658{
 659        return &cpu_rq(cpu)->rt;
 660}
 661
 662static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 663{
 664        return &def_rt_bandwidth;
 665}
 666
 667#endif /* CONFIG_RT_GROUP_SCHED */
 668
 669bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 670{
 671        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 672
 673        return (hrtimer_active(&rt_b->rt_period_timer) ||
 674                rt_rq->rt_time < rt_b->rt_runtime);
 675}
 676
 677#ifdef CONFIG_SMP
 678/*
 679 * We ran out of runtime, see if we can borrow some from our neighbours.
 680 */
 681static void do_balance_runtime(struct rt_rq *rt_rq)
 682{
 683        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 684        struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 685        int i, weight;
 686        u64 rt_period;
 687
 688        weight = cpumask_weight(rd->span);
 689
 690        raw_spin_lock(&rt_b->rt_runtime_lock);
 691        rt_period = ktime_to_ns(rt_b->rt_period);
 692        for_each_cpu(i, rd->span) {
 693                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 694                s64 diff;
 695
 696                if (iter == rt_rq)
 697                        continue;
 698
 699                raw_spin_lock(&iter->rt_runtime_lock);
 700                /*
 701                 * Either all rqs have inf runtime and there's nothing to steal
 702                 * or __disable_runtime() below sets a specific rq to inf to
  703                 * indicate it's been disabled and disallow stealing.
 704                 */
 705                if (iter->rt_runtime == RUNTIME_INF)
 706                        goto next;
 707
 708                /*
 709                 * From runqueues with spare time, take 1/n part of their
 710                 * spare time, but no more than our period.
 711                 */
 712                diff = iter->rt_runtime - iter->rt_time;
 713                if (diff > 0) {
 714                        diff = div_u64((u64)diff, weight);
 715                        if (rt_rq->rt_runtime + diff > rt_period)
 716                                diff = rt_period - rt_rq->rt_runtime;
 717                        iter->rt_runtime -= diff;
 718                        rt_rq->rt_runtime += diff;
 719                        if (rt_rq->rt_runtime == rt_period) {
 720                                raw_spin_unlock(&iter->rt_runtime_lock);
 721                                break;
 722                        }
 723                }
 724next:
 725                raw_spin_unlock(&iter->rt_runtime_lock);
 726        }
 727        raw_spin_unlock(&rt_b->rt_runtime_lock);
 728}
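/*
 * Worked example of the borrowing above, with hypothetical numbers: assume
 * rt_period = 1000 ms, a 4-CPU root domain (weight == 4), this rt_rq out of
 * budget, and one neighbour with rt_runtime = 950 ms of which only 50 ms is
 * consumed. Then diff = 900 ms of spare time, 900 / 4 = 225 ms is moved over,
 * capped so the borrower's rt_runtime never exceeds rt_period.
 */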
 729
 730/*
  731 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 732 */
 733static void __disable_runtime(struct rq *rq)
 734{
 735        struct root_domain *rd = rq->rd;
 736        rt_rq_iter_t iter;
 737        struct rt_rq *rt_rq;
 738
 739        if (unlikely(!scheduler_running))
 740                return;
 741
 742        for_each_rt_rq(rt_rq, iter, rq) {
 743                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 744                s64 want;
 745                int i;
 746
 747                raw_spin_lock(&rt_b->rt_runtime_lock);
 748                raw_spin_lock(&rt_rq->rt_runtime_lock);
 749                /*
 750                 * Either we're all inf and nobody needs to borrow, or we're
 751                 * already disabled and thus have nothing to do, or we have
 752                 * exactly the right amount of runtime to take out.
 753                 */
 754                if (rt_rq->rt_runtime == RUNTIME_INF ||
 755                                rt_rq->rt_runtime == rt_b->rt_runtime)
 756                        goto balanced;
 757                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 758
 759                /*
 760                 * Calculate the difference between what we started out with
  761                 * and what we currently have; that's the amount of runtime
  762                 * we lent and now have to reclaim.
 763                 */
 764                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 765
 766                /*
 767                 * Greedy reclaim, take back as much as we can.
 768                 */
 769                for_each_cpu(i, rd->span) {
 770                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 771                        s64 diff;
 772
 773                        /*
 774                         * Can't reclaim from ourselves or disabled runqueues.
 775                         */
 776                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 777                                continue;
 778
 779                        raw_spin_lock(&iter->rt_runtime_lock);
 780                        if (want > 0) {
 781                                diff = min_t(s64, iter->rt_runtime, want);
 782                                iter->rt_runtime -= diff;
 783                                want -= diff;
 784                        } else {
 785                                iter->rt_runtime -= want;
 786                                want -= want;
 787                        }
 788                        raw_spin_unlock(&iter->rt_runtime_lock);
 789
 790                        if (!want)
 791                                break;
 792                }
 793
 794                raw_spin_lock(&rt_rq->rt_runtime_lock);
 795                /*
 796                 * We cannot be left wanting - that would mean some runtime
 797                 * leaked out of the system.
 798                 */
 799                BUG_ON(want);
 800balanced:
 801                /*
 802                 * Disable all the borrow logic by pretending we have inf
 803                 * runtime - in which case borrowing doesn't make sense.
 804                 */
 805                rt_rq->rt_runtime = RUNTIME_INF;
 806                rt_rq->rt_throttled = 0;
 807                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 808                raw_spin_unlock(&rt_b->rt_runtime_lock);
 809
 810                /* Make rt_rq available for pick_next_task() */
 811                sched_rt_rq_enqueue(rt_rq);
 812        }
 813}
 814
 815static void __enable_runtime(struct rq *rq)
 816{
 817        rt_rq_iter_t iter;
 818        struct rt_rq *rt_rq;
 819
 820        if (unlikely(!scheduler_running))
 821                return;
 822
 823        /*
 824         * Reset each runqueue's bandwidth settings
 825         */
 826        for_each_rt_rq(rt_rq, iter, rq) {
 827                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 828
 829                raw_spin_lock(&rt_b->rt_runtime_lock);
 830                raw_spin_lock(&rt_rq->rt_runtime_lock);
 831                rt_rq->rt_runtime = rt_b->rt_runtime;
 832                rt_rq->rt_time = 0;
 833                rt_rq->rt_throttled = 0;
 834                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 835                raw_spin_unlock(&rt_b->rt_runtime_lock);
 836        }
 837}
 838
 839static void balance_runtime(struct rt_rq *rt_rq)
 840{
 841        if (!sched_feat(RT_RUNTIME_SHARE))
 842                return;
 843
 844        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 845                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 846                do_balance_runtime(rt_rq);
 847                raw_spin_lock(&rt_rq->rt_runtime_lock);
 848        }
 849}
 850#else /* !CONFIG_SMP */
 851static inline void balance_runtime(struct rt_rq *rt_rq) {}
 852#endif /* CONFIG_SMP */
 853
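/*
 * Periodic replenishment driven by sched_rt_period_timer(): for every rt_rq
 * covered by @rt_b, forgive up to @overrun periods worth of consumed runtime
 * (rt_time), unthrottle and re-enqueue the queue once it is under budget
 * again, and report whether the timer may go idle (return 1) or must keep
 * firing (return 0).
 */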
 854static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 855{
 856        int i, idle = 1, throttled = 0;
 857        const struct cpumask *span;
 858
 859        span = sched_rt_period_mask();
 860#ifdef CONFIG_RT_GROUP_SCHED
 861        /*
 862         * FIXME: isolated CPUs should really leave the root task group,
 863         * whether they are isolcpus or were isolated via cpusets, lest
 864         * the timer run on a CPU which does not service all runqueues,
 865         * potentially leaving other CPUs indefinitely throttled.  If
 866         * isolation is really required, the user will turn the throttle
 867         * off to kill the perturbations it causes anyway.  Meanwhile,
 868         * this maintains functionality for boot and/or troubleshooting.
 869         */
 870        if (rt_b == &root_task_group.rt_bandwidth)
 871                span = cpu_online_mask;
 872#endif
 873        for_each_cpu(i, span) {
 874                int enqueue = 0;
 875                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 876                struct rq *rq = rq_of_rt_rq(rt_rq);
 877                int skip;
 878
 879                /*
 880                 * When span == cpu_online_mask, taking each rq->lock
 881                 * can be time-consuming. Try to avoid it when possible.
 882                 */
 883                raw_spin_lock(&rt_rq->rt_runtime_lock);
 884                if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
 885                        rt_rq->rt_runtime = rt_b->rt_runtime;
 886                skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
 887                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 888                if (skip)
 889                        continue;
 890
 891                raw_spin_rq_lock(rq);
 892                update_rq_clock(rq);
 893
 894                if (rt_rq->rt_time) {
 895                        u64 runtime;
 896
 897                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 898                        if (rt_rq->rt_throttled)
 899                                balance_runtime(rt_rq);
 900                        runtime = rt_rq->rt_runtime;
 901                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 902                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 903                                rt_rq->rt_throttled = 0;
 904                                enqueue = 1;
 905
 906                                /*
 907                                 * When we're idle and a woken (rt) task is
  908                                 * throttled, check_preempt_curr() will set
  909                                 * skip_update, and the time between the wakeup
 910                                 * and this unthrottle will get accounted as
 911                                 * 'runtime'.
 912                                 */
 913                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 914                                        rq_clock_cancel_skipupdate(rq);
 915                        }
 916                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 917                                idle = 0;
 918                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 919                } else if (rt_rq->rt_nr_running) {
 920                        idle = 0;
 921                        if (!rt_rq_throttled(rt_rq))
 922                                enqueue = 1;
 923                }
 924                if (rt_rq->rt_throttled)
 925                        throttled = 1;
 926
 927                if (enqueue)
 928                        sched_rt_rq_enqueue(rt_rq);
 929                raw_spin_rq_unlock(rq);
 930        }
 931
 932        if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 933                return 1;
 934
 935        return idle;
 936}
 937
 938static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 939{
 940#ifdef CONFIG_RT_GROUP_SCHED
 941        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 942
 943        if (rt_rq)
 944                return rt_rq->highest_prio.curr;
 945#endif
 946
 947        return rt_task_of(rt_se)->prio;
 948}
 949
 950static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 951{
 952        u64 runtime = sched_rt_runtime(rt_rq);
 953
 954        if (rt_rq->rt_throttled)
 955                return rt_rq_throttled(rt_rq);
 956
 957        if (runtime >= sched_rt_period(rt_rq))
 958                return 0;
 959
 960        balance_runtime(rt_rq);
 961        runtime = sched_rt_runtime(rt_rq);
 962        if (runtime == RUNTIME_INF)
 963                return 0;
 964
 965        if (rt_rq->rt_time > runtime) {
 966                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 967
 968                /*
 969                 * Don't actually throttle groups that have no runtime assigned
 970                 * but accrue some time due to boosting.
 971                 */
 972                if (likely(rt_b->rt_runtime)) {
 973                        rt_rq->rt_throttled = 1;
 974                        printk_deferred_once("sched: RT throttling activated\n");
 975                } else {
 976                        /*
 977                         * In case we did anyway, make it go away,
 978                         * replenishment is a joke, since it will replenish us
 979                         * with exactly 0 ns.
 980                         */
 981                        rt_rq->rt_time = 0;
 982                }
 983
 984                if (rt_rq_throttled(rt_rq)) {
 985                        sched_rt_rq_dequeue(rt_rq);
 986                        return 1;
 987                }
 988        }
 989
 990        return 0;
 991}
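/*
 * Example of the throttling decision above, using the usual defaults
 * (sched_rt_runtime_us = 950000, sched_rt_period_us = 1000000): once an rt_rq
 * has accumulated more than 950 ms of rt_time inside a 1 s period (and, with
 * RT_RUNTIME_SHARE, could not borrow enough from its neighbours), it is
 * marked rt_throttled and dequeued; do_sched_rt_period_timer() re-enqueues it
 * when the next period replenishes the budget.
 */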
 992
 993/*
 994 * Update the current task's runtime statistics. Skip current tasks that
 995 * are not in our scheduling class.
 996 */
 997static void update_curr_rt(struct rq *rq)
 998{
 999        struct task_struct *curr = rq->curr;
1000        struct sched_rt_entity *rt_se = &curr->rt;
1001        u64 delta_exec;
1002        u64 now;
1003
1004        if (curr->sched_class != &rt_sched_class)
1005                return;
1006
1007        now = rq_clock_task(rq);
1008        delta_exec = now - curr->se.exec_start;
1009        if (unlikely((s64)delta_exec <= 0))
1010                return;
1011
1012        schedstat_set(curr->se.statistics.exec_max,
1013                      max(curr->se.statistics.exec_max, delta_exec));
1014
1015        curr->se.sum_exec_runtime += delta_exec;
1016        account_group_exec_runtime(curr, delta_exec);
1017
1018        curr->se.exec_start = now;
1019        cgroup_account_cputime(curr, delta_exec);
1020
1021        if (!rt_bandwidth_enabled())
1022                return;
1023
1024        for_each_sched_rt_entity(rt_se) {
1025                struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1026
1027                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1028                        raw_spin_lock(&rt_rq->rt_runtime_lock);
1029                        rt_rq->rt_time += delta_exec;
1030                        if (sched_rt_runtime_exceeded(rt_rq))
1031                                resched_curr(rq);
1032                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
1033                }
1034        }
1035}
1036
1037static void
1038dequeue_top_rt_rq(struct rt_rq *rt_rq)
1039{
1040        struct rq *rq = rq_of_rt_rq(rt_rq);
1041
1042        BUG_ON(&rq->rt != rt_rq);
1043
1044        if (!rt_rq->rt_queued)
1045                return;
1046
1047        BUG_ON(!rq->nr_running);
1048
1049        sub_nr_running(rq, rt_rq->rt_nr_running);
1050        rt_rq->rt_queued = 0;
1051
1052}
1053
1054static void
1055enqueue_top_rt_rq(struct rt_rq *rt_rq)
1056{
1057        struct rq *rq = rq_of_rt_rq(rt_rq);
1058
1059        BUG_ON(&rq->rt != rt_rq);
1060
1061        if (rt_rq->rt_queued)
1062                return;
1063
1064        if (rt_rq_throttled(rt_rq))
1065                return;
1066
1067        if (rt_rq->rt_nr_running) {
1068                add_nr_running(rq, rt_rq->rt_nr_running);
1069                rt_rq->rt_queued = 1;
1070        }
1071
1072        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1073        cpufreq_update_util(rq, 0);
1074}
1075
1076#if defined CONFIG_SMP
1077
1078static void
1079inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1080{
1081        struct rq *rq = rq_of_rt_rq(rt_rq);
1082
1083#ifdef CONFIG_RT_GROUP_SCHED
1084        /*
1085         * Change rq's cpupri only if rt_rq is the top queue.
1086         */
1087        if (&rq->rt != rt_rq)
1088                return;
1089#endif
1090        if (rq->online && prio < prev_prio)
1091                cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1092}
1093
1094static void
1095dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1096{
1097        struct rq *rq = rq_of_rt_rq(rt_rq);
1098
1099#ifdef CONFIG_RT_GROUP_SCHED
1100        /*
1101         * Change rq's cpupri only if rt_rq is the top queue.
1102         */
1103        if (&rq->rt != rt_rq)
1104                return;
1105#endif
1106        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1107                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1108}
1109
1110#else /* CONFIG_SMP */
1111
1112static inline
1113void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1114static inline
1115void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1116
1117#endif /* CONFIG_SMP */
1118
1119#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1120static void
1121inc_rt_prio(struct rt_rq *rt_rq, int prio)
1122{
1123        int prev_prio = rt_rq->highest_prio.curr;
1124
1125        if (prio < prev_prio)
1126                rt_rq->highest_prio.curr = prio;
1127
1128        inc_rt_prio_smp(rt_rq, prio, prev_prio);
1129}
1130
1131static void
1132dec_rt_prio(struct rt_rq *rt_rq, int prio)
1133{
1134        int prev_prio = rt_rq->highest_prio.curr;
1135
1136        if (rt_rq->rt_nr_running) {
1137
1138                WARN_ON(prio < prev_prio);
1139
1140                /*
1141                 * This may have been our highest task, and therefore
1142                 * we may have some recomputation to do
1143                 */
1144                if (prio == prev_prio) {
1145                        struct rt_prio_array *array = &rt_rq->active;
1146
1147                        rt_rq->highest_prio.curr =
1148                                sched_find_first_bit(array->bitmap);
1149                }
1150
1151        } else {
1152                rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
1153        }
1154
1155        dec_rt_prio_smp(rt_rq, prio, prev_prio);
1156}
1157
1158#else
1159
1160static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1161static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1162
1163#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1164
1165#ifdef CONFIG_RT_GROUP_SCHED
1166
1167static void
1168inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1169{
1170        if (rt_se_boosted(rt_se))
1171                rt_rq->rt_nr_boosted++;
1172
1173        if (rt_rq->tg)
1174                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1175}
1176
1177static void
1178dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1179{
1180        if (rt_se_boosted(rt_se))
1181                rt_rq->rt_nr_boosted--;
1182
1183        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1184}
1185
1186#else /* CONFIG_RT_GROUP_SCHED */
1187
1188static void
1189inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1190{
1191        start_rt_bandwidth(&def_rt_bandwidth);
1192}
1193
1194static inline
1195void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1196
1197#endif /* CONFIG_RT_GROUP_SCHED */
1198
1199static inline
1200unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1201{
1202        struct rt_rq *group_rq = group_rt_rq(rt_se);
1203
1204        if (group_rq)
1205                return group_rq->rt_nr_running;
1206        else
1207                return 1;
1208}
1209
1210static inline
1211unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1212{
1213        struct rt_rq *group_rq = group_rt_rq(rt_se);
1214        struct task_struct *tsk;
1215
1216        if (group_rq)
1217                return group_rq->rr_nr_running;
1218
1219        tsk = rt_task_of(rt_se);
1220
1221        return (tsk->policy == SCHED_RR) ? 1 : 0;
1222}
1223
1224static inline
1225void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1226{
1227        int prio = rt_se_prio(rt_se);
1228
1229        WARN_ON(!rt_prio(prio));
1230        rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1231        rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1232
1233        inc_rt_prio(rt_rq, prio);
1234        inc_rt_migration(rt_se, rt_rq);
1235        inc_rt_group(rt_se, rt_rq);
1236}
1237
1238static inline
1239void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1240{
1241        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1242        WARN_ON(!rt_rq->rt_nr_running);
1243        rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1244        rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1245
1246        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1247        dec_rt_migration(rt_se, rt_rq);
1248        dec_rt_group(rt_se, rt_rq);
1249}
1250
1251/*
1252 * Change rt_se->run_list location unless SAVE && !MOVE
1253 *
1254 * assumes ENQUEUE/DEQUEUE flags match
1255 */
1256static inline bool move_entity(unsigned int flags)
1257{
1258        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1259                return false;
1260
1261        return true;
1262}
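/*
 * Resulting behaviour of move_entity() for the relevant flag combinations
 * (the ENQUEUE_SAVE/ENQUEUE_MOVE values mirror their DEQUEUE_* counterparts):
 *
 *   SAVE  MOVE    result
 *    0     0      true   - ordinary (en|de)queue, (un)link from the list
 *    1     0      false  - save/restore in place, keep the list position
 *    1     1      true   - save/restore that is also allowed to requeue
 */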
1263
1264static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1265{
1266        list_del_init(&rt_se->run_list);
1267
1268        if (list_empty(array->queue + rt_se_prio(rt_se)))
1269                __clear_bit(rt_se_prio(rt_se), array->bitmap);
1270
1271        rt_se->on_list = 0;
1272}
1273
1274static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1275{
1276        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1277        struct rt_prio_array *array = &rt_rq->active;
1278        struct rt_rq *group_rq = group_rt_rq(rt_se);
1279        struct list_head *queue = array->queue + rt_se_prio(rt_se);
1280
1281        /*
 1282         * Don't enqueue the group if it's throttled, or when empty.
 1283         * The latter is a consequence of the former when a child group
 1284         * gets throttled and the current group doesn't have any other
1285         * active members.
1286         */
1287        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1288                if (rt_se->on_list)
1289                        __delist_rt_entity(rt_se, array);
1290                return;
1291        }
1292
1293        if (move_entity(flags)) {
1294                WARN_ON_ONCE(rt_se->on_list);
1295                if (flags & ENQUEUE_HEAD)
1296                        list_add(&rt_se->run_list, queue);
1297                else
1298                        list_add_tail(&rt_se->run_list, queue);
1299
1300                __set_bit(rt_se_prio(rt_se), array->bitmap);
1301                rt_se->on_list = 1;
1302        }
1303        rt_se->on_rq = 1;
1304
1305        inc_rt_tasks(rt_se, rt_rq);
1306}
1307
1308static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1309{
1310        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1311        struct rt_prio_array *array = &rt_rq->active;
1312
1313        if (move_entity(flags)) {
1314                WARN_ON_ONCE(!rt_se->on_list);
1315                __delist_rt_entity(rt_se, array);
1316        }
1317        rt_se->on_rq = 0;
1318
1319        dec_rt_tasks(rt_se, rt_rq);
1320}
1321
1322/*
1323 * Because the prio of an upper entry depends on the lower
1324 * entries, we must remove entries top - down.
1325 */
1326static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1327{
1328        struct sched_rt_entity *back = NULL;
1329
1330        for_each_sched_rt_entity(rt_se) {
1331                rt_se->back = back;
1332                back = rt_se;
1333        }
1334
1335        dequeue_top_rt_rq(rt_rq_of_se(back));
1336
1337        for (rt_se = back; rt_se; rt_se = rt_se->back) {
1338                if (on_rt_rq(rt_se))
1339                        __dequeue_rt_entity(rt_se, flags);
1340        }
1341}
1342
1343static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1344{
1345        struct rq *rq = rq_of_rt_se(rt_se);
1346
1347        dequeue_rt_stack(rt_se, flags);
1348        for_each_sched_rt_entity(rt_se)
1349                __enqueue_rt_entity(rt_se, flags);
1350        enqueue_top_rt_rq(&rq->rt);
1351}
1352
1353static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1354{
1355        struct rq *rq = rq_of_rt_se(rt_se);
1356
1357        dequeue_rt_stack(rt_se, flags);
1358
1359        for_each_sched_rt_entity(rt_se) {
1360                struct rt_rq *rt_rq = group_rt_rq(rt_se);
1361
1362                if (rt_rq && rt_rq->rt_nr_running)
1363                        __enqueue_rt_entity(rt_se, flags);
1364        }
1365        enqueue_top_rt_rq(&rq->rt);
1366}
1367
1368/*
1369 * Adding/removing a task to/from a priority array:
1370 */
1371static void
1372enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1373{
1374        struct sched_rt_entity *rt_se = &p->rt;
1375
1376        if (flags & ENQUEUE_WAKEUP)
1377                rt_se->timeout = 0;
1378
1379        enqueue_rt_entity(rt_se, flags);
1380
1381        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1382                enqueue_pushable_task(rq, p);
1383}
1384
1385static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1386{
1387        struct sched_rt_entity *rt_se = &p->rt;
1388
1389        update_curr_rt(rq);
1390        dequeue_rt_entity(rt_se, flags);
1391
1392        dequeue_pushable_task(rq, p);
1393}
1394
1395/*
 1396 * Put the task at the head or the tail of its run list without the overhead
 1397 * of a dequeue followed by an enqueue.
1398 */
1399static void
1400requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1401{
1402        if (on_rt_rq(rt_se)) {
1403                struct rt_prio_array *array = &rt_rq->active;
1404                struct list_head *queue = array->queue + rt_se_prio(rt_se);
1405
1406                if (head)
1407                        list_move(&rt_se->run_list, queue);
1408                else
1409                        list_move_tail(&rt_se->run_list, queue);
1410        }
1411}
1412
1413static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1414{
1415        struct sched_rt_entity *rt_se = &p->rt;
1416        struct rt_rq *rt_rq;
1417
1418        for_each_sched_rt_entity(rt_se) {
1419                rt_rq = rt_rq_of_se(rt_se);
1420                requeue_rt_entity(rt_rq, rt_se, head);
1421        }
1422}
1423
1424static void yield_task_rt(struct rq *rq)
1425{
1426        requeue_task_rt(rq, rq->curr, 0);
1427}
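/*
 * For SCHED_FIFO/SCHED_RR, sched_yield() only rotates the task to the tail of
 * its own priority list (head == 0 above); it never yields the CPU to a
 * lower-priority task.
 */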
1428
1429#ifdef CONFIG_SMP
1430static int find_lowest_rq(struct task_struct *task);
1431
1432static int
1433select_task_rq_rt(struct task_struct *p, int cpu, int flags)
1434{
1435        struct task_struct *curr;
1436        struct rq *rq;
1437        bool test;
1438
1439        /* For anything but wake ups, just return the task_cpu */
1440        if (!(flags & (WF_TTWU | WF_FORK)))
1441                goto out;
1442
1443        rq = cpu_rq(cpu);
1444
1445        rcu_read_lock();
1446        curr = READ_ONCE(rq->curr); /* unlocked access */
1447
1448        /*
1449         * If the current task on @p's runqueue is an RT task, then
1450         * try to see if we can wake this RT task up on another
1451         * runqueue. Otherwise simply start this RT task
1452         * on its current runqueue.
1453         *
1454         * We want to avoid overloading runqueues. If the woken
 1455         * task has a higher priority, then it will stay on this CPU
 1456         * and the lower-priority task should be moved to another CPU.
 1457         * Even though this will probably make the lower-priority task
 1458         * lose its cache, we do not want to bounce a higher-priority task
1459         * around just because it gave up its CPU, perhaps for a
1460         * lock?
1461         *
1462         * For equal prio tasks, we just let the scheduler sort it out.
1463         *
1464         * Otherwise, just let it ride on the affined RQ and the
1465         * post-schedule router will push the preempted task away
1466         *
1467         * This test is optimistic, if we get it wrong the load-balancer
1468         * will have to sort it out.
1469         *
1470         * We take into account the capacity of the CPU to ensure it fits the
1471         * requirement of the task - which is only important on heterogeneous
1472         * systems like big.LITTLE.
1473         */
1474        test = curr &&
1475               unlikely(rt_task(curr)) &&
1476               (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1477
1478        if (test || !rt_task_fits_capacity(p, cpu)) {
1479                int target = find_lowest_rq(p);
1480
1481                /*
1482                 * Bail out if we were forcing a migration to find a better
1483                 * fitting CPU but our search failed.
1484                 */
1485                if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1486                        goto out_unlock;
1487
1488                /*
1489                 * Don't bother moving it if the destination CPU is
1490                 * not running a lower priority task.
1491                 */
1492                if (target != -1 &&
1493                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
1494                        cpu = target;
1495        }
1496
1497out_unlock:
1498        rcu_read_unlock();
1499
1500out:
1501        return cpu;
1502}
1503
1504static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1505{
1506        /*
1507         * Current can't be migrated, useless to reschedule,
1508         * let's hope p can move out.
1509         */
1510        if (rq->curr->nr_cpus_allowed == 1 ||
1511            !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1512                return;
1513
1514        /*
1515         * p is migratable, so let's not schedule it and
1516         * see if it is pushed or pulled somewhere else.
1517         */
1518        if (p->nr_cpus_allowed != 1 &&
1519            cpupri_find(&rq->rd->cpupri, p, NULL))
1520                return;
1521
1522        /*
1523         * There appear to be other CPUs that can accept
 1524         * the current task but none can run 'p', so let's reschedule
1525         * to try and push the current task away:
1526         */
1527        requeue_task_rt(rq, p, 1);
1528        resched_curr(rq);
1529}
1530
1531static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1532{
1533        if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1534                /*
1535                 * This is OK, because current is on_cpu, which avoids it being
1536                 * picked for load-balance and preemption/IRQs are still
1537                 * disabled avoiding further scheduler activity on it and we've
1538                 * not yet started the picking loop.
1539                 */
1540                rq_unpin_lock(rq, rf);
1541                pull_rt_task(rq);
1542                rq_repin_lock(rq, rf);
1543        }
1544
1545        return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1546}
1547#endif /* CONFIG_SMP */
1548
1549/*
1550 * Preempt the current task with a newly woken task if needed:
1551 */
1552static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1553{
1554        if (p->prio < rq->curr->prio) {
1555                resched_curr(rq);
1556                return;
1557        }
1558
1559#ifdef CONFIG_SMP
1560        /*
1561         * If:
1562         *
1563         * - the newly woken task is of equal priority to the current task
1564         * - the newly woken task is non-migratable while current is migratable
1565         * - current will be preempted on the next reschedule
1566         *
1567         * we should check to see if current can readily move to a different
1568         * cpu.  If so, we will reschedule to allow the push logic to try
1569         * to move current somewhere else, making room for our non-migratable
1570         * task.
1571         */
1572        if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1573                check_preempt_equal_prio(rq, p);
1574#endif
1575}
1576
1577static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1578{
1579        p->se.exec_start = rq_clock_task(rq);
1580
1581        /* The running task is never eligible for pushing */
1582        dequeue_pushable_task(rq, p);
1583
1584        if (!first)
1585                return;
1586
1587        /*
1588         * If prev task was rt, put_prev_task() has already updated the
1589         * utilization. We only care of the case where we start to schedule a
1590         * rt task
1591         */
1592        if (rq->curr->sched_class != &rt_sched_class)
1593                update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1594
1595        rt_queue_push_tasks(rq);
1596}
1597
1598static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1599                                                   struct rt_rq *rt_rq)
1600{
1601        struct rt_prio_array *array = &rt_rq->active;
1602        struct sched_rt_entity *next = NULL;
1603        struct list_head *queue;
1604        int idx;
1605
1606        idx = sched_find_first_bit(array->bitmap);
1607        BUG_ON(idx >= MAX_RT_PRIO);
1608
1609        queue = array->queue + idx;
1610        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1611
1612        return next;
1613}
1614
1615static struct task_struct *_pick_next_task_rt(struct rq *rq)
1616{
1617        struct sched_rt_entity *rt_se;
1618        struct rt_rq *rt_rq  = &rq->rt;
1619
1620        do {
1621                rt_se = pick_next_rt_entity(rq, rt_rq);
1622                BUG_ON(!rt_se);
1623                rt_rq = group_rt_rq(rt_se);
1624        } while (rt_rq);
1625
1626        return rt_task_of(rt_se);
1627}
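/*
 * The loop above descends the group hierarchy: at every level it takes the
 * first entity on the highest-priority non-empty list, and keeps going while
 * that entity is itself a group (my_q != NULL) until a task entity is found.
 */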
1628
1629static struct task_struct *pick_task_rt(struct rq *rq)
1630{
1631        struct task_struct *p;
1632
1633        if (!sched_rt_runnable(rq))
1634                return NULL;
1635
1636        p = _pick_next_task_rt(rq);
1637
1638        return p;
1639}
1640
1641static struct task_struct *pick_next_task_rt(struct rq *rq)
1642{
1643        struct task_struct *p = pick_task_rt(rq);
1644
1645        if (p)
1646                set_next_task_rt(rq, p, true);
1647
1648        return p;
1649}
1650
1651static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1652{
1653        update_curr_rt(rq);
1654
1655        update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1656
1657        /*
1658         * The previous task needs to be made eligible for pushing
1659         * if it is still active
1660         */
1661        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1662                enqueue_pushable_task(rq, p);
1663}
1664
1665#ifdef CONFIG_SMP
1666
1667/* Only try algorithms three times */
1668#define RT_MAX_TRIES 3
1669
1670static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1671{
1672        if (!task_running(rq, p) &&
1673            cpumask_test_cpu(cpu, &p->cpus_mask))
1674                return 1;
1675
1676        return 0;
1677}
1678
1679/*
 1680 * Return the highest-priority pushable task on this rq that is suitable to
 1681 * run on @cpu, or NULL if there is none.
1682 */
1683static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1684{
1685        struct plist_head *head = &rq->rt.pushable_tasks;
1686        struct task_struct *p;
1687
1688        if (!has_pushable_tasks(rq))
1689                return NULL;
1690
1691        plist_for_each_entry(p, head, pushable_tasks) {
1692                if (pick_rt_task(rq, p, cpu))
1693                        return p;
1694        }
1695
1696        return NULL;
1697}
1698
1699static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1700
1701static int find_lowest_rq(struct task_struct *task)
1702{
1703        struct sched_domain *sd;
1704        struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1705        int this_cpu = smp_processor_id();
1706        int cpu      = task_cpu(task);
1707        int ret;
1708
1709        /* Make sure the mask is initialized first */
1710        if (unlikely(!lowest_mask))
1711                return -1;
1712
1713        if (task->nr_cpus_allowed == 1)
1714                return -1; /* No other targets possible */
1715
1716        /*
1717         * If we're on an asym system, ensure we consider the different capacities
1718         * of the CPUs when searching for the lowest_mask.
1719         */
1720        if (static_branch_unlikely(&sched_asym_cpucapacity)) {
1721
1722                ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1723                                          task, lowest_mask,
1724                                          rt_task_fits_capacity);
1725        } else {
1726
1727                ret = cpupri_find(&task_rq(task)->rd->cpupri,
1728                                  task, lowest_mask);
1729        }
1730
1731        if (!ret)
1732                return -1; /* No targets found */
1733
1734        /*
1735         * At this point we have built a mask of CPUs representing the
1736         * lowest priority tasks in the system.  Now we want to elect
1737         * the best one based on our affinity and topology.
1738         *
1739         * We prioritize the last CPU that the task executed on since
1740         * it is most likely cache-hot in that location.
1741         */
1742        if (cpumask_test_cpu(cpu, lowest_mask))
1743                return cpu;
1744
1745        /*
1746         * Otherwise, we consult the sched_domains span maps to figure
1747         * out which CPU is logically closest to our hot cache data.
1748         */
1749        if (!cpumask_test_cpu(this_cpu, lowest_mask))
1750                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1751
1752        rcu_read_lock();
1753        for_each_domain(cpu, sd) {
1754                if (sd->flags & SD_WAKE_AFFINE) {
1755                        int best_cpu;
1756
1757                        /*
1758                         * "this_cpu" is cheaper to preempt than a
1759                         * remote processor.
1760                         */
1761                        if (this_cpu != -1 &&
1762                            cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1763                                rcu_read_unlock();
1764                                return this_cpu;
1765                        }
1766
1767                        best_cpu = cpumask_any_and_distribute(lowest_mask,
1768                                                              sched_domain_span(sd));
1769                        if (best_cpu < nr_cpu_ids) {
1770                                rcu_read_unlock();
1771                                return best_cpu;
1772                        }
1773                }
1774        }
1775        rcu_read_unlock();
1776
1777        /*
1778         * And finally, if there were no matches within the domains
1779         * just give the caller *something* to work with from the compatible
1780         * locations.
1781         */
1782        if (this_cpu != -1)
1783                return this_cpu;
1784
1785        cpu = cpumask_any_distribute(lowest_mask);
1786        if (cpu < nr_cpu_ids)
1787                return cpu;
1788
1789        return -1;
1790}
1791
1792/* Will lock the rq it finds */
1793static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1794{
1795        struct rq *lowest_rq = NULL;
1796        int tries;
1797        int cpu;
1798
1799        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1800                cpu = find_lowest_rq(task);
1801
1802                if ((cpu == -1) || (cpu == rq->cpu))
1803                        break;
1804
1805                lowest_rq = cpu_rq(cpu);
1806
1807                if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1808                        /*
1809                         * Target rq has tasks of equal or higher priority;
1810                         * retrying does not release any lock and is unlikely
1811                         * to yield a different result.
1812                         */
1813                        lowest_rq = NULL;
1814                        break;
1815                }
1816
1817                /* if the prio of this runqueue changed, try again */
1818                if (double_lock_balance(rq, lowest_rq)) {
1819                        /*
1820                         * We had to unlock the run queue. In the meantime,
1821                         * the task could have migrated or had its affinity
1822                         * changed. Also make sure that it wasn't scheduled
1823                         * on its rq.
1824                         */
1825                        if (unlikely(task_rq(task) != rq ||
1826                                     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
1827                                     task_running(rq, task) ||
1828                                     !rt_task(task) ||
1829                                     !task_on_rq_queued(task))) {
1830
1831                                double_unlock_balance(rq, lowest_rq);
1832                                lowest_rq = NULL;
1833                                break;
1834                        }
1835                }
1836
1837                /* If this rq is still suitable use it. */
1838                if (lowest_rq->rt.highest_prio.curr > task->prio)
1839                        break;
1840
1841                /* try again */
1842                double_unlock_balance(rq, lowest_rq);
1843                lowest_rq = NULL;
1844        }
1845
1846        return lowest_rq;
1847}
1848
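    /*
     * Return the highest-priority task on this rq's pushable list (the
     * plist is ordered by priority), or NULL if there is nothing to push.
     */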
1849static struct task_struct *pick_next_pushable_task(struct rq *rq)
1850{
1851        struct task_struct *p;
1852
1853        if (!has_pushable_tasks(rq))
1854                return NULL;
1855
1856        p = plist_first_entry(&rq->rt.pushable_tasks,
1857                              struct task_struct, pushable_tasks);
1858
1859        BUG_ON(rq->cpu != task_cpu(p));
1860        BUG_ON(task_current(rq, p));
1861        BUG_ON(p->nr_cpus_allowed <= 1);
1862
1863        BUG_ON(!task_on_rq_queued(p));
1864        BUG_ON(!rt_task(p));
1865
1866        return p;
1867}
1868
1869/*
1870 * If the current CPU has more than one RT task, see if the non-running
1871 * task can migrate over to a CPU that is running a task
1872 * of lesser priority.
1873 */
1874static int push_rt_task(struct rq *rq, bool pull)
1875{
1876        struct task_struct *next_task;
1877        struct rq *lowest_rq;
1878        int ret = 0;
1879
1880        if (!rq->rt.overloaded)
1881                return 0;
1882
1883        next_task = pick_next_pushable_task(rq);
1884        if (!next_task)
1885                return 0;
1886
1887retry:
1888        if (is_migration_disabled(next_task)) {
1889                struct task_struct *push_task = NULL;
1890                int cpu;
1891
1892                if (!pull || rq->push_busy)
1893                        return 0;
1894
1895                cpu = find_lowest_rq(rq->curr);
1896                if (cpu == -1 || cpu == rq->cpu)
1897                        return 0;
1898
1899                /*
1900                 * Since we found a CPU with lower priority than @next_task,
1901                 * @next_task should be running. However, we cannot migrate it
1902                 * to that other CPU; instead, attempt to push away the task
1903                 * currently running on this CPU.
1904                 */
1905                push_task = get_push_task(rq);
1906                if (push_task) {
1907                        raw_spin_rq_unlock(rq);
1908                        stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
1909                                            push_task, &rq->push_work);
1910                        raw_spin_rq_lock(rq);
1911                }
1912
1913                return 0;
1914        }
1915
1916        if (WARN_ON(next_task == rq->curr))
1917                return 0;
1918
1919        /*
1920         * It's possible that next_task slipped in with a
1921         * higher priority than current. If that's the case,
1922         * just reschedule current.
1923         */
1924        if (unlikely(next_task->prio < rq->curr->prio)) {
1925                resched_curr(rq);
1926                return 0;
1927        }
1928
1929        /* We might release rq lock */
1930        get_task_struct(next_task);
1931
1932        /* find_lock_lowest_rq locks the rq if found */
1933        lowest_rq = find_lock_lowest_rq(next_task, rq);
1934        if (!lowest_rq) {
1935                struct task_struct *task;
1936                /*
1937                 * find_lock_lowest_rq releases rq->lock
1938                 * so it is possible that next_task has migrated.
1939                 *
1940                 * We need to make sure that the task is still on the same
1941                 * run-queue and is also still the next task eligible for
1942                 * pushing.
1943                 */
1944                task = pick_next_pushable_task(rq);
1945                if (task == next_task) {
1946                        /*
1947                         * The task hasn't migrated, and is still the next
1948                         * eligible task, but we failed to find a run-queue
1949                         * to push it to.  Do not retry in this case, since
1950                         * other CPUs will pull from us when ready.
1951                         */
1952                        goto out;
1953                }
1954
1955                if (!task)
1956                        /* No more tasks, just exit */
1957                        goto out;
1958
1959                /*
1960                 * Something has shifted, try again.
1961                 */
1962                put_task_struct(next_task);
1963                next_task = task;
1964                goto retry;
1965        }
1966
1967        deactivate_task(rq, next_task, 0);
1968        set_task_cpu(next_task, lowest_rq->cpu);
1969        activate_task(lowest_rq, next_task, 0);
1970        resched_curr(lowest_rq);
1971        ret = 1;
1972
1973        double_unlock_balance(rq, lowest_rq);
1974out:
1975        put_task_struct(next_task);
1976
1977        return ret;
1978}
1979
1980static void push_rt_tasks(struct rq *rq)
1981{
1982        /* push_rt_task() will return true if it moved an RT task */
1983        while (push_rt_task(rq, false))
1984                ;
1985}
1986
1987#ifdef HAVE_RT_PUSH_IPI
1988
1989/*
1990 * When a high priority task schedules out from a CPU and a lower priority
1991 * task is scheduled in, a check is made to see if there's any RT tasks
1992 * on other CPUs that are waiting to run because a higher priority RT task
1993 * is currently running on its CPU. In this case, the CPU with multiple RT
1994 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1995 * up that may be able to run one of its non-running queued RT tasks.
1996 *
1997 * All CPUs with overloaded RT tasks need to be notified as there is currently
1998 * no way to know which of these CPUs have the highest priority task waiting
1999 * to run. Instead of trying to take a spinlock on each of these CPUs,
2000 * which has been shown to cause large latency when done on machines with
2001 * many CPUs, an IPI is sent to the CPUs to have them push off their
2002 * overloaded RT tasks waiting to run.
2003 *
2004 * Just sending an IPI to each of the CPUs is also an issue, as on machines
2005 * with a large CPU count this can cause an IPI storm on a CPU, especially
2006 * if it's the only CPU with multiple RT tasks queued and a large number
2007 * of CPUs are scheduling a lower priority task at the same time.
2008 *
2009 * Each root domain has its own irq work function that can iterate over
2010 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2011 * tasks must be checked whether one or many CPUs are lowering their
2012 * priority, there is a single irq work iterator that will try to push
2013 * off the RT tasks that are waiting to run.
2014 *
2015 * When a CPU schedules a lower priority task, it will kick off the
2016 * irq work iterator that will jump to each CPU with overloaded RT tasks.
2017 * As it only takes the first CPU that schedules a lower priority task
2018 * to start the process, the rto_start variable is incremented and if
2019 * the atomic result is one, then that CPU will try to take the rto_lock.
2020 * This prevents high contention on the lock as the process handles all
2021 * CPUs scheduling lower priority tasks.
2022 *
2023 * All CPUs that are scheduling a lower priority task will increment the
2024 * rto_loop_next variable. This will make sure that the irq work iterator
2025 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2026 * priority task, even if the iterator is in the middle of a scan. Incrementing
2027 * the rto_loop_next will cause the iterator to perform another scan.
2028 *
2029 */
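    /*
     * Root domain state used by the iterator below:
     *
     *   rto_loop_start - ensures only one CPU starts the IPI loop at a time
     *   rto_loop, rto_loop_next - generation counters; bumping rto_loop_next
     *                    keeps the iterator scanning when more CPUs lower
     *                    their priority mid-loop
     *   rto_cpu        - the CPU the push IPI is currently visiting, or -1
     *                    when the loop is idle
     */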
2030static int rto_next_cpu(struct root_domain *rd)
2031{
2032        int next;
2033        int cpu;
2034
2035        /*
2036         * When starting the IPI RT pushing, the rto_cpu is set to -1, and
2037         * rto_next_cpu() will simply return the first CPU found in
2038         * the rto_mask.
2039         *
2040         * If rto_next_cpu() is called when rto_cpu is a valid CPU, it
2041         * will return the next CPU found in the rto_mask.
2042         *
2043         * If there are no more CPUs left in the rto_mask, then a check is made
2044         * against rto_loop and rto_loop_next. rto_loop is only updated with
2045         * the rto_lock held, but any CPU may increment the rto_loop_next
2046         * without any locking.
2047         */
2048        for (;;) {
2049
2050                /* When rto_cpu is -1 this acts like cpumask_first() */
2051                cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2052
2053                rd->rto_cpu = cpu;
2054
2055                if (cpu < nr_cpu_ids)
2056                        return cpu;
2057
2058                rd->rto_cpu = -1;
2059
2060                /*
2061                 * ACQUIRE ensures we see the @rto_mask changes
2062                 * made prior to the @next value observed.
2063                 *
2064                 * Matches WMB in rt_set_overload().
2065                 */
2066                next = atomic_read_acquire(&rd->rto_loop_next);
2067
2068                if (rd->rto_loop == next)
2069                        break;
2070
2071                rd->rto_loop = next;
2072        }
2073
2074        return -1;
2075}
2076
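    /*
     * rto_loop_start is used as a simple test-and-set lock: the trylock only
     * succeeds on the 0 -> 1 transition (acquire), and the unlock stores 0
     * back with release semantics.
     */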
2077static inline bool rto_start_trylock(atomic_t *v)
2078{
2079        return !atomic_cmpxchg_acquire(v, 0, 1);
2080}
2081
2082static inline void rto_start_unlock(atomic_t *v)
2083{
2084        atomic_set_release(v, 0);
2085}
2086
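    /*
     * Called when this rq is about to run a lower-priority task while other
     * CPUs are RT overloaded: bump rto_loop_next so an already-running
     * iterator keeps scanning, and queue the first push IPI if the iterator
     * is not currently active.
     */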
2087static void tell_cpu_to_push(struct rq *rq)
2088{
2089        int cpu = -1;
2090
2091        /* Keep the loop going if the IPI is currently active */
2092        atomic_inc(&rq->rd->rto_loop_next);
2093
2094        /* Only one CPU can initiate a loop at a time */
2095        if (!rto_start_trylock(&rq->rd->rto_loop_start))
2096                return;
2097
2098        raw_spin_lock(&rq->rd->rto_lock);
2099
2100        /*
2101         * The rto_cpu is updated under the lock. If it holds a valid CPU,
2102         * then the IPI is still running and will continue due to the
2103         * update to rto_loop_next, and nothing needs to be done here.
2104         * Otherwise it is finishing up and an IPI needs to be sent.
2105         */
2106        if (rq->rd->rto_cpu < 0)
2107                cpu = rto_next_cpu(rq->rd);
2108
2109        raw_spin_unlock(&rq->rd->rto_lock);
2110
2111        rto_start_unlock(&rq->rd->rto_loop_start);
2112
2113        if (cpu >= 0) {
2114                /* Make sure the rd does not get freed while pushing */
2115                sched_get_rd(rq->rd);
2116                irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2117        }
2118}
2119
2120/* Called from hardirq context */
2121void rto_push_irq_work_func(struct irq_work *work)
2122{
2123        struct root_domain *rd =
2124                container_of(work, struct root_domain, rto_push_work);
2125        struct rq *rq;
2126        int cpu;
2127
2128        rq = this_rq();
2129
2130        /*
2131         * We do not need to grab the lock to check for has_pushable_tasks.
2132         * When it gets updated, a check is made to see if a push is possible.
2133         */
2134        if (has_pushable_tasks(rq)) {
2135                raw_spin_rq_lock(rq);
2136                while (push_rt_task(rq, true))
2137                        ;
2138                raw_spin_rq_unlock(rq);
2139        }
2140
2141        raw_spin_lock(&rd->rto_lock);
2142
2143        /* Pass the IPI to the next rt overloaded queue */
2144        cpu = rto_next_cpu(rd);
2145
2146        raw_spin_unlock(&rd->rto_lock);
2147
2148        if (cpu < 0) {
2149                sched_put_rd(rd);
2150                return;
2151        }
2152
2153        /* Try the next RT overloaded CPU */
2154        irq_work_queue_on(&rd->rto_push_work, cpu);
2155}
2156#endif /* HAVE_RT_PUSH_IPI */
2157
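    /*
     * Try to pull a higher-priority, pushable RT task from another RT
     * overloaded CPU onto this_rq. With RT_PUSH_IPI enabled the work is
     * inverted: the overloaded CPUs are asked to push their tasks instead.
     */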
2158static void pull_rt_task(struct rq *this_rq)
2159{
2160        int this_cpu = this_rq->cpu, cpu;
2161        bool resched = false;
2162        struct task_struct *p, *push_task;
2163        struct rq *src_rq;
2164        int rt_overload_count = rt_overloaded(this_rq);
2165
2166        if (likely(!rt_overload_count))
2167                return;
2168
2169        /*
2170         * Match the barrier from rt_set_overload(); this guarantees that if we
2171         * see overloaded we must also see the rto_mask bit.
2172         */
2173        smp_rmb();
2174
2175        /* If we are the only overloaded CPU do nothing */
2176        if (rt_overload_count == 1 &&
2177            cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2178                return;
2179
2180#ifdef HAVE_RT_PUSH_IPI
2181        if (sched_feat(RT_PUSH_IPI)) {
2182                tell_cpu_to_push(this_rq);
2183                return;
2184        }
2185#endif
2186
2187        for_each_cpu(cpu, this_rq->rd->rto_mask) {
2188                if (this_cpu == cpu)
2189                        continue;
2190
2191                src_rq = cpu_rq(cpu);
2192
2193                /*
2194                 * Don't bother taking the src_rq->lock if the next highest
2195                 * task is known to be lower-priority than our current task.
2196                 * This may look racy, but if this value is about to go
2197                 * logically higher, the src_rq will push this task away.
2198                 * And if it's going logically lower, we do not care.
2199                 */
2200                if (src_rq->rt.highest_prio.next >=
2201                    this_rq->rt.highest_prio.curr)
2202                        continue;
2203
2204                /*
2205                 * We can potentially drop this_rq's lock in
2206                 * double_lock_balance, and another CPU could
2207                 * alter this_rq
2208                 */
2209                push_task = NULL;
2210                double_lock_balance(this_rq, src_rq);
2211
2212                /*
2213                 * We can only pull a task that is pushable
2214                 * on its rq, and no others.
2215                 */
2216                p = pick_highest_pushable_task(src_rq, this_cpu);
2217
2218                /*
2219                 * Do we have an RT task that preempts
2220                 * the to-be-scheduled task?
2221                 */
2222                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2223                        WARN_ON(p == src_rq->curr);
2224                        WARN_ON(!task_on_rq_queued(p));
2225
2226                        /*
2227                         * There's a chance that p is higher in priority
2228                         * than what's currently running on its CPU.
2229                         * This is just because p is waking up and hasn't
2230                         * had a chance to schedule yet. We only pull
2231                         * p if it is lower in priority than the
2232                         * current task on its run queue.
2233                         */
2234                        if (p->prio < src_rq->curr->prio)
2235                                goto skip;
2236
2237                        if (is_migration_disabled(p)) {
2238                                push_task = get_push_task(src_rq);
2239                        } else {
2240                                deactivate_task(src_rq, p, 0);
2241                                set_task_cpu(p, this_cpu);
2242                                activate_task(this_rq, p, 0);
2243                                resched = true;
2244                        }
2245                        /*
2246                         * We continue with the search, just in
2247                         * case there's an even higher prio task
2248                         * in another runqueue. (low likelihood
2249                         * but possible)
2250                         */
2251                }
2252skip:
2253                double_unlock_balance(this_rq, src_rq);
2254
2255                if (push_task) {
2256                        raw_spin_rq_unlock(this_rq);
2257                        stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2258                                            push_task, &src_rq->push_work);
2259                        raw_spin_rq_lock(this_rq);
2260                }
2261        }
2262
2263        if (resched)
2264                resched_curr(this_rq);
2265}
2266
2267/*
2268 * If we are not running and we are not going to reschedule soon, we should
2269 * try to push tasks away now.
2270 */
2271static void task_woken_rt(struct rq *rq, struct task_struct *p)
2272{
2273        bool need_to_push = !task_running(rq, p) &&
2274                            !test_tsk_need_resched(rq->curr) &&
2275                            p->nr_cpus_allowed > 1 &&
2276                            (dl_task(rq->curr) || rt_task(rq->curr)) &&
2277                            (rq->curr->nr_cpus_allowed < 2 ||
2278                             rq->curr->prio <= p->prio);
2279
2280        if (need_to_push)
2281                push_rt_tasks(rq);
2282}
2283
2284/* Assumes rq->lock is held */
2285static void rq_online_rt(struct rq *rq)
2286{
2287        if (rq->rt.overloaded)
2288                rt_set_overload(rq);
2289
2290        __enable_runtime(rq);
2291
2292        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2293}
2294
2295/* Assumes rq->lock is held */
2296static void rq_offline_rt(struct rq *rq)
2297{
2298        if (rq->rt.overloaded)
2299                rt_clear_overload(rq);
2300
2301        __disable_runtime(rq);
2302
2303        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2304}
2305
2306/*
2307 * When switching away from the RT queue, we bring ourselves to a position
2308 * where we might want to pull RT tasks from other runqueues.
2309 */
2310static void switched_from_rt(struct rq *rq, struct task_struct *p)
2311{
2312        /*
2313         * If there are other RT tasks then we will reschedule
2314         * and the scheduling of the other RT tasks will handle
2315         * the balancing. But if we are the last RT task
2316         * we may need to handle the pulling of RT tasks
2317         * now.
2318         */
2319        if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2320                return;
2321
2322        rt_queue_pull_task(rq);
2323}
2324
2325void __init init_sched_rt_class(void)
2326{
2327        unsigned int i;
2328
2329        for_each_possible_cpu(i) {
2330                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2331                                        GFP_KERNEL, cpu_to_node(i));
2332        }
2333}
2334#endif /* CONFIG_SMP */
2335
2336/*
2337 * When switching a task to RT, we may overload the runqueue
2338 * with RT tasks. In this case we try to push them off to
2339 * other runqueues.
2340 */
2341static void switched_to_rt(struct rq *rq, struct task_struct *p)
2342{
2343        /*
2344         * If we are running, update the avg_rt tracking, as the running time
2345         * will from now on be accounted to it.
2346         */
2347        if (task_current(rq, p)) {
2348                update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2349                return;
2350        }
2351
2352        /*
2353         * If we are not running we may need to preempt the current
2354         * running task. If that current running task is also an RT task
2355         * then see if we can move to another run queue.
2356         */
2357        if (task_on_rq_queued(p)) {
2358#ifdef CONFIG_SMP
2359                if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2360                        rt_queue_push_tasks(rq);
2361#endif /* CONFIG_SMP */
2362                if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2363                        resched_curr(rq);
2364        }
2365}
2366
2367/*
2368 * Priority of the task has changed. This may cause
2369 * us to initiate a push or pull.
2370 */
2371static void
2372prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2373{
2374        if (!task_on_rq_queued(p))
2375                return;
2376
2377        if (task_current(rq, p)) {
2378#ifdef CONFIG_SMP
2379                /*
2380                 * If our priority decreases while running, we
2381                 * may need to pull tasks to this runqueue.
2382                 */
2383                if (oldprio < p->prio)
2384                        rt_queue_pull_task(rq);
2385
2386                /*
2387                 * If there's a higher priority task waiting to run
2388                 * then reschedule.
2389                 */
2390                if (p->prio > rq->rt.highest_prio.curr)
2391                        resched_curr(rq);
2392#else
2393                /* For UP simply resched on drop of prio */
2394                if (oldprio < p->prio)
2395                        resched_curr(rq);
2396#endif /* CONFIG_SMP */
2397        } else {
2398                /*
2399                 * This task is not running, but if its priority
2400                 * is higher than that of the currently running
2401                 * task, then reschedule.
2402                 */
2403                if (p->prio < rq->curr->prio)
2404                        resched_curr(rq);
2405        }
2406}
2407
2408#ifdef CONFIG_POSIX_TIMERS
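    /*
     * Enforce RLIMIT_RTTIME: count the scheduler ticks charged to this task
     * and hand off to the POSIX CPU timer code once a soft limit's worth of
     * ticks has accumulated.
     */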
2409static void watchdog(struct rq *rq, struct task_struct *p)
2410{
2411        unsigned long soft, hard;
2412
2413        /* max may change after cur was read; this will be fixed next tick */
2414        soft = task_rlimit(p, RLIMIT_RTTIME);
2415        hard = task_rlimit_max(p, RLIMIT_RTTIME);
2416
2417        if (soft != RLIM_INFINITY) {
2418                unsigned long next;
2419
2420                if (p->rt.watchdog_stamp != jiffies) {
2421                        p->rt.timeout++;
2422                        p->rt.watchdog_stamp = jiffies;
2423                }
2424
2425                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2426                if (p->rt.timeout > next) {
2427                        posix_cputimers_rt_watchdog(&p->posix_cputimers,
2428                                                    p->se.sum_exec_runtime);
2429                }
2430        }
2431}
2432#else
2433static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2434#endif
2435
2436/*
2437 * scheduler tick hitting a task of our scheduling class.
2438 *
2439 * NOTE: This function can be called remotely by the tick offload that
2440 * goes along full dynticks. Therefore no local assumption can be made
2441 * and everything must be accessed through the @rq and @curr passed in
2442 * parameters.
2443 */
2444static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2445{
2446        struct sched_rt_entity *rt_se = &p->rt;
2447
2448        update_curr_rt(rq);
2449        update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2450
2451        watchdog(rq, p);
2452
2453        /*
2454         * RR tasks need a special form of timeslice management.
2455         * FIFO tasks have no timeslices.
2456         */
2457        if (p->policy != SCHED_RR)
2458                return;
2459
2460        if (--p->rt.time_slice)
2461                return;
2462
2463        p->rt.time_slice = sched_rr_timeslice;
2464
2465        /*
2466         * Requeue to the end of the queue if we (and all of our ancestors) are
2467         * not the only element on the queue.
2468         */
2469        for_each_sched_rt_entity(rt_se) {
2470                if (rt_se->run_list.prev != rt_se->run_list.next) {
2471                        requeue_task_rt(rq, p, 0);
2472                        resched_curr(rq);
2473                        return;
2474                }
2475        }
2476}
2477
2478static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2479{
2480        /*
2481         * Time slice is 0 for SCHED_FIFO tasks
2482         */
2483        if (task->policy == SCHED_RR)
2484                return sched_rr_timeslice;
2485        else
2486                return 0;
2487}
2488
2489DEFINE_SCHED_CLASS(rt) = {
2490
2491        .enqueue_task           = enqueue_task_rt,
2492        .dequeue_task           = dequeue_task_rt,
2493        .yield_task             = yield_task_rt,
2494
2495        .check_preempt_curr     = check_preempt_curr_rt,
2496
2497        .pick_next_task         = pick_next_task_rt,
2498        .put_prev_task          = put_prev_task_rt,
2499        .set_next_task          = set_next_task_rt,
2500
2501#ifdef CONFIG_SMP
2502        .balance                = balance_rt,
2503        .pick_task              = pick_task_rt,
2504        .select_task_rq         = select_task_rq_rt,
2505        .set_cpus_allowed       = set_cpus_allowed_common,
2506        .rq_online              = rq_online_rt,
2507        .rq_offline             = rq_offline_rt,
2508        .task_woken             = task_woken_rt,
2509        .switched_from          = switched_from_rt,
2510        .find_lock_rq           = find_lock_lowest_rq,
2511#endif
2512
2513        .task_tick              = task_tick_rt,
2514
2515        .get_rr_interval        = get_rr_interval_rt,
2516
2517        .prio_changed           = prio_changed_rt,
2518        .switched_to            = switched_to_rt,
2519
2520        .update_curr            = update_curr_rt,
2521
2522#ifdef CONFIG_UCLAMP_TASK
2523        .uclamp_enabled         = 1,
2524#endif
2525};
2526
2527#ifdef CONFIG_RT_GROUP_SCHED
2528/*
2529 * Ensure that the real time constraints are schedulable.
2530 */
2531static DEFINE_MUTEX(rt_constraints_mutex);
2532
2533static inline int tg_has_rt_tasks(struct task_group *tg)
2534{
2535        struct task_struct *task;
2536        struct css_task_iter it;
2537        int ret = 0;
2538
2539        /*
2540         * Autogroups do not have RT tasks; see autogroup_create().
2541         */
2542        if (task_group_is_autogroup(tg))
2543                return 0;
2544
2545        css_task_iter_start(&tg->css, 0, &it);
2546        while (!ret && (task = css_task_iter_next(&it)))
2547                ret |= rt_task(task);
2548        css_task_iter_end(&it);
2549
2550        return ret;
2551}
2552
2553struct rt_schedulable_data {
2554        struct task_group *tg;
2555        u64 rt_period;
2556        u64 rt_runtime;
2557};
2558
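    /*
     * walk_tg_tree() callback: verify that @tg's (proposed) runtime/period
     * pair does not exceed the global RT bandwidth and that the sum of its
     * children's bandwidth fits within its own.
     */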
2559static int tg_rt_schedulable(struct task_group *tg, void *data)
2560{
2561        struct rt_schedulable_data *d = data;
2562        struct task_group *child;
2563        unsigned long total, sum = 0;
2564        u64 period, runtime;
2565
2566        period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2567        runtime = tg->rt_bandwidth.rt_runtime;
2568
2569        if (tg == d->tg) {
2570                period = d->rt_period;
2571                runtime = d->rt_runtime;
2572        }
2573
2574        /*
2575         * Cannot have more runtime than the period.
2576         */
2577        if (runtime > period && runtime != RUNTIME_INF)
2578                return -EINVAL;
2579
2580        /*
2581         * Ensure we don't starve existing RT tasks if runtime turns zero.
2582         */
2583        if (rt_bandwidth_enabled() && !runtime &&
2584            tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2585                return -EBUSY;
2586
2587        total = to_ratio(period, runtime);
2588
2589        /*
2590         * Nobody can have more than the global setting allows.
2591         */
2592        if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2593                return -EINVAL;
2594
2595        /*
2596         * The sum of our children's runtime should not exceed our own.
2597         */
2598        list_for_each_entry_rcu(child, &tg->children, siblings) {
2599                period = ktime_to_ns(child->rt_bandwidth.rt_period);
2600                runtime = child->rt_bandwidth.rt_runtime;
2601
2602                if (child == d->tg) {
2603                        period = d->rt_period;
2604                        runtime = d->rt_runtime;
2605                }
2606
2607                sum += to_ratio(period, runtime);
2608        }
2609
2610        if (sum > total)
2611                return -EINVAL;
2612
2613        return 0;
2614}
2615
2616static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2617{
2618        int ret;
2619
2620        struct rt_schedulable_data data = {
2621                .tg = tg,
2622                .rt_period = period,
2623                .rt_runtime = runtime,
2624        };
2625
2626        rcu_read_lock();
2627        ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2628        rcu_read_unlock();
2629
2630        return ret;
2631}
2632
2633static int tg_set_rt_bandwidth(struct task_group *tg,
2634                u64 rt_period, u64 rt_runtime)
2635{
2636        int i, err = 0;
2637
2638        /*
2639         * Disallowing the root group RT runtime is BAD; it would prevent the
2640         * kernel from creating (and/or operating) RT threads.
2641         */
2642        if (tg == &root_task_group && rt_runtime == 0)
2643                return -EINVAL;
2644
2645        /* A zero period doesn't make any sense. */
2646        if (rt_period == 0)
2647                return -EINVAL;
2648
2649        /*
2650         * Bound quota to defend quota against overflow during bandwidth shift.
2651         */
2652        if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2653                return -EINVAL;
2654
2655        mutex_lock(&rt_constraints_mutex);
2656        err = __rt_schedulable(tg, rt_period, rt_runtime);
2657        if (err)
2658                goto unlock;
2659
2660        raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2661        tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2662        tg->rt_bandwidth.rt_runtime = rt_runtime;
2663
2664        for_each_possible_cpu(i) {
2665                struct rt_rq *rt_rq = tg->rt_rq[i];
2666
2667                raw_spin_lock(&rt_rq->rt_runtime_lock);
2668                rt_rq->rt_runtime = rt_runtime;
2669                raw_spin_unlock(&rt_rq->rt_runtime_lock);
2670        }
2671        raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2672unlock:
2673        mutex_unlock(&rt_constraints_mutex);
2674
2675        return err;
2676}
2677
2678int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2679{
2680        u64 rt_runtime, rt_period;
2681
2682        rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2683        rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2684        if (rt_runtime_us < 0)
2685                rt_runtime = RUNTIME_INF;
2686        else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2687                return -EINVAL;
2688
2689        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2690}
2691
2692long sched_group_rt_runtime(struct task_group *tg)
2693{
2694        u64 rt_runtime_us;
2695
2696        if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2697                return -1;
2698
2699        rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2700        do_div(rt_runtime_us, NSEC_PER_USEC);
2701        return rt_runtime_us;
2702}
2703
2704int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2705{
2706        u64 rt_runtime, rt_period;
2707
2708        if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2709                return -EINVAL;
2710
2711        rt_period = rt_period_us * NSEC_PER_USEC;
2712        rt_runtime = tg->rt_bandwidth.rt_runtime;
2713
2714        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2715}
2716
2717long sched_group_rt_period(struct task_group *tg)
2718{
2719        u64 rt_period_us;
2720
2721        rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2722        do_div(rt_period_us, NSEC_PER_USEC);
2723        return rt_period_us;
2724}
2725
2726static int sched_rt_global_constraints(void)
2727{
2728        int ret = 0;
2729
2730        mutex_lock(&rt_constraints_mutex);
2731        ret = __rt_schedulable(NULL, 0, 0);
2732        mutex_unlock(&rt_constraints_mutex);
2733
2734        return ret;
2735}
2736
2737int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2738{
2739        /* Don't accept realtime tasks when there is no way for them to run */
2740        if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2741                return 0;
2742
2743        return 1;
2744}
2745
2746#else /* !CONFIG_RT_GROUP_SCHED */
2747static int sched_rt_global_constraints(void)
2748{
2749        unsigned long flags;
2750        int i;
2751
2752        raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2753        for_each_possible_cpu(i) {
2754                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2755
2756                raw_spin_lock(&rt_rq->rt_runtime_lock);
2757                rt_rq->rt_runtime = global_rt_runtime();
2758                raw_spin_unlock(&rt_rq->rt_runtime_lock);
2759        }
2760        raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2761
2762        return 0;
2763}
2764#endif /* CONFIG_RT_GROUP_SCHED */
2765
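    /*
     * Validate the global sysctl values: the period must be positive and,
     * unless runtime is RUNTIME_INF, the runtime may neither exceed the
     * period nor, converted to nanoseconds, exceed max_rt_runtime.
     */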
2766static int sched_rt_global_validate(void)
2767{
2768        if (sysctl_sched_rt_period <= 0)
2769                return -EINVAL;
2770
2771        if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2772                ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2773                 ((u64)sysctl_sched_rt_runtime *
2774                        NSEC_PER_USEC > max_rt_runtime)))
2775                return -EINVAL;
2776
2777        return 0;
2778}
2779
2780static void sched_rt_do_global(void)
2781{
2782        def_rt_bandwidth.rt_runtime = global_rt_runtime();
2783        def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2784}
2785
2786int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
2787                size_t *lenp, loff_t *ppos)
2788{
2789        int old_period, old_runtime;
2790        static DEFINE_MUTEX(mutex);
2791        int ret;
2792
2793        mutex_lock(&mutex);
2794        old_period = sysctl_sched_rt_period;
2795        old_runtime = sysctl_sched_rt_runtime;
2796
2797        ret = proc_dointvec(table, write, buffer, lenp, ppos);
2798
2799        if (!ret && write) {
2800                ret = sched_rt_global_validate();
2801                if (ret)
2802                        goto undo;
2803
2804                ret = sched_dl_global_validate();
2805                if (ret)
2806                        goto undo;
2807
2808                ret = sched_rt_global_constraints();
2809                if (ret)
2810                        goto undo;
2811
2812                sched_rt_do_global();
2813                sched_dl_do_global();
2814        }
2815        if (0) {
2816undo:
2817                sysctl_sched_rt_period = old_period;
2818                sysctl_sched_rt_runtime = old_runtime;
2819        }
2820        mutex_unlock(&mutex);
2821
2822        return ret;
2823}
2824
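    /*
     * sysctl handler for the SCHED_RR timeslice: the value is written in
     * milliseconds and stored internally in jiffies; writing zero or a
     * negative value restores the RR_TIMESLICE default. For example,
     * assuming the usual procfs path for this sysctl:
     *
     *   # echo 50 > /proc/sys/kernel/sched_rr_timeslice_ms
     *   # echo 0  > /proc/sys/kernel/sched_rr_timeslice_ms   # back to default
     */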
2825int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
2826                size_t *lenp, loff_t *ppos)
2827{
2828        int ret;
2829        static DEFINE_MUTEX(mutex);
2830
2831        mutex_lock(&mutex);
2832        ret = proc_dointvec(table, write, buffer, lenp, ppos);
2833        /*
2834         * Make sure that internally we keep jiffies.
2835         * Also, writing zero or a negative value resets the timeslice to the default:
2836         */
2837        if (!ret && write) {
2838                sched_rr_timeslice =
2839                        sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2840                        msecs_to_jiffies(sysctl_sched_rr_timeslice);
2841        }
2842        mutex_unlock(&mutex);
2843
2844        return ret;
2845}
2846
2847#ifdef CONFIG_SCHED_DEBUG
2848void print_rt_stats(struct seq_file *m, int cpu)
2849{
2850        rt_rq_iter_t iter;
2851        struct rt_rq *rt_rq;
2852
2853        rcu_read_lock();
2854        for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2855                print_rt_rq(m, cpu, rt_rq);
2856        rcu_read_unlock();
2857}
2858#endif /* CONFIG_SCHED_DEBUG */
2859