linux/kernel/sched/rt.c
   1/*
   2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   3 * policies)
   4 */
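
/*
 * Tasks usually enter this class from userspace via sched_setscheduler();
 * a minimal, illustrative sketch (userspace code, not part of this file):
 *
 *	struct sched_param param = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &param))
 *		perror("sched_setscheduler");
 *
 * SCHED_FIFO tasks run until they block, yield or are preempted by a higher
 * priority task; SCHED_RR additionally round-robins equal-priority tasks
 * when their timeslice expires.
 */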
   5
   6#include "sched.h"
   7
   8#include <linux/slab.h>
   9
  10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  11
  12struct rt_bandwidth def_rt_bandwidth;
  13
  14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  15{
  16        struct rt_bandwidth *rt_b =
  17                container_of(timer, struct rt_bandwidth, rt_period_timer);
  18        ktime_t now;
  19        int overrun;
  20        int idle = 0;
  21
  22        for (;;) {
  23                now = hrtimer_cb_get_time(timer);
  24                overrun = hrtimer_forward(timer, now, rt_b->rt_period);
  25
  26                if (!overrun)
  27                        break;
  28
  29                idle = do_sched_rt_period_timer(rt_b, overrun);
  30        }
  31
  32        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  33}
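
/*
 * The period timer fires once per rt_period; hrtimer_forward() returns how
 * many periods have elapsed (the overrun) and do_sched_rt_period_timer()
 * replenishes that many periods' worth of runtime.  When no rt_rq consumed
 * any time (idle), the timer is allowed to stop and is restarted lazily by
 * start_rt_bandwidth() on the next RT enqueue.
 */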
  34
  35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  36{
  37        rt_b->rt_period = ns_to_ktime(period);
  38        rt_b->rt_runtime = runtime;
  39
  40        raw_spin_lock_init(&rt_b->rt_runtime_lock);
  41
  42        hrtimer_init(&rt_b->rt_period_timer,
  43                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  44        rt_b->rt_period_timer.function = sched_rt_period_timer;
  45}
  46
  47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  48{
  49        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  50                return;
  51
  52        if (hrtimer_active(&rt_b->rt_period_timer))
  53                return;
  54
  55        raw_spin_lock(&rt_b->rt_runtime_lock);
  56        start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
  57        raw_spin_unlock(&rt_b->rt_runtime_lock);
  58}
  59
  60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  61{
  62        struct rt_prio_array *array;
  63        int i;
  64
  65        array = &rt_rq->active;
  66        for (i = 0; i < MAX_RT_PRIO; i++) {
  67                INIT_LIST_HEAD(array->queue + i);
  68                __clear_bit(i, array->bitmap);
  69        }
  70        /* delimiter for bitsearch: */
  71        __set_bit(MAX_RT_PRIO, array->bitmap);
  72
  73#if defined CONFIG_SMP
  74        rt_rq->highest_prio.curr = MAX_RT_PRIO;
  75        rt_rq->highest_prio.next = MAX_RT_PRIO;
  76        rt_rq->rt_nr_migratory = 0;
  77        rt_rq->overloaded = 0;
  78        plist_head_init(&rt_rq->pushable_tasks);
  79#endif
  80
  81        rt_rq->rt_time = 0;
  82        rt_rq->rt_throttled = 0;
  83        rt_rq->rt_runtime = 0;
  84        raw_spin_lock_init(&rt_rq->rt_runtime_lock);
  85}
  86
  87#ifdef CONFIG_RT_GROUP_SCHED
  88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
  89{
  90        hrtimer_cancel(&rt_b->rt_period_timer);
  91}
  92
  93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
  94
  95static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  96{
  97#ifdef CONFIG_SCHED_DEBUG
  98        WARN_ON_ONCE(!rt_entity_is_task(rt_se));
  99#endif
 100        return container_of(rt_se, struct task_struct, rt);
 101}
 102
 103static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 104{
 105        return rt_rq->rq;
 106}
 107
 108static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 109{
 110        return rt_se->rt_rq;
 111}
 112
 113void free_rt_sched_group(struct task_group *tg)
 114{
 115        int i;
 116
 117        if (tg->rt_se)
 118                destroy_rt_bandwidth(&tg->rt_bandwidth);
 119
 120        for_each_possible_cpu(i) {
 121                if (tg->rt_rq)
 122                        kfree(tg->rt_rq[i]);
 123                if (tg->rt_se)
 124                        kfree(tg->rt_se[i]);
 125        }
 126
 127        kfree(tg->rt_rq);
 128        kfree(tg->rt_se);
 129}
 130
 131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 132                struct sched_rt_entity *rt_se, int cpu,
 133                struct sched_rt_entity *parent)
 134{
 135        struct rq *rq = cpu_rq(cpu);
 136
 137        rt_rq->highest_prio.curr = MAX_RT_PRIO;
 138        rt_rq->rt_nr_boosted = 0;
 139        rt_rq->rq = rq;
 140        rt_rq->tg = tg;
 141
 142        tg->rt_rq[cpu] = rt_rq;
 143        tg->rt_se[cpu] = rt_se;
 144
 145        if (!rt_se)
 146                return;
 147
 148        if (!parent)
 149                rt_se->rt_rq = &rq->rt;
 150        else
 151                rt_se->rt_rq = parent->my_q;
 152
 153        rt_se->my_q = rt_rq;
 154        rt_se->parent = parent;
 155        INIT_LIST_HEAD(&rt_se->run_list);
 156}
 157
 158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 159{
 160        struct rt_rq *rt_rq;
 161        struct sched_rt_entity *rt_se;
 162        int i;
 163
 164        tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
 165        if (!tg->rt_rq)
 166                goto err;
 167        tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
 168        if (!tg->rt_se)
 169                goto err;
 170
 171        init_rt_bandwidth(&tg->rt_bandwidth,
 172                        ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 173
 174        for_each_possible_cpu(i) {
 175                rt_rq = kzalloc_node(sizeof(struct rt_rq),
 176                                     GFP_KERNEL, cpu_to_node(i));
 177                if (!rt_rq)
 178                        goto err;
 179
 180                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 181                                     GFP_KERNEL, cpu_to_node(i));
 182                if (!rt_se)
 183                        goto err_free_rq;
 184
 185                init_rt_rq(rt_rq, cpu_rq(i));
 186                rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 187                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 188        }
 189
 190        return 1;
 191
 192err_free_rq:
 193        kfree(rt_rq);
 194err:
 195        return 0;
 196}
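
/*
 * Each group gets a per-CPU rt_rq and rt_se plus its own rt_bandwidth.  Note
 * that the bandwidth is initialised with the default period but zero runtime,
 * so a freshly created group cannot run RT tasks until runtime is granted
 * explicitly (typically via the cpu cgroup's cpu.rt_runtime_us file).
 */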
 197
 198#else /* CONFIG_RT_GROUP_SCHED */
 199
 200#define rt_entity_is_task(rt_se) (1)
 201
 202static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 203{
 204        return container_of(rt_se, struct task_struct, rt);
 205}
 206
 207static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 208{
 209        return container_of(rt_rq, struct rq, rt);
 210}
 211
 212static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 213{
 214        struct task_struct *p = rt_task_of(rt_se);
 215        struct rq *rq = task_rq(p);
 216
 217        return &rq->rt;
 218}
 219
 220void free_rt_sched_group(struct task_group *tg) { }
 221
 222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 223{
 224        return 1;
 225}
 226#endif /* CONFIG_RT_GROUP_SCHED */
 227
 228#ifdef CONFIG_SMP
 229
 230static inline int rt_overloaded(struct rq *rq)
 231{
 232        return atomic_read(&rq->rd->rto_count);
 233}
 234
 235static inline void rt_set_overload(struct rq *rq)
 236{
 237        if (!rq->online)
 238                return;
 239
 240        cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 241        /*
 242         * Make sure the mask is visible before we set
 243         * the overload count. That is checked to determine
 244         * if we should look at the mask. It would be a shame
 245         * if we looked at the mask, but the mask was not
 246         * updated yet.
 247         */
 248        wmb();
 249        atomic_inc(&rq->rd->rto_count);
 250}
 251
 252static inline void rt_clear_overload(struct rq *rq)
 253{
 254        if (!rq->online)
 255                return;
 256
 257        /* the order here really doesn't matter */
 258        atomic_dec(&rq->rd->rto_count);
 259        cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 260}
 261
 262static void update_rt_migration(struct rt_rq *rt_rq)
 263{
 264        if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
 265                if (!rt_rq->overloaded) {
 266                        rt_set_overload(rq_of_rt_rq(rt_rq));
 267                        rt_rq->overloaded = 1;
 268                }
 269        } else if (rt_rq->overloaded) {
 270                rt_clear_overload(rq_of_rt_rq(rt_rq));
 271                rt_rq->overloaded = 0;
 272        }
 273}
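
/*
 * An rq counts as "RT overloaded" when it has more than one runnable RT task
 * and at least one of them may run elsewhere (nr_cpus_allowed > 1).  The
 * state is advertised through rd->rto_mask and rd->rto_count so other CPUs
 * know where pull_rt_task() might find work.
 */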
 274
 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 276{
 277        if (!rt_entity_is_task(rt_se))
 278                return;
 279
 280        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 281
 282        rt_rq->rt_nr_total++;
 283        if (rt_se->nr_cpus_allowed > 1)
 284                rt_rq->rt_nr_migratory++;
 285
 286        update_rt_migration(rt_rq);
 287}
 288
 289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 290{
 291        if (!rt_entity_is_task(rt_se))
 292                return;
 293
 294        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 295
 296        rt_rq->rt_nr_total--;
 297        if (rt_se->nr_cpus_allowed > 1)
 298                rt_rq->rt_nr_migratory--;
 299
 300        update_rt_migration(rt_rq);
 301}
 302
 303static inline int has_pushable_tasks(struct rq *rq)
 304{
 305        return !plist_head_empty(&rq->rt.pushable_tasks);
 306}
 307
 308static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 309{
 310        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 311        plist_node_init(&p->pushable_tasks, p->prio);
 312        plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 313
 314        /* Update the highest prio pushable task */
 315        if (p->prio < rq->rt.highest_prio.next)
 316                rq->rt.highest_prio.next = p->prio;
 317}
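
/*
 * pushable_tasks is a priority-sorted plist keyed on p->prio, so its first
 * entry is always the highest-priority task that could be pushed away; that
 * is what dequeue_pushable_task() below relies on when it recomputes
 * highest_prio.next from the new head.
 */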
 318
 319static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 320{
 321        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 322
 323        /* Update the new highest prio pushable task */
 324        if (has_pushable_tasks(rq)) {
 325                p = plist_first_entry(&rq->rt.pushable_tasks,
 326                                      struct task_struct, pushable_tasks);
 327                rq->rt.highest_prio.next = p->prio;
 328        } else
 329                rq->rt.highest_prio.next = MAX_RT_PRIO;
 330}
 331
 332#else
 333
 334static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 335{
 336}
 337
 338static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 339{
 340}
 341
 342static inline
 343void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 344{
 345}
 346
 347static inline
 348void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 349{
 350}
 351
 352#endif /* CONFIG_SMP */
 353
 354static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 355{
 356        return !list_empty(&rt_se->run_list);
 357}
 358
 359#ifdef CONFIG_RT_GROUP_SCHED
 360
 361static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 362{
 363        if (!rt_rq->tg)
 364                return RUNTIME_INF;
 365
 366        return rt_rq->rt_runtime;
 367}
 368
 369static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 370{
 371        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 372}
 373
 374typedef struct task_group *rt_rq_iter_t;
 375
 376static inline struct task_group *next_task_group(struct task_group *tg)
 377{
 378        do {
 379                tg = list_entry_rcu(tg->list.next,
 380                        typeof(struct task_group), list);
 381        } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 382
 383        if (&tg->list == &task_groups)
 384                tg = NULL;
 385
 386        return tg;
 387}
 388
 389#define for_each_rt_rq(rt_rq, iter, rq)                                 \
 390        for (iter = container_of(&task_groups, typeof(*iter), list);    \
 391                (iter = next_task_group(iter)) &&                       \
 392                (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 393
 394static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 395{
 396        list_add_rcu(&rt_rq->leaf_rt_rq_list,
 397                        &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
 398}
 399
 400static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 401{
 402        list_del_rcu(&rt_rq->leaf_rt_rq_list);
 403}
 404
 405#define for_each_leaf_rt_rq(rt_rq, rq) \
 406        list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 407
 408#define for_each_sched_rt_entity(rt_se) \
 409        for (; rt_se; rt_se = rt_se->parent)
 410
 411static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 412{
 413        return rt_se->my_q;
 414}
 415
 416static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
 417static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 418
 419static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 420{
 421        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 422        struct sched_rt_entity *rt_se;
 423
 424        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 425
 426        rt_se = rt_rq->tg->rt_se[cpu];
 427
 428        if (rt_rq->rt_nr_running) {
 429                if (rt_se && !on_rt_rq(rt_se))
 430                        enqueue_rt_entity(rt_se, false);
 431                if (rt_rq->highest_prio.curr < curr->prio)
 432                        resched_task(curr);
 433        }
 434}
 435
 436static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 437{
 438        struct sched_rt_entity *rt_se;
 439        int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 440
 441        rt_se = rt_rq->tg->rt_se[cpu];
 442
 443        if (rt_se && on_rt_rq(rt_se))
 444                dequeue_rt_entity(rt_se);
 445}
 446
 447static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 448{
 449        return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 450}
 451
 452static int rt_se_boosted(struct sched_rt_entity *rt_se)
 453{
 454        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 455        struct task_struct *p;
 456
 457        if (rt_rq)
 458                return !!rt_rq->rt_nr_boosted;
 459
 460        p = rt_task_of(rt_se);
 461        return p->prio != p->normal_prio;
 462}
 463
 464#ifdef CONFIG_SMP
 465static inline const struct cpumask *sched_rt_period_mask(void)
 466{
 467        return cpu_rq(smp_processor_id())->rd->span;
 468}
 469#else
 470static inline const struct cpumask *sched_rt_period_mask(void)
 471{
 472        return cpu_online_mask;
 473}
 474#endif
 475
 476static inline
 477struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 478{
 479        return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 480}
 481
 482static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 483{
 484        return &rt_rq->tg->rt_bandwidth;
 485}
 486
 487#else /* !CONFIG_RT_GROUP_SCHED */
 488
 489static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 490{
 491        return rt_rq->rt_runtime;
 492}
 493
 494static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 495{
 496        return ktime_to_ns(def_rt_bandwidth.rt_period);
 497}
 498
 499typedef struct rt_rq *rt_rq_iter_t;
 500
 501#define for_each_rt_rq(rt_rq, iter, rq) \
 502        for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 503
 504static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 505{
 506}
 507
 508static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
 509{
 510}
 511
 512#define for_each_leaf_rt_rq(rt_rq, rq) \
 513        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 514
 515#define for_each_sched_rt_entity(rt_se) \
 516        for (; rt_se; rt_se = NULL)
 517
 518static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 519{
 520        return NULL;
 521}
 522
 523static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 524{
 525        if (rt_rq->rt_nr_running)
 526                resched_task(rq_of_rt_rq(rt_rq)->curr);
 527}
 528
 529static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 530{
 531}
 532
 533static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 534{
 535        return rt_rq->rt_throttled;
 536}
 537
 538static inline const struct cpumask *sched_rt_period_mask(void)
 539{
 540        return cpu_online_mask;
 541}
 542
 543static inline
 544struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 545{
 546        return &cpu_rq(cpu)->rt;
 547}
 548
 549static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 550{
 551        return &def_rt_bandwidth;
 552}
 553
 554#endif /* CONFIG_RT_GROUP_SCHED */
 555
 556#ifdef CONFIG_SMP
 557/*
  558 * We ran out of runtime; see if we can borrow some from our neighbours.
 559 */
 560static int do_balance_runtime(struct rt_rq *rt_rq)
 561{
 562        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 563        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 564        int i, weight, more = 0;
 565        u64 rt_period;
 566
 567        weight = cpumask_weight(rd->span);
 568
 569        raw_spin_lock(&rt_b->rt_runtime_lock);
 570        rt_period = ktime_to_ns(rt_b->rt_period);
 571        for_each_cpu(i, rd->span) {
 572                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 573                s64 diff;
 574
 575                if (iter == rt_rq)
 576                        continue;
 577
 578                raw_spin_lock(&iter->rt_runtime_lock);
 579                /*
 580                 * Either all rqs have inf runtime and there's nothing to steal
 581                 * or __disable_runtime() below sets a specific rq to inf to
  582                 * indicate it's been disabled and disallow stealing.
 583                 */
 584                if (iter->rt_runtime == RUNTIME_INF)
 585                        goto next;
 586
 587                /*
 588                 * From runqueues with spare time, take 1/n part of their
 589                 * spare time, but no more than our period.
 590                 */
 591                diff = iter->rt_runtime - iter->rt_time;
 592                if (diff > 0) {
 593                        diff = div_u64((u64)diff, weight);
 594                        if (rt_rq->rt_runtime + diff > rt_period)
 595                                diff = rt_period - rt_rq->rt_runtime;
 596                        iter->rt_runtime -= diff;
 597                        rt_rq->rt_runtime += diff;
 598                        more = 1;
 599                        if (rt_rq->rt_runtime == rt_period) {
 600                                raw_spin_unlock(&iter->rt_runtime_lock);
 601                                break;
 602                        }
 603                }
 604next:
 605                raw_spin_unlock(&iter->rt_runtime_lock);
 606        }
 607        raw_spin_unlock(&rt_b->rt_runtime_lock);
 608
 609        return more;
 610}
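
/*
 * A rough worked example (hypothetical numbers): with a 4-CPU root domain
 * (weight == 4), a neighbour that has rt_runtime = 950ms and rt_time = 150ms
 * has diff = 800ms of spare budget; we may take 800/4 = 200ms of it, and the
 * transfer is clamped so our own rt_runtime never exceeds rt_period.
 */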
 611
 612/*
  613 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 614 */
 615static void __disable_runtime(struct rq *rq)
 616{
 617        struct root_domain *rd = rq->rd;
 618        rt_rq_iter_t iter;
 619        struct rt_rq *rt_rq;
 620
 621        if (unlikely(!scheduler_running))
 622                return;
 623
 624        for_each_rt_rq(rt_rq, iter, rq) {
 625                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 626                s64 want;
 627                int i;
 628
 629                raw_spin_lock(&rt_b->rt_runtime_lock);
 630                raw_spin_lock(&rt_rq->rt_runtime_lock);
 631                /*
 632                 * Either we're all inf and nobody needs to borrow, or we're
 633                 * already disabled and thus have nothing to do, or we have
 634                 * exactly the right amount of runtime to take out.
 635                 */
 636                if (rt_rq->rt_runtime == RUNTIME_INF ||
 637                                rt_rq->rt_runtime == rt_b->rt_runtime)
 638                        goto balanced;
 639                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 640
 641                /*
 642                 * Calculate the difference between what we started out with
  643                 * and what we currently have; that's the amount of runtime
  644                 * we lent and now have to reclaim.
 645                 */
 646                want = rt_b->rt_runtime - rt_rq->rt_runtime;
 647
 648                /*
 649                 * Greedy reclaim, take back as much as we can.
 650                 */
 651                for_each_cpu(i, rd->span) {
 652                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 653                        s64 diff;
 654
 655                        /*
 656                         * Can't reclaim from ourselves or disabled runqueues.
 657                         */
 658                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 659                                continue;
 660
 661                        raw_spin_lock(&iter->rt_runtime_lock);
 662                        if (want > 0) {
 663                                diff = min_t(s64, iter->rt_runtime, want);
 664                                iter->rt_runtime -= diff;
 665                                want -= diff;
 666                        } else {
 667                                iter->rt_runtime -= want;
 668                                want -= want;
 669                        }
 670                        raw_spin_unlock(&iter->rt_runtime_lock);
 671
 672                        if (!want)
 673                                break;
 674                }
 675
 676                raw_spin_lock(&rt_rq->rt_runtime_lock);
 677                /*
 678                 * We cannot be left wanting - that would mean some runtime
 679                 * leaked out of the system.
 680                 */
 681                BUG_ON(want);
 682balanced:
 683                /*
 684                 * Disable all the borrow logic by pretending we have inf
 685                 * runtime - in which case borrowing doesn't make sense.
 686                 */
 687                rt_rq->rt_runtime = RUNTIME_INF;
 688                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 689                raw_spin_unlock(&rt_b->rt_runtime_lock);
 690        }
 691}
 692
 693static void disable_runtime(struct rq *rq)
 694{
 695        unsigned long flags;
 696
 697        raw_spin_lock_irqsave(&rq->lock, flags);
 698        __disable_runtime(rq);
 699        raw_spin_unlock_irqrestore(&rq->lock, flags);
 700}
 701
 702static void __enable_runtime(struct rq *rq)
 703{
 704        rt_rq_iter_t iter;
 705        struct rt_rq *rt_rq;
 706
 707        if (unlikely(!scheduler_running))
 708                return;
 709
 710        /*
 711         * Reset each runqueue's bandwidth settings
 712         */
 713        for_each_rt_rq(rt_rq, iter, rq) {
 714                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 715
 716                raw_spin_lock(&rt_b->rt_runtime_lock);
 717                raw_spin_lock(&rt_rq->rt_runtime_lock);
 718                rt_rq->rt_runtime = rt_b->rt_runtime;
 719                rt_rq->rt_time = 0;
 720                rt_rq->rt_throttled = 0;
 721                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 722                raw_spin_unlock(&rt_b->rt_runtime_lock);
 723        }
 724}
 725
 726static void enable_runtime(struct rq *rq)
 727{
 728        unsigned long flags;
 729
 730        raw_spin_lock_irqsave(&rq->lock, flags);
 731        __enable_runtime(rq);
 732        raw_spin_unlock_irqrestore(&rq->lock, flags);
 733}
 734
 735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
 736{
 737        int cpu = (int)(long)hcpu;
 738
 739        switch (action) {
 740        case CPU_DOWN_PREPARE:
 741        case CPU_DOWN_PREPARE_FROZEN:
 742                disable_runtime(cpu_rq(cpu));
 743                return NOTIFY_OK;
 744
 745        case CPU_DOWN_FAILED:
 746        case CPU_DOWN_FAILED_FROZEN:
 747        case CPU_ONLINE:
 748        case CPU_ONLINE_FROZEN:
 749                enable_runtime(cpu_rq(cpu));
 750                return NOTIFY_OK;
 751
 752        default:
 753                return NOTIFY_DONE;
 754        }
 755}
 756
 757static int balance_runtime(struct rt_rq *rt_rq)
 758{
 759        int more = 0;
 760
 761        if (!sched_feat(RT_RUNTIME_SHARE))
 762                return more;
 763
 764        if (rt_rq->rt_time > rt_rq->rt_runtime) {
 765                raw_spin_unlock(&rt_rq->rt_runtime_lock);
 766                more = do_balance_runtime(rt_rq);
 767                raw_spin_lock(&rt_rq->rt_runtime_lock);
 768        }
 769
 770        return more;
 771}
 772#else /* !CONFIG_SMP */
 773static inline int balance_runtime(struct rt_rq *rt_rq)
 774{
 775        return 0;
 776}
 777#endif /* CONFIG_SMP */
 778
 779static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 780{
 781        int i, idle = 1;
 782        const struct cpumask *span;
 783
 784        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 785                return 1;
 786
 787        span = sched_rt_period_mask();
 788        for_each_cpu(i, span) {
 789                int enqueue = 0;
 790                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 791                struct rq *rq = rq_of_rt_rq(rt_rq);
 792
 793                raw_spin_lock(&rq->lock);
 794                if (rt_rq->rt_time) {
 795                        u64 runtime;
 796
 797                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 798                        if (rt_rq->rt_throttled)
 799                                balance_runtime(rt_rq);
 800                        runtime = rt_rq->rt_runtime;
 801                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 802                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 803                                rt_rq->rt_throttled = 0;
 804                                enqueue = 1;
 805
 806                                /*
 807                                 * Force a clock update if the CPU was idle,
 808                                 * lest wakeup -> unthrottle time accumulate.
 809                                 */
 810                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 811                                        rq->skip_clock_update = -1;
 812                        }
 813                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
 814                                idle = 0;
 815                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 816                } else if (rt_rq->rt_nr_running) {
 817                        idle = 0;
 818                        if (!rt_rq_throttled(rt_rq))
 819                                enqueue = 1;
 820                }
 821
 822                if (enqueue)
 823                        sched_rt_rq_enqueue(rt_rq);
 824                raw_spin_unlock(&rq->lock);
 825        }
 826
 827        return idle;
 828}
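
/*
 * This is the replenishment half of RT throttling: once per period (or per
 * batch of missed periods, hence the overrun multiplier) each rt_rq's
 * accumulated rt_time is reduced, and an rt_rq that was throttled is
 * unthrottled and re-enqueued as soon as rt_time drops below its runtime.
 * The return value tells the caller whether the period timer may go idle.
 */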
 829
 830static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 831{
 832#ifdef CONFIG_RT_GROUP_SCHED
 833        struct rt_rq *rt_rq = group_rt_rq(rt_se);
 834
 835        if (rt_rq)
 836                return rt_rq->highest_prio.curr;
 837#endif
 838
 839        return rt_task_of(rt_se)->prio;
 840}
 841
 842static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 843{
 844        u64 runtime = sched_rt_runtime(rt_rq);
 845
 846        if (rt_rq->rt_throttled)
 847                return rt_rq_throttled(rt_rq);
 848
 849        if (runtime >= sched_rt_period(rt_rq))
 850                return 0;
 851
 852        balance_runtime(rt_rq);
 853        runtime = sched_rt_runtime(rt_rq);
 854        if (runtime == RUNTIME_INF)
 855                return 0;
 856
 857        if (rt_rq->rt_time > runtime) {
 858                rt_rq->rt_throttled = 1;
 859                printk_once(KERN_WARNING "sched: RT throttling activated\n");
 860                if (rt_rq_throttled(rt_rq)) {
 861                        sched_rt_rq_dequeue(rt_rq);
 862                        return 1;
 863                }
 864        }
 865
 866        return 0;
 867}
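
/*
 * Throttling triggers once an rt_rq has consumed more than its runtime inside
 * the current period, after trying to borrow via balance_runtime().  With the
 * usual defaults of sched_rt_runtime_us = 950000 and
 * sched_rt_period_us = 1000000, RT tasks may monopolise at most 95% of each
 * second before lower classes get a chance to run.
 */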
 868
 869/*
 870 * Update the current task's runtime statistics. Skip current tasks that
 871 * are not in our scheduling class.
 872 */
 873static void update_curr_rt(struct rq *rq)
 874{
 875        struct task_struct *curr = rq->curr;
 876        struct sched_rt_entity *rt_se = &curr->rt;
 877        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 878        u64 delta_exec;
 879
 880        if (curr->sched_class != &rt_sched_class)
 881                return;
 882
 883        delta_exec = rq->clock_task - curr->se.exec_start;
 884        if (unlikely((s64)delta_exec < 0))
 885                delta_exec = 0;
 886
 887        schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 888
 889        curr->se.sum_exec_runtime += delta_exec;
 890        account_group_exec_runtime(curr, delta_exec);
 891
 892        curr->se.exec_start = rq->clock_task;
 893        cpuacct_charge(curr, delta_exec);
 894
 895        sched_rt_avg_update(rq, delta_exec);
 896
 897        if (!rt_bandwidth_enabled())
 898                return;
 899
 900        for_each_sched_rt_entity(rt_se) {
 901                rt_rq = rt_rq_of_se(rt_se);
 902
 903                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 904                        raw_spin_lock(&rt_rq->rt_runtime_lock);
 905                        rt_rq->rt_time += delta_exec;
 906                        if (sched_rt_runtime_exceeded(rt_rq))
 907                                resched_task(curr);
 908                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
 909                }
 910        }
 911}
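
/*
 * delta_exec is wall-clock time on rq->clock_task since the task was last
 * charged; it feeds the usual exec_runtime statistics and, when RT bandwidth
 * is enabled, is also charged to every rt_rq up the group hierarchy so that
 * sched_rt_runtime_exceeded() can throttle whichever group ran out of budget.
 */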
 912
 913#if defined CONFIG_SMP
 914
 915static void
 916inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 917{
 918        struct rq *rq = rq_of_rt_rq(rt_rq);
 919
 920        if (rq->online && prio < prev_prio)
 921                cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 922}
 923
 924static void
 925dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 926{
 927        struct rq *rq = rq_of_rt_rq(rt_rq);
 928
 929        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 930                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 931}
 932
 933#else /* CONFIG_SMP */
 934
 935static inline
 936void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 937static inline
 938void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 939
 940#endif /* CONFIG_SMP */
 941
 942#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 943static void
 944inc_rt_prio(struct rt_rq *rt_rq, int prio)
 945{
 946        int prev_prio = rt_rq->highest_prio.curr;
 947
 948        if (prio < prev_prio)
 949                rt_rq->highest_prio.curr = prio;
 950
 951        inc_rt_prio_smp(rt_rq, prio, prev_prio);
 952}
 953
 954static void
 955dec_rt_prio(struct rt_rq *rt_rq, int prio)
 956{
 957        int prev_prio = rt_rq->highest_prio.curr;
 958
 959        if (rt_rq->rt_nr_running) {
 960
 961                WARN_ON(prio < prev_prio);
 962
 963                /*
 964                 * This may have been our highest task, and therefore
 965                 * we may have some recomputation to do
 966                 */
 967                if (prio == prev_prio) {
 968                        struct rt_prio_array *array = &rt_rq->active;
 969
 970                        rt_rq->highest_prio.curr =
 971                                sched_find_first_bit(array->bitmap);
 972                }
 973
 974        } else
 975                rt_rq->highest_prio.curr = MAX_RT_PRIO;
 976
 977        dec_rt_prio_smp(rt_rq, prio, prev_prio);
 978}
 979
 980#else
 981
 982static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 983static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 984
 985#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 986
 987#ifdef CONFIG_RT_GROUP_SCHED
 988
 989static void
 990inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 991{
 992        if (rt_se_boosted(rt_se))
 993                rt_rq->rt_nr_boosted++;
 994
 995        if (rt_rq->tg)
 996                start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
 997}
 998
 999static void
1000dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1001{
1002        if (rt_se_boosted(rt_se))
1003                rt_rq->rt_nr_boosted--;
1004
1005        WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1006}
1007
1008#else /* CONFIG_RT_GROUP_SCHED */
1009
1010static void
1011inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1012{
1013        start_rt_bandwidth(&def_rt_bandwidth);
1014}
1015
1016static inline
1017void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1018
1019#endif /* CONFIG_RT_GROUP_SCHED */
1020
1021static inline
1022void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1023{
1024        int prio = rt_se_prio(rt_se);
1025
1026        WARN_ON(!rt_prio(prio));
1027        rt_rq->rt_nr_running++;
1028
1029        inc_rt_prio(rt_rq, prio);
1030        inc_rt_migration(rt_se, rt_rq);
1031        inc_rt_group(rt_se, rt_rq);
1032}
1033
1034static inline
1035void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1036{
1037        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1038        WARN_ON(!rt_rq->rt_nr_running);
1039        rt_rq->rt_nr_running--;
1040
1041        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1042        dec_rt_migration(rt_se, rt_rq);
1043        dec_rt_group(rt_se, rt_rq);
1044}
1045
1046static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1047{
1048        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1049        struct rt_prio_array *array = &rt_rq->active;
1050        struct rt_rq *group_rq = group_rt_rq(rt_se);
1051        struct list_head *queue = array->queue + rt_se_prio(rt_se);
1052
1053        /*
 1054         * Don't enqueue the group if it's throttled, or when empty.
 1055         * The latter is a consequence of the former when a child group
 1056         * gets throttled and the current group doesn't have any other
1057         * active members.
1058         */
1059        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1060                return;
1061
1062        if (!rt_rq->rt_nr_running)
1063                list_add_leaf_rt_rq(rt_rq);
1064
1065        if (head)
1066                list_add(&rt_se->run_list, queue);
1067        else
1068                list_add_tail(&rt_se->run_list, queue);
1069        __set_bit(rt_se_prio(rt_se), array->bitmap);
1070
1071        inc_rt_tasks(rt_se, rt_rq);
1072}
1073
1074static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1075{
1076        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1077        struct rt_prio_array *array = &rt_rq->active;
1078
1079        list_del_init(&rt_se->run_list);
1080        if (list_empty(array->queue + rt_se_prio(rt_se)))
1081                __clear_bit(rt_se_prio(rt_se), array->bitmap);
1082
1083        dec_rt_tasks(rt_se, rt_rq);
1084        if (!rt_rq->rt_nr_running)
1085                list_del_leaf_rt_rq(rt_rq);
1086}
1087
1088/*
1089 * Because the prio of an upper entry depends on the lower
 1090 * entries, we must remove entries top-down.
1091 */
1092static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1093{
1094        struct sched_rt_entity *back = NULL;
1095
1096        for_each_sched_rt_entity(rt_se) {
1097                rt_se->back = back;
1098                back = rt_se;
1099        }
1100
1101        for (rt_se = back; rt_se; rt_se = rt_se->back) {
1102                if (on_rt_rq(rt_se))
1103                        __dequeue_rt_entity(rt_se);
1104        }
1105}
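
/*
 * The first loop above records each entity's child in ->back while walking
 * towards the root; the second loop then dequeues starting from the root-most
 * entity, which gives the top-down order the comment above requires.
 */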
1106
1107static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1108{
1109        dequeue_rt_stack(rt_se);
1110        for_each_sched_rt_entity(rt_se)
1111                __enqueue_rt_entity(rt_se, head);
1112}
1113
1114static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1115{
1116        dequeue_rt_stack(rt_se);
1117
1118        for_each_sched_rt_entity(rt_se) {
1119                struct rt_rq *rt_rq = group_rt_rq(rt_se);
1120
1121                if (rt_rq && rt_rq->rt_nr_running)
1122                        __enqueue_rt_entity(rt_se, false);
1123        }
1124}
1125
1126/*
1127 * Adding/removing a task to/from a priority array:
1128 */
1129static void
1130enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1131{
1132        struct sched_rt_entity *rt_se = &p->rt;
1133
1134        if (flags & ENQUEUE_WAKEUP)
1135                rt_se->timeout = 0;
1136
1137        enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1138
1139        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
1140                enqueue_pushable_task(rq, p);
1141
1142        inc_nr_running(rq);
1143}
1144
1145static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1146{
1147        struct sched_rt_entity *rt_se = &p->rt;
1148
1149        update_curr_rt(rq);
1150        dequeue_rt_entity(rt_se);
1151
1152        dequeue_pushable_task(rq, p);
1153
1154        dec_nr_running(rq);
1155}
1156
1157/*
 1158 * Put a task at the head or the end of the run list without the overhead of
 1159 * a dequeue followed by an enqueue.
1160 */
1161static void
1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1163{
1164        if (on_rt_rq(rt_se)) {
1165                struct rt_prio_array *array = &rt_rq->active;
1166                struct list_head *queue = array->queue + rt_se_prio(rt_se);
1167
1168                if (head)
1169                        list_move(&rt_se->run_list, queue);
1170                else
1171                        list_move_tail(&rt_se->run_list, queue);
1172        }
1173}
1174
1175static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1176{
1177        struct sched_rt_entity *rt_se = &p->rt;
1178        struct rt_rq *rt_rq;
1179
1180        for_each_sched_rt_entity(rt_se) {
1181                rt_rq = rt_rq_of_se(rt_se);
1182                requeue_rt_entity(rt_rq, rt_se, head);
1183        }
1184}
1185
1186static void yield_task_rt(struct rq *rq)
1187{
1188        requeue_task_rt(rq, rq->curr, 0);
1189}
1190
1191#ifdef CONFIG_SMP
1192static int find_lowest_rq(struct task_struct *task);
1193
1194static int
1195select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1196{
1197        struct task_struct *curr;
1198        struct rq *rq;
1199        int cpu;
1200
1201        cpu = task_cpu(p);
1202
1203        if (p->rt.nr_cpus_allowed == 1)
1204                goto out;
1205
1206        /* For anything but wake ups, just return the task_cpu */
1207        if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1208                goto out;
1209
1210        rq = cpu_rq(cpu);
1211
1212        rcu_read_lock();
1213        curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1214
1215        /*
1216         * If the current task on @p's runqueue is an RT task, then
1217         * try to see if we can wake this RT task up on another
1218         * runqueue. Otherwise simply start this RT task
1219         * on its current runqueue.
1220         *
1221         * We want to avoid overloading runqueues. If the woken
1222         * task is a higher priority, then it will stay on this CPU
1223         * and the lower prio task should be moved to another CPU.
1224         * Even though this will probably make the lower prio task
 1225         * lose its cache, we do not want to bounce a higher prio task
1226         * around just because it gave up its CPU, perhaps for a
1227         * lock?
1228         *
1229         * For equal prio tasks, we just let the scheduler sort it out.
1230         *
1231         * Otherwise, just let it ride on the affined RQ and the
1232         * post-schedule router will push the preempted task away
1233         *
1234         * This test is optimistic, if we get it wrong the load-balancer
1235         * will have to sort it out.
1236         */
1237        if (curr && unlikely(rt_task(curr)) &&
1238            (curr->rt.nr_cpus_allowed < 2 ||
1239             curr->prio <= p->prio) &&
1240            (p->rt.nr_cpus_allowed > 1)) {
1241                int target = find_lowest_rq(p);
1242
1243                if (target != -1)
1244                        cpu = target;
1245        }
1246        rcu_read_unlock();
1247
1248out:
1249        return cpu;
1250}
1251
1252static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1253{
1254        if (rq->curr->rt.nr_cpus_allowed == 1)
1255                return;
1256
1257        if (p->rt.nr_cpus_allowed != 1
1258            && cpupri_find(&rq->rd->cpupri, p, NULL))
1259                return;
1260
1261        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1262                return;
1263
1264        /*
 1265         * There appear to be other CPUs that can accept
 1266         * current and none to run 'p', so let's reschedule
1267         * to try and push current away:
1268         */
1269        requeue_task_rt(rq, p, 1);
1270        resched_task(rq->curr);
1271}
1272
1273#endif /* CONFIG_SMP */
1274
1275/*
1276 * Preempt the current task with a newly woken task if needed:
1277 */
1278static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1279{
1280        if (p->prio < rq->curr->prio) {
1281                resched_task(rq->curr);
1282                return;
1283        }
1284
1285#ifdef CONFIG_SMP
1286        /*
1287         * If:
1288         *
1289         * - the newly woken task is of equal priority to the current task
1290         * - the newly woken task is non-migratable while current is migratable
1291         * - current will be preempted on the next reschedule
1292         *
1293         * we should check to see if current can readily move to a different
1294         * cpu.  If so, we will reschedule to allow the push logic to try
1295         * to move current somewhere else, making room for our non-migratable
1296         * task.
1297         */
1298        if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1299                check_preempt_equal_prio(rq, p);
1300#endif
1301}
1302
1303static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1304                                                   struct rt_rq *rt_rq)
1305{
1306        struct rt_prio_array *array = &rt_rq->active;
1307        struct sched_rt_entity *next = NULL;
1308        struct list_head *queue;
1309        int idx;
1310
1311        idx = sched_find_first_bit(array->bitmap);
1312        BUG_ON(idx >= MAX_RT_PRIO);
1313
1314        queue = array->queue + idx;
1315        next = list_entry(queue->next, struct sched_rt_entity, run_list);
1316
1317        return next;
1318}
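
/*
 * The priority bitmap has MAX_RT_PRIO + 1 bits; the delimiter bit set in
 * init_rt_rq() guarantees sched_find_first_bit() terminates, so idx can only
 * reach MAX_RT_PRIO (and trip the BUG_ON) if the rt_rq claims runnable tasks
 * while its bitmap is actually empty.
 */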
1319
1320static struct task_struct *_pick_next_task_rt(struct rq *rq)
1321{
1322        struct sched_rt_entity *rt_se;
1323        struct task_struct *p;
1324        struct rt_rq *rt_rq;
1325
1326        rt_rq = &rq->rt;
1327
1328        if (!rt_rq->rt_nr_running)
1329                return NULL;
1330
1331        if (rt_rq_throttled(rt_rq))
1332                return NULL;
1333
1334        do {
1335                rt_se = pick_next_rt_entity(rq, rt_rq);
1336                BUG_ON(!rt_se);
1337                rt_rq = group_rt_rq(rt_se);
1338        } while (rt_rq);
1339
1340        p = rt_task_of(rt_se);
1341        p->se.exec_start = rq->clock_task;
1342
1343        return p;
1344}
1345
1346static struct task_struct *pick_next_task_rt(struct rq *rq)
1347{
1348        struct task_struct *p = _pick_next_task_rt(rq);
1349
1350        /* The running task is never eligible for pushing */
1351        if (p)
1352                dequeue_pushable_task(rq, p);
1353
1354#ifdef CONFIG_SMP
1355        /*
1356         * We detect this state here so that we can avoid taking the RQ
1357         * lock again later if there is no need to push
1358         */
1359        rq->post_schedule = has_pushable_tasks(rq);
1360#endif
1361
1362        return p;
1363}
1364
1365static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1366{
1367        update_curr_rt(rq);
1368
1369        /*
1370         * The previous task needs to be made eligible for pushing
1371         * if it is still active
1372         */
1373        if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1374                enqueue_pushable_task(rq, p);
1375}
1376
1377#ifdef CONFIG_SMP
1378
1379/* Only try algorithms three times */
1380#define RT_MAX_TRIES 3
1381
1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1383{
1384        if (!task_running(rq, p) &&
1385            (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1386            (p->rt.nr_cpus_allowed > 1))
1387                return 1;
1388        return 0;
1389}
1390
1391/* Return the second highest RT task, NULL otherwise */
1392static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1393{
1394        struct task_struct *next = NULL;
1395        struct sched_rt_entity *rt_se;
1396        struct rt_prio_array *array;
1397        struct rt_rq *rt_rq;
1398        int idx;
1399
1400        for_each_leaf_rt_rq(rt_rq, rq) {
1401                array = &rt_rq->active;
1402                idx = sched_find_first_bit(array->bitmap);
1403next_idx:
1404                if (idx >= MAX_RT_PRIO)
1405                        continue;
1406                if (next && next->prio < idx)
1407                        continue;
1408                list_for_each_entry(rt_se, array->queue + idx, run_list) {
1409                        struct task_struct *p;
1410
1411                        if (!rt_entity_is_task(rt_se))
1412                                continue;
1413
1414                        p = rt_task_of(rt_se);
1415                        if (pick_rt_task(rq, p, cpu)) {
1416                                next = p;
1417                                break;
1418                        }
1419                }
1420                if (!next) {
1421                        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1422                        goto next_idx;
1423                }
1424        }
1425
1426        return next;
1427}
1428
1429static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1430
1431static int find_lowest_rq(struct task_struct *task)
1432{
1433        struct sched_domain *sd;
1434        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1435        int this_cpu = smp_processor_id();
1436        int cpu      = task_cpu(task);
1437
1438        /* Make sure the mask is initialized first */
1439        if (unlikely(!lowest_mask))
1440                return -1;
1441
1442        if (task->rt.nr_cpus_allowed == 1)
1443                return -1; /* No other targets possible */
1444
1445        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1446                return -1; /* No targets found */
1447
1448        /*
1449         * At this point we have built a mask of cpus representing the
1450         * lowest priority tasks in the system.  Now we want to elect
1451         * the best one based on our affinity and topology.
1452         *
1453         * We prioritize the last cpu that the task executed on since
1454         * it is most likely cache-hot in that location.
1455         */
1456        if (cpumask_test_cpu(cpu, lowest_mask))
1457                return cpu;
1458
1459        /*
1460         * Otherwise, we consult the sched_domains span maps to figure
1461         * out which cpu is logically closest to our hot cache data.
1462         */
1463        if (!cpumask_test_cpu(this_cpu, lowest_mask))
1464                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1465
1466        rcu_read_lock();
1467        for_each_domain(cpu, sd) {
1468                if (sd->flags & SD_WAKE_AFFINE) {
1469                        int best_cpu;
1470
1471                        /*
1472                         * "this_cpu" is cheaper to preempt than a
1473                         * remote processor.
1474                         */
1475                        if (this_cpu != -1 &&
1476                            cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1477                                rcu_read_unlock();
1478                                return this_cpu;
1479                        }
1480
1481                        best_cpu = cpumask_first_and(lowest_mask,
1482                                                     sched_domain_span(sd));
1483                        if (best_cpu < nr_cpu_ids) {
1484                                rcu_read_unlock();
1485                                return best_cpu;
1486                        }
1487                }
1488        }
1489        rcu_read_unlock();
1490
1491        /*
1492         * And finally, if there were no matches within the domains
1493         * just give the caller *something* to work with from the compatible
1494         * locations.
1495         */
1496        if (this_cpu != -1)
1497                return this_cpu;
1498
1499        cpu = cpumask_any(lowest_mask);
1500        if (cpu < nr_cpu_ids)
1501                return cpu;
1502        return -1;
1503}
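
/*
 * cpupri_find() fills lowest_mask with the CPUs currently running the lowest
 * priority work that @task could still preempt.  The selection order is: the
 * task's previous CPU if it is in the mask (cache-hot), then a CPU in the
 * mask sharing an SD_WAKE_AFFINE domain (preferring this_cpu), and finally
 * any CPU in the mask.
 */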
1504
1505/* Will lock the rq it finds */
1506static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1507{
1508        struct rq *lowest_rq = NULL;
1509        int tries;
1510        int cpu;
1511
1512        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1513                cpu = find_lowest_rq(task);
1514
1515                if ((cpu == -1) || (cpu == rq->cpu))
1516                        break;
1517
1518                lowest_rq = cpu_rq(cpu);
1519
1520                /* if the prio of this runqueue changed, try again */
1521                if (double_lock_balance(rq, lowest_rq)) {
1522                        /*
1523                         * We had to unlock the run queue. In
 1524                         * the meantime, the task could have
1525                         * migrated already or had its affinity changed.
1526                         * Also make sure that it wasn't scheduled on its rq.
1527                         */
1528                        if (unlikely(task_rq(task) != rq ||
1529                                     !cpumask_test_cpu(lowest_rq->cpu,
1530                                                       tsk_cpus_allowed(task)) ||
1531                                     task_running(rq, task) ||
1532                                     !task->on_rq)) {
1533
1534                                raw_spin_unlock(&lowest_rq->lock);
1535                                lowest_rq = NULL;
1536                                break;
1537                        }
1538                }
1539
 1540                /* If this rq is still suitable, use it. */
1541                if (lowest_rq->rt.highest_prio.curr > task->prio)
1542                        break;
1543
1544                /* try again */
1545                double_unlock_balance(rq, lowest_rq);
1546                lowest_rq = NULL;
1547        }
1548
1549        return lowest_rq;
1550}
1551
1552static struct task_struct *pick_next_pushable_task(struct rq *rq)
1553{
1554        struct task_struct *p;
1555
1556        if (!has_pushable_tasks(rq))
1557                return NULL;
1558
1559        p = plist_first_entry(&rq->rt.pushable_tasks,
1560                              struct task_struct, pushable_tasks);
1561
1562        BUG_ON(rq->cpu != task_cpu(p));
1563        BUG_ON(task_current(rq, p));
1564        BUG_ON(p->rt.nr_cpus_allowed <= 1);
1565
1566        BUG_ON(!p->on_rq);
1567        BUG_ON(!rt_task(p));
1568
1569        return p;
1570}
1571
1572/*
1573 * If the current CPU has more than one RT task, see if the non
1574 * running task can migrate over to a CPU that is running a task
1575 * of lesser priority.
1576 */
1577static int push_rt_task(struct rq *rq)
1578{
1579        struct task_struct *next_task;
1580        struct rq *lowest_rq;
1581        int ret = 0;
1582
1583        if (!rq->rt.overloaded)
1584                return 0;
1585
1586        next_task = pick_next_pushable_task(rq);
1587        if (!next_task)
1588                return 0;
1589
1590#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 1591        if (unlikely(task_running(rq, next_task)))
 1592                return 0;
1593#endif
1594
1595retry:
1596        if (unlikely(next_task == rq->curr)) {
1597                WARN_ON(1);
1598                return 0;
1599        }
1600
1601        /*
 1602         * It's possible that the next_task slipped in with a
 1603         * higher priority than current. If that's the case,
1604         * just reschedule current.
1605         */
1606        if (unlikely(next_task->prio < rq->curr->prio)) {
1607                resched_task(rq->curr);
1608                return 0;
1609        }
1610
1611        /* We might release rq lock */
1612        get_task_struct(next_task);
1613
1614        /* find_lock_lowest_rq locks the rq if found */
1615        lowest_rq = find_lock_lowest_rq(next_task, rq);
1616        if (!lowest_rq) {
1617                struct task_struct *task;
1618                /*
1619                 * find_lock_lowest_rq releases rq->lock
1620                 * so it is possible that next_task has migrated.
1621                 *
1622                 * We need to make sure that the task is still on the same
1623                 * run-queue and is also still the next task eligible for
1624                 * pushing.
1625                 */
1626                task = pick_next_pushable_task(rq);
1627                if (task_cpu(next_task) == rq->cpu && task == next_task) {
1628                        /*
1629                         * The task hasn't migrated, and is still the next
1630                         * eligible task, but we failed to find a run-queue
1631                         * to push it to.  Do not retry in this case, since
1632                         * other cpus will pull from us when ready.
1633                         */
1634                        goto out;
1635                }
1636
1637                if (!task)
1638                        /* No more tasks, just exit */
1639                        goto out;
1640
1641                /*
1642                 * Something has shifted, try again.
1643                 */
1644                put_task_struct(next_task);
1645                next_task = task;
1646                goto retry;
1647        }
1648
1649        deactivate_task(rq, next_task, 0);
1650        set_task_cpu(next_task, lowest_rq->cpu);
1651        activate_task(lowest_rq, next_task, 0);
1652        ret = 1;
1653
1654        resched_task(lowest_rq->curr);
1655
1656        double_unlock_balance(rq, lowest_rq);
1657
1658out:
1659        put_task_struct(next_task);
1660
1661        return ret;
1662}
1663
1664static void push_rt_tasks(struct rq *rq)
1665{
 1666        /* push_rt_task will return true if it moved an RT task */
1667        while (push_rt_task(rq))
1668                ;
1669}
1670
1671static int pull_rt_task(struct rq *this_rq)
1672{
1673        int this_cpu = this_rq->cpu, ret = 0, cpu;
1674        struct task_struct *p;
1675        struct rq *src_rq;
1676
1677        if (likely(!rt_overloaded(this_rq)))
1678                return 0;
1679
1680        for_each_cpu(cpu, this_rq->rd->rto_mask) {
1681                if (this_cpu == cpu)
1682                        continue;
1683
1684                src_rq = cpu_rq(cpu);
1685
1686                /*
1687                 * Don't bother taking the src_rq->lock if the next highest
1688                 * task is known to be lower-priority than our current task.
1689                 * This may look racy, but if this value is about to go
1690                 * logically higher, the src_rq will push this task away.
 1691         * And if it's going logically lower, we do not care.
1692                 */
1693                if (src_rq->rt.highest_prio.next >=
1694                    this_rq->rt.highest_prio.curr)
1695                        continue;
1696
1697                /*
1698                 * We can potentially drop this_rq's lock in
1699                 * double_lock_balance, and another CPU could
1700                 * alter this_rq
1701                 */
1702                double_lock_balance(this_rq, src_rq);
1703
1704                /*
1705                 * Are there still pullable RT tasks?
1706                 */
1707                if (src_rq->rt.rt_nr_running <= 1)
1708                        goto skip;
1709
1710                p = pick_next_highest_task_rt(src_rq, this_cpu);
1711
1712                /*
1713                 * Do we have an RT task that preempts
1714                 * the to-be-scheduled task?
1715                 */
1716                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1717                        WARN_ON(p == src_rq->curr);
1718                        WARN_ON(!p->on_rq);
1719
1720                        /*
1721                         * There's a chance that p is higher in priority
1722                         * than what's currently running on its cpu.
1723                         * This is just that p is waking up and hasn't
1724                         * had a chance to schedule. We only pull
1725                         * p if it is lower in priority than the
1726                         * current task on the run queue
1727                         */
1728                        if (p->prio < src_rq->curr->prio)
1729                                goto skip;
1730
1731                        ret = 1;
1732
1733                        deactivate_task(src_rq, p, 0);
1734                        set_task_cpu(p, this_cpu);
1735                        activate_task(this_rq, p, 0);
1736                        /*
1737                         * We continue with the search, just in
1738                         * case there's an even higher prio task
1739                         * in another runqueue. (low likelihood
1740                         * but possible)
1741                         */
1742                }
1743skip:
1744                double_unlock_balance(this_rq, src_rq);
1745        }
1746
1747        return ret;
1748}
1749
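/*
 * Worked example of the lockless filter in pull_rt_task() (the numbers
 * are made up; a lower prio value means higher priority):
 *
 *	this_rq->rt.highest_prio.curr == 50
 *	src_rq->rt.highest_prio.next  == 40
 *
 * Since 40 < 50, src_rq may hold a queued RT task that would preempt
 * whatever this CPU is about to run, so it is worth taking src_rq->lock
 * and calling pick_next_highest_task_rt(). Had .next been 60 (>= 50),
 * the candidate could not preempt us and the CPU is skipped unlocked.
 */
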
1750static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1751{
1752        /* Try to pull RT tasks here if we lower this rq's prio */
1753        if (rq->rt.highest_prio.curr > prev->prio)
1754                pull_rt_task(rq);
1755}
1756
1757static void post_schedule_rt(struct rq *rq)
1758{
1759        push_rt_tasks(rq);
1760}
1761
1762/*
1763 * If we are not running and we are not going to reschedule soon, we should
1764 * try to push tasks away now
1765 */
1766static void task_woken_rt(struct rq *rq, struct task_struct *p)
1767{
1768        if (!task_running(rq, p) &&
1769            !test_tsk_need_resched(rq->curr) &&
1770            has_pushable_tasks(rq) &&
1771            p->rt.nr_cpus_allowed > 1 &&
1772            rt_task(rq->curr) &&
1773            (rq->curr->rt.nr_cpus_allowed < 2 ||
1774             rq->curr->prio <= p->prio))
1775                push_rt_tasks(rq);
1776}
1777
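/*
 * Example of the wakeup-push heuristic above (hypothetical values):
 *
 *	p->prio == 60, p->rt.nr_cpus_allowed == 4
 *	rq->curr: RT, prio == 40, rt.nr_cpus_allowed == 1
 *
 * p cannot preempt the current task (60 > 40) and the current task cannot
 * be moved off this CPU, so pushing p right away keeps it from sitting
 * queued behind a higher-priority, unmovable task.
 */
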
1778static void set_cpus_allowed_rt(struct task_struct *p,
1779                                const struct cpumask *new_mask)
1780{
1781        int weight = cpumask_weight(new_mask);
1782
1783        BUG_ON(!rt_task(p));
1784
1785        /*
1786         * Update the migration status of the rq if we have an RT task
1787         * which is on a runqueue AND changing its allowed-CPU weight.
1788         */
1789        if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1790                struct rq *rq = task_rq(p);
1791
1792                if (!task_current(rq, p)) {
1793                        /*
1794                         * Make sure we dequeue this task from the pushable list
1795                         * before going further.  It will either remain off of
1796                         * the list because we are no longer pushable, or it
1797                         * will be requeued.
1798                         */
1799                        if (p->rt.nr_cpus_allowed > 1)
1800                                dequeue_pushable_task(rq, p);
1801
1802                        /*
1803                         * Requeue if our weight is changing and still > 1
1804                         */
1805                        if (weight > 1)
1806                                enqueue_pushable_task(rq, p);
1807
1808                }
1809
1810                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1811                        rq->rt.rt_nr_migratory++;
1812                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1813                        BUG_ON(!rq->rt.rt_nr_migratory);
1814                        rq->rt.rt_nr_migratory--;
1815                }
1816
1817                update_rt_migration(&rq->rt);
1818        }
1819}
1820
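/*
 * Example of the migratory accounting above (hypothetical values): a
 * queued, not currently running RT task whose affinity shrinks from
 * 4 CPUs to 1:
 *
 *	p->rt.nr_cpus_allowed == 4, weight == 1
 *		dequeue_pushable_task(rq, p);	(not re-enqueued, weight <= 1)
 *		rq->rt.rt_nr_migratory--;
 *		update_rt_migration(&rq->rt);	(may clear the overload state)
 *
 * Growing the mask from 1 CPU back to several is the mirror image: the
 * task becomes pushable again and rt_nr_migratory is incremented.
 */
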
1821/* Assumes rq->lock is held */
1822static void rq_online_rt(struct rq *rq)
1823{
1824        if (rq->rt.overloaded)
1825                rt_set_overload(rq);
1826
1827        __enable_runtime(rq);
1828
1829        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1830}
1831
1832/* Assumes rq->lock is held */
1833static void rq_offline_rt(struct rq *rq)
1834{
1835        if (rq->rt.overloaded)
1836                rt_clear_overload(rq);
1837
1838        __disable_runtime(rq);
1839
1840        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1841}
1842
1843/*
1844 * When switching away from the rt queue, we may end up in a position
1845 * where we want to pull RT tasks from other runqueues.
1846 */
1847static void switched_from_rt(struct rq *rq, struct task_struct *p)
1848{
1849        /*
1850         * If there are other RT tasks then we will reschedule
1851         * and the scheduling of the other RT tasks will handle
1852         * the balancing. But if we are the last RT task
1853         * we may need to handle the pulling of RT tasks
1854         * now.
1855         */
1856        if (p->on_rq && !rq->rt.rt_nr_running)
1857                pull_rt_task(rq);
1858}
1859
1860void init_sched_rt_class(void)
1861{
1862        unsigned int i;
1863
1864        for_each_possible_cpu(i) {
1865                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1866                                        GFP_KERNEL, cpu_to_node(i));
1867        }
1868}
1869#endif /* CONFIG_SMP */
1870
1871/*
1872 * When switching a task to RT, we may overload the runqueue
1873 * with RT tasks. In this case we try to push them off to
1874 * other runqueues.
1875 */
1876static void switched_to_rt(struct rq *rq, struct task_struct *p)
1877{
1878        int check_resched = 1;
1879
1880        /*
1881         * If we are already running, then there's nothing
1882         * that needs to be done. But if we are not running
1883         * we may need to preempt the currently running task.
1884         * If that currently running task is also an RT task,
1885         * then see if we can move to another run queue.
1886         */
1887        if (p->on_rq && rq->curr != p) {
1888#ifdef CONFIG_SMP
1889                if (rq->rt.overloaded && push_rt_task(rq) &&
1890                    /* Don't resched if we changed runqueues */
1891                    rq != task_rq(p))
1892                        check_resched = 0;
1893#endif /* CONFIG_SMP */
1894                if (check_resched && p->prio < rq->curr->prio)
1895                        resched_task(rq->curr);
1896        }
1897}
1898
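/*
 * Example (hypothetical priorities): a CFS task is switched to SCHED_FIFO
 * and ends up with prio 10 while rq->curr is a FIFO task of prio 5. If the
 * runqueue is now RT-overloaded we first try push_rt_task(); should p move
 * to another runqueue there is nothing to resched locally. Otherwise
 * 10 < 5 is false, so rq->curr (the higher-priority task) keeps running.
 */
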
1899/*
1900 * Priority of the task has changed. This may cause
1901 * us to initiate a push or pull.
1902 */
1903static void
1904prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1905{
1906        if (!p->on_rq)
1907                return;
1908
1909        if (rq->curr == p) {
1910#ifdef CONFIG_SMP
1911                /*
1912                 * If our priority decreases while running, we
1913                 * may need to pull tasks to this runqueue.
1914                 */
1915                if (oldprio < p->prio)
1916                        pull_rt_task(rq);
1917                /*
1918                 * If there's a higher priority task waiting to run
1919                 * then reschedule. Note, the above pull_rt_task
1920                 * can release the rq lock and p could migrate.
1921                 * Only reschedule if p is still on the same runqueue.
1922                 */
1923                if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1924                        resched_task(p);
1925#else
1926                /* For UP simply resched on drop of prio */
1927                if (oldprio < p->prio)
1928                        resched_task(p);
1929#endif /* CONFIG_SMP */
1930        } else {
1931                /*
1932                 * This task is not running, but if its priority is
1933                 * higher than that of the currently running task,
1934                 * then reschedule.
1935                 */
1936                if (p->prio < rq->curr->prio)
1937                        resched_task(rq->curr);
1938        }
1939}
1940
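/*
 * Example with made-up priority values: if the running task's prio goes
 * from 30 to 60 (oldprio 30 < 60, i.e. it just got weaker), we pull in
 * case another runqueue now holds work that should run here, and resched
 * p if this runqueue now has a higher-priority (numerically lower) task
 * queued. If instead a *queued* task drops from prio 60 to 20 while
 * rq->curr runs at 30, then 20 < 30 and rq->curr is rescheduled.
 */
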
1941static void watchdog(struct rq *rq, struct task_struct *p)
1942{
1943        unsigned long soft, hard;
1944
1945        /* max may change after cur was read, this will be fixed next tick */
1946        soft = task_rlimit(p, RLIMIT_RTTIME);
1947        hard = task_rlimit_max(p, RLIMIT_RTTIME);
1948
1949        if (soft != RLIM_INFINITY) {
1950                unsigned long next;
1951
1952                p->rt.timeout++;
1953                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1954                if (p->rt.timeout > next)
1955                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1956        }
1957}
1958
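/*
 * Worked example of the watchdog arithmetic, assuming HZ == 1000 (one
 * tick per 1000 us), a soft RLIMIT_RTTIME of 950000 us and hard >= soft:
 *
 *	next = DIV_ROUND_UP(min(950000, hard), USEC_PER_SEC / HZ)
 *	     = DIV_ROUND_UP(950000, 1000)
 *	     = 950 ticks
 *
 * Once p->rt.timeout exceeds 950 ticks of RT execution, sched_exp is set
 * so the posix CPU-timer code can enforce the limit (SIGXCPU at the soft
 * limit, SIGKILL at the hard limit, per setrlimit(2)).
 */
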
1959static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1960{
1961        update_curr_rt(rq);
1962
1963        watchdog(rq, p);
1964
1965        /*
1966         * RR tasks need a special form of timeslice management.
1967         * FIFO tasks have no timeslices.
1968         */
1969        if (p->policy != SCHED_RR)
1970                return;
1971
1972        if (--p->rt.time_slice)
1973                return;
1974
1975        p->rt.time_slice = DEF_TIMESLICE;
1976
1977        /*
1978         * Requeue to the end of queue if we are not the only element
1979         * on the queue:
1980         */
1981        if (p->rt.run_list.prev != p->rt.run_list.next) {
1982                requeue_task_rt(rq, p, 0);
1983                set_tsk_need_resched(p);
1984        }
1985}
1986
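/*
 * Round-robin timeslice example: with HZ == 1000 and assuming
 * DEF_TIMESLICE is 100 ms worth of ticks (100 in that case), a SCHED_RR
 * task gets ticked 100 times, its slice is then refilled and, if at least
 * one other task is queued at the same priority, it is requeued at the
 * tail and marked for resched. SCHED_FIFO tasks return early above and
 * run until they block, yield, or are preempted by a higher-priority task.
 */
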
1987static void set_curr_task_rt(struct rq *rq)
1988{
1989        struct task_struct *p = rq->curr;
1990
1991        p->se.exec_start = rq->clock_task;
1992
1993        /* The running task is never eligible for pushing */
1994        dequeue_pushable_task(rq, p);
1995}
1996
1997static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1998{
1999        /*
2000         * Time slice is 0 for SCHED_FIFO tasks
2001         */
2002        if (task->policy == SCHED_RR)
2003                return DEF_TIMESLICE;
2004        else
2005                return 0;
2006}
2007
2008const struct sched_class rt_sched_class = {
2009        .next                   = &fair_sched_class,
2010        .enqueue_task           = enqueue_task_rt,
2011        .dequeue_task           = dequeue_task_rt,
2012        .yield_task             = yield_task_rt,
2013
2014        .check_preempt_curr     = check_preempt_curr_rt,
2015
2016        .pick_next_task         = pick_next_task_rt,
2017        .put_prev_task          = put_prev_task_rt,
2018
2019#ifdef CONFIG_SMP
2020        .select_task_rq         = select_task_rq_rt,
2021
2022        .set_cpus_allowed       = set_cpus_allowed_rt,
2023        .rq_online              = rq_online_rt,
2024        .rq_offline             = rq_offline_rt,
2025        .pre_schedule           = pre_schedule_rt,
2026        .post_schedule          = post_schedule_rt,
2027        .task_woken             = task_woken_rt,
2028        .switched_from          = switched_from_rt,
2029#endif
2030
2031        .set_curr_task          = set_curr_task_rt,
2032        .task_tick              = task_tick_rt,
2033
2034        .get_rr_interval        = get_rr_interval_rt,
2035
2036        .prio_changed           = prio_changed_rt,
2037        .switched_to            = switched_to_rt,
2038};
2039
2040#ifdef CONFIG_SCHED_DEBUG
2041extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2042
2043void print_rt_stats(struct seq_file *m, int cpu)
2044{
2045        rt_rq_iter_t iter;
2046        struct rt_rq *rt_rq;
2047
2048        rcu_read_lock();
2049        for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2050                print_rt_rq(m, cpu, rt_rq);
2051        rcu_read_unlock();
2052}
2053#endif /* CONFIG_SCHED_DEBUG */
2054