/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 */
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (sysctl_sched_tunable_scaling).
 *
 * Options are:
 *  SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *  SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 *  SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * Kept equal to sysctl_sched_latency / sysctl_sched_min_granularity
 * (see sched_proc_update_handler() below).
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
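
/*
 * Illustrative annotation (added, not part of the original source): the
 * function above computes approximately delta_exec * weight / lw->weight
 * using a cached fixed-point reciprocal (inv_weight ~= 2^32 / lw->weight)
 * instead of a per-call division.  For example, an entity whose weight is
 * half of the queue's total load sees its delta scaled by roughly 1/2:
 * 1ms of real run time becomes ~0.5ms of weighted time.
 */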

const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}
261
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{
264 if (!cfs_rq->on_list) {
265
266
267
268
269
270
271 if (cfs_rq->tg->parent &&
272 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
273 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
274 &rq_of(cfs_rq)->leaf_cfs_rq_list);
275 } else {
276 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
277 &rq_of(cfs_rq)->leaf_cfs_rq_list);
278 }
279
280 cfs_rq->on_list = 1;
281 }
282}
283
284static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
285{
286 if (cfs_rq->on_list) {
287 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
288 cfs_rq->on_list = 0;
289 }
290}
291
292
293#define for_each_leaf_cfs_rq(rq, cfs_rq) \
294 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
295
296
297static inline int
298is_same_group(struct sched_entity *se, struct sched_entity *pse)
299{
300 if (se->cfs_rq == pse->cfs_rq)
301 return 1;
302
303 return 0;
304}
305
306static inline struct sched_entity *parent_entity(struct sched_entity *se)
307{
308 return se->parent;
309}
310
311
312static inline int depth_se(struct sched_entity *se)
313{
314 int depth = 0;
315
316 for_each_sched_entity(se)
317 depth++;
318
319 return depth;
320}
321
322static void
323find_matching_se(struct sched_entity **se, struct sched_entity **pse)
324{
325 int se_depth, pse_depth;
326
327
328
329
330
331
332
333
334
335 se_depth = depth_se(*se);
336 pse_depth = depth_se(*pse);
337
338 while (se_depth > pse_depth) {
339 se_depth--;
340 *se = parent_entity(*se);
341 }
342
343 while (pse_depth > se_depth) {
344 pse_depth--;
345 *pse = parent_entity(*pse);
346 }
347
348 while (!is_same_group(*se, *pse)) {
349 *se = parent_entity(*se);
350 *pse = parent_entity(*pse);
351 }
352}
353
354#else
355
356static inline struct task_struct *task_of(struct sched_entity *se)
357{
358 return container_of(se, struct task_struct, se);
359}
360
361static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
362{
363 return container_of(cfs_rq, struct rq, cfs);
364}
365
366#define entity_is_task(se) 1
367
368#define for_each_sched_entity(se) \
369 for (; se; se = NULL)
370
371static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
372{
373 return &task_rq(p)->cfs;
374}
375
376static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
377{
378 struct task_struct *p = task_of(se);
379 struct rq *rq = task_rq(p);
380
381 return &rq->cfs;
382}
383
384
385static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
386{
387 return NULL;
388}
389
390static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
391{
392}
393
394static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
395{
396}
397
398#define for_each_leaf_cfs_rq(rq, cfs_rq) \
399 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
400
401static inline int
402is_same_group(struct sched_entity *se, struct sched_entity *pse)
403{
404 return 1;
405}
406
407static inline struct sched_entity *parent_entity(struct sched_entity *se)
408{
409 return NULL;
410}
411
412static inline void
413find_matching_se(struct sched_entity **se, struct sched_entity **pse)
414{
415}
416
417#endif

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rb-tree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (> sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

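/*
 * Illustrative annotation (added, not part of the original source): with the
 * default, unscaled tunables above (sched_latency = 6ms,
 * sched_min_granularity = 0.75ms, sched_nr_latency = 8), a runqueue with 3
 * equally weighted tasks keeps the 6ms period and each task receives a 2ms
 * wall-time slice; with 16 runnable tasks the period is stretched to
 * 16 * 0.75ms = 12ms so no slice drops below the minimum granularity.
 */
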
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
static void update_cfs_shares(struct cfs_rq *cfs_rq);

/*
 * Update the current entity's runtime statistics: advance sum_exec_runtime
 * and vruntime by the (weighted) time it just ran, and refresh the
 * runqueue's min_vruntime.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);

#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
	cfs_rq->load_unacc_exec_time += delta_exec;
#endif
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
715
716static inline void
717update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
718{
719 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
720}
721
722
723
724
725static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
726{
727
728
729
730
731 if (se != cfs_rq->curr)
732 update_stats_wait_start(cfs_rq, se);
733}
734
735static void
736update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
737{
738 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
739 rq_of(cfs_rq)->clock - se->statistics.wait_start));
740 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
741 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
742 rq_of(cfs_rq)->clock - se->statistics.wait_start);
743#ifdef CONFIG_SCHEDSTATS
744 if (entity_is_task(se)) {
745 trace_sched_stat_wait(task_of(se),
746 rq_of(cfs_rq)->clock - se->statistics.wait_start);
747 }
748#endif
749 schedstat_set(se->statistics.wait_start, 0);
750}
751
752static inline void
753update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
754{
755
756
757
758
759 if (se != cfs_rq->curr)
760 update_stats_wait_end(cfs_rq, se);
761}
762
763
764
765
766static inline void
767update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
768{
769
770
771
772 se->exec_start = rq_of(cfs_rq)->clock_task;
773}
774
775
776
777
778
779static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{
782 update_load_add(&cfs_rq->load, se->load.weight);
783 if (!parent_entity(se))
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785#ifdef CONFIG_SMP
786 if (entity_is_task(se))
787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788#endif
789 cfs_rq->nr_running++;
790}
791
792static void
793account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
794{
795 update_load_sub(&cfs_rq->load, se->load.weight);
796 if (!parent_entity(se))
797 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
798 if (entity_is_task(se))
799 list_del_init(&se->group_node);
800 cfs_rq->nr_running--;
801}
802
803#ifdef CONFIG_FAIR_GROUP_SCHED
804
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856
857
858
859
860
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{
872 long tg_weight;
873
874
875
876
877
878
879 tg_weight = atomic_read(&tg->load_weight);
880 tg_weight -= cfs_rq->load_contribution;
881 tg_weight += cfs_rq->load.weight;
882
883 return tg_weight;
884}
885
886static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
887{
888 long tg_weight, load, shares;
889
890 tg_weight = calc_tg_weight(tg, cfs_rq);
891 load = cfs_rq->load.weight;
892
893 shares = (tg->shares * load);
894 if (tg_weight)
895 shares /= tg_weight;
896
897 if (shares < MIN_SHARES)
898 shares = MIN_SHARES;
899 if (shares > tg->shares)
900 shares = tg->shares;
901
902 return shares;
903}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{
919 return tg->shares;
920}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight)
928{
929 if (se->on_rq) {
930
931 if (cfs_rq->curr == se)
932 update_curr(cfs_rq);
933 account_entity_dequeue(cfs_rq, se);
934 }
935
936 update_load_set(&se->load, weight);
937
938 if (se->on_rq)
939 account_entity_enqueue(cfs_rq, se);
940}
941
942static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{
944 struct task_group *tg;
945 struct sched_entity *se;
946 long shares;
947
948 tg = cfs_rq->tg;
949 se = tg->se[cpu_of(rq_of(cfs_rq))];
950 if (!se || throttled_hierarchy(cfs_rq))
951 return;
952#ifndef CONFIG_SMP
953 if (likely(se->load.weight == tg->shares))
954 return;
955#endif
956 shares = calc_cfs_shares(cfs_rq, tg);
957
958 reweight_entity(cfs_rq_of(se), se, shares);
959}
960#else
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
962{
963}
964
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
966{
967}
968
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
970{
971}
972#endif
973
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{
976#ifdef CONFIG_SCHEDSTATS
977 struct task_struct *tsk = NULL;
978
979 if (entity_is_task(se))
980 tsk = task_of(se);
981
982 if (se->statistics.sleep_start) {
983 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
984
985 if ((s64)delta < 0)
986 delta = 0;
987
988 if (unlikely(delta > se->statistics.sleep_max))
989 se->statistics.sleep_max = delta;
990
991 se->statistics.sleep_start = 0;
992 se->statistics.sum_sleep_runtime += delta;
993
994 if (tsk) {
995 account_scheduler_latency(tsk, delta >> 10, 1);
996 trace_sched_stat_sleep(tsk, delta);
997 }
998 }
999 if (se->statistics.block_start) {
1000 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1001
1002 if ((s64)delta < 0)
1003 delta = 0;
1004
1005 if (unlikely(delta > se->statistics.block_max))
1006 se->statistics.block_max = delta;
1007
1008 se->statistics.block_start = 0;
1009 se->statistics.sum_sleep_runtime += delta;
1010
1011 if (tsk) {
1012 if (tsk->in_iowait) {
1013 se->statistics.iowait_sum += delta;
1014 se->statistics.iowait_count++;
1015 trace_sched_stat_iowait(tsk, delta);
1016 }
1017
1018 trace_sched_stat_blocked(tsk, delta);
1019
1020
1021
1022
1023
1024
1025 if (unlikely(prof_on == SLEEP_PROFILING)) {
1026 profile_hits(SLEEP_PROFILING,
1027 (void *)get_wchan(tsk),
1028 delta >> 20);
1029 }
1030 account_scheduler_latency(tsk, delta >> 10, 0);
1031 }
1032 }
1033#endif
1034}
1035
1036static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1037{
1038#ifdef CONFIG_SCHED_DEBUG
1039 s64 d = se->vruntime - cfs_rq->min_vruntime;
1040
1041 if (d < 0)
1042 d = -d;
1043
1044 if (d > 3*sysctl_sched_latency)
1045 schedstat_inc(cfs_rq, nr_spread_over);
1046#endif
1047}
1048
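/*
 * Added annotation: place_entity() decides the starting vruntime for an
 * entity being put on the queue.  New tasks (initial && START_DEBIT) are
 * charged one vruntime slice up front so they cannot immediately preempt
 * everybody; wakeups are instead credited up to one sched_latency (halved
 * when GENTLE_FAIR_SLEEPERS is set), but an entity is never moved backwards
 * below its own current vruntime.
 */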
1049static void
1050place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1051{
1052 u64 vruntime = cfs_rq->min_vruntime;
1053
1054
1055
1056
1057
1058
1059
1060 if (initial && sched_feat(START_DEBIT))
1061 vruntime += sched_vslice(cfs_rq, se);
1062
1063
1064 if (!initial) {
1065 unsigned long thresh = sysctl_sched_latency;
1066
1067
1068
1069
1070
1071 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1072 thresh >>= 1;
1073
1074 vruntime -= thresh;
1075 }
1076
1077
1078 vruntime = max_vruntime(se->vruntime, vruntime);
1079
1080 se->vruntime = vruntime;
1081}
1082
1083static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1084
1085static void
1086enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1087{
1088
1089
1090
1091
1092 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1093 se->vruntime += cfs_rq->min_vruntime;
1094
1095
1096
1097
1098 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0);
1100 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq);
1102
1103 if (flags & ENQUEUE_WAKEUP) {
1104 place_entity(cfs_rq, se, 0);
1105 enqueue_sleeper(cfs_rq, se);
1106 }
1107
1108 update_stats_enqueue(cfs_rq, se);
1109 check_spread(cfs_rq, se);
1110 if (se != cfs_rq->curr)
1111 __enqueue_entity(cfs_rq, se);
1112 se->on_rq = 1;
1113
1114 if (cfs_rq->nr_running == 1) {
1115 list_add_leaf_cfs_rq(cfs_rq);
1116 check_enqueue_throttle(cfs_rq);
1117 }
1118}
1119
1120static void __clear_buddies_last(struct sched_entity *se)
1121{
1122 for_each_sched_entity(se) {
1123 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1124 if (cfs_rq->last == se)
1125 cfs_rq->last = NULL;
1126 else
1127 break;
1128 }
1129}
1130
1131static void __clear_buddies_next(struct sched_entity *se)
1132{
1133 for_each_sched_entity(se) {
1134 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1135 if (cfs_rq->next == se)
1136 cfs_rq->next = NULL;
1137 else
1138 break;
1139 }
1140}
1141
1142static void __clear_buddies_skip(struct sched_entity *se)
1143{
1144 for_each_sched_entity(se) {
1145 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1146 if (cfs_rq->skip == se)
1147 cfs_rq->skip = NULL;
1148 else
1149 break;
1150 }
1151}
1152
1153static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1154{
1155 if (cfs_rq->last == se)
1156 __clear_buddies_last(se);
1157
1158 if (cfs_rq->next == se)
1159 __clear_buddies_next(se);
1160
1161 if (cfs_rq->skip == se)
1162 __clear_buddies_skip(se);
1163}
1164
1165static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1166
1167static void
1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1169{
1170
1171
1172
1173 update_curr(cfs_rq);
1174
1175 update_stats_dequeue(cfs_rq, se);
1176 if (flags & DEQUEUE_SLEEP) {
1177#ifdef CONFIG_SCHEDSTATS
1178 if (entity_is_task(se)) {
1179 struct task_struct *tsk = task_of(se);
1180
1181 if (tsk->state & TASK_INTERRUPTIBLE)
1182 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1183 if (tsk->state & TASK_UNINTERRUPTIBLE)
1184 se->statistics.block_start = rq_of(cfs_rq)->clock;
1185 }
1186#endif
1187 }
1188
1189 clear_buddies(cfs_rq, se);
1190
1191 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se);
1196
1197
1198
1199
1200
1201
1202 if (!(flags & DEQUEUE_SLEEP))
1203 se->vruntime -= cfs_rq->min_vruntime;
1204
1205
1206 return_cfs_rq_runtime(cfs_rq);
1207
1208 update_min_vruntime(cfs_rq);
1209 update_cfs_shares(cfs_rq);
1210}
1211
1212
1213
1214
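/*
 * Added annotation: preempt the current task if it has run past the slice
 * that sched_slice() grants it, or if it has run for at least the minimum
 * granularity and its vruntime has pulled ahead of the leftmost (most
 * entitled) entity's by more than one ideal slice.
 */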
1215static void
1216check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1217{
1218 unsigned long ideal_runtime, delta_exec;
1219 struct sched_entity *se;
1220 s64 delta;
1221
1222 ideal_runtime = sched_slice(cfs_rq, curr);
1223 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1224 if (delta_exec > ideal_runtime) {
1225 resched_task(rq_of(cfs_rq)->curr);
1226
1227
1228
1229
1230 clear_buddies(cfs_rq, curr);
1231 return;
1232 }
1233
1234
1235
1236
1237
1238
1239 if (delta_exec < sysctl_sched_min_granularity)
1240 return;
1241
1242 se = __pick_first_entity(cfs_rq);
1243 delta = curr->vruntime - se->vruntime;
1244
1245 if (delta < 0)
1246 return;
1247
1248 if (delta > ideal_runtime)
1249 resched_task(rq_of(cfs_rq)->curr);
1250}
1251
1252static void
1253set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1254{
1255
1256 if (se->on_rq) {
1257
1258
1259
1260
1261
1262 update_stats_wait_end(cfs_rq, se);
1263 __dequeue_entity(cfs_rq, se);
1264 }
1265
1266 update_stats_curr_start(cfs_rq, se);
1267 cfs_rq->curr = se;
1268#ifdef CONFIG_SCHEDSTATS
1269
1270
1271
1272
1273
1274 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1275 se->statistics.slice_max = max(se->statistics.slice_max,
1276 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1277 }
1278#endif
1279 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1280}
1281
1282static int
1283wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1284
1285
1286
1287
1288
1289
1290
1291
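/*
 * Added annotation: pick the entity to run next.  Start from the leftmost
 * (smallest vruntime) entity, then honour the buddy hints: avoid the "skip"
 * entity (set by yield), and prefer the "next" or "last" buddies when doing
 * so would not be unfair by more than one wakeup granularity
 * (wakeup_preempt_entity() < 1).
 */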
1292static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1293{
1294 struct sched_entity *se = __pick_first_entity(cfs_rq);
1295 struct sched_entity *left = se;
1296
1297
1298
1299
1300
1301 if (cfs_rq->skip == se) {
1302 struct sched_entity *second = __pick_next_entity(se);
1303 if (second && wakeup_preempt_entity(second, left) < 1)
1304 se = second;
1305 }
1306
1307
1308
1309
1310 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1311 se = cfs_rq->last;
1312
1313
1314
1315
1316 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1317 se = cfs_rq->next;
1318
1319 clear_buddies(cfs_rq, se);
1320
1321 return se;
1322}
1323
1324static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1325
1326static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1327{
1328
1329
1330
1331
1332 if (prev->on_rq)
1333 update_curr(cfs_rq);
1334
1335
1336 check_cfs_rq_runtime(cfs_rq);
1337
1338 check_spread(cfs_rq, prev);
1339 if (prev->on_rq) {
1340 update_stats_wait_start(cfs_rq, prev);
1341
1342 __enqueue_entity(cfs_rq, prev);
1343 }
1344 cfs_rq->curr = NULL;
1345}
1346
1347static void
1348entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1349{
1350
1351
1352
1353 update_curr(cfs_rq);
1354
1355
1356
1357
1358 update_entity_shares_tick(cfs_rq);
1359
1360#ifdef CONFIG_SCHED_HRTICK
1361
1362
1363
1364
1365 if (queued) {
1366 resched_task(rq_of(cfs_rq)->curr);
1367 return;
1368 }
1369
1370
1371
1372 if (!sched_feat(DOUBLE_TICK) &&
1373 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1374 return;
1375#endif
1376
1377 if (cfs_rq->nr_running > 1)
1378 check_preempt_tick(cfs_rq, curr);
1379}
1380
1381
1382
1383
1384
1385
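/**************************************************
 * CFS bandwidth control machinery
 */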
1386#ifdef CONFIG_CFS_BANDWIDTH
1387
1388#ifdef HAVE_JUMP_LABEL
1389static struct static_key __cfs_bandwidth_used;
1390
1391static inline bool cfs_bandwidth_used(void)
1392{
1393 return static_key_false(&__cfs_bandwidth_used);
1394}
1395
1396void account_cfs_bandwidth_used(int enabled, int was_enabled)
1397{
1398
1399 if (enabled && !was_enabled)
1400 static_key_slow_inc(&__cfs_bandwidth_used);
1401 else if (!enabled && was_enabled)
1402 static_key_slow_dec(&__cfs_bandwidth_used);
1403}
1404#else
1405static bool cfs_bandwidth_used(void)
1406{
1407 return true;
1408}
1409
1410void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1411#endif
1412
1413
1414
1415
1416
1417static inline u64 default_cfs_period(void)
1418{
1419 return 100000000ULL;
1420}
1421
1422static inline u64 sched_cfs_bandwidth_slice(void)
1423{
1424 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1425}
1426
1427
1428
1429
1430
1431
1432
1433
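/*
 * Added annotation: replenish the global runtime pool from the group's quota
 * and push the expiration time one period into the future; called when the
 * period timer fires and when the timer is restarted after having gone idle.
 */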
1434void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1435{
1436 u64 now;
1437
1438 if (cfs_b->quota == RUNTIME_INF)
1439 return;
1440
1441 now = sched_clock_cpu(smp_processor_id());
1442 cfs_b->runtime = cfs_b->quota;
1443 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1444}
1445
1446static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1447{
1448 return &tg->cfs_bandwidth;
1449}
1450
1451
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{
1454 struct task_group *tg = cfs_rq->tg;
1455 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1456 u64 amount = 0, min_amount, expires;
1457
1458
1459 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1460
1461 raw_spin_lock(&cfs_b->lock);
1462 if (cfs_b->quota == RUNTIME_INF)
1463 amount = min_amount;
1464 else {
1465
1466
1467
1468
1469
1470
1471 if (!cfs_b->timer_active) {
1472 __refill_cfs_bandwidth_runtime(cfs_b);
1473 __start_cfs_bandwidth(cfs_b);
1474 }
1475
1476 if (cfs_b->runtime > 0) {
1477 amount = min(cfs_b->runtime, min_amount);
1478 cfs_b->runtime -= amount;
1479 cfs_b->idle = 0;
1480 }
1481 }
1482 expires = cfs_b->runtime_expires;
1483 raw_spin_unlock(&cfs_b->lock);
1484
1485 cfs_rq->runtime_remaining += amount;
1486
1487
1488
1489
1490
1491 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1492 cfs_rq->runtime_expires = expires;
1493
1494 return cfs_rq->runtime_remaining > 0;
1495}
1496
1497
1498
1499
1500
1501static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1502{
1503 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1504 struct rq *rq = rq_of(cfs_rq);
1505
1506
1507 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1508 return;
1509
1510 if (cfs_rq->runtime_remaining < 0)
1511 return;
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1523
1524 cfs_rq->runtime_expires += TICK_NSEC;
1525 } else {
1526
1527 cfs_rq->runtime_remaining = 0;
1528 }
1529}
1530
1531static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1532 unsigned long delta_exec)
1533{
1534
1535 cfs_rq->runtime_remaining -= delta_exec;
1536 expire_cfs_rq_runtime(cfs_rq);
1537
1538 if (likely(cfs_rq->runtime_remaining > 0))
1539 return;
1540
1541
1542
1543
1544
1545 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1546 resched_task(rq_of(cfs_rq)->curr);
1547}
1548
1549static __always_inline
1550void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1551{
1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1553 return;
1554
1555 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1556}
1557
1558static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1559{
1560 return cfs_bandwidth_used() && cfs_rq->throttled;
1561}
1562
1563
1564static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1565{
1566 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1567}
1568
1569
1570
1571
1572
1573
1574static inline int throttled_lb_pair(struct task_group *tg,
1575 int src_cpu, int dest_cpu)
1576{
1577 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1578
1579 src_cfs_rq = tg->cfs_rq[src_cpu];
1580 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1581
1582 return throttled_hierarchy(src_cfs_rq) ||
1583 throttled_hierarchy(dest_cfs_rq);
1584}
1585
1586
1587static int tg_unthrottle_up(struct task_group *tg, void *data)
1588{
1589 struct rq *rq = data;
1590 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1591
1592 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1596
1597
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601
1602 update_cfs_shares(cfs_rq);
1603 }
1604#endif
1605
1606 return 0;
1607}
1608
1609static int tg_throttle_down(struct task_group *tg, void *data)
1610{
1611 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613
1614
1615 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0);
1617 cfs_rq->throttle_count++;
1618
1619 return 0;
1620}
1621
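/*
 * Added annotation: throttle_cfs_rq() takes a cfs_rq whose local runtime is
 * exhausted off the CPU: it marks the whole subtree throttled
 * (tg_throttle_down), dequeues the corresponding sched_entities walking up
 * the hierarchy, and queues the cfs_rq on the bandwidth pool's throttled
 * list until quota is refilled.
 */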
1622static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1623{
1624 struct rq *rq = rq_of(cfs_rq);
1625 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1626 struct sched_entity *se;
1627 long task_delta, dequeue = 1;
1628
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630
1631
1632 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock();
1635
1636 task_delta = cfs_rq->h_nr_running;
1637 for_each_sched_entity(se) {
1638 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1639
1640 if (!se->on_rq)
1641 break;
1642
1643 if (dequeue)
1644 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1645 qcfs_rq->h_nr_running -= task_delta;
1646
1647 if (qcfs_rq->load.weight)
1648 dequeue = 0;
1649 }
1650
1651 if (!se)
1652 rq->nr_running -= task_delta;
1653
1654 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock;
1656 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock);
1659}
1660
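/*
 * Added annotation: the reverse of throttle_cfs_rq(): account the time spent
 * throttled, clear the throttle marks in the subtree, re-enqueue the entities
 * up the hierarchy and, if the CPU was left idle, kick the scheduler.
 */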
1661void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1662{
1663 struct rq *rq = rq_of(cfs_rq);
1664 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1665 struct sched_entity *se;
1666 int enqueue = 1;
1667 long task_delta;
1668
1669 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1670
1671 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1674 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677
1678 update_rq_clock(rq);
1679
1680 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1681
1682 if (!cfs_rq->load.weight)
1683 return;
1684
1685 task_delta = cfs_rq->h_nr_running;
1686 for_each_sched_entity(se) {
1687 if (se->on_rq)
1688 enqueue = 0;
1689
1690 cfs_rq = cfs_rq_of(se);
1691 if (enqueue)
1692 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1693 cfs_rq->h_nr_running += task_delta;
1694
1695 if (cfs_rq_throttled(cfs_rq))
1696 break;
1697 }
1698
1699 if (!se)
1700 rq->nr_running += task_delta;
1701
1702
1703 if (rq->curr == rq->idle && rq->cfs.nr_running)
1704 resched_task(rq->curr);
1705}
1706
1707static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1708 u64 remaining, u64 expires)
1709{
1710 struct cfs_rq *cfs_rq;
1711 u64 runtime = remaining;
1712
1713 rcu_read_lock();
1714 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1715 throttled_list) {
1716 struct rq *rq = rq_of(cfs_rq);
1717
1718 raw_spin_lock(&rq->lock);
1719 if (!cfs_rq_throttled(cfs_rq))
1720 goto next;
1721
1722 runtime = -cfs_rq->runtime_remaining + 1;
1723 if (runtime > remaining)
1724 runtime = remaining;
1725 remaining -= runtime;
1726
1727 cfs_rq->runtime_remaining += runtime;
1728 cfs_rq->runtime_expires = expires;
1729
1730
1731 if (cfs_rq->runtime_remaining > 0)
1732 unthrottle_cfs_rq(cfs_rq);
1733
1734next:
1735 raw_spin_unlock(&rq->lock);
1736
1737 if (!remaining)
1738 break;
1739 }
1740 rcu_read_unlock();
1741
1742 return remaining;
1743}
1744
1745
1746
1747
1748
1749
1750
1751static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1752{
1753 u64 runtime, runtime_expires;
1754 int idle = 1, throttled;
1755
1756 raw_spin_lock(&cfs_b->lock);
1757
1758 if (cfs_b->quota == RUNTIME_INF)
1759 goto out_unlock;
1760
1761 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1762
1763 idle = cfs_b->idle && !throttled;
1764 cfs_b->nr_periods += overrun;
1765
1766
1767 if (idle)
1768 goto out_unlock;
1769
1770 __refill_cfs_bandwidth_runtime(cfs_b);
1771
1772 if (!throttled) {
1773
1774 cfs_b->idle = 1;
1775 goto out_unlock;
1776 }
1777
1778
1779 cfs_b->nr_throttled += overrun;
1780
1781
1782
1783
1784
1785
1786
1787 runtime = cfs_b->runtime;
1788 runtime_expires = cfs_b->runtime_expires;
1789 cfs_b->runtime = 0;
1790
1791
1792
1793
1794
1795
1796 while (throttled && runtime > 0) {
1797 raw_spin_unlock(&cfs_b->lock);
1798
1799 runtime = distribute_cfs_runtime(cfs_b, runtime,
1800 runtime_expires);
1801 raw_spin_lock(&cfs_b->lock);
1802
1803 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1804 }
1805
1806
1807 cfs_b->runtime = runtime;
1808
1809
1810
1811
1812
1813
1814 cfs_b->idle = 0;
1815out_unlock:
1816 if (idle)
1817 cfs_b->timer_active = 0;
1818 raw_spin_unlock(&cfs_b->lock);
1819
1820 return idle;
1821}
1822
1823
1824static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1825
1826static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1827
1828static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1829
1830
1831static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1832{
1833 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1834 u64 remaining;
1835
1836
1837 if (hrtimer_callback_running(refresh_timer))
1838 return 1;
1839
1840
1841 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1842 if (remaining < min_expire)
1843 return 1;
1844
1845 return 0;
1846}
1847
1848static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1849{
1850 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1851
1852
1853 if (runtime_refresh_within(cfs_b, min_left))
1854 return;
1855
1856 start_bandwidth_timer(&cfs_b->slack_timer,
1857 ns_to_ktime(cfs_bandwidth_slack_period));
1858}
1859
1860
1861static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1862{
1863 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1864 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1865
1866 if (slack_runtime <= 0)
1867 return;
1868
1869 raw_spin_lock(&cfs_b->lock);
1870 if (cfs_b->quota != RUNTIME_INF &&
1871 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1872 cfs_b->runtime += slack_runtime;
1873
1874
1875 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1876 !list_empty(&cfs_b->throttled_cfs_rq))
1877 start_cfs_slack_bandwidth(cfs_b);
1878 }
1879 raw_spin_unlock(&cfs_b->lock);
1880
1881
1882 cfs_rq->runtime_remaining -= slack_runtime;
1883}
1884
1885static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1886{
1887 if (!cfs_bandwidth_used())
1888 return;
1889
1890 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1891 return;
1892
1893 __return_cfs_rq_runtime(cfs_rq);
1894}
1895
1896
1897
1898
1899
1900static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1901{
1902 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1903 u64 expires;
1904
1905
1906 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1907 return;
1908
1909 raw_spin_lock(&cfs_b->lock);
1910 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1911 runtime = cfs_b->runtime;
1912 cfs_b->runtime = 0;
1913 }
1914 expires = cfs_b->runtime_expires;
1915 raw_spin_unlock(&cfs_b->lock);
1916
1917 if (!runtime)
1918 return;
1919
1920 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1921
1922 raw_spin_lock(&cfs_b->lock);
1923 if (expires == cfs_b->runtime_expires)
1924 cfs_b->runtime = runtime;
1925 raw_spin_unlock(&cfs_b->lock);
1926}
1927
1928
1929
1930
1931
1932
1933static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1934{
1935 if (!cfs_bandwidth_used())
1936 return;
1937
1938
1939 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1940 return;
1941
1942
1943 if (cfs_rq_throttled(cfs_rq))
1944 return;
1945
1946
1947 account_cfs_rq_runtime(cfs_rq, 0);
1948 if (cfs_rq->runtime_remaining <= 0)
1949 throttle_cfs_rq(cfs_rq);
1950}
1951
1952
1953static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1954{
1955 if (!cfs_bandwidth_used())
1956 return;
1957
1958 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1959 return;
1960
1961
1962
1963
1964
1965 if (cfs_rq_throttled(cfs_rq))
1966 return;
1967
1968 throttle_cfs_rq(cfs_rq);
1969}
1970
1971static inline u64 default_cfs_period(void);
1972static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1973static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1974
1975static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1976{
1977 struct cfs_bandwidth *cfs_b =
1978 container_of(timer, struct cfs_bandwidth, slack_timer);
1979 do_sched_cfs_slack_timer(cfs_b);
1980
1981 return HRTIMER_NORESTART;
1982}
1983
1984static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1985{
1986 struct cfs_bandwidth *cfs_b =
1987 container_of(timer, struct cfs_bandwidth, period_timer);
1988 ktime_t now;
1989 int overrun;
1990 int idle = 0;
1991
1992 for (;;) {
1993 now = hrtimer_cb_get_time(timer);
1994 overrun = hrtimer_forward(timer, now, cfs_b->period);
1995
1996 if (!overrun)
1997 break;
1998
1999 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2000 }
2001
2002 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2003}
2004
2005void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2006{
2007 raw_spin_lock_init(&cfs_b->lock);
2008 cfs_b->runtime = 0;
2009 cfs_b->quota = RUNTIME_INF;
2010 cfs_b->period = ns_to_ktime(default_cfs_period());
2011
2012 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2013 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2014 cfs_b->period_timer.function = sched_cfs_period_timer;
2015 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2016 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2017}
2018
2019static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2020{
2021 cfs_rq->runtime_enabled = 0;
2022 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2023}
2024
2025
2026void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2027{
2028
2029
2030
2031
2032
2033
2034 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2035 raw_spin_unlock(&cfs_b->lock);
2036
2037 hrtimer_cancel(&cfs_b->period_timer);
2038
2039 raw_spin_lock(&cfs_b->lock);
2040
2041 if (cfs_b->timer_active)
2042 return;
2043 }
2044
2045 cfs_b->timer_active = 1;
2046 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2047}
2048
2049static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2050{
2051 hrtimer_cancel(&cfs_b->period_timer);
2052 hrtimer_cancel(&cfs_b->slack_timer);
2053}
2054
2055static void unthrottle_offline_cfs_rqs(struct rq *rq)
2056{
2057 struct cfs_rq *cfs_rq;
2058
2059 for_each_leaf_cfs_rq(rq, cfs_rq) {
2060 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2061
2062 if (!cfs_rq->runtime_enabled)
2063 continue;
2064
2065
2066
2067
2068
2069 cfs_rq->runtime_remaining = cfs_b->quota;
2070 if (cfs_rq_throttled(cfs_rq))
2071 unthrottle_cfs_rq(cfs_rq);
2072 }
2073}
2074
2075#else
2076static __always_inline
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2081
2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2083{
2084 return 0;
2085}
2086
2087static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2088{
2089 return 0;
2090}
2091
2092static inline int throttled_lb_pair(struct task_group *tg,
2093 int src_cpu, int dest_cpu)
2094{
2095 return 0;
2096}
2097
2098void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2099
2100#ifdef CONFIG_FAIR_GROUP_SCHED
2101static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2102#endif
2103
2104static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2105{
2106 return NULL;
2107}
2108static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2109static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2110
2111#endif
2112
2113
2114
2115
2116
2117#ifdef CONFIG_SCHED_HRTICK
2118static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2119{
2120 struct sched_entity *se = &p->se;
2121 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2122
2123 WARN_ON(task_rq(p) != rq);
2124
2125 if (cfs_rq->nr_running > 1) {
2126 u64 slice = sched_slice(cfs_rq, se);
2127 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2128 s64 delta = slice - ran;
2129
2130 if (delta < 0) {
2131 if (rq->curr == p)
2132 resched_task(p);
2133 return;
2134 }
2135
2136
2137
2138
2139
2140 if (rq->curr != p)
2141 delta = max_t(s64, 10000LL, delta);
2142
2143 hrtick_start(rq, delta);
2144 }
2145}
2146
2147
2148
2149
2150
2151
2152static void hrtick_update(struct rq *rq)
2153{
2154 struct task_struct *curr = rq->curr;
2155
2156 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2157 return;
2158
2159 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2160 hrtick_start_fair(rq, curr);
2161}
2162#else
2163static inline void
2164hrtick_start_fair(struct rq *rq, struct task_struct *p)
2165{
2166}
2167
2168static inline void hrtick_update(struct rq *rq)
2169{
2170}
2171#endif
2172
2173
2174
2175
2176
2177
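/*
 * Added annotation: the enqueue_task method is called before nr_running is
 * increased.  Here we update the fair scheduling stats and then put the task
 * into the rbtree, walking up the group hierarchy; the walk stops early once
 * a parent is already queued or a throttled cfs_rq is reached.
 */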
2178static void
2179enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2180{
2181 struct cfs_rq *cfs_rq;
2182 struct sched_entity *se = &p->se;
2183
2184 for_each_sched_entity(se) {
2185 if (se->on_rq)
2186 break;
2187 cfs_rq = cfs_rq_of(se);
2188 enqueue_entity(cfs_rq, se, flags);
2189
2190
2191
2192
2193
2194
2195
2196 if (cfs_rq_throttled(cfs_rq))
2197 break;
2198 cfs_rq->h_nr_running++;
2199
2200 flags = ENQUEUE_WAKEUP;
2201 }
2202
2203 for_each_sched_entity(se) {
2204 cfs_rq = cfs_rq_of(se);
2205 cfs_rq->h_nr_running++;
2206
2207 if (cfs_rq_throttled(cfs_rq))
2208 break;
2209
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq);
2212 }
2213
2214 if (!se)
2215 inc_nr_running(rq);
2216 hrtick_update(rq);
2217}
2218
2219static void set_next_buddy(struct sched_entity *se);
2220
2221
2222
2223
2224
2225
2226static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2227{
2228 struct cfs_rq *cfs_rq;
2229 struct sched_entity *se = &p->se;
2230 int task_sleep = flags & DEQUEUE_SLEEP;
2231
2232 for_each_sched_entity(se) {
2233 cfs_rq = cfs_rq_of(se);
2234 dequeue_entity(cfs_rq, se, flags);
2235
2236
2237
2238
2239
2240
2241
2242 if (cfs_rq_throttled(cfs_rq))
2243 break;
2244 cfs_rq->h_nr_running--;
2245
2246
2247 if (cfs_rq->load.weight) {
2248
2249
2250
2251
2252 if (task_sleep && parent_entity(se))
2253 set_next_buddy(parent_entity(se));
2254
2255
2256 se = parent_entity(se);
2257 break;
2258 }
2259 flags |= DEQUEUE_SLEEP;
2260 }
2261
2262 for_each_sched_entity(se) {
2263 cfs_rq = cfs_rq_of(se);
2264 cfs_rq->h_nr_running--;
2265
2266 if (cfs_rq_throttled(cfs_rq))
2267 break;
2268
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq);
2271 }
2272
2273 if (!se)
2274 dec_nr_running(rq);
2275 hrtick_update(rq);
2276}
2277
2278#ifdef CONFIG_SMP
2279
2280static unsigned long weighted_cpuload(const int cpu)
2281{
2282 return cpu_rq(cpu)->load.weight;
2283}
2284
2285
2286
2287
2288
2289
2290
2291
2292static unsigned long source_load(int cpu, int type)
2293{
2294 struct rq *rq = cpu_rq(cpu);
2295 unsigned long total = weighted_cpuload(cpu);
2296
2297 if (type == 0 || !sched_feat(LB_BIAS))
2298 return total;
2299
2300 return min(rq->cpu_load[type-1], total);
2301}
2302
2303
2304
2305
2306
2307static unsigned long target_load(int cpu, int type)
2308{
2309 struct rq *rq = cpu_rq(cpu);
2310 unsigned long total = weighted_cpuload(cpu);
2311
2312 if (type == 0 || !sched_feat(LB_BIAS))
2313 return total;
2314
2315 return max(rq->cpu_load[type-1], total);
2316}
2317
2318static unsigned long power_of(int cpu)
2319{
2320 return cpu_rq(cpu)->cpu_power;
2321}
2322
2323static unsigned long cpu_avg_load_per_task(int cpu)
2324{
2325 struct rq *rq = cpu_rq(cpu);
2326 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2327
2328 if (nr_running)
2329 return rq->load.weight / nr_running;
2330
2331 return 0;
2332}
2333
2334
2335static void task_waking_fair(struct task_struct *p)
2336{
2337 struct sched_entity *se = &p->se;
2338 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2339 u64 min_vruntime;
2340
2341#ifndef CONFIG_64BIT
2342 u64 min_vruntime_copy;
2343
2344 do {
2345 min_vruntime_copy = cfs_rq->min_vruntime_copy;
2346 smp_rmb();
2347 min_vruntime = cfs_rq->min_vruntime;
2348 } while (min_vruntime != min_vruntime_copy);
2349#else
2350 min_vruntime = cfs_rq->min_vruntime;
2351#endif
2352
2353 se->vruntime -= min_vruntime;
2354}
2355
2356#ifdef CONFIG_FAIR_GROUP_SCHED
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2408{
2409 struct sched_entity *se = tg->se[cpu];
2410
2411 if (!tg->parent)
2412 return wl;
2413
2414 for_each_sched_entity(se) {
2415 long w, W;
2416
2417 tg = se->my_q->tg;
2418
2419
2420
2421
2422 W = wg + calc_tg_weight(tg, se->my_q);
2423
2424
2425
2426
2427 w = se->my_q->load.weight + wl;
2428
2429
2430
2431
2432 if (W > 0 && w < W)
2433 wl = (w * tg->shares) / W;
2434 else
2435 wl = tg->shares;
2436
2437
2438
2439
2440
2441
2442 if (wl < MIN_SHARES)
2443 wl = MIN_SHARES;
2444
2445
2446
2447
2448 wl -= se->load.weight;
2449
2450
2451
2452
2453
2454
2455
2456
2457 wg = 0;
2458 }
2459
2460 return wl;
2461}
2462#else
2463
2464static inline unsigned long effective_load(struct task_group *tg, int cpu,
2465 unsigned long wl, unsigned long wg)
2466{
2467 return wl;
2468}
2469
2470#endif
2471
2472static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
2473{
2474 s64 this_load, load;
2475 int idx, this_cpu, prev_cpu;
2476 unsigned long tl_per_task;
2477 struct task_group *tg;
2478 unsigned long weight;
2479 int balanced;
2480
2481 idx = sd->wake_idx;
2482 this_cpu = smp_processor_id();
2483 prev_cpu = task_cpu(p);
2484 load = source_load(prev_cpu, idx);
2485 this_load = target_load(this_cpu, idx);
2486
2487
2488
2489
2490
2491
2492 if (sync) {
2493 tg = task_group(current);
2494 weight = current->se.load.weight;
2495
2496 this_load += effective_load(tg, this_cpu, -weight, -weight);
2497 load += effective_load(tg, prev_cpu, 0, -weight);
2498 }
2499
2500 tg = task_group(p);
2501 weight = p->se.load.weight;
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512 if (this_load > 0) {
2513 s64 this_eff_load, prev_eff_load;
2514
2515 this_eff_load = 100;
2516 this_eff_load *= power_of(prev_cpu);
2517 this_eff_load *= this_load +
2518 effective_load(tg, this_cpu, weight, weight);
2519
2520 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2521 prev_eff_load *= power_of(this_cpu);
2522 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
2523
2524 balanced = this_eff_load <= prev_eff_load;
2525 } else
2526 balanced = true;
2527
2528
2529
2530
2531
2532
2533 if (sync && balanced)
2534 return 1;
2535
2536 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
2537 tl_per_task = cpu_avg_load_per_task(this_cpu);
2538
2539 if (balanced ||
2540 (this_load <= load &&
2541 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
2542
2543
2544
2545
2546
2547 schedstat_inc(sd, ttwu_move_affine);
2548 schedstat_inc(p, se.statistics.nr_wakeups_affine);
2549
2550 return 1;
2551 }
2552 return 0;
2553}
2554
2555
2556
2557
2558
2559static struct sched_group *
2560find_idlest_group(struct sched_domain *sd, struct task_struct *p,
2561 int this_cpu, int load_idx)
2562{
2563 struct sched_group *idlest = NULL, *group = sd->groups;
2564 unsigned long min_load = ULONG_MAX, this_load = 0;
2565 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2566
2567 do {
2568 unsigned long load, avg_load;
2569 int local_group;
2570 int i;
2571
2572
2573 if (!cpumask_intersects(sched_group_cpus(group),
2574 tsk_cpus_allowed(p)))
2575 continue;
2576
2577 local_group = cpumask_test_cpu(this_cpu,
2578 sched_group_cpus(group));
2579
2580
2581 avg_load = 0;
2582
2583 for_each_cpu(i, sched_group_cpus(group)) {
2584
2585 if (local_group)
2586 load = source_load(i, load_idx);
2587 else
2588 load = target_load(i, load_idx);
2589
2590 avg_load += load;
2591 }
2592
2593
2594 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
2595
2596 if (local_group) {
2597 this_load = avg_load;
2598 } else if (avg_load < min_load) {
2599 min_load = avg_load;
2600 idlest = group;
2601 }
2602 } while (group = group->next, group != sd->groups);
2603
2604 if (!idlest || 100*this_load < imbalance*min_load)
2605 return NULL;
2606 return idlest;
2607}
2608
2609
2610
2611
2612static int
2613find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2614{
2615 unsigned long load, min_load = ULONG_MAX;
2616 int idlest = -1;
2617 int i;
2618
2619
2620 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
2621 load = weighted_cpuload(i);
2622
2623 if (load < min_load || (load == min_load && i == this_cpu)) {
2624 min_load = load;
2625 idlest = i;
2626 }
2627 }
2628
2629 return idlest;
2630}
2631
2632
2633
2634
2635static int select_idle_sibling(struct task_struct *p, int target)
2636{
2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642
2643
2644
2645
2646
2647 if (target == cpu && idle_cpu(cpu))
2648 return cpu;
2649
2650
2651
2652
2653
2654 if (target == prev_cpu && idle_cpu(prev_cpu))
2655 return prev_cpu;
2656
2657
2658
2659
2660 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672
2673 target = cpumask_first_and(sched_group_cpus(sg),
2674 tsk_cpus_allowed(p));
2675 goto done;
2676next:
2677 sg = sg->next;
2678 } while (sg != sd->groups);
2679 }
2680done:
2681 return target;
2682}
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
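/*
 * Added annotation: select_task_rq_fair() chooses a runqueue for a waking
 * (or newly forked/exec'ing) task.  For wakeups it first considers keeping
 * the task near the waker (wake_affine) and then looks for an idle sibling
 * sharing a cache; otherwise it walks down the sched_domains picking the
 * idlest group and cpu at each level.
 */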
2695static int
2696select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2697{
2698 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
2699 int cpu = smp_processor_id();
2700 int prev_cpu = task_cpu(p);
2701 int new_cpu = cpu;
2702 int want_affine = 0;
2703 int sync = wake_flags & WF_SYNC;
2704
2705 if (p->nr_cpus_allowed == 1)
2706 return prev_cpu;
2707
2708 if (sd_flag & SD_BALANCE_WAKE) {
2709 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2710 want_affine = 1;
2711 new_cpu = prev_cpu;
2712 }
2713
2714 rcu_read_lock();
2715 for_each_domain(cpu, tmp) {
2716 if (!(tmp->flags & SD_LOAD_BALANCE))
2717 continue;
2718
2719
2720
2721
2722
2723 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2724 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2725 affine_sd = tmp;
2726 break;
2727 }
2728
2729 if (tmp->flags & sd_flag)
2730 sd = tmp;
2731 }
2732
2733 if (affine_sd) {
2734 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
2735 prev_cpu = cpu;
2736
2737 new_cpu = select_idle_sibling(p, prev_cpu);
2738 goto unlock;
2739 }
2740
2741 while (sd) {
2742 int load_idx = sd->forkexec_idx;
2743 struct sched_group *group;
2744 int weight;
2745
2746 if (!(sd->flags & sd_flag)) {
2747 sd = sd->child;
2748 continue;
2749 }
2750
2751 if (sd_flag & SD_BALANCE_WAKE)
2752 load_idx = sd->wake_idx;
2753
2754 group = find_idlest_group(sd, p, cpu, load_idx);
2755 if (!group) {
2756 sd = sd->child;
2757 continue;
2758 }
2759
2760 new_cpu = find_idlest_cpu(group, p, cpu);
2761 if (new_cpu == -1 || new_cpu == cpu) {
2762
2763 sd = sd->child;
2764 continue;
2765 }
2766
2767
2768 cpu = new_cpu;
2769 weight = sd->span_weight;
2770 sd = NULL;
2771 for_each_domain(cpu, tmp) {
2772 if (weight <= tmp->span_weight)
2773 break;
2774 if (tmp->flags & sd_flag)
2775 sd = tmp;
2776 }
2777
2778 }
2779unlock:
2780 rcu_read_unlock();
2781
2782 return new_cpu;
2783}
2784#endif
2785
2786static unsigned long
2787wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
2788{
2789 unsigned long gran = sysctl_sched_wakeup_granularity;
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
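	/*
	 * Convert the granularity from real time to virtual time in se's
	 * units. Using se's weight (rather than curr's) penalizes lighter
	 * tasks: a lighter se gets a larger virtual granularity and hence
	 * preempts less easily.
	 */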
2804 return calc_delta_fair(gran, se);
2805}
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
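/*
 * Should 'se' preempt 'curr'?
 *
 * Returns -1 when se does not lag behind curr in vruntime,
 *          0 when it lags by no more than the wakeup granularity,
 *          1 when it lags by more than the wakeup granularity
 *            (scaled to se's virtual time).
 */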
2821static int
2822wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
2823{
2824 s64 gran, vdiff = curr->vruntime - se->vruntime;
2825
2826 if (vdiff <= 0)
2827 return -1;
2828
2829 gran = wakeup_gran(curr, se);
2830 if (vdiff > gran)
2831 return 1;
2832
2833 return 0;
2834}
2835
2836static void set_last_buddy(struct sched_entity *se)
2837{
2838 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2839 return;
2840
2841 for_each_sched_entity(se)
2842 cfs_rq_of(se)->last = se;
2843}
2844
2845static void set_next_buddy(struct sched_entity *se)
2846{
2847 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2848 return;
2849
2850 for_each_sched_entity(se)
2851 cfs_rq_of(se)->next = se;
2852}
2853
2854static void set_skip_buddy(struct sched_entity *se)
2855{
2856 for_each_sched_entity(se)
2857 cfs_rq_of(se)->skip = se;
2858}
2859
2860
2861
2862
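/*
 * Preempt the current task with a newly woken task if needed:
 */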
2863static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2864{
2865 struct task_struct *curr = rq->curr;
2866 struct sched_entity *se = &curr->se, *pse = &p->se;
2867 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2868 int scale = cfs_rq->nr_running >= sched_nr_latency;
2869 int next_buddy_marked = 0;
2870
2871 if (unlikely(se == pse))
2872 return;
2873
2874
2875
2876
2877
2878
2879
2880 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2881 return;
2882
2883 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
2884 set_next_buddy(pse);
2885 next_buddy_marked = 1;
2886 }
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898 if (test_tsk_need_resched(curr))
2899 return;
2900
2901
2902 if (unlikely(curr->policy == SCHED_IDLE) &&
2903 likely(p->policy != SCHED_IDLE))
2904 goto preempt;
2905
2906
2907
2908
2909
2910 if (unlikely(p->policy != SCHED_NORMAL))
2911 return;
2912
2913 find_matching_se(&se, &pse);
2914 update_curr(cfs_rq_of(se));
2915 BUG_ON(!pse);
2916 if (wakeup_preempt_entity(se, pse) == 1) {
2917
2918
2919
2920
2921 if (!next_buddy_marked)
2922 set_next_buddy(pse);
2923 goto preempt;
2924 }
2925
2926 return;
2927
2928preempt:
2929 resched_task(curr);
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939 if (unlikely(!se->on_rq || curr == rq->idle))
2940 return;
2941
2942 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
2943 set_last_buddy(se);
2944}
2945
2946static struct task_struct *pick_next_task_fair(struct rq *rq)
2947{
2948 struct task_struct *p;
2949 struct cfs_rq *cfs_rq = &rq->cfs;
2950 struct sched_entity *se;
2951
2952 if (!cfs_rq->nr_running)
2953 return NULL;
2954
2955 do {
2956 se = pick_next_entity(cfs_rq);
2957 set_next_entity(cfs_rq, se);
2958 cfs_rq = group_cfs_rq(se);
2959 } while (cfs_rq);
2960
2961 p = task_of(se);
2962 if (hrtick_enabled(rq))
2963 hrtick_start_fair(rq, p);
2964
2965 return p;
2966}
2967
2968
2969
2970
2971static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
2972{
2973 struct sched_entity *se = &prev->se;
2974 struct cfs_rq *cfs_rq;
2975
2976 for_each_sched_entity(se) {
2977 cfs_rq = cfs_rq_of(se);
2978 put_prev_entity(cfs_rq, se);
2979 }
2980}
2981
2982
2983
2984
2985
2986
2987static void yield_task_fair(struct rq *rq)
2988{
2989 struct task_struct *curr = rq->curr;
2990 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2991 struct sched_entity *se = &curr->se;
2992
2993
2994
2995
2996 if (unlikely(rq->nr_running == 1))
2997 return;
2998
2999 clear_buddies(cfs_rq, se);
3000
3001 if (curr->policy != SCHED_BATCH) {
3002 update_rq_clock(rq);
3003
3004
3005
3006 update_curr(cfs_rq);
3007
3008
3009
3010
3011
3012 rq->skip_clock_update = 1;
3013 }
3014
3015 set_skip_buddy(se);
3016}
3017
3018static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3019{
3020 struct sched_entity *se = &p->se;
3021
3022
3023 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3024 return false;
3025
3026
3027 set_next_buddy(se);
3028
3029 yield_task_fair(rq);
3030
3031 return true;
3032}
3033
3034#ifdef CONFIG_SMP
3035
3036
3037
3038
3039static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3040
3041#define LBF_ALL_PINNED 0x01
3042#define LBF_NEED_BREAK 0x02
3043#define LBF_SOME_PINNED 0x04
3044
3045struct lb_env {
3046 struct sched_domain *sd;
3047
3048 struct rq *src_rq;
3049 int src_cpu;
3050
3051 int dst_cpu;
3052 struct rq *dst_rq;
3053
3054 struct cpumask *dst_grpmask;
3055 int new_dst_cpu;
3056 enum cpu_idle_type idle;
3057 long imbalance;
3058
3059 struct cpumask *cpus;
3060
3061 unsigned int flags;
3062
3063 unsigned int loop;
3064 unsigned int loop_break;
3065 unsigned int loop_max;
3066};
3067
3068
3069
3070
3071
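/*
 * move_task - move a task from one runqueue to another runqueue.
 * Both runqueues must be locked.
 */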
3072static void move_task(struct task_struct *p, struct lb_env *env)
3073{
3074 deactivate_task(env->src_rq, p, 0);
3075 set_task_cpu(p, env->dst_cpu);
3076 activate_task(env->dst_rq, p, 0);
3077 check_preempt_curr(env->dst_rq, p, 0);
3078}
3079
3080
3081
3082
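/*
 * Is this task likely cache-hot?
 */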
3083static int
3084task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3085{
3086 s64 delta;
3087
3088 if (p->sched_class != &fair_sched_class)
3089 return 0;
3090
3091 if (unlikely(p->policy == SCHED_IDLE))
3092 return 0;
3093
3094
3095
3096
3097 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3098 (&p->se == cfs_rq_of(&p->se)->next ||
3099 &p->se == cfs_rq_of(&p->se)->last))
3100 return 1;
3101
3102 if (sysctl_sched_migration_cost == -1)
3103 return 1;
3104 if (sysctl_sched_migration_cost == 0)
3105 return 0;
3106
3107 delta = now - p->se.exec_start;
3108
3109 return delta < (s64)sysctl_sched_migration_cost;
3110}
3111
3112
3113
3114
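/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 *
 * We do not migrate tasks that are:
 * 1) not allowed on the destination cpu (cpus_allowed), or
 * 2) currently running on the source runqueue, or
 * 3) cache-hot on their current CPU, unless balance attempts keep failing.
 */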
3115static int can_migrate_task(struct task_struct *p, struct lb_env *env)
3117{
3118 int tsk_cache_hot = 0;
3119
3120
3121
3122
3123
3124
3125 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3126 int new_dst_cpu;
3127
3128 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3139 return 0;
3140
3141 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3142 tsk_cpus_allowed(p));
3143 if (new_dst_cpu < nr_cpu_ids) {
3144 env->flags |= LBF_SOME_PINNED;
3145 env->new_dst_cpu = new_dst_cpu;
3146 }
3147 return 0;
3148 }
3149
3150
3151 env->flags &= ~LBF_ALL_PINNED;
3152
3153 if (task_running(env->src_rq, p)) {
3154 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3155 return 0;
3156 }
3157
3158
3159
3160
3161
3162
3163
3164 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3165 if (!tsk_cache_hot ||
3166 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3167#ifdef CONFIG_SCHEDSTATS
3168 if (tsk_cache_hot) {
3169 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3170 schedstat_inc(p, se.statistics.nr_forced_migrations);
3171 }
3172#endif
3173 return 1;
3174 }
3175
3176 if (tsk_cache_hot) {
3177 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3178 return 0;
3179 }
3180 return 1;
3181}
3182
3183
3184
3185
3186
3187
3188
3189
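/*
 * move_one_task tries to move exactly one task from busiest to this_rq,
 * as part of active balancing operations within "busiest_rq".
 *
 * Returns 1 if successful and 0 otherwise.
 */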
3190static int move_one_task(struct lb_env *env)
3191{
3192 struct task_struct *p, *n;
3193
3194 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3195 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3196 continue;
3197
3198 if (!can_migrate_task(p, env))
3199 continue;
3200
3201 move_task(p, env);
3202
3203
3204
3205
3206
3207 schedstat_inc(env->sd, lb_gained[env->idle]);
3208 return 1;
3209 }
3210 return 0;
3211}
3212
3213static unsigned long task_h_load(struct task_struct *p);
3214
3215static const unsigned int sched_nr_migrate_break = 32;
3216
3217
3218
3219
3220
3221
3222
3223
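/*
 * move_tasks tries to move up to env->imbalance weighted load from the
 * source runqueue to the destination runqueue, as part of a balancing
 * operation within domain "sd".
 *
 * Returns the number of tasks moved.
 */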
3224static int move_tasks(struct lb_env *env)
3225{
3226 struct list_head *tasks = &env->src_rq->cfs_tasks;
3227 struct task_struct *p;
3228 unsigned long load;
3229 int pulled = 0;
3230
3231 if (env->imbalance <= 0)
3232 return 0;
3233
3234 while (!list_empty(tasks)) {
3235 p = list_first_entry(tasks, struct task_struct, se.group_node);
3236
3237 env->loop++;
3238
3239 if (env->loop > env->loop_max)
3240 break;
3241
3242
3243 if (env->loop > env->loop_break) {
3244 env->loop_break += sched_nr_migrate_break;
3245 env->flags |= LBF_NEED_BREAK;
3246 break;
3247 }
3248
3249 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3250 goto next;
3251
3252 load = task_h_load(p);
3253
3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3255 goto next;
3256
3257 if ((load / 2) > env->imbalance)
3258 goto next;
3259
3260 if (!can_migrate_task(p, env))
3261 goto next;
3262
3263 move_task(p, env);
3264 pulled++;
3265 env->imbalance -= load;
3266
3267#ifdef CONFIG_PREEMPT
3268
3269
3270
3271
3272
3273 if (env->idle == CPU_NEWLY_IDLE)
3274 break;
3275#endif
3276
3277
3278
3279
3280
3281 if (env->imbalance <= 0)
3282 break;
3283
3284 continue;
3285next:
3286 list_move_tail(&p->se.group_node, tasks);
3287 }
3288
3289
3290
3291
3292
3293
3294 schedstat_add(env->sd, lb_gained[env->idle], pulled);
3295
3296 return pulled;
3297}
3298
3299#ifdef CONFIG_FAIR_GROUP_SCHED
3300
3301
3302
3303static int update_shares_cpu(struct task_group *tg, int cpu)
3304{
3305 struct cfs_rq *cfs_rq;
3306 unsigned long flags;
3307 struct rq *rq;
3308
3309 if (!tg->se[cpu])
3310 return 0;
3311
3312 rq = cpu_rq(cpu);
3313 cfs_rq = tg->cfs_rq[cpu];
3314
3315 raw_spin_lock_irqsave(&rq->lock, flags);
3316
3317 update_rq_clock(rq);
3318 update_cfs_load(cfs_rq, 1);
3319
3320
3321
3322
3323
3324 update_cfs_shares(cfs_rq);
3325
3326 raw_spin_unlock_irqrestore(&rq->lock, flags);
3327
3328 return 0;
3329}
3330
3331static void update_shares(int cpu)
3332{
3333 struct cfs_rq *cfs_rq;
3334 struct rq *rq = cpu_rq(cpu);
3335
3336 rcu_read_lock();
3337
3338
3339
3340
3341 for_each_leaf_cfs_rq(rq, cfs_rq) {
3342
3343 if (throttled_hierarchy(cfs_rq))
3344 continue;
3345
3346 update_shares_cpu(cfs_rq->tg, cpu);
3347 }
3348 rcu_read_unlock();
3349}
3350
3351
3352
3353
3354
3355
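/*
 * Compute each task group's hierarchical load factor (h_load) on this
 * cpu: the parent's h_load scaled by this group entity's share of the
 * parent cfs_rq's load.
 */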
3356static int tg_load_down(struct task_group *tg, void *data)
3357{
3358 unsigned long load;
3359 long cpu = (long)data;
3360
3361 if (!tg->parent) {
3362 load = cpu_rq(cpu)->load.weight;
3363 } else {
3364 load = tg->parent->cfs_rq[cpu]->h_load;
3365 load *= tg->se[cpu]->load.weight;
3366 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
3367 }
3368
3369 tg->cfs_rq[cpu]->h_load = load;
3370
3371 return 0;
3372}
3373
3374static void update_h_load(long cpu)
3375{
3376 struct rq *rq = cpu_rq(cpu);
3377 unsigned long now = jiffies;
3378
3379 if (rq->h_load_throttle == now)
3380 return;
3381
3382 rq->h_load_throttle = now;
3383
3384 rcu_read_lock();
3385 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3386 rcu_read_unlock();
3387}
3388
3389static unsigned long task_h_load(struct task_struct *p)
3390{
3391 struct cfs_rq *cfs_rq = task_cfs_rq(p);
3392 unsigned long load;
3393
3394 load = p->se.load.weight;
3395 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3396
3397 return load;
3398}
3399#else
3400static inline void update_shares(int cpu)
3401{
3402}
3403
3404static inline void update_h_load(long cpu)
3405{
3406}
3407
3408static unsigned long task_h_load(struct task_struct *p)
3409{
3410 return p->se.load.weight;
3411}
3412#endif
3413
3414
3415
3416
3417
3418
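/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *               during load balancing.
 */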
3419struct sd_lb_stats {
3420 struct sched_group *busiest;
3421 struct sched_group *this;
3422 unsigned long total_load;
3423 unsigned long total_pwr;
3424 unsigned long avg_load;
3425
3426
3427 unsigned long this_load;
3428 unsigned long this_load_per_task;
3429 unsigned long this_nr_running;
3430 unsigned long this_has_capacity;
3431 unsigned int this_idle_cpus;
3432
3433
3434 unsigned int busiest_idle_cpus;
3435 unsigned long max_load;
3436 unsigned long busiest_load_per_task;
3437 unsigned long busiest_nr_running;
3438 unsigned long busiest_group_capacity;
3439 unsigned long busiest_has_capacity;
3440 unsigned int busiest_group_weight;
3441
3442 int group_imb;
3443};
3444
3445
3446
3447
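/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */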
3448struct sg_lb_stats {
3449 unsigned long avg_load;
3450 unsigned long group_load;
3451 unsigned long sum_nr_running;
3452 unsigned long sum_weighted_load;
3453 unsigned long group_capacity;
3454 unsigned long idle_cpus;
3455 unsigned long group_weight;
3456 int group_imb;
3457 int group_has_capacity;
3458};
3459
3460
3461
3462
3463
3464
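/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 */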
3465static inline int get_sd_load_idx(struct sched_domain *sd,
3466 enum cpu_idle_type idle)
3467{
3468 int load_idx;
3469
3470 switch (idle) {
3471 case CPU_NOT_IDLE:
3472 load_idx = sd->busy_idx;
3473 break;
3474
3475 case CPU_NEWLY_IDLE:
3476 load_idx = sd->newidle_idx;
3477 break;
3478 default:
3479 load_idx = sd->idle_idx;
3480 break;
3481 }
3482
3483 return load_idx;
3484}
3485
3486unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3487{
3488 return SCHED_POWER_SCALE;
3489}
3490
3491unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3492{
3493 return default_scale_freq_power(sd, cpu);
3494}
3495
3496unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3497{
3498 unsigned long weight = sd->span_weight;
3499 unsigned long smt_gain = sd->smt_gain;
3500
3501 smt_gain /= weight;
3502
3503 return smt_gain;
3504}
3505
3506unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3507{
3508 return default_scale_smt_power(sd, cpu);
3509}
3510
3511unsigned long scale_rt_power(int cpu)
3512{
3513 struct rq *rq = cpu_rq(cpu);
3514 u64 total, available, age_stamp, avg;
3515
3516
3517
3518
3519
3520 age_stamp = ACCESS_ONCE(rq->age_stamp);
3521 avg = ACCESS_ONCE(rq->rt_avg);
3522
3523 total = sched_avg_period() + (rq->clock - age_stamp);
3524
3525 if (unlikely(total < avg)) {
3526
3527 available = 0;
3528 } else {
3529 available = total - avg;
3530 }
3531
3532 if (unlikely((s64)total < SCHED_POWER_SCALE))
3533 total = SCHED_POWER_SCALE;
3534
3535 total >>= SCHED_POWER_SHIFT;
3536
3537 return div_u64(available, total);
3538}
3539
3540static void update_cpu_power(struct sched_domain *sd, int cpu)
3541{
3542 unsigned long weight = sd->span_weight;
3543 unsigned long power = SCHED_POWER_SCALE;
3544 struct sched_group *sdg = sd->groups;
3545
3546 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3547 if (sched_feat(ARCH_POWER))
3548 power *= arch_scale_smt_power(sd, cpu);
3549 else
3550 power *= default_scale_smt_power(sd, cpu);
3551
3552 power >>= SCHED_POWER_SHIFT;
3553 }
3554
3555 sdg->sgp->power_orig = power;
3556
3557 if (sched_feat(ARCH_POWER))
3558 power *= arch_scale_freq_power(sd, cpu);
3559 else
3560 power *= default_scale_freq_power(sd, cpu);
3561
3562 power >>= SCHED_POWER_SHIFT;
3563
3564 power *= scale_rt_power(cpu);
3565 power >>= SCHED_POWER_SHIFT;
3566
3567 if (!power)
3568 power = 1;
3569
3570 cpu_rq(cpu)->cpu_power = power;
3571 sdg->sgp->power = power;
3572}
3573
3574void update_group_power(struct sched_domain *sd, int cpu)
3575{
3576 struct sched_domain *child = sd->child;
3577 struct sched_group *group, *sdg = sd->groups;
3578 unsigned long power;
3579 unsigned long interval;
3580
3581 interval = msecs_to_jiffies(sd->balance_interval);
3582 interval = clamp(interval, 1UL, max_load_balance_interval);
3583 sdg->sgp->next_update = jiffies + interval;
3584
3585 if (!child) {
3586 update_cpu_power(sd, cpu);
3587 return;
3588 }
3589
3590 power = 0;
3591
3592 if (child->flags & SD_OVERLAP) {
3593
3594
3595
3596
3597
3598 for_each_cpu(cpu, sched_group_cpus(sdg))
3599 power += power_of(cpu);
3600 } else {
3601
3602
3603
3604
3605
3606 group = child->groups;
3607 do {
3608 power += group->sgp->power;
3609 group = group->next;
3610 } while (group != child->groups);
3611 }
3612
3613 sdg->sgp->power_orig = sdg->sgp->power = power;
3614}
3615
3616
3617
3618
3619
3620
3621
3622
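/*
 * Try and fix up capacity for tiny (SMT) siblings: if a group's power
 * has been scaled down to less than one full capacity unit but roughly
 * 90% of its original power remains, still report a capacity of one.
 */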
3623static inline int
3624fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3625{
3626
3627
3628
3629 if (!(sd->flags & SD_SHARE_CPUPOWER))
3630 return 0;
3631
3632
3633
3634
3635 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
3636 return 1;
3637
3638 return 0;
3639}
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
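/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */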
3650static inline void update_sg_lb_stats(struct lb_env *env,
3651 struct sched_group *group, int load_idx,
3652 int local_group, int *balance, struct sg_lb_stats *sgs)
3653{
3654 unsigned long nr_running, max_nr_running, min_nr_running;
3655 unsigned long load, max_cpu_load, min_cpu_load;
3656 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3657 unsigned long avg_load_per_task = 0;
3658 int i;
3659
3660 if (local_group)
3661 balance_cpu = group_balance_cpu(group);
3662
3663
3664 max_cpu_load = 0;
3665 min_cpu_load = ~0UL;
3666 max_nr_running = 0;
3667 min_nr_running = ~0UL;
3668
3669 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
3670 struct rq *rq = cpu_rq(i);
3671
3672 nr_running = rq->nr_running;
3673
3674
3675 if (local_group) {
3676 if (idle_cpu(i) && !first_idle_cpu &&
3677 cpumask_test_cpu(i, sched_group_mask(group))) {
3678 first_idle_cpu = 1;
3679 balance_cpu = i;
3680 }
3681
3682 load = target_load(i, load_idx);
3683 } else {
3684 load = source_load(i, load_idx);
3685 if (load > max_cpu_load)
3686 max_cpu_load = load;
3687 if (min_cpu_load > load)
3688 min_cpu_load = load;
3689
3690 if (nr_running > max_nr_running)
3691 max_nr_running = nr_running;
3692 if (min_nr_running > nr_running)
3693 min_nr_running = nr_running;
3694 }
3695
3696 sgs->group_load += load;
3697 sgs->sum_nr_running += nr_running;
3698 sgs->sum_weighted_load += weighted_cpuload(i);
3699 if (idle_cpu(i))
3700 sgs->idle_cpus++;
3701 }
3702
3703
3704
3705
3706
3707
3708
3709 if (local_group) {
3710 if (env->idle != CPU_NEWLY_IDLE) {
3711 if (balance_cpu != env->dst_cpu) {
3712 *balance = 0;
3713 return;
3714 }
3715 update_group_power(env->sd, env->dst_cpu);
3716 } else if (time_after_eq(jiffies, group->sgp->next_update))
3717 update_group_power(env->sd, env->dst_cpu);
3718 }
3719
3720
3721 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732 if (sgs->sum_nr_running)
3733 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3734
3735 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3736 (max_nr_running - min_nr_running) > 1)
3737 sgs->group_imb = 1;
3738
3739 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3740 SCHED_POWER_SCALE);
3741 if (!sgs->group_capacity)
3742 sgs->group_capacity = fix_small_capacity(env->sd, group);
3743 sgs->group_weight = group->group_weight;
3744
3745 if (sgs->group_capacity > sgs->sum_nr_running)
3746 sgs->group_has_capacity = 1;
3747}
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
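/**
 * update_sd_pick_busiest - return true if @sg should become the busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 */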
3759static bool update_sd_pick_busiest(struct lb_env *env,
3760 struct sd_lb_stats *sds,
3761 struct sched_group *sg,
3762 struct sg_lb_stats *sgs)
3763{
3764 if (sgs->avg_load <= sds->max_load)
3765 return false;
3766
3767 if (sgs->sum_nr_running > sgs->group_capacity)
3768 return true;
3769
3770 if (sgs->group_imb)
3771 return true;
3772
3773
3774
3775
3776
3777
3778 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3779 env->dst_cpu < group_first_cpu(sg)) {
3780 if (!sds->busiest)
3781 return true;
3782
3783 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
3784 return true;
3785 }
3786
3787 return false;
3788}
3789
3790
3791
3792
3793
3794
3795
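/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */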
3796static inline void update_sd_lb_stats(struct lb_env *env,
3797 int *balance, struct sd_lb_stats *sds)
3798{
3799 struct sched_domain *child = env->sd->child;
3800 struct sched_group *sg = env->sd->groups;
3801 struct sg_lb_stats sgs;
3802 int load_idx, prefer_sibling = 0;
3803
3804 if (child && child->flags & SD_PREFER_SIBLING)
3805 prefer_sibling = 1;
3806
3807 load_idx = get_sd_load_idx(env->sd, env->idle);
3808
3809 do {
3810 int local_group;
3811
3812 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3813 memset(&sgs, 0, sizeof(sgs));
3814 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
3815
3816 if (local_group && !(*balance))
3817 return;
3818
3819 sds->total_load += sgs.group_load;
3820 sds->total_pwr += sg->sgp->power;
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832 if (prefer_sibling && !local_group && sds->this_has_capacity)
3833 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3834
3835 if (local_group) {
3836 sds->this_load = sgs.avg_load;
3837 sds->this = sg;
3838 sds->this_nr_running = sgs.sum_nr_running;
3839 sds->this_load_per_task = sgs.sum_weighted_load;
3840 sds->this_has_capacity = sgs.group_has_capacity;
3841 sds->this_idle_cpus = sgs.idle_cpus;
3842 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3843 sds->max_load = sgs.avg_load;
3844 sds->busiest = sg;
3845 sds->busiest_nr_running = sgs.sum_nr_running;
3846 sds->busiest_idle_cpus = sgs.idle_cpus;
3847 sds->busiest_group_capacity = sgs.group_capacity;
3848 sds->busiest_load_per_task = sgs.sum_weighted_load;
3849 sds->busiest_has_capacity = sgs.group_has_capacity;
3850 sds->busiest_group_weight = sgs.group_weight;
3851 sds->group_imb = sgs.group_imb;
3852 }
3853
3854 sg = sg->next;
3855 } while (sg != env->sd->groups);
3856}
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
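/*
 * check_asym_packing - check whether SD_ASYM_PACKING work should be
 * pulled to this (lower numbered) cpu.
 *
 * With asymmetric packing we prefer to fill the lowest numbered cpus
 * first; if the busiest group starts at a higher numbered cpu than the
 * destination, set env->imbalance to that group's load and return 1,
 * otherwise return 0.
 */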
3881static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3882{
3883 int busiest_cpu;
3884
3885 if (!(env->sd->flags & SD_ASYM_PACKING))
3886 return 0;
3887
3888 if (!sds->busiest)
3889 return 0;
3890
3891 busiest_cpu = group_first_cpu(sds->busiest);
3892 if (env->dst_cpu > busiest_cpu)
3893 return 0;
3894
3895 env->imbalance = DIV_ROUND_CLOSEST(
3896 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3897
3898 return 1;
3899}
3900
3901
3902
3903
3904
3905
3906
3907
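/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *                       amongst the groups of a sched_domain, during
 *                       load balancing.
 * @env: The load balancing environment.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 */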
3908static inline
3909void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
3910{
3911 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3912 unsigned int imbn = 2;
3913 unsigned long scaled_busy_load_per_task;
3914
3915 if (sds->this_nr_running) {
3916 sds->this_load_per_task /= sds->this_nr_running;
3917 if (sds->busiest_load_per_task >
3918 sds->this_load_per_task)
3919 imbn = 1;
3920 } else {
3921 sds->this_load_per_task =
3922 cpu_avg_load_per_task(env->dst_cpu);
3923 }
3924
3925 scaled_busy_load_per_task = sds->busiest_load_per_task
3926 * SCHED_POWER_SCALE;
3927 scaled_busy_load_per_task /= sds->busiest->sgp->power;
3928
3929 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
3930 (scaled_busy_load_per_task * imbn)) {
3931 env->imbalance = sds->busiest_load_per_task;
3932 return;
3933 }
3934
3935
3936
3937
3938
3939
3940
3941 pwr_now += sds->busiest->sgp->power *
3942 min(sds->busiest_load_per_task, sds->max_load);
3943 pwr_now += sds->this->sgp->power *
3944 min(sds->this_load_per_task, sds->this_load);
3945 pwr_now /= SCHED_POWER_SCALE;
3946
3947
3948 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3949 sds->busiest->sgp->power;
3950 if (sds->max_load > tmp)
3951 pwr_move += sds->busiest->sgp->power *
3952 min(sds->busiest_load_per_task, sds->max_load - tmp);
3953
3954
3955 if (sds->max_load * sds->busiest->sgp->power <
3956 sds->busiest_load_per_task * SCHED_POWER_SCALE)
3957 tmp = (sds->max_load * sds->busiest->sgp->power) /
3958 sds->this->sgp->power;
3959 else
3960 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3961 sds->this->sgp->power;
3962 pwr_move += sds->this->sgp->power *
3963 min(sds->this_load_per_task, sds->this_load + tmp);
3964 pwr_move /= SCHED_POWER_SCALE;
3965
3966
3967 if (pwr_move > pwr_now)
3968 env->imbalance = sds->busiest_load_per_task;
3969}
3970
3971
3972
3973
3974
3975
3976
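/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *                       groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 */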
3977static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
3978{
3979 unsigned long max_pull, load_above_capacity = ~0UL;
3980
3981 sds->busiest_load_per_task /= sds->busiest_nr_running;
3982 if (sds->group_imb) {
3983 sds->busiest_load_per_task =
3984 min(sds->busiest_load_per_task, sds->avg_load);
3985 }
3986
3987
3988
3989
3990
3991
3992 if (sds->max_load < sds->avg_load) {
3993 env->imbalance = 0;
3994 return fix_small_imbalance(env, sds);
3995 }
3996
3997 if (!sds->group_imb) {
3998
3999
4000
4001 load_above_capacity = (sds->busiest_nr_running -
4002 sds->busiest_group_capacity);
4003
4004 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4005
4006 load_above_capacity /= sds->busiest->sgp->power;
4007 }
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4020
4021
4022 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4023 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4024 / SCHED_POWER_SCALE;
4025
4026
4027
4028
4029
4030
4031
4032 if (env->imbalance < sds->busiest_load_per_task)
4033 return fix_small_imbalance(env, sds);
4034
4035}
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
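/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance, or NULL when the domain is already balanced.
 *
 * Also calculates the amount of weighted load which should be moved to
 * restore balance, storing it in env->imbalance.
 *
 * @env: The load balancing environment.
 * @balance: Pointer to a variable indicating if this_cpu is the
 *	appropriate cpu to perform load balancing at this level.
 */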
4058static struct sched_group *
4059find_busiest_group(struct lb_env *env, int *balance)
4060{
4061 struct sd_lb_stats sds;
4062
4063 memset(&sds, 0, sizeof(sds));
4064
4065
4066
4067
4068
4069 update_sd_lb_stats(env, balance, &sds);
4070
4071
4072
4073
4074
4075 if (!(*balance))
4076 goto ret;
4077
4078 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4079 check_asym_packing(env, &sds))
4080 return sds.busiest;
4081
4082
4083 if (!sds.busiest || sds.busiest_nr_running == 0)
4084 goto out_balanced;
4085
4086 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4087
4088
4089
4090
4091
4092
4093 if (sds.group_imb)
4094 goto force_balance;
4095
4096
4097 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4098 !sds.busiest_has_capacity)
4099 goto force_balance;
4100
4101
4102
4103
4104
4105 if (sds.this_load >= sds.max_load)
4106 goto out_balanced;
4107
4108
4109
4110
4111
4112 if (sds.this_load >= sds.avg_load)
4113 goto out_balanced;
4114
4115 if (env->idle == CPU_IDLE) {
4116
4117
4118
4119
4120
4121
4122 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4123 sds.busiest_nr_running <= sds.busiest_group_weight)
4124 goto out_balanced;
4125 } else {
4126
4127
4128
4129
4130 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4131 goto out_balanced;
4132 }
4133
4134force_balance:
4135
4136 calculate_imbalance(env, &sds);
4137 return sds.busiest;
4138
4139out_balanced:
4140ret:
4141 env->imbalance = 0;
4142 return NULL;
4143}
4144
4145
4146
4147
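/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */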
4148static struct rq *find_busiest_queue(struct lb_env *env,
4149 struct sched_group *group)
4150{
4151 struct rq *busiest = NULL, *rq;
4152 unsigned long max_load = 0;
4153 int i;
4154
4155 for_each_cpu(i, sched_group_cpus(group)) {
4156 unsigned long power = power_of(i);
4157 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4158 SCHED_POWER_SCALE);
4159 unsigned long wl;
4160
4161 if (!capacity)
4162 capacity = fix_small_capacity(env->sd, group);
4163
4164 if (!cpumask_test_cpu(i, env->cpus))
4165 continue;
4166
4167 rq = cpu_rq(i);
4168 wl = weighted_cpuload(i);
4169
4170
4171
4172
4173
4174 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4175 continue;
4176
4177
4178
4179
4180
4181
4182
4183 wl = (wl * SCHED_POWER_SCALE) / power;
4184
4185 if (wl > max_load) {
4186 max_load = wl;
4187 busiest = rq;
4188 }
4189 }
4190
4191 return busiest;
4192}
4193
4194
4195
4196
4197
4198#define MAX_PINNED_INTERVAL 512
4199
4200
4201DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4202
4203static int need_active_balance(struct lb_env *env)
4204{
4205 struct sched_domain *sd = env->sd;
4206
4207 if (env->idle == CPU_NEWLY_IDLE) {
4208
4209
4210
4211
4212
4213
4214 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4215 return 1;
4216 }
4217
4218 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4219}
4220
4221static int active_load_balance_cpu_stop(void *data);
4222
4223
4224
4225
4226
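/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */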
4227static int load_balance(int this_cpu, struct rq *this_rq,
4228 struct sched_domain *sd, enum cpu_idle_type idle,
4229 int *balance)
4230{
4231 int ld_moved, cur_ld_moved, active_balance = 0;
4232 int lb_iterations, max_lb_iterations;
4233 struct sched_group *group;
4234 struct rq *busiest;
4235 unsigned long flags;
4236 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4237
4238 struct lb_env env = {
4239 .sd = sd,
4240 .dst_cpu = this_cpu,
4241 .dst_rq = this_rq,
4242 .dst_grpmask = sched_group_cpus(sd->groups),
4243 .idle = idle,
4244 .loop_break = sched_nr_migrate_break,
4245 .cpus = cpus,
4246 };
4247
4248 cpumask_copy(cpus, cpu_active_mask);
4249 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4250
4251 schedstat_inc(sd, lb_count[idle]);
4252
4253redo:
4254 group = find_busiest_group(&env, balance);
4255
4256 if (*balance == 0)
4257 goto out_balanced;
4258
4259 if (!group) {
4260 schedstat_inc(sd, lb_nobusyg[idle]);
4261 goto out_balanced;
4262 }
4263
4264 busiest = find_busiest_queue(&env, group);
4265 if (!busiest) {
4266 schedstat_inc(sd, lb_nobusyq[idle]);
4267 goto out_balanced;
4268 }
4269
4270 BUG_ON(busiest == env.dst_rq);
4271
4272 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4273
4274 ld_moved = 0;
4275 lb_iterations = 1;
4276 if (busiest->nr_running > 1) {
4277
4278
4279
4280
4281
4282
4283 env.flags |= LBF_ALL_PINNED;
4284 env.src_cpu = busiest->cpu;
4285 env.src_rq = busiest;
4286 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4287
4288 update_h_load(env.src_cpu);
4289more_balance:
4290 local_irq_save(flags);
4291 double_rq_lock(env.dst_rq, busiest);
4292
4293
4294
4295
4296
4297 cur_ld_moved = move_tasks(&env);
4298 ld_moved += cur_ld_moved;
4299 double_rq_unlock(env.dst_rq, busiest);
4300 local_irq_restore(flags);
4301
4302 if (env.flags & LBF_NEED_BREAK) {
4303 env.flags &= ~LBF_NEED_BREAK;
4304 goto more_balance;
4305 }
4306
4307
4308
4309
4310 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4311 resched_cpu(env.dst_cpu);
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4333 lb_iterations++ < max_lb_iterations) {
4334
4335 env.dst_rq = cpu_rq(env.new_dst_cpu);
4336 env.dst_cpu = env.new_dst_cpu;
4337 env.flags &= ~LBF_SOME_PINNED;
4338 env.loop = 0;
4339 env.loop_break = sched_nr_migrate_break;
4340
4341
4342
4343
4344 goto more_balance;
4345 }
4346
4347
4348 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4349 cpumask_clear_cpu(cpu_of(busiest), cpus);
4350 if (!cpumask_empty(cpus)) {
4351 env.loop = 0;
4352 env.loop_break = sched_nr_migrate_break;
4353 goto redo;
4354 }
4355 goto out_balanced;
4356 }
4357 }
4358
4359 if (!ld_moved) {
4360 schedstat_inc(sd, lb_failed[idle]);
4361
4362
4363
4364
4365
4366
4367 if (idle != CPU_NEWLY_IDLE)
4368 sd->nr_balance_failed++;
4369
4370 if (need_active_balance(&env)) {
4371 raw_spin_lock_irqsave(&busiest->lock, flags);
4372
4373
4374
4375
4376
4377 if (!cpumask_test_cpu(this_cpu,
4378 tsk_cpus_allowed(busiest->curr))) {
4379 raw_spin_unlock_irqrestore(&busiest->lock,
4380 flags);
4381 env.flags |= LBF_ALL_PINNED;
4382 goto out_one_pinned;
4383 }
4384
4385
4386
4387
4388
4389
4390 if (!busiest->active_balance) {
4391 busiest->active_balance = 1;
4392 busiest->push_cpu = this_cpu;
4393 active_balance = 1;
4394 }
4395 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4396
4397 if (active_balance) {
4398 stop_one_cpu_nowait(cpu_of(busiest),
4399 active_load_balance_cpu_stop, busiest,
4400 &busiest->active_balance_work);
4401 }
4402
4403
4404
4405
4406
4407 sd->nr_balance_failed = sd->cache_nice_tries+1;
4408 }
4409 } else
4410 sd->nr_balance_failed = 0;
4411
4412 if (likely(!active_balance)) {
4413
4414 sd->balance_interval = sd->min_interval;
4415 } else {
4416
4417
4418
4419
4420
4421
4422 if (sd->balance_interval < sd->max_interval)
4423 sd->balance_interval *= 2;
4424 }
4425
4426 goto out;
4427
4428out_balanced:
4429 schedstat_inc(sd, lb_balanced[idle]);
4430
4431 sd->nr_balance_failed = 0;
4432
4433out_one_pinned:
4434
4435 if (((env.flags & LBF_ALL_PINNED) &&
4436 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4437 (sd->balance_interval < sd->max_interval))
4438 sd->balance_interval *= 2;
4439
4440 ld_moved = 0;
4441out:
4442 return ld_moved;
4443}
4444
4445
4446
4447
4448
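/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */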
4449void idle_balance(int this_cpu, struct rq *this_rq)
4450{
4451 struct sched_domain *sd;
4452 int pulled_task = 0;
4453 unsigned long next_balance = jiffies + HZ;
4454
4455 this_rq->idle_stamp = this_rq->clock;
4456
4457 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4458 return;
4459
4460
4461
4462
4463 raw_spin_unlock(&this_rq->lock);
4464
4465 update_shares(this_cpu);
4466 rcu_read_lock();
4467 for_each_domain(this_cpu, sd) {
4468 unsigned long interval;
4469 int balance = 1;
4470
4471 if (!(sd->flags & SD_LOAD_BALANCE))
4472 continue;
4473
4474 if (sd->flags & SD_BALANCE_NEWIDLE) {
4475
4476 pulled_task = load_balance(this_cpu, this_rq,
4477 sd, CPU_NEWLY_IDLE, &balance);
4478 }
4479
4480 interval = msecs_to_jiffies(sd->balance_interval);
4481 if (time_after(next_balance, sd->last_balance + interval))
4482 next_balance = sd->last_balance + interval;
4483 if (pulled_task) {
4484 this_rq->idle_stamp = 0;
4485 break;
4486 }
4487 }
4488 rcu_read_unlock();
4489
4490 raw_spin_lock(&this_rq->lock);
4491
4492 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4493
4494
4495
4496
4497 this_rq->next_balance = next_balance;
4498 }
4499}
4500
4501
4502
4503
4504
4505
4506
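/*
 * active_load_balance_cpu_stop is run by the cpu stopper on the busiest
 * cpu. It pushes one running task from the busiest runqueue to the
 * previously identified target (push_cpu) runqueue.
 */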
4507static int active_load_balance_cpu_stop(void *data)
4508{
4509 struct rq *busiest_rq = data;
4510 int busiest_cpu = cpu_of(busiest_rq);
4511 int target_cpu = busiest_rq->push_cpu;
4512 struct rq *target_rq = cpu_rq(target_cpu);
4513 struct sched_domain *sd;
4514
4515 raw_spin_lock_irq(&busiest_rq->lock);
4516
4517
4518 if (unlikely(busiest_cpu != smp_processor_id() ||
4519 !busiest_rq->active_balance))
4520 goto out_unlock;
4521
4522
4523 if (busiest_rq->nr_running <= 1)
4524 goto out_unlock;
4525
4526
4527
4528
4529
4530
4531 BUG_ON(busiest_rq == target_rq);
4532
4533
4534 double_lock_balance(busiest_rq, target_rq);
4535
4536
4537 rcu_read_lock();
4538 for_each_domain(target_cpu, sd) {
4539 if ((sd->flags & SD_LOAD_BALANCE) &&
4540 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4541 break;
4542 }
4543
4544 if (likely(sd)) {
4545 struct lb_env env = {
4546 .sd = sd,
4547 .dst_cpu = target_cpu,
4548 .dst_rq = target_rq,
4549 .src_cpu = busiest_rq->cpu,
4550 .src_rq = busiest_rq,
4551 .idle = CPU_IDLE,
4552 };
4553
4554 schedstat_inc(sd, alb_count);
4555
4556 if (move_one_task(&env))
4557 schedstat_inc(sd, alb_pushed);
4558 else
4559 schedstat_inc(sd, alb_failed);
4560 }
4561 rcu_read_unlock();
4562 double_unlock_balance(busiest_rq, target_rq);
4563out_unlock:
4564 busiest_rq->active_balance = 0;
4565 raw_spin_unlock_irq(&busiest_rq->lock);
4566 return 0;
4567}
4568
4569#ifdef CONFIG_NO_HZ
4570
4571
4572
4573
4574
4575
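/*
 * idle load balancing details
 * - When one of the busy CPUs notices that there may be an idle
 *   rebalancing needed, it kicks an idle cpu, which then performs
 *   idle load balancing on behalf of all the idle CPUs.
 */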
4576static struct {
4577 cpumask_var_t idle_cpus_mask;
4578 atomic_t nr_cpus;
4579 unsigned long next_balance;
4580} nohz ____cacheline_aligned;
4581
4582static inline int find_new_ilb(int call_cpu)
4583{
4584 int ilb = cpumask_first(nohz.idle_cpus_mask);
4585
4586 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4587 return ilb;
4588
4589 return nr_cpu_ids;
4590}
4591
4592
4593
4594
4595
4596
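/*
 * Kick an idle cpu (the first one in nohz.idle_cpus_mask) to run the
 * nohz idle load balancing on behalf of all the idle cpus whose ticks
 * are stopped.
 */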
4597static void nohz_balancer_kick(int cpu)
4598{
4599 int ilb_cpu;
4600
4601 nohz.next_balance++;
4602
4603 ilb_cpu = find_new_ilb(cpu);
4604
4605 if (ilb_cpu >= nr_cpu_ids)
4606 return;
4607
4608 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4609 return;
4610
4611
4612
4613
4614
4615
4616 smp_send_reschedule(ilb_cpu);
4617 return;
4618}
4619
4620static inline void nohz_balance_exit_idle(int cpu)
4621{
4622 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4623 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4624 atomic_dec(&nohz.nr_cpus);
4625 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4626 }
4627}
4628
4629static inline void set_cpu_sd_state_busy(void)
4630{
4631 struct sched_domain *sd;
4632 int cpu = smp_processor_id();
4633
4634 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4635 return;
4636 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4637
4638 rcu_read_lock();
4639 for_each_domain(cpu, sd)
4640 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4641 rcu_read_unlock();
4642}
4643
4644void set_cpu_sd_state_idle(void)
4645{
4646 struct sched_domain *sd;
4647 int cpu = smp_processor_id();
4648
4649 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4650 return;
4651 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4652
4653 rcu_read_lock();
4654 for_each_domain(cpu, sd)
4655 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4656 rcu_read_unlock();
4657}
4658
4659
4660
4661
4662
4663void nohz_balance_enter_idle(int cpu)
4664{
4665
4666
4667
4668 if (!cpu_active(cpu))
4669 return;
4670
4671 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4672 return;
4673
4674 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4675 atomic_inc(&nohz.nr_cpus);
4676 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4677}
4678
4679static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4680 unsigned long action, void *hcpu)
4681{
4682 switch (action & ~CPU_TASKS_FROZEN) {
4683 case CPU_DYING:
4684 nohz_balance_exit_idle(smp_processor_id());
4685 return NOTIFY_OK;
4686 default:
4687 return NOTIFY_DONE;
4688 }
4689}
4690#endif
4691
4692static DEFINE_SPINLOCK(balancing);
4693
4694
4695
4696
4697
4698void update_max_interval(void)
4699{
4700 max_load_balance_interval = HZ*num_online_cpus()/10;
4701}
4702
4703
4704
4705
4706
4707
4708
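/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * The balance interval is scaled by the domain's busy_factor when the
 * cpu is not idle.
 */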
4709static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4710{
4711 int balance = 1;
4712 struct rq *rq = cpu_rq(cpu);
4713 unsigned long interval;
4714 struct sched_domain *sd;
4715
4716 unsigned long next_balance = jiffies + 60*HZ;
4717 int update_next_balance = 0;
4718 int need_serialize;
4719
4720 update_shares(cpu);
4721
4722 rcu_read_lock();
4723 for_each_domain(cpu, sd) {
4724 if (!(sd->flags & SD_LOAD_BALANCE))
4725 continue;
4726
4727 interval = sd->balance_interval;
4728 if (idle != CPU_IDLE)
4729 interval *= sd->busy_factor;
4730
4731
4732 interval = msecs_to_jiffies(interval);
4733 interval = clamp(interval, 1UL, max_load_balance_interval);
4734
4735 need_serialize = sd->flags & SD_SERIALIZE;
4736
4737 if (need_serialize) {
4738 if (!spin_trylock(&balancing))
4739 goto out;
4740 }
4741
4742 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4743 if (load_balance(cpu, rq, sd, idle, &balance)) {
4744
4745
4746
4747
4748 idle = CPU_NOT_IDLE;
4749 }
4750 sd->last_balance = jiffies;
4751 }
4752 if (need_serialize)
4753 spin_unlock(&balancing);
4754out:
4755 if (time_after(next_balance, sd->last_balance + interval)) {
4756 next_balance = sd->last_balance + interval;
4757 update_next_balance = 1;
4758 }
4759
4760
4761
4762
4763
4764
4765 if (!balance)
4766 break;
4767 }
4768 rcu_read_unlock();
4769
4770
4771
4772
4773
4774
4775 if (likely(update_next_balance))
4776 rq->next_balance = next_balance;
4777}
4778
4779#ifdef CONFIG_NO_HZ
4780
4781
4782
4783
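/*
 * In the CONFIG_NO_HZ case, the idle balance kickee does the rebalancing
 * on behalf of all the idle cpus whose ticks are stopped.
 */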
4784static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4785{
4786 struct rq *this_rq = cpu_rq(this_cpu);
4787 struct rq *rq;
4788 int balance_cpu;
4789
4790 if (idle != CPU_IDLE ||
4791 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
4792 goto end;
4793
4794 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4795 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4796 continue;
4797
4798
4799
4800
4801
4802
4803 if (need_resched())
4804 break;
4805
4806 rq = cpu_rq(balance_cpu);
4807
4808 raw_spin_lock_irq(&rq->lock);
4809 update_rq_clock(rq);
4810 update_idle_cpu_load(rq);
4811 raw_spin_unlock_irq(&rq->lock);
4812
4813 rebalance_domains(balance_cpu, CPU_IDLE);
4814
4815 if (time_after(this_rq->next_balance, rq->next_balance))
4816 this_rq->next_balance = rq->next_balance;
4817 }
4818 nohz.next_balance = this_rq->next_balance;
4819end:
4820 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4821}
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
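/*
 * Heuristic for kicking the idle load balancer while an idle cpu exists
 * in the system: kick when this (busy) rq has more than one task, or
 * when some domain level shares package resources with more than one
 * busy cpu, or when SD_ASYM_PACKING finds an idle lower-numbered cpu
 * in the domain span.
 */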
4832static inline int nohz_kick_needed(struct rq *rq, int cpu)
4833{
4834 unsigned long now = jiffies;
4835 struct sched_domain *sd;
4836
4837 if (unlikely(idle_cpu(cpu)))
4838 return 0;
4839
4840
4841
4842
4843
4844 set_cpu_sd_state_busy();
4845 nohz_balance_exit_idle(cpu);
4846
4847
4848
4849
4850
4851 if (likely(!atomic_read(&nohz.nr_cpus)))
4852 return 0;
4853
4854 if (time_before(now, nohz.next_balance))
4855 return 0;
4856
4857 if (rq->nr_running >= 2)
4858 goto need_kick;
4859
4860 rcu_read_lock();
4861 for_each_domain(cpu, sd) {
4862 struct sched_group *sg = sd->groups;
4863 struct sched_group_power *sgp = sg->sgp;
4864 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4865
4866 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4867 goto need_kick_unlock;
4868
4869 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
4870 && (cpumask_first_and(nohz.idle_cpus_mask,
4871 sched_domain_span(sd)) < cpu))
4872 goto need_kick_unlock;
4873
4874 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
4875 break;
4876 }
4877 rcu_read_unlock();
4878 return 0;
4879
4880need_kick_unlock:
4881 rcu_read_unlock();
4882need_kick:
4883 return 1;
4884}
4885#else
4886static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
4887#endif
4888
4889
4890
4891
4892
4893static void run_rebalance_domains(struct softirq_action *h)
4894{
4895 int this_cpu = smp_processor_id();
4896 struct rq *this_rq = cpu_rq(this_cpu);
4897 enum cpu_idle_type idle = this_rq->idle_balance ?
4898 CPU_IDLE : CPU_NOT_IDLE;
4899
4900 rebalance_domains(this_cpu, idle);
4901
4902
4903
4904
4905
4906
4907 nohz_idle_balance(this_cpu, idle);
4908}
4909
4910static inline int on_null_domain(int cpu)
4911{
4912 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4913}
4914
4915
4916
4917
4918void trigger_load_balance(struct rq *rq, int cpu)
4919{
4920
4921 if (time_after_eq(jiffies, rq->next_balance) &&
4922 likely(!on_null_domain(cpu)))
4923 raise_softirq(SCHED_SOFTIRQ);
4924#ifdef CONFIG_NO_HZ
4925 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4926 nohz_balancer_kick(cpu);
4927#endif
4928}
4929
4930static void rq_online_fair(struct rq *rq)
4931{
4932 update_sysctl();
4933}
4934
4935static void rq_offline_fair(struct rq *rq)
4936{
4937 update_sysctl();
4938
4939
4940 unthrottle_offline_cfs_rqs(rq);
4941}
4942
4943#endif
4944
4945
4946
4947
4948static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4949{
4950 struct cfs_rq *cfs_rq;
4951 struct sched_entity *se = &curr->se;
4952
4953 for_each_sched_entity(se) {
4954 cfs_rq = cfs_rq_of(se);
4955 entity_tick(cfs_rq, se, queued);
4956 }
4957}
4958
4959
4960
4961
4962
4963
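/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the runqueue
 *  - preemption disabled
 */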
4964static void task_fork_fair(struct task_struct *p)
4965{
4966 struct cfs_rq *cfs_rq;
4967 struct sched_entity *se = &p->se, *curr;
4968 int this_cpu = smp_processor_id();
4969 struct rq *rq = this_rq();
4970 unsigned long flags;
4971
4972 raw_spin_lock_irqsave(&rq->lock, flags);
4973
4974 update_rq_clock(rq);
4975
4976 cfs_rq = task_cfs_rq(current);
4977 curr = cfs_rq->curr;
4978
4979 if (unlikely(task_cpu(p) != this_cpu)) {
4980 rcu_read_lock();
4981 __set_task_cpu(p, this_cpu);
4982 rcu_read_unlock();
4983 }
4984
4985 update_curr(cfs_rq);
4986
4987 if (curr)
4988 se->vruntime = curr->vruntime;
4989 place_entity(cfs_rq, se, 1);
4990
4991 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
4992
4993
4994
4995
4996 swap(curr->vruntime, se->vruntime);
4997 resched_task(rq->curr);
4998 }
4999
5000 se->vruntime -= cfs_rq->min_vruntime;
5001
5002 raw_spin_unlock_irqrestore(&rq->lock, flags);
5003}
5004
5005
5006
5007
5008
5009static void
5010prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5011{
5012 if (!p->se.on_rq)
5013 return;
5014
5015
5016
5017
5018
5019
5020 if (rq->curr == p) {
5021 if (p->prio > oldprio)
5022 resched_task(rq->curr);
5023 } else
5024 check_preempt_curr(rq, p, 0);
5025}
5026
5027static void switched_from_fair(struct rq *rq, struct task_struct *p)
5028{
5029 struct sched_entity *se = &p->se;
5030 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041 if (!se->on_rq && p->state != TASK_RUNNING) {
5042
5043
5044
5045
5046 place_entity(cfs_rq, se, 0);
5047 se->vruntime -= cfs_rq->min_vruntime;
5048 }
5049}
5050
5051
5052
5053
5054static void switched_to_fair(struct rq *rq, struct task_struct *p)
5055{
5056 if (!p->se.on_rq)
5057 return;
5058
5059
5060
5061
5062
5063
5064 if (rq->curr == p)
5065 resched_task(rq->curr);
5066 else
5067 check_preempt_curr(rq, p, 0);
5068}
5069
5070
5071
5072
5073
5074
5075static void set_curr_task_fair(struct rq *rq)
5076{
5077 struct sched_entity *se = &rq->curr->se;
5078
5079 for_each_sched_entity(se) {
5080 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5081
5082 set_next_entity(cfs_rq, se);
5083
5084 account_cfs_rq_runtime(cfs_rq, 0);
5085 }
5086}
5087
5088void init_cfs_rq(struct cfs_rq *cfs_rq)
5089{
5090 cfs_rq->tasks_timeline = RB_ROOT;
5091 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5092#ifndef CONFIG_64BIT
5093 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5094#endif
5095}
5096
5097#ifdef CONFIG_FAIR_GROUP_SCHED
5098static void task_move_group_fair(struct task_struct *p, int on_rq)
5099{
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
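	/*
	 * A task that is not on a runqueue carries a vruntime relative to
	 * its old cfs_rq's min_vruntime, so rebase it onto the new group's
	 * cfs_rq. Freshly forked tasks (no exec runtime yet) and tasks in
	 * TASK_WAKING already have a normalized vruntime and are treated
	 * as if queued.
	 */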
5125 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5126 on_rq = 1;
5127
5128 if (!on_rq)
5129 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5130 set_task_rq(p, task_cpu(p));
5131 if (!on_rq)
5132 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5133}
5134
5135void free_fair_sched_group(struct task_group *tg)
5136{
5137 int i;
5138
5139 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5140
5141 for_each_possible_cpu(i) {
5142 if (tg->cfs_rq)
5143 kfree(tg->cfs_rq[i]);
5144 if (tg->se)
5145 kfree(tg->se[i]);
5146 }
5147
5148 kfree(tg->cfs_rq);
5149 kfree(tg->se);
5150}
5151
5152int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5153{
5154 struct cfs_rq *cfs_rq;
5155 struct sched_entity *se;
5156 int i;
5157
5158 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5159 if (!tg->cfs_rq)
5160 goto err;
5161 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5162 if (!tg->se)
5163 goto err;
5164
5165 tg->shares = NICE_0_LOAD;
5166
5167 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5168
5169 for_each_possible_cpu(i) {
5170 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5171 GFP_KERNEL, cpu_to_node(i));
5172 if (!cfs_rq)
5173 goto err;
5174
5175 se = kzalloc_node(sizeof(struct sched_entity),
5176 GFP_KERNEL, cpu_to_node(i));
5177 if (!se)
5178 goto err_free_rq;
5179
5180 init_cfs_rq(cfs_rq);
5181 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5182 }
5183
5184 return 1;
5185
5186err_free_rq:
5187 kfree(cfs_rq);
5188err:
5189 return 0;
5190}
5191
5192void unregister_fair_sched_group(struct task_group *tg, int cpu)
5193{
5194 struct rq *rq = cpu_rq(cpu);
5195 unsigned long flags;
5196
5197
5198
5199
5200
5201 if (!tg->cfs_rq[cpu]->on_list)
5202 return;
5203
5204 raw_spin_lock_irqsave(&rq->lock, flags);
5205 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5206 raw_spin_unlock_irqrestore(&rq->lock, flags);
5207}
5208
5209void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5210 struct sched_entity *se, int cpu,
5211 struct sched_entity *parent)
5212{
5213 struct rq *rq = cpu_rq(cpu);
5214
5215 cfs_rq->tg = tg;
5216 cfs_rq->rq = rq;
5217#ifdef CONFIG_SMP
5218
5219 cfs_rq->load_stamp = 1;
5220#endif
5221 init_cfs_rq_runtime(cfs_rq);
5222
5223 tg->cfs_rq[cpu] = cfs_rq;
5224 tg->se[cpu] = se;
5225
5226
5227 if (!se)
5228 return;
5229
5230 if (!parent)
5231 se->cfs_rq = &rq->cfs;
5232 else
5233 se->cfs_rq = parent->my_q;
5234
5235 se->my_q = cfs_rq;
5236 update_load_set(&se->load, 0);
5237 se->parent = parent;
5238}
5239
5240static DEFINE_MUTEX(shares_mutex);
5241
5242int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5243{
5244 int i;
5245 unsigned long flags;
5246
5247
5248
5249
5250 if (!tg->se[0])
5251 return -EINVAL;
5252
5253 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5254
5255 mutex_lock(&shares_mutex);
5256 if (tg->shares == shares)
5257 goto done;
5258
5259 tg->shares = shares;
5260 for_each_possible_cpu(i) {
5261 struct rq *rq = cpu_rq(i);
5262 struct sched_entity *se;
5263
5264 se = tg->se[i];
5265
5266 raw_spin_lock_irqsave(&rq->lock, flags);
5267 for_each_sched_entity(se)
5268 update_cfs_shares(group_cfs_rq(se));
5269 raw_spin_unlock_irqrestore(&rq->lock, flags);
5270 }
5271
5272done:
5273 mutex_unlock(&shares_mutex);
5274 return 0;
5275}
5276#else
5277
5278void free_fair_sched_group(struct task_group *tg) { }
5279
5280int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5281{
5282 return 1;
5283}
5284
5285void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5286
5287#endif
5288
5289
5290static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5291{
5292 struct sched_entity *se = &task->se;
5293 unsigned int rr_interval = 0;
5294
5295
5296
5297
5298
5299 if (rq->cfs.load.weight)
5300 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5301
5302 return rr_interval;
5303}
5304
5305
5306
5307
5308const struct sched_class fair_sched_class = {
5309 .next = &idle_sched_class,
5310 .enqueue_task = enqueue_task_fair,
5311 .dequeue_task = dequeue_task_fair,
5312 .yield_task = yield_task_fair,
5313 .yield_to_task = yield_to_task_fair,
5314
5315 .check_preempt_curr = check_preempt_wakeup,
5316
5317 .pick_next_task = pick_next_task_fair,
5318 .put_prev_task = put_prev_task_fair,
5319
5320#ifdef CONFIG_SMP
5321 .select_task_rq = select_task_rq_fair,
5322
5323 .rq_online = rq_online_fair,
5324 .rq_offline = rq_offline_fair,
5325
5326 .task_waking = task_waking_fair,
5327#endif
5328
5329 .set_curr_task = set_curr_task_fair,
5330 .task_tick = task_tick_fair,
5331 .task_fork = task_fork_fair,
5332
5333 .prio_changed = prio_changed_fair,
5334 .switched_from = switched_from_fair,
5335 .switched_to = switched_to_fair,
5336
5337 .get_rr_interval = get_rr_interval_fair,
5338
5339#ifdef CONFIG_FAIR_GROUP_SCHED
5340 .task_move_group = task_move_group_fair,
5341#endif
5342};
5343
5344#ifdef CONFIG_SCHED_DEBUG
5345void print_cfs_stats(struct seq_file *m, int cpu)
5346{
5347 struct cfs_rq *cfs_rq;
5348
5349 rcu_read_lock();
5350 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5351 print_cfs_rq(m, cpu, cfs_rq);
5352 rcu_read_unlock();
5353}
5354#endif
5355
5356__init void init_sched_fair_class(void)
5357{
5358#ifdef CONFIG_SMP
5359 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5360
5361#ifdef CONFIG_NO_HZ
5362 nohz.next_balance = jiffies;
5363 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5364 cpu_notifier(sched_ilb_notifier, 0);
5365#endif
5366#endif
5367
5368}
5369