/*
 * Completely Fair Scheduling (CFS) class:
 * the vruntime-ordered scheduling policy behind SCHED_NORMAL, SCHED_BATCH
 * and SCHED_IDLE tasks, together with its SMP load-balancing code.
 */
22
23#include <linux/latencytop.h>
24#include <linux/sched.h>
25
26
27
28
29
30
31
32
33
34
35
36
37
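/*
 * Targeted preemption latency for CPU-bound tasks, in nanoseconds
 * (default: 6ms): the period over which CFS tries to run every runnable
 * task once, as long as at most sched_nr_latency tasks are runnable.
 */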
38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40
41
42
43
44
45
46
47
48
49
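/*
 * How the latency/granularity tunables above are scaled against the number
 * of online CPUs: none, linear or logarithmic (the default).
 */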
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
52
53
54
55
56
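/*
 * Minimal preemption granularity for CPU-bound tasks, in nanoseconds
 * (default: 0.75ms). Once more than sched_nr_latency tasks are runnable,
 * the period grows to nr_running times this value (see __sched_period()).
 */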
57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
59
60
61
62
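/* Derived as sysctl_sched_latency / sysctl_sched_min_granularity (default: 8). */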
63static unsigned int sched_nr_latency = 8;
64
65
66
67
68
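/* When set, a freshly forked child is scheduled before its parent (default: off). */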
69unsigned int sysctl_sched_child_runs_first __read_mostly;
70
71
72
73
74
75
76
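/*
 * sys_sched_yield() compat mode: when set, the yielding task is moved behind
 * the rightmost entity in the timeline (see yield_task_fair()).
 */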
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79
80
81
82
83
84
85
86
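/*
 * Wake-up preemption granularity, in nanoseconds (default: 1ms): how far
 * ahead in vruntime the running task must be before a waking task may
 * preempt it (see wakeup_preempt_entity()).
 */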
87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91
92
93
94
95
96
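/*
 * Window, in nanoseconds (default: 10ms), over which per-cgroup load is
 * averaged when group shares are recomputed (see update_cfs_load()).
 */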
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
99static const struct sched_class fair_sched_class;
100
101
102
103
104
105#ifdef CONFIG_FAIR_GROUP_SCHED
106
107
108static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
109{
110 return cfs_rq->rq;
111}
112
113
114#define entity_is_task(se) (!se->my_q)
115
116static inline struct task_struct *task_of(struct sched_entity *se)
117{
118#ifdef CONFIG_SCHED_DEBUG
119 WARN_ON_ONCE(!entity_is_task(se));
120#endif
121 return container_of(se, struct task_struct, se);
122}
123
124
125#define for_each_sched_entity(se) \
126 for (; se; se = se->parent)
127
128static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
129{
130 return p->se.cfs_rq;
131}
132
133
134static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
135{
136 return se->cfs_rq;
137}
138
139
140static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
141{
142 return grp->my_q;
143}
144
145
146
147
148static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
149{
150 return cfs_rq->tg->cfs_rq[this_cpu];
151}
152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156
157
158
159
160
161
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
183
184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
186
187
188static inline int
189is_same_group(struct sched_entity *se, struct sched_entity *pse)
190{
191 if (se->cfs_rq == pse->cfs_rq)
192 return 1;
193
194 return 0;
195}
196
197static inline struct sched_entity *parent_entity(struct sched_entity *se)
198{
199 return se->parent;
200}
201
202
203static inline int depth_se(struct sched_entity *se)
204{
205 int depth = 0;
206
207 for_each_sched_entity(se)
208 depth++;
209
210 return depth;
211}
212
213static void
214find_matching_se(struct sched_entity **se, struct sched_entity **pse)
215{
216 int se_depth, pse_depth;
217
218
219
220
221
222
223
224
225
226 se_depth = depth_se(*se);
227 pse_depth = depth_se(*pse);
228
229 while (se_depth > pse_depth) {
230 se_depth--;
231 *se = parent_entity(*se);
232 }
233
234 while (pse_depth > se_depth) {
235 pse_depth--;
236 *pse = parent_entity(*pse);
237 }
238
239 while (!is_same_group(*se, *pse)) {
240 *se = parent_entity(*se);
241 *pse = parent_entity(*pse);
242 }
243}
244
245#else
246
247static inline struct task_struct *task_of(struct sched_entity *se)
248{
249 return container_of(se, struct task_struct, se);
250}
251
252static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
253{
254 return container_of(cfs_rq, struct rq, cfs);
255}
256
257#define entity_is_task(se) 1
258
259#define for_each_sched_entity(se) \
260 for (; se; se = NULL)
261
262static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
263{
264 return &task_rq(p)->cfs;
265}
266
267static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
268{
269 struct task_struct *p = task_of(se);
270 struct rq *rq = task_rq(p);
271
272 return &rq->cfs;
273}
274
275
276static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
277{
278 return NULL;
279}
280
281static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
282{
283 return &cpu_rq(this_cpu)->cfs;
284}
285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
296
297static inline int
298is_same_group(struct sched_entity *se, struct sched_entity *pse)
299{
300 return 1;
301}
302
303static inline struct sched_entity *parent_entity(struct sched_entity *se)
304{
305 return NULL;
306}
307
308static inline void
309find_matching_se(struct sched_entity **se, struct sched_entity **pse)
310{
311}
312
313#endif
314
315
316
317
318
319
320static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
321{
322 s64 delta = (s64)(vruntime - min_vruntime);
323 if (delta > 0)
324 min_vruntime = vruntime;
325
326 return min_vruntime;
327}
328
329static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
330{
331 s64 delta = (s64)(vruntime - min_vruntime);
332 if (delta < 0)
333 min_vruntime = vruntime;
334
335 return min_vruntime;
336}
337
338static inline int entity_before(struct sched_entity *a,
339 struct sched_entity *b)
340{
341 return (s64)(a->vruntime - b->vruntime) < 0;
342}
343
344static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
345{
346 return se->vruntime - cfs_rq->min_vruntime;
347}
348
349static void update_min_vruntime(struct cfs_rq *cfs_rq)
350{
351 u64 vruntime = cfs_rq->min_vruntime;
352
353 if (cfs_rq->curr)
354 vruntime = cfs_rq->curr->vruntime;
355
356 if (cfs_rq->rb_leftmost) {
357 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
358 struct sched_entity,
359 run_node);
360
361 if (!cfs_rq->curr)
362 vruntime = se->vruntime;
363 else
364 vruntime = min_vruntime(vruntime, se->vruntime);
365 }
366
367 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
368}
369
370
371
372
373static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
374{
375 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
376 struct rb_node *parent = NULL;
377 struct sched_entity *entry;
378 s64 key = entity_key(cfs_rq, se);
379 int leftmost = 1;
380
381
382
383
384 while (*link) {
385 parent = *link;
386 entry = rb_entry(parent, struct sched_entity, run_node);
387
388
389
390
391 if (key < entity_key(cfs_rq, entry)) {
392 link = &parent->rb_left;
393 } else {
394 link = &parent->rb_right;
395 leftmost = 0;
396 }
397 }
398
399
400
401
402
403 if (leftmost)
404 cfs_rq->rb_leftmost = &se->run_node;
405
406 rb_link_node(&se->run_node, parent, link);
407 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
408}
409
410static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
411{
412 if (cfs_rq->rb_leftmost == &se->run_node) {
413 struct rb_node *next_node;
414
415 next_node = rb_next(&se->run_node);
416 cfs_rq->rb_leftmost = next_node;
417 }
418
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420}
421
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
423{
424 struct rb_node *left = cfs_rq->rb_leftmost;
425
426 if (!left)
427 return NULL;
428
429 return rb_entry(left, struct sched_entity, run_node);
430}
431
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
435
436 if (!last)
437 return NULL;
438
439 return rb_entry(last, struct sched_entity, run_node);
440}
441
442
443
444
445
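/*
 * Handler for the sched_* sysctls: recompute sched_nr_latency and keep the
 * normalized (CPU-count independent) copies of the tunables in sync.
 */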
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp,
449 loff_t *ppos)
450{
451 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
452 int factor = get_update_sysctl_factor();
453
454 if (ret || !write)
455 return ret;
456
457 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
458 sysctl_sched_min_granularity);
459
460#define WRT_SYSCTL(name) \
461 (normalized_sysctl_##name = sysctl_##name / (factor))
462 WRT_SYSCTL(sched_min_granularity);
463 WRT_SYSCTL(sched_latency);
464 WRT_SYSCTL(sched_wakeup_granularity);
465#undef WRT_SYSCTL
466
467 return 0;
468}
469#endif
470
471
472
473
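/* delta /= w: scale a wall-clock delta by NICE_0_LOAD / se->load.weight. */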
474static inline unsigned long
475calc_delta_fair(unsigned long delta, struct sched_entity *se)
476{
477 if (unlikely(se->load.weight != NICE_0_LOAD))
478 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
479
480 return delta;
481}
482
483
484
485
486
487
488
489
490
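/*
 * The scheduling period: sysctl_sched_latency while nr_running does not
 * exceed sched_nr_latency, otherwise stretched so that every task still gets
 * at least the minimum granularity. With the defaults above, 10 runnable
 * tasks yield a period of 10 * 0.75ms = 7.5ms.
 */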
491static u64 __sched_period(unsigned long nr_running)
492{
493 u64 period = sysctl_sched_latency;
494 unsigned long nr_latency = sched_nr_latency;
495
496 if (unlikely(nr_running > nr_latency)) {
497 period = sysctl_sched_min_granularity;
498 period *= nr_running;
499 }
500
501 return period;
502}
503
504
505
506
507
508
509
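/*
 * Wall-time slice of one entity: its weight-proportional share of the
 * period, re-derived at every level of the group hierarchy. An entity that
 * is not yet queued is first added to the queue's load.
 */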
510static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
511{
512 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
513
514 for_each_sched_entity(se) {
515 struct load_weight *load;
516 struct load_weight lw;
517
518 cfs_rq = cfs_rq_of(se);
519 load = &cfs_rq->load;
520
521 if (unlikely(!se->on_rq)) {
522 lw = cfs_rq->load;
523
524 update_load_add(&lw, se->load.weight);
525 load = &lw;
526 }
527 slice = calc_delta_mine(slice, se->load.weight, load);
528 }
529 return slice;
530}
531
532
533
534
535
536
537static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
538{
539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
540}
541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
545
546
547
548
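/*
 * Charge delta_exec to the current entity: update its runtime statistics,
 * add the weighted delta to its vruntime and refresh min_vruntime.
 */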
549static inline void
550__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
551 unsigned long delta_exec)
552{
553 unsigned long delta_exec_weighted;
554
555 schedstat_set(curr->statistics.exec_max,
556 max((u64)delta_exec, curr->statistics.exec_max));
557
558 curr->sum_exec_runtime += delta_exec;
559 schedstat_add(cfs_rq, exec_clock, delta_exec);
560 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
561
562 curr->vruntime += delta_exec_weighted;
563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
568}
569
570static void update_curr(struct cfs_rq *cfs_rq)
571{
572 struct sched_entity *curr = cfs_rq->curr;
573 u64 now = rq_of(cfs_rq)->clock_task;
574 unsigned long delta_exec;
575
576 if (unlikely(!curr))
577 return;
578
579
580
581
582
583
584 delta_exec = (unsigned long)(now - curr->exec_start);
585 if (!delta_exec)
586 return;
587
588 __update_curr(cfs_rq, curr, delta_exec);
589 curr->exec_start = now;
590
591 if (entity_is_task(curr)) {
592 struct task_struct *curtask = task_of(curr);
593
594 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
595 cpuacct_charge(curtask, delta_exec);
596 account_group_exec_runtime(curtask, delta_exec);
597 }
598}
599
600static inline void
601update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602{
603 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
604}
605
606
607
608
609static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
610{
611
612
613
614
615 if (se != cfs_rq->curr)
616 update_stats_wait_start(cfs_rq, se);
617}
618
619static void
620update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
621{
622 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
623 rq_of(cfs_rq)->clock - se->statistics.wait_start));
624 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
625 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
626 rq_of(cfs_rq)->clock - se->statistics.wait_start);
627#ifdef CONFIG_SCHEDSTATS
628 if (entity_is_task(se)) {
629 trace_sched_stat_wait(task_of(se),
630 rq_of(cfs_rq)->clock - se->statistics.wait_start);
631 }
632#endif
633 schedstat_set(se->statistics.wait_start, 0);
634}
635
636static inline void
637update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
638{
639
640
641
642
643 if (se != cfs_rq->curr)
644 update_stats_wait_end(cfs_rq, se);
645}
646
647
648
649
650static inline void
651update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
652{
653
654
655
656 se->exec_start = rq_of(cfs_rq)->clock_task;
657}
658
659
660
661
662
663#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
664static void
665add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
666{
667 cfs_rq->task_weight += weight;
668}
669#else
670static inline void
671add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
672{
673}
674#endif
675
676static void
677account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
678{
679 update_load_add(&cfs_rq->load, se->load.weight);
680 if (!parent_entity(se))
681 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
682 if (entity_is_task(se)) {
683 add_cfs_task_weight(cfs_rq, se->load.weight);
684 list_add(&se->group_node, &cfs_rq->tasks);
685 }
686 cfs_rq->nr_running++;
687}
688
689static void
690account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
691{
692 update_load_sub(&cfs_rq->load, se->load.weight);
693 if (!parent_entity(se))
694 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
695 if (entity_is_task(se)) {
696 add_cfs_task_weight(cfs_rq, -se->load.weight);
697 list_del_init(&se->group_node);
698 }
699 cfs_rq->nr_running--;
700}
701
702#ifdef CONFIG_FAIR_GROUP_SCHED
703# ifdef CONFIG_SMP
704static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
705 int global_update)
706{
707 struct task_group *tg = cfs_rq->tg;
708 long load_avg;
709
710 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
711 load_avg -= cfs_rq->load_contribution;
712
713 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
714 atomic_add(load_avg, &tg->load_weight);
715 cfs_rq->load_contribution += load_avg;
716 }
717}
718
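/*
 * Maintain this cfs_rq's time-weighted average load over the shares window
 * and periodically fold the change into the task group's global load_weight.
 */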
719static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
720{
721 u64 period = sysctl_sched_shares_window;
722 u64 now, delta;
723 unsigned long load = cfs_rq->load.weight;
724
725 if (cfs_rq->tg == &root_task_group)
726 return;
727
728 now = rq_of(cfs_rq)->clock_task;
729 delta = now - cfs_rq->load_stamp;
730
731
732 if (cfs_rq->load_stamp > cfs_rq->load_last &&
733 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0;
736 }
737
738 cfs_rq->load_stamp = now;
739 cfs_rq->load_unacc_exec_time = 0;
740 cfs_rq->load_period += delta;
741 if (load) {
742 cfs_rq->load_last = now;
743 cfs_rq->load_avg += delta * load;
744 }
745
746
747 if (global_update || cfs_rq->load_period > period
748 || !cfs_rq->load_period)
749 update_cfs_rq_load_contribution(cfs_rq, global_update);
750
751 while (cfs_rq->load_period > period) {
752
753
754
755
756
757 asm("" : "+rm" (cfs_rq->load_period));
758 cfs_rq->load_period /= 2;
759 cfs_rq->load_avg /= 2;
760 }
761
762 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
763 list_del_leaf_cfs_rq(cfs_rq);
764}
765
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
767 long weight_delta)
768{
769 long load_weight, load, shares;
770
771 load = cfs_rq->load.weight + weight_delta;
772
773 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load;
776
777 shares = (tg->shares * load);
778 if (load_weight)
779 shares /= load_weight;
780
781 if (shares < MIN_SHARES)
782 shares = MIN_SHARES;
783 if (shares > tg->shares)
784 shares = tg->shares;
785
786 return shares;
787}
788
789static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0);
794 }
795}
796# else
797static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{
799}
800
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
802 long weight_delta)
803{
804 return tg->shares;
805}
806
807static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
808{
809}
810# endif
811static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
812 unsigned long weight)
813{
814 if (se->on_rq) {
815
816 if (cfs_rq->curr == se)
817 update_curr(cfs_rq);
818 account_entity_dequeue(cfs_rq, se);
819 }
820
821 update_load_set(&se->load, weight);
822
823 if (se->on_rq)
824 account_entity_enqueue(cfs_rq, se);
825}
826
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
828{
829 struct task_group *tg;
830 struct sched_entity *se;
831 long shares;
832
833 tg = cfs_rq->tg;
834 se = tg->se[cpu_of(rq_of(cfs_rq))];
835 if (!se)
836 return;
837#ifndef CONFIG_SMP
838 if (likely(se->load.weight == tg->shares))
839 return;
840#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
842
843 reweight_entity(cfs_rq_of(se), se, shares);
844}
845#else
846static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{
848}
849
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
851{
852}
853
854static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
855{
856}
857#endif
858
859static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
860{
861#ifdef CONFIG_SCHEDSTATS
862 struct task_struct *tsk = NULL;
863
864 if (entity_is_task(se))
865 tsk = task_of(se);
866
867 if (se->statistics.sleep_start) {
868 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
869
870 if ((s64)delta < 0)
871 delta = 0;
872
873 if (unlikely(delta > se->statistics.sleep_max))
874 se->statistics.sleep_max = delta;
875
876 se->statistics.sleep_start = 0;
877 se->statistics.sum_sleep_runtime += delta;
878
879 if (tsk) {
880 account_scheduler_latency(tsk, delta >> 10, 1);
881 trace_sched_stat_sleep(tsk, delta);
882 }
883 }
884 if (se->statistics.block_start) {
885 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
886
887 if ((s64)delta < 0)
888 delta = 0;
889
890 if (unlikely(delta > se->statistics.block_max))
891 se->statistics.block_max = delta;
892
893 se->statistics.block_start = 0;
894 se->statistics.sum_sleep_runtime += delta;
895
896 if (tsk) {
897 if (tsk->in_iowait) {
898 se->statistics.iowait_sum += delta;
899 se->statistics.iowait_count++;
900 trace_sched_stat_iowait(tsk, delta);
901 }
902
903
904
905
906
907
908 if (unlikely(prof_on == SLEEP_PROFILING)) {
909 profile_hits(SLEEP_PROFILING,
910 (void *)get_wchan(tsk),
911 delta >> 20);
912 }
913 account_scheduler_latency(tsk, delta >> 10, 0);
914 }
915 }
916#endif
917}
918
919static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
920{
921#ifdef CONFIG_SCHED_DEBUG
922 s64 d = se->vruntime - cfs_rq->min_vruntime;
923
924 if (d < 0)
925 d = -d;
926
927 if (d > 3*sysctl_sched_latency)
928 schedstat_inc(cfs_rq, nr_spread_over);
929#endif
930}
931
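/*
 * Place an entity on the timeline: new tasks are debited one vslice
 * (START_DEBIT), waking sleepers get credit of up to one latency period
 * (halved with GENTLE_FAIR_SLEEPERS), but an entity never moves to an
 * earlier vruntime than it already had.
 */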
932static void
933place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
934{
935 u64 vruntime = cfs_rq->min_vruntime;
936
937
938
939
940
941
942
943 if (initial && sched_feat(START_DEBIT))
944 vruntime += sched_vslice(cfs_rq, se);
945
946
947 if (!initial) {
948 unsigned long thresh = sysctl_sched_latency;
949
950
951
952
953
954 if (sched_feat(GENTLE_FAIR_SLEEPERS))
955 thresh >>= 1;
956
957 vruntime -= thresh;
958 }
959
960
961 vruntime = max_vruntime(se->vruntime, vruntime);
962
963 se->vruntime = vruntime;
964}
965
966static void
967enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
968{
969
970
971
972
973 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
974 se->vruntime += cfs_rq->min_vruntime;
975
976
977
978
979 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
982 account_entity_enqueue(cfs_rq, se);
983
984 if (flags & ENQUEUE_WAKEUP) {
985 place_entity(cfs_rq, se, 0);
986 enqueue_sleeper(cfs_rq, se);
987 }
988
989 update_stats_enqueue(cfs_rq, se);
990 check_spread(cfs_rq, se);
991 if (se != cfs_rq->curr)
992 __enqueue_entity(cfs_rq, se);
993 se->on_rq = 1;
994
995 if (cfs_rq->nr_running == 1)
996 list_add_leaf_cfs_rq(cfs_rq);
997}
998
999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1000{
1001 if (!se || cfs_rq->last == se)
1002 cfs_rq->last = NULL;
1003
1004 if (!se || cfs_rq->next == se)
1005 cfs_rq->next = NULL;
1006}
1007
1008static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1009{
1010 for_each_sched_entity(se)
1011 __clear_buddies(cfs_rq_of(se), se);
1012}
1013
1014static void
1015dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1016{
1017
1018
1019
1020 update_curr(cfs_rq);
1021
1022 update_stats_dequeue(cfs_rq, se);
1023 if (flags & DEQUEUE_SLEEP) {
1024#ifdef CONFIG_SCHEDSTATS
1025 if (entity_is_task(se)) {
1026 struct task_struct *tsk = task_of(se);
1027
1028 if (tsk->state & TASK_INTERRUPTIBLE)
1029 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1030 if (tsk->state & TASK_UNINTERRUPTIBLE)
1031 se->statistics.block_start = rq_of(cfs_rq)->clock;
1032 }
1033#endif
1034 }
1035
1036 clear_buddies(cfs_rq, se);
1037
1038 if (se != cfs_rq->curr)
1039 __dequeue_entity(cfs_rq, se);
1040 se->on_rq = 0;
1041 update_cfs_load(cfs_rq, 0);
1042 account_entity_dequeue(cfs_rq, se);
1043 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0);
1045
1046
1047
1048
1049
1050
1051 if (!(flags & DEQUEUE_SLEEP))
1052 se->vruntime -= cfs_rq->min_vruntime;
1053}
1054
1055
1056
1057
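/*
 * Tick-driven preemption check: reschedule the running entity once it has
 * consumed its ideal slice, or (with WAKEUP_PREEMPT) once it is more than a
 * slice of vruntime ahead of the leftmost queued entity.
 */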
1058static void
1059check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1060{
1061 unsigned long ideal_runtime, delta_exec;
1062
1063 ideal_runtime = sched_slice(cfs_rq, curr);
1064 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1065 if (delta_exec > ideal_runtime) {
1066 resched_task(rq_of(cfs_rq)->curr);
1067
1068
1069
1070
1071 clear_buddies(cfs_rq, curr);
1072 return;
1073 }
1074
1075
1076
1077
1078
1079
1080 if (!sched_feat(WAKEUP_PREEMPT))
1081 return;
1082
1083 if (delta_exec < sysctl_sched_min_granularity)
1084 return;
1085
1086 if (cfs_rq->nr_running > 1) {
1087 struct sched_entity *se = __pick_next_entity(cfs_rq);
1088 s64 delta = curr->vruntime - se->vruntime;
1089
1090 if (delta < 0)
1091 return;
1092
1093 if (delta > ideal_runtime)
1094 resched_task(rq_of(cfs_rq)->curr);
1095 }
1096}
1097
1098static void
1099set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1100{
1101
1102 if (se->on_rq) {
1103
1104
1105
1106
1107
1108 update_stats_wait_end(cfs_rq, se);
1109 __dequeue_entity(cfs_rq, se);
1110 }
1111
1112 update_stats_curr_start(cfs_rq, se);
1113 cfs_rq->curr = se;
1114#ifdef CONFIG_SCHEDSTATS
1115
1116
1117
1118
1119
1120 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1121 se->statistics.slice_max = max(se->statistics.slice_max,
1122 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1123 }
1124#endif
1125 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1126}
1127
1128static int
1129wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1130
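/*
 * Pick the entity to run next: normally the leftmost (smallest vruntime),
 * but prefer the "next" and "last" buddies when doing so does not hurt
 * fairness (wakeup_preempt_entity() < 1).
 */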
1131static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1132{
1133 struct sched_entity *se = __pick_next_entity(cfs_rq);
1134 struct sched_entity *left = se;
1135
1136 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1137 se = cfs_rq->next;
1138
1139
1140
1141
1142 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1143 se = cfs_rq->last;
1144
1145 clear_buddies(cfs_rq, se);
1146
1147 return se;
1148}
1149
1150static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1151{
1152
1153
1154
1155
1156 if (prev->on_rq)
1157 update_curr(cfs_rq);
1158
1159 check_spread(cfs_rq, prev);
1160 if (prev->on_rq) {
1161 update_stats_wait_start(cfs_rq, prev);
1162
1163 __enqueue_entity(cfs_rq, prev);
1164 }
1165 cfs_rq->curr = NULL;
1166}
1167
1168static void
1169entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1170{
1171
1172
1173
1174 update_curr(cfs_rq);
1175
1176
1177
1178
1179 update_entity_shares_tick(cfs_rq);
1180
1181#ifdef CONFIG_SCHED_HRTICK
1182
1183
1184
1185
1186 if (queued) {
1187 resched_task(rq_of(cfs_rq)->curr);
1188 return;
1189 }
1190
1191
1192
1193 if (!sched_feat(DOUBLE_TICK) &&
1194 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1195 return;
1196#endif
1197
1198 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
1199 check_preempt_tick(cfs_rq, curr);
1200}
1201
1202
1203
1204
1205
1206#ifdef CONFIG_SCHED_HRTICK
1207static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1208{
1209 struct sched_entity *se = &p->se;
1210 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1211
1212 WARN_ON(task_rq(p) != rq);
1213
1214 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
1215 u64 slice = sched_slice(cfs_rq, se);
1216 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1217 s64 delta = slice - ran;
1218
1219 if (delta < 0) {
1220 if (rq->curr == p)
1221 resched_task(p);
1222 return;
1223 }
1224
1225
1226
1227
1228
1229 if (rq->curr != p)
1230 delta = max_t(s64, 10000LL, delta);
1231
1232 hrtick_start(rq, delta);
1233 }
1234}
1235
1236
1237
1238
1239
1240
1241static void hrtick_update(struct rq *rq)
1242{
1243 struct task_struct *curr = rq->curr;
1244
1245 if (curr->sched_class != &fair_sched_class)
1246 return;
1247
1248 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
1249 hrtick_start_fair(rq, curr);
1250}
1251#else
1252static inline void
1253hrtick_start_fair(struct rq *rq, struct task_struct *p)
1254{
1255}
1256
1257static inline void hrtick_update(struct rq *rq)
1258{
1259}
1260#endif
1261
1262
1263
1264
1265
1266
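/*
 * The enqueue_task method is called before nr_running is increased. Put the
 * task's entity into the rbtree and walk up the hierarchy until an already
 * queued parent entity is reached, updating load and shares on the way.
 */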
1267static void
1268enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1269{
1270 struct cfs_rq *cfs_rq;
1271 struct sched_entity *se = &p->se;
1272
1273 for_each_sched_entity(se) {
1274 if (se->on_rq)
1275 break;
1276 cfs_rq = cfs_rq_of(se);
1277 enqueue_entity(cfs_rq, se, flags);
1278 flags = ENQUEUE_WAKEUP;
1279 }
1280
1281 for_each_sched_entity(se) {
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283
1284 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0);
1286 }
1287
1288 hrtick_update(rq);
1289}
1290
1291
1292
1293
1294
1295
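/*
 * The dequeue_task method is called before nr_running is decreased. Remove
 * the task's entity from the rbtree; a group entity stays queued while it
 * still has load left on this CPU.
 */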
1296static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1297{
1298 struct cfs_rq *cfs_rq;
1299 struct sched_entity *se = &p->se;
1300
1301 for_each_sched_entity(se) {
1302 cfs_rq = cfs_rq_of(se);
1303 dequeue_entity(cfs_rq, se, flags);
1304
1305
1306 if (cfs_rq->load.weight)
1307 break;
1308 flags |= DEQUEUE_SLEEP;
1309 }
1310
1311 for_each_sched_entity(se) {
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313
1314 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0);
1316 }
1317
1318 hrtick_update(rq);
1319}
1320
1321
1322
1323
1324
1325
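/*
 * sched_yield() support: normally we only account the time run so far; with
 * sched_compat_yield (or for SCHED_BATCH tasks) the task is instead moved
 * just past the rightmost entity in the tree.
 */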
1326static void yield_task_fair(struct rq *rq)
1327{
1328 struct task_struct *curr = rq->curr;
1329 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1330 struct sched_entity *rightmost, *se = &curr->se;
1331
1332
1333
1334
1335 if (unlikely(cfs_rq->nr_running == 1))
1336 return;
1337
1338 clear_buddies(cfs_rq, se);
1339
1340 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1341 update_rq_clock(rq);
1342
1343
1344
1345 update_curr(cfs_rq);
1346
1347 return;
1348 }
1349
1350
1351
1352 rightmost = __pick_last_entity(cfs_rq);
1353
1354
1355
1356 if (unlikely(!rightmost || entity_before(rightmost, se)))
1357 return;
1358
1359
1360
1361
1362
1363
1364 se->vruntime = rightmost->vruntime + 1;
1365}
1366
1367#ifdef CONFIG_SMP
1368
1369static void task_waking_fair(struct rq *rq, struct task_struct *p)
1370{
1371 struct sched_entity *se = &p->se;
1372 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1373
1374 se->vruntime -= cfs_rq->min_vruntime;
1375}
1376
1377#ifdef CONFIG_FAIR_GROUP_SCHED
1378
1379
1380
1381
1382
1383
1384
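/*
 * effective_load() estimates how the CPU's root-level load changes when 'wl'
 * of weight is added to the task group's queue on 'cpu' ('wg' being the
 * change in the group's total weight), by re-deriving the shares at each
 * level of the hierarchy.
 */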
1385static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1386{
1387 struct sched_entity *se = tg->se[cpu];
1388
1389 if (!tg->parent)
1390 return wl;
1391
1392 for_each_sched_entity(se) {
1393 long lw, w;
1394
1395 tg = se->my_q->tg;
1396 w = se->my_q->load.weight;
1397
1398
1399 lw = atomic_read(&tg->load_weight);
1400 lw -= se->my_q->load_contribution;
1401 lw += w + wg;
1402
1403 wl += w;
1404
1405 if (lw > 0 && wl < lw)
1406 wl = (wl * tg->shares) / lw;
1407 else
1408 wl = tg->shares;
1409
1410
1411 if (wl < MIN_SHARES)
1412 wl = MIN_SHARES;
1413 wl -= se->load.weight;
1414 wg = 0;
1415 }
1416
1417 return wl;
1418}
1419
1420#else
1421
1422static inline unsigned long effective_load(struct task_group *tg, int cpu,
1423 unsigned long wl, unsigned long wg)
1424{
1425 return wl;
1426}
1427
1428#endif
1429
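/*
 * wake_affine(): decide whether it is worth pulling the waking task to the
 * waking CPU rather than leaving it on its previous CPU, by comparing the
 * imbalance-weighted effective loads of the two CPUs.
 */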
1430static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1431{
1432 s64 this_load, load;
1433 int idx, this_cpu, prev_cpu;
1434 unsigned long tl_per_task;
1435 struct task_group *tg;
1436 unsigned long weight;
1437 int balanced;
1438
1439 idx = sd->wake_idx;
1440 this_cpu = smp_processor_id();
1441 prev_cpu = task_cpu(p);
1442 load = source_load(prev_cpu, idx);
1443 this_load = target_load(this_cpu, idx);
1444
1445
1446
1447
1448
1449
1450 rcu_read_lock();
1451 if (sync) {
1452 tg = task_group(current);
1453 weight = current->se.load.weight;
1454
1455 this_load += effective_load(tg, this_cpu, -weight, -weight);
1456 load += effective_load(tg, prev_cpu, 0, -weight);
1457 }
1458
1459 tg = task_group(p);
1460 weight = p->se.load.weight;
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471 if (this_load > 0) {
1472 s64 this_eff_load, prev_eff_load;
1473
1474 this_eff_load = 100;
1475 this_eff_load *= power_of(prev_cpu);
1476 this_eff_load *= this_load +
1477 effective_load(tg, this_cpu, weight, weight);
1478
1479 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1480 prev_eff_load *= power_of(this_cpu);
1481 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1482
1483 balanced = this_eff_load <= prev_eff_load;
1484 } else
1485 balanced = true;
1486 rcu_read_unlock();
1487
1488
1489
1490
1491
1492
1493 if (sync && balanced)
1494 return 1;
1495
1496 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1497 tl_per_task = cpu_avg_load_per_task(this_cpu);
1498
1499 if (balanced ||
1500 (this_load <= load &&
1501 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1502
1503
1504
1505
1506
1507 schedstat_inc(sd, ttwu_move_affine);
1508 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1509
1510 return 1;
1511 }
1512 return 0;
1513}
1514
1515
1516
1517
1518
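/*
 * find_idlest_group: find the least loaded group that 'p' is allowed to run
 * in within this domain, or NULL when the local group is already good enough.
 */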
1519static struct sched_group *
1520find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1521 int this_cpu, int load_idx)
1522{
1523 struct sched_group *idlest = NULL, *group = sd->groups;
1524 unsigned long min_load = ULONG_MAX, this_load = 0;
1525 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1526
1527 do {
1528 unsigned long load, avg_load;
1529 int local_group;
1530 int i;
1531
1532
1533 if (!cpumask_intersects(sched_group_cpus(group),
1534 &p->cpus_allowed))
1535 continue;
1536
1537 local_group = cpumask_test_cpu(this_cpu,
1538 sched_group_cpus(group));
1539
1540
1541 avg_load = 0;
1542
1543 for_each_cpu(i, sched_group_cpus(group)) {
1544
1545 if (local_group)
1546 load = source_load(i, load_idx);
1547 else
1548 load = target_load(i, load_idx);
1549
1550 avg_load += load;
1551 }
1552
1553
1554 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1555
1556 if (local_group) {
1557 this_load = avg_load;
1558 } else if (avg_load < min_load) {
1559 min_load = avg_load;
1560 idlest = group;
1561 }
1562 } while (group = group->next, group != sd->groups);
1563
1564 if (!idlest || 100*this_load < imbalance*min_load)
1565 return NULL;
1566 return idlest;
1567}
1568
1569
1570
1571
1572static int
1573find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1574{
1575 unsigned long load, min_load = ULONG_MAX;
1576 int idlest = -1;
1577 int i;
1578
1579
1580 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1581 load = weighted_cpuload(i);
1582
1583 if (load < min_load || (load == min_load && i == this_cpu)) {
1584 min_load = load;
1585 idlest = i;
1586 }
1587 }
1588
1589 return idlest;
1590}
1591
1592
1593
1594
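/*
 * Try to find an idle CPU within the cache-sharing (SD_SHARE_PKG_RESOURCES)
 * domain of 'target', preferring target itself and the task's previous CPU.
 */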
1595static int select_idle_sibling(struct task_struct *p, int target)
1596{
1597 int cpu = smp_processor_id();
1598 int prev_cpu = task_cpu(p);
1599 struct sched_domain *sd;
1600 int i;
1601
1602
1603
1604
1605
1606 if (target == cpu && idle_cpu(cpu))
1607 return cpu;
1608
1609
1610
1611
1612
1613 if (target == prev_cpu && idle_cpu(prev_cpu))
1614 return prev_cpu;
1615
1616
1617
1618
1619 for_each_domain(target, sd) {
1620 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1621 break;
1622
1623 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1624 if (idle_cpu(i)) {
1625 target = i;
1626 break;
1627 }
1628 }
1629
1630
1631
1632
1633
1634 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1635 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1636 break;
1637 }
1638
1639 return target;
1640}
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
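/*
 * select_task_rq_fair: pick a CPU for a forking, execing or waking task.
 * Wakeups prefer an idle cache-sharing sibling of an affine CPU; otherwise
 * the domain hierarchy is walked looking for the idlest group and CPU.
 */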
1653static int
1654select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1655{
1656 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1657 int cpu = smp_processor_id();
1658 int prev_cpu = task_cpu(p);
1659 int new_cpu = cpu;
1660 int want_affine = 0;
1661 int want_sd = 1;
1662 int sync = wake_flags & WF_SYNC;
1663
1664 if (sd_flag & SD_BALANCE_WAKE) {
1665 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1666 want_affine = 1;
1667 new_cpu = prev_cpu;
1668 }
1669
1670 for_each_domain(cpu, tmp) {
1671 if (!(tmp->flags & SD_LOAD_BALANCE))
1672 continue;
1673
1674
1675
1676
1677
1678 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1679 unsigned long power = 0;
1680 unsigned long nr_running = 0;
1681 unsigned long capacity;
1682 int i;
1683
1684 for_each_cpu(i, sched_domain_span(tmp)) {
1685 power += power_of(i);
1686 nr_running += cpu_rq(i)->cfs.nr_running;
1687 }
1688
1689 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1690
1691 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1692 nr_running /= 2;
1693
1694 if (nr_running < capacity)
1695 want_sd = 0;
1696 }
1697
1698
1699
1700
1701
1702 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1703 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1704 affine_sd = tmp;
1705 want_affine = 0;
1706 }
1707
1708 if (!want_sd && !want_affine)
1709 break;
1710
1711 if (!(tmp->flags & sd_flag))
1712 continue;
1713
1714 if (want_sd)
1715 sd = tmp;
1716 }
1717
1718 if (affine_sd) {
1719 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1720 return select_idle_sibling(p, cpu);
1721 else
1722 return select_idle_sibling(p, prev_cpu);
1723 }
1724
1725 while (sd) {
1726 int load_idx = sd->forkexec_idx;
1727 struct sched_group *group;
1728 int weight;
1729
1730 if (!(sd->flags & sd_flag)) {
1731 sd = sd->child;
1732 continue;
1733 }
1734
1735 if (sd_flag & SD_BALANCE_WAKE)
1736 load_idx = sd->wake_idx;
1737
1738 group = find_idlest_group(sd, p, cpu, load_idx);
1739 if (!group) {
1740 sd = sd->child;
1741 continue;
1742 }
1743
1744 new_cpu = find_idlest_cpu(group, p, cpu);
1745 if (new_cpu == -1 || new_cpu == cpu) {
1746
1747 sd = sd->child;
1748 continue;
1749 }
1750
1751
1752 cpu = new_cpu;
1753 weight = sd->span_weight;
1754 sd = NULL;
1755 for_each_domain(cpu, tmp) {
1756 if (weight <= tmp->span_weight)
1757 break;
1758 if (tmp->flags & sd_flag)
1759 sd = tmp;
1760 }
1761
1762 }
1763
1764 return new_cpu;
1765}
1766#endif
1767
1768static unsigned long
1769wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1770{
1771 unsigned long gran = sysctl_sched_wakeup_granularity;
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786 if (unlikely(se->load.weight != NICE_0_LOAD))
1787 gran = calc_delta_fair(gran, se);
1788
1789 return gran;
1790}
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
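/*
 * Should 'se' preempt 'curr'? Returns -1 when curr is not ahead in vruntime,
 * 1 when curr leads by more than the (scaled) wakeup granularity, else 0.
 */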
1806static int
1807wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1808{
1809 s64 gran, vdiff = curr->vruntime - se->vruntime;
1810
1811 if (vdiff <= 0)
1812 return -1;
1813
1814 gran = wakeup_gran(curr, se);
1815 if (vdiff > gran)
1816 return 1;
1817
1818 return 0;
1819}
1820
1821static void set_last_buddy(struct sched_entity *se)
1822{
1823 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1824 for_each_sched_entity(se)
1825 cfs_rq_of(se)->last = se;
1826 }
1827}
1828
1829static void set_next_buddy(struct sched_entity *se)
1830{
1831 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1832 for_each_sched_entity(se)
1833 cfs_rq_of(se)->next = se;
1834 }
1835}
1836
1837
1838
1839
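/* Preempt the current task with a newly woken task if needed. */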
1840static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1841{
1842 struct task_struct *curr = rq->curr;
1843 struct sched_entity *se = &curr->se, *pse = &p->se;
1844 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1845 int scale = cfs_rq->nr_running >= sched_nr_latency;
1846
1847 if (unlikely(se == pse))
1848 return;
1849
1850 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1851 set_next_buddy(pse);
1852
1853
1854
1855
1856
1857 if (test_tsk_need_resched(curr))
1858 return;
1859
1860
1861
1862
1863
1864 if (unlikely(p->policy != SCHED_NORMAL))
1865 return;
1866
1867
1868 if (unlikely(curr->policy == SCHED_IDLE))
1869 goto preempt;
1870
1871 if (!sched_feat(WAKEUP_PREEMPT))
1872 return;
1873
1874 update_curr(cfs_rq);
1875 find_matching_se(&se, &pse);
1876 BUG_ON(!pse);
1877 if (wakeup_preempt_entity(se, pse) == 1)
1878 goto preempt;
1879
1880 return;
1881
1882preempt:
1883 resched_task(curr);
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893 if (unlikely(!se->on_rq || curr == rq->idle))
1894 return;
1895
1896 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1897 set_last_buddy(se);
1898}
1899
1900static struct task_struct *pick_next_task_fair(struct rq *rq)
1901{
1902 struct task_struct *p;
1903 struct cfs_rq *cfs_rq = &rq->cfs;
1904 struct sched_entity *se;
1905
1906 if (!cfs_rq->nr_running)
1907 return NULL;
1908
1909 do {
1910 se = pick_next_entity(cfs_rq);
1911 set_next_entity(cfs_rq, se);
1912 cfs_rq = group_cfs_rq(se);
1913 } while (cfs_rq);
1914
1915 p = task_of(se);
1916 hrtick_start_fair(rq, p);
1917
1918 return p;
1919}
1920
1921
1922
1923
1924static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1925{
1926 struct sched_entity *se = &prev->se;
1927 struct cfs_rq *cfs_rq;
1928
1929 for_each_sched_entity(se) {
1930 cfs_rq = cfs_rq_of(se);
1931 put_prev_entity(cfs_rq, se);
1932 }
1933}
1934
1935#ifdef CONFIG_SMP
1936
1937
1938
1939
1940
1941
1942
1943
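/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */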
1944static void pull_task(struct rq *src_rq, struct task_struct *p,
1945 struct rq *this_rq, int this_cpu)
1946{
1947 deactivate_task(src_rq, p, 0);
1948 set_task_cpu(p, this_cpu);
1949 activate_task(this_rq, p, 0);
1950 check_preempt_curr(this_rq, p, 0);
1951}
1952
1953
1954
1955
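/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 * Not if it is pinned elsewhere, currently running, or still cache hot
 * (unless balancing has already failed cache_nice_tries times).
 */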
1956static
1957int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1958 struct sched_domain *sd, enum cpu_idle_type idle,
1959 int *all_pinned)
1960{
1961 int tsk_cache_hot = 0;
1962
1963
1964
1965
1966
1967
1968 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1969 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1970 return 0;
1971 }
1972 *all_pinned = 0;
1973
1974 if (task_running(rq, p)) {
1975 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1976 return 0;
1977 }
1978
1979
1980
1981
1982
1983
1984
1985 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1986 if (!tsk_cache_hot ||
1987 sd->nr_balance_failed > sd->cache_nice_tries) {
1988#ifdef CONFIG_SCHEDSTATS
1989 if (tsk_cache_hot) {
1990 schedstat_inc(sd, lb_hot_gained[idle]);
1991 schedstat_inc(p, se.statistics.nr_forced_migrations);
1992 }
1993#endif
1994 return 1;
1995 }
1996
1997 if (tsk_cache_hot) {
1998 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1999 return 0;
2000 }
2001 return 1;
2002}
2003
2004
2005
2006
2007
2008
2009
2010
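/*
 * move_one_task tries to move exactly one task from busiest to this_rq.
 * Returns 1 on success and 0 otherwise.
 */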
2011static int
2012move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2013 struct sched_domain *sd, enum cpu_idle_type idle)
2014{
2015 struct task_struct *p, *n;
2016 struct cfs_rq *cfs_rq;
2017 int pinned = 0;
2018
2019 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2020 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2021
2022 if (!can_migrate_task(p, busiest, this_cpu,
2023 sd, idle, &pinned))
2024 continue;
2025
2026 pull_task(busiest, p, this_rq, this_cpu);
2027
2028
2029
2030
2031
2032 schedstat_inc(sd, lb_gained[idle]);
2033 return 1;
2034 }
2035 }
2036
2037 return 0;
2038}
2039
2040static unsigned long
2041balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2042 unsigned long max_load_move, struct sched_domain *sd,
2043 enum cpu_idle_type idle, int *all_pinned,
2044 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
2045{
2046 int loops = 0, pulled = 0, pinned = 0;
2047 long rem_load_move = max_load_move;
2048 struct task_struct *p, *n;
2049
2050 if (max_load_move == 0)
2051 goto out;
2052
2053 pinned = 1;
2054
2055 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2056 if (loops++ > sysctl_sched_nr_migrate)
2057 break;
2058
2059 if ((p->se.load.weight >> 1) > rem_load_move ||
2060 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
2061 continue;
2062
2063 pull_task(busiest, p, this_rq, this_cpu);
2064 pulled++;
2065 rem_load_move -= p->se.load.weight;
2066
2067#ifdef CONFIG_PREEMPT
2068
2069
2070
2071
2072
2073 if (idle == CPU_NEWLY_IDLE)
2074 break;
2075#endif
2076
2077
2078
2079
2080
2081 if (rem_load_move <= 0)
2082 break;
2083
2084 if (p->prio < *this_best_prio)
2085 *this_best_prio = p->prio;
2086 }
2087out:
2088
2089
2090
2091
2092
2093 schedstat_add(sd, lb_gained[idle], pulled);
2094
2095 if (all_pinned)
2096 *all_pinned = pinned;
2097
2098 return max_load_move - rem_load_move;
2099}
2100
2101#ifdef CONFIG_FAIR_GROUP_SCHED
2102
2103
2104
2105static int update_shares_cpu(struct task_group *tg, int cpu)
2106{
2107 struct cfs_rq *cfs_rq;
2108 unsigned long flags;
2109 struct rq *rq;
2110
2111 if (!tg->se[cpu])
2112 return 0;
2113
2114 rq = cpu_rq(cpu);
2115 cfs_rq = tg->cfs_rq[cpu];
2116
2117 raw_spin_lock_irqsave(&rq->lock, flags);
2118
2119 update_rq_clock(rq);
2120 update_cfs_load(cfs_rq, 1);
2121
2122
2123
2124
2125
2126 update_cfs_shares(cfs_rq, 0);
2127
2128 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129
2130 return 0;
2131}
2132
2133static void update_shares(int cpu)
2134{
2135 struct cfs_rq *cfs_rq;
2136 struct rq *rq = cpu_rq(cpu);
2137
2138 rcu_read_lock();
2139 for_each_leaf_cfs_rq(rq, cfs_rq)
2140 update_shares_cpu(cfs_rq->tg, cpu);
2141 rcu_read_unlock();
2142}
2143
2144static unsigned long
2145load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2146 unsigned long max_load_move,
2147 struct sched_domain *sd, enum cpu_idle_type idle,
2148 int *all_pinned, int *this_best_prio)
2149{
2150 long rem_load_move = max_load_move;
2151 int busiest_cpu = cpu_of(busiest);
2152 struct task_group *tg;
2153
2154 rcu_read_lock();
2155 update_h_load(busiest_cpu);
2156
2157 list_for_each_entry_rcu(tg, &task_groups, list) {
2158 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2159 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2160 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2161 u64 rem_load, moved_load;
2162
2163
2164
2165
2166 if (!busiest_cfs_rq->task_weight)
2167 continue;
2168
2169 rem_load = (u64)rem_load_move * busiest_weight;
2170 rem_load = div_u64(rem_load, busiest_h_load + 1);
2171
2172 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2173 rem_load, sd, idle, all_pinned, this_best_prio,
2174 busiest_cfs_rq);
2175
2176 if (!moved_load)
2177 continue;
2178
2179 moved_load *= busiest_h_load;
2180 moved_load = div_u64(moved_load, busiest_weight + 1);
2181
2182 rem_load_move -= moved_load;
2183 if (rem_load_move < 0)
2184 break;
2185 }
2186 rcu_read_unlock();
2187
2188 return max_load_move - rem_load_move;
2189}
2190#else
2191static inline void update_shares(int cpu)
2192{
2193}
2194
2195static unsigned long
2196load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2197 unsigned long max_load_move,
2198 struct sched_domain *sd, enum cpu_idle_type idle,
2199 int *all_pinned, int *this_best_prio)
2200{
2201 return balance_tasks(this_rq, this_cpu, busiest,
2202 max_load_move, sd, idle, all_pinned,
2203 this_best_prio, &busiest->cfs);
2204}
2205#endif
2206
2207
2208
2209
2210
2211
2212
2213
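/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if any load was moved and 0 otherwise.
 */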
2214static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2215 unsigned long max_load_move,
2216 struct sched_domain *sd, enum cpu_idle_type idle,
2217 int *all_pinned)
2218{
2219 unsigned long total_load_moved = 0, load_moved;
2220 int this_best_prio = this_rq->curr->prio;
2221
2222 do {
2223 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2224 max_load_move - total_load_moved,
2225 sd, idle, all_pinned, &this_best_prio);
2226
2227 total_load_moved += load_moved;
2228
2229#ifdef CONFIG_PREEMPT
2230
2231
2232
2233
2234
2235 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
2236 break;
2237
2238 if (raw_spin_is_contended(&this_rq->lock) ||
2239 raw_spin_is_contended(&busiest->lock))
2240 break;
2241#endif
2242 } while (load_moved && max_load_move > total_load_moved);
2243
2244 return total_load_moved > 0;
2245}
2246
2247
2248
2249
2250
2251
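/* sd_lb_stats - statistics of a sched_domain needed for load balancing. */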
2252struct sd_lb_stats {
2253 struct sched_group *busiest;
2254 struct sched_group *this;
2255 unsigned long total_load;
2256 unsigned long total_pwr;
2257 unsigned long avg_load;
2258
2259
2260 unsigned long this_load;
2261 unsigned long this_load_per_task;
2262 unsigned long this_nr_running;
2263 unsigned long this_has_capacity;
2264 unsigned int this_idle_cpus;
2265
2266
2267 unsigned int busiest_idle_cpus;
2268 unsigned long max_load;
2269 unsigned long busiest_load_per_task;
2270 unsigned long busiest_nr_running;
2271 unsigned long busiest_group_capacity;
2272 unsigned long busiest_has_capacity;
2273 unsigned int busiest_group_weight;
2274
2275 int group_imb;
2276#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2277 int power_savings_balance;
2278 struct sched_group *group_min;
2279 struct sched_group *group_leader;
2280 unsigned long min_load_per_task;
2281 unsigned long leader_nr_running;
2282 unsigned long min_nr_running;
2283#endif
2284};
2285
2286
2287
2288
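/* sg_lb_stats - statistics of a sched_group needed for load balancing. */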
2289struct sg_lb_stats {
2290 unsigned long avg_load;
2291 unsigned long group_load;
2292 unsigned long sum_nr_running;
2293 unsigned long sum_weighted_load;
2294 unsigned long group_capacity;
2295 unsigned long idle_cpus;
2296 unsigned long group_weight;
2297 int group_imb;
2298 int group_has_capacity;
2299};
2300
2301
2302
2303
2304
2305static inline unsigned int group_first_cpu(struct sched_group *group)
2306{
2307 return cpumask_first(sched_group_cpus(group));
2308}
2309
2310
2311
2312
2313
2314
2315static inline int get_sd_load_idx(struct sched_domain *sd,
2316 enum cpu_idle_type idle)
2317{
2318 int load_idx;
2319
2320 switch (idle) {
2321 case CPU_NOT_IDLE:
2322 load_idx = sd->busy_idx;
2323 break;
2324
2325 case CPU_NEWLY_IDLE:
2326 load_idx = sd->newidle_idx;
2327 break;
2328 default:
2329 load_idx = sd->idle_idx;
2330 break;
2331 }
2332
2333 return load_idx;
2334}
2335
2336
2337#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2338
2339
2340
2341
2342
2343
2344
2345
2346static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2347 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2348{
2349
2350
2351
2352
2353 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2354 sds->power_savings_balance = 0;
2355 else {
2356 sds->power_savings_balance = 1;
2357 sds->min_nr_running = ULONG_MAX;
2358 sds->leader_nr_running = 0;
2359 }
2360}
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372static inline void update_sd_power_savings_stats(struct sched_group *group,
2373 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2374{
2375
2376 if (!sds->power_savings_balance)
2377 return;
2378
2379
2380
2381
2382
2383 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2384 !sds->this_nr_running))
2385 sds->power_savings_balance = 0;
2386
2387
2388
2389
2390
2391 if (!sds->power_savings_balance ||
2392 sgs->sum_nr_running >= sgs->group_capacity ||
2393 !sgs->sum_nr_running)
2394 return;
2395
2396
2397
2398
2399
2400
2401 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2402 (sgs->sum_nr_running == sds->min_nr_running &&
2403 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2404 sds->group_min = group;
2405 sds->min_nr_running = sgs->sum_nr_running;
2406 sds->min_load_per_task = sgs->sum_weighted_load /
2407 sgs->sum_nr_running;
2408 }
2409
2410
2411
2412
2413
2414
2415 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2416 return;
2417
2418 if (sgs->sum_nr_running > sds->leader_nr_running ||
2419 (sgs->sum_nr_running == sds->leader_nr_running &&
2420 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2421 sds->group_leader = group;
2422 sds->leader_nr_running = sgs->sum_nr_running;
2423 }
2424}
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2442 int this_cpu, unsigned long *imbalance)
2443{
2444 if (!sds->power_savings_balance)
2445 return 0;
2446
2447 if (sds->this != sds->group_leader ||
2448 sds->group_leader == sds->group_min)
2449 return 0;
2450
2451 *imbalance = sds->min_load_per_task;
2452 sds->busiest = sds->group_min;
2453
2454 return 1;
2455
2456}
2457#else
2458static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2459 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2460{
2461 return;
2462}
2463
2464static inline void update_sd_power_savings_stats(struct sched_group *group,
2465 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2466{
2467 return;
2468}
2469
2470static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2471 int this_cpu, unsigned long *imbalance)
2472{
2473 return 0;
2474}
2475#endif
2476
2477
2478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2479{
2480 return SCHED_LOAD_SCALE;
2481}
2482
2483unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2484{
2485 return default_scale_freq_power(sd, cpu);
2486}
2487
2488unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2489{
2490 unsigned long weight = sd->span_weight;
2491 unsigned long smt_gain = sd->smt_gain;
2492
2493 smt_gain /= weight;
2494
2495 return smt_gain;
2496}
2497
2498unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2499{
2500 return default_scale_smt_power(sd, cpu);
2501}
2502
2503unsigned long scale_rt_power(int cpu)
2504{
2505 struct rq *rq = cpu_rq(cpu);
2506 u64 total, available;
2507
2508 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2509
2510 if (unlikely(total < rq->rt_avg)) {
2511
2512 available = 0;
2513 } else {
2514 available = total - rq->rt_avg;
2515 }
2516
2517 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2518 total = SCHED_LOAD_SCALE;
2519
2520 total >>= SCHED_LOAD_SHIFT;
2521
2522 return div_u64(available, total);
2523}
2524
2525static void update_cpu_power(struct sched_domain *sd, int cpu)
2526{
2527 unsigned long weight = sd->span_weight;
2528 unsigned long power = SCHED_LOAD_SCALE;
2529 struct sched_group *sdg = sd->groups;
2530
2531 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2532 if (sched_feat(ARCH_POWER))
2533 power *= arch_scale_smt_power(sd, cpu);
2534 else
2535 power *= default_scale_smt_power(sd, cpu);
2536
2537 power >>= SCHED_LOAD_SHIFT;
2538 }
2539
2540 sdg->cpu_power_orig = power;
2541
2542 if (sched_feat(ARCH_POWER))
2543 power *= arch_scale_freq_power(sd, cpu);
2544 else
2545 power *= default_scale_freq_power(sd, cpu);
2546
2547 power >>= SCHED_LOAD_SHIFT;
2548
2549 power *= scale_rt_power(cpu);
2550 power >>= SCHED_LOAD_SHIFT;
2551
2552 if (!power)
2553 power = 1;
2554
2555 cpu_rq(cpu)->cpu_power = power;
2556 sdg->cpu_power = power;
2557}
2558
2559static void update_group_power(struct sched_domain *sd, int cpu)
2560{
2561 struct sched_domain *child = sd->child;
2562 struct sched_group *group, *sdg = sd->groups;
2563 unsigned long power;
2564
2565 if (!child) {
2566 update_cpu_power(sd, cpu);
2567 return;
2568 }
2569
2570 power = 0;
2571
2572 group = child->groups;
2573 do {
2574 power += group->cpu_power;
2575 group = group->next;
2576 } while (group != child->groups);
2577
2578 sdg->cpu_power = power;
2579}
2580
2581
2582
2583
2584
2585
2586
2587
2588static inline int
2589fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2590{
2591
2592
2593
2594 if (sd->level != SD_LV_SIBLING)
2595 return 0;
2596
2597
2598
2599
2600 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2601 return 1;
2602
2603 return 0;
2604}
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
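/*
 * update_sg_lb_stats - fill *sgs with this group's load statistics and, for
 * the local group on a regular balance, check that this_cpu is the CPU that
 * should be doing the balancing within the group.
 */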
2619static inline void update_sg_lb_stats(struct sched_domain *sd,
2620 struct sched_group *group, int this_cpu,
2621 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2622 int local_group, const struct cpumask *cpus,
2623 int *balance, struct sg_lb_stats *sgs)
2624{
2625 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2626 int i;
2627 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2628 unsigned long avg_load_per_task = 0;
2629
2630 if (local_group)
2631 balance_cpu = group_first_cpu(group);
2632
2633
2634 max_cpu_load = 0;
2635 min_cpu_load = ~0UL;
2636 max_nr_running = 0;
2637
2638 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2639 struct rq *rq = cpu_rq(i);
2640
2641 if (*sd_idle && rq->nr_running)
2642 *sd_idle = 0;
2643
2644
2645 if (local_group) {
2646 if (idle_cpu(i) && !first_idle_cpu) {
2647 first_idle_cpu = 1;
2648 balance_cpu = i;
2649 }
2650
2651 load = target_load(i, load_idx);
2652 } else {
2653 load = source_load(i, load_idx);
2654 if (load > max_cpu_load) {
2655 max_cpu_load = load;
2656 max_nr_running = rq->nr_running;
2657 }
2658 if (min_cpu_load > load)
2659 min_cpu_load = load;
2660 }
2661
2662 sgs->group_load += load;
2663 sgs->sum_nr_running += rq->nr_running;
2664 sgs->sum_weighted_load += weighted_cpuload(i);
2665 if (idle_cpu(i))
2666 sgs->idle_cpus++;
2667 }
2668
2669
2670
2671
2672
2673
2674
2675 if (idle != CPU_NEWLY_IDLE && local_group) {
2676 if (balance_cpu != this_cpu) {
2677 *balance = 0;
2678 return;
2679 }
2680 update_group_power(sd, this_cpu);
2681 }
2682
2683
2684 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695 if (sgs->sum_nr_running)
2696 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2697
2698 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2699 sgs->group_imb = 1;
2700
2701 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2702 if (!sgs->group_capacity)
2703 sgs->group_capacity = fix_small_capacity(sd, group);
2704 sgs->group_weight = group->group_weight;
2705
2706 if (sgs->group_capacity > sgs->sum_nr_running)
2707 sgs->group_has_capacity = 1;
2708}
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
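/*
 * update_sd_pick_busiest - return true if candidate group 'sg' should become
 * the busiest so far: higher load, running over capacity, internally
 * imbalanced, or preferred by SD_ASYM_PACKING.
 */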
2721static bool update_sd_pick_busiest(struct sched_domain *sd,
2722 struct sd_lb_stats *sds,
2723 struct sched_group *sg,
2724 struct sg_lb_stats *sgs,
2725 int this_cpu)
2726{
2727 if (sgs->avg_load <= sds->max_load)
2728 return false;
2729
2730 if (sgs->sum_nr_running > sgs->group_capacity)
2731 return true;
2732
2733 if (sgs->group_imb)
2734 return true;
2735
2736
2737
2738
2739
2740
2741 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2742 this_cpu < group_first_cpu(sg)) {
2743 if (!sds->busiest)
2744 return true;
2745
2746 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2747 return true;
2748 }
2749
2750 return false;
2751}
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
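/*
 * update_sd_lb_stats - walk every group in the sched_domain and collect the
 * statistics needed to pick the busiest group.
 */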
2763static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2764 enum cpu_idle_type idle, int *sd_idle,
2765 const struct cpumask *cpus, int *balance,
2766 struct sd_lb_stats *sds)
2767{
2768 struct sched_domain *child = sd->child;
2769 struct sched_group *sg = sd->groups;
2770 struct sg_lb_stats sgs;
2771 int load_idx, prefer_sibling = 0;
2772
2773 if (child && child->flags & SD_PREFER_SIBLING)
2774 prefer_sibling = 1;
2775
2776 init_sd_power_savings_stats(sd, sds, idle);
2777 load_idx = get_sd_load_idx(sd, idle);
2778
2779 do {
2780 int local_group;
2781
2782 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2783 memset(&sgs, 0, sizeof(sgs));
2784 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2785 local_group, cpus, balance, &sgs);
2786
2787 if (local_group && !(*balance))
2788 return;
2789
2790 sds->total_load += sgs.group_load;
2791 sds->total_pwr += sg->cpu_power;
2792
 /*
  * In case the child domain prefers tasks to go to siblings
  * first, lower the sg capacity to one so that we'll try
  * and move all the excess tasks away. We lower the capacity
  * of a group only if the local group has the capacity to fit
  * these excess tasks, i.e. nr_running < group_capacity. The
  * extra check prevents always pulling from the heaviest group
  * when it is already under-utilized (possible with a large
  * weight task outweighing the tasks on the system).
  */
2803 if (prefer_sibling && !local_group && sds->this_has_capacity)
2804 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2805
2806 if (local_group) {
2807 sds->this_load = sgs.avg_load;
2808 sds->this = sg;
2809 sds->this_nr_running = sgs.sum_nr_running;
2810 sds->this_load_per_task = sgs.sum_weighted_load;
2811 sds->this_has_capacity = sgs.group_has_capacity;
2812 sds->this_idle_cpus = sgs.idle_cpus;
2813 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2814 sds->max_load = sgs.avg_load;
2815 sds->busiest = sg;
2816 sds->busiest_nr_running = sgs.sum_nr_running;
2817 sds->busiest_idle_cpus = sgs.idle_cpus;
2818 sds->busiest_group_capacity = sgs.group_capacity;
2819 sds->busiest_load_per_task = sgs.sum_weighted_load;
2820 sds->busiest_has_capacity = sgs.group_has_capacity;
2821 sds->busiest_group_weight = sgs.group_weight;
2822 sds->group_imb = sgs.group_imb;
2823 }
2824
2825 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2826 sg = sg->next;
2827 } while (sg != sd->groups);
2828}
2829
2830int __weak arch_sd_sibling_asym_packing(void)
2831{
2832 return 0*SD_ASYM_PACKING;
2833}
2834
/**
 * check_asym_packing - Check to see if the group is packed into the
 *			sched domain.
 *
 * This is primarily intended for use at the sibling level.  Some cores
 * (POWER7-style SMT, for example) prefer to run work on lower numbered
 * threads, since those perform better when the higher threads are idle.
 * So when we have idle threads, we want them to be the higher ones.
 *
 * This packing function is run on idle threads.  It checks whether the
 * busiest CPU in this domain has a higher CPU number than the CPU the
 * function is being run on, assuming a lower CPU number corresponds to
 * a lower SMT thread number.
 *
 * Returns 1 when packing is required and a task should be moved to
 * this CPU.  The amount of the imbalance is returned in *imbalance.
 *
 * @sd: The sched_domain whose packing is to be checked.
 * @sds: Statistics of the sched_domain which is to be packed
 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 * @imbalance: returns amount of imbalance due to packing.
 */
2860static int check_asym_packing(struct sched_domain *sd,
2861 struct sd_lb_stats *sds,
2862 int this_cpu, unsigned long *imbalance)
2863{
2864 int busiest_cpu;
2865
2866 if (!(sd->flags & SD_ASYM_PACKING))
2867 return 0;
2868
2869 if (!sds->busiest)
2870 return 0;
2871
2872 busiest_cpu = group_first_cpu(sds->busiest);
2873 if (this_cpu > busiest_cpu)
2874 return 0;
2875
2876 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2877 SCHED_LOAD_SCALE);
2878 return 1;
2879}
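/*
 * Worked example (illustrative, assuming SCHED_LOAD_SCALE == 1024):
 * max_load is a power-scaled average, so the conversion back to weighted
 * task load is DIV_ROUND_CLOSEST(max_load * cpu_power, SCHED_LOAD_SCALE);
 * e.g. max_load = 1536 on a group with cpu_power = 512 yields
 * DIV_ROUND_CLOSEST(1536 * 512, 1024) = 768.
 */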
2880
/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *			amongst the groups of a sched_domain, during
 *			load balancing.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 * @imbalance: Variable to store the imbalance.
 */
2889static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2890 int this_cpu, unsigned long *imbalance)
2891{
2892 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2893 unsigned int imbn = 2;
2894 unsigned long scaled_busy_load_per_task;
2895
2896 if (sds->this_nr_running) {
2897 sds->this_load_per_task /= sds->this_nr_running;
2898 if (sds->busiest_load_per_task >
2899 sds->this_load_per_task)
2900 imbn = 1;
2901 } else
2902 sds->this_load_per_task =
2903 cpu_avg_load_per_task(this_cpu);
2904
2905 scaled_busy_load_per_task = sds->busiest_load_per_task
2906 * SCHED_LOAD_SCALE;
2907 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2908
2909 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2910 (scaled_busy_load_per_task * imbn)) {
2911 *imbalance = sds->busiest_load_per_task;
2912 return;
2913 }
2914
 /*
  * OK, we don't have enough imbalance to justify moving tasks,
  * however we may be able to increase total CPU power used by
  * moving them.
  */
2921 pwr_now += sds->busiest->cpu_power *
2922 min(sds->busiest_load_per_task, sds->max_load);
2923 pwr_now += sds->this->cpu_power *
2924 min(sds->this_load_per_task, sds->this_load);
2925 pwr_now /= SCHED_LOAD_SCALE;
2926
2927
2928 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2929 sds->busiest->cpu_power;
2930 if (sds->max_load > tmp)
2931 pwr_move += sds->busiest->cpu_power *
2932 min(sds->busiest_load_per_task, sds->max_load - tmp);
2933
2934
2935 if (sds->max_load * sds->busiest->cpu_power <
2936 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2937 tmp = (sds->max_load * sds->busiest->cpu_power) /
2938 sds->this->cpu_power;
2939 else
2940 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2941 sds->this->cpu_power;
2942 pwr_move += sds->this->cpu_power *
2943 min(sds->this_load_per_task, sds->this_load + tmp);
2944 pwr_move /= SCHED_LOAD_SCALE;
2945
2946
2947 if (pwr_move > pwr_now)
2948 *imbalance = sds->busiest_load_per_task;
2949}
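/*
 * Illustrative note: when the early per-task-load check above does not
 * trigger, pwr_now and pwr_move estimate how much weighted load the two
 * groups can usefully run before and after moving one busiest-group
 * task; *imbalance is bumped to one task's load only when the estimate
 * improves, i.e. pwr_move > pwr_now.
 */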
2950
/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: Cpu for which currently load balance is being performed.
 * @imbalance: The variable to store the imbalance.
 */
2958static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2959 unsigned long *imbalance)
2960{
2961 unsigned long max_pull, load_above_capacity = ~0UL;
2962
2963 sds->busiest_load_per_task /= sds->busiest_nr_running;
2964 if (sds->group_imb) {
2965 sds->busiest_load_per_task =
2966 min(sds->busiest_load_per_task, sds->avg_load);
2967 }
2968
 /*
  * With smp nice balancing the busiest group's load can end up
  * below the domain average; treat that as a small imbalance and
  * let fix_small_imbalance() decide whether moving a single task
  * still pays off.
  */
2974 if (sds->max_load < sds->avg_load) {
2975 *imbalance = 0;
2976 return fix_small_imbalance(sds, this_cpu, imbalance);
2977 }
2978
2979 if (!sds->group_imb) {
2980
2981
2982
2983 load_above_capacity = (sds->busiest_nr_running -
2984 sds->busiest_group_capacity);
2985
2986 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2987
2988 load_above_capacity /= sds->busiest->cpu_power;
2989 }
2990
 /*
  * We're trying to get all the cpus to the average_load, so we don't
  * want to push ourselves above the average load, nor do we wish to
  * reduce the max loaded cpu below the average load. At the same time,
  * we also don't want to reduce the group load below the group capacity
  * (so that we can implement power-savings policies etc). Thus we look
  * for the minimum possible imbalance.
  * Be careful of negative numbers as they'll appear as very large values
  * with unsigned longs.
  */
3001 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
3002
3003
3004 *imbalance = min(max_pull * sds->busiest->cpu_power,
3005 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3006 / SCHED_LOAD_SCALE;
3007
 /*
  * If *imbalance is less than the average load per runnable task
  * there is no guarantee that any tasks will be moved, so consider
  * bumping its value to force at least one task to be moved.
  */
3014 if (*imbalance < sds->busiest_load_per_task)
3015 return fix_small_imbalance(sds, this_cpu, imbalance);
3016
3017}
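/*
 * Worked example (illustrative, assuming SCHED_LOAD_SCALE == 1024): with
 * avg_load = 1024, max_load = 1536, this_load = 512, both groups at
 * cpu_power = 1024, and a busiest group running 3 tasks with capacity 2,
 * load_above_capacity becomes 1 * 1024 * 1024 / 1024 = 1024,
 * max_pull = min(1536 - 1024, 1024) = 512 and
 * *imbalance = min(512 * 1024, (1024 - 512) * 1024) / 1024 = 512,
 * i.e. pull just enough weighted load to bring both groups toward the
 * domain average.
 */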
3018
/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @sd: The sched_domain whose busiest group is to be returned.
 * @this_cpu: The cpu for which load balancing is currently being performed.
 * @imbalance: Variable which stores amount of weighted load which should
 *		be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
 * @sd_idle: The idleness of sd
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this level.
 *
 * Returns:	- the busiest group if an imbalance exists.
 *		- If no imbalance and the user has opted for power-savings
 *		  balance, the least loaded group whose CPUs can be put to
 *		  idle by rebalancing its tasks onto our group.
 */
3046static struct sched_group *
3047find_busiest_group(struct sched_domain *sd, int this_cpu,
3048 unsigned long *imbalance, enum cpu_idle_type idle,
3049 int *sd_idle, const struct cpumask *cpus, int *balance)
3050{
3051 struct sd_lb_stats sds;
3052
3053 memset(&sds, 0, sizeof(sds));
3054
3055
3056
3057
3058
3059 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3060 balance, &sds);
3061
 /*
  * Cases where no imbalance exists from the point of view of this_cpu:
  * 1) this_cpu is not the appropriate cpu to perform load balancing
  *    at this level.
  * 2) There is no busy sibling group to pull from.
  * 3) The local group is itself the busiest group.
  * 4) The local group is more busy than the average for this
  *    sched_domain.
  * 5) The imbalance is within the specified limit.
  *
  * Exception: on newly idle balance, if the local group has spare
  * capacity and the busiest group has none, force a balance so the
  * waking cpu can take over some of that load.
  */
3076 if (!(*balance))
3077 goto ret;
3078
3079 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
3080 check_asym_packing(sd, &sds, this_cpu, imbalance))
3081 return sds.busiest;
3082
3083 if (!sds.busiest || sds.busiest_nr_running == 0)
3084 goto out_balanced;
3085
3086
3087 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3088 !sds.busiest_has_capacity)
3089 goto force_balance;
3090
3091 if (sds.this_load >= sds.max_load)
3092 goto out_balanced;
3093
3094 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3095
3096 if (sds.this_load >= sds.avg_load)
3097 goto out_balanced;
3098
3099
3100
3101
3102
3103
3104
3105 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3106 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3107 goto out_balanced;
3108 } else {
3109
3110
3111
3112
3113
3114
3115 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3116 sds.busiest_nr_running <= sds.busiest_group_weight)
3117 goto out_balanced;
3118 }
3119
3120force_balance:
3121
3122 calculate_imbalance(&sds, this_cpu, imbalance);
3123 return sds.busiest;
3124
3125out_balanced:
3126
3127
3128
3129
3130 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3131 return sds.busiest;
3132ret:
3133 *imbalance = 0;
3134 return NULL;
3135}
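/*
 * Illustrative summary of the decision ladder above: bail out when
 * this_cpu should not balance at this level; force a balance for asym
 * packing while idle, or when a newly idle cpu has spare capacity that
 * the busiest group lacks; otherwise declare the domain balanced when
 * there is no busy group, when the local group is at least as loaded as
 * the busiest group or as the domain average, or when the remaining gap
 * falls within sd->imbalance_pct (or within the idle-cpu comparison for
 * an idle cpu).
 */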
3136
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
3140static struct rq *
3141find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3142 enum cpu_idle_type idle, unsigned long imbalance,
3143 const struct cpumask *cpus)
3144{
3145 struct rq *busiest = NULL, *rq;
3146 unsigned long max_load = 0;
3147 int i;
3148
3149 for_each_cpu(i, sched_group_cpus(group)) {
3150 unsigned long power = power_of(i);
3151 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3152 unsigned long wl;
3153
3154 if (!capacity)
3155 capacity = fix_small_capacity(sd, group);
3156
3157 if (!cpumask_test_cpu(i, cpus))
3158 continue;
3159
3160 rq = cpu_rq(i);
3161 wl = weighted_cpuload(i);
3162
3163
3164
3165
3166
3167 if (capacity && rq->nr_running == 1 && wl > imbalance)
3168 continue;
3169
3170
3171
3172
3173
3174
3175
3176 wl = (wl * SCHED_LOAD_SCALE) / power;
3177
3178 if (wl > max_load) {
3179 max_load = wl;
3180 busiest = rq;
3181 }
3182 }
3183
3184 return busiest;
3185}
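/*
 * Worked example (illustrative, assuming SCHED_LOAD_SCALE == 1024): a CPU
 * with weighted_cpuload() = 1024 and cpu_power = 512 scores
 * wl = 1024 * 1024 / 512 = 2048, and therefore looks busier than a
 * full-power CPU (cpu_power = 1024) carrying the same weighted load
 * (wl = 1024), so tasks tend to be pulled away from the lower-capacity
 * CPU first.
 */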
3186
/*
 * Upper bound on how far sd->balance_interval is allowed to back off
 * when every candidate task turns out to be pinned by cpu affinity.
 */
3191#define MAX_PINNED_INTERVAL 512
3192
3193
3194static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3195
3196static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3197 int busiest_cpu, int this_cpu)
3198{
3199 if (idle == CPU_NEWLY_IDLE) {
3200
3201
3202
3203
3204
3205
3206 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
3207 return 1;
3208
 /*
  * The only task running on a non-idle cpu can be moved to this cpu
  * in an attempt to completely free up the other CPU package, but
  * that is only worthwhile under a power-savings policy.
  *
  * If this domain shares cpu power (SMT siblings) and the parent
  * domain has not requested power-savings balancing, or if
  * sched_mc power savings is below the WAKEUP level, don't use
  * active balancing for the newly-idle case.
  *
  * Under a normal imbalance the busiest runqueue has more than one
  * task, move_tasks() succeeds and this path is never reached.
  */
3228 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3229 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3230 return 0;
3231
3232 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3233 return 0;
3234 }
3235
3236 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
3237}
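/*
 * Illustrative note: outside the newly-idle special cases above, active
 * balancing is only attempted after repeated regular balance failures,
 * i.e. once sd->nr_balance_failed exceeds sd->cache_nice_tries + 2.
 */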
3238
3239static int active_load_balance_cpu_stop(void *data);
3240
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
3245static int load_balance(int this_cpu, struct rq *this_rq,
3246 struct sched_domain *sd, enum cpu_idle_type idle,
3247 int *balance)
3248{
3249 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3250 struct sched_group *group;
3251 unsigned long imbalance;
3252 struct rq *busiest;
3253 unsigned long flags;
3254 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3255
3256 cpumask_copy(cpus, cpu_active_mask);
3257
3258
3259
3260
3261
3262
3263
3264 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3265 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3266 sd_idle = 1;
3267
3268 schedstat_inc(sd, lb_count[idle]);
3269
3270redo:
3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3272 cpus, balance);
3273
3274 if (*balance == 0)
3275 goto out_balanced;
3276
3277 if (!group) {
3278 schedstat_inc(sd, lb_nobusyg[idle]);
3279 goto out_balanced;
3280 }
3281
3282 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
3283 if (!busiest) {
3284 schedstat_inc(sd, lb_nobusyq[idle]);
3285 goto out_balanced;
3286 }
3287
3288 BUG_ON(busiest == this_rq);
3289
3290 schedstat_add(sd, lb_imbalance[idle], imbalance);
3291
3292 ld_moved = 0;
3293 if (busiest->nr_running > 1) {
3294
3295
3296
3297
3298
3299
3300 local_irq_save(flags);
3301 double_rq_lock(this_rq, busiest);
3302 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3303 imbalance, sd, idle, &all_pinned);
3304 double_rq_unlock(this_rq, busiest);
3305 local_irq_restore(flags);
3306
3307
3308
3309
3310 if (ld_moved && this_cpu != smp_processor_id())
3311 resched_cpu(this_cpu);
3312
3313
3314 if (unlikely(all_pinned)) {
3315 cpumask_clear_cpu(cpu_of(busiest), cpus);
3316 if (!cpumask_empty(cpus))
3317 goto redo;
3318 goto out_balanced;
3319 }
3320 }
3321
3322 if (!ld_moved) {
3323 schedstat_inc(sd, lb_failed[idle]);
3324
3325
3326
3327
3328
3329
3330 if (idle != CPU_NEWLY_IDLE)
3331 sd->nr_balance_failed++;
3332
3333 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3334 this_cpu)) {
3335 raw_spin_lock_irqsave(&busiest->lock, flags);
3336
3337
3338
3339
3340
3341 if (!cpumask_test_cpu(this_cpu,
3342 &busiest->curr->cpus_allowed)) {
3343 raw_spin_unlock_irqrestore(&busiest->lock,
3344 flags);
3345 all_pinned = 1;
3346 goto out_one_pinned;
3347 }
3348
3349
3350
3351
3352
3353
3354 if (!busiest->active_balance) {
3355 busiest->active_balance = 1;
3356 busiest->push_cpu = this_cpu;
3357 active_balance = 1;
3358 }
3359 raw_spin_unlock_irqrestore(&busiest->lock, flags);
3360
3361 if (active_balance)
3362 stop_one_cpu_nowait(cpu_of(busiest),
3363 active_load_balance_cpu_stop, busiest,
3364 &busiest->active_balance_work);
3365
3366
3367
3368
3369
3370 sd->nr_balance_failed = sd->cache_nice_tries+1;
3371 }
3372 } else
3373 sd->nr_balance_failed = 0;
3374
3375 if (likely(!active_balance)) {
3376
3377 sd->balance_interval = sd->min_interval;
3378 } else {
3379
3380
3381
3382
3383
3384
3385 if (sd->balance_interval < sd->max_interval)
3386 sd->balance_interval *= 2;
3387 }
3388
3389 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3390 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3391 ld_moved = -1;
3392
3393 goto out;
3394
3395out_balanced:
3396 schedstat_inc(sd, lb_balanced[idle]);
3397
3398 sd->nr_balance_failed = 0;
3399
3400out_one_pinned:
3401
3402 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3403 (sd->balance_interval < sd->max_interval))
3404 sd->balance_interval *= 2;
3405
3406 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3407 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3408 ld_moved = -1;
3409 else
3410 ld_moved = 0;
3411out:
3412 return ld_moved;
3413}
3414
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
3419static void idle_balance(int this_cpu, struct rq *this_rq)
3420{
3421 struct sched_domain *sd;
3422 int pulled_task = 0;
3423 unsigned long next_balance = jiffies + HZ;
3424
3425 this_rq->idle_stamp = this_rq->clock;
3426
3427 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3428 return;
3429
3430
3431
3432
3433 raw_spin_unlock(&this_rq->lock);
3434
3435 update_shares(this_cpu);
3436 for_each_domain(this_cpu, sd) {
3437 unsigned long interval;
3438 int balance = 1;
3439
3440 if (!(sd->flags & SD_LOAD_BALANCE))
3441 continue;
3442
3443 if (sd->flags & SD_BALANCE_NEWIDLE) {
3444
3445 pulled_task = load_balance(this_cpu, this_rq,
3446 sd, CPU_NEWLY_IDLE, &balance);
3447 }
3448
3449 interval = msecs_to_jiffies(sd->balance_interval);
3450 if (time_after(next_balance, sd->last_balance + interval))
3451 next_balance = sd->last_balance + interval;
3452 if (pulled_task) {
3453 this_rq->idle_stamp = 0;
3454 break;
3455 }
3456 }
3457
3458 raw_spin_lock(&this_rq->lock);
3459
3460 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3461
3462
3463
3464
3465 this_rq->next_balance = next_balance;
3466 }
3467}
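/*
 * Illustrative note: the early return in idle_balance() means a CPU whose
 * recent average idle period (this_rq->avg_idle) is shorter than
 * sysctl_sched_migration_cost does not bother pulling tasks, presumably
 * because a migrated task would cost more to move than the brief idle
 * gap it could fill.
 */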
3468
/*
 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
 * a running task from the busiest CPU onto an idle target CPU. It tries
 * to keep at least one task running on each physical CPU where possible,
 * and avoids physical / logical imbalances.
 */
3475static int active_load_balance_cpu_stop(void *data)
3476{
3477 struct rq *busiest_rq = data;
3478 int busiest_cpu = cpu_of(busiest_rq);
3479 int target_cpu = busiest_rq->push_cpu;
3480 struct rq *target_rq = cpu_rq(target_cpu);
3481 struct sched_domain *sd;
3482
3483 raw_spin_lock_irq(&busiest_rq->lock);
3484
3485
3486 if (unlikely(busiest_cpu != smp_processor_id() ||
3487 !busiest_rq->active_balance))
3488 goto out_unlock;
3489
3490
3491 if (busiest_rq->nr_running <= 1)
3492 goto out_unlock;
3493
3494
3495
3496
3497
3498
3499 BUG_ON(busiest_rq == target_rq);
3500
3501
3502 double_lock_balance(busiest_rq, target_rq);
3503
3504
3505 for_each_domain(target_cpu, sd) {
3506 if ((sd->flags & SD_LOAD_BALANCE) &&
3507 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3508 break;
3509 }
3510
3511 if (likely(sd)) {
3512 schedstat_inc(sd, alb_count);
3513
3514 if (move_one_task(target_rq, target_cpu, busiest_rq,
3515 sd, CPU_IDLE))
3516 schedstat_inc(sd, alb_pushed);
3517 else
3518 schedstat_inc(sd, alb_failed);
3519 }
3520 double_unlock_balance(busiest_rq, target_rq);
3521out_unlock:
3522 busiest_rq->active_balance = 0;
3523 raw_spin_unlock_irq(&busiest_rq->lock);
3524 return 0;
3525}
3526
3527#ifdef CONFIG_NO_HZ
3528
3529static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3530
3531static void trigger_sched_softirq(void *data)
3532{
3533 raise_softirq_irqoff(SCHED_SOFTIRQ);
3534}
3535
3536static inline void init_sched_softirq_csd(struct call_single_data *csd)
3537{
3538 csd->func = trigger_sched_softirq;
3539 csd->info = NULL;
3540 csd->flags = 0;
3541 csd->priv = 0;
3542}
3543
/*
 * NOHZ idle load balancing:
 * - One of the idle CPUs nominates itself as the idle load balancer
 *   while entering idle.
 * - That CPU also goes tickless when idle, like every other idle CPU.
 * - When a busy CPU notices that idle rebalancing may be needed
 *   (nohz_kick_needed()), it kicks the idle load balancer, which then
 *   runs the idle load balance on behalf of all the idle CPUs.
 */
3554static struct {
3555 atomic_t load_balancer;
3556 atomic_t first_pick_cpu;
3557 atomic_t second_pick_cpu;
3558 cpumask_var_t idle_cpus_mask;
3559 cpumask_var_t grp_idle_mask;
3560 unsigned long next_balance;
3561} nohz ____cacheline_aligned;
3562
3563int get_nohz_load_balancer(void)
3564{
3565 return atomic_read(&nohz.load_balancer);
3566}
3567
3568#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
 * lowest_flag_domain - Return lowest sched_domain containing flag.
 * @cpu:	The cpu whose lowest level of sched domain is to
 *		be returned.
 * @flag:	The flag to check for the lowest sched_domain
 *		for the given cpu.
 *
 * Returns the lowest sched_domain of a cpu which contains the given flag.
 */
3578static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3579{
3580 struct sched_domain *sd;
3581
3582 for_each_domain(cpu, sd)
3583 if (sd && (sd->flags & flag))
3584 break;
3585
3586 return sd;
3587}
3588
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the current sched_domain.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that have the
 * 'flag' set, starting from the lowest such sched_domain to the highest.
 */
3599#define for_each_flag_domain(cpu, sd, flag) \
3600 for (sd = lowest_flag_domain(cpu, flag); \
3601 (sd && (sd->flags & flag)); sd = sd->parent)
3602
/**
 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
 * @ilb_group:	group to be checked for semi-idleness
 *
 * Returns:	1 if the group is semi-idle. 0 otherwise.
 *
 * A sched_group is semi-idle when it has at least one idle CPU and at
 * least one non-idle CPU. This helper checks whether the given
 * sched_group is semi-idle.
 */
3613static inline int is_semi_idle_group(struct sched_group *ilb_group)
3614{
3615 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3616 sched_group_cpus(ilb_group));
3617
3618
3619
3620
3621
3622 if (cpumask_empty(nohz.grp_idle_mask))
3623 return 0;
3624
3625 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3626 return 0;
3627
3628 return 1;
3629}
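/*
 * Illustrative example: on a core with CPUs {4,5,6,7} where only CPUs 5
 * and 7 are in nohz.idle_cpus_mask, the intersection is neither empty nor
 * the whole group, so the group is semi-idle and CPU 5 (the first idle
 * CPU in it) would be the candidate returned by find_new_ilb() below.
 */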
3630
/**
 * find_new_ilb - Finds the optimum idle load balancer for nomination.
 * @cpu:	The cpu which is nominating a new idle_load_balancer.
 *
 * Returns:	the id of the idle load balancer if one exists,
 *		else a value >= nr_cpu_ids.
 *
 * Picks the idle load balancer from a semi-idle power-savings
 * sched_domain, so that a completely idle package/core is not woken up
 * just to do idle load balancing when other, better suited idle cpus
 * are available.
 */
3642static int find_new_ilb(int cpu)
3643{
3644 struct sched_domain *sd;
3645 struct sched_group *ilb_group;
3646
3647
3648
3649
3650
3651 if (!(sched_smt_power_savings || sched_mc_power_savings))
3652 goto out_done;
3653
3654
3655
3656
3657
3658 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3659 goto out_done;
3660
3661 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3662 ilb_group = sd->groups;
3663
3664 do {
3665 if (is_semi_idle_group(ilb_group))
3666 return cpumask_first(nohz.grp_idle_mask);
3667
3668 ilb_group = ilb_group->next;
3669
3670 } while (ilb_group != sd->groups);
3671 }
3672
3673out_done:
3674 return nr_cpu_ids;
3675}
3676#else
3677static inline int find_new_ilb(int call_cpu)
3678{
3679 return nr_cpu_ids;
3680}
3681#endif
3682
/*
 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 * nohz load_balancer CPU (if there is one), otherwise fall back to any
 * idle cpu (if there is one).
 */
3688static void nohz_balancer_kick(int cpu)
3689{
3690 int ilb_cpu;
3691
3692 nohz.next_balance++;
3693
3694 ilb_cpu = get_nohz_load_balancer();
3695
3696 if (ilb_cpu >= nr_cpu_ids) {
3697 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3698 if (ilb_cpu >= nr_cpu_ids)
3699 return;
3700 }
3701
3702 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3703 struct call_single_data *cp;
3704
3705 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3706 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3707 __smp_call_function_single(ilb_cpu, cp, 0);
3708 }
3709 return;
3710}
3711
/*
 * This routine tries to nominate the ilb (idle load balancing) owner
 * among the cpus whose ticks are stopped. The ilb owner does the idle
 * load balancing on behalf of all those cpus.
 *
 * Called with stop_tick == 1 when a cpu enters tickless idle: the cpu is
 * added to nohz.idle_cpus_mask and, if no ilb owner exists, it claims the
 * role (possibly handing it straight to a more power-efficient candidate
 * found by find_new_ilb()).
 *
 * Called with stop_tick == 0 when the tick is restarted: the cpu leaves
 * nohz.idle_cpus_mask and gives up the ilb role if it held it.
 */
3725void select_nohz_load_balancer(int stop_tick)
3726{
3727 int cpu = smp_processor_id();
3728
3729 if (stop_tick) {
3730 if (!cpu_active(cpu)) {
3731 if (atomic_read(&nohz.load_balancer) != cpu)
3732 return;
3733
3734
3735
3736
3737
3738 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3739 nr_cpu_ids) != cpu)
3740 BUG();
3741
3742 return;
3743 }
3744
3745 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3746
3747 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3748 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3749 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3750 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3751
3752 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3753 int new_ilb;
3754
3755
3756 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3757 cpu) != nr_cpu_ids)
3758 return;
3759
3760
3761
3762
3763
3764 new_ilb = find_new_ilb(cpu);
3765 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3766 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3767 resched_cpu(new_ilb);
3768 return;
3769 }
3770 return;
3771 }
3772 } else {
3773 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3774 return;
3775
3776 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3777
3778 if (atomic_read(&nohz.load_balancer) == cpu)
3779 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3780 nr_cpu_ids) != cpu)
3781 BUG();
3782 }
3783 return;
3784}
3785#endif
3786
3787static DEFINE_SPINLOCK(balancing);
3788
/*
 * rebalance_domains checks each scheduling domain of this cpu to see
 * whether it is due to be balanced, and initiates a balancing operation
 * if so.  SD_SERIALIZE domains are serialized through the 'balancing'
 * spinlock.
 */
3795static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3796{
3797 int balance = 1;
3798 struct rq *rq = cpu_rq(cpu);
3799 unsigned long interval;
3800 struct sched_domain *sd;
3801
3802 unsigned long next_balance = jiffies + 60*HZ;
3803 int update_next_balance = 0;
3804 int need_serialize;
3805
3806 update_shares(cpu);
3807
3808 for_each_domain(cpu, sd) {
3809 if (!(sd->flags & SD_LOAD_BALANCE))
3810 continue;
3811
3812 interval = sd->balance_interval;
3813 if (idle != CPU_IDLE)
3814 interval *= sd->busy_factor;
3815
3816
3817 interval = msecs_to_jiffies(interval);
3818 if (unlikely(!interval))
3819 interval = 1;
3820 if (interval > HZ*NR_CPUS/10)
3821 interval = HZ*NR_CPUS/10;
3822
3823 need_serialize = sd->flags & SD_SERIALIZE;
3824
3825 if (need_serialize) {
3826 if (!spin_trylock(&balancing))
3827 goto out;
3828 }
3829
3830 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3831 if (load_balance(cpu, rq, sd, idle, &balance)) {
3832
3833
3834
3835
3836
3837 idle = CPU_NOT_IDLE;
3838 }
3839 sd->last_balance = jiffies;
3840 }
3841 if (need_serialize)
3842 spin_unlock(&balancing);
3843out:
3844 if (time_after(next_balance, sd->last_balance + interval)) {
3845 next_balance = sd->last_balance + interval;
3846 update_next_balance = 1;
3847 }
3848
3849
3850
3851
3852
3853
3854 if (!balance)
3855 break;
3856 }
3857
3858
3859
3860
3861
3862
3863 if (likely(update_next_balance))
3864 rq->next_balance = next_balance;
3865}
3866
3867#ifdef CONFIG_NO_HZ
/*
 * In the CONFIG_NO_HZ case, the cpu that received the balance kick does
 * the rebalancing for all the cpus whose scheduler ticks are stopped.
 */
3872static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3873{
3874 struct rq *this_rq = cpu_rq(this_cpu);
3875 struct rq *rq;
3876 int balance_cpu;
3877
3878 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3879 return;
3880
3881 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3882 if (balance_cpu == this_cpu)
3883 continue;
3884
3885
3886
3887
3888
3889
3890 if (need_resched()) {
3891 this_rq->nohz_balance_kick = 0;
3892 break;
3893 }
3894
3895 raw_spin_lock_irq(&this_rq->lock);
3896 update_rq_clock(this_rq);
3897 update_cpu_load(this_rq);
3898 raw_spin_unlock_irq(&this_rq->lock);
3899
3900 rebalance_domains(balance_cpu, CPU_IDLE);
3901
3902 rq = cpu_rq(balance_cpu);
3903 if (time_after(this_rq->next_balance, rq->next_balance))
3904 this_rq->next_balance = rq->next_balance;
3905 }
3906 nohz.next_balance = this_rq->next_balance;
3907 this_rq->nohz_balance_kick = 0;
3908}
3909
/*
 * Current heuristic for kicking the idle load balancer:
 * - The first busy cpu (first_pick_cpu) asks for a kick only when it has
 *   more than one runnable task; with a single running task in the system
 *   (the common case) no idle load balancing is needed at all.
 * - When a second, different cpu is busy as well (second_pick_cpu), it
 *   asks for a kick as soon as it has any runnable task, since two busy
 *   CPUs suggest the idle CPUs may have work to pull.
 * Nothing is kicked before nohz.next_balance or from a cpu that was idle
 * at the last tick.
 */
3922static inline int nohz_kick_needed(struct rq *rq, int cpu)
3923{
3924 unsigned long now = jiffies;
3925 int ret;
3926 int first_pick_cpu, second_pick_cpu;
3927
3928 if (time_before(now, nohz.next_balance))
3929 return 0;
3930
3931 if (rq->idle_at_tick)
3932 return 0;
3933
3934 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3935 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3936
3937 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3938 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3939 return 0;
3940
3941 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3942 if (ret == nr_cpu_ids || ret == cpu) {
3943 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3944 if (rq->nr_running > 1)
3945 return 1;
3946 } else {
3947 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3948 if (ret == nr_cpu_ids || ret == cpu) {
3949 if (rq->nr_running)
3950 return 1;
3951 }
3952 }
3953 return 0;
3954}
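/*
 * Illustrative scenario: CPUs 0 and 1 are busy, CPUs 2 and 3 are tickless
 * idle with CPU 2 as the nohz load balancer.  CPU 0 (holding
 * first_pick_cpu) with several runnable tasks makes nohz_kick_needed()
 * return 1, trigger_load_balance() then calls nohz_balancer_kick(), which
 * sets CPU 2's nohz_balance_kick and sends it the softirq-raising IPI;
 * CPU 2 subsequently runs nohz_idle_balance() on behalf of CPUs 2 and 3.
 */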
3955#else
3956static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3957#endif
3958
/*
 * run_rebalance_domains is the SCHED_SOFTIRQ handler, triggered when
 * needed from the scheduler tick and, with nohz_balance_kick set, for
 * nohz idle balancing on behalf of other idle cpus.
 */
3963static void run_rebalance_domains(struct softirq_action *h)
3964{
3965 int this_cpu = smp_processor_id();
3966 struct rq *this_rq = cpu_rq(this_cpu);
3967 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3968 CPU_IDLE : CPU_NOT_IDLE;
3969
3970 rebalance_domains(this_cpu, idle);
3971
3972
3973
3974
3975
3976
3977 nohz_idle_balance(this_cpu, idle);
3978}
3979
3980static inline int on_null_domain(int cpu)
3981{
3982 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3983}
3984
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
3988static inline void trigger_load_balance(struct rq *rq, int cpu)
3989{
3990
3991 if (time_after_eq(jiffies, rq->next_balance) &&
3992 likely(!on_null_domain(cpu)))
3993 raise_softirq(SCHED_SOFTIRQ);
3994#ifdef CONFIG_NO_HZ
3995 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3996 nohz_balancer_kick(cpu);
3997#endif
3998}
3999
4000static void rq_online_fair(struct rq *rq)
4001{
4002 update_sysctl();
4003}
4004
4005static void rq_offline_fair(struct rq *rq)
4006{
4007 update_sysctl();
4008}
4009
4010#else
/*
 * Without CONFIG_SMP there is nothing to balance between CPUs, so
 * idle_balance() collapses to a no-op.
 */
4015static inline void idle_balance(int cpu, struct rq *rq)
4016{
4017}
4018
4019#endif
4020
4021
4022
4023
4024static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4025{
4026 struct cfs_rq *cfs_rq;
4027 struct sched_entity *se = &curr->se;
4028
4029 for_each_sched_entity(se) {
4030 cfs_rq = cfs_rq_of(se);
4031 entity_tick(cfs_rq, se, queued);
4032 }
4033}
4034
/*
 * Scheduler hook called at fork time, with the new child task as the
 * argument, from the parent's context:
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
4040static void task_fork_fair(struct task_struct *p)
4041{
4042 struct cfs_rq *cfs_rq = task_cfs_rq(current);
4043 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
4044 int this_cpu = smp_processor_id();
4045 struct rq *rq = this_rq();
4046 unsigned long flags;
4047
4048 raw_spin_lock_irqsave(&rq->lock, flags);
4049
4050 update_rq_clock(rq);
4051
4052 if (unlikely(task_cpu(p) != this_cpu)) {
4053 rcu_read_lock();
4054 __set_task_cpu(p, this_cpu);
4055 rcu_read_unlock();
4056 }
4057
4058 update_curr(cfs_rq);
4059
4060 if (curr)
4061 se->vruntime = curr->vruntime;
4062 place_entity(cfs_rq, se, 1);
4063
4064 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
4065
4066
4067
4068
4069 swap(curr->vruntime, se->vruntime);
4070 resched_task(rq->curr);
4071 }
4072
4073 se->vruntime -= cfs_rq->min_vruntime;
4074
4075 raw_spin_unlock_irqrestore(&rq->lock, flags);
4076}
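/*
 * Illustrative note: the child's vruntime is made relative by subtracting
 * cfs_rq->min_vruntime at the end of task_fork_fair() because the child
 * may be enqueued on a different CPU; the (non-wakeup) enqueue path adds
 * the destination cfs_rq's min_vruntime back, so the child keeps its
 * relative placement regardless of where it first runs.
 */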
4077
/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
4082static void prio_changed_fair(struct rq *rq, struct task_struct *p,
4083 int oldprio, int running)
4084{
 /*
  * Reschedule if we are currently running on this runqueue and
  * our priority decreased, or if we are not currently running on
  * this runqueue and our priority is higher than the current's.
  */
4090 if (running) {
4091 if (p->prio > oldprio)
4092 resched_task(rq->curr);
4093 } else
4094 check_preempt_curr(rq, p, 0);
4095}
4096
4097
4098
4099
4100static void switched_to_fair(struct rq *rq, struct task_struct *p,
4101 int running)
4102{
 /*
  * We were most likely switched from sched_rt, so
  * kick off the schedule if running, otherwise just see
  * if we can still preempt the current task.
  */
4108 if (running)
4109 resched_task(rq->curr);
4110 else
4111 check_preempt_curr(rq, p, 0);
4112}
4113
/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
4119static void set_curr_task_fair(struct rq *rq)
4120{
4121 struct sched_entity *se = &rq->curr->se;
4122
4123 for_each_sched_entity(se)
4124 set_next_entity(cfs_rq_of(se), se);
4125}
4126
4127#ifdef CONFIG_FAIR_GROUP_SCHED
4128static void task_move_group_fair(struct task_struct *p, int on_rq)
4129{
 /*
  * Move the task to the new group's cfs_rq.  When the task is not
  * currently queued, its ->vruntime is rebased: the old cfs_rq's
  * min_vruntime is subtracted before set_task_rq() switches the
  * runqueue and the new cfs_rq's min_vruntime is added afterwards,
  * so the task keeps its relative position on the new timeline.
  * A task that is currently queued is moved by set_task_rq() alone.
  */
4143 if (!on_rq)
4144 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4145 set_task_rq(p, task_cpu(p));
4146 if (!on_rq)
4147 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
4148}
4149#endif
4150
4151static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
4152{
4153 struct sched_entity *se = &task->se;
4154 unsigned int rr_interval = 0;
4155
4156
4157
4158
4159
4160 if (rq->cfs.load.weight)
4161 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
4162
4163 return rr_interval;
4164}
4165
4166
4167
4168
4169static const struct sched_class fair_sched_class = {
4170 .next = &idle_sched_class,
4171 .enqueue_task = enqueue_task_fair,
4172 .dequeue_task = dequeue_task_fair,
4173 .yield_task = yield_task_fair,
4174
4175 .check_preempt_curr = check_preempt_wakeup,
4176
4177 .pick_next_task = pick_next_task_fair,
4178 .put_prev_task = put_prev_task_fair,
4179
4180#ifdef CONFIG_SMP
4181 .select_task_rq = select_task_rq_fair,
4182
4183 .rq_online = rq_online_fair,
4184 .rq_offline = rq_offline_fair,
4185
4186 .task_waking = task_waking_fair,
4187#endif
4188
4189 .set_curr_task = set_curr_task_fair,
4190 .task_tick = task_tick_fair,
4191 .task_fork = task_fork_fair,
4192
4193 .prio_changed = prio_changed_fair,
4194 .switched_to = switched_to_fair,
4195
4196 .get_rr_interval = get_rr_interval_fair,
4197
4198#ifdef CONFIG_FAIR_GROUP_SCHED
4199 .task_move_group = task_move_group_fair,
4200#endif
4201};
4202
4203#ifdef CONFIG_SCHED_DEBUG
4204static void print_cfs_stats(struct seq_file *m, int cpu)
4205{
4206 struct cfs_rq *cfs_rq;
4207
4208 rcu_read_lock();
4209 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
4210 print_cfs_rq(m, cpu, cfs_rq);
4211 rcu_read_unlock();
4212}
4213#endif
4214