/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 */
#include "sched.h"

#include <trace/events/sched.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable.
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 *
 * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity.
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from the global (tg) pool to each local
 * (per-cfs_rq) pool at a time.
 *
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

/*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
 *
 * (default: ~20%)
 */
unsigned int capacity_margin = 1280;

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 */
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}
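/*
 * Worked example (illustrative, derived from the defaults above): with eight
 * or more online CPUs and SCHED_TUNABLESCALING_LOG, cpus is clamped to 8 and
 * factor = 1 + ilog2(8) = 4, so update_sysctl() scales sysctl_sched_latency
 * from 6ms to 24ms and sysctl_sched_min_granularity from 0.75ms to 3ms.
 */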
static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}
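/*
 * Note (added for clarity): WMULT_CONST is roughly 2^32, so inv_weight caches
 * approximately 2^32 / weight. A later division by the weight can then be
 * performed as a multiply by inv_weight followed by a 32-bit right shift,
 * which is exactly what __calc_delta() below relies on.
 */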
/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 bit multiply */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
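/*
 * Worked example (illustrative): for a nice-0 entity, scale_load_down(weight)
 * is 1024. If the (scaled) queue weight lw->weight is 2048, inv_weight is
 * about 2^32 / 2048 = 2^21, so fact = 1024 * 2^21 = 2^31 and the result is
 * delta_exec * 2^31 >> 32 = delta_exec / 2 - i.e. the entity is credited half
 * of the wall-clock delta, as expected for half of the queue weight.
 */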
242
243
244const struct sched_class fair_sched_class;
245
246
247
248
249
250#ifdef CONFIG_FAIR_GROUP_SCHED
251
252
253static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
254{
255 return cfs_rq->rq;
256}
257
258
259#define entity_is_task(se) (!se->my_q)
260
261static inline struct task_struct *task_of(struct sched_entity *se)
262{
263 SCHED_WARN_ON(!entity_is_task(se));
264 return container_of(se, struct task_struct, se);
265}
266
267
268#define for_each_sched_entity(se) \
269 for (; se; se = se->parent)
270
271static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
272{
273 return p->se.cfs_rq;
274}
275
276
277static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
278{
279 return se->cfs_rq;
280}
281
282
283static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
284{
285 return grp->my_q;
286}
287
288static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
289{
290 if (!cfs_rq->on_list) {
291 struct rq *rq = rq_of(cfs_rq);
292 int cpu = cpu_of(rq);
293
294
295
296
297
298
299
300
301
302 if (cfs_rq->tg->parent &&
303 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
304
305
306
307
308
309
310 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
311 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
312
313
314
315
316
317 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
318 } else if (!cfs_rq->tg->parent) {
319
320
321
322
323 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
324 &rq->leaf_cfs_rq_list);
325
326
327
328
329 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
330 } else {
331
332
333
334
335
336
337 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
338 rq->tmp_alone_branch);
339
340
341
342
343 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
344 }
345
346 cfs_rq->on_list = 1;
347 }
348}
349
350static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
351{
352 if (cfs_rq->on_list) {
353 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
354 cfs_rq->on_list = 0;
355 }
356}
357
358
359#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
360 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
361 leaf_cfs_rq_list)
362
363
364static inline struct cfs_rq *
365is_same_group(struct sched_entity *se, struct sched_entity *pse)
366{
367 if (se->cfs_rq == pse->cfs_rq)
368 return se->cfs_rq;
369
370 return NULL;
371}
372
373static inline struct sched_entity *parent_entity(struct sched_entity *se)
374{
375 return se->parent;
376}
377
378static void
379find_matching_se(struct sched_entity **se, struct sched_entity **pse)
380{
381 int se_depth, pse_depth;
382
383
384
385
386
387
388
389
390
391 se_depth = (*se)->depth;
392 pse_depth = (*pse)->depth;
393
394 while (se_depth > pse_depth) {
395 se_depth--;
396 *se = parent_entity(*se);
397 }
398
399 while (pse_depth > se_depth) {
400 pse_depth--;
401 *pse = parent_entity(*pse);
402 }
403
404 while (!is_same_group(*se, *pse)) {
405 *se = parent_entity(*se);
406 *pse = parent_entity(*pse);
407 }
408}
409
410#else
411
412static inline struct task_struct *task_of(struct sched_entity *se)
413{
414 return container_of(se, struct task_struct, se);
415}
416
417static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
418{
419 return container_of(cfs_rq, struct rq, cfs);
420}
421
422#define entity_is_task(se) 1
423
424#define for_each_sched_entity(se) \
425 for (; se; se = NULL)
426
427static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
428{
429 return &task_rq(p)->cfs;
430}
431
432static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
433{
434 struct task_struct *p = task_of(se);
435 struct rq *rq = task_rq(p);
436
437 return &rq->cfs;
438}
439
440
441static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
442{
443 return NULL;
444}
445
446static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
447{
448}
449
450static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
451{
452}
453
454#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
455 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
456
457static inline struct sched_entity *parent_entity(struct sched_entity *se)
458{
459 return NULL;
460}
461
462static inline void
463find_matching_se(struct sched_entity **se, struct sched_entity **pse)
464{
465}
466
467#endif
468
469static __always_inline
470void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
471
472
473
474
475
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

	u64 vruntime = cfs_rq->min_vruntime;

	if (curr) {
		if (curr->on_rq)
			vruntime = curr->vruntime;
		else
			curr = NULL;
	}

	if (leftmost) { /* non-empty tree */
		struct sched_entity *se;
		se = rb_entry(leftmost, struct sched_entity, run_node);

		if (!curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	bool leftmost = true;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color_cached(&se->run_node,
			       &cfs_rq->tasks_timeline, leftmost);
}
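/*
 * Note (added for clarity): the tree is keyed by vruntime via entity_before(),
 * and the cached-leftmost rbtree keeps a pointer to the smallest key, so
 * __pick_first_entity() below can return the next entity to run in O(1)
 * without walking down the tree.
 */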
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
569
570struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
571{
572 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
573
574 if (!left)
575 return NULL;
576
577 return rb_entry(left, struct sched_entity, run_node);
578}
579
580static struct sched_entity *__pick_next_entity(struct sched_entity *se)
581{
582 struct rb_node *next = rb_next(&se->run_node);
583
584 if (!next)
585 return NULL;
586
587 return rb_entry(next, struct sched_entity, run_node);
588}
589
590#ifdef CONFIG_SCHED_DEBUG
591struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
592{
593 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
594
595 if (!last)
596 return NULL;
597
598 return rb_entry(last, struct sched_entity, run_node);
599}
600
601
602
603
604
605int sched_proc_update_handler(struct ctl_table *table, int write,
606 void __user *buffer, size_t *lenp,
607 loff_t *ppos)
608{
609 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
610 unsigned int factor = get_update_sysctl_factor();
611
612 if (ret || !write)
613 return ret;
614
615 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
616 sysctl_sched_min_granularity);
617
618#define WRT_SYSCTL(name) \
619 (normalized_sysctl_##name = sysctl_##name / (factor))
620 WRT_SYSCTL(sched_min_granularity);
621 WRT_SYSCTL(sched_latency);
622 WRT_SYSCTL(sched_wakeup_granularity);
623#undef WRT_SYSCTL
624
625 return 0;
626}
627#endif
628
/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	if (unlikely(nr_running > sched_nr_latency))
		return nr_running * sysctl_sched_min_granularity;
	else
		return sysctl_sched_latency;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
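/*
 * Worked example (illustrative, with the unscaled defaults): sched_nr_latency
 * is 8, so 4 runnable tasks share a 6ms period while 16 tasks stretch it to
 * 16 * 0.75ms = 12ms. Within a period, sched_slice() hands each entity
 * period * se->load.weight / cfs_rq->load.weight; two nice-0 tasks therefore
 * get 3ms each, and calc_delta_fair() turns such wall-clock slices into
 * weight-scaled vruntime.
 */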
693
694#ifdef CONFIG_SMP
695
696#include "sched-pelt.h"
697
698static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
699static unsigned long task_h_load(struct task_struct *p);
700
701
702void init_entity_runnable_average(struct sched_entity *se)
703{
704 struct sched_avg *sa = &se->avg;
705
706 memset(sa, 0, sizeof(*sa));
707
708
709
710
711
712
713
714 if (entity_is_task(se))
715 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
716
717 se->runnable_weight = se->load.weight;
718
719
720}
721
722static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
723static void attach_entity_cfs_rq(struct sched_entity *se);
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750void post_init_entity_util_avg(struct sched_entity *se)
751{
752 struct cfs_rq *cfs_rq = cfs_rq_of(se);
753 struct sched_avg *sa = &se->avg;
754 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
755
756 if (cap > 0) {
757 if (cfs_rq->avg.util_avg != 0) {
758 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
759 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
760
761 if (sa->util_avg > cap)
762 sa->util_avg = cap;
763 } else {
764 sa->util_avg = cap;
765 }
766 }
767
768 if (entity_is_task(se)) {
769 struct task_struct *p = task_of(se);
770 if (p->sched_class != &fair_sched_class) {
771
772
773
774
775
776
777
778
779
780
781 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
782 return;
783 }
784 }
785
786 attach_entity_cfs_rq(se);
787}
788
789#else
790void init_entity_runnable_average(struct sched_entity *se)
791{
792}
793void post_init_entity_util_avg(struct sched_entity *se)
794{
795}
796static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
797{
798}
799#endif
800
801
802
803
804static void update_curr(struct cfs_rq *cfs_rq)
805{
806 struct sched_entity *curr = cfs_rq->curr;
807 u64 now = rq_clock_task(rq_of(cfs_rq));
808 u64 delta_exec;
809
810 if (unlikely(!curr))
811 return;
812
813 delta_exec = now - curr->exec_start;
814 if (unlikely((s64)delta_exec <= 0))
815 return;
816
817 curr->exec_start = now;
818
819 schedstat_set(curr->statistics.exec_max,
820 max(delta_exec, curr->statistics.exec_max));
821
822 curr->sum_exec_runtime += delta_exec;
823 schedstat_add(cfs_rq->exec_clock, delta_exec);
824
825 curr->vruntime += calc_delta_fair(delta_exec, curr);
826 update_min_vruntime(cfs_rq);
827
828 if (entity_is_task(curr)) {
829 struct task_struct *curtask = task_of(curr);
830
831 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
832 cgroup_account_cputime(curtask, delta_exec);
833 account_group_exec_runtime(curtask, delta_exec);
834 }
835
836 account_cfs_rq_runtime(cfs_rq, delta_exec);
837}
838
839static void update_curr_fair(struct rq *rq)
840{
841 update_curr(cfs_rq_of(&rq->curr->se));
842}
843
844static inline void
845update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
846{
847 u64 wait_start, prev_wait_start;
848
849 if (!schedstat_enabled())
850 return;
851
852 wait_start = rq_clock(rq_of(cfs_rq));
853 prev_wait_start = schedstat_val(se->statistics.wait_start);
854
855 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
856 likely(wait_start > prev_wait_start))
857 wait_start -= prev_wait_start;
858
859 __schedstat_set(se->statistics.wait_start, wait_start);
860}
861
862static inline void
863update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
864{
865 struct task_struct *p;
866 u64 delta;
867
868 if (!schedstat_enabled())
869 return;
870
871 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
872
873 if (entity_is_task(se)) {
874 p = task_of(se);
875 if (task_on_rq_migrating(p)) {
876
877
878
879
880
881 __schedstat_set(se->statistics.wait_start, delta);
882 return;
883 }
884 trace_sched_stat_wait(p, delta);
885 }
886
887 __schedstat_set(se->statistics.wait_max,
888 max(schedstat_val(se->statistics.wait_max), delta));
889 __schedstat_inc(se->statistics.wait_count);
890 __schedstat_add(se->statistics.wait_sum, delta);
891 __schedstat_set(se->statistics.wait_start, 0);
892}
893
894static inline void
895update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
896{
897 struct task_struct *tsk = NULL;
898 u64 sleep_start, block_start;
899
900 if (!schedstat_enabled())
901 return;
902
903 sleep_start = schedstat_val(se->statistics.sleep_start);
904 block_start = schedstat_val(se->statistics.block_start);
905
906 if (entity_is_task(se))
907 tsk = task_of(se);
908
909 if (sleep_start) {
910 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
911
912 if ((s64)delta < 0)
913 delta = 0;
914
915 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
916 __schedstat_set(se->statistics.sleep_max, delta);
917
918 __schedstat_set(se->statistics.sleep_start, 0);
919 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
920
921 if (tsk) {
922 account_scheduler_latency(tsk, delta >> 10, 1);
923 trace_sched_stat_sleep(tsk, delta);
924 }
925 }
926 if (block_start) {
927 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
928
929 if ((s64)delta < 0)
930 delta = 0;
931
932 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
933 __schedstat_set(se->statistics.block_max, delta);
934
935 __schedstat_set(se->statistics.block_start, 0);
936 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
937
938 if (tsk) {
939 if (tsk->in_iowait) {
940 __schedstat_add(se->statistics.iowait_sum, delta);
941 __schedstat_inc(se->statistics.iowait_count);
942 trace_sched_stat_iowait(tsk, delta);
943 }
944
945 trace_sched_stat_blocked(tsk, delta);
946
947
948
949
950
951
952 if (unlikely(prof_on == SLEEP_PROFILING)) {
953 profile_hits(SLEEP_PROFILING,
954 (void *)get_wchan(tsk),
955 delta >> 20);
956 }
957 account_scheduler_latency(tsk, delta >> 10, 0);
958 }
959 }
960}
961
962
963
964
965static inline void
966update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
967{
968 if (!schedstat_enabled())
969 return;
970
971
972
973
974
975 if (se != cfs_rq->curr)
976 update_stats_wait_start(cfs_rq, se);
977
978 if (flags & ENQUEUE_WAKEUP)
979 update_stats_enqueue_sleeper(cfs_rq, se);
980}
981
982static inline void
983update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
984{
985
986 if (!schedstat_enabled())
987 return;
988
989
990
991
992
993 if (se != cfs_rq->curr)
994 update_stats_wait_end(cfs_rq, se);
995
996 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
997 struct task_struct *tsk = task_of(se);
998
999 if (tsk->state & TASK_INTERRUPTIBLE)
1000 __schedstat_set(se->statistics.sleep_start,
1001 rq_clock(rq_of(cfs_rq)));
1002 if (tsk->state & TASK_UNINTERRUPTIBLE)
1003 __schedstat_set(se->statistics.block_start,
1004 rq_clock(rq_of(cfs_rq)));
1005 }
1006}
1007
1008
1009
1010
1011static inline void
1012update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1013{
1014
1015
1016
1017 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1018}
1019
1020
1021
1022
1023
1024#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the tasks virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
1038
1039struct numa_group {
1040 atomic_t refcount;
1041
1042 spinlock_t lock;
1043 int nr_tasks;
1044 pid_t gid;
1045 int active_nodes;
1046
1047 struct rcu_head rcu;
1048 unsigned long total_faults;
1049 unsigned long max_faults_cpu;
1050
1051
1052
1053
1054
1055 unsigned long *faults_cpu;
1056 unsigned long faults[0];
1057};
1058
1059static inline unsigned long group_faults_priv(struct numa_group *ng);
1060static inline unsigned long group_faults_shared(struct numa_group *ng);
1061
1062static unsigned int task_nr_scan_windows(struct task_struct *p)
1063{
1064 unsigned long rss = 0;
1065 unsigned long nr_scan_pages;
1066
1067
1068
1069
1070
1071
1072 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1073 rss = get_mm_rss(p->mm);
1074 if (!rss)
1075 rss = nr_scan_pages;
1076
1077 rss = round_up(rss, nr_scan_pages);
1078 return rss / nr_scan_pages;
1079}
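/*
 * Worked example (illustrative): with the default 256MB scan size and 4K
 * pages, nr_scan_pages is 65536. A task with a 1GB RSS is rounded up to a
 * multiple of that and gets 4 scan windows, so its whole resident set is
 * covered in roughly four scan periods.
 */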
/* Never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
1083
1084static unsigned int task_scan_min(struct task_struct *p)
1085{
1086 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1087 unsigned int scan, floor;
1088 unsigned int windows = 1;
1089
1090 if (scan_size < MAX_SCAN_WINDOW)
1091 windows = MAX_SCAN_WINDOW / scan_size;
1092 floor = 1000 / windows;
1093
1094 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1095 return max_t(unsigned int, floor, scan);
1096}
1097
1098static unsigned int task_scan_start(struct task_struct *p)
1099{
1100 unsigned long smin = task_scan_min(p);
1101 unsigned long period = smin;
1102
1103
1104 if (p->numa_group) {
1105 struct numa_group *ng = p->numa_group;
1106 unsigned long shared = group_faults_shared(ng);
1107 unsigned long private = group_faults_priv(ng);
1108
1109 period *= atomic_read(&ng->refcount);
1110 period *= shared + 1;
1111 period /= private + shared + 1;
1112 }
1113
1114 return max(smin, period);
1115}
1116
1117static unsigned int task_scan_max(struct task_struct *p)
1118{
1119 unsigned long smin = task_scan_min(p);
1120 unsigned long smax;
1121
1122
1123 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1124
1125
1126 if (p->numa_group) {
1127 struct numa_group *ng = p->numa_group;
1128 unsigned long shared = group_faults_shared(ng);
1129 unsigned long private = group_faults_priv(ng);
1130 unsigned long period = smax;
1131
1132 period *= atomic_read(&ng->refcount);
1133 period *= shared + 1;
1134 period /= private + shared + 1;
1135
1136 smax = max(smax, period);
1137 }
1138
1139 return max(smin, smax);
1140}
1141
1142void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1143{
1144 int mm_users = 0;
1145 struct mm_struct *mm = p->mm;
1146
1147 if (mm) {
1148 mm_users = atomic_read(&mm->mm_users);
1149 if (mm_users == 1) {
1150 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1151 mm->numa_scan_seq = 0;
1152 }
1153 }
1154 p->node_stamp = 0;
1155 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1156 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1157 p->numa_work.next = &p->numa_work;
1158 p->numa_faults = NULL;
1159 p->numa_group = NULL;
1160 p->last_task_numa_placement = 0;
1161 p->last_sum_exec_runtime = 0;
1162
1163
1164 if (!(clone_flags & CLONE_VM)) {
1165 p->numa_preferred_nid = -1;
1166 return;
1167 }
1168
1169
1170
1171
1172
1173 if (mm) {
1174 unsigned int delay;
1175
1176 delay = min_t(unsigned int, task_scan_max(current),
1177 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1178 delay += 2 * TICK_NSEC;
1179 p->node_stamp = delay;
1180 }
1181}
1182
1183static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1184{
1185 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1186 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1187}
1188
1189static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1190{
1191 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1192 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1193}
1194
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1203
1204pid_t task_numa_group_id(struct task_struct *p)
1205{
1206 return p->numa_group ? p->numa_group->gid : 0;
1207}
1208
1209
1210
1211
1212
1213
1214
1215static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1216{
1217 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1218}
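/*
 * Layout note (added for clarity; assumes the NUMA_MEM/NUMA_CPU/NUMA_MEMBUF/
 * NUMA_CPUBUF stats enum declared in sched.h): p->numa_faults is one flat
 * array of NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids counters, grouped first
 * by stat type, then by node, with the last bit selecting private (1) versus
 * shared (0) faults. task_faults_idx(NUMA_MEM, nid, 1) is therefore the
 * private memory-fault counter for node nid.
 */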
1219
1220static inline unsigned long task_faults(struct task_struct *p, int nid)
1221{
1222 if (!p->numa_faults)
1223 return 0;
1224
1225 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1226 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1227}
1228
1229static inline unsigned long group_faults(struct task_struct *p, int nid)
1230{
1231 if (!p->numa_group)
1232 return 0;
1233
1234 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1235 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1236}
1237
1238static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1239{
1240 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1241 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1242}
1243
1244static inline unsigned long group_faults_priv(struct numa_group *ng)
1245{
1246 unsigned long faults = 0;
1247 int node;
1248
1249 for_each_online_node(node) {
1250 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1251 }
1252
1253 return faults;
1254}
1255
1256static inline unsigned long group_faults_shared(struct numa_group *ng)
1257{
1258 unsigned long faults = 0;
1259 int node;
1260
1261 for_each_online_node(node) {
1262 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1263 }
1264
1265 return faults;
1266}
1267
1268
1269
1270
1271
1272
1273#define ACTIVE_NODE_FRACTION 3
1274
1275static bool numa_is_active_node(int nid, struct numa_group *ng)
1276{
1277 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1278}
1279
1280
1281static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1282 int maxdist, bool task)
1283{
1284 unsigned long score = 0;
1285 int node;
1286
1287
1288
1289
1290
1291 if (sched_numa_topology_type == NUMA_DIRECT)
1292 return 0;
1293
1294
1295
1296
1297
1298 for_each_online_node(node) {
1299 unsigned long faults;
1300 int dist = node_distance(nid, node);
1301
1302
1303
1304
1305
1306 if (dist == sched_max_numa_distance || node == nid)
1307 continue;
1308
1309
1310
1311
1312
1313
1314
1315
1316 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1317 dist > maxdist)
1318 continue;
1319
1320
1321 if (task)
1322 faults = task_faults(p, node);
1323 else
1324 faults = group_faults(p, node);
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1335 faults *= (sched_max_numa_distance - dist);
1336 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1337 }
1338
1339 score += faults;
1340 }
1341
1342 return score;
1343}
1344
1345
1346
1347
1348
1349
1350
1351static inline unsigned long task_weight(struct task_struct *p, int nid,
1352 int dist)
1353{
1354 unsigned long faults, total_faults;
1355
1356 if (!p->numa_faults)
1357 return 0;
1358
1359 total_faults = p->total_numa_faults;
1360
1361 if (!total_faults)
1362 return 0;
1363
1364 faults = task_faults(p, nid);
1365 faults += score_nearby_nodes(p, nid, dist, true);
1366
1367 return 1000 * faults / total_faults;
1368}
1369
1370static inline unsigned long group_weight(struct task_struct *p, int nid,
1371 int dist)
1372{
1373 unsigned long faults, total_faults;
1374
1375 if (!p->numa_group)
1376 return 0;
1377
1378 total_faults = p->numa_group->total_faults;
1379
1380 if (!total_faults)
1381 return 0;
1382
1383 faults = group_faults(p, nid);
1384 faults += score_nearby_nodes(p, nid, dist, false);
1385
1386 return 1000 * faults / total_faults;
1387}
1388
1389bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1390 int src_nid, int dst_cpu)
1391{
1392 struct numa_group *ng = p->numa_group;
1393 int dst_nid = cpu_to_node(dst_cpu);
1394 int last_cpupid, this_cpupid;
1395
1396 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1416 if (!cpupid_pid_unset(last_cpupid) &&
1417 cpupid_to_nid(last_cpupid) != dst_nid)
1418 return false;
1419
1420
1421 if (cpupid_match_pid(p, last_cpupid))
1422 return true;
1423
1424
1425 if (!ng)
1426 return true;
1427
1428
1429
1430
1431
1432 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1433 ACTIVE_NODE_FRACTION)
1434 return true;
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1445 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1446}
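/*
 * Note (added for clarity): the final comparison distributes memory according
 * to CPU vs. memory use on each node - the page is pulled to dst_nid only if
 * the group's faults_cpu/faults_mem ratio there, scaled by 3/4 for
 * hysteresis, still exceeds the ratio on src_nid, which keeps pages from
 * ping-ponging between nodes that are used roughly equally.
 */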
1447
1448static unsigned long weighted_cpuload(struct rq *rq);
1449static unsigned long source_load(int cpu, int type);
1450static unsigned long target_load(int cpu, int type);
1451static unsigned long capacity_of(int cpu);
1452
1453
1454struct numa_stats {
1455 unsigned long nr_running;
1456 unsigned long load;
1457
1458
1459 unsigned long compute_capacity;
1460
1461
1462 unsigned long task_capacity;
1463 int has_free_capacity;
1464};
1465
1466
1467
1468
1469static void update_numa_stats(struct numa_stats *ns, int nid)
1470{
1471 int smt, cpu, cpus = 0;
1472 unsigned long capacity;
1473
1474 memset(ns, 0, sizeof(*ns));
1475 for_each_cpu(cpu, cpumask_of_node(nid)) {
1476 struct rq *rq = cpu_rq(cpu);
1477
1478 ns->nr_running += rq->nr_running;
1479 ns->load += weighted_cpuload(rq);
1480 ns->compute_capacity += capacity_of(cpu);
1481
1482 cpus++;
1483 }
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493 if (!cpus)
1494 return;
1495
1496
1497 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1498 capacity = cpus / smt;
1499
1500 ns->task_capacity = min_t(unsigned, capacity,
1501 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1502 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1503}
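/*
 * Worked example (illustrative numbers): a node with 4 SMT siblings whose
 * capacities sum to 2356 gives smt = DIV_ROUND_UP(4096, 2356) = 2, so
 * task_capacity = min(4 / 2, DIV_ROUND_CLOSEST(2356, 1024)) = 2, and the node
 * reports free capacity only while fewer than two tasks are running on it.
 */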
1504
1505struct task_numa_env {
1506 struct task_struct *p;
1507
1508 int src_cpu, src_nid;
1509 int dst_cpu, dst_nid;
1510
1511 struct numa_stats src_stats, dst_stats;
1512
1513 int imbalance_pct;
1514 int dist;
1515
1516 struct task_struct *best_task;
1517 long best_imp;
1518 int best_cpu;
1519};
1520
1521static void task_numa_assign(struct task_numa_env *env,
1522 struct task_struct *p, long imp)
1523{
1524 if (env->best_task)
1525 put_task_struct(env->best_task);
1526 if (p)
1527 get_task_struct(p);
1528
1529 env->best_task = p;
1530 env->best_imp = imp;
1531 env->best_cpu = env->dst_cpu;
1532}
1533
1534static bool load_too_imbalanced(long src_load, long dst_load,
1535 struct task_numa_env *env)
1536{
1537 long imb, old_imb;
1538 long orig_src_load, orig_dst_load;
1539 long src_capacity, dst_capacity;
1540
1541
1542
1543
1544
1545
1546
1547
1548 src_capacity = env->src_stats.compute_capacity;
1549 dst_capacity = env->dst_stats.compute_capacity;
1550
1551
1552 if (dst_load < src_load)
1553 swap(dst_load, src_load);
1554
1555
1556 imb = dst_load * src_capacity * 100 -
1557 src_load * dst_capacity * env->imbalance_pct;
1558 if (imb <= 0)
1559 return false;
1560
1561
1562
1563
1564
1565 orig_src_load = env->src_stats.load;
1566 orig_dst_load = env->dst_stats.load;
1567
1568 if (orig_dst_load < orig_src_load)
1569 swap(orig_dst_load, orig_src_load);
1570
1571 old_imb = orig_dst_load * src_capacity * 100 -
1572 orig_src_load * dst_capacity * env->imbalance_pct;
1573
1574
1575 return (imb > old_imb);
1576}
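/*
 * Note (added for clarity): with dst/src ordered so that dst_load >= src_load,
 * the load is considered imbalanced when
 * dst_load / dst_capacity > (imbalance_pct / 100) * src_load / src_capacity;
 * even then the move is tolerated as long as it does not make the imbalance
 * worse than the imbalance that already existed before the move.
 */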
1577
1578
1579
1580
1581
1582
1583
1584static void task_numa_compare(struct task_numa_env *env,
1585 long taskimp, long groupimp)
1586{
1587 struct rq *src_rq = cpu_rq(env->src_cpu);
1588 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1589 struct task_struct *cur;
1590 long src_load, dst_load;
1591 long load;
1592 long imp = env->p->numa_group ? groupimp : taskimp;
1593 long moveimp = imp;
1594 int dist = env->dist;
1595
1596 rcu_read_lock();
1597 cur = task_rcu_dereference(&dst_rq->curr);
1598 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1599 cur = NULL;
1600
1601
1602
1603
1604
1605 if (cur == env->p)
1606 goto unlock;
1607
1608
1609
1610
1611
1612
1613
1614
1615 if (cur) {
1616
1617 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1618 goto unlock;
1619
1620
1621
1622
1623
1624 if (cur->numa_group == env->p->numa_group) {
1625 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1626 task_weight(cur, env->dst_nid, dist);
1627
1628
1629
1630
1631 if (cur->numa_group)
1632 imp -= imp/16;
1633 } else {
1634
1635
1636
1637
1638
1639 if (cur->numa_group)
1640 imp += group_weight(cur, env->src_nid, dist) -
1641 group_weight(cur, env->dst_nid, dist);
1642 else
1643 imp += task_weight(cur, env->src_nid, dist) -
1644 task_weight(cur, env->dst_nid, dist);
1645 }
1646 }
1647
1648 if (imp <= env->best_imp && moveimp <= env->best_imp)
1649 goto unlock;
1650
1651 if (!cur) {
1652
1653 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1654 !env->dst_stats.has_free_capacity)
1655 goto unlock;
1656
1657 goto balance;
1658 }
1659
1660
1661 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1662 dst_rq->nr_running == 1)
1663 goto assign;
1664
1665
1666
1667
1668balance:
1669 load = task_h_load(env->p);
1670 dst_load = env->dst_stats.load + load;
1671 src_load = env->src_stats.load - load;
1672
1673 if (moveimp > imp && moveimp > env->best_imp) {
1674
1675
1676
1677
1678
1679
1680 if (!load_too_imbalanced(src_load, dst_load, env)) {
1681 imp = moveimp - 1;
1682 cur = NULL;
1683 goto assign;
1684 }
1685 }
1686
1687 if (imp <= env->best_imp)
1688 goto unlock;
1689
1690 if (cur) {
1691 load = task_h_load(cur);
1692 dst_load -= load;
1693 src_load += load;
1694 }
1695
1696 if (load_too_imbalanced(src_load, dst_load, env))
1697 goto unlock;
1698
1699
1700
1701
1702
1703 if (!cur) {
1704
1705
1706
1707
1708 local_irq_disable();
1709 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1710 env->dst_cpu);
1711 local_irq_enable();
1712 }
1713
1714assign:
1715 task_numa_assign(env, cur, imp);
1716unlock:
1717 rcu_read_unlock();
1718}
1719
1720static void task_numa_find_cpu(struct task_numa_env *env,
1721 long taskimp, long groupimp)
1722{
1723 int cpu;
1724
1725 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1726
1727 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
1728 continue;
1729
1730 env->dst_cpu = cpu;
1731 task_numa_compare(env, taskimp, groupimp);
1732 }
1733}
1734
1735
1736static bool numa_has_capacity(struct task_numa_env *env)
1737{
1738 struct numa_stats *src = &env->src_stats;
1739 struct numa_stats *dst = &env->dst_stats;
1740
1741 if (src->has_free_capacity && !dst->has_free_capacity)
1742 return false;
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752 if (src->load * dst->compute_capacity * env->imbalance_pct >
1753
1754 dst->load * src->compute_capacity * 100)
1755 return true;
1756
1757 return false;
1758}
1759
1760static int task_numa_migrate(struct task_struct *p)
1761{
1762 struct task_numa_env env = {
1763 .p = p,
1764
1765 .src_cpu = task_cpu(p),
1766 .src_nid = task_node(p),
1767
1768 .imbalance_pct = 112,
1769
1770 .best_task = NULL,
1771 .best_imp = 0,
1772 .best_cpu = -1,
1773 };
1774 struct sched_domain *sd;
1775 unsigned long taskweight, groupweight;
1776 int nid, ret, dist;
1777 long taskimp, groupimp;
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787 rcu_read_lock();
1788 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1789 if (sd)
1790 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1791 rcu_read_unlock();
1792
1793
1794
1795
1796
1797
1798
1799 if (unlikely(!sd)) {
1800 p->numa_preferred_nid = task_node(p);
1801 return -EINVAL;
1802 }
1803
1804 env.dst_nid = p->numa_preferred_nid;
1805 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1806 taskweight = task_weight(p, env.src_nid, dist);
1807 groupweight = group_weight(p, env.src_nid, dist);
1808 update_numa_stats(&env.src_stats, env.src_nid);
1809 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1810 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1811 update_numa_stats(&env.dst_stats, env.dst_nid);
1812
1813
1814 if (numa_has_capacity(&env))
1815 task_numa_find_cpu(&env, taskimp, groupimp);
1816
1817
1818
1819
1820
1821
1822
1823
1824 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1825 for_each_online_node(nid) {
1826 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1827 continue;
1828
1829 dist = node_distance(env.src_nid, env.dst_nid);
1830 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1831 dist != env.dist) {
1832 taskweight = task_weight(p, env.src_nid, dist);
1833 groupweight = group_weight(p, env.src_nid, dist);
1834 }
1835
1836
1837 taskimp = task_weight(p, nid, dist) - taskweight;
1838 groupimp = group_weight(p, nid, dist) - groupweight;
1839 if (taskimp < 0 && groupimp < 0)
1840 continue;
1841
1842 env.dist = dist;
1843 env.dst_nid = nid;
1844 update_numa_stats(&env.dst_stats, env.dst_nid);
1845 if (numa_has_capacity(&env))
1846 task_numa_find_cpu(&env, taskimp, groupimp);
1847 }
1848 }
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858 if (p->numa_group) {
1859 struct numa_group *ng = p->numa_group;
1860
1861 if (env.best_cpu == -1)
1862 nid = env.src_nid;
1863 else
1864 nid = env.dst_nid;
1865
1866 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1867 sched_setnuma(p, env.dst_nid);
1868 }
1869
1870
1871 if (env.best_cpu == -1)
1872 return -EAGAIN;
1873
1874
1875
1876
1877
1878 p->numa_scan_period = task_scan_start(p);
1879
1880 if (env.best_task == NULL) {
1881 ret = migrate_task_to(p, env.best_cpu);
1882 if (ret != 0)
1883 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1884 return ret;
1885 }
1886
1887 ret = migrate_swap(p, env.best_task);
1888 if (ret != 0)
1889 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1890 put_task_struct(env.best_task);
1891 return ret;
1892}
1893
1894
1895static void numa_migrate_preferred(struct task_struct *p)
1896{
1897 unsigned long interval = HZ;
1898
1899
1900 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1901 return;
1902
1903
1904 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1905 p->numa_migrate_retry = jiffies + interval;
1906
1907
1908 if (task_node(p) == p->numa_preferred_nid)
1909 return;
1910
1911
1912 task_numa_migrate(p);
1913}
1914
1915
1916
1917
1918
1919
1920
1921static void numa_group_count_active_nodes(struct numa_group *numa_group)
1922{
1923 unsigned long faults, max_faults = 0;
1924 int nid, active_nodes = 0;
1925
1926 for_each_online_node(nid) {
1927 faults = group_faults_cpu(numa_group, nid);
1928 if (faults > max_faults)
1929 max_faults = faults;
1930 }
1931
1932 for_each_online_node(nid) {
1933 faults = group_faults_cpu(numa_group, nid);
1934 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1935 active_nodes++;
1936 }
1937
1938 numa_group->max_faults_cpu = max_faults;
1939 numa_group->active_nodes = active_nodes;
1940}
1941
1942
1943
1944
1945
1946
1947
1948
1949#define NUMA_PERIOD_SLOTS 10
1950#define NUMA_PERIOD_THRESHOLD 7
1951
1952
1953
1954
1955
1956
1957
1958static void update_task_scan_period(struct task_struct *p,
1959 unsigned long shared, unsigned long private)
1960{
1961 unsigned int period_slot;
1962 int lr_ratio, ps_ratio;
1963 int diff;
1964
1965 unsigned long remote = p->numa_faults_locality[0];
1966 unsigned long local = p->numa_faults_locality[1];
1967
1968
1969
1970
1971
1972
1973
1974
1975 if (local + shared == 0 || p->numa_faults_locality[2]) {
1976 p->numa_scan_period = min(p->numa_scan_period_max,
1977 p->numa_scan_period << 1);
1978
1979 p->mm->numa_next_scan = jiffies +
1980 msecs_to_jiffies(p->numa_scan_period);
1981
1982 return;
1983 }
1984
1985
1986
1987
1988
1989
1990
1991 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1992 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1993 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1994
1995 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1996
1997
1998
1999
2000 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2001 if (!slot)
2002 slot = 1;
2003 diff = slot * period_slot;
2004 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2005
2006
2007
2008
2009
2010 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2011 if (!slot)
2012 slot = 1;
2013 diff = slot * period_slot;
2014 } else {
2015
2016
2017
2018
2019
2020 int ratio = max(lr_ratio, ps_ratio);
2021 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2022 }
2023
2024 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2025 task_scan_min(p), task_scan_max(p));
2026 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2027}
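/*
 * Worked example (illustrative): with a 1000ms scan period, period_slot is
 * 100ms. If 9 of 10 recent faults were private, ps_ratio is 9 and the period
 * grows by (9 - 7) * 100ms = 200ms, i.e. scanning slows down; if instead most
 * faults were shared and remote, diff goes negative and the period shrinks,
 * in both cases clamped between task_scan_min() and task_scan_max().
 */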
2028
2029
2030
2031
2032
2033
2034
2035
2036static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2037{
2038 u64 runtime, delta, now;
2039
2040 now = p->se.exec_start;
2041 runtime = p->se.sum_exec_runtime;
2042
2043 if (p->last_task_numa_placement) {
2044 delta = runtime - p->last_sum_exec_runtime;
2045 *period = now - p->last_task_numa_placement;
2046 } else {
2047 delta = p->se.avg.load_sum;
2048 *period = LOAD_AVG_MAX;
2049 }
2050
2051 p->last_sum_exec_runtime = runtime;
2052 p->last_task_numa_placement = now;
2053
2054 return delta;
2055}
2056
2057
2058
2059
2060
2061
2062static int preferred_group_nid(struct task_struct *p, int nid)
2063{
2064 nodemask_t nodes;
2065 int dist;
2066
2067
2068 if (sched_numa_topology_type == NUMA_DIRECT)
2069 return nid;
2070
2071
2072
2073
2074
2075
2076 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2077 unsigned long score, max_score = 0;
2078 int node, max_node = nid;
2079
2080 dist = sched_max_numa_distance;
2081
2082 for_each_online_node(node) {
2083 score = group_weight(p, node, dist);
2084 if (score > max_score) {
2085 max_score = score;
2086 max_node = node;
2087 }
2088 }
2089 return max_node;
2090 }
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101 nodes = node_online_map;
2102 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2103 unsigned long max_faults = 0;
2104 nodemask_t max_group = NODE_MASK_NONE;
2105 int a, b;
2106
2107
2108 if (!find_numa_distance(dist))
2109 continue;
2110
2111 for_each_node_mask(a, nodes) {
2112 unsigned long faults = 0;
2113 nodemask_t this_group;
2114 nodes_clear(this_group);
2115
2116
2117 for_each_node_mask(b, nodes) {
2118 if (node_distance(a, b) < dist) {
2119 faults += group_faults(p, b);
2120 node_set(b, this_group);
2121 node_clear(b, nodes);
2122 }
2123 }
2124
2125
2126 if (faults > max_faults) {
2127 max_faults = faults;
2128 max_group = this_group;
2129
2130
2131
2132
2133
2134 nid = a;
2135 }
2136 }
2137
2138 if (!max_faults)
2139 break;
2140 nodes = max_group;
2141 }
2142 return nid;
2143}
2144
2145static void task_numa_placement(struct task_struct *p)
2146{
2147 int seq, nid, max_nid = -1, max_group_nid = -1;
2148 unsigned long max_faults = 0, max_group_faults = 0;
2149 unsigned long fault_types[2] = { 0, 0 };
2150 unsigned long total_faults;
2151 u64 runtime, period;
2152 spinlock_t *group_lock = NULL;
2153
2154
2155
2156
2157
2158
2159 seq = READ_ONCE(p->mm->numa_scan_seq);
2160 if (p->numa_scan_seq == seq)
2161 return;
2162 p->numa_scan_seq = seq;
2163 p->numa_scan_period_max = task_scan_max(p);
2164
2165 total_faults = p->numa_faults_locality[0] +
2166 p->numa_faults_locality[1];
2167 runtime = numa_get_avg_runtime(p, &period);
2168
2169
2170 if (p->numa_group) {
2171 group_lock = &p->numa_group->lock;
2172 spin_lock_irq(group_lock);
2173 }
2174
2175
2176 for_each_online_node(nid) {
2177
2178 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2179 unsigned long faults = 0, group_faults = 0;
2180 int priv;
2181
2182 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2183 long diff, f_diff, f_weight;
2184
2185 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2186 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2187 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2188 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2189
2190
2191 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2192 fault_types[priv] += p->numa_faults[membuf_idx];
2193 p->numa_faults[membuf_idx] = 0;
2194
2195
2196
2197
2198
2199
2200
2201
2202 f_weight = div64_u64(runtime << 16, period + 1);
2203 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2204 (total_faults + 1);
2205 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2206 p->numa_faults[cpubuf_idx] = 0;
2207
2208 p->numa_faults[mem_idx] += diff;
2209 p->numa_faults[cpu_idx] += f_diff;
2210 faults += p->numa_faults[mem_idx];
2211 p->total_numa_faults += diff;
2212 if (p->numa_group) {
2213
2214
2215
2216
2217
2218
2219
2220 p->numa_group->faults[mem_idx] += diff;
2221 p->numa_group->faults_cpu[mem_idx] += f_diff;
2222 p->numa_group->total_faults += diff;
2223 group_faults += p->numa_group->faults[mem_idx];
2224 }
2225 }
2226
2227 if (faults > max_faults) {
2228 max_faults = faults;
2229 max_nid = nid;
2230 }
2231
2232 if (group_faults > max_group_faults) {
2233 max_group_faults = group_faults;
2234 max_group_nid = nid;
2235 }
2236 }
2237
2238 update_task_scan_period(p, fault_types[0], fault_types[1]);
2239
2240 if (p->numa_group) {
2241 numa_group_count_active_nodes(p->numa_group);
2242 spin_unlock_irq(group_lock);
2243 max_nid = preferred_group_nid(p, max_group_nid);
2244 }
2245
2246 if (max_faults) {
2247
2248 if (max_nid != p->numa_preferred_nid)
2249 sched_setnuma(p, max_nid);
2250
2251 if (task_node(p) != p->numa_preferred_nid)
2252 numa_migrate_preferred(p);
2253 }
2254}
2255
2256static inline int get_numa_group(struct numa_group *grp)
2257{
2258 return atomic_inc_not_zero(&grp->refcount);
2259}
2260
2261static inline void put_numa_group(struct numa_group *grp)
2262{
2263 if (atomic_dec_and_test(&grp->refcount))
2264 kfree_rcu(grp, rcu);
2265}
2266
2267static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2268 int *priv)
2269{
2270 struct numa_group *grp, *my_grp;
2271 struct task_struct *tsk;
2272 bool join = false;
2273 int cpu = cpupid_to_cpu(cpupid);
2274 int i;
2275
2276 if (unlikely(!p->numa_group)) {
2277 unsigned int size = sizeof(struct numa_group) +
2278 4*nr_node_ids*sizeof(unsigned long);
2279
2280 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2281 if (!grp)
2282 return;
2283
2284 atomic_set(&grp->refcount, 1);
2285 grp->active_nodes = 1;
2286 grp->max_faults_cpu = 0;
2287 spin_lock_init(&grp->lock);
2288 grp->gid = p->pid;
2289
2290 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2291 nr_node_ids;
2292
2293 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2294 grp->faults[i] = p->numa_faults[i];
2295
2296 grp->total_faults = p->total_numa_faults;
2297
2298 grp->nr_tasks++;
2299 rcu_assign_pointer(p->numa_group, grp);
2300 }
2301
2302 rcu_read_lock();
2303 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2304
2305 if (!cpupid_match_pid(tsk, cpupid))
2306 goto no_join;
2307
2308 grp = rcu_dereference(tsk->numa_group);
2309 if (!grp)
2310 goto no_join;
2311
2312 my_grp = p->numa_group;
2313 if (grp == my_grp)
2314 goto no_join;
2315
2316
2317
2318
2319
2320 if (my_grp->nr_tasks > grp->nr_tasks)
2321 goto no_join;
2322
2323
2324
2325
2326 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2327 goto no_join;
2328
2329
2330 if (tsk->mm == current->mm)
2331 join = true;
2332
2333
2334 if (flags & TNF_SHARED)
2335 join = true;
2336
2337
2338 *priv = !join;
2339
2340 if (join && !get_numa_group(grp))
2341 goto no_join;
2342
2343 rcu_read_unlock();
2344
2345 if (!join)
2346 return;
2347
2348 BUG_ON(irqs_disabled());
2349 double_lock_irq(&my_grp->lock, &grp->lock);
2350
2351 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2352 my_grp->faults[i] -= p->numa_faults[i];
2353 grp->faults[i] += p->numa_faults[i];
2354 }
2355 my_grp->total_faults -= p->total_numa_faults;
2356 grp->total_faults += p->total_numa_faults;
2357
2358 my_grp->nr_tasks--;
2359 grp->nr_tasks++;
2360
2361 spin_unlock(&my_grp->lock);
2362 spin_unlock_irq(&grp->lock);
2363
2364 rcu_assign_pointer(p->numa_group, grp);
2365
2366 put_numa_group(my_grp);
2367 return;
2368
2369no_join:
2370 rcu_read_unlock();
2371 return;
2372}
2373
2374void task_numa_free(struct task_struct *p)
2375{
2376 struct numa_group *grp = p->numa_group;
2377 void *numa_faults = p->numa_faults;
2378 unsigned long flags;
2379 int i;
2380
2381 if (grp) {
2382 spin_lock_irqsave(&grp->lock, flags);
2383 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2384 grp->faults[i] -= p->numa_faults[i];
2385 grp->total_faults -= p->total_numa_faults;
2386
2387 grp->nr_tasks--;
2388 spin_unlock_irqrestore(&grp->lock, flags);
2389 RCU_INIT_POINTER(p->numa_group, NULL);
2390 put_numa_group(grp);
2391 }
2392
2393 p->numa_faults = NULL;
2394 kfree(numa_faults);
2395}
2396
2397
2398
2399
2400void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2401{
2402 struct task_struct *p = current;
2403 bool migrated = flags & TNF_MIGRATED;
2404 int cpu_node = task_node(current);
2405 int local = !!(flags & TNF_FAULT_LOCAL);
2406 struct numa_group *ng;
2407 int priv;
2408
2409 if (!static_branch_likely(&sched_numa_balancing))
2410 return;
2411
2412
2413 if (!p->mm)
2414 return;
2415
2416
2417 if (unlikely(!p->numa_faults)) {
2418 int size = sizeof(*p->numa_faults) *
2419 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2420
2421 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2422 if (!p->numa_faults)
2423 return;
2424
2425 p->total_numa_faults = 0;
2426 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2427 }
2428
2429
2430
2431
2432
2433 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2434 priv = 1;
2435 } else {
2436 priv = cpupid_match_pid(p, last_cpupid);
2437 if (!priv && !(flags & TNF_NO_GROUP))
2438 task_numa_group(p, last_cpupid, flags, &priv);
2439 }
2440
2441
2442
2443
2444
2445
2446
2447 ng = p->numa_group;
2448 if (!priv && !local && ng && ng->active_nodes > 1 &&
2449 numa_is_active_node(cpu_node, ng) &&
2450 numa_is_active_node(mem_node, ng))
2451 local = 1;
2452
2453 task_numa_placement(p);
2454
2455
2456
2457
2458
2459 if (time_after(jiffies, p->numa_migrate_retry))
2460 numa_migrate_preferred(p);
2461
2462 if (migrated)
2463 p->numa_pages_migrated += pages;
2464 if (flags & TNF_MIGRATE_FAIL)
2465 p->numa_faults_locality[2] += pages;
2466
2467 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2468 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2469 p->numa_faults_locality[local] += pages;
2470}
2471
2472static void reset_ptenuma_scan(struct task_struct *p)
2473{
2474
2475
2476
2477
2478
2479
2480
2481
2482 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2483 p->mm->numa_scan_offset = 0;
2484}
2485
2486
2487
2488
2489
2490void task_numa_work(struct callback_head *work)
2491{
2492 unsigned long migrate, next_scan, now = jiffies;
2493 struct task_struct *p = current;
2494 struct mm_struct *mm = p->mm;
2495 u64 runtime = p->se.sum_exec_runtime;
2496 struct vm_area_struct *vma;
2497 unsigned long start, end;
2498 unsigned long nr_pte_updates = 0;
2499 long pages, virtpages;
2500
2501 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2502
2503 work->next = work;
2504
2505
2506
2507
2508
2509
2510
2511
2512 if (p->flags & PF_EXITING)
2513 return;
2514
2515 if (!mm->numa_next_scan) {
2516 mm->numa_next_scan = now +
2517 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2518 }
2519
2520
2521
2522
2523 migrate = mm->numa_next_scan;
2524 if (time_before(now, migrate))
2525 return;
2526
2527 if (p->numa_scan_period == 0) {
2528 p->numa_scan_period_max = task_scan_max(p);
2529 p->numa_scan_period = task_scan_start(p);
2530 }
2531
2532 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2533 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2534 return;
2535
2536
2537
2538
2539
2540 p->node_stamp += 2 * TICK_NSEC;
2541
2542 start = mm->numa_scan_offset;
2543 pages = sysctl_numa_balancing_scan_size;
2544 pages <<= 20 - PAGE_SHIFT;
2545 virtpages = pages * 8;
2546 if (!pages)
2547 return;
2548
2549
2550 if (!down_read_trylock(&mm->mmap_sem))
2551 return;
2552 vma = find_vma(mm, start);
2553 if (!vma) {
2554 reset_ptenuma_scan(p);
2555 start = 0;
2556 vma = mm->mmap;
2557 }
2558 for (; vma; vma = vma->vm_next) {
2559 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2560 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2561 continue;
2562 }
2563
2564
2565
2566
2567
2568
2569
2570 if (!vma->vm_mm ||
2571 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2572 continue;
2573
2574
2575
2576
2577
2578 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2579 continue;
2580
2581 do {
2582 start = max(start, vma->vm_start);
2583 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2584 end = min(end, vma->vm_end);
2585 nr_pte_updates = change_prot_numa(vma, start, end);
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595 if (nr_pte_updates)
2596 pages -= (end - start) >> PAGE_SHIFT;
2597 virtpages -= (end - start) >> PAGE_SHIFT;
2598
2599 start = end;
2600 if (pages <= 0 || virtpages <= 0)
2601 goto out;
2602
2603 cond_resched();
2604 } while (end != vma->vm_end);
2605 }
2606
2607out:
2608
2609
2610
2611
2612
2613
2614 if (vma)
2615 mm->numa_scan_offset = start;
2616 else
2617 reset_ptenuma_scan(p);
2618 up_read(&mm->mmap_sem);
2619
2620
2621
2622
2623
2624
2625
2626 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2627 u64 diff = p->se.sum_exec_runtime - runtime;
2628 p->node_stamp += 32 * diff;
2629 }
2630}
2631
2632
2633
2634
2635void task_tick_numa(struct rq *rq, struct task_struct *curr)
2636{
2637 struct callback_head *work = &curr->numa_work;
2638 u64 period, now;
2639
2640
2641
2642
2643 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2644 return;
2645
2646
2647
2648
2649
2650
2651
2652 now = curr->se.sum_exec_runtime;
2653 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2654
2655 if (now > curr->node_stamp + period) {
2656 if (!curr->node_stamp)
2657 curr->numa_scan_period = task_scan_start(curr);
2658 curr->node_stamp += period;
2659
2660 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2661 init_task_work(work, task_numa_work);
2662 task_work_add(curr, work, true);
2663 }
2664 }
2665}
2666
2667#else
2668static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2669{
2670}
2671
2672static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2673{
2674}
2675
2676static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2677{
2678}
2679
2680#endif
2681
2682static void
2683account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2684{
2685 update_load_add(&cfs_rq->load, se->load.weight);
2686 if (!parent_entity(se))
2687 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2688#ifdef CONFIG_SMP
2689 if (entity_is_task(se)) {
2690 struct rq *rq = rq_of(cfs_rq);
2691
2692 account_numa_enqueue(rq, task_of(se));
2693 list_add(&se->group_node, &rq->cfs_tasks);
2694 }
2695#endif
2696 cfs_rq->nr_running++;
2697}
2698
2699static void
2700account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2701{
2702 update_load_sub(&cfs_rq->load, se->load.weight);
2703 if (!parent_entity(se))
2704 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2705#ifdef CONFIG_SMP
2706 if (entity_is_task(se)) {
2707 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2708 list_del_init(&se->group_node);
2709 }
2710#endif
2711 cfs_rq->nr_running--;
2712}
/*
 * Signed add and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
2721#define add_positive(_ptr, _val) do { \
2722 typeof(_ptr) ptr = (_ptr); \
2723 typeof(_val) val = (_val); \
2724 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2725 \
2726 res = var + val; \
2727 \
2728 if (val < 0 && res > var) \
2729 res = 0; \
2730 \
2731 WRITE_ONCE(*ptr, res); \
2732} while (0)
/*
 * Unsigned subtract and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
2741#define sub_positive(_ptr, _val) do { \
2742 typeof(_ptr) ptr = (_ptr); \
2743 typeof(*ptr) val = (_val); \
2744 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2745 res = var - val; \
2746 if (res > var) \
2747 res = 0; \
2748 WRITE_ONCE(*ptr, res); \
2749} while (0)
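/*
 * Note (added for clarity): both helpers clamp to zero instead of wrapping
 * when the result would go negative (e.g. subtracting 7 from an unsigned 5
 * yields 0, not a huge wrapped value), and the READ_ONCE()/WRITE_ONCE() pair
 * keeps the intermediate result out of memory so lockless readers never
 * observe a transient bogus value.
 */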
2750
2751#ifdef CONFIG_SMP
2752
2753
2754
2755static inline long se_weight(struct sched_entity *se)
2756{
2757 return scale_load_down(se->load.weight);
2758}
2759
2760static inline long se_runnable(struct sched_entity *se)
2761{
2762 return scale_load_down(se->runnable_weight);
2763}
2764
2765static inline void
2766enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2767{
2768 cfs_rq->runnable_weight += se->runnable_weight;
2769
2770 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2771 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2772}
2773
2774static inline void
2775dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2776{
2777 cfs_rq->runnable_weight -= se->runnable_weight;
2778
2779 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2780 sub_positive(&cfs_rq->avg.runnable_load_sum,
2781 se_runnable(se) * se->avg.runnable_load_sum);
2782}
2783
2784static inline void
2785enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2786{
2787 cfs_rq->avg.load_avg += se->avg.load_avg;
2788 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2789}
2790
2791static inline void
2792dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2793{
2794 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2795 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2796}
2797#else
2798static inline void
2799enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2800static inline void
2801dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2802static inline void
2803enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2804static inline void
2805dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2806#endif
2807
2808static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2809 unsigned long weight, unsigned long runnable)
2810{
2811 if (se->on_rq) {
2812
2813 if (cfs_rq->curr == se)
2814 update_curr(cfs_rq);
2815 account_entity_dequeue(cfs_rq, se);
2816 dequeue_runnable_load_avg(cfs_rq, se);
2817 }
2818 dequeue_load_avg(cfs_rq, se);
2819
2820 se->runnable_weight = runnable;
2821 update_load_set(&se->load, weight);
2822
2823#ifdef CONFIG_SMP
2824 do {
2825 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2826
2827 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2828 se->avg.runnable_load_avg =
2829 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2830 } while (0);
2831#endif
2832
2833 enqueue_load_avg(cfs_rq, se);
2834 if (se->on_rq) {
2835 account_entity_enqueue(cfs_rq, se);
2836 enqueue_runnable_load_avg(cfs_rq, se);
2837 }
2838}
2839
2840void reweight_task(struct task_struct *p, int prio)
2841{
2842 struct sched_entity *se = &p->se;
2843 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2844 struct load_weight *load = &se->load;
2845 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2846
2847 reweight_entity(cfs_rq, se, weight, weight);
2848 load->inv_weight = sched_prio_to_wmult[prio];
2849}
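
/*
 * Illustrative note (not from the original source): the weight comes from
 * sched_prio_to_weight[], so re-nicing a task from nice 0 (weight 1024) to
 * nice +5 (weight 335) roughly triples the rate at which its vruntime
 * advances for the same wall-clock runtime, and the matching inv_weight from
 * sched_prio_to_wmult[] keeps __calc_delta()'s reciprocal in sync.
 */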
2850
2851#ifdef CONFIG_FAIR_GROUP_SCHED
2852#ifdef CONFIG_SMP
/*
 * All this does is approximate the hierarchical proportion which includes
 * that global sum we all love to hate.
 *
 * That is, the weight of a group entity is the proportional share of the
 * group weight based on the group runqueue weights:
 *
 *                      tg->weight * grq->load.weight
 *   ge->load.weight = -----------------------------		(1)
 *                        \Sum grq->load.weight
 *
 * Computing that sum is prohibitively expensive, so we approximate it with
 * the per-cfs_rq load averages, which move slower and are therefore cheaper
 * and more stable:
 *
 *                      tg->weight * grq->avg.load_avg
 *   ge->load.weight = ------------------------------		(2)
 *                              tg->load_avg
 *
 * where tg->load_avg ~= \Sum grq->avg.load_avg.
 *
 * Because the averages are slow by design, (2) misbehaves in boundary
 * conditions, most notably when a previously idle group gets its first
 * task: the local grq->avg.load_avg takes time to build up and the group
 * entity is left with far too little weight in the meantime.
 *
 * To compensate, use the local runqueue's instantaneous weight as a lower
 * bound on its average contribution:
 *
 *   load      = max(grq->load.weight, grq->avg.load_avg)
 *   tg_weight = tg->load_avg - grq->tg_load_avg_contrib + load
 *   shares    = tg->shares * load / tg_weight
 *
 * which recovers (1) on a single runqueue and otherwise biases towards the
 * immediate weight while the averages catch up.  The result is clamped to
 * [MIN_SHARES, tg->shares].
 */
2926static long calc_group_shares(struct cfs_rq *cfs_rq)
2927{
2928 long tg_weight, tg_shares, load, shares;
2929 struct task_group *tg = cfs_rq->tg;
2930
2931 tg_shares = READ_ONCE(tg->shares);
2932
2933 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
2934
2935 tg_weight = atomic_long_read(&tg->load_avg);
2936
2937
2938 tg_weight -= cfs_rq->tg_load_avg_contrib;
2939 tg_weight += load;
2940
2941 shares = (tg_shares * load);
2942 if (tg_weight)
2943 shares /= tg_weight;
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957 return clamp_t(long, shares, MIN_SHARES, tg_shares);
2958}
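
/*
 * Worked example (illustrative only, made-up numbers): a group with
 * tg->shares = 1024 has runqueues contributing tg->load_avg = 2048, of
 * which this CPU's grq accounts for tg_load_avg_contrib = 1024.  If a burst
 * of wakeups pushes the local grq->load.weight up to 3072 while its
 * load_avg is still 1024, then:
 *
 *	load      = max(3072, 1024)        = 3072
 *	tg_weight = 2048 - 1024 + 3072     = 4096
 *	shares    = 1024 * 3072 / 4096     = 768
 *
 * i.e. the local group entity immediately gets ~3/4 of the group weight
 * instead of the 1/2 the stale averages alone would suggest; the result is
 * finally clamped to [MIN_SHARES, tg->shares].
 */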

/*
 * Compute the group entity's runnable weight as the fraction of its shares
 * that corresponds to the runnable part of the group runqueue:
 *
 *                                  max(grq->avg.runnable_load_avg, grq->runnable_weight)
 *   ge->runnable_weight = shares * ------------------------------------------------------
 *                                       max(grq->avg.load_avg, grq->load.weight)
 *
 * Like calc_group_shares(), the instantaneous weights are used as a lower
 * bound on the (slower) averages to avoid boundary-condition transients.
 * The result is clamped to [MIN_SHARES, shares].
 */
static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2988{
2989 long runnable, load_avg;
2990
2991 load_avg = max(cfs_rq->avg.load_avg,
2992 scale_load_down(cfs_rq->load.weight));
2993
2994 runnable = max(cfs_rq->avg.runnable_load_avg,
2995 scale_load_down(cfs_rq->runnable_weight));
2996
2997 runnable *= shares;
2998 if (load_avg)
2999 runnable /= load_avg;
3000
3001 return clamp_t(long, runnable, MIN_SHARES, shares);
3002}
3003#endif
3004
3005static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

/*
 * Recomputes the group entity based on the current state of its group
 * runqueue.
 */
static void update_cfs_group(struct sched_entity *se)
3012{
3013 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3014 long shares, runnable;
3015
3016 if (!gcfs_rq)
3017 return;
3018
3019 if (throttled_hierarchy(gcfs_rq))
3020 return;
3021
3022#ifndef CONFIG_SMP
3023 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3024
3025 if (likely(se->load.weight == shares))
3026 return;
3027#else
3028 shares = calc_group_shares(gcfs_rq);
3029 runnable = calc_group_runnable(gcfs_rq, shares);
3030#endif
3031
3032 reweight_entity(cfs_rq_of(se), se, shares, runnable);
3033}
3034
3035#else
3036static inline void update_cfs_group(struct sched_entity *se)
3037{
3038}
3039#endif
3040
3041static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3042{
3043 struct rq *rq = rq_of(cfs_rq);
3044
3045 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
		/*
		 * There are a few boundary cases this might miss but it should
		 * get called often enough that that should (hopefully) not be
		 * a real problem.
		 *
		 * It will not get called when we go idle, because the idle
		 * thread is a different class (!fair), nor will the utilization
		 * number include things like RT tasks.
		 *
		 * As is, the util number is not freq-invariant (we'd have to
		 * implement arch_scale_freq_capacity() for that).
		 *
		 * See cpu_util().
		 */
		cpufreq_update_util(rq, flags);
3061 }
3062}
3063
3064#ifdef CONFIG_SMP
/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static u64 decay_load(u64 val, u64 n)
3070{
3071 unsigned int local_n;
3072
3073 if (unlikely(n > LOAD_AVG_PERIOD * 63))
3074 return 0;
3075
	/* after bounds checking we can collapse to 32-bit */
	local_n = n;

	/*
	 * As y^PERIOD = 1/2, we can combine
	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
	 * With a look-up table which covers y^n (n<PERIOD)
	 *
	 * To achieve constant time decay_load.
	 */
3086 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
3087 val >>= local_n / LOAD_AVG_PERIOD;
3088 local_n %= LOAD_AVG_PERIOD;
3089 }
3090
3091 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
3092 return val;
3093}
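
/*
 * Illustrative example (not from the original source): with LOAD_AVG_PERIOD
 * == 32, decaying a sum by one full period halves it, and in-between values
 * come from the runnable_avg_yN_inv[] table.  Assuming a starting value of
 * 47742 (LOAD_AVG_MAX):
 *
 *	decay_load(47742, 32)  ~= 23871		// y^32 = 1/2
 *	decay_load(47742, 64)  ~= 11935		// y^64 = 1/4
 *	decay_load(47742, 2048) =     0		// more than 63 periods, cut off
 *
 * The shift by n/PERIOD handles the power-of-two part exactly; the table
 * lookup handles the fractional remainder.
 */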
3094
3095static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
3096{
3097 u32 c1, c2, c3 = d3;

	/*
	 * c1 = d1 y^p
	 */
	c1 = decay_load((u64)d1, periods);

	/*
	 *            p-1
	 * c2 = 1024 \Sum y^n
	 *            n=1
	 *
	 *              inf        inf
	 *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
	 *              n=0        n=p
	 */
	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
3114
3115 return c1 + c2 + c3;
3116}
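
/*
 * Illustrative example (not from the original source, approximate values):
 * with periods == 2 the returned contribution is d1*y^2 + 1024*y + d3.
 * Taking d1 = 512, d3 = 256 and y = 0.5^(1/32) ~= 0.9786:
 *
 *	c1 ~= 512 * 0.9576  ~=  490
 *	c2 ~= 1024 * 0.9786 ~= 1002
 *	c3  = 256
 *
 * for a total of roughly 1748, i.e. the older segments count for slightly
 * less than their raw length while the current partial period counts fully.
 */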
3117

/*
 * Accumulate the three separate parts of the sum; d1 the remainder
 * of the last (incomplete) period, d2 the span of full periods and d3
 * the remainder of the (incomplete) current period.
 *
 *           d1          d2           d3
 *           ^           ^            ^
 *           |           |            |
 *         |<->|<----------------->|<--->|
 * ... |---x---|------| ... |------|-----x (now)
 *
 *                           p-1
 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
 *                           n=1
 *
 *    = u y^p +					(Step 1)
 *
 *                     p-1
 *      d1 y^p + 1024 \Sum y^n + d3 y^0		(Step 2)
 *                     n=1
 */
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
	       unsigned long load, unsigned long runnable, int running)
3142{
3143 unsigned long scale_freq, scale_cpu;
3144 u32 contrib = (u32)delta;
3145 u64 periods;
3146
3147 scale_freq = arch_scale_freq_capacity(cpu);
3148 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3149
3150 delta += sa->period_contrib;
3151 periods = delta / 1024;

	/*
	 * Step 1: decay old *_sum if we crossed period boundaries.
	 */
	if (periods) {
3157 sa->load_sum = decay_load(sa->load_sum, periods);
3158 sa->runnable_load_sum =
3159 decay_load(sa->runnable_load_sum, periods);
3160 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
3161
3162
3163
3164
3165 delta %= 1024;
3166 contrib = __accumulate_pelt_segments(periods,
3167 1024 - sa->period_contrib, delta);
3168 }
3169 sa->period_contrib = delta;
3170
3171 contrib = cap_scale(contrib, scale_freq);
3172 if (load)
3173 sa->load_sum += load * contrib;
3174 if (runnable)
3175 sa->runnable_load_sum += runnable * contrib;
3176 if (running)
3177 sa->util_sum += contrib * scale_cpu;
3178
3179 return periods;
3180}
3181

/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *      p0            p1           p2
 *     (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */
static __always_inline int
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
		  unsigned long load, unsigned long runnable, int running)
3213{
3214 u64 delta;
3215
3216 delta = now - sa->last_update_time;

	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
3222 sa->last_update_time = now;
3223 return 0;
3224 }
3225

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	delta >>= 10;
3231 if (!delta)
3232 return 0;
3233
3234 sa->last_update_time += delta << 10;
3235

	/*
	 * running is a subset of runnable (weight) so running can't be set if
	 * runnable is clear. But there are some corner cases where the current
	 * se has been already dequeued but cfs_rq->curr still points to it.
	 * This means that weight will be 0 but not running for a sched_entity
	 * but also for a cfs_rq if the latter becomes idle. As an example,
	 * this happens during idle_balance() which calls
	 * update_blocked_averages().
	 */
	if (!load)
		runnable = running = 0;
3247

	/*
	 * Now we know we crossed measurement unit boundaries. The *_avg
	 * accrues by two steps:
	 *
	 * Step 1: accumulate *_sum since last_update_time. If we haven't
	 * crossed period boundaries, finish.
	 */
	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
3256 return 0;
3257
3258 return 1;
3259}
3260
3261static __always_inline void
3262___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
3263{
3264 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;

	/*
	 * Step 2: update *_avg.
	 */
	sa->load_avg = div_u64(load * sa->load_sum, divider);
3270 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
3271 sa->util_avg = sa->util_sum / divider;
3272}
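
/*
 * Illustrative note (not from the original source): for an entity that has
 * been runnable long enough, *_sum saturates at roughly the divider
 * (LOAD_AVG_MAX - 1024 + period_contrib), so the averages converge towards
 * their natural ceilings, e.g.:
 *
 *	load_avg          -> load (the weight passed in)
 *	runnable_load_avg -> runnable
 *	util_avg          -> SCHED_CAPACITY_SCALE (1024) when always running
 *	                     on a full-capacity CPU at full frequency
 *
 * A task runnable only half the time would instead settle around half of
 * those values.
 */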
3273

/*
 * When a task is dequeued, its estimated utilization should not be updated
 * if its util_avg has not been updated in the meantime.  This flag,
 * stored in the LSB of the enqueued estimate, is used to synchronize
 * util_est updates with util_avg updates.
 */
#define UTIL_AVG_UNCHANGED 0x1
3282
3283static inline void cfs_se_util_change(struct sched_avg *avg)
3284{
3285 unsigned int enqueued;
3286
3287 if (!sched_feat(UTIL_EST))
3288 return;
3289
3290
3291 enqueued = avg->util_est.enqueued;
3292 if (!(enqueued & UTIL_AVG_UNCHANGED))
3293 return;
3294
3295
3296 enqueued &= ~UTIL_AVG_UNCHANGED;
3297 WRITE_ONCE(avg->util_est.enqueued, enqueued);
3298}
3299

/*
 * sched_entity:
 *
 *   task:
 *     se_runnable() == se_weight()
 *
 *   group: [ see update_cfs_group() ]
 *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
 *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
 *
 *   load_sum := runnable_sum
 *   load_avg = se_weight(se) * runnable_avg
 *
 *   runnable_load_sum := runnable_sum
 *   runnable_load_avg = se_runnable(se) * runnable_avg
 *
 * cfs_rq:
 *
 *   load_sum = \Sum se_weight(se) * se->avg.load_sum
 *   load_avg = \Sum se->avg.load_avg
 *
 *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
 *   runnable_load_avg = \Sum se->avg.runnable_load_avg
 */
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
3329{
3330 if (entity_is_task(se))
3331 se->runnable_weight = se->load.weight;
3332
3333 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
3334 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3335 return 1;
3336 }
3337
3338 return 0;
3339}
3340
3341static int
3342__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
3343{
3344 if (entity_is_task(se))
3345 se->runnable_weight = se->load.weight;
3346
3347 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
3348 cfs_rq->curr == se)) {
3349
3350 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3351 cfs_se_util_change(&se->avg);
3352 return 1;
3353 }
3354
3355 return 0;
3356}
3357
3358static int
3359__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
3360{
3361 if (___update_load_sum(now, cpu, &cfs_rq->avg,
3362 scale_load_down(cfs_rq->load.weight),
3363 scale_load_down(cfs_rq->runnable_weight),
3364 cfs_rq->curr != NULL)) {
3365
3366 ___update_load_avg(&cfs_rq->avg, 1, 1);
3367 return 1;
3368 }
3369
3370 return 0;
3371}
3372
3373#ifdef CONFIG_FAIR_GROUP_SCHED
/**
 * update_tg_load_avg - update the tg's load avg
 * @cfs_rq: the cfs_rq whose avg changed
 * @force: update regardless of how small the difference
 *
 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
 * However, because tg->load_avg is a global value there are performance
 * considerations.
 *
 * In order to avoid having to look at the other cfs_rq's, we use a
 * differential update where we store the last value we propagated. This in
 * turn allows skipping updates if the differential is 'small'.
 *
 * Updating tg's load_avg is necessary before update_cfs_share().
 */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3390{
3391 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3392
3393
3394
3395
3396 if (cfs_rq->tg == &root_task_group)
3397 return;
3398
3399 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3400 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3401 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3402 }
3403}
3404

/*
 * Called within set_task_rq() right before setting a task's CPU. The
 * caller only guarantees p->pi_lock is held; no other assumptions,
 * including the state of rq->lock, should be made.
 */
void set_task_rq_fair(struct sched_entity *se,
3411 struct cfs_rq *prev, struct cfs_rq *next)
3412{
3413 u64 p_last_update_time;
3414 u64 n_last_update_time;
3415
3416 if (!sched_feat(ATTACH_AGE_LOAD))
3417 return;
3418
3419
3420
3421
3422
3423
3424
3425
3426 if (!(se->avg.last_update_time && prev))
3427 return;
3428
3429#ifndef CONFIG_64BIT
3430 {
3431 u64 p_last_update_time_copy;
3432 u64 n_last_update_time_copy;
3433
3434 do {
3435 p_last_update_time_copy = prev->load_last_update_time_copy;
3436 n_last_update_time_copy = next->load_last_update_time_copy;
3437
3438 smp_rmb();
3439
3440 p_last_update_time = prev->avg.last_update_time;
3441 n_last_update_time = next->avg.last_update_time;
3442
3443 } while (p_last_update_time != p_last_update_time_copy ||
3444 n_last_update_time != n_last_update_time_copy);
3445 }
3446#else
3447 p_last_update_time = prev->avg.last_update_time;
3448 n_last_update_time = next->avg.last_update_time;
3449#endif
3450 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3451 se->avg.last_update_time = n_last_update_time;
3452}
3453

/*
 * When a sched_entity migrates into or out of a group hierarchy, its
 * contribution has to be propagated to the parent cfs_rq.  The key
 * observation is that a group entity and its group runqueue describe the
 * very same entity, only at different points in the hierarchy, so for the
 * pure running and runnable sums:
 *
 *	ge->avg == grq->avg					(1)
 *
 * For utilization this makes update_tg_cfs_util() trivial: the group
 * entity's util is simply set to the group runqueue's util (modulo their
 * PELT windows not being aligned).
 *
 * The runnable/load side is harder.  A runqueue's load_avg deliberately
 * keeps the weights of (blocked) entities as an integral part of the sum,
 * and "runnable" is not a 0-sum quantity: a runqueue with two tasks that
 * are each runnable 2/3 of the time is itself runnable anywhere between
 * 2/3 and 1, depending on how the tasks overlap.  update_tg_cfs_runnable()
 * therefore approximates: it transfers the group runqueue's propagated
 * runnable sum, clips it against the achievable bounds, and makes sure it
 * is never smaller than the entity's own running contribution.
 */
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3525{
3526 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3527
3528
3529 if (!delta)
3530 return;
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541 se->avg.util_avg = gcfs_rq->avg.util_avg;
3542 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3543
3544
3545 add_positive(&cfs_rq->avg.util_avg, delta);
3546 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3547}
3548
3549static inline void
3550update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3551{
3552 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3553 unsigned long runnable_load_avg, load_avg;
3554 u64 runnable_load_sum, load_sum = 0;
3555 s64 delta_sum;
3556
3557 if (!runnable_sum)
3558 return;
3559
3560 gcfs_rq->prop_runnable_sum = 0;
3561
3562 if (runnable_sum >= 0) {
3563
3564
3565
3566
3567 runnable_sum += se->avg.load_sum;
3568 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3569 } else {
3570
3571
3572
3573
3574 if (scale_load_down(gcfs_rq->load.weight)) {
3575 load_sum = div_s64(gcfs_rq->avg.load_sum,
3576 scale_load_down(gcfs_rq->load.weight));
3577 }
3578
3579
3580 runnable_sum = min(se->avg.load_sum, load_sum);
3581 }
3582
3583
3584
3585
3586
3587
3588 running_sum = se->avg.util_sum /
3589 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3590 runnable_sum = max(runnable_sum, running_sum);
3591
3592 load_sum = (s64)se_weight(se) * runnable_sum;
3593 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3594
3595 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3596 delta_avg = load_avg - se->avg.load_avg;
3597
3598 se->avg.load_sum = runnable_sum;
3599 se->avg.load_avg = load_avg;
3600 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3601 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3602
3603 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3604 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3605 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3606 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3607
3608 se->avg.runnable_load_sum = runnable_sum;
3609 se->avg.runnable_load_avg = runnable_load_avg;
3610
3611 if (se->on_rq) {
3612 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3613 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3614 }
3615}
3616
3617static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3618{
3619 cfs_rq->propagate = 1;
3620 cfs_rq->prop_runnable_sum += runnable_sum;
3621}
3622
3623
3624static inline int propagate_entity_load_avg(struct sched_entity *se)
3625{
3626 struct cfs_rq *cfs_rq, *gcfs_rq;
3627
3628 if (entity_is_task(se))
3629 return 0;
3630
3631 gcfs_rq = group_cfs_rq(se);
3632 if (!gcfs_rq->propagate)
3633 return 0;
3634
3635 gcfs_rq->propagate = 0;
3636
3637 cfs_rq = cfs_rq_of(se);
3638
3639 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3640
3641 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3642 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3643
3644 return 1;
3645}
3646
/*
 * Check if we need to update the load and the utilization of a blocked
 * load_avg.
 */
static inline bool skip_blocked_update(struct sched_entity *se)
3652{
3653 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3654
3655
3656
3657
3658
3659 if (se->avg.load_avg || se->avg.util_avg)
3660 return false;
3661
3662
3663
3664
3665
3666 if (gcfs_rq->propagate)
3667 return false;
3668
3669
3670
3671
3672
3673
3674 return true;
3675}
3676
3677#else
3678
3679static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3680
3681static inline int propagate_entity_load_avg(struct sched_entity *se)
3682{
3683 return 0;
3684}
3685
3686static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3687
3688#endif
3689

/**
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_task()
 * @cfs_rq: cfs_rq to update
 *
 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 * avg. The immediate corollary is that all (fair) tasks must be attached, see
 * post_init_entity_util_avg().
 *
 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
 *
 * Returns true if the load decayed or we removed load.
 *
 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 * call update_tg_load_avg() when this function returns true.
 */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3708{
3709 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3710 struct sched_avg *sa = &cfs_rq->avg;
3711 int decayed = 0;
3712
3713 if (cfs_rq->removed.nr) {
3714 unsigned long r;
3715 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3716
3717 raw_spin_lock(&cfs_rq->removed.lock);
3718 swap(cfs_rq->removed.util_avg, removed_util);
3719 swap(cfs_rq->removed.load_avg, removed_load);
3720 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3721 cfs_rq->removed.nr = 0;
3722 raw_spin_unlock(&cfs_rq->removed.lock);
3723
3724 r = removed_load;
3725 sub_positive(&sa->load_avg, r);
3726 sub_positive(&sa->load_sum, r * divider);
3727
3728 r = removed_util;
3729 sub_positive(&sa->util_avg, r);
3730 sub_positive(&sa->util_sum, r * divider);
3731
3732 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3733
3734 decayed = 1;
3735 }
3736
3737 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
3738
3739#ifndef CONFIG_64BIT
3740 smp_wmb();
3741 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3742#endif
3743
3744 if (decayed)
3745 cfs_rq_util_change(cfs_rq, 0);
3746
3747 return decayed;
3748}
3749

/**
 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 * @cfs_rq: cfs_rq to attach to
 * @se: sched_entity to attach
 * @flags: migration hints
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3759{
3760 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3761
3762
3763
3764
3765
3766
3767
3768
3769 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3770 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3771
3772
3773
3774
3775
3776
3777
3778 se->avg.util_sum = se->avg.util_avg * divider;
3779
3780 se->avg.load_sum = divider;
3781 if (se_weight(se)) {
3782 se->avg.load_sum =
3783 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3784 }
3785
3786 se->avg.runnable_load_sum = se->avg.load_sum;
3787
3788 enqueue_load_avg(cfs_rq, se);
3789 cfs_rq->avg.util_avg += se->avg.util_avg;
3790 cfs_rq->avg.util_sum += se->avg.util_sum;
3791
3792 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3793
3794 cfs_rq_util_change(cfs_rq, flags);
3795}
3796

/**
 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 * @cfs_rq: cfs_rq to detach from
 * @se: sched_entity to detach
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3806{
3807 dequeue_load_avg(cfs_rq, se);
3808 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3809 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3810
3811 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3812
3813 cfs_rq_util_change(cfs_rq, 0);
3814}
3815

/*
 * Optional action to be done while updating the load average
 */
#define UPDATE_TG	0x1
#define SKIP_AGE_LOAD	0x2
#define DO_ATTACH	0x4

/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3825{
3826 u64 now = cfs_rq_clock_task(cfs_rq);
3827 struct rq *rq = rq_of(cfs_rq);
3828 int cpu = cpu_of(rq);
3829 int decayed;
3830
3831
3832
3833
3834
3835 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3836 __update_load_avg_se(now, cpu, cfs_rq, se);
3837
3838 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3839 decayed |= propagate_entity_load_avg(se);
3840
3841 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
		/*
		 * DO_ATTACH means we're here from enqueue_entity().
		 * !last_update_time means we've passed through
		 * migrate_task_rq_fair() indicating we migrated.
		 *
		 * IOW we're enqueueing a task on a new CPU.
		 */
		attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
		update_tg_load_avg(cfs_rq, 0);
3851 update_tg_load_avg(cfs_rq, 0);
3852
3853 } else if (decayed && (flags & UPDATE_TG))
3854 update_tg_load_avg(cfs_rq, 0);
3855}
3856
3857#ifndef CONFIG_64BIT
3858static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3859{
3860 u64 last_update_time_copy;
3861 u64 last_update_time;
3862
3863 do {
3864 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3865 smp_rmb();
3866 last_update_time = cfs_rq->avg.last_update_time;
3867 } while (last_update_time != last_update_time_copy);
3868
3869 return last_update_time;
3870}
3871#else
3872static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3873{
3874 return cfs_rq->avg.last_update_time;
3875}
3876#endif
3877

/*
 * Synchronize entity load avg of dequeued entity without locking
 * the previous rq.
 */
void sync_entity_load_avg(struct sched_entity *se)
3883{
3884 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3885 u64 last_update_time;
3886
3887 last_update_time = cfs_rq_last_update_time(cfs_rq);
3888 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
3889}
3890

/*
 * Task first catches up with cfs_rq, and then subtract
 * itself from the cfs_rq (task must be off the queue now).
 */
void remove_entity_load_avg(struct sched_entity *se)
3896{
3897 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3898 unsigned long flags;
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910 sync_entity_load_avg(se);
3911
3912 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3913 ++cfs_rq->removed.nr;
3914 cfs_rq->removed.util_avg += se->avg.util_avg;
3915 cfs_rq->removed.load_avg += se->avg.load_avg;
3916 cfs_rq->removed.runnable_sum += se->avg.load_sum;
3917 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3918}
3919
3920static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3921{
3922 return cfs_rq->avg.runnable_load_avg;
3923}
3924
3925static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3926{
3927 return cfs_rq->avg.load_avg;
3928}
3929
3930static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3931
3932static inline unsigned long task_util(struct task_struct *p)
3933{
3934 return READ_ONCE(p->se.avg.util_avg);
3935}
3936
3937static inline unsigned long _task_util_est(struct task_struct *p)
3938{
3939 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3940
3941 return max(ue.ewma, ue.enqueued);
3942}
3943
3944static inline unsigned long task_util_est(struct task_struct *p)
3945{
3946 return max(task_util(p), _task_util_est(p));
3947}
3948
3949static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3950 struct task_struct *p)
3951{
3952 unsigned int enqueued;
3953
3954 if (!sched_feat(UTIL_EST))
3955 return;
3956
3957
3958 enqueued = cfs_rq->avg.util_est.enqueued;
3959 enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3960 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3961}
3962

/*
 * Check if a (signed) value is within a specified (unsigned) margin,
 * based on the observation that:
 *
 *	abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 *
 * NOTE: this only works when value + margin < INT_MAX.
 */
static inline bool within_margin(int value, int margin)
3972{
3973 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3974}
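
/*
 * Illustrative check (not from the original source), with margin y = 10:
 *
 *	x =   9: (unsigned)(9 + 9)   = 18 < 19 -> true  (|x| < 10)
 *	x =  -9: (unsigned)(-9 + 9)  =  0 < 19 -> true  (|x| < 10)
 *	x =  10: (unsigned)(10 + 9)  = 19 < 19 -> false
 *	x = -10: (unsigned)(-10 + 9) wraps to UINT_MAX -> false
 *
 * i.e. the single unsigned compare covers both tails of the range.
 */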
3975
3976static void
3977util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3978{
3979 long last_ewma_diff;
3980 struct util_est ue;
3981
3982 if (!sched_feat(UTIL_EST))
3983 return;
3984
3985
3986 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3987 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3988 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3989 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3990

	/*
	 * Skip update of task's estimated utilization when the task has not
	 * yet completed an activation, e.g. being migrated.
	 */
	if (!task_sleep)
		return;
3997

	/*
	 * If the PELT values haven't changed since enqueue time,
	 * skip the util_est update.
	 */
	ue = p->se.avg.util_est;
	if (ue.enqueued & UTIL_AVG_UNCHANGED)
		return;
4005

	/*
	 * Skip update of task's estimated utilization when its EWMA is
	 * already ~1% close to its last activation value.
	 */
	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
	last_ewma_diff = ue.enqueued - ue.ewma;
	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
		return;
4014

	/*
	 * Update Task's estimated utilization
	 *
	 * When *p completes an activation we can consolidate another sample
	 * of the task size. This is done by storing the current PELT value
	 * as ue.enqueued and by using this value to update the Exponential
	 * Weighted Moving Average (EWMA):
	 *
	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
	 *
	 * Where 'w' is the weight of new samples, which is configured to be
	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
	 */
	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
	ue.ewma += last_ewma_diff;
	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4035 WRITE_ONCE(p->se.avg.util_est, ue);
4036}
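
/*
 * Worked example (illustrative, made-up numbers, ignoring the
 * UTIL_AVG_UNCHANGED flag bit): with w = 1/4, a previous ewma of 400 and a
 * new activation sample task_util(p) = 480:
 *
 *	last_ewma_diff = 480 - 400	=   80
 *	ewma          <<= 2		= 1600
 *	ewma           += 80		= 1680
 *	ewma          >>= 2		=  420
 *
 * which matches 0.25 * 480 + 0.75 * 400 = 420, i.e. the estimate moves a
 * quarter of the way towards the latest sample.
 */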
4037
4038#else
4039
4040static inline int
4041update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4042{
4043 return 0;
4044}
4045
4046#define UPDATE_TG 0x0
4047#define SKIP_AGE_LOAD 0x0
4048#define DO_ATTACH 0x0
4049
4050static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
4051{
4052 cfs_rq_util_change(cfs_rq, 0);
4053}
4054
4055static inline void remove_entity_load_avg(struct sched_entity *se) {}
4056
4057static inline void
4058attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
4059static inline void
4060detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4061
4062static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
4063{
4064 return 0;
4065}
4066
4067static inline void
4068util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4069
4070static inline void
4071util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
4072 bool task_sleep) {}
4073
4074#endif
4075
4076static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4077{
4078#ifdef CONFIG_SCHED_DEBUG
4079 s64 d = se->vruntime - cfs_rq->min_vruntime;
4080
4081 if (d < 0)
4082 d = -d;
4083
4084 if (d > 3*sysctl_sched_latency)
4085 schedstat_inc(cfs_rq->nr_spread_over);
4086#endif
4087}
4088
4089static void
4090place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4091{
4092 u64 vruntime = cfs_rq->min_vruntime;
4093

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
4119}
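
/*
 * Illustrative placement (not from the original source), assuming the
 * default sysctl_sched_latency of 6ms and GENTLE_FAIR_SLEEPERS enabled: a
 * task waking from a long sleep is placed at min_vruntime - 3ms (half the
 * latency), so it gets a modest wakeup bonus, while max_vruntime() ensures
 * a task that slept only briefly keeps its own, larger vruntime and gains
 * nothing from sleeping.
 */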
4120
4121static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4122
4123static inline void check_schedstat_required(void)
4124{
4125#ifdef CONFIG_SCHEDSTATS
4126 if (schedstat_enabled())
4127 return;
4128
4129
4130 if (trace_sched_stat_wait_enabled() ||
4131 trace_sched_stat_sleep_enabled() ||
4132 trace_sched_stat_iowait_enabled() ||
4133 trace_sched_stat_blocked_enabled() ||
4134 trace_sched_stat_runtime_enabled()) {
4135 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
4136 "stat_blocked and stat_runtime require the "
4137 "kernel parameter schedstats=enable or "
4138 "kernel.sched_schedstats=1\n");
4139 }
4140#endif
4141}
4142

/*
 * MIGRATION
 *
 *	dequeue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way the vruntime transition between RQs is done when both
 * min_vruntime are up-to-date.
 *
 * WAKEUP (remote)
 *
 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way we don't have the most up-to-date min_vruntime on the originating
 * CPU and an up-to-date min_vruntime on the destination CPU.
 */

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4176{
4177 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4178 bool curr = cfs_rq->curr == se;
4179

	/*
	 * If we're the current task, we must renormalise before calling
	 * update_curr().
	 */
	if (renorm && curr)
		se->vruntime += cfs_rq->min_vruntime;
4186
4187 update_curr(cfs_rq);
4188

	/*
	 * Otherwise, renormalise after, such that we're placed at the current
	 * moment in time, instead of some random moment in the past. Being
	 * placed in the past could significantly boost this task to the
	 * fairness detriment of existing tasks.
	 */
	if (renorm && !curr)
		se->vruntime += cfs_rq->min_vruntime;
4197

	/*
	 * When enqueuing a sched_entity, we must:
	 *   - Update loads to have both entity and cfs_rq synced with now.
	 *   - Add its load to cfs_rq->runnable_avg
	 *   - For group_entity, update its weight to reflect the new share of
	 *     its group cfs_rq
	 *   - Add its new weight to cfs_rq->load.weight
	 */
	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4207 update_cfs_group(se);
4208 enqueue_runnable_load_avg(cfs_rq, se);
4209 account_entity_enqueue(cfs_rq, se);
4210
4211 if (flags & ENQUEUE_WAKEUP)
4212 place_entity(cfs_rq, se, 0);
4213
4214 check_schedstat_required();
4215 update_stats_enqueue(cfs_rq, se, flags);
4216 check_spread(cfs_rq, se);
4217 if (!curr)
4218 __enqueue_entity(cfs_rq, se);
4219 se->on_rq = 1;
4220
4221 if (cfs_rq->nr_running == 1) {
4222 list_add_leaf_cfs_rq(cfs_rq);
4223 check_enqueue_throttle(cfs_rq);
4224 }
4225}
4226
4227static void __clear_buddies_last(struct sched_entity *se)
4228{
4229 for_each_sched_entity(se) {
4230 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4231 if (cfs_rq->last != se)
4232 break;
4233
4234 cfs_rq->last = NULL;
4235 }
4236}
4237
4238static void __clear_buddies_next(struct sched_entity *se)
4239{
4240 for_each_sched_entity(se) {
4241 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4242 if (cfs_rq->next != se)
4243 break;
4244
4245 cfs_rq->next = NULL;
4246 }
4247}
4248
4249static void __clear_buddies_skip(struct sched_entity *se)
4250{
4251 for_each_sched_entity(se) {
4252 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4253 if (cfs_rq->skip != se)
4254 break;
4255
4256 cfs_rq->skip = NULL;
4257 }
4258}
4259
4260static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4261{
4262 if (cfs_rq->last == se)
4263 __clear_buddies_last(se);
4264
4265 if (cfs_rq->next == se)
4266 __clear_buddies_next(se);
4267
4268 if (cfs_rq->skip == se)
4269 __clear_buddies_skip(se);
4270}
4271
4272static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4273
4274static void
4275dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4276{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
4281

	/*
	 * When dequeuing a sched_entity, we must:
	 *   - Update loads to have both entity and cfs_rq synced with now.
	 *   - Subtract its load from the cfs_rq->runnable_avg.
	 *   - Subtract its previous weight from cfs_rq->load.weight.
	 *   - For group entity, update its weight to reflect the new share
	 *     of its group cfs_rq.
	 */
	update_load_avg(cfs_rq, se, UPDATE_TG);
	dequeue_runnable_load_avg(cfs_rq, se);
4292
4293 update_stats_dequeue(cfs_rq, se, flags);
4294
4295 clear_buddies(cfs_rq, se);
4296
4297 if (se != cfs_rq->curr)
4298 __dequeue_entity(cfs_rq, se);
4299 se->on_rq = 0;
4300 account_entity_dequeue(cfs_rq, se);
4301

	/*
	 * Unless @se is going to sleep, normalize its vruntime against this
	 * runqueue's min_vruntime.  The complementary addition happens on the
	 * next enqueue_entity(), which may be on a different cfs_rq with a
	 * different min_vruntime; see the MIGRATION comment above
	 * enqueue_entity().
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;
4310

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);
4313
4314 update_cfs_group(se);
4315

	/*
	 * Now advance min_vruntime if @se was the entity holding it back,
	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
	 * put back on, and if we advance min_vruntime, we'll be placed back
	 * further than we started -- ie. we'll be penalized.
	 */
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
		update_min_vruntime(cfs_rq);
4324}
4325

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4331{
4332 unsigned long ideal_runtime, delta_exec;
4333 struct sched_entity *se;
4334 s64 delta;
4335
4336 ideal_runtime = sched_slice(cfs_rq, curr);
4337 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4338 if (delta_exec > ideal_runtime) {
4339 resched_curr(rq_of(cfs_rq));
4340
4341
4342
4343
4344 clear_buddies(cfs_rq, curr);
4345 return;
4346 }
4347

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;
4355
4356 se = __pick_first_entity(cfs_rq);
4357 delta = curr->vruntime - se->vruntime;
4358
4359 if (delta < 0)
4360 return;
4361
4362 if (delta > ideal_runtime)
4363 resched_curr(rq_of(cfs_rq));
4364}
4365
4366static void
4367set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4368{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it get to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
4377 __dequeue_entity(cfs_rq, se);
4378 update_load_avg(cfs_rq, se, UPDATE_TG);
4379 }
4380
4381 update_stats_curr_start(cfs_rq, se);
4382 cfs_rq->curr = se;
4383

	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. don't track it
	 * when there are only lesser-weight tasks around):
	 */
	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4390 schedstat_set(se->statistics.slice_max,
4391 max((u64)schedstat_val(se->statistics.slice_max),
4392 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4393 }
4394
4395 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4396}
4397
4398static int
4399wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4400

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4410{
4411 struct sched_entity *left = __pick_first_entity(cfs_rq);
4412 struct sched_entity *se;
4413

	/*
	 * If curr is set we have to see if its left of the leftmost entity
	 * still in the tree, provided there isn't anything in the tree at
	 * all.
	 */
	if (!left || (curr && entity_before(curr, left)))
		left = curr;

	se = left; /* ideally we run the leftmost entity */
4422

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
4428 struct sched_entity *second;
4429
4430 if (se == curr) {
4431 second = __pick_first_entity(cfs_rq);
4432 } else {
4433 second = __pick_next_entity(se);
4434 if (!second || (curr && entity_before(curr, second)))
4435 second = curr;
4436 }
4437
4438 if (second && wakeup_preempt_entity(second, left) < 1)
4439 se = second;
4440 }
4441

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;
4447

	/*
	 * Someone really wants this to run. If it's not unfair, run it.
	 */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;
4453
4454 clear_buddies(cfs_rq, se);
4455
4456 return se;
4457}
4458
4459static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4460
4461static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4462{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
4468 update_curr(cfs_rq);
4469
4470
4471 check_cfs_rq_runtime(cfs_rq);
4472
4473 check_spread(cfs_rq, prev);
4474
4475 if (prev->on_rq) {
4476 update_stats_wait_start(cfs_rq, prev);
4477
4478 __enqueue_entity(cfs_rq, prev);
4479
4480 update_load_avg(cfs_rq, prev, 0);
4481 }
4482 cfs_rq->curr = NULL;
4483}
4484
4485static void
4486entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4487{
4488
4489
4490
4491 update_curr(cfs_rq);
4492

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_load_avg(cfs_rq, curr, UPDATE_TG);
	update_cfs_group(curr);
4498
4499#ifdef CONFIG_SCHED_HRTICK
4500
4501
4502
4503
4504 if (queued) {
4505 resched_curr(rq_of(cfs_rq));
4506 return;
4507 }
4508
4509
4510
4511 if (!sched_feat(DOUBLE_TICK) &&
4512 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4513 return;
4514#endif
4515
4516 if (cfs_rq->nr_running > 1)
4517 check_preempt_tick(cfs_rq, curr);
4518}
4519

/**************************************************
 * CFS bandwidth control machinery
 */

#ifdef CONFIG_CFS_BANDWIDTH
4526
4527#ifdef HAVE_JUMP_LABEL
4528static struct static_key __cfs_bandwidth_used;
4529
4530static inline bool cfs_bandwidth_used(void)
4531{
4532 return static_key_false(&__cfs_bandwidth_used);
4533}
4534
4535void cfs_bandwidth_usage_inc(void)
4536{
4537 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4538}
4539
4540void cfs_bandwidth_usage_dec(void)
4541{
4542 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4543}
4544#else
4545static bool cfs_bandwidth_used(void)
4546{
4547 return true;
4548}
4549
4550void cfs_bandwidth_usage_inc(void) {}
4551void cfs_bandwidth_usage_dec(void) {}
4552#endif
4553

/*
 * default period for cfs group bandwidth.
 * default: 0.1s, units: nanoseconds
 */
static inline u64 default_cfs_period(void)
{
	return 100000000ULL;
}
4562
4563static inline u64 sched_cfs_bandwidth_slice(void)
4564{
4565 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4566}
4567

/*
 * Replenish runtime according to assigned quota and update expiration time.
 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
 * additional synchronization around rq->lock.
 *
 * requires cfs_b->lock
 */
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4576{
4577 u64 now;
4578
4579 if (cfs_b->quota == RUNTIME_INF)
4580 return;
4581
4582 now = sched_clock_cpu(smp_processor_id());
4583 cfs_b->runtime = cfs_b->quota;
4584 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4585 cfs_b->expires_seq++;
4586}
4587
4588static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4589{
4590 return &tg->cfs_bandwidth;
4591}
4592

/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4595{
4596 if (unlikely(cfs_rq->throttle_count))
4597 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4598
4599 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4600}
4601

/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4604{
4605 struct task_group *tg = cfs_rq->tg;
4606 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4607 u64 amount = 0, min_amount, expires;
4608 int expires_seq;
4609
	/* note: this is a positive sum as runtime_remaining <= 0 */
	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4612
4613 raw_spin_lock(&cfs_b->lock);
4614 if (cfs_b->quota == RUNTIME_INF)
4615 amount = min_amount;
4616 else {
4617 start_cfs_bandwidth(cfs_b);
4618
4619 if (cfs_b->runtime > 0) {
4620 amount = min(cfs_b->runtime, min_amount);
4621 cfs_b->runtime -= amount;
4622 cfs_b->idle = 0;
4623 }
4624 }
4625 expires_seq = cfs_b->expires_seq;
4626 expires = cfs_b->runtime_expires;
4627 raw_spin_unlock(&cfs_b->lock);
4628
4629 cfs_rq->runtime_remaining += amount;
4630
4631
4632
4633
4634
4635 if (cfs_rq->expires_seq != expires_seq) {
4636 cfs_rq->expires_seq = expires_seq;
4637 cfs_rq->runtime_expires = expires;
4638 }
4639
4640 return cfs_rq->runtime_remaining > 0;
4641}
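
/*
 * Worked example (illustrative, made-up numbers): with the default
 * sysctl_sched_cfs_bandwidth_slice of 5ms and a cfs_rq whose
 * runtime_remaining has just gone to -0.2ms, the slice request is
 * 5ms - (-0.2ms) = 5.2ms.  If the global pool still holds 3ms, only 3ms is
 * handed out, runtime_remaining becomes 2.8ms and the function returns 1;
 * once the pool is empty the return value drops to 0 and the caller
 * throttles the runqueue.
 */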
4642

/*
 * Note: this relies on the synchronization provided by sched_clock and the
 * fact that rq->clock snapshots that value.
 */
static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4648{
4649 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4650
4651
4652 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4653 return;
4654
4655 if (cfs_rq->runtime_remaining < 0)
4656 return;
4657

	/*
	 * If the local deadline has passed we have to consider the
	 * possibility that our sched_clock is 'fast' and the global deadline
	 * has not truly expired.
	 *
	 * Fortunately we can check whether this is the case by checking
	 * whether the global deadline (cfs_b->expires_seq) has advanced.
	 */
	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
		/* extend local deadline, drift is bounded above by 2 ticks */
		cfs_rq->runtime_expires += TICK_NSEC;
	} else {
		/* global deadline is ahead, expiration has passed */
		cfs_rq->runtime_remaining = 0;
	}
4673}
4674
4675static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4676{
	/* dock delta_exec before expiring quota (as it could span periods) */
	cfs_rq->runtime_remaining -= delta_exec;
	expire_cfs_rq_runtime(cfs_rq);
4680
4681 if (likely(cfs_rq->runtime_remaining > 0))
4682 return;
4683

	/*
	 * if we're unable to extend our runtime we resched so that the active
	 * hierarchy can be throttled
	 */
	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
		resched_curr(rq_of(cfs_rq));
4690}
4691
4692static __always_inline
4693void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4694{
4695 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4696 return;
4697
4698 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4699}
4700
4701static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4702{
4703 return cfs_bandwidth_used() && cfs_rq->throttled;
4704}
4705

/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4708{
4709 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4710}
4711

/*
 * Ensure that neither of the group entities corresponding to src_cpu or
 * dest_cpu are members of a throttled hierarchy when performing group
 * load-balance operations.
 */
static inline int throttled_lb_pair(struct task_group *tg,
4718 int src_cpu, int dest_cpu)
4719{
4720 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4721
4722 src_cfs_rq = tg->cfs_rq[src_cpu];
4723 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4724
4725 return throttled_hierarchy(src_cfs_rq) ||
4726 throttled_hierarchy(dest_cfs_rq);
4727}
4728
4729
4730static int tg_unthrottle_up(struct task_group *tg, void *data)
4731{
4732 struct rq *rq = data;
4733 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4734
4735 cfs_rq->throttle_count--;
4736 if (!cfs_rq->throttle_count) {
4737
4738 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4739 cfs_rq->throttled_clock_task;
4740 }
4741
4742 return 0;
4743}
4744
4745static int tg_throttle_down(struct task_group *tg, void *data)
4746{
4747 struct rq *rq = data;
4748 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4749
4750
4751 if (!cfs_rq->throttle_count)
4752 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4753 cfs_rq->throttle_count++;
4754
4755 return 0;
4756}
4757
4758static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4759{
4760 struct rq *rq = rq_of(cfs_rq);
4761 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4762 struct sched_entity *se;
4763 long task_delta, dequeue = 1;
4764 bool empty;
4765
4766 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4767
4768
4769 rcu_read_lock();
4770 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4771 rcu_read_unlock();
4772
4773 task_delta = cfs_rq->h_nr_running;
4774 for_each_sched_entity(se) {
4775 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4776
4777 if (!se->on_rq)
4778 break;
4779
4780 if (dequeue)
4781 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4782 qcfs_rq->h_nr_running -= task_delta;
4783
4784 if (qcfs_rq->load.weight)
4785 dequeue = 0;
4786 }
4787
4788 if (!se)
4789 sub_nr_running(rq, task_delta);
4790
4791 cfs_rq->throttled = 1;
4792 cfs_rq->throttled_clock = rq_clock(rq);
4793 raw_spin_lock(&cfs_b->lock);
4794 empty = list_empty(&cfs_b->throttled_cfs_rq);
4795
4796
4797
4798
4799
4800 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4801

	/*
	 * If we're the first throttled task, make sure the bandwidth
	 * timer is running.
	 */
	if (empty)
		start_cfs_bandwidth(cfs_b);
4808
4809 raw_spin_unlock(&cfs_b->lock);
4810}
4811
4812void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4813{
4814 struct rq *rq = rq_of(cfs_rq);
4815 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4816 struct sched_entity *se;
4817 int enqueue = 1;
4818 long task_delta;
4819
4820 se = cfs_rq->tg->se[cpu_of(rq)];
4821
4822 cfs_rq->throttled = 0;
4823
4824 update_rq_clock(rq);
4825
4826 raw_spin_lock(&cfs_b->lock);
4827 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4828 list_del_rcu(&cfs_rq->throttled_list);
4829 raw_spin_unlock(&cfs_b->lock);
4830
4831
4832 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4833
4834 if (!cfs_rq->load.weight)
4835 return;
4836
4837 task_delta = cfs_rq->h_nr_running;
4838 for_each_sched_entity(se) {
4839 if (se->on_rq)
4840 enqueue = 0;
4841
4842 cfs_rq = cfs_rq_of(se);
4843 if (enqueue)
4844 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4845 cfs_rq->h_nr_running += task_delta;
4846
4847 if (cfs_rq_throttled(cfs_rq))
4848 break;
4849 }
4850
4851 if (!se)
4852 add_nr_running(rq, task_delta);
4853
4854
4855 if (rq->curr == rq->idle && rq->cfs.nr_running)
4856 resched_curr(rq);
4857}
4858
4859static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4860 u64 remaining, u64 expires)
4861{
4862 struct cfs_rq *cfs_rq;
4863 u64 runtime;
4864 u64 starting_runtime = remaining;
4865
4866 rcu_read_lock();
4867 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4868 throttled_list) {
4869 struct rq *rq = rq_of(cfs_rq);
4870 struct rq_flags rf;
4871
4872 rq_lock(rq, &rf);
4873 if (!cfs_rq_throttled(cfs_rq))
4874 goto next;
4875
4876 runtime = -cfs_rq->runtime_remaining + 1;
4877 if (runtime > remaining)
4878 runtime = remaining;
4879 remaining -= runtime;
4880
4881 cfs_rq->runtime_remaining += runtime;
4882 cfs_rq->runtime_expires = expires;
4883
4884
4885 if (cfs_rq->runtime_remaining > 0)
4886 unthrottle_cfs_rq(cfs_rq);
4887
4888next:
4889 rq_unlock(rq, &rf);
4890
4891 if (!remaining)
4892 break;
4893 }
4894 rcu_read_unlock();
4895
4896 return starting_runtime - remaining;
4897}
4898

/*
 * Responsible for refilling a task_group's bandwidth and unthrottling its
 * cfs_rqs as appropriate. If there has been no activity within the last
 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
 * used to track this state.
 */
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4906{
4907 u64 runtime, runtime_expires;
4908 int throttled;

	/* no need to continue the timer with no bandwidth constraint */
	if (cfs_b->quota == RUNTIME_INF)
		goto out_deactivate;
4913
4914 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4915 cfs_b->nr_periods += overrun;

	/*
	 * idle depends on !throttled (for the case of a large deficit), and if
	 * we're going inactive then everything else can be deferred
	 */
	if (cfs_b->idle && !throttled)
		goto out_deactivate;
4923
4924 __refill_cfs_bandwidth_runtime(cfs_b);
4925
4926 if (!throttled) {
4927
4928 cfs_b->idle = 1;
4929 return 0;
4930 }
4931
4932
4933 cfs_b->nr_throttled += overrun;
4934
4935 runtime_expires = cfs_b->runtime_expires;
4936

	/*
	 * This check is repeated as we are holding onto the new bandwidth
	 * while we unthrottle. This can potentially race with an unthrottled
	 * group trying to acquire new bandwidth from the global pool. This
	 * can result in us over-using our runtime if it is all used during
	 * this loop, but only by limited amounts subject to the slack quota.
	 */
	while (throttled && cfs_b->runtime > 0) {
4945 runtime = cfs_b->runtime;
4946 raw_spin_unlock(&cfs_b->lock);
4947
4948 runtime = distribute_cfs_runtime(cfs_b, runtime,
4949 runtime_expires);
4950 raw_spin_lock(&cfs_b->lock);
4951
4952 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4953
4954 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4955 }
4956

	/*
	 * While we are ensured activity in the period following an
	 * unthrottle, this also covers the case in which the new bandwidth is
	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
	 * period/timer to remain active while there are any throttled
	 * entities.)
	 */
	cfs_b->idle = 0;
4964
4965 return 0;
4966
4967out_deactivate:
4968 return 1;
4969}
4970

/* a cfs_rq won't donate quota below this amount */
static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
/* minimum remaining period time to redistribute slack quota */
static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

/*
 * Are we near the end of the current quota period?
 *
 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
 * hrtimer base being cleared by hrtimer_start. In the case of
 * migrate_hrtimers, base is never cleared, so we are fine.
 */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4986{
4987 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4988 u64 remaining;
4989
4990
4991 if (hrtimer_callback_running(refresh_timer))
4992 return 1;
4993
4994
4995 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4996 if (remaining < min_expire)
4997 return 1;
4998
4999 return 0;
5000}
5001
5002static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5003{
5004 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5005
5006
5007 if (runtime_refresh_within(cfs_b, min_left))
5008 return;
5009
5010 hrtimer_start(&cfs_b->slack_timer,
5011 ns_to_ktime(cfs_bandwidth_slack_period),
5012 HRTIMER_MODE_REL);
5013}
5014

/* we know any runtime found here is valid as update_curr() precedes return */
static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5017{
5018 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5019 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5020
5021 if (slack_runtime <= 0)
5022 return;
5023
5024 raw_spin_lock(&cfs_b->lock);
5025 if (cfs_b->quota != RUNTIME_INF &&
5026 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
5027 cfs_b->runtime += slack_runtime;
5028
5029
5030 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5031 !list_empty(&cfs_b->throttled_cfs_rq))
5032 start_cfs_slack_bandwidth(cfs_b);
5033 }
5034 raw_spin_unlock(&cfs_b->lock);
5035
5036
5037 cfs_rq->runtime_remaining -= slack_runtime;
5038}
5039
5040static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5041{
5042 if (!cfs_bandwidth_used())
5043 return;
5044
5045 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5046 return;
5047
5048 __return_cfs_rq_runtime(cfs_rq);
5049}
5050

/*
 * This is done with a timer (instead of inline with bandwidth return) since
 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
 */
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5056{
5057 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5058 u64 expires;
5059
5060
5061 raw_spin_lock(&cfs_b->lock);
5062 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5063 raw_spin_unlock(&cfs_b->lock);
5064 return;
5065 }
5066
5067 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5068 runtime = cfs_b->runtime;
5069
5070 expires = cfs_b->runtime_expires;
5071 raw_spin_unlock(&cfs_b->lock);
5072
5073 if (!runtime)
5074 return;
5075
5076 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
5077
5078 raw_spin_lock(&cfs_b->lock);
5079 if (expires == cfs_b->runtime_expires)
5080 cfs_b->runtime -= min(runtime, cfs_b->runtime);
5081 raw_spin_unlock(&cfs_b->lock);
5082}
5083

/*
 * When a group wakes up we want to make sure that its quota is not already
 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime as update_curr() throttling can not trigger until it's on-rq.
 */
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5090{
5091 if (!cfs_bandwidth_used())
5092 return;
5093
5094
5095 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5096 return;
5097
5098
5099 if (cfs_rq_throttled(cfs_rq))
5100 return;
5101
5102
5103 account_cfs_rq_runtime(cfs_rq, 0);
5104 if (cfs_rq->runtime_remaining <= 0)
5105 throttle_cfs_rq(cfs_rq);
5106}
5107
5108static void sync_throttle(struct task_group *tg, int cpu)
5109{
5110 struct cfs_rq *pcfs_rq, *cfs_rq;
5111
5112 if (!cfs_bandwidth_used())
5113 return;
5114
5115 if (!tg->parent)
5116 return;
5117
5118 cfs_rq = tg->cfs_rq[cpu];
5119 pcfs_rq = tg->parent->cfs_rq[cpu];
5120
5121 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5122 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5123}
5124

/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5127{
5128 if (!cfs_bandwidth_used())
5129 return false;
5130
5131 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5132 return false;
5133

	/*
	 * it's possible for a throttled entity to be forced into a running
	 * state (e.g. set_curr_task), in this case we're finished.
	 */
	if (cfs_rq_throttled(cfs_rq))
		return true;
5140
5141 throttle_cfs_rq(cfs_rq);
5142 return true;
5143}
5144
5145static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5146{
5147 struct cfs_bandwidth *cfs_b =
5148 container_of(timer, struct cfs_bandwidth, slack_timer);
5149
5150 do_sched_cfs_slack_timer(cfs_b);
5151
5152 return HRTIMER_NORESTART;
5153}
5154
5155static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5156{
5157 struct cfs_bandwidth *cfs_b =
5158 container_of(timer, struct cfs_bandwidth, period_timer);
5159 int overrun;
5160 int idle = 0;
5161
5162 raw_spin_lock(&cfs_b->lock);
5163 for (;;) {
5164 overrun = hrtimer_forward_now(timer, cfs_b->period);
5165 if (!overrun)
5166 break;
5167
5168 idle = do_sched_cfs_period_timer(cfs_b, overrun);
5169 }
5170 if (idle)
5171 cfs_b->period_active = 0;
5172 raw_spin_unlock(&cfs_b->lock);
5173
5174 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5175}
5176
5177void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5178{
5179 raw_spin_lock_init(&cfs_b->lock);
5180 cfs_b->runtime = 0;
5181 cfs_b->quota = RUNTIME_INF;
5182 cfs_b->period = ns_to_ktime(default_cfs_period());
5183
5184 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5185 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5186 cfs_b->period_timer.function = sched_cfs_period_timer;
5187 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5188 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5189}
5190
5191static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5192{
5193 cfs_rq->runtime_enabled = 0;
5194 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5195}
5196
5197void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5198{
5199 u64 overrun;
5200
5201 lockdep_assert_held(&cfs_b->lock);
5202
5203 if (cfs_b->period_active)
5204 return;
5205
5206 cfs_b->period_active = 1;
5207 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5208 cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
5209 cfs_b->expires_seq++;
5210 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5211}
5212
5213static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5214{
5215
5216 if (!cfs_b->throttled_cfs_rq.next)
5217 return;
5218
5219 hrtimer_cancel(&cfs_b->period_timer);
5220 hrtimer_cancel(&cfs_b->slack_timer);
5221}
5222

/*
 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
 *
 * The race is harmless, since modifying bandwidth settings of unhooked group
 * bits doesn't do much.
 */

/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
5232{
5233 struct task_group *tg;
5234
5235 lockdep_assert_held(&rq->lock);
5236
5237 rcu_read_lock();
5238 list_for_each_entry_rcu(tg, &task_groups, list) {
5239 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5240 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5241
5242 raw_spin_lock(&cfs_b->lock);
5243 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5244 raw_spin_unlock(&cfs_b->lock);
5245 }
5246 rcu_read_unlock();
5247}
5248

/* cpu offline callback */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5251{
5252 struct task_group *tg;
5253
5254 lockdep_assert_held(&rq->lock);
5255
5256 rcu_read_lock();
5257 list_for_each_entry_rcu(tg, &task_groups, list) {
5258 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5259
5260 if (!cfs_rq->runtime_enabled)
5261 continue;
5262

		/*
		 * clock_task is not advancing so we just need to make sure
		 * there's some valid quota amount
		 */
		cfs_rq->runtime_remaining = 1;
		/*
		 * Offline rq is schedulable till CPU is completely disabled
		 * in take_cpu_down(), so we prevent new cfs throttling here.
		 */
		cfs_rq->runtime_enabled = 0;
5273
5274 if (cfs_rq_throttled(cfs_rq))
5275 unthrottle_cfs_rq(cfs_rq);
5276 }
5277 rcu_read_unlock();
5278}
5279
5280#else
5281static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5282{
5283 return rq_clock_task(rq_of(cfs_rq));
5284}
5285
5286static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5287static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5288static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5289static inline void sync_throttle(struct task_group *tg, int cpu) {}
5290static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5291
5292static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5293{
5294 return 0;
5295}
5296
5297static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5298{
5299 return 0;
5300}
5301
5302static inline int throttled_lb_pair(struct task_group *tg,
5303 int src_cpu, int dest_cpu)
5304{
5305 return 0;
5306}
5307
5308void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5309
5310#ifdef CONFIG_FAIR_GROUP_SCHED
5311static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5312#endif
5313
5314static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5315{
5316 return NULL;
5317}
5318static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5319static inline void update_runtime_enabled(struct rq *rq) {}
5320static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5321
5322#endif
5323

/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_SCHED_HRTICK
5329static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5330{
5331 struct sched_entity *se = &p->se;
5332 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5333
5334 SCHED_WARN_ON(task_rq(p) != rq);
5335
5336 if (rq->cfs.h_nr_running > 1) {
5337 u64 slice = sched_slice(cfs_rq, se);
5338 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5339 s64 delta = slice - ran;
5340
5341 if (delta < 0) {
5342 if (rq->curr == p)
5343 resched_curr(rq);
5344 return;
5345 }
5346 hrtick_start(rq, delta);
5347 }
5348}
5349

/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
static void hrtick_update(struct rq *rq)
5356{
5357 struct task_struct *curr = rq->curr;
5358
5359 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5360 return;
5361
5362 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5363 hrtick_start_fair(rq, curr);
5364}
5365#else
5366static inline void
5367hrtick_start_fair(struct rq *rq, struct task_struct *p)
5368{
5369}
5370
5371static inline void hrtick_update(struct rq *rq)
5372{
5373}
5374#endif
5375

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5383{
5384 struct cfs_rq *cfs_rq;
5385 struct sched_entity *se = &p->se;
5386

	/*
	 * The code below (indirectly) updates schedutil which looks at
	 * the cfs_rq utilization to select a frequency.
	 * Let's add the task's estimated utilization to the cfs_rq's
	 * estimated utilization, before we update schedutil.
	 */
	util_est_enqueue(&rq->cfs, p);
5394

	/*
	 * If in_iowait is set, the code below may not trigger any cpufreq
	 * utilization updates, so do it here explicitly with the IOWAIT flag
	 * passed.
	 */
	if (p->in_iowait)
		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5402
5403 for_each_sched_entity(se) {
5404 if (se->on_rq)
5405 break;
5406 cfs_rq = cfs_rq_of(se);
5407 enqueue_entity(cfs_rq, se, flags);
5408
5409
5410
5411
5412
5413
5414
5415 if (cfs_rq_throttled(cfs_rq))
5416 break;
5417 cfs_rq->h_nr_running++;
5418
5419 flags = ENQUEUE_WAKEUP;
5420 }
5421
5422 for_each_sched_entity(se) {
5423 cfs_rq = cfs_rq_of(se);
5424 cfs_rq->h_nr_running++;
5425
5426 if (cfs_rq_throttled(cfs_rq))
5427 break;
5428
5429 update_load_avg(cfs_rq, se, UPDATE_TG);
5430 update_cfs_group(se);
5431 }
5432
5433 if (!se)
5434 add_nr_running(rq, 1);
5435
5436 hrtick_update(rq);
5437}
5438
5439static void set_next_buddy(struct sched_entity *se);
5440

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5447{
5448 struct cfs_rq *cfs_rq;
5449 struct sched_entity *se = &p->se;
5450 int task_sleep = flags & DEQUEUE_SLEEP;
5451
5452 for_each_sched_entity(se) {
5453 cfs_rq = cfs_rq_of(se);
5454 dequeue_entity(cfs_rq, se, flags);
5455
5456
5457
5458
5459
5460
5461
5462 if (cfs_rq_throttled(cfs_rq))
5463 break;
5464 cfs_rq->h_nr_running--;
5465
		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/* Avoid re-evaluating load for this entity: */
			se = parent_entity(se);
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				set_next_buddy(se);
			break;
		}
5478 flags |= DEQUEUE_SLEEP;
5479 }
5480
5481 for_each_sched_entity(se) {
5482 cfs_rq = cfs_rq_of(se);
5483 cfs_rq->h_nr_running--;
5484
5485 if (cfs_rq_throttled(cfs_rq))
5486 break;
5487
5488 update_load_avg(cfs_rq, se, UPDATE_TG);
5489 update_cfs_group(se);
5490 }
5491
5492 if (!se)
5493 sub_nr_running(rq, 1);
5494
5495 util_est_dequeue(&rq->cfs, p, task_sleep);
5496 hrtick_update(rq);
5497}
5498
5499#ifdef CONFIG_SMP
5500
5501
5502DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5503DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5504
5505#ifdef CONFIG_NO_HZ_COMMON
/*
 * per rq 'load' arrray crap; XXX kill this.
 *
 * The idea is that on nohz-idle entry we remember the last observed load
 * (the "tickless load") and on nohz exit, or on the first tick after idle,
 * we catch up on the decays that were missed while no tick ran.  The
 * degrade_zero_ticks[] and degrade_factor[] tables below pre-compute the
 * per-index decay so that an arbitrary number of missed ticks can be
 * applied in O(log n), see decay_load_missed().
 */
#define DEGRADE_SHIFT		7
5533
5534static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5535static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5536 { 0, 0, 0, 0, 0, 0, 0, 0 },
5537 { 64, 32, 8, 0, 0, 0, 0, 0 },
5538 { 96, 72, 40, 12, 1, 0, 0, 0 },
5539 { 112, 98, 75, 43, 15, 1, 0, 0 },
5540 { 120, 112, 98, 76, 45, 16, 2, 0 }
5541};
5542

/*
 * The exact cpuload calculated at every tick would be:
 *
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
 *
 * If a CPU misses updates for n ticks (as it was idle) and update gets
 * called on the n+1-th tick when the CPU may be busy, then we have:
 *
 *   load_n   = (1 - 1/2^i)^n * load_0
 *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
 *
 * decay_load_missed() below does an efficient calculation of
 *
 *   load' = (1 - 1/2^i)^n * load
 *
 * Because x^(n*m) = (x^n)^m, the decay is decomposed into factors from the
 * pre-computed degrade_factor[] table by walking the binary representation
 * of n.
 */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5550{
5551 int j = 0;
5552
5553 if (!missed_updates)
5554 return load;
5555
5556 if (missed_updates >= degrade_zero_ticks[idx])
5557 return 0;
5558
5559 if (idx == 1)
5560 return load >> missed_updates;
5561
5562 while (missed_updates) {
5563 if (missed_updates % 2)
5564 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5565
5566 missed_updates >>= 1;
5567 j++;
5568 }
5569 return load;
5570}
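
/*
 * Worked example (illustrative): decaying cpu_load[2] (idx == 2, decay
 * factor 3/4 per tick) across 5 missed ticks.  5 = 0b101, so the loop picks
 * the j == 0 and j == 2 entries of degrade_factor[2]:
 *
 *	load = load *  96 >> 7		// (3/4)^1, for the 2^0 bit
 *	load = load *  40 >> 7		// (3/4)^4, for the 2^2 bit
 *
 * For load = 1024 this gives 1024*96/128 = 768, then 768*40/128 = 240,
 * close to the exact 1024 * (3/4)^5 ~= 243; the small error comes from the
 * 7-bit fixed-point factors.
 */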
5571
5572static struct {
5573 cpumask_var_t idle_cpus_mask;
5574 atomic_t nr_cpus;
5575 int has_blocked;
5576 unsigned long next_balance;
5577 unsigned long next_blocked;
5578} nohz ____cacheline_aligned;
5579
5580#endif
5581

/**
 * cpu_load_update - update the rq->cpu_load[] statistics
 * @this_rq: The rq to update statistics for
 * @this_load: The current load
 * @pending_updates: The number of missed updates
 *
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 *
 * This function computes a decaying average per index:
 *
 *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
 *
 * Because of NOHZ it might not get called on every tick which causes it to
 * miss some of these updates.  When ticks are missed the old contribution
 * can simply be decayed,
 *
 *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
 *
 * since the load during the missed (idle) ticks was essentially zero;
 * decay_load_missed() implements exactly that, with a correction for the
 * tickless load that was observed on nohz entry.
 */
static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
			    unsigned long pending_updates)
5619{
5620 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5621 int i, scale;
5622
5623 this_rq->nr_load_updates++;
5624
5625
5626 this_rq->cpu_load[0] = this_load;
5627 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5628 unsigned long old_load, new_load;
5629
5630
5631
5632 old_load = this_rq->cpu_load[i];
5633#ifdef CONFIG_NO_HZ_COMMON
5634 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5635 if (tickless_load) {
5636 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5637
5638
5639
5640
5641
5642 old_load += tickless_load;
5643 }
5644#endif
5645 new_load = this_load;
5646
5647
5648
5649
5650
5651 if (new_load > old_load)
5652 new_load += scale - 1;
5653
5654 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5655 }
5656
5657 sched_avg_update(this_rq);
5658}
5659
5660
5661static unsigned long weighted_cpuload(struct rq *rq)
5662{
5663 return cfs_rq_runnable_load_avg(&rq->cfs);
5664}
5665
5666#ifdef CONFIG_NO_HZ_COMMON
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681static void cpu_load_update_nohz(struct rq *this_rq,
5682 unsigned long curr_jiffies,
5683 unsigned long load)
5684{
5685 unsigned long pending_updates;
5686
5687 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5688 if (pending_updates) {
5689 this_rq->last_load_update_tick = curr_jiffies;
		/*
		 * In the regular NOHZ case the CPU was idle, so the pending
		 * load is zero. In the NOHZ_FULL case the CPU was busy, so
		 * the caller passes in its current weighted load instead.
		 */
5695 cpu_load_update(this_rq, load, pending_updates);
5696 }
5697}
5698
5699
5700
5701
5702
5703static void cpu_load_update_idle(struct rq *this_rq)
5704{
5705
5706
5707
5708 if (weighted_cpuload(this_rq))
5709 return;
5710
5711 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5712}
5713
5714
5715
5716
5717
5718
5719
5720void cpu_load_update_nohz_start(void)
5721{
5722 struct rq *this_rq = this_rq();
5723
5724
5725
5726
5727
5728
5729 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5730}
5731
5732
5733
5734
5735void cpu_load_update_nohz_stop(void)
5736{
5737 unsigned long curr_jiffies = READ_ONCE(jiffies);
5738 struct rq *this_rq = this_rq();
5739 unsigned long load;
5740 struct rq_flags rf;
5741
5742 if (curr_jiffies == this_rq->last_load_update_tick)
5743 return;
5744
5745 load = weighted_cpuload(this_rq);
5746 rq_lock(this_rq, &rf);
5747 update_rq_clock(this_rq);
5748 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5749 rq_unlock(this_rq, &rf);
5750}
5751#else
5752static inline void cpu_load_update_nohz(struct rq *this_rq,
5753 unsigned long curr_jiffies,
5754 unsigned long load) { }
5755#endif
5756
5757static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5758{
5759#ifdef CONFIG_NO_HZ_COMMON
5760
5761 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5762#endif
5763 cpu_load_update(this_rq, load, 1);
5764}
5765
5766
5767
5768
5769void cpu_load_update_active(struct rq *this_rq)
5770{
5771 unsigned long load = weighted_cpuload(this_rq);
5772
5773 if (tick_nohz_tick_stopped())
5774 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5775 else
5776 cpu_load_update_periodic(this_rq, load);
5777}
5778
5779
5780
5781
5782
5783
5784
5785
5786static unsigned long source_load(int cpu, int type)
5787{
5788 struct rq *rq = cpu_rq(cpu);
5789 unsigned long total = weighted_cpuload(rq);
5790
5791 if (type == 0 || !sched_feat(LB_BIAS))
5792 return total;
5793
5794 return min(rq->cpu_load[type-1], total);
5795}
5796
5797
5798
5799
5800
5801static unsigned long target_load(int cpu, int type)
5802{
5803 struct rq *rq = cpu_rq(cpu);
5804 unsigned long total = weighted_cpuload(rq);
5805
5806 if (type == 0 || !sched_feat(LB_BIAS))
5807 return total;
5808
5809 return max(rq->cpu_load[type-1], total);
5810}
5811
5812static unsigned long capacity_of(int cpu)
5813{
5814 return cpu_rq(cpu)->cpu_capacity;
5815}
5816
5817static unsigned long capacity_orig_of(int cpu)
5818{
5819 return cpu_rq(cpu)->cpu_capacity_orig;
5820}
5821
5822static unsigned long cpu_avg_load_per_task(int cpu)
5823{
5824 struct rq *rq = cpu_rq(cpu);
5825 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5826 unsigned long load_avg = weighted_cpuload(rq);
5827
5828 if (nr_running)
5829 return load_avg / nr_running;
5830
5831 return 0;
5832}
5833
5834static void record_wakee(struct task_struct *p)
5835{
5836
5837
5838
5839
5840 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5841 current->wakee_flips >>= 1;
5842 current->wakee_flip_decay_ts = jiffies;
5843 }
5844
5845 if (current->last_wakee != p) {
5846 current->last_wakee = p;
5847 current->wakee_flips++;
5848 }
5849}
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
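/*
 * Detect M:N waker/wakee relationships via the recorded wakee_flips.
 * A high flip count means the waker keeps switching between many
 * different wakees (e.g. a server waking many clients); packing them
 * all onto one LLC via the affine path would overload it, so return 1
 * to skip the affine wakeup. The sd_llc_size factor scales the
 * heuristic with the size of the shared-cache domain.
 */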
5868static int wake_wide(struct task_struct *p)
5869{
5870 unsigned int master = current->wakee_flips;
5871 unsigned int slave = p->wakee_flips;
5872 int factor = this_cpu_read(sd_llc_size);
5873
5874 if (master < slave)
5875 swap(master, slave);
5876 if (slave < factor || master < slave * factor)
5877 return 0;
5878 return 1;
5879}
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
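/*
 * wake_affine_idle() decides between this_cpu and prev_cpu based purely
 * on idle state: if this_cpu is idle and shares cache with prev_cpu,
 * pick whichever of the two is idle (preferring prev_cpu); for a sync
 * wakeup where the waker is the only task on this_cpu, pick this_cpu.
 * Returning nr_cpumask_bits means "no decision", letting wake_affine()
 * fall back to the weight-based comparison.
 */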
5893static int
5894wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5895{
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5909 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5910
5911 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5912 return this_cpu;
5913
5914 return nr_cpumask_bits;
5915}
5916
5917static int
5918wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5919 int this_cpu, int prev_cpu, int sync)
5920{
5921 s64 this_eff_load, prev_eff_load;
5922 unsigned long task_load;
5923
5924 this_eff_load = target_load(this_cpu, sd->wake_idx);
5925
5926 if (sync) {
5927 unsigned long current_load = task_h_load(current);
5928
5929 if (current_load > this_eff_load)
5930 return this_cpu;
5931
5932 this_eff_load -= current_load;
5933 }
5934
5935 task_load = task_h_load(p);
5936
5937 this_eff_load += task_load;
5938 if (sched_feat(WA_BIAS))
5939 this_eff_load *= 100;
5940 this_eff_load *= capacity_of(prev_cpu);
5941
5942 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5943 prev_eff_load -= task_load;
5944 if (sched_feat(WA_BIAS))
5945 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5946 prev_eff_load *= capacity_of(this_cpu);
5947
5948
5949
5950
5951
5952
5953
5954 if (sync)
5955 prev_eff_load += 1;
5956
5957 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5958}
5959
5960static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5961 int this_cpu, int prev_cpu, int sync)
5962{
5963 int target = nr_cpumask_bits;
5964
5965 if (sched_feat(WA_IDLE))
5966 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5967
5968 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5969 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5970
5971 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5972 if (target == nr_cpumask_bits)
5973 return prev_cpu;
5974
5975 schedstat_inc(sd->ttwu_move_affine);
5976 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5977 return target;
5978}
5979
5980static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
5981
5982static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
5983{
5984 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
5985}
5986
5987
5988
5989
5990
5991
5992
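/*
 * find_idlest_group() finds and returns the least busy/most idle group
 * within the sched_domain, comparing groups on runnable load, blocked
 * load (avg_load) and spare CPU capacity, or NULL if the local group is
 * already a suitable target.
 */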
5993static struct sched_group *
5994find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5995 int this_cpu, int sd_flag)
5996{
5997 struct sched_group *idlest = NULL, *group = sd->groups;
5998 struct sched_group *most_spare_sg = NULL;
5999 unsigned long min_runnable_load = ULONG_MAX;
6000 unsigned long this_runnable_load = ULONG_MAX;
6001 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6002 unsigned long most_spare = 0, this_spare = 0;
6003 int load_idx = sd->forkexec_idx;
6004 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6005 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6006 (sd->imbalance_pct-100) / 100;
6007
6008 if (sd_flag & SD_BALANCE_WAKE)
6009 load_idx = sd->wake_idx;
6010
6011 do {
6012 unsigned long load, avg_load, runnable_load;
6013 unsigned long spare_cap, max_spare_cap;
6014 int local_group;
6015 int i;
6016
6017
6018 if (!cpumask_intersects(sched_group_span(group),
6019 &p->cpus_allowed))
6020 continue;
6021
6022 local_group = cpumask_test_cpu(this_cpu,
6023 sched_group_span(group));
6024
6025
6026
6027
6028
6029 avg_load = 0;
6030 runnable_load = 0;
6031 max_spare_cap = 0;
6032
6033 for_each_cpu(i, sched_group_span(group)) {
6034
6035 if (local_group)
6036 load = source_load(i, load_idx);
6037 else
6038 load = target_load(i, load_idx);
6039
6040 runnable_load += load;
6041
6042 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6043
6044 spare_cap = capacity_spare_wake(i, p);
6045
6046 if (spare_cap > max_spare_cap)
6047 max_spare_cap = spare_cap;
6048 }
6049
6050
6051 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6052 group->sgc->capacity;
6053 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6054 group->sgc->capacity;
6055
6056 if (local_group) {
6057 this_runnable_load = runnable_load;
6058 this_avg_load = avg_load;
6059 this_spare = max_spare_cap;
6060 } else {
6061 if (min_runnable_load > (runnable_load + imbalance)) {
				/*
				 * The runnable load is significantly smaller
				 * so we can pick this new CPU:
				 */
6066 min_runnable_load = runnable_load;
6067 min_avg_load = avg_load;
6068 idlest = group;
6069 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6070 (100*min_avg_load > imbalance_scale*avg_load)) {
				/*
				 * The runnable loads are close, so take the
				 * blocked load into account through avg_load:
				 */
6075 min_avg_load = avg_load;
6076 idlest = group;
6077 }
6078
6079 if (most_spare < max_spare_cap) {
6080 most_spare = max_spare_cap;
6081 most_spare_sg = group;
6082 }
6083 }
6084 } while (group = group->next, group != sd->groups);
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097 if (sd_flag & SD_BALANCE_FORK)
6098 goto skip_spare;
6099
6100 if (this_spare > task_util(p) / 2 &&
6101 imbalance_scale*this_spare > 100*most_spare)
6102 return NULL;
6103
6104 if (most_spare > task_util(p) / 2)
6105 return most_spare_sg;
6106
6107skip_spare:
6108 if (!idlest)
6109 return NULL;
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119 if ((sd->flags & SD_NUMA) &&
6120 min_runnable_load + imbalance >= this_runnable_load)
6121 return NULL;
6122
6123 if (min_runnable_load > (this_runnable_load + imbalance))
6124 return NULL;
6125
6126 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
6127 (100*this_avg_load < imbalance_scale*min_avg_load))
6128 return NULL;
6129
6130 return idlest;
6131}
6132
6133
6134
6135
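/*
 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
 * Idle CPUs are ranked by shallowest C-state exit latency (ties broken by
 * most recent idle timestamp); if none is idle, fall back to the least
 * loaded CPU.
 */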
6136static int
6137find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
6138{
6139 unsigned long load, min_load = ULONG_MAX;
6140 unsigned int min_exit_latency = UINT_MAX;
6141 u64 latest_idle_timestamp = 0;
6142 int least_loaded_cpu = this_cpu;
6143 int shallowest_idle_cpu = -1;
6144 int i;
6145
6146
6147 if (group->group_weight == 1)
6148 return cpumask_first(sched_group_span(group));
6149
6150
6151 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
6152 if (available_idle_cpu(i)) {
6153 struct rq *rq = cpu_rq(i);
6154 struct cpuidle_state *idle = idle_get_state(rq);
6155 if (idle && idle->exit_latency < min_exit_latency) {
6156
6157
6158
6159
6160
6161 min_exit_latency = idle->exit_latency;
6162 latest_idle_timestamp = rq->idle_stamp;
6163 shallowest_idle_cpu = i;
6164 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6165 rq->idle_stamp > latest_idle_timestamp) {
6166
6167
6168
6169
6170
6171 latest_idle_timestamp = rq->idle_stamp;
6172 shallowest_idle_cpu = i;
6173 }
6174 } else if (shallowest_idle_cpu == -1) {
6175 load = weighted_cpuload(cpu_rq(i));
6176 if (load < min_load) {
6177 min_load = load;
6178 least_loaded_cpu = i;
6179 }
6180 }
6181 }
6182
6183 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
6184}
6185
6186static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6187 int cpu, int prev_cpu, int sd_flag)
6188{
6189 int new_cpu = cpu;
6190
6191 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6192 return prev_cpu;
6193
6194
6195
6196
6197
6198 if (!(sd_flag & SD_BALANCE_FORK))
6199 sync_entity_load_avg(&p->se);
6200
6201 while (sd) {
6202 struct sched_group *group;
6203 struct sched_domain *tmp;
6204 int weight;
6205
6206 if (!(sd->flags & sd_flag)) {
6207 sd = sd->child;
6208 continue;
6209 }
6210
6211 group = find_idlest_group(sd, p, cpu, sd_flag);
6212 if (!group) {
6213 sd = sd->child;
6214 continue;
6215 }
6216
6217 new_cpu = find_idlest_group_cpu(group, p, cpu);
6218 if (new_cpu == cpu) {
6219
6220 sd = sd->child;
6221 continue;
6222 }
6223
6224
6225 cpu = new_cpu;
6226 weight = sd->span_weight;
6227 sd = NULL;
6228 for_each_domain(cpu, tmp) {
6229 if (weight <= tmp->span_weight)
6230 break;
6231 if (tmp->flags & sd_flag)
6232 sd = tmp;
6233 }
6234 }
6235
6236 return new_cpu;
6237}
6238
6239#ifdef CONFIG_SCHED_SMT
6240
6241static inline void set_idle_cores(int cpu, int val)
6242{
6243 struct sched_domain_shared *sds;
6244
6245 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6246 if (sds)
6247 WRITE_ONCE(sds->has_idle_cores, val);
6248}
6249
6250static inline bool test_idle_cores(int cpu, bool def)
6251{
6252 struct sched_domain_shared *sds;
6253
6254 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6255 if (sds)
6256 return READ_ONCE(sds->has_idle_cores);
6257
6258 return def;
6259}
6260
6261
6262
6263
6264
6265
6266
6267
6268void __update_idle_core(struct rq *rq)
6269{
6270 int core = cpu_of(rq);
6271 int cpu;
6272
6273 rcu_read_lock();
6274 if (test_idle_cores(core, true))
6275 goto unlock;
6276
6277 for_each_cpu(cpu, cpu_smt_mask(core)) {
6278 if (cpu == core)
6279 continue;
6280
6281 if (!available_idle_cpu(cpu))
6282 goto unlock;
6283 }
6284
6285 set_idle_cores(core, 1);
6286unlock:
6287 rcu_read_unlock();
6288}
6289
6290
6291
6292
6293
6294
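/*
 * Scan the LLC domain for an idle core (all SMT siblings idle). The
 * sd_llc_shared->has_idle_cores hint keeps this cheap: when it is clear
 * the scan is skipped entirely, and it is cleared again here after a
 * full scan finds no idle core.
 */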
6295static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6296{
6297 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6298 int core, cpu;
6299
6300 if (!static_branch_likely(&sched_smt_present))
6301 return -1;
6302
6303 if (!test_idle_cores(target, false))
6304 return -1;
6305
6306 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6307
6308 for_each_cpu_wrap(core, cpus, target) {
6309 bool idle = true;
6310
6311 for_each_cpu(cpu, cpu_smt_mask(core)) {
6312 cpumask_clear_cpu(cpu, cpus);
6313 if (!available_idle_cpu(cpu))
6314 idle = false;
6315 }
6316
6317 if (idle)
6318 return core;
6319 }
6320
6321
6322
6323
6324 set_idle_cores(target, 0);
6325
6326 return -1;
6327}
6328
6329
6330
6331
6332static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6333{
6334 int cpu;
6335
6336 if (!static_branch_likely(&sched_smt_present))
6337 return -1;
6338
6339 for_each_cpu(cpu, cpu_smt_mask(target)) {
6340 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6341 continue;
6342 if (available_idle_cpu(cpu))
6343 return cpu;
6344 }
6345
6346 return -1;
6347}
6348
6349#else
6350
6351static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6352{
6353 return -1;
6354}
6355
6356static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6357{
6358 return -1;
6359}
6360
6361#endif
6362
6363
6364
6365
6366
6367
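/*
 * Scan the LLC domain for an idle CPU; the scan is dynamically limited by
 * the ratio of this CPU's average idle time to the domain's average scan
 * cost (this_sd->avg_scan_cost), so an expensive scan is not attempted
 * when there is little idle time in which to recover it.
 */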
6368static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6369{
6370 struct sched_domain *this_sd;
6371 u64 avg_cost, avg_idle;
6372 u64 time, cost;
6373 s64 delta;
6374 int cpu, nr = INT_MAX;
6375
6376 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6377 if (!this_sd)
6378 return -1;
6379
6380
6381
6382
6383
6384 avg_idle = this_rq()->avg_idle / 512;
6385 avg_cost = this_sd->avg_scan_cost + 1;
6386
6387 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
6388 return -1;
6389
6390 if (sched_feat(SIS_PROP)) {
6391 u64 span_avg = sd->span_weight * avg_idle;
6392 if (span_avg > 4*avg_cost)
6393 nr = div_u64(span_avg, avg_cost);
6394 else
6395 nr = 4;
6396 }
6397
6398 time = local_clock();
6399
6400 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6401 if (!--nr)
6402 return -1;
6403 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6404 continue;
6405 if (available_idle_cpu(cpu))
6406 break;
6407 }
6408
6409 time = local_clock() - time;
6410 cost = this_sd->avg_scan_cost;
6411 delta = (s64)(time - cost) / 8;
6412 this_sd->avg_scan_cost += delta;
6413
6414 return cpu;
6415}
6416
6417
6418
6419
6420static int select_idle_sibling(struct task_struct *p, int prev, int target)
6421{
6422 struct sched_domain *sd;
6423 int i, recent_used_cpu;
6424
6425 if (available_idle_cpu(target))
6426 return target;
6427
	/*
	 * If the previous CPU is cache affine and idle, don't be stupid:
	 */
6431 if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6432 return prev;
6433
	/* Check a recently used CPU as a potential idle candidate: */
6435 recent_used_cpu = p->recent_used_cpu;
6436 if (recent_used_cpu != prev &&
6437 recent_used_cpu != target &&
6438 cpus_share_cache(recent_used_cpu, target) &&
6439 available_idle_cpu(recent_used_cpu) &&
6440 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
		/*
		 * Replace recent_used_cpu with prev as it is a potential
		 * candidate for the next wake:
		 */
6445 p->recent_used_cpu = prev;
6446 return recent_used_cpu;
6447 }
6448
6449 sd = rcu_dereference(per_cpu(sd_llc, target));
6450 if (!sd)
6451 return target;
6452
6453 i = select_idle_core(p, sd, target);
6454 if ((unsigned)i < nr_cpumask_bits)
6455 return i;
6456
6457 i = select_idle_cpu(p, sd, target);
6458 if ((unsigned)i < nr_cpumask_bits)
6459 return i;
6460
6461 i = select_idle_smt(p, sd, target);
6462 if ((unsigned)i < nr_cpumask_bits)
6463 return i;
6464
6465 return target;
6466}
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
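/*
 * cpu_util() returns the amount of capacity of a CPU that is used by CFS
 * tasks, based on the PELT util_avg of the root cfs_rq (boosted by the
 * util_est enqueued estimate when UTIL_EST is enabled) and clamped to the
 * CPU's original capacity, since blocked contributions can transiently
 * exceed it.
 */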
6506static inline unsigned long cpu_util(int cpu)
6507{
6508 struct cfs_rq *cfs_rq;
6509 unsigned int util;
6510
6511 cfs_rq = &cpu_rq(cpu)->cfs;
6512 util = READ_ONCE(cfs_rq->avg.util_avg);
6513
6514 if (sched_feat(UTIL_EST))
6515 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6516
6517 return min_t(unsigned long, util, capacity_orig_of(cpu));
6518}
6519
6520
6521
6522
6523
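/*
 * cpu_util_wake: compute CPU utilization with any contribution of the
 * waking task p removed; used to estimate how much spare capacity the
 * CPU would have if p ran elsewhere.
 */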
6524static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
6525{
6526 struct cfs_rq *cfs_rq;
6527 unsigned int util;
6528
	/* Task has no contribution on this CPU, or is brand new: */
6530 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6531 return cpu_util(cpu);
6532
6533 cfs_rq = &cpu_rq(cpu)->cfs;
6534 util = READ_ONCE(cfs_rq->avg.util_avg);
6535
	/* Discount task's blocked util from CPU's util: */
6537 util -= min_t(unsigned int, util, task_util(p));
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565 if (sched_feat(UTIL_EST))
6566 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6567
6568
6569
6570
6571
6572
6573 return min_t(unsigned long, util, capacity_orig_of(cpu));
6574}
6575
6576
6577
6578
6579
6580
6581
6582
6583static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6584{
6585 long min_cap, max_cap;
6586
6587 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6588 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6589
6590
6591 if (max_cap - min_cap < max_cap >> 3)
6592 return 0;
6593
6594
6595 sync_entity_load_avg(&p->se);
6596
6597 return min_cap * 1024 < task_util(p) * capacity_margin;
6598}
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
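/*
 * select_task_rq_fair: Select the target runqueue for the waking task in
 * domains that have the relevant SD flag set (SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, SD_BALANCE_EXEC). For a wakeup this balances between
 * the fast path (wake_affine + select_idle_sibling) and the slow path
 * (find_idlest_cpu). Returns the target CPU number; the caller must have
 * preemption disabled.
 */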
6612static int
6613select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6614{
6615 struct sched_domain *tmp, *sd = NULL;
6616 int cpu = smp_processor_id();
6617 int new_cpu = prev_cpu;
6618 int want_affine = 0;
6619 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6620
6621 if (sd_flag & SD_BALANCE_WAKE) {
6622 record_wakee(p);
6623 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
6624 && cpumask_test_cpu(cpu, &p->cpus_allowed);
6625 }
6626
6627 rcu_read_lock();
6628 for_each_domain(cpu, tmp) {
6629 if (!(tmp->flags & SD_LOAD_BALANCE))
6630 break;
6631
6632
6633
6634
6635
6636 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6637 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6638 if (cpu != prev_cpu)
6639 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6640
6641 sd = NULL;
6642 break;
6643 }
6644
6645 if (tmp->flags & sd_flag)
6646 sd = tmp;
6647 else if (!want_affine)
6648 break;
6649 }
6650
6651 if (unlikely(sd)) {
6652
6653 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6654 } else if (sd_flag & SD_BALANCE_WAKE) {
6655
6656
6657 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6658
6659 if (want_affine)
6660 current->recent_used_cpu = cpu;
6661 }
6662 rcu_read_unlock();
6663
6664 return new_cpu;
6665}
6666
6667static void detach_entity_cfs_rq(struct sched_entity *se);
6668
6669
6670
6671
6672
6673
6674static void migrate_task_rq_fair(struct task_struct *p)
6675{
	/*
	 * As blocked tasks retain absolute vruntime, the migration needs to
	 * deal with this by subtracting the old and adding the new
	 * min_vruntime -- the latter is done by enqueue_entity() when placing
	 * the task on the new runqueue.
	 */
6682 if (p->state == TASK_WAKING) {
6683 struct sched_entity *se = &p->se;
6684 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6685 u64 min_vruntime;
6686
6687#ifndef CONFIG_64BIT
6688 u64 min_vruntime_copy;
6689
6690 do {
6691 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6692 smp_rmb();
6693 min_vruntime = cfs_rq->min_vruntime;
6694 } while (min_vruntime != min_vruntime_copy);
6695#else
6696 min_vruntime = cfs_rq->min_vruntime;
6697#endif
6698
6699 se->vruntime -= min_vruntime;
6700 }
6701
6702 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
		/*
		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
		 * rq->lock and can modify state directly.
		 */
6707 lockdep_assert_held(&task_rq(p)->lock);
6708 detach_entity_cfs_rq(&p->se);
6709
6710 } else {
		/*
		 * We are supposed to update the task to "current" time, then
		 * it is up to date and ready to go to the new CPU/cfs_rq. But
		 * we have difficulty in getting what current time is, so
		 * simply throw away the out-of-date time. This will result in
		 * the wakee task being less decayed, but giving the wakee
		 * more load sounds not bad.
		 */
6719 remove_entity_load_avg(&p->se);
6720 }
6721
	/* Tell the new CPU we are migrated: */
6723 p->se.avg.last_update_time = 0;
6724
	/* We have migrated, no longer consider this task hot: */
6726 p->se.exec_start = 0;
6727}
6728
6729static void task_dead_fair(struct task_struct *p)
6730{
6731 remove_entity_load_avg(&p->se);
6732}
6733#endif
6734
6735static unsigned long wakeup_gran(struct sched_entity *se)
6736{
6737 unsigned long gran = sysctl_sched_wakeup_granularity;
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752 return calc_delta_fair(gran, se);
6753}
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769static int
6770wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6771{
6772 s64 gran, vdiff = curr->vruntime - se->vruntime;
6773
6774 if (vdiff <= 0)
6775 return -1;
6776
6777 gran = wakeup_gran(se);
6778 if (vdiff > gran)
6779 return 1;
6780
6781 return 0;
6782}
6783
6784static void set_last_buddy(struct sched_entity *se)
6785{
6786 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6787 return;
6788
6789 for_each_sched_entity(se) {
6790 if (SCHED_WARN_ON(!se->on_rq))
6791 return;
6792 cfs_rq_of(se)->last = se;
6793 }
6794}
6795
6796static void set_next_buddy(struct sched_entity *se)
6797{
6798 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6799 return;
6800
6801 for_each_sched_entity(se) {
6802 if (SCHED_WARN_ON(!se->on_rq))
6803 return;
6804 cfs_rq_of(se)->next = se;
6805 }
6806}
6807
6808static void set_skip_buddy(struct sched_entity *se)
6809{
6810 for_each_sched_entity(se)
6811 cfs_rq_of(se)->skip = se;
6812}
6813
6814
6815
6816
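/*
 * Preempt the current task with a newly woken task if needed:
 */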
6817static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6818{
6819 struct task_struct *curr = rq->curr;
6820 struct sched_entity *se = &curr->se, *pse = &p->se;
6821 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6822 int scale = cfs_rq->nr_running >= sched_nr_latency;
6823 int next_buddy_marked = 0;
6824
6825 if (unlikely(se == pse))
6826 return;
6827
6828
6829
6830
6831
6832
6833
6834 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6835 return;
6836
6837 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6838 set_next_buddy(pse);
6839 next_buddy_marked = 1;
6840 }
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852 if (test_tsk_need_resched(curr))
6853 return;
6854
6855
6856 if (unlikely(curr->policy == SCHED_IDLE) &&
6857 likely(p->policy != SCHED_IDLE))
6858 goto preempt;
6859
6860
6861
6862
6863
6864 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6865 return;
6866
6867 find_matching_se(&se, &pse);
6868 update_curr(cfs_rq_of(se));
6869 BUG_ON(!pse);
6870 if (wakeup_preempt_entity(se, pse) == 1) {
6871
6872
6873
6874
6875 if (!next_buddy_marked)
6876 set_next_buddy(pse);
6877 goto preempt;
6878 }
6879
6880 return;
6881
6882preempt:
6883 resched_curr(rq);
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893 if (unlikely(!se->on_rq || curr == rq->idle))
6894 return;
6895
6896 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6897 set_last_buddy(se);
6898}
6899
6900static struct task_struct *
6901pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6902{
6903 struct cfs_rq *cfs_rq = &rq->cfs;
6904 struct sched_entity *se;
6905 struct task_struct *p;
6906 int new_tasks;
6907
6908again:
6909 if (!cfs_rq->nr_running)
6910 goto idle;
6911
6912#ifdef CONFIG_FAIR_GROUP_SCHED
6913 if (prev->sched_class != &fair_sched_class)
6914 goto simple;
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924 do {
6925 struct sched_entity *curr = cfs_rq->curr;
6926
6927
6928
6929
6930
6931
6932
6933 if (curr) {
6934 if (curr->on_rq)
6935 update_curr(cfs_rq);
6936 else
6937 curr = NULL;
6938
6939
6940
6941
6942
6943
6944
6945 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6946 cfs_rq = &rq->cfs;
6947
6948 if (!cfs_rq->nr_running)
6949 goto idle;
6950
6951 goto simple;
6952 }
6953 }
6954
6955 se = pick_next_entity(cfs_rq, curr);
6956 cfs_rq = group_cfs_rq(se);
6957 } while (cfs_rq);
6958
6959 p = task_of(se);
6960
6961
6962
6963
6964
6965
6966 if (prev != p) {
6967 struct sched_entity *pse = &prev->se;
6968
6969 while (!(cfs_rq = is_same_group(se, pse))) {
6970 int se_depth = se->depth;
6971 int pse_depth = pse->depth;
6972
6973 if (se_depth <= pse_depth) {
6974 put_prev_entity(cfs_rq_of(pse), pse);
6975 pse = parent_entity(pse);
6976 }
6977 if (se_depth >= pse_depth) {
6978 set_next_entity(cfs_rq_of(se), se);
6979 se = parent_entity(se);
6980 }
6981 }
6982
6983 put_prev_entity(cfs_rq, pse);
6984 set_next_entity(cfs_rq, se);
6985 }
6986
6987 goto done;
6988simple:
6989#endif
6990
6991 put_prev_task(rq, prev);
6992
6993 do {
6994 se = pick_next_entity(cfs_rq, NULL);
6995 set_next_entity(cfs_rq, se);
6996 cfs_rq = group_cfs_rq(se);
6997 } while (cfs_rq);
6998
6999 p = task_of(se);
7000
7001done: __maybe_unused;
7002#ifdef CONFIG_SMP
7003
7004
7005
7006
7007
7008 list_move(&p->se.group_node, &rq->cfs_tasks);
7009#endif
7010
7011 if (hrtick_enabled(rq))
7012 hrtick_start_fair(rq, p);
7013
7014 return p;
7015
7016idle:
7017 new_tasks = idle_balance(rq, rf);
7018
7019
7020
7021
7022
7023
7024 if (new_tasks < 0)
7025 return RETRY_TASK;
7026
7027 if (new_tasks > 0)
7028 goto again;
7029
7030 return NULL;
7031}
7032
7033
7034
7035
7036static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7037{
7038 struct sched_entity *se = &prev->se;
7039 struct cfs_rq *cfs_rq;
7040
7041 for_each_sched_entity(se) {
7042 cfs_rq = cfs_rq_of(se);
7043 put_prev_entity(cfs_rq, se);
7044 }
7045}
7046
7047
7048
7049
7050
7051
7052static void yield_task_fair(struct rq *rq)
7053{
7054 struct task_struct *curr = rq->curr;
7055 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7056 struct sched_entity *se = &curr->se;
7057
7058
7059
7060
7061 if (unlikely(rq->nr_running == 1))
7062 return;
7063
7064 clear_buddies(cfs_rq, se);
7065
7066 if (curr->policy != SCHED_BATCH) {
7067 update_rq_clock(rq);
7068
7069
7070
7071 update_curr(cfs_rq);
7072
7073
7074
7075
7076
7077 rq_clock_skip_update(rq);
7078 }
7079
7080 set_skip_buddy(se);
7081}
7082
7083static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7084{
7085 struct sched_entity *se = &p->se;
7086
7087
7088 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7089 return false;
7090
7091
7092 set_next_buddy(se);
7093
7094 yield_task_fair(rq);
7095
7096 return true;
7097}
7098
7099#ifdef CONFIG_SMP
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7219
7220enum fbq_type { regular, remote, all };
7221
7222#define LBF_ALL_PINNED 0x01
7223#define LBF_NEED_BREAK 0x02
7224#define LBF_DST_PINNED 0x04
7225#define LBF_SOME_PINNED 0x08
7226#define LBF_NOHZ_STATS 0x10
7227#define LBF_NOHZ_AGAIN 0x20
7228
7229struct lb_env {
7230 struct sched_domain *sd;
7231
7232 struct rq *src_rq;
7233 int src_cpu;
7234
7235 int dst_cpu;
7236 struct rq *dst_rq;
7237
7238 struct cpumask *dst_grpmask;
7239 int new_dst_cpu;
7240 enum cpu_idle_type idle;
7241 long imbalance;
7242
7243 struct cpumask *cpus;
7244
7245 unsigned int flags;
7246
7247 unsigned int loop;
7248 unsigned int loop_break;
7249 unsigned int loop_max;
7250
7251 enum fbq_type fbq_type;
7252 struct list_head tasks;
7253};
7254
7255
7256
7257
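/*
 * task_hot - is the task likely cache-hot on its current CPU? A task is
 * considered hot if it ran recently (within sysctl_sched_migration_cost)
 * or if it is the next/last buddy while the destination is busy.
 */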
7258static int task_hot(struct task_struct *p, struct lb_env *env)
7259{
7260 s64 delta;
7261
7262 lockdep_assert_held(&env->src_rq->lock);
7263
7264 if (p->sched_class != &fair_sched_class)
7265 return 0;
7266
7267 if (unlikely(p->policy == SCHED_IDLE))
7268 return 0;
7269
7270
7271
7272
7273 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7274 (&p->se == cfs_rq_of(&p->se)->next ||
7275 &p->se == cfs_rq_of(&p->se)->last))
7276 return 1;
7277
7278 if (sysctl_sched_migration_cost == -1)
7279 return 1;
7280 if (sysctl_sched_migration_cost == 0)
7281 return 0;
7282
7283 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7284
7285 return delta < (s64)sysctl_sched_migration_cost;
7286}
7287
7288#ifdef CONFIG_NUMA_BALANCING
7289
7290
7291
7292
7293
7294static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7295{
7296 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7297 unsigned long src_faults, dst_faults;
7298 int src_nid, dst_nid;
7299
7300 if (!static_branch_likely(&sched_numa_balancing))
7301 return -1;
7302
7303 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7304 return -1;
7305
7306 src_nid = cpu_to_node(env->src_cpu);
7307 dst_nid = cpu_to_node(env->dst_cpu);
7308
7309 if (src_nid == dst_nid)
7310 return -1;
7311
7312
7313 if (src_nid == p->numa_preferred_nid) {
7314 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7315 return 1;
7316 else
7317 return -1;
7318 }
7319
7320
7321 if (dst_nid == p->numa_preferred_nid)
7322 return 0;
7323
7324
7325 if (env->idle != CPU_NOT_IDLE)
7326 return -1;
7327
7328 if (numa_group) {
7329 src_faults = group_faults(p, src_nid);
7330 dst_faults = group_faults(p, dst_nid);
7331 } else {
7332 src_faults = task_faults(p, src_nid);
7333 dst_faults = task_faults(p, dst_nid);
7334 }
7335
7336 return dst_faults < src_faults;
7337}
7338
7339#else
7340static inline int migrate_degrades_locality(struct task_struct *p,
7341 struct lb_env *env)
7342{
7343 return -1;
7344}
7345#endif
7346
7347
7348
7349
7350static
7351int can_migrate_task(struct task_struct *p, struct lb_env *env)
7352{
7353 int tsk_cache_hot;
7354
7355 lockdep_assert_held(&env->src_rq->lock);
7356
7357
7358
7359
7360
7361
7362
7363
7364 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7365 return 0;
7366
7367 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
7368 int cpu;
7369
7370 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7371
7372 env->flags |= LBF_SOME_PINNED;
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7383 return 0;
7384
7385
7386 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7387 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7388 env->flags |= LBF_DST_PINNED;
7389 env->new_dst_cpu = cpu;
7390 break;
7391 }
7392 }
7393
7394 return 0;
7395 }
7396
7397
7398 env->flags &= ~LBF_ALL_PINNED;
7399
7400 if (task_running(env->src_rq, p)) {
7401 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7402 return 0;
7403 }
7404
7405
7406
7407
7408
7409
7410
7411 tsk_cache_hot = migrate_degrades_locality(p, env);
7412 if (tsk_cache_hot == -1)
7413 tsk_cache_hot = task_hot(p, env);
7414
7415 if (tsk_cache_hot <= 0 ||
7416 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7417 if (tsk_cache_hot == 1) {
7418 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7419 schedstat_inc(p->se.statistics.nr_forced_migrations);
7420 }
7421 return 1;
7422 }
7423
7424 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7425 return 0;
7426}
7427
7428
7429
7430
7431static void detach_task(struct task_struct *p, struct lb_env *env)
7432{
7433 lockdep_assert_held(&env->src_rq->lock);
7434
7435 p->on_rq = TASK_ON_RQ_MIGRATING;
7436 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7437 set_task_cpu(p, env->dst_cpu);
7438}
7439
7440
7441
7442
7443
7444
7445
7446static struct task_struct *detach_one_task(struct lb_env *env)
7447{
7448 struct task_struct *p;
7449
7450 lockdep_assert_held(&env->src_rq->lock);
7451
7452 list_for_each_entry_reverse(p,
7453 &env->src_rq->cfs_tasks, se.group_node) {
7454 if (!can_migrate_task(p, env))
7455 continue;
7456
7457 detach_task(p, env);
7458
7459
7460
7461
7462
7463
7464
7465 schedstat_inc(env->sd->lb_gained[env->idle]);
7466 return p;
7467 }
7468 return NULL;
7469}
7470
7471static const unsigned int sched_nr_migrate_break = 32;
7472
7473
7474
7475
7476
7477
7478
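/*
 * detach_tasks() -- tries to detach up to env->imbalance weighted load
 * from the busiest runqueue, as part of a balance operation within the
 * sched_domain "env->sd".
 *
 * Returns the number of detached tasks; the caller attaches them to
 * env->dst_rq with attach_tasks().
 */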
7479static int detach_tasks(struct lb_env *env)
7480{
7481 struct list_head *tasks = &env->src_rq->cfs_tasks;
7482 struct task_struct *p;
7483 unsigned long load;
7484 int detached = 0;
7485
7486 lockdep_assert_held(&env->src_rq->lock);
7487
7488 if (env->imbalance <= 0)
7489 return 0;
7490
7491 while (!list_empty(tasks)) {
7492
7493
7494
7495
7496 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7497 break;
7498
7499 p = list_last_entry(tasks, struct task_struct, se.group_node);
7500
7501 env->loop++;
7502
7503 if (env->loop > env->loop_max)
7504 break;
7505
7506
7507 if (env->loop > env->loop_break) {
7508 env->loop_break += sched_nr_migrate_break;
7509 env->flags |= LBF_NEED_BREAK;
7510 break;
7511 }
7512
7513 if (!can_migrate_task(p, env))
7514 goto next;
7515
7516 load = task_h_load(p);
7517
7518 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7519 goto next;
7520
7521 if ((load / 2) > env->imbalance)
7522 goto next;
7523
7524 detach_task(p, env);
7525 list_add(&p->se.group_node, &env->tasks);
7526
7527 detached++;
7528 env->imbalance -= load;
7529
7530#ifdef CONFIG_PREEMPT
7531
7532
7533
7534
7535
7536 if (env->idle == CPU_NEWLY_IDLE)
7537 break;
7538#endif
7539
7540
7541
7542
7543
7544 if (env->imbalance <= 0)
7545 break;
7546
7547 continue;
7548next:
7549 list_move(&p->se.group_node, tasks);
7550 }
7551
7552
7553
7554
7555
7556
7557 schedstat_add(env->sd->lb_gained[env->idle], detached);
7558
7559 return detached;
7560}
7561
7562
7563
7564
7565static void attach_task(struct rq *rq, struct task_struct *p)
7566{
7567 lockdep_assert_held(&rq->lock);
7568
7569 BUG_ON(task_rq(p) != rq);
7570 activate_task(rq, p, ENQUEUE_NOCLOCK);
7571 p->on_rq = TASK_ON_RQ_QUEUED;
7572 check_preempt_curr(rq, p, 0);
7573}
7574
7575
7576
7577
7578
7579static void attach_one_task(struct rq *rq, struct task_struct *p)
7580{
7581 struct rq_flags rf;
7582
7583 rq_lock(rq, &rf);
7584 update_rq_clock(rq);
7585 attach_task(rq, p);
7586 rq_unlock(rq, &rf);
7587}
7588
7589
7590
7591
7592
7593static void attach_tasks(struct lb_env *env)
7594{
7595 struct list_head *tasks = &env->tasks;
7596 struct task_struct *p;
7597 struct rq_flags rf;
7598
7599 rq_lock(env->dst_rq, &rf);
7600 update_rq_clock(env->dst_rq);
7601
7602 while (!list_empty(tasks)) {
7603 p = list_first_entry(tasks, struct task_struct, se.group_node);
7604 list_del_init(&p->se.group_node);
7605
7606 attach_task(env->dst_rq, p);
7607 }
7608
7609 rq_unlock(env->dst_rq, &rf);
7610}
7611
7612static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7613{
7614 if (cfs_rq->avg.load_avg)
7615 return true;
7616
7617 if (cfs_rq->avg.util_avg)
7618 return true;
7619
7620 return false;
7621}
7622
7623#ifdef CONFIG_FAIR_GROUP_SCHED
7624
7625static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7626{
7627 if (cfs_rq->load.weight)
7628 return false;
7629
7630 if (cfs_rq->avg.load_sum)
7631 return false;
7632
7633 if (cfs_rq->avg.util_sum)
7634 return false;
7635
7636 if (cfs_rq->avg.runnable_load_sum)
7637 return false;
7638
7639 return true;
7640}
7641
7642static void update_blocked_averages(int cpu)
7643{
7644 struct rq *rq = cpu_rq(cpu);
7645 struct cfs_rq *cfs_rq, *pos;
7646 struct rq_flags rf;
7647 bool done = true;
7648
7649 rq_lock_irqsave(rq, &rf);
7650 update_rq_clock(rq);
7651
7652
7653
7654
7655
7656 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7657 struct sched_entity *se;
7658
7659
7660 if (throttled_hierarchy(cfs_rq))
7661 continue;
7662
7663 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
7664 update_tg_load_avg(cfs_rq, 0);
7665
7666
7667 se = cfs_rq->tg->se[cpu];
7668 if (se && !skip_blocked_update(se))
7669 update_load_avg(cfs_rq_of(se), se, 0);
7670
7671
7672
7673
7674
7675 if (cfs_rq_is_decayed(cfs_rq))
7676 list_del_leaf_cfs_rq(cfs_rq);
7677
7678
7679 if (cfs_rq_has_blocked(cfs_rq))
7680 done = false;
7681 }
7682
7683#ifdef CONFIG_NO_HZ_COMMON
7684 rq->last_blocked_load_update_tick = jiffies;
7685 if (done)
7686 rq->has_blocked_load = 0;
7687#endif
7688 rq_unlock_irqrestore(rq, &rf);
7689}
7690
7691
7692
7693
7694
7695
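/*
 * Compute the hierarchical load factor (h_load) for cfs_rq and all its
 * ascendants. This needs to be done in a top-down fashion because the
 * load of a child group is a fraction of its parent's load.
 */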
7696static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7697{
7698 struct rq *rq = rq_of(cfs_rq);
7699 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7700 unsigned long now = jiffies;
7701 unsigned long load;
7702
7703 if (cfs_rq->last_h_load_update == now)
7704 return;
7705
7706 cfs_rq->h_load_next = NULL;
7707 for_each_sched_entity(se) {
7708 cfs_rq = cfs_rq_of(se);
7709 cfs_rq->h_load_next = se;
7710 if (cfs_rq->last_h_load_update == now)
7711 break;
7712 }
7713
7714 if (!se) {
7715 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7716 cfs_rq->last_h_load_update = now;
7717 }
7718
7719 while ((se = cfs_rq->h_load_next) != NULL) {
7720 load = cfs_rq->h_load;
7721 load = div64_ul(load * se->avg.load_avg,
7722 cfs_rq_load_avg(cfs_rq) + 1);
7723 cfs_rq = group_cfs_rq(se);
7724 cfs_rq->h_load = load;
7725 cfs_rq->last_h_load_update = now;
7726 }
7727}
7728
7729static unsigned long task_h_load(struct task_struct *p)
7730{
7731 struct cfs_rq *cfs_rq = task_cfs_rq(p);
7732
7733 update_cfs_rq_h_load(cfs_rq);
7734 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7735 cfs_rq_load_avg(cfs_rq) + 1);
7736}
7737#else
7738static inline void update_blocked_averages(int cpu)
7739{
7740 struct rq *rq = cpu_rq(cpu);
7741 struct cfs_rq *cfs_rq = &rq->cfs;
7742 struct rq_flags rf;
7743
7744 rq_lock_irqsave(rq, &rf);
7745 update_rq_clock(rq);
7746 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7747#ifdef CONFIG_NO_HZ_COMMON
7748 rq->last_blocked_load_update_tick = jiffies;
7749 if (!cfs_rq_has_blocked(cfs_rq))
7750 rq->has_blocked_load = 0;
7751#endif
7752 rq_unlock_irqrestore(rq, &rf);
7753}
7754
7755static unsigned long task_h_load(struct task_struct *p)
7756{
7757 return p->se.avg.load_avg;
7758}
7759#endif
7760
7761
7762
7763enum group_type {
7764 group_other = 0,
7765 group_imbalanced,
7766 group_overloaded,
7767};
7768
7769
7770
7771
7772struct sg_lb_stats {
7773 unsigned long avg_load;
7774 unsigned long group_load;
7775 unsigned long sum_weighted_load;
7776 unsigned long load_per_task;
7777 unsigned long group_capacity;
7778 unsigned long group_util;
7779 unsigned int sum_nr_running;
7780 unsigned int idle_cpus;
7781 unsigned int group_weight;
7782 enum group_type group_type;
7783 int group_no_capacity;
7784#ifdef CONFIG_NUMA_BALANCING
7785 unsigned int nr_numa_running;
7786 unsigned int nr_preferred_running;
7787#endif
7788};
7789
7790
7791
7792
7793
7794struct sd_lb_stats {
7795 struct sched_group *busiest;
7796 struct sched_group *local;
7797 unsigned long total_running;
7798 unsigned long total_load;
7799 unsigned long total_capacity;
7800 unsigned long avg_load;
7801
7802 struct sg_lb_stats busiest_stat;
7803 struct sg_lb_stats local_stat;
7804};
7805
7806static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7807{
7808
7809
7810
7811
7812
7813
7814 *sds = (struct sd_lb_stats){
7815 .busiest = NULL,
7816 .local = NULL,
7817 .total_running = 0UL,
7818 .total_load = 0UL,
7819 .total_capacity = 0UL,
7820 .busiest_stat = {
7821 .avg_load = 0UL,
7822 .sum_nr_running = 0,
7823 .group_type = group_other,
7824 },
7825 };
7826}
7827
7828
7829
7830
7831
7832
7833
7834
7835static inline int get_sd_load_idx(struct sched_domain *sd,
7836 enum cpu_idle_type idle)
7837{
7838 int load_idx;
7839
7840 switch (idle) {
7841 case CPU_NOT_IDLE:
7842 load_idx = sd->busy_idx;
7843 break;
7844
7845 case CPU_NEWLY_IDLE:
7846 load_idx = sd->newidle_idx;
7847 break;
7848 default:
7849 load_idx = sd->idle_idx;
7850 break;
7851 }
7852
7853 return load_idx;
7854}
7855
7856static unsigned long scale_rt_capacity(int cpu)
7857{
7858 struct rq *rq = cpu_rq(cpu);
7859 u64 total, used, age_stamp, avg;
7860 s64 delta;
7861
7862
7863
7864
7865
7866 age_stamp = READ_ONCE(rq->age_stamp);
7867 avg = READ_ONCE(rq->rt_avg);
7868 delta = __rq_clock_broken(rq) - age_stamp;
7869
7870 if (unlikely(delta < 0))
7871 delta = 0;
7872
7873 total = sched_avg_period() + delta;
7874
7875 used = div_u64(avg, total);
7876
7877 if (likely(used < SCHED_CAPACITY_SCALE))
7878 return SCHED_CAPACITY_SCALE - used;
7879
7880 return 1;
7881}
7882
7883static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7884{
7885 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
7886 struct sched_group *sdg = sd->groups;
7887
7888 cpu_rq(cpu)->cpu_capacity_orig = capacity;
7889
7890 capacity *= scale_rt_capacity(cpu);
7891 capacity >>= SCHED_CAPACITY_SHIFT;
7892
7893 if (!capacity)
7894 capacity = 1;
7895
7896 cpu_rq(cpu)->cpu_capacity = capacity;
7897 sdg->sgc->capacity = capacity;
7898 sdg->sgc->min_capacity = capacity;
7899}
7900
7901void update_group_capacity(struct sched_domain *sd, int cpu)
7902{
7903 struct sched_domain *child = sd->child;
7904 struct sched_group *group, *sdg = sd->groups;
7905 unsigned long capacity, min_capacity;
7906 unsigned long interval;
7907
7908 interval = msecs_to_jiffies(sd->balance_interval);
7909 interval = clamp(interval, 1UL, max_load_balance_interval);
7910 sdg->sgc->next_update = jiffies + interval;
7911
7912 if (!child) {
7913 update_cpu_capacity(sd, cpu);
7914 return;
7915 }
7916
7917 capacity = 0;
7918 min_capacity = ULONG_MAX;
7919
7920 if (child->flags & SD_OVERLAP) {
7921
7922
7923
7924
7925
7926 for_each_cpu(cpu, sched_group_span(sdg)) {
7927 struct sched_group_capacity *sgc;
7928 struct rq *rq = cpu_rq(cpu);
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941 if (unlikely(!rq->sd)) {
7942 capacity += capacity_of(cpu);
7943 } else {
7944 sgc = rq->sd->groups->sgc;
7945 capacity += sgc->capacity;
7946 }
7947
7948 min_capacity = min(capacity, min_capacity);
7949 }
7950 } else {
7951
7952
7953
7954
7955
7956 group = child->groups;
7957 do {
7958 struct sched_group_capacity *sgc = group->sgc;
7959
7960 capacity += sgc->capacity;
7961 min_capacity = min(sgc->min_capacity, min_capacity);
7962 group = group->next;
7963 } while (group != child->groups);
7964 }
7965
7966 sdg->sgc->capacity = capacity;
7967 sdg->sgc->min_capacity = min_capacity;
7968}
7969
7970
7971
7972
7973
7974
7975static inline int
7976check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7977{
7978 return ((rq->cpu_capacity * sd->imbalance_pct) <
7979 (rq->cpu_capacity_orig * 100));
7980}
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011static inline int sg_imbalanced(struct sched_group *group)
8012{
8013 return group->sgc->imbalance;
8014}
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028static inline bool
8029group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8030{
8031 if (sgs->sum_nr_running < sgs->group_weight)
8032 return true;
8033
8034 if ((sgs->group_capacity * 100) >
8035 (sgs->group_util * env->sd->imbalance_pct))
8036 return true;
8037
8038 return false;
8039}
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049static inline bool
8050group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8051{
8052 if (sgs->sum_nr_running <= sgs->group_weight)
8053 return false;
8054
8055 if ((sgs->group_capacity * 100) <
8056 (sgs->group_util * env->sd->imbalance_pct))
8057 return true;
8058
8059 return false;
8060}
8061
8062
8063
8064
8065
8066static inline bool
8067group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8068{
8069 return sg->sgc->min_capacity * capacity_margin <
8070 ref->sgc->min_capacity * 1024;
8071}
8072
8073static inline enum
8074group_type group_classify(struct sched_group *group,
8075 struct sg_lb_stats *sgs)
8076{
8077 if (sgs->group_no_capacity)
8078 return group_overloaded;
8079
8080 if (sg_imbalanced(group))
8081 return group_imbalanced;
8082
8083 return group_other;
8084}
8085
8086static bool update_nohz_stats(struct rq *rq, bool force)
8087{
8088#ifdef CONFIG_NO_HZ_COMMON
8089 unsigned int cpu = rq->cpu;
8090
8091 if (!rq->has_blocked_load)
8092 return false;
8093
8094 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8095 return false;
8096
8097 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8098 return true;
8099
8100 update_blocked_averages(cpu);
8101
8102 return rq->has_blocked_load;
8103#else
8104 return false;
8105#endif
8106}
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117static inline void update_sg_lb_stats(struct lb_env *env,
8118 struct sched_group *group, int load_idx,
8119 int local_group, struct sg_lb_stats *sgs,
8120 bool *overload)
8121{
8122 unsigned long load;
8123 int i, nr_running;
8124
8125 memset(sgs, 0, sizeof(*sgs));
8126
8127 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8128 struct rq *rq = cpu_rq(i);
8129
8130 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8131 env->flags |= LBF_NOHZ_AGAIN;
8132
8133
8134 if (local_group)
8135 load = target_load(i, load_idx);
8136 else
8137 load = source_load(i, load_idx);
8138
8139 sgs->group_load += load;
8140 sgs->group_util += cpu_util(i);
8141 sgs->sum_nr_running += rq->cfs.h_nr_running;
8142
8143 nr_running = rq->nr_running;
8144 if (nr_running > 1)
8145 *overload = true;
8146
8147#ifdef CONFIG_NUMA_BALANCING
8148 sgs->nr_numa_running += rq->nr_numa_running;
8149 sgs->nr_preferred_running += rq->nr_preferred_running;
8150#endif
8151 sgs->sum_weighted_load += weighted_cpuload(rq);
8152
8153
8154
8155 if (!nr_running && idle_cpu(i))
8156 sgs->idle_cpus++;
8157 }
8158
8159
8160 sgs->group_capacity = group->sgc->capacity;
8161 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8162
8163 if (sgs->sum_nr_running)
8164 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8165
8166 sgs->group_weight = group->group_weight;
8167
8168 sgs->group_no_capacity = group_is_overloaded(env, sgs);
8169 sgs->group_type = group_classify(group, sgs);
8170}
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
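/*
 * update_sd_pick_busiest - return true if @sg should become the new
 * busiest group for this statistics pass: it is of a worse group_type,
 * or of equal type but higher avg_load, subject to the ASYM_CPUCAPACITY
 * and ASYM_PACKING exceptions handled below.
 */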
8185static bool update_sd_pick_busiest(struct lb_env *env,
8186 struct sd_lb_stats *sds,
8187 struct sched_group *sg,
8188 struct sg_lb_stats *sgs)
8189{
8190 struct sg_lb_stats *busiest = &sds->busiest_stat;
8191
8192 if (sgs->group_type > busiest->group_type)
8193 return true;
8194
8195 if (sgs->group_type < busiest->group_type)
8196 return false;
8197
8198 if (sgs->avg_load <= busiest->avg_load)
8199 return false;
8200
8201 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
8202 goto asym_packing;
8203
8204
8205
8206
8207
8208
8209
8210 if (sgs->sum_nr_running <= sgs->group_weight &&
8211 group_smaller_cpu_capacity(sds->local, sg))
8212 return false;
8213
8214asym_packing:
8215
8216 if (!(env->sd->flags & SD_ASYM_PACKING))
8217 return true;
8218
8219
8220 if (env->idle == CPU_NOT_IDLE)
8221 return true;
8222
8223
8224
8225
8226
8227 if (sgs->sum_nr_running &&
8228 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
8229 if (!sds->busiest)
8230 return true;
8231
8232
8233 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
8234 sg->asym_prefer_cpu))
8235 return true;
8236 }
8237
8238 return false;
8239}
8240
8241#ifdef CONFIG_NUMA_BALANCING
8242static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8243{
8244 if (sgs->sum_nr_running > sgs->nr_numa_running)
8245 return regular;
8246 if (sgs->sum_nr_running > sgs->nr_preferred_running)
8247 return remote;
8248 return all;
8249}
8250
8251static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8252{
8253 if (rq->nr_running > rq->nr_numa_running)
8254 return regular;
8255 if (rq->nr_running > rq->nr_preferred_running)
8256 return remote;
8257 return all;
8258}
8259#else
8260static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8261{
8262 return all;
8263}
8264
8265static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8266{
8267 return regular;
8268}
8269#endif
8270
8271
8272
8273
8274
8275
8276static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8277{
8278 struct sched_domain *child = env->sd->child;
8279 struct sched_group *sg = env->sd->groups;
8280 struct sg_lb_stats *local = &sds->local_stat;
8281 struct sg_lb_stats tmp_sgs;
8282 int load_idx, prefer_sibling = 0;
8283 bool overload = false;
8284
8285 if (child && child->flags & SD_PREFER_SIBLING)
8286 prefer_sibling = 1;
8287
8288#ifdef CONFIG_NO_HZ_COMMON
8289 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8290 env->flags |= LBF_NOHZ_STATS;
8291#endif
8292
8293 load_idx = get_sd_load_idx(env->sd, env->idle);
8294
8295 do {
8296 struct sg_lb_stats *sgs = &tmp_sgs;
8297 int local_group;
8298
8299 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
8300 if (local_group) {
8301 sds->local = sg;
8302 sgs = local;
8303
8304 if (env->idle != CPU_NEWLY_IDLE ||
8305 time_after_eq(jiffies, sg->sgc->next_update))
8306 update_group_capacity(env->sd, env->dst_cpu);
8307 }
8308
8309 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
8310 &overload);
8311
8312 if (local_group)
8313 goto next_group;
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325 if (prefer_sibling && sds->local &&
8326 group_has_capacity(env, local) &&
8327 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
8328 sgs->group_no_capacity = 1;
8329 sgs->group_type = group_classify(sg, sgs);
8330 }
8331
8332 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8333 sds->busiest = sg;
8334 sds->busiest_stat = *sgs;
8335 }
8336
8337next_group:
8338
8339 sds->total_running += sgs->sum_nr_running;
8340 sds->total_load += sgs->group_load;
8341 sds->total_capacity += sgs->group_capacity;
8342
8343 sg = sg->next;
8344 } while (sg != env->sd->groups);
8345
8346#ifdef CONFIG_NO_HZ_COMMON
8347 if ((env->flags & LBF_NOHZ_AGAIN) &&
8348 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8349
8350 WRITE_ONCE(nohz.next_blocked,
8351 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8352 }
8353#endif
8354
8355 if (env->sd->flags & SD_NUMA)
8356 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8357
8358 if (!env->sd->parent) {
8359
8360 if (env->dst_rq->rd->overload != overload)
8361 env->dst_rq->rd->overload = overload;
8362 }
8363}
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8389{
8390 int busiest_cpu;
8391
8392 if (!(env->sd->flags & SD_ASYM_PACKING))
8393 return 0;
8394
8395 if (env->idle == CPU_NOT_IDLE)
8396 return 0;
8397
8398 if (!sds->busiest)
8399 return 0;
8400
8401 busiest_cpu = sds->busiest->asym_prefer_cpu;
8402 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
8403 return 0;
8404
8405 env->imbalance = DIV_ROUND_CLOSEST(
8406 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
8407 SCHED_CAPACITY_SCALE);
8408
8409 return 1;
8410}
8411
8412
8413
8414
8415
8416
8417
8418
8419static inline
8420void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8421{
8422 unsigned long tmp, capa_now = 0, capa_move = 0;
8423 unsigned int imbn = 2;
8424 unsigned long scaled_busy_load_per_task;
8425 struct sg_lb_stats *local, *busiest;
8426
8427 local = &sds->local_stat;
8428 busiest = &sds->busiest_stat;
8429
8430 if (!local->sum_nr_running)
8431 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8432 else if (busiest->load_per_task > local->load_per_task)
8433 imbn = 1;
8434
8435 scaled_busy_load_per_task =
8436 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8437 busiest->group_capacity;
8438
8439 if (busiest->avg_load + scaled_busy_load_per_task >=
8440 local->avg_load + (scaled_busy_load_per_task * imbn)) {
8441 env->imbalance = busiest->load_per_task;
8442 return;
8443 }
8444
8445
8446
8447
8448
8449
8450
8451 capa_now += busiest->group_capacity *
8452 min(busiest->load_per_task, busiest->avg_load);
8453 capa_now += local->group_capacity *
8454 min(local->load_per_task, local->avg_load);
8455 capa_now /= SCHED_CAPACITY_SCALE;
8456
8457
8458 if (busiest->avg_load > scaled_busy_load_per_task) {
8459 capa_move += busiest->group_capacity *
8460 min(busiest->load_per_task,
8461 busiest->avg_load - scaled_busy_load_per_task);
8462 }
8463
8464
8465 if (busiest->avg_load * busiest->group_capacity <
8466 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8467 tmp = (busiest->avg_load * busiest->group_capacity) /
8468 local->group_capacity;
8469 } else {
8470 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8471 local->group_capacity;
8472 }
8473 capa_move += local->group_capacity *
8474 min(local->load_per_task, local->avg_load + tmp);
8475 capa_move /= SCHED_CAPACITY_SCALE;
8476
8477
8478 if (capa_move > capa_now)
8479 env->imbalance = busiest->load_per_task;
8480}
8481
8482
8483
8484
8485
8486
8487
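/*
 * calculate_imbalance - Calculate the amount of imbalance present within
 * the groups of a given sched_domain: how much weighted load should be
 * moved from the busiest group to the local group, bounded both by how
 * far the busiest group is above the domain average and by how far the
 * local group is below it.
 */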
8488static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8489{
8490 unsigned long max_pull, load_above_capacity = ~0UL;
8491 struct sg_lb_stats *local, *busiest;
8492
8493 local = &sds->local_stat;
8494 busiest = &sds->busiest_stat;
8495
8496 if (busiest->group_type == group_imbalanced) {
8497
8498
8499
8500
8501 busiest->load_per_task =
8502 min(busiest->load_per_task, sds->avg_load);
8503 }
8504
8505
8506
8507
8508
8509
8510
8511 if (busiest->avg_load <= sds->avg_load ||
8512 local->avg_load >= sds->avg_load) {
8513 env->imbalance = 0;
8514 return fix_small_imbalance(env, sds);
8515 }
8516
8517
8518
8519
8520 if (busiest->group_type == group_overloaded &&
8521 local->group_type == group_overloaded) {
8522 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
8523 if (load_above_capacity > busiest->group_capacity) {
8524 load_above_capacity -= busiest->group_capacity;
8525 load_above_capacity *= scale_load_down(NICE_0_LOAD);
8526 load_above_capacity /= busiest->group_capacity;
8527 } else
8528 load_above_capacity = ~0UL;
8529 }
8530
8531
8532
8533
8534
8535
8536
8537
8538 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8539
8540
8541 env->imbalance = min(
8542 max_pull * busiest->group_capacity,
8543 (sds->avg_load - local->avg_load) * local->group_capacity
8544 ) / SCHED_CAPACITY_SCALE;
8545
8546
8547
8548
8549
8550
8551
8552 if (env->imbalance < busiest->load_per_task)
8553 return fix_small_imbalance(env, sds);
8554}
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
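/*
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance worth correcting. If so, env->imbalance is set
 * to the amount of weighted load that should be moved to restore balance;
 * otherwise NULL is returned and env->imbalance is zeroed.
 */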
8569static struct sched_group *find_busiest_group(struct lb_env *env)
8570{
8571 struct sg_lb_stats *local, *busiest;
8572 struct sd_lb_stats sds;
8573
8574 init_sd_lb_stats(&sds);
8575
8576
8577
8578
8579
8580 update_sd_lb_stats(env, &sds);
8581 local = &sds.local_stat;
8582 busiest = &sds.busiest_stat;
8583
8584
8585 if (check_asym_packing(env, &sds))
8586 return sds.busiest;
8587
8588
8589 if (!sds.busiest || busiest->sum_nr_running == 0)
8590 goto out_balanced;
8591
8592
8593 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8594 / sds.total_capacity;
8595
8596
8597
8598
8599
8600
8601 if (busiest->group_type == group_imbalanced)
8602 goto force_balance;
8603
8604
8605
8606
8607
8608 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8609 busiest->group_no_capacity)
8610 goto force_balance;
8611
8612
8613
8614
8615
8616 if (local->avg_load >= busiest->avg_load)
8617 goto out_balanced;
8618
8619
8620
8621
8622
8623 if (local->avg_load >= sds.avg_load)
8624 goto out_balanced;
8625
8626 if (env->idle == CPU_IDLE) {
8627
8628
8629
8630
8631
8632
8633
8634 if ((busiest->group_type != group_overloaded) &&
8635 (local->idle_cpus <= (busiest->idle_cpus + 1)))
8636 goto out_balanced;
8637 } else {
8638
8639
8640
8641
8642 if (100 * busiest->avg_load <=
8643 env->sd->imbalance_pct * local->avg_load)
8644 goto out_balanced;
8645 }
8646
8647force_balance:
8648
8649 calculate_imbalance(env, &sds);
8650 return sds.busiest;
8651
8652out_balanced:
8653 env->imbalance = 0;
8654 return NULL;
8655}
8656
8657
8658
8659
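/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 */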
8660static struct rq *find_busiest_queue(struct lb_env *env,
8661 struct sched_group *group)
8662{
8663 struct rq *busiest = NULL, *rq;
8664 unsigned long busiest_load = 0, busiest_capacity = 1;
8665 int i;
8666
8667 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8668 unsigned long capacity, wl;
8669 enum fbq_type rt;
8670
8671 rq = cpu_rq(i);
8672 rt = fbq_classify_rq(rq);
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693 if (rt > env->fbq_type)
8694 continue;
8695
8696 capacity = capacity_of(i);
8697
8698 wl = weighted_cpuload(rq);
8699
8700
8701
8702
8703
8704
8705 if (rq->nr_running == 1 && wl > env->imbalance &&
8706 !check_cpu_capacity(rq, env->sd))
8707 continue;
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720 if (wl * busiest_capacity > busiest_load * capacity) {
8721 busiest_load = wl;
8722 busiest_capacity = capacity;
8723 busiest = rq;
8724 }
8725 }
8726
8727 return busiest;
8728}
8729
8730
8731
8732
8733
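/*
 * Maximum backoff for sd->balance_interval when the tasks on the busiest
 * runqueue all turn out to be pinned to their CPUs.
 */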
8734#define MAX_PINNED_INTERVAL 512
8735
8736static int need_active_balance(struct lb_env *env)
8737{
8738 struct sched_domain *sd = env->sd;
8739
8740 if (env->idle == CPU_NEWLY_IDLE) {
8741
8742
8743
8744
8745
8746
8747 if ((sd->flags & SD_ASYM_PACKING) &&
8748 sched_asym_prefer(env->dst_cpu, env->src_cpu))
8749 return 1;
8750 }
8751
8752
8753
8754
8755
8756
8757
8758 if ((env->idle != CPU_NOT_IDLE) &&
8759 (env->src_rq->cfs.h_nr_running == 1)) {
8760 if ((check_cpu_capacity(env->src_rq, sd)) &&
8761 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8762 return 1;
8763 }
8764
8765 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8766}
8767
8768static int active_load_balance_cpu_stop(void *data);
8769
8770static int should_we_balance(struct lb_env *env)
8771{
8772 struct sched_group *sg = env->sd->groups;
8773 int cpu, balance_cpu = -1;
8774
8775
8776
8777
8778
8779 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8780 return 0;
8781
8782
8783
8784
8785
8786 if (env->idle == CPU_NEWLY_IDLE)
8787 return 1;
8788
8789
8790 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8791 if (!idle_cpu(cpu))
8792 continue;
8793
8794 balance_cpu = cpu;
8795 break;
8796 }
8797
8798 if (balance_cpu == -1)
8799 balance_cpu = group_balance_cpu(sg);
8800
8801
8802
8803
8804
8805 return balance_cpu == env->dst_cpu;
8806}
8807
8808
8809
8810
8811
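/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */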
8812static int load_balance(int this_cpu, struct rq *this_rq,
8813 struct sched_domain *sd, enum cpu_idle_type idle,
8814 int *continue_balancing)
8815{
8816 int ld_moved, cur_ld_moved, active_balance = 0;
8817 struct sched_domain *sd_parent = sd->parent;
8818 struct sched_group *group;
8819 struct rq *busiest;
8820 struct rq_flags rf;
8821 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8822
8823 struct lb_env env = {
8824 .sd = sd,
8825 .dst_cpu = this_cpu,
8826 .dst_rq = this_rq,
8827 .dst_grpmask = sched_group_span(sd->groups),
8828 .idle = idle,
8829 .loop_break = sched_nr_migrate_break,
8830 .cpus = cpus,
8831 .fbq_type = all,
8832 .tasks = LIST_HEAD_INIT(env.tasks),
8833 };
8834
8835 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
8836
8837 schedstat_inc(sd->lb_count[idle]);
8838
8839redo:
8840 if (!should_we_balance(&env)) {
8841 *continue_balancing = 0;
8842 goto out_balanced;
8843 }
8844
8845 group = find_busiest_group(&env);
8846 if (!group) {
8847 schedstat_inc(sd->lb_nobusyg[idle]);
8848 goto out_balanced;
8849 }
8850
8851 busiest = find_busiest_queue(&env, group);
8852 if (!busiest) {
8853 schedstat_inc(sd->lb_nobusyq[idle]);
8854 goto out_balanced;
8855 }
8856
8857 BUG_ON(busiest == env.dst_rq);
8858
8859 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
8860
8861 env.src_cpu = busiest->cpu;
8862 env.src_rq = busiest;
8863
8864 ld_moved = 0;
8865 if (busiest->nr_running > 1) {
8866
8867
8868
8869
8870
8871
8872 env.flags |= LBF_ALL_PINNED;
8873 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8874
8875more_balance:
8876 rq_lock_irqsave(busiest, &rf);
8877 update_rq_clock(busiest);
8878
8879
8880
8881
8882
8883 cur_ld_moved = detach_tasks(&env);
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893 rq_unlock(busiest, &rf);
8894
8895 if (cur_ld_moved) {
8896 attach_tasks(&env);
8897 ld_moved += cur_ld_moved;
8898 }
8899
8900 local_irq_restore(rf.flags);
8901
8902 if (env.flags & LBF_NEED_BREAK) {
8903 env.flags &= ~LBF_NEED_BREAK;
8904 goto more_balance;
8905 }
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
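		/*
		 * Some tasks could not be moved to dst_cpu because of affinity
		 * constraints, but another CPU in this group (new_dst_cpu) can
		 * take them: retry the balance pass with that CPU as the
		 * destination.
		 */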
8926 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8927
8928
8929 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8930
8931 env.dst_rq = cpu_rq(env.new_dst_cpu);
8932 env.dst_cpu = env.new_dst_cpu;
8933 env.flags &= ~LBF_DST_PINNED;
8934 env.loop = 0;
8935 env.loop_break = sched_nr_migrate_break;
8936
8937
8938
8939
8940
8941 goto more_balance;
8942 }
8943
8944
8945
8946
8947 if (sd_parent) {
8948 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8949
8950 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
8951 *group_imbalance = 1;
8952 }
8953
8954
8955 if (unlikely(env.flags & LBF_ALL_PINNED)) {
8956 cpumask_clear_cpu(cpu_of(busiest), cpus);
8957
8958
8959
8960
8961
8962
8963
8964
8965 if (!cpumask_subset(cpus, env.dst_grpmask)) {
8966 env.loop = 0;
8967 env.loop_break = sched_nr_migrate_break;
8968 goto redo;
8969 }
8970 goto out_all_pinned;
8971 }
8972 }
8973
8974 if (!ld_moved) {
8975 schedstat_inc(sd->lb_failed[idle]);
8976
8977
8978
8979
8980
8981
8982 if (idle != CPU_NEWLY_IDLE)
8983 sd->nr_balance_failed++;
8984
8985 if (need_active_balance(&env)) {
8986 unsigned long flags;
8987
8988 raw_spin_lock_irqsave(&busiest->lock, flags);
8989
8990
8991
8992
8993
8994
8995 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8996 raw_spin_unlock_irqrestore(&busiest->lock,
8997 flags);
8998 env.flags |= LBF_ALL_PINNED;
8999 goto out_one_pinned;
9000 }
9001
9002
9003
9004
9005
9006
9007 if (!busiest->active_balance) {
9008 busiest->active_balance = 1;
9009 busiest->push_cpu = this_cpu;
9010 active_balance = 1;
9011 }
9012 raw_spin_unlock_irqrestore(&busiest->lock, flags);
9013
9014 if (active_balance) {
9015 stop_one_cpu_nowait(cpu_of(busiest),
9016 active_load_balance_cpu_stop, busiest,
9017 &busiest->active_balance_work);
9018 }
9019
9020
9021 sd->nr_balance_failed = sd->cache_nice_tries+1;
9022 }
9023 } else
9024 sd->nr_balance_failed = 0;
9025
9026 if (likely(!active_balance)) {
9027
9028 sd->balance_interval = sd->min_interval;
9029 } else {
9030
9031
9032
9033
9034
9035
9036 if (sd->balance_interval < sd->max_interval)
9037 sd->balance_interval *= 2;
9038 }
9039
9040 goto out;
9041
9042out_balanced:
9043
9044
9045
9046
9047 if (sd_parent) {
9048 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9049
9050 if (*group_imbalance)
9051 *group_imbalance = 0;
9052 }
9053
9054out_all_pinned:
9055
9056
9057
9058
9059
9060 schedstat_inc(sd->lb_balanced[idle]);
9061
9062 sd->nr_balance_failed = 0;
9063
9064out_one_pinned:
9065
9066 if (((env.flags & LBF_ALL_PINNED) &&
9067 sd->balance_interval < MAX_PINNED_INTERVAL) ||
9068 (sd->balance_interval < sd->max_interval))
9069 sd->balance_interval *= 2;
9070
9071 ld_moved = 0;
9072out:
9073 return ld_moved;
9074}
9075
9076static inline unsigned long
9077get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9078{
9079 unsigned long interval = sd->balance_interval;
9080
9081 if (cpu_busy)
9082 interval *= sd->busy_factor;
9083
9084
9085 interval = msecs_to_jiffies(interval);
9086 interval = clamp(interval, 1UL, max_load_balance_interval);
9087
9088 return interval;
9089}
9090
9091static inline void
9092update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
9093{
9094 unsigned long interval, next;
9095
9096
9097 interval = get_sd_balance_interval(sd, 0);
9098 next = sd->last_balance + interval;
9099
9100 if (time_after(*next_balance, next))
9101 *next_balance = next;
9102}
9103
9104
9105
9106
9107
9108
9109
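/*
 * active_load_balance_cpu_stop is run by the CPU stopper on the busiest
 * CPU. It detaches one running task from busiest_rq and moves it to the
 * CPU that requested the active balance (busiest_rq->push_cpu).
 */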
9110static int active_load_balance_cpu_stop(void *data)
9111{
9112 struct rq *busiest_rq = data;
9113 int busiest_cpu = cpu_of(busiest_rq);
9114 int target_cpu = busiest_rq->push_cpu;
9115 struct rq *target_rq = cpu_rq(target_cpu);
9116 struct sched_domain *sd;
9117 struct task_struct *p = NULL;
9118 struct rq_flags rf;
9119
9120 rq_lock_irq(busiest_rq, &rf);
9121
9122
9123
9124
9125
9126 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
9127 goto out_unlock;
9128
9129
9130 if (unlikely(busiest_cpu != smp_processor_id() ||
9131 !busiest_rq->active_balance))
9132 goto out_unlock;
9133
9134
9135 if (busiest_rq->nr_running <= 1)
9136 goto out_unlock;
9137
9138
9139
9140
9141
9142
9143 BUG_ON(busiest_rq == target_rq);
9144
9145
9146 rcu_read_lock();
9147 for_each_domain(target_cpu, sd) {
9148 if ((sd->flags & SD_LOAD_BALANCE) &&
9149 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9150 break;
9151 }
9152
9153 if (likely(sd)) {
9154 struct lb_env env = {
9155 .sd = sd,
9156 .dst_cpu = target_cpu,
9157 .dst_rq = target_rq,
9158 .src_cpu = busiest_rq->cpu,
9159 .src_rq = busiest_rq,
9160 .idle = CPU_IDLE,
9161
9162
9163
9164
9165
9166
9167 .flags = LBF_DST_PINNED,
9168 };
9169
9170 schedstat_inc(sd->alb_count);
9171 update_rq_clock(busiest_rq);
9172
9173 p = detach_one_task(&env);
9174 if (p) {
9175 schedstat_inc(sd->alb_pushed);
9176
9177 sd->nr_balance_failed = 0;
9178 } else {
9179 schedstat_inc(sd->alb_failed);
9180 }
9181 }
9182 rcu_read_unlock();
9183out_unlock:
9184 busiest_rq->active_balance = 0;
9185 rq_unlock(busiest_rq, &rf);
9186
9187 if (p)
9188 attach_one_task(target_rq, p);
9189
9190 local_irq_enable();
9191
9192 return 0;
9193}
9194
9195static DEFINE_SPINLOCK(balancing);
9196
9197
9198
9199
9200
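/*
 * Scale the maximum load-balance interval with the number of online CPUs:
 * larger machines balance less often to limit cross-CPU traffic.
 */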
9201void update_max_interval(void)
9202{
9203 max_load_balance_interval = HZ*num_online_cpus()/10;
9204}
9205
9206
9207
9208
9209
9210
9211
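/*
 * Check each scheduling domain of @rq's CPU to see if it is due to be
 * balanced, and initiate a balancing operation if so.
 */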
9212static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9213{
9214 int continue_balancing = 1;
9215 int cpu = rq->cpu;
9216 unsigned long interval;
9217 struct sched_domain *sd;
9218
9219 unsigned long next_balance = jiffies + 60*HZ;
9220 int update_next_balance = 0;
9221 int need_serialize, need_decay = 0;
9222 u64 max_cost = 0;
9223
9224 rcu_read_lock();
9225 for_each_domain(cpu, sd) {
9226
9227
9228
9229
9230 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9231 sd->max_newidle_lb_cost =
9232 (sd->max_newidle_lb_cost * 253) / 256;
9233 sd->next_decay_max_lb_cost = jiffies + HZ;
9234 need_decay = 1;
9235 }
9236 max_cost += sd->max_newidle_lb_cost;
9237
9238 if (!(sd->flags & SD_LOAD_BALANCE))
9239 continue;
9240
9241
9242
9243
9244
9245
9246 if (!continue_balancing) {
9247 if (need_decay)
9248 continue;
9249 break;
9250 }
9251
9252 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9253
9254 need_serialize = sd->flags & SD_SERIALIZE;
9255 if (need_serialize) {
9256 if (!spin_trylock(&balancing))
9257 goto out;
9258 }
9259
9260 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9261 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9262
9263
9264
9265
9266
9267 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9268 }
9269 sd->last_balance = jiffies;
9270 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9271 }
9272 if (need_serialize)
9273 spin_unlock(&balancing);
9274out:
9275 if (time_after(next_balance, sd->last_balance + interval)) {
9276 next_balance = sd->last_balance + interval;
9277 update_next_balance = 1;
9278 }
9279 }
9280 if (need_decay) {
9281
9282
9283
9284
9285 rq->max_idle_balance_cost =
9286 max((u64)sysctl_sched_migration_cost, max_cost);
9287 }
9288 rcu_read_unlock();
9289
9290
9291
9292
9293
9294
9295 if (likely(update_next_balance)) {
9296 rq->next_balance = next_balance;
9297
9298#ifdef CONFIG_NO_HZ_COMMON
9299
9300
9301
9302
9303
9304
9305
9306
9307 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9308 nohz.next_balance = rq->next_balance;
9309#endif
9310 }
9311}
9312
9313static inline int on_null_domain(struct rq *rq)
9314{
9315 return unlikely(!rcu_dereference_sched(rq->sd));
9316}
9317
9318#ifdef CONFIG_NO_HZ_COMMON
9319
9320
9321
9322
9323
9324
9325
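/*
 * NOHZ idle load balancing:
 * When a busy CPU notices that idle rebalancing may be needed, it kicks
 * one of the idle CPUs, which then performs the idle load balancing on
 * behalf of all the idle CPUs (see nohz_balancer_kick() and kick_ilb()).
 *
 * find_new_ilb() returns the first idle CPU in nohz.idle_cpus_mask, or
 * nr_cpu_ids if there is none.
 */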
9326static inline int find_new_ilb(void)
9327{
9328 int ilb = cpumask_first(nohz.idle_cpus_mask);
9329
9330 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9331 return ilb;
9332
9333 return nr_cpu_ids;
9334}
9335
9336
9337
9338
9339
9340
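/*
 * Kick an idle CPU to run the NOHZ balancer: set the requested flags on it
 * and send a reschedule IPI, unless a kick is already pending there.
 */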
9341static void kick_ilb(unsigned int flags)
9342{
9343 int ilb_cpu;
9344
9345 nohz.next_balance++;
9346
9347 ilb_cpu = find_new_ilb();
9348
9349 if (ilb_cpu >= nr_cpu_ids)
9350 return;
9351
9352 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9353 if (flags & NOHZ_KICK_MASK)
9354 return;
9355
9356
9357
9358
9359
9360
9361
9362 smp_send_reschedule(ilb_cpu);
9363}
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
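/*
 * Decide whether to kick the idle load balancer from a busy CPU. A full
 * balance kick is requested when:
 *   - this rq has more than one runnable task, or
 *   - this rq has a CFS task and its capacity is noticeably reduced
 *     (check_cpu_capacity()), or
 *   - more than one CPU sharing this CPU's LLC is busy, or
 *   - an SD_ASYM domain contains an idle CPU that should be preferred
 *     over this one (sched_asym_prefer()).
 * A stats-only kick (NOHZ_STATS_KICK) is requested when idle CPUs still
 * carry blocked load that needs updating.
 */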
9376static void nohz_balancer_kick(struct rq *rq)
9377{
9378 unsigned long now = jiffies;
9379 struct sched_domain_shared *sds;
9380 struct sched_domain *sd;
9381 int nr_busy, i, cpu = rq->cpu;
9382 unsigned int flags = 0;
9383
9384 if (unlikely(rq->idle_balance))
9385 return;
9386
9387
9388
9389
9390
9391 nohz_balance_exit_idle(rq);
9392
9393
9394
9395
9396
9397 if (likely(!atomic_read(&nohz.nr_cpus)))
9398 return;
9399
9400 if (READ_ONCE(nohz.has_blocked) &&
9401 time_after(now, READ_ONCE(nohz.next_blocked)))
9402 flags = NOHZ_STATS_KICK;
9403
9404 if (time_before(now, nohz.next_balance))
9405 goto out;
9406
9407 if (rq->nr_running >= 2) {
9408 flags = NOHZ_KICK_MASK;
9409 goto out;
9410 }
9411
9412 rcu_read_lock();
9413 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9414 if (sds) {
9415
9416
9417
9418
9419 nr_busy = atomic_read(&sds->nr_busy_cpus);
9420 if (nr_busy > 1) {
9421 flags = NOHZ_KICK_MASK;
9422 goto unlock;
9423 }
9424
9425 }
9426
9427 sd = rcu_dereference(rq->sd);
9428 if (sd) {
9429 if ((rq->cfs.h_nr_running >= 1) &&
9430 check_cpu_capacity(rq, sd)) {
9431 flags = NOHZ_KICK_MASK;
9432 goto unlock;
9433 }
9434 }
9435
9436 sd = rcu_dereference(per_cpu(sd_asym, cpu));
9437 if (sd) {
9438 for_each_cpu(i, sched_domain_span(sd)) {
9439 if (i == cpu ||
9440 !cpumask_test_cpu(i, nohz.idle_cpus_mask))
9441 continue;
9442
9443 if (sched_asym_prefer(i, cpu)) {
9444 flags = NOHZ_KICK_MASK;
9445 goto unlock;
9446 }
9447 }
9448 }
9449unlock:
9450 rcu_read_unlock();
9451out:
9452 if (flags)
9453 kick_ilb(flags);
9454}
9455
9456static void set_cpu_sd_state_busy(int cpu)
9457{
9458 struct sched_domain *sd;
9459
9460 rcu_read_lock();
9461 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9462
9463 if (!sd || !sd->nohz_idle)
9464 goto unlock;
9465 sd->nohz_idle = 0;
9466
9467 atomic_inc(&sd->shared->nr_busy_cpus);
9468unlock:
9469 rcu_read_unlock();
9470}
9471
9472void nohz_balance_exit_idle(struct rq *rq)
9473{
9474 SCHED_WARN_ON(rq != this_rq());
9475
9476 if (likely(!rq->nohz_tick_stopped))
9477 return;
9478
9479 rq->nohz_tick_stopped = 0;
9480 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9481 atomic_dec(&nohz.nr_cpus);
9482
9483 set_cpu_sd_state_busy(rq->cpu);
9484}
9485
9486static void set_cpu_sd_state_idle(int cpu)
9487{
9488 struct sched_domain *sd;
9489
9490 rcu_read_lock();
9491 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9492
9493 if (!sd || sd->nohz_idle)
9494 goto unlock;
9495 sd->nohz_idle = 1;
9496
9497 atomic_dec(&sd->shared->nr_busy_cpus);
9498unlock:
9499 rcu_read_unlock();
9500}
9501
9502
9503
9504
9505
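/*
 * This routine records that the CPU is entering idle with the tick
 * stopped, so that it can be targeted by NOHZ idle load balancing.
 */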
9506void nohz_balance_enter_idle(int cpu)
9507{
9508 struct rq *rq = cpu_rq(cpu);
9509
9510 SCHED_WARN_ON(cpu != smp_processor_id());
9511
9512
9513 if (!cpu_active(cpu))
9514 return;
9515
9516
9517 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9518 return;
9519
9520
9521
9522
9523
9524
9525 rq->has_blocked_load = 1;
9526
9527
9528
9529
9530
9531
9532
9533 if (rq->nohz_tick_stopped)
9534 goto out;
9535
9536
9537 if (on_null_domain(rq))
9538 return;
9539
9540 rq->nohz_tick_stopped = 1;
9541
9542 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9543 atomic_inc(&nohz.nr_cpus);
9544
9545
9546
9547
9548
9549
9550 smp_mb__after_atomic();
9551
9552 set_cpu_sd_state_idle(cpu);
9553
9554out:
9555
9556
9557
9558
9559 WRITE_ONCE(nohz.has_blocked, 1);
9560}
9561
9562
9563
9564
9565
9566
9567
9568
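/*
 * Internal function that runs load balance for all idle CPUs. The load
 * balance can be a simple update of blocked load or a complete load
 * balance with task movement, depending on the flags.
 * Returns false if the loop was aborted (need_resched()) before all idle
 * CPUs were visited.
 */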
9569static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9570 enum cpu_idle_type idle)
9571{
9572
9573 unsigned long now = jiffies;
9574 unsigned long next_balance = now + 60*HZ;
9575 bool has_blocked_load = false;
9576 int update_next_balance = 0;
9577 int this_cpu = this_rq->cpu;
9578 int balance_cpu;
9579 int ret = false;
9580 struct rq *rq;
9581
9582 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592 WRITE_ONCE(nohz.has_blocked, 0);
9593
9594
9595
9596
9597
9598 smp_mb();
9599
9600 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9601 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9602 continue;
9603
9604
9605
9606
9607
9608
9609 if (need_resched()) {
9610 has_blocked_load = true;
9611 goto abort;
9612 }
9613
9614 rq = cpu_rq(balance_cpu);
9615
9616 has_blocked_load |= update_nohz_stats(rq, true);
9617
9618
9619
9620
9621
9622 if (time_after_eq(jiffies, rq->next_balance)) {
9623 struct rq_flags rf;
9624
9625 rq_lock_irqsave(rq, &rf);
9626 update_rq_clock(rq);
9627 cpu_load_update_idle(rq);
9628 rq_unlock_irqrestore(rq, &rf);
9629
9630 if (flags & NOHZ_BALANCE_KICK)
9631 rebalance_domains(rq, CPU_IDLE);
9632 }
9633
9634 if (time_after(next_balance, rq->next_balance)) {
9635 next_balance = rq->next_balance;
9636 update_next_balance = 1;
9637 }
9638 }
9639
9640
9641 if (idle != CPU_NEWLY_IDLE) {
9642 update_blocked_averages(this_cpu);
9643 has_blocked_load |= this_rq->has_blocked_load;
9644 }
9645
9646 if (flags & NOHZ_BALANCE_KICK)
9647 rebalance_domains(this_rq, CPU_IDLE);
9648
9649 WRITE_ONCE(nohz.next_blocked,
9650 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9651
9652
9653 ret = true;
9654
9655abort:
9656
9657 if (has_blocked_load)
9658 WRITE_ONCE(nohz.has_blocked, 1);
9659
9660
9661
9662
9663
9664
9665 if (likely(update_next_balance))
9666 nohz.next_balance = next_balance;
9667
9668 return ret;
9669}
9670
9671
9672
9673
9674
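/*
 * In the CONFIG_NO_HZ_COMMON case, the kicked idle CPU performs the
 * rebalancing for all the idle CPUs whose ticks are stopped.
 */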
9675static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9676{
9677 int this_cpu = this_rq->cpu;
9678 unsigned int flags;
9679
9680 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9681 return false;
9682
9683 if (idle != CPU_IDLE) {
9684 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9685 return false;
9686 }
9687
9688
9689
9690
9691 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9692 if (!(flags & NOHZ_KICK_MASK))
9693 return false;
9694
9695 _nohz_idle_balance(this_rq, flags, idle);
9696
9697 return true;
9698}
9699
9700static void nohz_newidle_balance(struct rq *this_rq)
9701{
9702 int this_cpu = this_rq->cpu;
9703
9704
9705
9706
9707
9708 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9709 return;
9710
9711
9712 if (this_rq->avg_idle < sysctl_sched_migration_cost)
9713 return;
9714
9715
9716 if (!READ_ONCE(nohz.has_blocked) ||
9717 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9718 return;
9719
9720 raw_spin_unlock(&this_rq->lock);
9721
9722
9723
9724
9725
9726
9727 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9728 kick_ilb(NOHZ_STATS_KICK);
9729 raw_spin_lock(&this_rq->lock);
9730}
9731
9732#else
9733static inline void nohz_balancer_kick(struct rq *rq) { }
9734
9735static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9736{
9737 return false;
9738}
9739
9740static inline void nohz_newidle_balance(struct rq *this_rq) { }
9741#endif
9742
9743
9744
9745
9746
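/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */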
9747static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
9748{
9749 unsigned long next_balance = jiffies + HZ;
9750 int this_cpu = this_rq->cpu;
9751 struct sched_domain *sd;
9752 int pulled_task = 0;
9753 u64 curr_cost = 0;
9754
9755
9756
9757
9758
9759 this_rq->idle_stamp = rq_clock(this_rq);
9760
9761
9762
9763
9764 if (!cpu_active(this_cpu))
9765 return 0;
9766
9767
9768
9769
9770
9771
9772
9773 rq_unpin_lock(this_rq, rf);
9774
9775 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
9776 !this_rq->rd->overload) {
9777
9778 rcu_read_lock();
9779 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9780 if (sd)
9781 update_next_balance(sd, &next_balance);
9782 rcu_read_unlock();
9783
9784 nohz_newidle_balance(this_rq);
9785
9786 goto out;
9787 }
9788
9789 raw_spin_unlock(&this_rq->lock);
9790
9791 update_blocked_averages(this_cpu);
9792 rcu_read_lock();
9793 for_each_domain(this_cpu, sd) {
9794 int continue_balancing = 1;
9795 u64 t0, domain_cost;
9796
9797 if (!(sd->flags & SD_LOAD_BALANCE))
9798 continue;
9799
9800 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9801 update_next_balance(sd, &next_balance);
9802 break;
9803 }
9804
9805 if (sd->flags & SD_BALANCE_NEWIDLE) {
9806 t0 = sched_clock_cpu(this_cpu);
9807
9808 pulled_task = load_balance(this_cpu, this_rq,
9809 sd, CPU_NEWLY_IDLE,
9810 &continue_balancing);
9811
9812 domain_cost = sched_clock_cpu(this_cpu) - t0;
9813 if (domain_cost > sd->max_newidle_lb_cost)
9814 sd->max_newidle_lb_cost = domain_cost;
9815
9816 curr_cost += domain_cost;
9817 }
9818
9819 update_next_balance(sd, &next_balance);
9820
9821
9822
9823
9824
9825 if (pulled_task || this_rq->nr_running > 0)
9826 break;
9827 }
9828 rcu_read_unlock();
9829
9830 raw_spin_lock(&this_rq->lock);
9831
9832 if (curr_cost > this_rq->max_idle_balance_cost)
9833 this_rq->max_idle_balance_cost = curr_cost;
9834
9835out:
9836
9837
9838
9839
9840
9841 if (this_rq->cfs.h_nr_running && !pulled_task)
9842 pulled_task = 1;
9843
9844
9845 if (time_after(this_rq->next_balance, next_balance))
9846 this_rq->next_balance = next_balance;
9847
9848
9849 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9850 pulled_task = -1;
9851
9852 if (pulled_task)
9853 this_rq->idle_stamp = 0;
9854
9855 rq_repin_lock(this_rq, rf);
9856
9857 return pulled_task;
9858}
9859
9860
9861
9862
9863
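/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for NOHZ idle balancing (with the NOHZ kick flags set).
 */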
9864static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9865{
9866 struct rq *this_rq = this_rq();
9867 enum cpu_idle_type idle = this_rq->idle_balance ?
9868 CPU_IDLE : CPU_NOT_IDLE;
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878 if (nohz_idle_balance(this_rq, idle))
9879 return;
9880
9881
9882 update_blocked_averages(this_rq->cpu);
9883 rebalance_domains(this_rq, idle);
9884}
9885
9886
9887
9888
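/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */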
9889void trigger_load_balance(struct rq *rq)
9890{
9891
9892 if (unlikely(on_null_domain(rq)))
9893 return;
9894
9895 if (time_after_eq(jiffies, rq->next_balance))
9896 raise_softirq(SCHED_SOFTIRQ);
9897
9898 nohz_balancer_kick(rq);
9899}
9900
9901static void rq_online_fair(struct rq *rq)
9902{
9903 update_sysctl();
9904
9905 update_runtime_enabled(rq);
9906}
9907
9908static void rq_offline_fair(struct rq *rq)
9909{
9910 update_sysctl();
9911
9912
9913 unthrottle_offline_cfs_rqs(rq);
9914}
9915
9916#endif
9917
9918
9919
9920
9921
9922
9923
9924
9925
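/*
 * Scheduler tick hitting a task of our scheduling class: run entity_tick()
 * for each sched_entity in @curr's hierarchy and, if NUMA balancing is
 * enabled, account the tick for NUMA placement.
 */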
9926static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9927{
9928 struct cfs_rq *cfs_rq;
9929 struct sched_entity *se = &curr->se;
9930
9931 for_each_sched_entity(se) {
9932 cfs_rq = cfs_rq_of(se);
9933 entity_tick(cfs_rq, se, queued);
9934 }
9935
9936 if (static_branch_unlikely(&sched_numa_balancing))
9937 task_tick_numa(rq, curr);
9938}
9939
9940
9941
9942
9943
9944
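/*
 * Called on fork with the child task as argument from the parent's context:
 *  - the child is not yet on the task list,
 *  - preemption is disabled.
 */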
9945static void task_fork_fair(struct task_struct *p)
9946{
9947 struct cfs_rq *cfs_rq;
9948 struct sched_entity *se = &p->se, *curr;
9949 struct rq *rq = this_rq();
9950 struct rq_flags rf;
9951
9952 rq_lock(rq, &rf);
9953 update_rq_clock(rq);
9954
9955 cfs_rq = task_cfs_rq(current);
9956 curr = cfs_rq->curr;
9957 if (curr) {
9958 update_curr(cfs_rq);
9959 se->vruntime = curr->vruntime;
9960 }
9961 place_entity(cfs_rq, se, 1);
9962
9963 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
9964
9965
9966
9967
9968 swap(curr->vruntime, se->vruntime);
9969 resched_curr(rq);
9970 }
9971
9972 se->vruntime -= cfs_rq->min_vruntime;
9973 rq_unlock(rq, &rf);
9974}
9975
9976
9977
9978
9979
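/*
 * Priority of the task has changed. Check whether the current task needs
 * to be preempted or rescheduled as a result.
 */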
9980static void
9981prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
9982{
9983 if (!task_on_rq_queued(p))
9984 return;
9985
9986
9987
9988
9989
9990
9991 if (rq->curr == p) {
9992 if (p->prio > oldprio)
9993 resched_curr(rq);
9994 } else
9995 check_preempt_curr(rq, p, 0);
9996}
9997
9998static inline bool vruntime_normalized(struct task_struct *p)
9999{
10000 struct sched_entity *se = &p->se;
10001
10002
10003
10004
10005
10006
10007 if (p->on_rq)
10008 return true;
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
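	/*
	 * When !on_rq, the vruntime has usually NOT been normalized, except
	 * for a freshly forked child that has never run yet
	 * (!se->sum_exec_runtime) and a task in the middle of being woken
	 * up (TASK_WAKING), whose vruntime is already normalized.
	 */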
10019 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
10020 return true;
10021
10022 return false;
10023}
10024
10025#ifdef CONFIG_FAIR_GROUP_SCHED
10026
10027
10028
10029
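/*
 * Propagate the changes of the sched_entity across the tg tree to make it
 * visible to the root.
 */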
10030static void propagate_entity_cfs_rq(struct sched_entity *se)
10031{
10032 struct cfs_rq *cfs_rq;
10033
10034
10035 se = se->parent;
10036
10037 for_each_sched_entity(se) {
10038 cfs_rq = cfs_rq_of(se);
10039
10040 if (cfs_rq_throttled(cfs_rq))
10041 break;
10042
10043 update_load_avg(cfs_rq, se, UPDATE_TG);
10044 }
10045}
10046#else
10047static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10048#endif
10049
10050static void detach_entity_cfs_rq(struct sched_entity *se)
10051{
10052 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10053
10054
10055 update_load_avg(cfs_rq, se, 0);
10056 detach_entity_load_avg(cfs_rq, se);
10057 update_tg_load_avg(cfs_rq, false);
10058 propagate_entity_cfs_rq(se);
10059}
10060
10061static void attach_entity_cfs_rq(struct sched_entity *se)
10062{
10063 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10064
10065#ifdef CONFIG_FAIR_GROUP_SCHED
10066
10067
10068
10069
10070 se->depth = se->parent ? se->parent->depth + 1 : 0;
10071#endif
10072
10073
10074 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10075 attach_entity_load_avg(cfs_rq, se, 0);
10076 update_tg_load_avg(cfs_rq, false);
10077 propagate_entity_cfs_rq(se);
10078}
10079
10080static void detach_task_cfs_rq(struct task_struct *p)
10081{
10082 struct sched_entity *se = &p->se;
10083 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10084
10085 if (!vruntime_normalized(p)) {
10086
10087
10088
10089
10090 place_entity(cfs_rq, se, 0);
10091 se->vruntime -= cfs_rq->min_vruntime;
10092 }
10093
10094 detach_entity_cfs_rq(se);
10095}
10096
10097static void attach_task_cfs_rq(struct task_struct *p)
10098{
10099 struct sched_entity *se = &p->se;
10100 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10101
10102 attach_entity_cfs_rq(se);
10103
10104 if (!vruntime_normalized(p))
10105 se->vruntime += cfs_rq->min_vruntime;
10106}
10107
10108static void switched_from_fair(struct rq *rq, struct task_struct *p)
10109{
10110 detach_task_cfs_rq(p);
10111}
10112
10113static void switched_to_fair(struct rq *rq, struct task_struct *p)
10114{
10115 attach_task_cfs_rq(p);
10116
10117 if (task_on_rq_queued(p)) {
10118
10119
10120
10121
10122
10123 if (rq->curr == p)
10124 resched_curr(rq);
10125 else
10126 check_preempt_curr(rq, p, 0);
10127 }
10128}
10129
10130
10131
10132
10133
10134
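/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */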
10135static void set_curr_task_fair(struct rq *rq)
10136{
10137 struct sched_entity *se = &rq->curr->se;
10138
10139 for_each_sched_entity(se) {
10140 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10141
10142 set_next_entity(cfs_rq, se);
10143
10144 account_cfs_rq_runtime(cfs_rq, 0);
10145 }
10146}
10147
10148void init_cfs_rq(struct cfs_rq *cfs_rq)
10149{
10150 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
10151 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10152#ifndef CONFIG_64BIT
10153 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10154#endif
10155#ifdef CONFIG_SMP
10156 raw_spin_lock_init(&cfs_rq->removed.lock);
10157#endif
10158}
10159
10160#ifdef CONFIG_FAIR_GROUP_SCHED
10161static void task_set_group_fair(struct task_struct *p)
10162{
10163 struct sched_entity *se = &p->se;
10164
10165 set_task_rq(p, task_cpu(p));
10166 se->depth = se->parent ? se->parent->depth + 1 : 0;
10167}
10168
10169static void task_move_group_fair(struct task_struct *p)
10170{
10171 detach_task_cfs_rq(p);
10172 set_task_rq(p, task_cpu(p));
10173
10174#ifdef CONFIG_SMP
10175
10176 p->se.avg.last_update_time = 0;
10177#endif
10178 attach_task_cfs_rq(p);
10179}
10180
10181static void task_change_group_fair(struct task_struct *p, int type)
10182{
10183 switch (type) {
10184 case TASK_SET_GROUP:
10185 task_set_group_fair(p);
10186 break;
10187
10188 case TASK_MOVE_GROUP:
10189 task_move_group_fair(p);
10190 break;
10191 }
10192}
10193
10194void free_fair_sched_group(struct task_group *tg)
10195{
10196 int i;
10197
10198 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10199
10200 for_each_possible_cpu(i) {
10201 if (tg->cfs_rq)
10202 kfree(tg->cfs_rq[i]);
10203 if (tg->se)
10204 kfree(tg->se[i]);
10205 }
10206
10207 kfree(tg->cfs_rq);
10208 kfree(tg->se);
10209}
10210
10211int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10212{
10213 struct sched_entity *se;
10214 struct cfs_rq *cfs_rq;
10215 int i;
10216
10217 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
10218 if (!tg->cfs_rq)
10219 goto err;
10220 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
10221 if (!tg->se)
10222 goto err;
10223
10224 tg->shares = NICE_0_LOAD;
10225
10226 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10227
10228 for_each_possible_cpu(i) {
10229 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10230 GFP_KERNEL, cpu_to_node(i));
10231 if (!cfs_rq)
10232 goto err;
10233
10234 se = kzalloc_node(sizeof(struct sched_entity),
10235 GFP_KERNEL, cpu_to_node(i));
10236 if (!se)
10237 goto err_free_rq;
10238
10239 init_cfs_rq(cfs_rq);
10240 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10241 init_entity_runnable_average(se);
10242 }
10243
10244 return 1;
10245
10246err_free_rq:
10247 kfree(cfs_rq);
10248err:
10249 return 0;
10250}
10251
10252void online_fair_sched_group(struct task_group *tg)
10253{
10254 struct sched_entity *se;
10255 struct rq *rq;
10256 int i;
10257
10258 for_each_possible_cpu(i) {
10259 rq = cpu_rq(i);
10260 se = tg->se[i];
10261
10262 raw_spin_lock_irq(&rq->lock);
10263 update_rq_clock(rq);
10264 attach_entity_cfs_rq(se);
10265 sync_throttle(tg, i);
10266 raw_spin_unlock_irq(&rq->lock);
10267 }
10268}
10269
10270void unregister_fair_sched_group(struct task_group *tg)
10271{
10272 unsigned long flags;
10273 struct rq *rq;
10274 int cpu;
10275
10276 for_each_possible_cpu(cpu) {
10277 if (tg->se[cpu])
10278 remove_entity_load_avg(tg->se[cpu]);
10279
10280
10281
10282
10283
10284 if (!tg->cfs_rq[cpu]->on_list)
10285 continue;
10286
10287 rq = cpu_rq(cpu);
10288
10289 raw_spin_lock_irqsave(&rq->lock, flags);
10290 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10291 raw_spin_unlock_irqrestore(&rq->lock, flags);
10292 }
10293}
10294
10295void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10296 struct sched_entity *se, int cpu,
10297 struct sched_entity *parent)
10298{
10299 struct rq *rq = cpu_rq(cpu);
10300
10301 cfs_rq->tg = tg;
10302 cfs_rq->rq = rq;
10303 init_cfs_rq_runtime(cfs_rq);
10304
10305 tg->cfs_rq[cpu] = cfs_rq;
10306 tg->se[cpu] = se;
10307
10308
10309 if (!se)
10310 return;
10311
10312 if (!parent) {
10313 se->cfs_rq = &rq->cfs;
10314 se->depth = 0;
10315 } else {
10316 se->cfs_rq = parent->my_q;
10317 se->depth = parent->depth + 1;
10318 }
10319
10320 se->my_q = cfs_rq;
10321
10322 update_load_set(&se->load, NICE_0_LOAD);
10323 se->parent = parent;
10324}
10325
10326static DEFINE_MUTEX(shares_mutex);
10327
10328int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10329{
10330 int i;
10331
10332
10333
10334
10335 if (!tg->se[0])
10336 return -EINVAL;
10337
10338 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10339
10340 mutex_lock(&shares_mutex);
10341 if (tg->shares == shares)
10342 goto done;
10343
10344 tg->shares = shares;
10345 for_each_possible_cpu(i) {
10346 struct rq *rq = cpu_rq(i);
10347 struct sched_entity *se = tg->se[i];
10348 struct rq_flags rf;
10349
10350
10351 rq_lock_irqsave(rq, &rf);
10352 update_rq_clock(rq);
10353 for_each_sched_entity(se) {
10354 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
10355 update_cfs_group(se);
10356 }
10357 rq_unlock_irqrestore(rq, &rf);
10358 }
10359
10360done:
10361 mutex_unlock(&shares_mutex);
10362 return 0;
10363}
10364#else
10365
10366void free_fair_sched_group(struct task_group *tg) { }
10367
10368int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10369{
10370 return 1;
10371}
10372
10373void online_fair_sched_group(struct task_group *tg) { }
10374
10375void unregister_fair_sched_group(struct task_group *tg) { }
10376
10377#endif
10378
10379
10380static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10381{
10382 struct sched_entity *se = &task->se;
10383 unsigned int rr_interval = 0;
10384
10385
10386
10387
10388
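	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */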
10389 if (rq->cfs.load.weight)
10390 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10391
10392 return rr_interval;
10393}
10394
10395
10396
10397
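/*
 * All the scheduling class methods:
 */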
10398const struct sched_class fair_sched_class = {
10399 .next = &idle_sched_class,
10400 .enqueue_task = enqueue_task_fair,
10401 .dequeue_task = dequeue_task_fair,
10402 .yield_task = yield_task_fair,
10403 .yield_to_task = yield_to_task_fair,
10404
10405 .check_preempt_curr = check_preempt_wakeup,
10406
10407 .pick_next_task = pick_next_task_fair,
10408 .put_prev_task = put_prev_task_fair,
10409
10410#ifdef CONFIG_SMP
10411 .select_task_rq = select_task_rq_fair,
10412 .migrate_task_rq = migrate_task_rq_fair,
10413
10414 .rq_online = rq_online_fair,
10415 .rq_offline = rq_offline_fair,
10416
10417 .task_dead = task_dead_fair,
10418 .set_cpus_allowed = set_cpus_allowed_common,
10419#endif
10420
10421 .set_curr_task = set_curr_task_fair,
10422 .task_tick = task_tick_fair,
10423 .task_fork = task_fork_fair,
10424
10425 .prio_changed = prio_changed_fair,
10426 .switched_from = switched_from_fair,
10427 .switched_to = switched_to_fair,
10428
10429 .get_rr_interval = get_rr_interval_fair,
10430
10431 .update_curr = update_curr_fair,
10432
10433#ifdef CONFIG_FAIR_GROUP_SCHED
10434 .task_change_group = task_change_group_fair,
10435#endif
10436};
10437
10438#ifdef CONFIG_SCHED_DEBUG
10439void print_cfs_stats(struct seq_file *m, int cpu)
10440{
10441 struct cfs_rq *cfs_rq, *pos;
10442
10443 rcu_read_lock();
10444 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10445 print_cfs_rq(m, cpu, cfs_rq);
10446 rcu_read_unlock();
10447}
10448
10449#ifdef CONFIG_NUMA_BALANCING
10450void show_numa_stats(struct task_struct *p, struct seq_file *m)
10451{
10452 int node;
10453 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10454
10455 for_each_online_node(node) {
10456 if (p->numa_faults) {
10457 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10458 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10459 }
10460 if (p->numa_group) {
10461 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
10462 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10463 }
10464 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10465 }
10466}
10467#endif
10468#endif
10469
10470__init void init_sched_fair_class(void)
10471{
10472#ifdef CONFIG_SMP
10473 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10474
10475#ifdef CONFIG_NO_HZ_COMMON
10476 nohz.next_balance = jiffies;
10477 nohz.next_blocked = jiffies;
10478 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10479#endif
10480#endif
10481
10482}
10483