1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
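/*
 * Completely Fair Scheduling (CFS) class: vruntime accounting, the
 * red-black run-queue, the latency/granularity tunables and the NUMA
 * balancing machinery behind SCHED_NORMAL/SCHED_BATCH tasks.
 */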
23#include "sched.h"
24
25#include <trace/events/sched.h>
26
27
28
29
30
31
32
33
34
35
36
37
38
39
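/*
 * Targeted preemption latency for CPU-bound tasks: the period in which each
 * runnable task should get to run once (stretched once nr_running exceeds
 * sched_nr_latency, see __sched_period()).
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */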
40unsigned int sysctl_sched_latency = 6000000ULL;
41static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
42
43
44
45
46
47
48
49
50
51
52
53
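/*
 * How the latency/granularity tunables above scale with the number of
 * online CPUs (see get_update_sysctl_factor()):
 *
 *   SCHED_TUNABLESCALING_NONE   - unscaled
 *   SCHED_TUNABLESCALING_LOG    - scaled logarithmically (default)
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly
 */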
54enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
55
56
57
58
59
60
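/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */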
61unsigned int sysctl_sched_min_granularity = 750000ULL;
62static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
63
64
65
66
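/*
 * Kept at sysctl_sched_latency / sysctl_sched_min_granularity by
 * sched_proc_update_handler() (default: 6ms / 0.75ms = 8).
 */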
67static unsigned int sched_nr_latency = 8;
68
69
70
71
72
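/*
 * Whether a newly forked child should be scheduled before its parent
 * (default 0: the parent runs first).
 */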
73unsigned int sysctl_sched_child_runs_first __read_mostly;
74
75
76
77
78
79
80
81
82
83
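/*
 * SCHED_OTHER wake-up granularity: a waking task preempts the currently
 * running one only if its vruntime lead exceeds this (weight-scaled) amount.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */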
84unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
85static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
86
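/*
 * Scheduler's estimate of the cost of migrating a task to another CPU,
 * used e.g. as the cache-hotness cut-off in load balancing.
 *
 * (default: 0.5 msec, units: nanoseconds)
 */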
87const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
88
89#ifdef CONFIG_SMP
90
91
92
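/*
 * For asym packing, by default the lower numbered CPU has higher priority;
 * architectures may override this.
 */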
93int __weak arch_asym_cpu_priority(int cpu)
94{
95 return -cpu;
96}
97
98
99
100
101
102
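/*
 * A utilization value 'fits' a capacity if it leaves ~20% headroom:
 * cap * 1280 < max * 1024  <=>  cap < 0.8 * max.
 */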
103#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
104
105#endif
106
107#ifdef CONFIG_CFS_BANDWIDTH
108
109
110
111
112
113
114
115
116
117
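/*
 * Amount of runtime handed from the global task-group bandwidth pool to a
 * per-CPU pool in one go.
 *
 * (default: 5 msec, units: microseconds)
 */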
118unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
119#endif
120
121static inline void update_load_add(struct load_weight *lw, unsigned long inc)
122{
123 lw->weight += inc;
124 lw->inv_weight = 0;
125}
126
127static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
128{
129 lw->weight -= dec;
130 lw->inv_weight = 0;
131}
132
133static inline void update_load_set(struct load_weight *lw, unsigned long w)
134{
135 lw->weight = w;
136 lw->inv_weight = 0;
137}
138
139
140
141
142
143
144
145
146
147
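/*
 * Scaling factor applied to the normalized tunables above: 1, ncpus or
 * 1 + ilog2(ncpus) depending on sysctl_sched_tunable_scaling, with ncpus
 * capped at 8.
 */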
148static unsigned int get_update_sysctl_factor(void)
149{
150 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
151 unsigned int factor;
152
153 switch (sysctl_sched_tunable_scaling) {
154 case SCHED_TUNABLESCALING_NONE:
155 factor = 1;
156 break;
157 case SCHED_TUNABLESCALING_LINEAR:
158 factor = cpus;
159 break;
160 case SCHED_TUNABLESCALING_LOG:
161 default:
162 factor = 1 + ilog2(cpus);
163 break;
164 }
165
166 return factor;
167}
168
169static void update_sysctl(void)
170{
171 unsigned int factor = get_update_sysctl_factor();
172
173#define SET_SYSCTL(name) \
174 (sysctl_##name = (factor) * normalized_sysctl_##name)
175 SET_SYSCTL(sched_min_granularity);
176 SET_SYSCTL(sched_latency);
177 SET_SYSCTL(sched_wakeup_granularity);
178#undef SET_SYSCTL
179}
180
181void sched_init_granularity(void)
182{
183 update_sysctl();
184}
185
186#define WMULT_CONST (~0U)
187#define WMULT_SHIFT 32
188
189static void __update_inv_weight(struct load_weight *lw)
190{
191 unsigned long w;
192
193 if (likely(lw->inv_weight))
194 return;
195
196 w = scale_load_down(lw->weight);
197
198 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
199 lw->inv_weight = 1;
200 else if (unlikely(!w))
201 lw->inv_weight = WMULT_CONST;
202 else
203 lw->inv_weight = WMULT_CONST / w;
204}
205
206
207
208
209
210
211
212
213
214
215
216
217
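/*
 * Compute delta_exec * weight / lw->weight as
 *
 *   (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * using the pre-computed 32-bit inverse lw->inv_weight ~= 2^32 / lw->weight.
 * The shift loops below drop low-order bits of 'fact' (and reduce the shift
 * accordingly) whenever it would no longer fit in 32 bits.
 */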
218static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
219{
220 u64 fact = scale_load_down(weight);
221 int shift = WMULT_SHIFT;
222
223 __update_inv_weight(lw);
224
225 if (unlikely(fact >> 32)) {
226 while (fact >> 32) {
227 fact >>= 1;
228 shift--;
229 }
230 }
231
232 fact = mul_u32_u32(fact, lw->inv_weight);
233
234 while (fact >> 32) {
235 fact >>= 1;
236 shift--;
237 }
238
239 return mul_u64_u32_shr(delta_exec, fact, shift);
240}
241
242
243const struct sched_class fair_sched_class;
244
245
246
247
248
249#ifdef CONFIG_FAIR_GROUP_SCHED
250static inline struct task_struct *task_of(struct sched_entity *se)
251{
252 SCHED_WARN_ON(!entity_is_task(se));
253 return container_of(se, struct task_struct, se);
254}
255
256
257#define for_each_sched_entity(se) \
258 for (; se; se = se->parent)
259
260static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
261{
262 return p->se.cfs_rq;
263}
264
265
266static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
267{
268 return se->cfs_rq;
269}
270
271
272static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
273{
274 return grp->my_q;
275}
276
277static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
278{
279 if (!path)
280 return;
281
282 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
283 autogroup_path(cfs_rq->tg, path, len);
284 else if (cfs_rq && cfs_rq->tg->css.cgroup)
285 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
286 else
287 strlcpy(path, "(null)", len);
288}
289
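/*
 * Add cfs_rq to the per-CPU list of leaf cfs_rq's, preserving the invariant
 * that a child cfs_rq is always listed before its parent so that bottom-up
 * walks see updated children first.  rq->tmp_alone_branch marks where a
 * partially attached branch starts; the return value tells the caller
 * whether the branch is fully connected to the list.
 */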
290static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292 struct rq *rq = rq_of(cfs_rq);
293 int cpu = cpu_of(rq);
294
295 if (cfs_rq->on_list)
296 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
297
298 cfs_rq->on_list = 1;
299
300
301
302
303
304
305
306
307
308
309 if (cfs_rq->tg->parent &&
310 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
311
312
313
314
315
316
317 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
318 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
319
320
321
322
323
324 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
325 return true;
326 }
327
328 if (!cfs_rq->tg->parent) {
329
330
331
332
333 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
334 &rq->leaf_cfs_rq_list);
335
336
337
338
339 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
340 return true;
341 }
342
343
344
345
346
347
348
349 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
350
351
352
353
354 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
355 return false;
356}
357
358static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
359{
360 if (cfs_rq->on_list) {
361 struct rq *rq = rq_of(cfs_rq);
362
363
364
365
366
367
368
369
370 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
371 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
372
373 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
374 cfs_rq->on_list = 0;
375 }
376}
377
378static inline void assert_list_leaf_cfs_rq(struct rq *rq)
379{
380 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
381}
382
383
384#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
385 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
386 leaf_cfs_rq_list)
387
388
389static inline struct cfs_rq *
390is_same_group(struct sched_entity *se, struct sched_entity *pse)
391{
392 if (se->cfs_rq == pse->cfs_rq)
393 return se->cfs_rq;
394
395 return NULL;
396}
397
398static inline struct sched_entity *parent_entity(struct sched_entity *se)
399{
400 return se->parent;
401}
402
403static void
404find_matching_se(struct sched_entity **se, struct sched_entity **pse)
405{
406 int se_depth, pse_depth;
407
408
409
410
411
412
413
414
415
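	/*
	 * Walk both entities up to the same depth, then continue in lock-step
	 * until they sit in the same cfs_rq and can be compared.
	 */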
416 se_depth = (*se)->depth;
417 pse_depth = (*pse)->depth;
418
419 while (se_depth > pse_depth) {
420 se_depth--;
421 *se = parent_entity(*se);
422 }
423
424 while (pse_depth > se_depth) {
425 pse_depth--;
426 *pse = parent_entity(*pse);
427 }
428
429 while (!is_same_group(*se, *pse)) {
430 *se = parent_entity(*se);
431 *pse = parent_entity(*pse);
432 }
433}
434
435#else
436
437static inline struct task_struct *task_of(struct sched_entity *se)
438{
439 return container_of(se, struct task_struct, se);
440}
441
442#define for_each_sched_entity(se) \
443 for (; se; se = NULL)
444
445static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
446{
447 return &task_rq(p)->cfs;
448}
449
450static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
451{
452 struct task_struct *p = task_of(se);
453 struct rq *rq = task_rq(p);
454
455 return &rq->cfs;
456}
457
458
459static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
460{
461 return NULL;
462}
463
464static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
465{
466 if (path)
467 strlcpy(path, "(null)", len);
468}
469
470static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
471{
472 return true;
473}
474
475static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
476{
477}
478
479static inline void assert_list_leaf_cfs_rq(struct rq *rq)
480{
481}
482
483#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
484 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
485
486static inline struct sched_entity *parent_entity(struct sched_entity *se)
487{
488 return NULL;
489}
490
491static inline void
492find_matching_se(struct sched_entity **se, struct sched_entity **pse)
493{
494}
495
496#endif
497
498static __always_inline
499void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
500
501
502
503
504
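/*
 * vruntime comparisons are done on the signed difference so that they keep
 * working when the u64 values wrap.
 */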
505static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
506{
507 s64 delta = (s64)(vruntime - max_vruntime);
508 if (delta > 0)
509 max_vruntime = vruntime;
510
511 return max_vruntime;
512}
513
514static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
515{
516 s64 delta = (s64)(vruntime - min_vruntime);
517 if (delta < 0)
518 min_vruntime = vruntime;
519
520 return min_vruntime;
521}
522
523static inline int entity_before(struct sched_entity *a,
524 struct sched_entity *b)
525{
526 return (s64)(a->vruntime - b->vruntime) < 0;
527}
528
529static void update_min_vruntime(struct cfs_rq *cfs_rq)
530{
531 struct sched_entity *curr = cfs_rq->curr;
532 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
533
534 u64 vruntime = cfs_rq->min_vruntime;
535
536 if (curr) {
537 if (curr->on_rq)
538 vruntime = curr->vruntime;
539 else
540 curr = NULL;
541 }
542
543 if (leftmost) {
544 struct sched_entity *se;
545 se = rb_entry(leftmost, struct sched_entity, run_node);
546
547 if (!curr)
548 vruntime = se->vruntime;
549 else
550 vruntime = min_vruntime(vruntime, se->vruntime);
551 }
552
553
554 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
555#ifndef CONFIG_64BIT
556 smp_wmb();
557 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
558#endif
559}
560
561
562
563
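/*
 * Enqueue an entity into the rb-tree, ordered by vruntime; the leftmost
 * (smallest vruntime) entity is cached by the rb_root_cached tree.
 */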
564static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
565{
566 struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
567 struct rb_node *parent = NULL;
568 struct sched_entity *entry;
569 bool leftmost = true;
570
571
572
573
574 while (*link) {
575 parent = *link;
576 entry = rb_entry(parent, struct sched_entity, run_node);
577
578
579
580
581 if (entity_before(se, entry)) {
582 link = &parent->rb_left;
583 } else {
584 link = &parent->rb_right;
585 leftmost = false;
586 }
587 }
588
589 rb_link_node(&se->run_node, parent, link);
590 rb_insert_color_cached(&se->run_node,
591 &cfs_rq->tasks_timeline, leftmost);
592}
593
594static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
595{
596 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
597}
598
599struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
600{
601 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
602
603 if (!left)
604 return NULL;
605
606 return rb_entry(left, struct sched_entity, run_node);
607}
608
609static struct sched_entity *__pick_next_entity(struct sched_entity *se)
610{
611 struct rb_node *next = rb_next(&se->run_node);
612
613 if (!next)
614 return NULL;
615
616 return rb_entry(next, struct sched_entity, run_node);
617}
618
619#ifdef CONFIG_SCHED_DEBUG
620struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
621{
622 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
623
624 if (!last)
625 return NULL;
626
627 return rb_entry(last, struct sched_entity, run_node);
628}
629
630
631
632
633
634int sched_proc_update_handler(struct ctl_table *table, int write,
635 void __user *buffer, size_t *lenp,
636 loff_t *ppos)
637{
638 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
639 unsigned int factor = get_update_sysctl_factor();
640
641 if (ret || !write)
642 return ret;
643
644 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
645 sysctl_sched_min_granularity);
646
647#define WRT_SYSCTL(name) \
648 (normalized_sysctl_##name = sysctl_##name / (factor))
649 WRT_SYSCTL(sched_min_granularity);
650 WRT_SYSCTL(sched_latency);
651 WRT_SYSCTL(sched_wakeup_granularity);
652#undef WRT_SYSCTL
653
654 return 0;
655}
656#endif
657
658
659
660
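/*
 * delta /= w: scale a runtime delta by NICE_0_LOAD / se->load.weight.
 */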
661static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
662{
663 if (unlikely(se->load.weight != NICE_0_LOAD))
664 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
665
666 return delta;
667}
668
669
670
671
672
673
674
675
676
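/*
 * The scheduling period: the window in which every runnable task runs once.
 * With more than sched_nr_latency tasks the period is stretched rather than
 * squeezing slices below the minimum granularity:
 *
 *   p = (nr <= nl) ? latency : nr * min_granularity
 */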
677static u64 __sched_period(unsigned long nr_running)
678{
679 if (unlikely(nr_running > sched_nr_latency))
680 return nr_running * sysctl_sched_min_granularity;
681 else
682 return sysctl_sched_latency;
683}
684
685
686
687
688
689
690
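/*
 * The wall-time slice of a single entity: the period weighted by the
 * entity's share of its cfs_rq's load, applied at every level of the
 * hierarchy (an entity not yet on the runqueue has its weight added to the
 * queue load first):
 *
 *   s = p * P[w/rw]
 */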
691static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
692{
693 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
694
695 for_each_sched_entity(se) {
696 struct load_weight *load;
697 struct load_weight lw;
698
699 cfs_rq = cfs_rq_of(se);
700 load = &cfs_rq->load;
701
702 if (unlikely(!se->on_rq)) {
703 lw = cfs_rq->load;
704
705 update_load_add(&lw, se->load.weight);
706 load = &lw;
707 }
708 slice = __calc_delta(slice, se->load.weight, load);
709 }
710 return slice;
711}
712
713
714
715
716
717
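/*
 * The same slice expressed in virtual time: vs = s / w.
 */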
718static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
719{
720 return calc_delta_fair(sched_slice(cfs_rq, se), se);
721}
722
723#include "pelt.h"
724#ifdef CONFIG_SMP
725
726static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
727static unsigned long task_h_load(struct task_struct *p);
728static unsigned long capacity_of(int cpu);
729
730
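/*
 * Give a new task an initial load contribution equal to its (scaled)
 * weight; util_avg stays zero until post_init_entity_util_avg().  Group
 * entities start at zero and gain load as tasks are attached.
 */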
731void init_entity_runnable_average(struct sched_entity *se)
732{
733 struct sched_avg *sa = &se->avg;
734
735 memset(sa, 0, sizeof(*sa));
736
737
738
739
740
741
742
743 if (entity_is_task(se))
744 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
745
746 se->runnable_weight = se->load.weight;
747
748
749}
750
751static void attach_entity_cfs_rq(struct sched_entity *se);
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
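/*
 * Bootstrap a new task's util_avg from its runqueue: cap it at half of the
 * CPU capacity not already used by the cfs_rq,
 *
 *   cap = (cpu_capacity - cfs_rq->avg.util_avg) / 2
 *
 * and, when the cfs_rq already has utilization, scale that by the task's
 * weight relative to the runqueue load.  Tasks of other scheduling classes
 * only get their PELT clock synchronized here.
 */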
779void post_init_entity_util_avg(struct task_struct *p)
780{
781 struct sched_entity *se = &p->se;
782 struct cfs_rq *cfs_rq = cfs_rq_of(se);
783 struct sched_avg *sa = &se->avg;
784 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
785 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
786
787 if (cap > 0) {
788 if (cfs_rq->avg.util_avg != 0) {
789 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
790 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
791
792 if (sa->util_avg > cap)
793 sa->util_avg = cap;
794 } else {
795 sa->util_avg = cap;
796 }
797 }
798
799 if (p->sched_class != &fair_sched_class) {
800
801
802
803
804
805
806
807
808
809
810 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
811 return;
812 }
813
814 attach_entity_cfs_rq(se);
815}
816
817#else
818void init_entity_runnable_average(struct sched_entity *se)
819{
820}
821void post_init_entity_util_avg(struct task_struct *p)
822{
823}
824static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
825{
826}
827#endif
828
829
830
831
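/*
 * Update the current entity's runtime statistics and advance its vruntime.
 */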
832static void update_curr(struct cfs_rq *cfs_rq)
833{
834 struct sched_entity *curr = cfs_rq->curr;
835 u64 now = rq_clock_task(rq_of(cfs_rq));
836 u64 delta_exec;
837
838 if (unlikely(!curr))
839 return;
840
841 delta_exec = now - curr->exec_start;
842 if (unlikely((s64)delta_exec <= 0))
843 return;
844
845 curr->exec_start = now;
846
847 schedstat_set(curr->statistics.exec_max,
848 max(delta_exec, curr->statistics.exec_max));
849
850 curr->sum_exec_runtime += delta_exec;
851 schedstat_add(cfs_rq->exec_clock, delta_exec);
852
853 curr->vruntime += calc_delta_fair(delta_exec, curr);
854 update_min_vruntime(cfs_rq);
855
856 if (entity_is_task(curr)) {
857 struct task_struct *curtask = task_of(curr);
858
859 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
860 cgroup_account_cputime(curtask, delta_exec);
861 account_group_exec_runtime(curtask, delta_exec);
862 }
863
864 account_cfs_rq_runtime(cfs_rq, delta_exec);
865}
866
867static void update_curr_fair(struct rq *rq)
868{
869 update_curr(cfs_rq_of(&rq->curr->se));
870}
871
872static inline void
873update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
874{
875 u64 wait_start, prev_wait_start;
876
877 if (!schedstat_enabled())
878 return;
879
880 wait_start = rq_clock(rq_of(cfs_rq));
881 prev_wait_start = schedstat_val(se->statistics.wait_start);
882
883 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
884 likely(wait_start > prev_wait_start))
885 wait_start -= prev_wait_start;
886
887 __schedstat_set(se->statistics.wait_start, wait_start);
888}
889
890static inline void
891update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
892{
893 struct task_struct *p;
894 u64 delta;
895
896 if (!schedstat_enabled())
897 return;
898
899 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
900
901 if (entity_is_task(se)) {
902 p = task_of(se);
903 if (task_on_rq_migrating(p)) {
904
905
906
907
908
909 __schedstat_set(se->statistics.wait_start, delta);
910 return;
911 }
912 trace_sched_stat_wait(p, delta);
913 }
914
915 __schedstat_set(se->statistics.wait_max,
916 max(schedstat_val(se->statistics.wait_max), delta));
917 __schedstat_inc(se->statistics.wait_count);
918 __schedstat_add(se->statistics.wait_sum, delta);
919 __schedstat_set(se->statistics.wait_start, 0);
920}
921
922static inline void
923update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
924{
925 struct task_struct *tsk = NULL;
926 u64 sleep_start, block_start;
927
928 if (!schedstat_enabled())
929 return;
930
931 sleep_start = schedstat_val(se->statistics.sleep_start);
932 block_start = schedstat_val(se->statistics.block_start);
933
934 if (entity_is_task(se))
935 tsk = task_of(se);
936
937 if (sleep_start) {
938 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
939
940 if ((s64)delta < 0)
941 delta = 0;
942
943 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
944 __schedstat_set(se->statistics.sleep_max, delta);
945
946 __schedstat_set(se->statistics.sleep_start, 0);
947 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
948
949 if (tsk) {
950 account_scheduler_latency(tsk, delta >> 10, 1);
951 trace_sched_stat_sleep(tsk, delta);
952 }
953 }
954 if (block_start) {
955 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
956
957 if ((s64)delta < 0)
958 delta = 0;
959
960 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
961 __schedstat_set(se->statistics.block_max, delta);
962
963 __schedstat_set(se->statistics.block_start, 0);
964 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
965
966 if (tsk) {
967 if (tsk->in_iowait) {
968 __schedstat_add(se->statistics.iowait_sum, delta);
969 __schedstat_inc(se->statistics.iowait_count);
970 trace_sched_stat_iowait(tsk, delta);
971 }
972
973 trace_sched_stat_blocked(tsk, delta);
974
975
976
977
978
979
980 if (unlikely(prof_on == SLEEP_PROFILING)) {
981 profile_hits(SLEEP_PROFILING,
982 (void *)get_wchan(tsk),
983 delta >> 20);
984 }
985 account_scheduler_latency(tsk, delta >> 10, 0);
986 }
987 }
988}
989
990
991
992
993static inline void
994update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
995{
996 if (!schedstat_enabled())
997 return;
998
999
1000
1001
1002
1003 if (se != cfs_rq->curr)
1004 update_stats_wait_start(cfs_rq, se);
1005
1006 if (flags & ENQUEUE_WAKEUP)
1007 update_stats_enqueue_sleeper(cfs_rq, se);
1008}
1009
1010static inline void
1011update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1012{
1013
1014 if (!schedstat_enabled())
1015 return;
1016
1017
1018
1019
1020
1021 if (se != cfs_rq->curr)
1022 update_stats_wait_end(cfs_rq, se);
1023
1024 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1025 struct task_struct *tsk = task_of(se);
1026
1027 if (tsk->state & TASK_INTERRUPTIBLE)
1028 __schedstat_set(se->statistics.sleep_start,
1029 rq_clock(rq_of(cfs_rq)));
1030 if (tsk->state & TASK_UNINTERRUPTIBLE)
1031 __schedstat_set(se->statistics.block_start,
1032 rq_clock(rq_of(cfs_rq)));
1033 }
1034}
1035
1036
1037
1038
1039static inline void
1040update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1041{
1042
1043
1044
1045 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1046}
1047
1048
1049
1050
1051
1052#ifdef CONFIG_NUMA_BALANCING
1053
1054
1055
1056
1057
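/*
 * Approximate time, in milliseconds, over which a task's whole address
 * space should be scanned; the per-pass scan period is derived from these
 * values and the number of scan windows covering the task's RSS
 * (see task_scan_min()/task_scan_max()).
 */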
1058unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1059unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1060
1061
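/* Size of a single scan window, in MB */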
1062unsigned int sysctl_numa_balancing_scan_size = 256;
1063
1064
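/* Delay, in milliseconds, before the first scan of a new mm */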
1065unsigned int sysctl_numa_balancing_scan_delay = 1000;
1066
1067struct numa_group {
1068 refcount_t refcount;
1069
1070 spinlock_t lock;
1071 int nr_tasks;
1072 pid_t gid;
1073 int active_nodes;
1074
1075 struct rcu_head rcu;
1076 unsigned long total_faults;
1077 unsigned long max_faults_cpu;
1078
1079
1080
1081
1082
1083 unsigned long *faults_cpu;
1084 unsigned long faults[0];
1085};
1086
1087
1088
1089
1090
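/*
 * p->numa_group is RCU managed: another task's pointer may only be
 * dereferenced under its runqueue lock while it is not running (see the
 * rcu_dereference_check() condition below); the task itself may always do
 * so.
 */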
1091static struct numa_group *deref_task_numa_group(struct task_struct *p)
1092{
1093 return rcu_dereference_check(p->numa_group, p == current ||
1094 (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
1095}
1096
1097static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1098{
1099 return rcu_dereference_protected(p->numa_group, p == current);
1100}
1101
1102static inline unsigned long group_faults_priv(struct numa_group *ng);
1103static inline unsigned long group_faults_shared(struct numa_group *ng);
1104
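/*
 * Number of scan windows (of sysctl_numa_balancing_scan_size MB each)
 * needed to cover the task's RSS; an empty mm counts as one window.
 */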
1105static unsigned int task_nr_scan_windows(struct task_struct *p)
1106{
1107 unsigned long rss = 0;
1108 unsigned long nr_scan_pages;
1109
1110
1111
1112
1113
1114
1115 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1116 rss = get_mm_rss(p->mm);
1117 if (!rss)
1118 rss = nr_scan_pages;
1119
1120 rss = round_up(rss, nr_scan_pages);
1121 return rss / nr_scan_pages;
1122}
1123
1124
1125#define MAX_SCAN_WINDOW 2560
1126
1127static unsigned int task_scan_min(struct task_struct *p)
1128{
1129 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1130 unsigned int scan, floor;
1131 unsigned int windows = 1;
1132
1133 if (scan_size < MAX_SCAN_WINDOW)
1134 windows = MAX_SCAN_WINDOW / scan_size;
1135 floor = 1000 / windows;
1136
1137 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1138 return max_t(unsigned int, floor, scan);
1139}
1140
1141static unsigned int task_scan_start(struct task_struct *p)
1142{
1143 unsigned long smin = task_scan_min(p);
1144 unsigned long period = smin;
1145 struct numa_group *ng;
1146
1147
1148 rcu_read_lock();
1149 ng = rcu_dereference(p->numa_group);
1150 if (ng) {
1151 unsigned long shared = group_faults_shared(ng);
1152 unsigned long private = group_faults_priv(ng);
1153
1154 period *= refcount_read(&ng->refcount);
1155 period *= shared + 1;
1156 period /= private + shared + 1;
1157 }
1158 rcu_read_unlock();
1159
1160 return max(smin, period);
1161}
1162
1163static unsigned int task_scan_max(struct task_struct *p)
1164{
1165 unsigned long smin = task_scan_min(p);
1166 unsigned long smax;
1167 struct numa_group *ng;
1168
1169
1170 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1171
1172
1173 ng = deref_curr_numa_group(p);
1174 if (ng) {
1175 unsigned long shared = group_faults_shared(ng);
1176 unsigned long private = group_faults_priv(ng);
1177 unsigned long period = smax;
1178
1179 period *= refcount_read(&ng->refcount);
1180 period *= shared + 1;
1181 period /= private + shared + 1;
1182
1183 smax = max(smax, period);
1184 }
1185
1186 return max(smin, smax);
1187}
1188
1189static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1190{
1191 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1192 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1193}
1194
1195static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1196{
1197 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1198 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1199}
1200
1201
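/*
 * Faults are tracked per node in buckets of {private, shared} crossed with
 * {memory, CPU}, each with a long-term value and a temporary buffer
 * (NUMA_MEM/NUMA_MEMBUF/NUMA_CPU/NUMA_CPUBUF), hence the *2 steps below.
 */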
1202#define NR_NUMA_HINT_FAULT_TYPES 2
1203
1204
1205#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1206
1207
1208#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1209
1210pid_t task_numa_group_id(struct task_struct *p)
1211{
1212 struct numa_group *ng;
1213 pid_t gid = 0;
1214
1215 rcu_read_lock();
1216 ng = rcu_dereference(p->numa_group);
1217 if (ng)
1218 gid = ng->gid;
1219 rcu_read_unlock();
1220
1221 return gid;
1222}
1223
1224
1225
1226
1227
1228
1229
1230static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1231{
1232 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1233}
1234
1235static inline unsigned long task_faults(struct task_struct *p, int nid)
1236{
1237 if (!p->numa_faults)
1238 return 0;
1239
1240 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1241 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1242}
1243
1244static inline unsigned long group_faults(struct task_struct *p, int nid)
1245{
1246 struct numa_group *ng = deref_task_numa_group(p);
1247
1248 if (!ng)
1249 return 0;
1250
1251 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1252 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1253}
1254
1255static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1256{
1257 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1258 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1259}
1260
1261static inline unsigned long group_faults_priv(struct numa_group *ng)
1262{
1263 unsigned long faults = 0;
1264 int node;
1265
1266 for_each_online_node(node) {
1267 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1268 }
1269
1270 return faults;
1271}
1272
1273static inline unsigned long group_faults_shared(struct numa_group *ng)
1274{
1275 unsigned long faults = 0;
1276 int node;
1277
1278 for_each_online_node(node) {
1279 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1280 }
1281
1282 return faults;
1283}
1284
1285
1286
1287
1288
1289
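/*
 * A node counts as active for a numa_group if it sees more than
 * 1/ACTIVE_NODE_FRACTION of the group's maximum per-node CPU faults.
 */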
1290#define ACTIVE_NODE_FRACTION 3
1291
1292static bool numa_is_active_node(int nid, struct numa_group *ng)
1293{
1294 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1295}
1296
1297
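/* Handle placement on systems where not all nodes are directly connected. */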
1298static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1299 int maxdist, bool task)
1300{
1301 unsigned long score = 0;
1302 int node;
1303
1304
1305
1306
1307
1308 if (sched_numa_topology_type == NUMA_DIRECT)
1309 return 0;
1310
1311
1312
1313
1314
1315 for_each_online_node(node) {
1316 unsigned long faults;
1317 int dist = node_distance(nid, node);
1318
1319
1320
1321
1322
1323 if (dist == sched_max_numa_distance || node == nid)
1324 continue;
1325
1326
1327
1328
1329
1330
1331
1332
1333 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1334 dist >= maxdist)
1335 continue;
1336
1337
1338 if (task)
1339 faults = task_faults(p, node);
1340 else
1341 faults = group_faults(p, node);
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1352 faults *= (sched_max_numa_distance - dist);
1353 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1354 }
1355
1356 score += faults;
1357 }
1358
1359 return score;
1360}
1361
1362
1363
1364
1365
1366
1367
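/*
 * Fraction (in thousandths) of the task's (task_weight) or its numa_group's
 * (group_weight) recorded faults that target @nid, including a contribution
 * from nearby nodes on indirect topologies.
 */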
1368static inline unsigned long task_weight(struct task_struct *p, int nid,
1369 int dist)
1370{
1371 unsigned long faults, total_faults;
1372
1373 if (!p->numa_faults)
1374 return 0;
1375
1376 total_faults = p->total_numa_faults;
1377
1378 if (!total_faults)
1379 return 0;
1380
1381 faults = task_faults(p, nid);
1382 faults += score_nearby_nodes(p, nid, dist, true);
1383
1384 return 1000 * faults / total_faults;
1385}
1386
1387static inline unsigned long group_weight(struct task_struct *p, int nid,
1388 int dist)
1389{
1390 struct numa_group *ng = deref_task_numa_group(p);
1391 unsigned long faults, total_faults;
1392
1393 if (!ng)
1394 return 0;
1395
1396 total_faults = ng->total_faults;
1397
1398 if (!total_faults)
1399 return 0;
1400
1401 faults = group_faults(p, nid);
1402 faults += score_nearby_nodes(p, nid, dist, false);
1403
1404 return 1000 * faults / total_faults;
1405}
1406
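/*
 * Decide whether a page that triggered a NUMA hinting fault should be
 * migrated towards the faulting CPU's node, based on who touched it last
 * and on the numa_group's fault distribution.
 */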
1407bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1408 int src_nid, int dst_cpu)
1409{
1410 struct numa_group *ng = deref_curr_numa_group(p);
1411 int dst_nid = cpu_to_node(dst_cpu);
1412 int last_cpupid, this_cpupid;
1413
1414 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1415 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1416
1417
1418
1419
1420
1421
1422
1423 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1424 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1425 return true;
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444 if (!cpupid_pid_unset(last_cpupid) &&
1445 cpupid_to_nid(last_cpupid) != dst_nid)
1446 return false;
1447
1448
1449 if (cpupid_match_pid(p, last_cpupid))
1450 return true;
1451
1452
1453 if (!ng)
1454 return true;
1455
1456
1457
1458
1459
1460 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1461 ACTIVE_NODE_FRACTION)
1462 return true;
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1473 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1474}
1475
1476static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
1477
1478static unsigned long cpu_runnable_load(struct rq *rq)
1479{
1480 return cfs_rq_runnable_load_avg(&rq->cfs);
1481}
1482
1483
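/* Cached per-node load/capacity summary used by task_numa_migrate() */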
1484struct numa_stats {
1485 unsigned long load;
1486
1487
1488 unsigned long compute_capacity;
1489};
1490
1491
1492
1493
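/*
 * Sum the runnable load and compute capacity of all CPUs on node @nid.
 */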
1494static void update_numa_stats(struct numa_stats *ns, int nid)
1495{
1496 int cpu;
1497
1498 memset(ns, 0, sizeof(*ns));
1499 for_each_cpu(cpu, cpumask_of_node(nid)) {
1500 struct rq *rq = cpu_rq(cpu);
1501
1502 ns->load += cpu_runnable_load(rq);
1503 ns->compute_capacity += capacity_of(cpu);
1504 }
1505
1506}
1507
1508struct task_numa_env {
1509 struct task_struct *p;
1510
1511 int src_cpu, src_nid;
1512 int dst_cpu, dst_nid;
1513
1514 struct numa_stats src_stats, dst_stats;
1515
1516 int imbalance_pct;
1517 int dist;
1518
1519 struct task_struct *best_task;
1520 long best_imp;
1521 int best_cpu;
1522};
1523
1524static void task_numa_assign(struct task_numa_env *env,
1525 struct task_struct *p, long imp)
1526{
1527 struct rq *rq = cpu_rq(env->dst_cpu);
1528
1529
1530 if (xchg(&rq->numa_migrate_on, 1))
1531 return;
1532
1533
1534
1535
1536
1537 if (env->best_cpu != -1) {
1538 rq = cpu_rq(env->best_cpu);
1539 WRITE_ONCE(rq->numa_migrate_on, 0);
1540 }
1541
1542 if (env->best_task)
1543 put_task_struct(env->best_task);
1544 if (p)
1545 get_task_struct(p);
1546
1547 env->best_task = p;
1548 env->best_imp = imp;
1549 env->best_cpu = env->dst_cpu;
1550}
1551
1552static bool load_too_imbalanced(long src_load, long dst_load,
1553 struct task_numa_env *env)
1554{
1555 long imb, old_imb;
1556 long orig_src_load, orig_dst_load;
1557 long src_capacity, dst_capacity;
1558
1559
1560
1561
1562
1563
1564
1565
1566 src_capacity = env->src_stats.compute_capacity;
1567 dst_capacity = env->dst_stats.compute_capacity;
1568
1569 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1570
1571 orig_src_load = env->src_stats.load;
1572 orig_dst_load = env->dst_stats.load;
1573
1574 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1575
1576
1577 return (imb > old_imb);
1578}
1579
1580
1581
1582
1583
1584
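/*
 * Improvements smaller than SMALLIMP, or not clearly better than the
 * current best, are ignored (see the check in task_numa_compare()) so tasks
 * are not moved back and forth for marginal gains.
 */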
1585#define SMALLIMP 30
1586
1587
1588
1589
1590
1591
1592
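/*
 * Check whether swapping env->p with the task currently running on the
 * destination CPU (or moving onto an idle destination) would improve the
 * combined NUMA locality, and record the best candidate found so far.
 */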
1593static void task_numa_compare(struct task_numa_env *env,
1594 long taskimp, long groupimp, bool maymove)
1595{
1596 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1597 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1598 long imp = p_ng ? groupimp : taskimp;
1599 struct task_struct *cur;
1600 long src_load, dst_load;
1601 int dist = env->dist;
1602 long moveimp = imp;
1603 long load;
1604
1605 if (READ_ONCE(dst_rq->numa_migrate_on))
1606 return;
1607
1608 rcu_read_lock();
1609 cur = rcu_dereference(dst_rq->curr);
1610 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1611 cur = NULL;
1612
1613
1614
1615
1616
1617 if (cur == env->p)
1618 goto unlock;
1619
1620 if (!cur) {
1621 if (maymove && moveimp >= env->best_imp)
1622 goto assign;
1623 else
1624 goto unlock;
1625 }
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1636 goto unlock;
1637
1638
1639
1640
1641
1642 cur_ng = rcu_dereference(cur->numa_group);
1643 if (cur_ng == p_ng) {
1644 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1645 task_weight(cur, env->dst_nid, dist);
1646
1647
1648
1649
1650 if (cur_ng)
1651 imp -= imp / 16;
1652 } else {
1653
1654
1655
1656
1657 if (cur_ng && p_ng)
1658 imp += group_weight(cur, env->src_nid, dist) -
1659 group_weight(cur, env->dst_nid, dist);
1660 else
1661 imp += task_weight(cur, env->src_nid, dist) -
1662 task_weight(cur, env->dst_nid, dist);
1663 }
1664
1665 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1666 imp = moveimp;
1667 cur = NULL;
1668 goto assign;
1669 }
1670
1671
1672
1673
1674
1675
1676
1677 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1678 goto unlock;
1679
1680
1681
1682
1683 load = task_h_load(env->p) - task_h_load(cur);
1684 if (!load)
1685 goto assign;
1686
1687 dst_load = env->dst_stats.load + load;
1688 src_load = env->src_stats.load - load;
1689
1690 if (load_too_imbalanced(src_load, dst_load, env))
1691 goto unlock;
1692
1693assign:
1694
1695
1696
1697
1698 if (!cur) {
1699
1700
1701
1702
1703 local_irq_disable();
1704 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1705 env->dst_cpu);
1706 local_irq_enable();
1707 }
1708
1709 task_numa_assign(env, cur, imp);
1710unlock:
1711 rcu_read_unlock();
1712}
1713
1714static void task_numa_find_cpu(struct task_numa_env *env,
1715 long taskimp, long groupimp)
1716{
1717 long src_load, dst_load, load;
1718 bool maymove = false;
1719 int cpu;
1720
1721 load = task_h_load(env->p);
1722 dst_load = env->dst_stats.load + load;
1723 src_load = env->src_stats.load - load;
1724
1725
1726
1727
1728
1729 maymove = !load_too_imbalanced(src_load, dst_load, env);
1730
1731 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1732
1733 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1734 continue;
1735
1736 env->dst_cpu = cpu;
1737 task_numa_compare(env, taskimp, groupimp, maymove);
1738 }
1739}
1740
1741static int task_numa_migrate(struct task_struct *p)
1742{
1743 struct task_numa_env env = {
1744 .p = p,
1745
1746 .src_cpu = task_cpu(p),
1747 .src_nid = task_node(p),
1748
1749 .imbalance_pct = 112,
1750
1751 .best_task = NULL,
1752 .best_imp = 0,
1753 .best_cpu = -1,
1754 };
1755 unsigned long taskweight, groupweight;
1756 struct sched_domain *sd;
1757 long taskimp, groupimp;
1758 struct numa_group *ng;
1759 struct rq *best_rq;
1760 int nid, ret, dist;
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770 rcu_read_lock();
1771 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1772 if (sd)
1773 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1774 rcu_read_unlock();
1775
1776
1777
1778
1779
1780
1781
1782 if (unlikely(!sd)) {
1783 sched_setnuma(p, task_node(p));
1784 return -EINVAL;
1785 }
1786
1787 env.dst_nid = p->numa_preferred_nid;
1788 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1789 taskweight = task_weight(p, env.src_nid, dist);
1790 groupweight = group_weight(p, env.src_nid, dist);
1791 update_numa_stats(&env.src_stats, env.src_nid);
1792 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1793 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1794 update_numa_stats(&env.dst_stats, env.dst_nid);
1795
1796
1797 task_numa_find_cpu(&env, taskimp, groupimp);
1798
1799
1800
1801
1802
1803
1804
1805
1806 ng = deref_curr_numa_group(p);
1807 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1808 for_each_online_node(nid) {
1809 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1810 continue;
1811
1812 dist = node_distance(env.src_nid, env.dst_nid);
1813 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1814 dist != env.dist) {
1815 taskweight = task_weight(p, env.src_nid, dist);
1816 groupweight = group_weight(p, env.src_nid, dist);
1817 }
1818
1819
1820 taskimp = task_weight(p, nid, dist) - taskweight;
1821 groupimp = group_weight(p, nid, dist) - groupweight;
1822 if (taskimp < 0 && groupimp < 0)
1823 continue;
1824
1825 env.dist = dist;
1826 env.dst_nid = nid;
1827 update_numa_stats(&env.dst_stats, env.dst_nid);
1828 task_numa_find_cpu(&env, taskimp, groupimp);
1829 }
1830 }
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840 if (ng) {
1841 if (env.best_cpu == -1)
1842 nid = env.src_nid;
1843 else
1844 nid = cpu_to_node(env.best_cpu);
1845
1846 if (nid != p->numa_preferred_nid)
1847 sched_setnuma(p, nid);
1848 }
1849
1850
1851 if (env.best_cpu == -1)
1852 return -EAGAIN;
1853
1854 best_rq = cpu_rq(env.best_cpu);
1855 if (env.best_task == NULL) {
1856 ret = migrate_task_to(p, env.best_cpu);
1857 WRITE_ONCE(best_rq->numa_migrate_on, 0);
1858 if (ret != 0)
1859 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1860 return ret;
1861 }
1862
1863 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
1864 WRITE_ONCE(best_rq->numa_migrate_on, 0);
1865
1866 if (ret != 0)
1867 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1868 put_task_struct(env.best_task);
1869 return ret;
1870}
1871
1872
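/* Attempt to migrate a task to its preferred node, rate-limited. */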
1873static void numa_migrate_preferred(struct task_struct *p)
1874{
1875 unsigned long interval = HZ;
1876
1877
1878 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
1879 return;
1880
1881
1882 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1883 p->numa_migrate_retry = jiffies + interval;
1884
1885
1886 if (task_node(p) == p->numa_preferred_nid)
1887 return;
1888
1889
1890 task_numa_migrate(p);
1891}
1892
1893
1894
1895
1896
1897
1898
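/*
 * Recount which nodes a numa_group is actively using: a node is active when
 * it sees more than 1/ACTIVE_NODE_FRACTION of the group's maximum per-node
 * CPU faults.
 */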
1899static void numa_group_count_active_nodes(struct numa_group *numa_group)
1900{
1901 unsigned long faults, max_faults = 0;
1902 int nid, active_nodes = 0;
1903
1904 for_each_online_node(nid) {
1905 faults = group_faults_cpu(numa_group, nid);
1906 if (faults > max_faults)
1907 max_faults = faults;
1908 }
1909
1910 for_each_online_node(nid) {
1911 faults = group_faults_cpu(numa_group, nid);
1912 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1913 active_nodes++;
1914 }
1915
1916 numa_group->max_faults_cpu = max_faults;
1917 numa_group->active_nodes = active_nodes;
1918}
1919
1920
1921
1922
1923
1924
1925
1926
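/*
 * The scan period is adapted in steps of 1/NUMA_PERIOD_SLOTS of its current
 * value: update_task_scan_period() slows scanning down when faults are
 * mostly private or mostly local (ratio >= NUMA_PERIOD_THRESHOLD slots) and
 * speeds it up otherwise.
 */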
1927#define NUMA_PERIOD_SLOTS 10
1928#define NUMA_PERIOD_THRESHOLD 7
1929
1930
1931
1932
1933
1934
1935
1936static void update_task_scan_period(struct task_struct *p,
1937 unsigned long shared, unsigned long private)
1938{
1939 unsigned int period_slot;
1940 int lr_ratio, ps_ratio;
1941 int diff;
1942
1943 unsigned long remote = p->numa_faults_locality[0];
1944 unsigned long local = p->numa_faults_locality[1];
1945
1946
1947
1948
1949
1950
1951
1952
1953 if (local + shared == 0 || p->numa_faults_locality[2]) {
1954 p->numa_scan_period = min(p->numa_scan_period_max,
1955 p->numa_scan_period << 1);
1956
1957 p->mm->numa_next_scan = jiffies +
1958 msecs_to_jiffies(p->numa_scan_period);
1959
1960 return;
1961 }
1962
1963
1964
1965
1966
1967
1968
1969 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1970 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1971 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1972
1973 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1974
1975
1976
1977
1978 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1979 if (!slot)
1980 slot = 1;
1981 diff = slot * period_slot;
1982 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1983
1984
1985
1986
1987
1988 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
1989 if (!slot)
1990 slot = 1;
1991 diff = slot * period_slot;
1992 } else {
1993
1994
1995
1996
1997
1998 int ratio = max(lr_ratio, ps_ratio);
1999 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2000 }
2001
2002 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2003 task_scan_min(p), task_scan_max(p));
2004 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2005}
2006
2007
2008
2009
2010
2011
2012
2013
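/*
 * Runtime accumulated since the last NUMA placement pass, plus the length
 * of that interval (*period); used to weight the CPU fault statistics.
 */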
2014static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2015{
2016 u64 runtime, delta, now;
2017
2018 now = p->se.exec_start;
2019 runtime = p->se.sum_exec_runtime;
2020
2021 if (p->last_task_numa_placement) {
2022 delta = runtime - p->last_sum_exec_runtime;
2023 *period = now - p->last_task_numa_placement;
2024
2025
2026 if (unlikely((s64)*period < 0))
2027 *period = 0;
2028 } else {
2029 delta = p->se.avg.load_sum;
2030 *period = LOAD_AVG_MAX;
2031 }
2032
2033 p->last_sum_exec_runtime = runtime;
2034 p->last_task_numa_placement = now;
2035
2036 return delta;
2037}
2038
2039
2040
2041
2042
2043
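/*
 * Pick the preferred node for a task in a numa_group.  On glueless-mesh
 * topologies the node with the highest group weight wins; on backplane
 * topologies the set of candidate nodes is iteratively narrowed to the
 * interconnect group with the most faults.
 */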
2044static int preferred_group_nid(struct task_struct *p, int nid)
2045{
2046 nodemask_t nodes;
2047 int dist;
2048
2049
2050 if (sched_numa_topology_type == NUMA_DIRECT)
2051 return nid;
2052
2053
2054
2055
2056
2057
2058 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2059 unsigned long score, max_score = 0;
2060 int node, max_node = nid;
2061
2062 dist = sched_max_numa_distance;
2063
2064 for_each_online_node(node) {
2065 score = group_weight(p, node, dist);
2066 if (score > max_score) {
2067 max_score = score;
2068 max_node = node;
2069 }
2070 }
2071 return max_node;
2072 }
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083 nodes = node_online_map;
2084 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2085 unsigned long max_faults = 0;
2086 nodemask_t max_group = NODE_MASK_NONE;
2087 int a, b;
2088
2089
2090 if (!find_numa_distance(dist))
2091 continue;
2092
2093 for_each_node_mask(a, nodes) {
2094 unsigned long faults = 0;
2095 nodemask_t this_group;
2096 nodes_clear(this_group);
2097
2098
2099 for_each_node_mask(b, nodes) {
2100 if (node_distance(a, b) < dist) {
2101 faults += group_faults(p, b);
2102 node_set(b, this_group);
2103 node_clear(b, nodes);
2104 }
2105 }
2106
2107
2108 if (faults > max_faults) {
2109 max_faults = faults;
2110 max_group = this_group;
2111
2112
2113
2114
2115
2116 nid = a;
2117 }
2118 }
2119
2120 if (!max_faults)
2121 break;
2122 nodes = max_group;
2123 }
2124 return nid;
2125}
2126
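/*
 * Fold the per-node fault buffers into the long-term statistics and
 * (re)select the task's preferred node.
 */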
2127static void task_numa_placement(struct task_struct *p)
2128{
2129 int seq, nid, max_nid = NUMA_NO_NODE;
2130 unsigned long max_faults = 0;
2131 unsigned long fault_types[2] = { 0, 0 };
2132 unsigned long total_faults;
2133 u64 runtime, period;
2134 spinlock_t *group_lock = NULL;
2135 struct numa_group *ng;
2136
2137
2138
2139
2140
2141
2142 seq = READ_ONCE(p->mm->numa_scan_seq);
2143 if (p->numa_scan_seq == seq)
2144 return;
2145 p->numa_scan_seq = seq;
2146 p->numa_scan_period_max = task_scan_max(p);
2147
2148 total_faults = p->numa_faults_locality[0] +
2149 p->numa_faults_locality[1];
2150 runtime = numa_get_avg_runtime(p, &period);
2151
2152
2153 ng = deref_curr_numa_group(p);
2154 if (ng) {
2155 group_lock = &ng->lock;
2156 spin_lock_irq(group_lock);
2157 }
2158
2159
2160 for_each_online_node(nid) {
2161
2162 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2163 unsigned long faults = 0, group_faults = 0;
2164 int priv;
2165
2166 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2167 long diff, f_diff, f_weight;
2168
2169 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2170 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2171 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2172 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2173
2174
2175 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2176 fault_types[priv] += p->numa_faults[membuf_idx];
2177 p->numa_faults[membuf_idx] = 0;
2178
2179
2180
2181
2182
2183
2184
2185
2186 f_weight = div64_u64(runtime << 16, period + 1);
2187 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2188 (total_faults + 1);
2189 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2190 p->numa_faults[cpubuf_idx] = 0;
2191
2192 p->numa_faults[mem_idx] += diff;
2193 p->numa_faults[cpu_idx] += f_diff;
2194 faults += p->numa_faults[mem_idx];
2195 p->total_numa_faults += diff;
2196 if (ng) {
2197
2198
2199
2200
2201
2202
2203
2204 ng->faults[mem_idx] += diff;
2205 ng->faults_cpu[mem_idx] += f_diff;
2206 ng->total_faults += diff;
2207 group_faults += ng->faults[mem_idx];
2208 }
2209 }
2210
2211 if (!ng) {
2212 if (faults > max_faults) {
2213 max_faults = faults;
2214 max_nid = nid;
2215 }
2216 } else if (group_faults > max_faults) {
2217 max_faults = group_faults;
2218 max_nid = nid;
2219 }
2220 }
2221
2222 if (ng) {
2223 numa_group_count_active_nodes(ng);
2224 spin_unlock_irq(group_lock);
2225 max_nid = preferred_group_nid(p, max_nid);
2226 }
2227
2228 if (max_faults) {
2229
2230 if (max_nid != p->numa_preferred_nid)
2231 sched_setnuma(p, max_nid);
2232 }
2233
2234 update_task_scan_period(p, fault_types[0], fault_types[1]);
2235}
2236
2237static inline int get_numa_group(struct numa_group *grp)
2238{
2239 return refcount_inc_not_zero(&grp->refcount);
2240}
2241
2242static inline void put_numa_group(struct numa_group *grp)
2243{
2244 if (refcount_dec_and_test(&grp->refcount))
2245 kfree_rcu(grp, rcu);
2246}
2247
2248static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2249 int *priv)
2250{
2251 struct numa_group *grp, *my_grp;
2252 struct task_struct *tsk;
2253 bool join = false;
2254 int cpu = cpupid_to_cpu(cpupid);
2255 int i;
2256
2257 if (unlikely(!deref_curr_numa_group(p))) {
2258 unsigned int size = sizeof(struct numa_group) +
2259 4*nr_node_ids*sizeof(unsigned long);
2260
2261 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2262 if (!grp)
2263 return;
2264
2265 refcount_set(&grp->refcount, 1);
2266 grp->active_nodes = 1;
2267 grp->max_faults_cpu = 0;
2268 spin_lock_init(&grp->lock);
2269 grp->gid = p->pid;
2270
2271 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2272 nr_node_ids;
2273
2274 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2275 grp->faults[i] = p->numa_faults[i];
2276
2277 grp->total_faults = p->total_numa_faults;
2278
2279 grp->nr_tasks++;
2280 rcu_assign_pointer(p->numa_group, grp);
2281 }
2282
2283 rcu_read_lock();
2284 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2285
2286 if (!cpupid_match_pid(tsk, cpupid))
2287 goto no_join;
2288
2289 grp = rcu_dereference(tsk->numa_group);
2290 if (!grp)
2291 goto no_join;
2292
2293 my_grp = deref_curr_numa_group(p);
2294 if (grp == my_grp)
2295 goto no_join;
2296
2297
2298
2299
2300
2301 if (my_grp->nr_tasks > grp->nr_tasks)
2302 goto no_join;
2303
2304
2305
2306
2307 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2308 goto no_join;
2309
2310
2311 if (tsk->mm == current->mm)
2312 join = true;
2313
2314
2315 if (flags & TNF_SHARED)
2316 join = true;
2317
2318
2319 *priv = !join;
2320
2321 if (join && !get_numa_group(grp))
2322 goto no_join;
2323
2324 rcu_read_unlock();
2325
2326 if (!join)
2327 return;
2328
2329 BUG_ON(irqs_disabled());
2330 double_lock_irq(&my_grp->lock, &grp->lock);
2331
2332 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2333 my_grp->faults[i] -= p->numa_faults[i];
2334 grp->faults[i] += p->numa_faults[i];
2335 }
2336 my_grp->total_faults -= p->total_numa_faults;
2337 grp->total_faults += p->total_numa_faults;
2338
2339 my_grp->nr_tasks--;
2340 grp->nr_tasks++;
2341
2342 spin_unlock(&my_grp->lock);
2343 spin_unlock_irq(&grp->lock);
2344
2345 rcu_assign_pointer(p->numa_group, grp);
2346
2347 put_numa_group(my_grp);
2348 return;
2349
2350no_join:
2351 rcu_read_unlock();
2352 return;
2353}
2354
2355
2356
2357
2358
2359
2360
2361
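/*
 * Drop the task's NUMA statistics.  If @final the task is dead and
 * p->numa_faults can be freed; otherwise only reset the counters, as
 * concurrent readers (load balancing, procfs) may still look at them.
 */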
2362void task_numa_free(struct task_struct *p, bool final)
2363{
2364
2365 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2366 unsigned long *numa_faults = p->numa_faults;
2367 unsigned long flags;
2368 int i;
2369
2370 if (!numa_faults)
2371 return;
2372
2373 if (grp) {
2374 spin_lock_irqsave(&grp->lock, flags);
2375 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2376 grp->faults[i] -= p->numa_faults[i];
2377 grp->total_faults -= p->total_numa_faults;
2378
2379 grp->nr_tasks--;
2380 spin_unlock_irqrestore(&grp->lock, flags);
2381 RCU_INIT_POINTER(p->numa_group, NULL);
2382 put_numa_group(grp);
2383 }
2384
2385 if (final) {
2386 p->numa_faults = NULL;
2387 kfree(numa_faults);
2388 } else {
2389 p->total_numa_faults = 0;
2390 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2391 numa_faults[i] = 0;
2392 }
2393}
2394
2395
2396
2397
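/*
 * Record a NUMA hinting fault: @pages pages on @mem_node were touched from
 * this CPU's node; update the fault statistics and, when due, rerun
 * placement and migration.
 */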
2398void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2399{
2400 struct task_struct *p = current;
2401 bool migrated = flags & TNF_MIGRATED;
2402 int cpu_node = task_node(current);
2403 int local = !!(flags & TNF_FAULT_LOCAL);
2404 struct numa_group *ng;
2405 int priv;
2406
2407 if (!static_branch_likely(&sched_numa_balancing))
2408 return;
2409
2410
2411 if (!p->mm)
2412 return;
2413
2414
2415 if (unlikely(!p->numa_faults)) {
2416 int size = sizeof(*p->numa_faults) *
2417 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2418
2419 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2420 if (!p->numa_faults)
2421 return;
2422
2423 p->total_numa_faults = 0;
2424 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2425 }
2426
2427
2428
2429
2430
2431 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2432 priv = 1;
2433 } else {
2434 priv = cpupid_match_pid(p, last_cpupid);
2435 if (!priv && !(flags & TNF_NO_GROUP))
2436 task_numa_group(p, last_cpupid, flags, &priv);
2437 }
2438
2439
2440
2441
2442
2443
2444
2445 ng = deref_curr_numa_group(p);
2446 if (!priv && !local && ng && ng->active_nodes > 1 &&
2447 numa_is_active_node(cpu_node, ng) &&
2448 numa_is_active_node(mem_node, ng))
2449 local = 1;
2450
2451
2452
2453
2454
2455 if (time_after(jiffies, p->numa_migrate_retry)) {
2456 task_numa_placement(p);
2457 numa_migrate_preferred(p);
2458 }
2459
2460 if (migrated)
2461 p->numa_pages_migrated += pages;
2462 if (flags & TNF_MIGRATE_FAIL)
2463 p->numa_faults_locality[2] += pages;
2464
2465 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2466 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2467 p->numa_faults_locality[local] += pages;
2468}
2469
2470static void reset_ptenuma_scan(struct task_struct *p)
2471{
2472
2473
2474
2475
2476
2477
2478
2479
2480 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2481 p->mm->numa_scan_offset = 0;
2482}
2483
2484
2485
2486
2487
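/*
 * The expensive part of NUMA balancing: walk a chunk of the address space
 * and mark it PROT_NONE so that subsequent faults reveal which nodes the
 * task's memory is accessed from.  Runs from task_work context, triggered
 * by task_tick_numa().
 */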
2488static void task_numa_work(struct callback_head *work)
2489{
2490 unsigned long migrate, next_scan, now = jiffies;
2491 struct task_struct *p = current;
2492 struct mm_struct *mm = p->mm;
2493 u64 runtime = p->se.sum_exec_runtime;
2494 struct vm_area_struct *vma;
2495 unsigned long start, end;
2496 unsigned long nr_pte_updates = 0;
2497 long pages, virtpages;
2498
2499 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2500
2501 work->next = work;
2502
2503
2504
2505
2506
2507
2508
2509
2510 if (p->flags & PF_EXITING)
2511 return;
2512
2513 if (!mm->numa_next_scan) {
2514 mm->numa_next_scan = now +
2515 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2516 }
2517
2518
2519
2520
2521 migrate = mm->numa_next_scan;
2522 if (time_before(now, migrate))
2523 return;
2524
2525 if (p->numa_scan_period == 0) {
2526 p->numa_scan_period_max = task_scan_max(p);
2527 p->numa_scan_period = task_scan_start(p);
2528 }
2529
2530 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2531 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2532 return;
2533
2534
2535
2536
2537
2538 p->node_stamp += 2 * TICK_NSEC;
2539
2540 start = mm->numa_scan_offset;
2541 pages = sysctl_numa_balancing_scan_size;
2542 pages <<= 20 - PAGE_SHIFT;
2543 virtpages = pages * 8;
2544 if (!pages)
2545 return;
2546
2547
2548 if (!down_read_trylock(&mm->mmap_sem))
2549 return;
2550 vma = find_vma(mm, start);
2551 if (!vma) {
2552 reset_ptenuma_scan(p);
2553 start = 0;
2554 vma = mm->mmap;
2555 }
2556 for (; vma; vma = vma->vm_next) {
2557 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2558 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2559 continue;
2560 }
2561
2562
2563
2564
2565
2566
2567
2568 if (!vma->vm_mm ||
2569 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2570 continue;
2571
2572
2573
2574
2575
2576 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2577 continue;
2578
2579 do {
2580 start = max(start, vma->vm_start);
2581 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2582 end = min(end, vma->vm_end);
2583 nr_pte_updates = change_prot_numa(vma, start, end);
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593 if (nr_pte_updates)
2594 pages -= (end - start) >> PAGE_SHIFT;
2595 virtpages -= (end - start) >> PAGE_SHIFT;
2596
2597 start = end;
2598 if (pages <= 0 || virtpages <= 0)
2599 goto out;
2600
2601 cond_resched();
2602 } while (end != vma->vm_end);
2603 }
2604
2605out:
2606
2607
2608
2609
2610
2611
2612 if (vma)
2613 mm->numa_scan_offset = start;
2614 else
2615 reset_ptenuma_scan(p);
2616 up_read(&mm->mmap_sem);
2617
2618
2619
2620
2621
2622
2623
2624 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2625 u64 diff = p->se.sum_exec_runtime - runtime;
2626 p->node_stamp += 32 * diff;
2627 }
2628}
2629
2630void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2631{
2632 int mm_users = 0;
2633 struct mm_struct *mm = p->mm;
2634
2635 if (mm) {
2636 mm_users = atomic_read(&mm->mm_users);
2637 if (mm_users == 1) {
2638 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2639 mm->numa_scan_seq = 0;
2640 }
2641 }
2642 p->node_stamp = 0;
2643 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2644 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2645
2646 p->numa_work.next = &p->numa_work;
2647 p->numa_faults = NULL;
2648 RCU_INIT_POINTER(p->numa_group, NULL);
2649 p->last_task_numa_placement = 0;
2650 p->last_sum_exec_runtime = 0;
2651
2652 init_task_work(&p->numa_work, task_numa_work);
2653
2654
2655 if (!(clone_flags & CLONE_VM)) {
2656 p->numa_preferred_nid = NUMA_NO_NODE;
2657 return;
2658 }
2659
2660
2661
2662
2663
2664 if (mm) {
2665 unsigned int delay;
2666
2667 delay = min_t(unsigned int, task_scan_max(current),
2668 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2669 delay += 2 * TICK_NSEC;
2670 p->node_stamp = delay;
2671 }
2672}
2673
2674
2675
2676
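/*
 * Drive the periodic NUMA scanning from the scheduler tick.
 */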
2677static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2678{
2679 struct callback_head *work = &curr->numa_work;
2680 u64 period, now;
2681
2682
2683
2684
2685 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2686 return;
2687
2688
2689
2690
2691
2692
2693
2694 now = curr->se.sum_exec_runtime;
2695 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2696
2697 if (now > curr->node_stamp + period) {
2698 if (!curr->node_stamp)
2699 curr->numa_scan_period = task_scan_start(curr);
2700 curr->node_stamp += period;
2701
2702 if (!time_before(jiffies, curr->mm->numa_next_scan))
2703 task_work_add(curr, work, true);
2704 }
2705}
2706
2707static void update_scan_period(struct task_struct *p, int new_cpu)
2708{
2709 int src_nid = cpu_to_node(task_cpu(p));
2710 int dst_nid = cpu_to_node(new_cpu);
2711
2712 if (!static_branch_likely(&sched_numa_balancing))
2713 return;
2714
2715 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2716 return;
2717
2718 if (src_nid == dst_nid)
2719 return;
2720
2721
2722
2723
2724
2725
2726 if (p->numa_scan_seq) {
2727
2728
2729
2730
2731
2732 if (dst_nid == p->numa_preferred_nid ||
2733 (p->numa_preferred_nid != NUMA_NO_NODE &&
2734 src_nid != p->numa_preferred_nid))
2735 return;
2736 }
2737
2738 p->numa_scan_period = task_scan_start(p);
2739}
2740
2741#else
2742static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2743{
2744}
2745
2746static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2747{
2748}
2749
2750static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2751{
2752}
2753
2754static inline void update_scan_period(struct task_struct *p, int new_cpu)
2755{
2756}
2757
2758#endif
2759
2760static void
2761account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
2763 update_load_add(&cfs_rq->load, se->load.weight);
2764#ifdef CONFIG_SMP
2765 if (entity_is_task(se)) {
2766 struct rq *rq = rq_of(cfs_rq);
2767
2768 account_numa_enqueue(rq, task_of(se));
2769 list_add(&se->group_node, &rq->cfs_tasks);
2770 }
2771#endif
2772 cfs_rq->nr_running++;
2773}
2774
2775static void
2776account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2777{
2778 update_load_sub(&cfs_rq->load, se->load.weight);
2779#ifdef CONFIG_SMP
2780 if (entity_is_task(se)) {
2781 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2782 list_del_init(&se->group_node);
2783 }
2784#endif
2785 cfs_rq->nr_running--;
2786}
2787
/*
 * Signed add, clamped at zero on underflow.  The explicit
 * READ_ONCE()/WRITE_ONCE() pair ensures the variable is loaded and
 * stored exactly once.
 */
2795#define add_positive(_ptr, _val) do { \
2796 typeof(_ptr) ptr = (_ptr); \
2797 typeof(_val) val = (_val); \
2798 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2799 \
2800 res = var + val; \
2801 \
2802 if (val < 0 && res > var) \
2803 res = 0; \
2804 \
2805 WRITE_ONCE(*ptr, res); \
2806} while (0)
2807
/*
 * Unsigned subtract, clamped at zero on underflow, again with an
 * explicit load/store so the update happens exactly once.
 */
2815#define sub_positive(_ptr, _val) do { \
2816 typeof(_ptr) ptr = (_ptr); \
2817 typeof(*ptr) val = (_val); \
2818 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2819 res = var - val; \
2820 if (res > var) \
2821 res = 0; \
2822 WRITE_ONCE(*ptr, res); \
2823} while (0)
2824
2825
/*
 * Subtract-and-clamp for a local variable: like sub_positive(), but
 * without the explicit load/store, so only safe on values that cannot
 * be updated concurrently.
 */
2831#define lsub_positive(_ptr, _val) do { \
2832 typeof(_ptr) ptr = (_ptr); \
2833 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
2834} while (0)
2835
2836#ifdef CONFIG_SMP
2837static inline void
2838enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2839{
2840 cfs_rq->runnable_weight += se->runnable_weight;
2841
2842 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2843 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2844}
2845
2846static inline void
2847dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2848{
2849 cfs_rq->runnable_weight -= se->runnable_weight;
2850
2851 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2852 sub_positive(&cfs_rq->avg.runnable_load_sum,
2853 se_runnable(se) * se->avg.runnable_load_sum);
2854}
2855
2856static inline void
2857enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2858{
2859 cfs_rq->avg.load_avg += se->avg.load_avg;
2860 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2861}
2862
2863static inline void
2864dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2865{
2866 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2867 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2868}
2869#else
2870static inline void
2871enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2872static inline void
2873dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2874static inline void
2875enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2876static inline void
2877dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2878#endif
2879
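/*
 * Change an entity's weight and runnable weight while keeping the
 * cfs_rq accounting and PELT averages consistent: dequeue the old
 * contribution, rescale the entity's averages, re-enqueue.
 */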
2880static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2881 unsigned long weight, unsigned long runnable)
2882{
2883 if (se->on_rq) {
2884
2885 if (cfs_rq->curr == se)
2886 update_curr(cfs_rq);
2887 account_entity_dequeue(cfs_rq, se);
2888 dequeue_runnable_load_avg(cfs_rq, se);
2889 }
2890 dequeue_load_avg(cfs_rq, se);
2891
2892 se->runnable_weight = runnable;
2893 update_load_set(&se->load, weight);
2894
2895#ifdef CONFIG_SMP
2896 do {
2897 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2898
2899 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2900 se->avg.runnable_load_avg =
2901 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2902 } while (0);
2903#endif
2904
2905 enqueue_load_avg(cfs_rq, se);
2906 if (se->on_rq) {
2907 account_entity_enqueue(cfs_rq, se);
2908 enqueue_runnable_load_avg(cfs_rq, se);
2909 }
2910}
2911
2912void reweight_task(struct task_struct *p, int prio)
2913{
2914 struct sched_entity *se = &p->se;
2915 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2916 struct load_weight *load = &se->load;
2917 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2918
2919 reweight_entity(cfs_rq, se, weight, weight);
2920 load->inv_weight = sched_prio_to_wmult[prio];
2921}
2922
2923#ifdef CONFIG_FAIR_GROUP_SCHED
2924#ifdef CONFIG_SMP
2925
/*
 * calc_group_shares() approximates the weight a group entity should
 * carry on its parent runqueue: the task group's shares scaled by this
 * cfs_rq's share of the group-wide load.  Ideally that would be
 *
 *   ge->load.weight = tg->shares * grq->load.weight /
 *                     \Sum_cpu grq->load.weight
 *
 * but the global sum is too expensive to track exactly, so tg->load_avg
 * is used instead, with this CPU's contribution replaced by the
 * instantaneous max(grq->load.weight, grq->avg.load_avg) so that newly
 * enqueued, not yet decayed-in, load shows up immediately.  The result
 * is clamped to [MIN_SHARES, tg->shares].
 */
2998static long calc_group_shares(struct cfs_rq *cfs_rq)
2999{
3000 long tg_weight, tg_shares, load, shares;
3001 struct task_group *tg = cfs_rq->tg;
3002
3003 tg_shares = READ_ONCE(tg->shares);
3004
3005 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3006
3007 tg_weight = atomic_long_read(&tg->load_avg);
3008
3009
3010 tg_weight -= cfs_rq->tg_load_avg_contrib;
3011 tg_weight += load;
3012
3013 shares = (tg_shares * load);
3014 if (tg_weight)
3015 shares /= tg_weight;
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3030}
3031
/*
 * calc_group_runnable() scales @shares by the fraction of this cfs_rq's
 * load that is currently runnable:
 *
 *   runnable = shares * max(grq->avg.runnable_load_avg, grq->runnable_weight)
 *                     / max(grq->avg.load_avg, grq->load.weight)
 *
 * clamped to [MIN_SHARES, shares].
 */
3059static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3060{
3061 long runnable, load_avg;
3062
3063 load_avg = max(cfs_rq->avg.load_avg,
3064 scale_load_down(cfs_rq->load.weight));
3065
3066 runnable = max(cfs_rq->avg.runnable_load_avg,
3067 scale_load_down(cfs_rq->runnable_weight));
3068
3069 runnable *= shares;
3070 if (load_avg)
3071 runnable /= load_avg;
3072
3073 return clamp_t(long, runnable, MIN_SHARES, shares);
3074}
3075#endif
3076
3077static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3078
/*
 * Recompute a group entity's weight and runnable weight from the
 * current state of its group runqueue.
 */
3083static void update_cfs_group(struct sched_entity *se)
3084{
3085 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3086 long shares, runnable;
3087
3088 if (!gcfs_rq)
3089 return;
3090
3091 if (throttled_hierarchy(gcfs_rq))
3092 return;
3093
3094#ifndef CONFIG_SMP
3095 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3096
3097 if (likely(se->load.weight == shares))
3098 return;
3099#else
3100 shares = calc_group_shares(gcfs_rq);
3101 runnable = calc_group_runnable(gcfs_rq, shares);
3102#endif
3103
3104 reweight_entity(cfs_rq_of(se), se, shares, runnable);
3105}
3106
3107#else
3108static inline void update_cfs_group(struct sched_entity *se)
3109{
3110}
3111#endif
3112
3113static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3114{
3115 struct rq *rq = rq_of(cfs_rq);
3116
3117 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
 /*
  * Only the root cfs_rq's utilization drives cpufreq, so notify the
  * governor when the root cfs_rq changes, or when a task has just
  * migrated in (SCHED_CPUFREQ_MIGRATION) and the frequency may need
  * re-evaluation.
  */
3132 cpufreq_update_util(rq, flags);
3133 }
3134}
3135
3136#ifdef CONFIG_SMP
3137#ifdef CONFIG_FAIR_GROUP_SCHED

/*
 * update_tg_load_avg - fold this cfs_rq's load_avg change into the task
 * group's aggregate, which approximates
 *
 *   tg->load_avg = \Sum_cpu tg->cfs_rq[cpu]->avg.load_avg
 *
 * To limit contention on the shared atomic, the update is skipped
 * unless forced or the accumulated delta exceeds ~1/64th of the last
 * recorded contribution.
 */
3153static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3154{
3155 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3156
3157
3158
3159
3160 if (cfs_rq->tg == &root_task_group)
3161 return;
3162
3163 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3164 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3165 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3166 }
3167}
3168
/*
 * Called when a task moves between cfs_rqs (see set_task_rq()): age the
 * entity's blocked load up to the old cfs_rq's clock, then stamp it with
 * the new cfs_rq's last_update_time so the PELT clocks stay consistent.
 */
3174void set_task_rq_fair(struct sched_entity *se,
3175 struct cfs_rq *prev, struct cfs_rq *next)
3176{
3177 u64 p_last_update_time;
3178 u64 n_last_update_time;
3179
3180 if (!sched_feat(ATTACH_AGE_LOAD))
3181 return;
3182
3183
3184
3185
3186
3187
3188
3189
3190 if (!(se->avg.last_update_time && prev))
3191 return;
3192
3193#ifndef CONFIG_64BIT
3194 {
3195 u64 p_last_update_time_copy;
3196 u64 n_last_update_time_copy;
3197
3198 do {
3199 p_last_update_time_copy = prev->load_last_update_time_copy;
3200 n_last_update_time_copy = next->load_last_update_time_copy;
3201
3202 smp_rmb();
3203
3204 p_last_update_time = prev->avg.last_update_time;
3205 n_last_update_time = next->avg.last_update_time;
3206
3207 } while (p_last_update_time != p_last_update_time_copy ||
3208 n_last_update_time != n_last_update_time_copy);
3209 }
3210#else
3211 p_last_update_time = prev->avg.last_update_time;
3212 n_last_update_time = next->avg.last_update_time;
3213#endif
3214 __update_load_avg_blocked_se(p_last_update_time, se);
3215 se->avg.last_update_time = n_last_update_time;
3216}
3217
/*
 * When a group entity's own runqueue (gcfs_rq) changes, the change has
 * to be propagated into the cfs_rq the entity is queued on.
 *
 * Utilization propagates directly: a group entity is running exactly
 * when something on its group runqueue runs, so ge->avg.util_avg can
 * simply track gcfs_rq->avg.util_avg.
 *
 * Load is harder: the group runqueue's load is weighted by task weights
 * while the group entity's load is weighted by the group's shares, and
 * an exact conversion would require knowing every runnable task's
 * contribution.  update_tg_cfs_runnable() therefore propagates the
 * runnable_sum delta and derives approximate *_avg values from it,
 * clamping where the approximation could overshoot.
 */
3287static inline void
3288update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3289{
3290 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3291
3292
3293 if (!delta)
3294 return;
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305 se->avg.util_avg = gcfs_rq->avg.util_avg;
3306 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3307
3308
3309 add_positive(&cfs_rq->avg.util_avg, delta);
3310 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3311}
3312
3313static inline void
3314update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3315{
3316 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3317 unsigned long runnable_load_avg, load_avg;
3318 u64 runnable_load_sum, load_sum = 0;
3319 s64 delta_sum;
3320
3321 if (!runnable_sum)
3322 return;
3323
3324 gcfs_rq->prop_runnable_sum = 0;
3325
3326 if (runnable_sum >= 0) {
3327
3328
3329
3330
3331 runnable_sum += se->avg.load_sum;
3332 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3333 } else {
3334
3335
3336
3337
3338 if (scale_load_down(gcfs_rq->load.weight)) {
3339 load_sum = div_s64(gcfs_rq->avg.load_sum,
3340 scale_load_down(gcfs_rq->load.weight));
3341 }
3342
3343
3344 runnable_sum = min(se->avg.load_sum, load_sum);
3345 }
3346
3347
3348
3349
3350
3351
3352
3353 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3354 runnable_sum = max(runnable_sum, running_sum);
3355
3356 load_sum = (s64)se_weight(se) * runnable_sum;
3357 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3358
3359 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3360 delta_avg = load_avg - se->avg.load_avg;
3361
3362 se->avg.load_sum = runnable_sum;
3363 se->avg.load_avg = load_avg;
3364 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3365 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3366
3367 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3368 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3369 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3370 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3371
3372 se->avg.runnable_load_sum = runnable_sum;
3373 se->avg.runnable_load_avg = runnable_load_avg;
3374
3375 if (se->on_rq) {
3376 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3377 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3378 }
3379}
3380
3381static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3382{
3383 cfs_rq->propagate = 1;
3384 cfs_rq->prop_runnable_sum += runnable_sum;
3385}
3386
3387
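/*
 * Propagate pending load/util changes from a group entity's own
 * runqueue (gcfs_rq) up into the cfs_rq the entity is queued on.
 */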
3388static inline int propagate_entity_load_avg(struct sched_entity *se)
3389{
3390 struct cfs_rq *cfs_rq, *gcfs_rq;
3391
3392 if (entity_is_task(se))
3393 return 0;
3394
3395 gcfs_rq = group_cfs_rq(se);
3396 if (!gcfs_rq->propagate)
3397 return 0;
3398
3399 gcfs_rq->propagate = 0;
3400
3401 cfs_rq = cfs_rq_of(se);
3402
3403 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3404
3405 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3406 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3407
3408 trace_pelt_cfs_tp(cfs_rq);
3409 trace_pelt_se_tp(se);
3410
3411 return 1;
3412}
3413
3414
3415
3416
3417
3418static inline bool skip_blocked_update(struct sched_entity *se)
3419{
3420 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3421
3422
3423
3424
3425
3426 if (se->avg.load_avg || se->avg.util_avg)
3427 return false;
3428
3429
3430
3431
3432
3433 if (gcfs_rq->propagate)
3434 return false;
3435
3436
3437
3438
3439
3440
3441 return true;
3442}
3443
3444#else
3445
3446static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3447
3448static inline int propagate_entity_load_avg(struct sched_entity *se)
3449{
3450 return 0;
3451}
3452
3453static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3454
3455#endif
3456
/**
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_pelt()
 * @cfs_rq: cfs_rq to update
 *
 * Fold in any load/util removed by departed entities (cfs_rq->removed)
 * and decay the running averages.  Returns true if anything decayed or
 * was removed, i.e. if the caller may need to update cpufreq or task
 * group state.
 */
3473static inline int
3474update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3475{
3476 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3477 struct sched_avg *sa = &cfs_rq->avg;
3478 int decayed = 0;
3479
3480 if (cfs_rq->removed.nr) {
3481 unsigned long r;
3482 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3483
3484 raw_spin_lock(&cfs_rq->removed.lock);
3485 swap(cfs_rq->removed.util_avg, removed_util);
3486 swap(cfs_rq->removed.load_avg, removed_load);
3487 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3488 cfs_rq->removed.nr = 0;
3489 raw_spin_unlock(&cfs_rq->removed.lock);
3490
3491 r = removed_load;
3492 sub_positive(&sa->load_avg, r);
3493 sub_positive(&sa->load_sum, r * divider);
3494
3495 r = removed_util;
3496 sub_positive(&sa->util_avg, r);
3497 sub_positive(&sa->util_sum, r * divider);
3498
3499 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3500
3501 decayed = 1;
3502 }
3503
3504 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3505
3506#ifndef CONFIG_64BIT
3507 smp_wmb();
3508 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3509#endif
3510
3511 return decayed;
3512}
3513
/**
 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 * @cfs_rq: cfs_rq to attach to
 * @se: sched_entity to attach
 * @flags: cpufreq hints (e.g. SCHED_CPUFREQ_MIGRATION)
 *
 * The caller must have updated the cfs_rq averages first, since the
 * entity is synced to cfs_rq->avg.last_update_time below.
 */
3523static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3524{
3525 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3526
3527
3528
3529
3530
3531
3532
3533
3534 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3535 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3536
3537
3538
3539
3540
3541
3542
3543 se->avg.util_sum = se->avg.util_avg * divider;
3544
3545 se->avg.load_sum = divider;
3546 if (se_weight(se)) {
3547 se->avg.load_sum =
3548 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3549 }
3550
3551 se->avg.runnable_load_sum = se->avg.load_sum;
3552
3553 enqueue_load_avg(cfs_rq, se);
3554 cfs_rq->avg.util_avg += se->avg.util_avg;
3555 cfs_rq->avg.util_sum += se->avg.util_sum;
3556
3557 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3558
3559 cfs_rq_util_change(cfs_rq, flags);
3560
3561 trace_pelt_cfs_tp(cfs_rq);
3562}
3563
/**
 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 * @cfs_rq: cfs_rq to detach from
 * @se: sched_entity to detach
 *
 * As with attach, the cfs_rq averages must be up to date before the
 * entity's contribution is subtracted.
 */
3572static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3573{
3574 dequeue_load_avg(cfs_rq, se);
3575 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3576 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3577
3578 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3579
3580 cfs_rq_util_change(cfs_rq, 0);
3581
3582 trace_pelt_cfs_tp(cfs_rq);
3583}
3584
/*
 * Optional actions to be taken while updating a load average:
 */
3588#define UPDATE_TG 0x1
3589#define SKIP_AGE_LOAD 0x2
3590#define DO_ATTACH 0x4
3591
3592
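/*
 * Update an entity's and its cfs_rq's load averages; on the first
 * update after a migration (no last_update_time) attach the entity's
 * load and, when requested, update the task group aggregate.
 */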
3593static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3594{
3595 u64 now = cfs_rq_clock_pelt(cfs_rq);
3596 int decayed;
3597
3598
3599
3600
3601
3602 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3603 __update_load_avg_se(now, cfs_rq, se);
3604
3605 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3606 decayed |= propagate_entity_load_avg(se);
3607
3608 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3609
3610
3611
3612
3613
3614
3615
3616
3617 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3618 update_tg_load_avg(cfs_rq, 0);
3619
3620 } else if (decayed) {
3621 cfs_rq_util_change(cfs_rq, 0);
3622
3623 if (flags & UPDATE_TG)
3624 update_tg_load_avg(cfs_rq, 0);
3625 }
3626}
3627
3628#ifndef CONFIG_64BIT
3629static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3630{
3631 u64 last_update_time_copy;
3632 u64 last_update_time;
3633
3634 do {
3635 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3636 smp_rmb();
3637 last_update_time = cfs_rq->avg.last_update_time;
3638 } while (last_update_time != last_update_time_copy);
3639
3640 return last_update_time;
3641}
3642#else
3643static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3644{
3645 return cfs_rq->avg.last_update_time;
3646}
3647#endif
3648
3649
3650
3651
3652
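/*
 * Synchronize an entity's blocked load with its cfs_rq without taking
 * the runqueue lock, by decaying it up to the cfs_rq's last update time.
 */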
3653static void sync_entity_load_avg(struct sched_entity *se)
3654{
3655 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3656 u64 last_update_time;
3657
3658 last_update_time = cfs_rq_last_update_time(cfs_rq);
3659 __update_load_avg_blocked_se(last_update_time, se);
3660}
3661
3662
3663
3664
3665
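/*
 * The entity is leaving this cfs_rq for good (e.g. migrating away):
 * record its load and utilization in cfs_rq->removed so that a later
 * update_cfs_rq_load_avg() on this CPU can subtract them.
 */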
3666static void remove_entity_load_avg(struct sched_entity *se)
3667{
3668 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3669 unsigned long flags;
3670
3671
3672
3673
3674
3675
3676
3677 sync_entity_load_avg(se);
3678
3679 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3680 ++cfs_rq->removed.nr;
3681 cfs_rq->removed.util_avg += se->avg.util_avg;
3682 cfs_rq->removed.load_avg += se->avg.load_avg;
3683 cfs_rq->removed.runnable_sum += se->avg.load_sum;
3684 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3685}
3686
3687static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3688{
3689 return cfs_rq->avg.runnable_load_avg;
3690}
3691
3692static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3693{
3694 return cfs_rq->avg.load_avg;
3695}
3696
3697static inline unsigned long task_util(struct task_struct *p)
3698{
3699 return READ_ONCE(p->se.avg.util_avg);
3700}
3701
3702static inline unsigned long _task_util_est(struct task_struct *p)
3703{
3704 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3705
3706 return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
3707}
3708
3709static inline unsigned long task_util_est(struct task_struct *p)
3710{
3711 return max(task_util(p), _task_util_est(p));
3712}
3713
3714static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3715 struct task_struct *p)
3716{
3717 unsigned int enqueued;
3718
3719 if (!sched_feat(UTIL_EST))
3720 return;
3721
3722
3723 enqueued = cfs_rq->avg.util_est.enqueued;
3724 enqueued += _task_util_est(p);
3725 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3726}
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736static inline bool within_margin(int value, int margin)
3737{
3738 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3739}
3740
3741static void
3742util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3743{
3744 long last_ewma_diff;
3745 struct util_est ue;
3746 int cpu;
3747
3748 if (!sched_feat(UTIL_EST))
3749 return;
3750
3751
3752 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3753 ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
3754 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3755
3756
3757
3758
3759
3760 if (!task_sleep)
3761 return;
3762
3763
3764
3765
3766
3767 ue = p->se.avg.util_est;
3768 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3769 return;
3770
3771
3772
3773
3774
3775 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3776 if (sched_feat(UTIL_EST_FASTUP)) {
3777 if (ue.ewma < ue.enqueued) {
3778 ue.ewma = ue.enqueued;
3779 goto done;
3780 }
3781 }
3782
3783
3784
3785
3786
3787 last_ewma_diff = ue.enqueued - ue.ewma;
3788 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3789 return;
3790
3791
3792
3793
3794
3795 cpu = cpu_of(rq_of(cfs_rq));
3796 if (task_util(p) > capacity_orig_of(cpu))
3797 return;
3798
 /*
  * Fold the new sample into the task's EWMA of utilization:
  *
  *   ewma(t) = w * last_ewma_diff + ewma(t-1)
  *           = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
  *
  * with w = 1/2^UTIL_EST_WEIGHT_SHIFT, implemented with the shifts
  * below.
  */
3816 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3817 ue.ewma += last_ewma_diff;
3818 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3819done:
3820 WRITE_ONCE(p->se.avg.util_est, ue);
3821}
3822
3823static inline int task_fits_capacity(struct task_struct *p, long capacity)
3824{
3825 return fits_capacity(task_util_est(p), capacity);
3826}
3827
3828static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
3829{
3830 if (!static_branch_unlikely(&sched_asym_cpucapacity))
3831 return;
3832
3833 if (!p) {
3834 rq->misfit_task_load = 0;
3835 return;
3836 }
3837
3838 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
3839 rq->misfit_task_load = 0;
3840 return;
3841 }
3842
3843 rq->misfit_task_load = task_h_load(p);
3844}
3845
3846#else
3847
3848#define UPDATE_TG 0x0
3849#define SKIP_AGE_LOAD 0x0
3850#define DO_ATTACH 0x0
3851
3852static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3853{
3854 cfs_rq_util_change(cfs_rq, 0);
3855}
3856
3857static inline void remove_entity_load_avg(struct sched_entity *se) {}
3858
3859static inline void
3860attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3861static inline void
3862detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3863
3864static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3865{
3866 return 0;
3867}
3868
3869static inline void
3870util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3871
3872static inline void
3873util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3874 bool task_sleep) {}
3875static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
3876
3877#endif
3878
3879static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3880{
3881#ifdef CONFIG_SCHED_DEBUG
3882 s64 d = se->vruntime - cfs_rq->min_vruntime;
3883
3884 if (d < 0)
3885 d = -d;
3886
3887 if (d > 3*sysctl_sched_latency)
3888 schedstat_inc(cfs_rq->nr_spread_over);
3889#endif
3890}
3891
3892static void
3893place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3894{
3895 u64 vruntime = cfs_rq->min_vruntime;
3896
3897
3898
3899
3900
3901
3902
3903 if (initial && sched_feat(START_DEBIT))
3904 vruntime += sched_vslice(cfs_rq, se);
3905
3906
3907 if (!initial) {
3908 unsigned long thresh = sysctl_sched_latency;
3909
3910
3911
3912
3913
3914 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3915 thresh >>= 1;
3916
3917 vruntime -= thresh;
3918 }
3919
3920
3921 se->vruntime = max_vruntime(se->vruntime, vruntime);
3922}
3923
3924static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3925
3926static inline void check_schedstat_required(void)
3927{
3928#ifdef CONFIG_SCHEDSTATS
3929 if (schedstat_enabled())
3930 return;
3931
3932
3933 if (trace_sched_stat_wait_enabled() ||
3934 trace_sched_stat_sleep_enabled() ||
3935 trace_sched_stat_iowait_enabled() ||
3936 trace_sched_stat_blocked_enabled() ||
3937 trace_sched_stat_runtime_enabled()) {
3938 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3939 "stat_blocked and stat_runtime require the "
3940 "kernel parameter schedstats=enable or "
3941 "kernel.sched_schedstats=1\n");
3942 }
3943#endif
3944}
3945
/*
 * enqueue_entity() places a woken, migrated or newly moved entity back
 * on its cfs_rq.  The subtle part is vruntime handling: entities
 * dequeued for sleep keep their absolute vruntime and are renormalized
 * against min_vruntime via place_entity(); entities dequeued for other
 * reasons (e.g. migration) had min_vruntime subtracted on dequeue and
 * get it added back here (the 'renorm' case), before or after
 * update_curr() depending on whether they are cfs_rq->curr.
 */
3977static void
3978enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3979{
3980 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3981 bool curr = cfs_rq->curr == se;
3982
3983
3984
3985
3986
3987 if (renorm && curr)
3988 se->vruntime += cfs_rq->min_vruntime;
3989
3990 update_curr(cfs_rq);
3991
3992
3993
3994
3995
3996
3997
3998 if (renorm && !curr)
3999 se->vruntime += cfs_rq->min_vruntime;
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4010 update_cfs_group(se);
4011 enqueue_runnable_load_avg(cfs_rq, se);
4012 account_entity_enqueue(cfs_rq, se);
4013
4014 if (flags & ENQUEUE_WAKEUP)
4015 place_entity(cfs_rq, se, 0);
4016
4017 check_schedstat_required();
4018 update_stats_enqueue(cfs_rq, se, flags);
4019 check_spread(cfs_rq, se);
4020 if (!curr)
4021 __enqueue_entity(cfs_rq, se);
4022 se->on_rq = 1;
4023
4024 if (cfs_rq->nr_running == 1) {
4025 list_add_leaf_cfs_rq(cfs_rq);
4026 check_enqueue_throttle(cfs_rq);
4027 }
4028}
4029
4030static void __clear_buddies_last(struct sched_entity *se)
4031{
4032 for_each_sched_entity(se) {
4033 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4034 if (cfs_rq->last != se)
4035 break;
4036
4037 cfs_rq->last = NULL;
4038 }
4039}
4040
4041static void __clear_buddies_next(struct sched_entity *se)
4042{
4043 for_each_sched_entity(se) {
4044 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4045 if (cfs_rq->next != se)
4046 break;
4047
4048 cfs_rq->next = NULL;
4049 }
4050}
4051
4052static void __clear_buddies_skip(struct sched_entity *se)
4053{
4054 for_each_sched_entity(se) {
4055 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4056 if (cfs_rq->skip != se)
4057 break;
4058
4059 cfs_rq->skip = NULL;
4060 }
4061}
4062
4063static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4064{
4065 if (cfs_rq->last == se)
4066 __clear_buddies_last(se);
4067
4068 if (cfs_rq->next == se)
4069 __clear_buddies_next(se);
4070
4071 if (cfs_rq->skip == se)
4072 __clear_buddies_skip(se);
4073}
4074
4075static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4076
4077static void
4078dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4079{
4080
4081
4082
4083 update_curr(cfs_rq);
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093 update_load_avg(cfs_rq, se, UPDATE_TG);
4094 dequeue_runnable_load_avg(cfs_rq, se);
4095
4096 update_stats_dequeue(cfs_rq, se, flags);
4097
4098 clear_buddies(cfs_rq, se);
4099
4100 if (se != cfs_rq->curr)
4101 __dequeue_entity(cfs_rq, se);
4102 se->on_rq = 0;
4103 account_entity_dequeue(cfs_rq, se);
4104
4105
4106
4107
4108
4109
4110
4111 if (!(flags & DEQUEUE_SLEEP))
4112 se->vruntime -= cfs_rq->min_vruntime;
4113
4114
4115 return_cfs_rq_runtime(cfs_rq);
4116
4117 update_cfs_group(se);
4118
4119
4120
4121
4122
4123
4124
4125 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4126 update_min_vruntime(cfs_rq);
4127}
4128
4129
4130
4131
4132static void
4133check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4134{
4135 unsigned long ideal_runtime, delta_exec;
4136 struct sched_entity *se;
4137 s64 delta;
4138
4139 ideal_runtime = sched_slice(cfs_rq, curr);
4140 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4141 if (delta_exec > ideal_runtime) {
4142 resched_curr(rq_of(cfs_rq));
4143
4144
4145
4146
4147 clear_buddies(cfs_rq, curr);
4148 return;
4149 }
4150
4151
4152
4153
4154
4155
4156 if (delta_exec < sysctl_sched_min_granularity)
4157 return;
4158
4159 se = __pick_first_entity(cfs_rq);
4160 delta = curr->vruntime - se->vruntime;
4161
4162 if (delta < 0)
4163 return;
4164
4165 if (delta > ideal_runtime)
4166 resched_curr(rq_of(cfs_rq));
4167}
4168
4169static void
4170set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4171{
4172
4173 if (se->on_rq) {
4174
4175
4176
4177
4178
4179 update_stats_wait_end(cfs_rq, se);
4180 __dequeue_entity(cfs_rq, se);
4181 update_load_avg(cfs_rq, se, UPDATE_TG);
4182 }
4183
4184 update_stats_curr_start(cfs_rq, se);
4185 cfs_rq->curr = se;
4186
4187
4188
4189
4190
4191
4192 if (schedstat_enabled() &&
4193 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4194 schedstat_set(se->statistics.slice_max,
4195 max((u64)schedstat_val(se->statistics.slice_max),
4196 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4197 }
4198
4199 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4200}
4201
4202static int
4203wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4204
/*
 * Pick the next entity to run.  Start from the leftmost (smallest
 * vruntime) entity, or curr if it is still ahead; avoid the "skip"
 * buddy if a close-enough alternative exists; then prefer the "last"
 * and, with higher priority, the "next" buddy, but only while
 * wakeup_preempt_entity() says the unfairness stays acceptable.
 */
4212static struct sched_entity *
4213pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4214{
4215 struct sched_entity *left = __pick_first_entity(cfs_rq);
4216 struct sched_entity *se;
4217
4218
4219
4220
4221
4222 if (!left || (curr && entity_before(curr, left)))
4223 left = curr;
4224
4225 se = left;
4226
4227
4228
4229
4230
4231 if (cfs_rq->skip == se) {
4232 struct sched_entity *second;
4233
4234 if (se == curr) {
4235 second = __pick_first_entity(cfs_rq);
4236 } else {
4237 second = __pick_next_entity(se);
4238 if (!second || (curr && entity_before(curr, second)))
4239 second = curr;
4240 }
4241
4242 if (second && wakeup_preempt_entity(second, left) < 1)
4243 se = second;
4244 }
4245
4246
4247
4248
4249 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4250 se = cfs_rq->last;
4251
4252
4253
4254
4255 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4256 se = cfs_rq->next;
4257
4258 clear_buddies(cfs_rq, se);
4259
4260 return se;
4261}
4262
4263static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4264
4265static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4266{
4267
4268
4269
4270
4271 if (prev->on_rq)
4272 update_curr(cfs_rq);
4273
4274
4275 check_cfs_rq_runtime(cfs_rq);
4276
4277 check_spread(cfs_rq, prev);
4278
4279 if (prev->on_rq) {
4280 update_stats_wait_start(cfs_rq, prev);
4281
4282 __enqueue_entity(cfs_rq, prev);
4283
4284 update_load_avg(cfs_rq, prev, 0);
4285 }
4286 cfs_rq->curr = NULL;
4287}
4288
4289static void
4290entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4291{
4292
4293
4294
4295 update_curr(cfs_rq);
4296
4297
4298
4299
4300 update_load_avg(cfs_rq, curr, UPDATE_TG);
4301 update_cfs_group(curr);
4302
4303#ifdef CONFIG_SCHED_HRTICK
4304
4305
4306
4307
4308 if (queued) {
4309 resched_curr(rq_of(cfs_rq));
4310 return;
4311 }
4312
4313
4314
4315 if (!sched_feat(DOUBLE_TICK) &&
4316 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4317 return;
4318#endif
4319
4320 if (cfs_rq->nr_running > 1)
4321 check_preempt_tick(cfs_rq, curr);
4322}
4323
/*
 * CFS bandwidth control machinery.
 */
4329#ifdef CONFIG_CFS_BANDWIDTH
4330
4331#ifdef CONFIG_JUMP_LABEL
4332static struct static_key __cfs_bandwidth_used;
4333
4334static inline bool cfs_bandwidth_used(void)
4335{
4336 return static_key_false(&__cfs_bandwidth_used);
4337}
4338
4339void cfs_bandwidth_usage_inc(void)
4340{
4341 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4342}
4343
4344void cfs_bandwidth_usage_dec(void)
4345{
4346 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4347}
4348#else
4349static bool cfs_bandwidth_used(void)
4350{
4351 return true;
4352}
4353
4354void cfs_bandwidth_usage_inc(void) {}
4355void cfs_bandwidth_usage_dec(void) {}
4356#endif
4357
4358
4359
4360
4361
4362static inline u64 default_cfs_period(void)
4363{
4364 return 100000000ULL;
4365}
4366
4367static inline u64 sched_cfs_bandwidth_slice(void)
4368{
4369 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4370}
4371
4372
4373
4374
4375
4376
4377
4378
4379void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4380{
4381 if (cfs_b->quota != RUNTIME_INF)
4382 cfs_b->runtime = cfs_b->quota;
4383}
4384
4385static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4386{
4387 return &tg->cfs_bandwidth;
4388}
4389
4390
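/*
 * Try to top up cfs_rq->runtime_remaining with up to one bandwidth
 * slice from the group's global pool; returns whether any local
 * runtime remains afterwards.
 */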
4391static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4392{
4393 struct task_group *tg = cfs_rq->tg;
4394 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4395 u64 amount = 0, min_amount;
4396
4397
4398 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4399
4400 raw_spin_lock(&cfs_b->lock);
4401 if (cfs_b->quota == RUNTIME_INF)
4402 amount = min_amount;
4403 else {
4404 start_cfs_bandwidth(cfs_b);
4405
4406 if (cfs_b->runtime > 0) {
4407 amount = min(cfs_b->runtime, min_amount);
4408 cfs_b->runtime -= amount;
4409 cfs_b->idle = 0;
4410 }
4411 }
4412 raw_spin_unlock(&cfs_b->lock);
4413
4414 cfs_rq->runtime_remaining += amount;
4415
4416 return cfs_rq->runtime_remaining > 0;
4417}
4418
4419static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4420{
4421
4422 cfs_rq->runtime_remaining -= delta_exec;
4423
4424 if (likely(cfs_rq->runtime_remaining > 0))
4425 return;
4426
4427 if (cfs_rq->throttled)
4428 return;
4429
4430
4431
4432
4433 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4434 resched_curr(rq_of(cfs_rq));
4435}
4436
4437static __always_inline
4438void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4439{
4440 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4441 return;
4442
4443 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4444}
4445
4446static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4447{
4448 return cfs_bandwidth_used() && cfs_rq->throttled;
4449}
4450
4451
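/* Is this cfs_rq, or any of its ancestors, throttled? */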
4452static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4453{
4454 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4455}
4456
4457
4458
4459
4460
4461
4462static inline int throttled_lb_pair(struct task_group *tg,
4463 int src_cpu, int dest_cpu)
4464{
4465 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4466
4467 src_cfs_rq = tg->cfs_rq[src_cpu];
4468 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4469
4470 return throttled_hierarchy(src_cfs_rq) ||
4471 throttled_hierarchy(dest_cfs_rq);
4472}
4473
4474static int tg_unthrottle_up(struct task_group *tg, void *data)
4475{
4476 struct rq *rq = data;
4477 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4478
4479 cfs_rq->throttle_count--;
4480 if (!cfs_rq->throttle_count) {
4481 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4482 cfs_rq->throttled_clock_task;
4483
4484
4485 if (cfs_rq->nr_running >= 1)
4486 list_add_leaf_cfs_rq(cfs_rq);
4487 }
4488
4489 return 0;
4490}
4491
4492static int tg_throttle_down(struct task_group *tg, void *data)
4493{
4494 struct rq *rq = data;
4495 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4496
4497
4498 if (!cfs_rq->throttle_count) {
4499 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4500 list_del_leaf_cfs_rq(cfs_rq);
4501 }
4502 cfs_rq->throttle_count++;
4503
4504 return 0;
4505}
4506
4507static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4508{
4509 struct rq *rq = rq_of(cfs_rq);
4510 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4511 struct sched_entity *se;
4512 long task_delta, idle_task_delta, dequeue = 1;
4513 bool empty;
4514
4515 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4516
4517
4518 rcu_read_lock();
4519 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4520 rcu_read_unlock();
4521
4522 task_delta = cfs_rq->h_nr_running;
4523 idle_task_delta = cfs_rq->idle_h_nr_running;
4524 for_each_sched_entity(se) {
4525 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4526
4527 if (!se->on_rq)
4528 break;
4529
4530 if (dequeue)
4531 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4532 qcfs_rq->h_nr_running -= task_delta;
4533 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4534
4535 if (qcfs_rq->load.weight)
4536 dequeue = 0;
4537 }
4538
4539 if (!se)
4540 sub_nr_running(rq, task_delta);
4541
4542 cfs_rq->throttled = 1;
4543 cfs_rq->throttled_clock = rq_clock(rq);
4544 raw_spin_lock(&cfs_b->lock);
4545 empty = list_empty(&cfs_b->throttled_cfs_rq);
4546
4547
4548
4549
4550
4551
4552 if (cfs_b->distribute_running)
4553 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4554 else
4555 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4556
4557
4558
4559
4560
4561 if (empty)
4562 start_cfs_bandwidth(cfs_b);
4563
4564 raw_spin_unlock(&cfs_b->lock);
4565}
4566
4567void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4568{
4569 struct rq *rq = rq_of(cfs_rq);
4570 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4571 struct sched_entity *se;
4572 int enqueue = 1;
4573 long task_delta, idle_task_delta;
4574
4575 se = cfs_rq->tg->se[cpu_of(rq)];
4576
4577 cfs_rq->throttled = 0;
4578
4579 update_rq_clock(rq);
4580
4581 raw_spin_lock(&cfs_b->lock);
4582 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4583 list_del_rcu(&cfs_rq->throttled_list);
4584 raw_spin_unlock(&cfs_b->lock);
4585
4586
4587 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4588
4589 if (!cfs_rq->load.weight)
4590 return;
4591
4592 task_delta = cfs_rq->h_nr_running;
4593 idle_task_delta = cfs_rq->idle_h_nr_running;
4594 for_each_sched_entity(se) {
4595 if (se->on_rq)
4596 enqueue = 0;
4597
4598 cfs_rq = cfs_rq_of(se);
4599 if (enqueue)
4600 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4601 cfs_rq->h_nr_running += task_delta;
4602 cfs_rq->idle_h_nr_running += idle_task_delta;
4603
4604 if (cfs_rq_throttled(cfs_rq))
4605 break;
4606 }
4607
4608 assert_list_leaf_cfs_rq(rq);
4609
4610 if (!se)
4611 add_nr_running(rq, task_delta);
4612
4613
4614 if (rq->curr == rq->idle && rq->cfs.nr_running)
4615 resched_curr(rq);
4616}
4617
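/*
 * Hand out up to @remaining runtime to throttled cfs_rqs, unthrottling
 * those that end up with a positive balance; returns the amount
 * actually distributed.
 */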
4618static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4619{
4620 struct cfs_rq *cfs_rq;
4621 u64 runtime;
4622 u64 starting_runtime = remaining;
4623
4624 rcu_read_lock();
4625 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4626 throttled_list) {
4627 struct rq *rq = rq_of(cfs_rq);
4628 struct rq_flags rf;
4629
4630 rq_lock_irqsave(rq, &rf);
4631 if (!cfs_rq_throttled(cfs_rq))
4632 goto next;
4633
4634
4635 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4636
4637 runtime = -cfs_rq->runtime_remaining + 1;
4638 if (runtime > remaining)
4639 runtime = remaining;
4640 remaining -= runtime;
4641
4642 cfs_rq->runtime_remaining += runtime;
4643
4644
4645 if (cfs_rq->runtime_remaining > 0)
4646 unthrottle_cfs_rq(cfs_rq);
4647
4648next:
4649 rq_unlock_irqrestore(rq, &rf);
4650
4651 if (!remaining)
4652 break;
4653 }
4654 rcu_read_unlock();
4655
4656 return starting_runtime - remaining;
4657}
4658
/*
 * Period-timer work: refill the group's quota and distribute runtime to
 * throttled cfs_rqs.  Returns 1 if the period timer should be
 * deactivated (quota is infinite, or the group stayed idle for a whole
 * period with nothing throttled).
 */
4665static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4666{
4667 u64 runtime;
4668 int throttled;
4669
4670
4671 if (cfs_b->quota == RUNTIME_INF)
4672 goto out_deactivate;
4673
4674 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4675 cfs_b->nr_periods += overrun;
4676
4677
4678
4679
4680
4681 if (cfs_b->idle && !throttled)
4682 goto out_deactivate;
4683
4684 __refill_cfs_bandwidth_runtime(cfs_b);
4685
4686 if (!throttled) {
4687
4688 cfs_b->idle = 1;
4689 return 0;
4690 }
4691
4692
4693 cfs_b->nr_throttled += overrun;
4694
4695
4696
4697
4698
4699
4700
4701
4702 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4703 runtime = cfs_b->runtime;
4704 cfs_b->distribute_running = 1;
4705 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4706
4707 runtime = distribute_cfs_runtime(cfs_b, runtime);
4708 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4709
4710 cfs_b->distribute_running = 0;
4711 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4712
4713 lsub_positive(&cfs_b->runtime, runtime);
4714 }
4715
4716
4717
4718
4719
4720
4721
4722 cfs_b->idle = 0;
4723
4724 return 0;
4725
4726out_deactivate:
4727 return 1;
4728}
4729
4730
4731static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4732
4733static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4734
4735static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4736
4737
4738
4739
4740
4741
4742
4743
4744static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4745{
4746 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4747 u64 remaining;
4748
4749
4750 if (hrtimer_callback_running(refresh_timer))
4751 return 1;
4752
4753
4754 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4755 if (remaining < min_expire)
4756 return 1;
4757
4758 return 0;
4759}
4760
4761static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4762{
4763 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4764
4765
4766 if (runtime_refresh_within(cfs_b, min_left))
4767 return;
4768
4769
4770 if (cfs_b->slack_started)
4771 return;
4772 cfs_b->slack_started = true;
4773
4774 hrtimer_start(&cfs_b->slack_timer,
4775 ns_to_ktime(cfs_bandwidth_slack_period),
4776 HRTIMER_MODE_REL);
4777}
4778
4779
4780static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4781{
4782 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4783 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4784
4785 if (slack_runtime <= 0)
4786 return;
4787
4788 raw_spin_lock(&cfs_b->lock);
4789 if (cfs_b->quota != RUNTIME_INF) {
4790 cfs_b->runtime += slack_runtime;
4791
4792
4793 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4794 !list_empty(&cfs_b->throttled_cfs_rq))
4795 start_cfs_slack_bandwidth(cfs_b);
4796 }
4797 raw_spin_unlock(&cfs_b->lock);
4798
4799
4800 cfs_rq->runtime_remaining -= slack_runtime;
4801}
4802
4803static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4804{
4805 if (!cfs_bandwidth_used())
4806 return;
4807
4808 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4809 return;
4810
4811 __return_cfs_rq_runtime(cfs_rq);
4812}
4813
4814
4815
4816
4817
4818static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4819{
4820 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4821 unsigned long flags;
4822
4823
4824 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4825 cfs_b->slack_started = false;
4826 if (cfs_b->distribute_running) {
4827 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4828 return;
4829 }
4830
4831 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4832 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4833 return;
4834 }
4835
4836 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4837 runtime = cfs_b->runtime;
4838
4839 if (runtime)
4840 cfs_b->distribute_running = 1;
4841
4842 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4843
4844 if (!runtime)
4845 return;
4846
4847 runtime = distribute_cfs_runtime(cfs_b, runtime);
4848
4849 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4850 lsub_positive(&cfs_b->runtime, runtime);
4851 cfs_b->distribute_running = 0;
4852 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4853}
4854
4855
4856
4857
4858
4859
4860static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4861{
4862 if (!cfs_bandwidth_used())
4863 return;
4864
4865
4866 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4867 return;
4868
4869
4870 if (cfs_rq_throttled(cfs_rq))
4871 return;
4872
4873
4874 account_cfs_rq_runtime(cfs_rq, 0);
4875 if (cfs_rq->runtime_remaining <= 0)
4876 throttle_cfs_rq(cfs_rq);
4877}
4878
4879static void sync_throttle(struct task_group *tg, int cpu)
4880{
4881 struct cfs_rq *pcfs_rq, *cfs_rq;
4882
4883 if (!cfs_bandwidth_used())
4884 return;
4885
4886 if (!tg->parent)
4887 return;
4888
4889 cfs_rq = tg->cfs_rq[cpu];
4890 pcfs_rq = tg->parent->cfs_rq[cpu];
4891
4892 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4893 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4894}
4895
4896
4897static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4898{
4899 if (!cfs_bandwidth_used())
4900 return false;
4901
4902 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4903 return false;
4904
4905
4906
4907
4908
4909 if (cfs_rq_throttled(cfs_rq))
4910 return true;
4911
4912 throttle_cfs_rq(cfs_rq);
4913 return true;
4914}
4915
4916static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4917{
4918 struct cfs_bandwidth *cfs_b =
4919 container_of(timer, struct cfs_bandwidth, slack_timer);
4920
4921 do_sched_cfs_slack_timer(cfs_b);
4922
4923 return HRTIMER_NORESTART;
4924}
4925
4926extern const u64 max_cfs_quota_period;
4927
4928static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4929{
4930 struct cfs_bandwidth *cfs_b =
4931 container_of(timer, struct cfs_bandwidth, period_timer);
4932 unsigned long flags;
4933 int overrun;
4934 int idle = 0;
4935 int count = 0;
4936
4937 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4938 for (;;) {
4939 overrun = hrtimer_forward_now(timer, cfs_b->period);
4940 if (!overrun)
4941 break;
4942
4943 if (++count > 3) {
4944 u64 new, old = ktime_to_ns(cfs_b->period);
4945
4946
4947
4948
4949
4950
4951 new = old * 2;
4952 if (new < max_cfs_quota_period) {
4953 cfs_b->period = ns_to_ktime(new);
4954 cfs_b->quota *= 2;
4955
4956 pr_warn_ratelimited(
4957 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4958 smp_processor_id(),
4959 div_u64(new, NSEC_PER_USEC),
4960 div_u64(cfs_b->quota, NSEC_PER_USEC));
4961 } else {
4962 pr_warn_ratelimited(
4963 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4964 smp_processor_id(),
4965 div_u64(old, NSEC_PER_USEC),
4966 div_u64(cfs_b->quota, NSEC_PER_USEC));
4967 }
4968
4969
4970 count = 0;
4971 }
4972
4973 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
4974 }
4975 if (idle)
4976 cfs_b->period_active = 0;
4977 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4978
4979 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4980}
4981
4982void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4983{
4984 raw_spin_lock_init(&cfs_b->lock);
4985 cfs_b->runtime = 0;
4986 cfs_b->quota = RUNTIME_INF;
4987 cfs_b->period = ns_to_ktime(default_cfs_period());
4988
4989 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4990 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4991 cfs_b->period_timer.function = sched_cfs_period_timer;
4992 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4993 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4994 cfs_b->distribute_running = 0;
4995 cfs_b->slack_started = false;
4996}
4997
4998static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4999{
5000 cfs_rq->runtime_enabled = 0;
5001 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5002}
5003
5004void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5005{
5006 lockdep_assert_held(&cfs_b->lock);
5007
5008 if (cfs_b->period_active)
5009 return;
5010
5011 cfs_b->period_active = 1;
5012 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5013 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5014}
5015
5016static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5017{
5018
5019 if (!cfs_b->throttled_cfs_rq.next)
5020 return;
5021
5022 hrtimer_cancel(&cfs_b->period_timer);
5023 hrtimer_cancel(&cfs_b->slack_timer);
5024}
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034static void __maybe_unused update_runtime_enabled(struct rq *rq)
5035{
5036 struct task_group *tg;
5037
5038 lockdep_assert_held(&rq->lock);
5039
5040 rcu_read_lock();
5041 list_for_each_entry_rcu(tg, &task_groups, list) {
5042 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5043 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5044
5045 raw_spin_lock(&cfs_b->lock);
5046 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5047 raw_spin_unlock(&cfs_b->lock);
5048 }
5049 rcu_read_unlock();
5050}
5051
5052
5053static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5054{
5055 struct task_group *tg;
5056
5057 lockdep_assert_held(&rq->lock);
5058
5059 rcu_read_lock();
5060 list_for_each_entry_rcu(tg, &task_groups, list) {
5061 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5062
5063 if (!cfs_rq->runtime_enabled)
5064 continue;
5065
5066
5067
5068
5069
5070 cfs_rq->runtime_remaining = 1;
5071
5072
5073
5074
5075 cfs_rq->runtime_enabled = 0;
5076
5077 if (cfs_rq_throttled(cfs_rq))
5078 unthrottle_cfs_rq(cfs_rq);
5079 }
5080 rcu_read_unlock();
5081}
5082
5083#else
5084
5085static inline bool cfs_bandwidth_used(void)
5086{
5087 return false;
5088}
5089
5090static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5091static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5092static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5093static inline void sync_throttle(struct task_group *tg, int cpu) {}
5094static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5095
5096static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5097{
5098 return 0;
5099}
5100
5101static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5102{
5103 return 0;
5104}
5105
5106static inline int throttled_lb_pair(struct task_group *tg,
5107 int src_cpu, int dest_cpu)
5108{
5109 return 0;
5110}
5111
5112void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5113
5114#ifdef CONFIG_FAIR_GROUP_SCHED
5115static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5116#endif
5117
5118static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5119{
5120 return NULL;
5121}
5122static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5123static inline void update_runtime_enabled(struct rq *rq) {}
5124static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5125
5126#endif
5127
5128
5129
5130
5131
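/*
 * CFS operations on tasks.
 */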
5132#ifdef CONFIG_SCHED_HRTICK
5133static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5134{
5135 struct sched_entity *se = &p->se;
5136 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5137
5138 SCHED_WARN_ON(task_rq(p) != rq);
5139
5140 if (rq->cfs.h_nr_running > 1) {
5141 u64 slice = sched_slice(cfs_rq, se);
5142 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5143 s64 delta = slice - ran;
5144
5145 if (delta < 0) {
5146 if (rq->curr == p)
5147 resched_curr(rq);
5148 return;
5149 }
5150 hrtick_start(rq, delta);
5151 }
5152}
5153
5154
5155
5156
5157
5158
5159static void hrtick_update(struct rq *rq)
5160{
5161 struct task_struct *curr = rq->curr;
5162
5163 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5164 return;
5165
5166 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5167 hrtick_start_fair(rq, curr);
5168}
5169#else
5170static inline void
5171hrtick_start_fair(struct rq *rq, struct task_struct *p)
5172{
5173}
5174
5175static inline void hrtick_update(struct rq *rq)
5176{
5177}
5178#endif
5179
5180#ifdef CONFIG_SMP
5181static inline unsigned long cpu_util(int cpu);
5182
5183static inline bool cpu_overutilized(int cpu)
5184{
5185 return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5186}
5187
5188static inline void update_overutilized_status(struct rq *rq)
5189{
5190 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5191 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5192 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5193 }
5194}
5195#else
5196static inline void update_overutilized_status(struct rq *rq) { }
5197#endif
5198
5199
5200
5201
5202
5203
5204static void
5205enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5206{
5207 struct cfs_rq *cfs_rq;
5208 struct sched_entity *se = &p->se;
5209 int idle_h_nr_running = task_has_idle_policy(p);
5210
5211
5212
5213
5214
5215
5216
5217 util_est_enqueue(&rq->cfs, p);
5218
5219
5220
5221
5222
5223
5224 if (p->in_iowait)
5225 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5226
5227 for_each_sched_entity(se) {
5228 if (se->on_rq)
5229 break;
5230 cfs_rq = cfs_rq_of(se);
5231 enqueue_entity(cfs_rq, se, flags);
5232
5233
5234
5235
5236
5237
5238
5239 if (cfs_rq_throttled(cfs_rq))
5240 break;
5241 cfs_rq->h_nr_running++;
5242 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5243
5244 flags = ENQUEUE_WAKEUP;
5245 }
5246
5247 for_each_sched_entity(se) {
5248 cfs_rq = cfs_rq_of(se);
5249 cfs_rq->h_nr_running++;
5250 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5251
5252 if (cfs_rq_throttled(cfs_rq))
5253 break;
5254
5255 update_load_avg(cfs_rq, se, UPDATE_TG);
5256 update_cfs_group(se);
5257 }
5258
5259 if (!se) {
5260 add_nr_running(rq, 1);
5261
 /*
  * Only update the overutilized status for wakeups: freshly forked
  * tasks start from a synthetic util_avg that could spuriously trip
  * the threshold and disturb energy-aware placement before their
  * utilization has settled.
  */
5275 if (flags & ENQUEUE_WAKEUP)
5276 update_overutilized_status(rq);
5277
5278 }
5279
5280 if (cfs_bandwidth_used()) {
5281
5282
5283
5284
5285
5286
5287 for_each_sched_entity(se) {
5288 cfs_rq = cfs_rq_of(se);
5289
5290 if (list_add_leaf_cfs_rq(cfs_rq))
5291 break;
5292 }
5293 }
5294
5295 assert_list_leaf_cfs_rq(rq);
5296
5297 hrtick_update(rq);
5298}
5299
5300static void set_next_buddy(struct sched_entity *se);
5301
5302
5303
5304
5305
5306
5307static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5308{
5309 struct cfs_rq *cfs_rq;
5310 struct sched_entity *se = &p->se;
5311 int task_sleep = flags & DEQUEUE_SLEEP;
5312 int idle_h_nr_running = task_has_idle_policy(p);
5313
5314 for_each_sched_entity(se) {
5315 cfs_rq = cfs_rq_of(se);
5316 dequeue_entity(cfs_rq, se, flags);
5317
5318
5319
5320
5321
5322
5323
5324 if (cfs_rq_throttled(cfs_rq))
5325 break;
5326 cfs_rq->h_nr_running--;
5327 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5328
5329
5330 if (cfs_rq->load.weight) {
5331
5332 se = parent_entity(se);
5333
5334
5335
5336
5337 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5338 set_next_buddy(se);
5339 break;
5340 }
5341 flags |= DEQUEUE_SLEEP;
5342 }
5343
5344 for_each_sched_entity(se) {
5345 cfs_rq = cfs_rq_of(se);
5346 cfs_rq->h_nr_running--;
5347 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5348
5349 if (cfs_rq_throttled(cfs_rq))
5350 break;
5351
5352 update_load_avg(cfs_rq, se, UPDATE_TG);
5353 update_cfs_group(se);
5354 }
5355
5356 if (!se)
5357 sub_nr_running(rq, 1);
5358
5359 util_est_dequeue(&rq->cfs, p, task_sleep);
5360 hrtick_update(rq);
5361}
5362
5363#ifdef CONFIG_SMP
5364
5365
5366DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5367DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5368
5369#ifdef CONFIG_NO_HZ_COMMON
5370
5371static struct {
5372 cpumask_var_t idle_cpus_mask;
5373 atomic_t nr_cpus;
5374 int has_blocked;
5375 unsigned long next_balance;
5376 unsigned long next_blocked;
5377} nohz ____cacheline_aligned;
5378
5379#endif
5380
5381
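/* Is the CPU running only SCHED_IDLE tasks (and at least one)? */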
5382static int sched_idle_cpu(int cpu)
5383{
5384 struct rq *rq = cpu_rq(cpu);
5385
5386 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5387 rq->nr_running);
5388}
5389
5390static unsigned long cpu_load(struct rq *rq)
5391{
5392 return cfs_rq_load_avg(&rq->cfs);
5393}
5394
/*
 * cpu_load_without - CPU load with the contribution of @p removed
 *
 * Returns cpu_load() of @p's runqueue minus @p's own hierarchical load,
 * but only when @p is actually contributing to that CPU (it is the
 * task's CPU and the task has been attached at least once); otherwise
 * plain cpu_load().
 */
5408static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5409{
5410 struct cfs_rq *cfs_rq;
5411 unsigned int load;
5412
5413
5414 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5415 return cpu_load(rq);
5416
5417 cfs_rq = &rq->cfs;
5418 load = READ_ONCE(cfs_rq->avg.load_avg);
5419
5420
5421 lsub_positive(&load, task_h_load(p));
5422
5423 return load;
5424}
5425
5426static unsigned long capacity_of(int cpu)
5427{
5428 return cpu_rq(cpu)->cpu_capacity;
5429}
5430
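/*
 * Track "wakee flips": how often this task wakes a different task than
 * the previous time, with the count decayed once per second.  Used by
 * wake_wide() below.
 */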
5431static void record_wakee(struct task_struct *p)
5432{
5433
5434
5435
5436
5437 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5438 current->wakee_flips >>= 1;
5439 current->wakee_flip_decay_ts = jiffies;
5440 }
5441
5442 if (current->last_wakee != p) {
5443 current->last_wakee = p;
5444 current->wakee_flips++;
5445 }
5446}
5447
/*
 * Detect M:N waker/wakee relationships via the wakee_flips heuristic.
 * When both the waker's and the wakee's flip rates are high relative to
 * the LLC size, an affine wakeup is unlikely to pay off and the wakeup
 * is better spread across the machine (return 1).
 */
5465static int wake_wide(struct task_struct *p)
5466{
5467 unsigned int master = current->wakee_flips;
5468 unsigned int slave = p->wakee_flips;
5469 int factor = this_cpu_read(sd_llc_size);
5470
5471 if (master < slave)
5472 swap(master, slave);
5473 if (slave < factor || master < slave * factor)
5474 return 0;
5475 return 1;
5476}
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490static int
5491wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5492{
 /*
  * If this_cpu is idle, the wakeup is likely from interrupt context;
  * only allow the affine move when the two CPUs share cache, and
  * prefer staying on an idle prev_cpu.  For sync wakeups where the
  * waker is the only runnable task, this_cpu is a good target too.
  */
5505 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5506 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5507
5508 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5509 return this_cpu;
5510
5511 return nr_cpumask_bits;
5512}
5513
5514static int
5515wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5516 int this_cpu, int prev_cpu, int sync)
5517{
5518 s64 this_eff_load, prev_eff_load;
5519 unsigned long task_load;
5520
5521 this_eff_load = cpu_load(cpu_rq(this_cpu));
5522
5523 if (sync) {
5524 unsigned long current_load = task_h_load(current);
5525
5526 if (current_load > this_eff_load)
5527 return this_cpu;
5528
5529 this_eff_load -= current_load;
5530 }
5531
5532 task_load = task_h_load(p);
5533
5534 this_eff_load += task_load;
5535 if (sched_feat(WA_BIAS))
5536 this_eff_load *= 100;
5537 this_eff_load *= capacity_of(prev_cpu);
5538
5539 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
5540 prev_eff_load -= task_load;
5541 if (sched_feat(WA_BIAS))
5542 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5543 prev_eff_load *= capacity_of(this_cpu);
5544
5545
5546
5547
5548
5549
5550
5551 if (sync)
5552 prev_eff_load += 1;
5553
5554 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5555}
5556
5557static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5558 int this_cpu, int prev_cpu, int sync)
5559{
5560 int target = nr_cpumask_bits;
5561
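 /* Try an idle-CPU based choice first, then fall back to load comparison. */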
5562 if (sched_feat(WA_IDLE))
5563 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5564
5565 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5566 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5567
5568 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5569 if (target == nr_cpumask_bits)
5570 return prev_cpu;
5571
5572 schedstat_inc(sd->ttwu_move_affine);
5573 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5574 return target;
5575}
5576
5577static struct sched_group *
5578find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5579 int this_cpu, int sd_flag);
5580
/*
 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
 */
5584static int
5585find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5586{
5587 unsigned long load, min_load = ULONG_MAX;
5588 unsigned int min_exit_latency = UINT_MAX;
5589 u64 latest_idle_timestamp = 0;
5590 int least_loaded_cpu = this_cpu;
5591 int shallowest_idle_cpu = -1, si_cpu = -1;
5592 int i;
5593
5594
5595 if (group->group_weight == 1)
5596 return cpumask_first(sched_group_span(group));
5597
5598
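 /* Traverse only the allowed CPUs */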
5599 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5600 if (available_idle_cpu(i)) {
5601 struct rq *rq = cpu_rq(i);
5602 struct cpuidle_state *idle = idle_get_state(rq);
5603 if (idle && idle->exit_latency < min_exit_latency) {
5604
5605
5606
5607
5608
5609 min_exit_latency = idle->exit_latency;
5610 latest_idle_timestamp = rq->idle_stamp;
5611 shallowest_idle_cpu = i;
5612 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5613 rq->idle_stamp > latest_idle_timestamp) {
5614
5615
5616
5617
5618
5619 latest_idle_timestamp = rq->idle_stamp;
5620 shallowest_idle_cpu = i;
5621 }
5622 } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
5623 if (sched_idle_cpu(i)) {
5624 si_cpu = i;
5625 continue;
5626 }
5627
5628 load = cpu_load(cpu_rq(i));
5629 if (load < min_load) {
5630 min_load = load;
5631 least_loaded_cpu = i;
5632 }
5633 }
5634 }
5635
5636 if (shallowest_idle_cpu != -1)
5637 return shallowest_idle_cpu;
5638 if (si_cpu != -1)
5639 return si_cpu;
5640 return least_loaded_cpu;
5641}
5642
5643static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5644 int cpu, int prev_cpu, int sd_flag)
5645{
5646 int new_cpu = cpu;
5647
5648 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
5649 return prev_cpu;
5650
 /*
  * Sync the task's utilization with its previous runqueue so that
  * cpu_util_without() sees up-to-date values. Skip this for forks,
  * which have no valid PELT history yet.
  */
5655 if (!(sd_flag & SD_BALANCE_FORK))
5656 sync_entity_load_avg(&p->se);
5657
5658 while (sd) {
5659 struct sched_group *group;
5660 struct sched_domain *tmp;
5661 int weight;
5662
5663 if (!(sd->flags & sd_flag)) {
5664 sd = sd->child;
5665 continue;
5666 }
5667
5668 group = find_idlest_group(sd, p, cpu, sd_flag);
5669 if (!group) {
5670 sd = sd->child;
5671 continue;
5672 }
5673
5674 new_cpu = find_idlest_group_cpu(group, p, cpu);
5675 if (new_cpu == cpu) {
5676
5677 sd = sd->child;
5678 continue;
5679 }
5680
5681
5682 cpu = new_cpu;
5683 weight = sd->span_weight;
5684 sd = NULL;
5685 for_each_domain(cpu, tmp) {
5686 if (weight <= tmp->span_weight)
5687 break;
5688 if (tmp->flags & sd_flag)
5689 sd = tmp;
5690 }
5691 }
5692
5693 return new_cpu;
5694}
5695
5696#ifdef CONFIG_SCHED_SMT
5697DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5698EXPORT_SYMBOL_GPL(sched_smt_present);
5699
5700static inline void set_idle_cores(int cpu, int val)
5701{
5702 struct sched_domain_shared *sds;
5703
5704 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5705 if (sds)
5706 WRITE_ONCE(sds->has_idle_cores, val);
5707}
5708
5709static inline bool test_idle_cores(int cpu, bool def)
5710{
5711 struct sched_domain_shared *sds;
5712
5713 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5714 if (sds)
5715 return READ_ONCE(sds->has_idle_cores);
5716
5717 return def;
5718}
5719
/*
 * Scans the local SMT mask to see if the entire core is idle, and records this
 * information in sd_llc_shared->has_idle_cores.
 *
 * Since SMT siblings share all cache levels, inspecting this limited remote
 * state should be fairly cheap.
 */
5727void __update_idle_core(struct rq *rq)
5728{
5729 int core = cpu_of(rq);
5730 int cpu;
5731
5732 rcu_read_lock();
5733 if (test_idle_cores(core, true))
5734 goto unlock;
5735
5736 for_each_cpu(cpu, cpu_smt_mask(core)) {
5737 if (cpu == core)
5738 continue;
5739
5740 if (!available_idle_cpu(cpu))
5741 goto unlock;
5742 }
5743
5744 set_idle_cores(core, 1);
5745unlock:
5746 rcu_read_unlock();
5747}
5748
/*
 * Scan the entire LLC domain for idle cores; this dynamically switches off if
 * there are no idle cores left in the system, tracked through
 * sd_llc_shared->has_idle_cores and re-enabled by __update_idle_core() above.
 */
5754static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5755{
5756 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5757 int core, cpu;
5758
5759 if (!static_branch_likely(&sched_smt_present))
5760 return -1;
5761
5762 if (!test_idle_cores(target, false))
5763 return -1;
5764
5765 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
5766
5767 for_each_cpu_wrap(core, cpus, target) {
5768 bool idle = true;
5769
5770 for_each_cpu(cpu, cpu_smt_mask(core)) {
5771 __cpumask_clear_cpu(cpu, cpus);
5772 if (!available_idle_cpu(cpu))
5773 idle = false;
5774 }
5775
5776 if (idle)
5777 return core;
5778 }
5779
5780
5781
5782
5783 set_idle_cores(target, 0);
5784
5785 return -1;
5786}
5787
/*
 * Scan the local SMT mask for idle CPUs.
 */
5791static int select_idle_smt(struct task_struct *p, int target)
5792{
5793 int cpu, si_cpu = -1;
5794
5795 if (!static_branch_likely(&sched_smt_present))
5796 return -1;
5797
5798 for_each_cpu(cpu, cpu_smt_mask(target)) {
5799 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
5800 continue;
5801 if (available_idle_cpu(cpu))
5802 return cpu;
5803 if (si_cpu == -1 && sched_idle_cpu(cpu))
5804 si_cpu = cpu;
5805 }
5806
5807 return si_cpu;
5808}
5809
5810#else
5811
5812static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5813{
5814 return -1;
5815}
5816
5817static inline int select_idle_smt(struct task_struct *p, int target)
5818{
5819 return -1;
5820}
5821
5822#endif
5823
/*
 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
 * average idle time for this rq (as found in rq->avg_idle).
 */
5829static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5830{
5831 struct sched_domain *this_sd;
5832 u64 avg_cost, avg_idle;
5833 u64 time, cost;
5834 s64 delta;
5835 int this = smp_processor_id();
5836 int cpu, nr = INT_MAX, si_cpu = -1;
5837
5838 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5839 if (!this_sd)
5840 return -1;
5841
5842
5843
5844
5845
5846 avg_idle = this_rq()->avg_idle / 512;
5847 avg_cost = this_sd->avg_scan_cost + 1;
5848
5849 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
5850 return -1;
5851
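 /*
  * With SIS_PROP, limit the number of CPUs scanned in proportion to the
  * ratio of this rq's average idle time to the domain's average scan
  * cost, with a floor of 4 CPUs.
  */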
5852 if (sched_feat(SIS_PROP)) {
5853 u64 span_avg = sd->span_weight * avg_idle;
5854 if (span_avg > 4*avg_cost)
5855 nr = div_u64(span_avg, avg_cost);
5856 else
5857 nr = 4;
5858 }
5859
5860 time = cpu_clock(this);
5861
5862 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
5863 if (!--nr)
5864 return si_cpu;
5865 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
5866 continue;
5867 if (available_idle_cpu(cpu))
5868 break;
5869 if (si_cpu == -1 && sched_idle_cpu(cpu))
5870 si_cpu = cpu;
5871 }
5872
5873 time = cpu_clock(this) - time;
5874 cost = this_sd->avg_scan_cost;
5875 delta = (s64)(time - cost) / 8;
5876 this_sd->avg_scan_cost += delta;
5877
5878 return cpu;
5879}
5880
/*
 * Try and locate an idle core/thread in the LLC cache domain.
 */
5884static int select_idle_sibling(struct task_struct *p, int prev, int target)
5885{
5886 struct sched_domain *sd;
5887 int i, recent_used_cpu;
5888
5889 if (available_idle_cpu(target) || sched_idle_cpu(target))
5890 return target;
5891
5892
5893
5894
5895 if (prev != target && cpus_share_cache(prev, target) &&
5896 (available_idle_cpu(prev) || sched_idle_cpu(prev)))
5897 return prev;
5898
5899
5900 recent_used_cpu = p->recent_used_cpu;
5901 if (recent_used_cpu != prev &&
5902 recent_used_cpu != target &&
5903 cpus_share_cache(recent_used_cpu, target) &&
5904 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
5905 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
5906
5907
5908
5909
5910 p->recent_used_cpu = prev;
5911 return recent_used_cpu;
5912 }
5913
5914 sd = rcu_dereference(per_cpu(sd_llc, target));
5915 if (!sd)
5916 return target;
5917
5918 i = select_idle_core(p, sd, target);
5919 if ((unsigned)i < nr_cpumask_bits)
5920 return i;
5921
5922 i = select_idle_cpu(p, sd, target);
5923 if ((unsigned)i < nr_cpumask_bits)
5924 return i;
5925
5926 i = select_idle_smt(p, target);
5927 if ((unsigned)i < nr_cpumask_bits)
5928 return i;
5929
5930 return target;
5931}
5932
/*
 * cpu_util() - estimated utilization of a CPU's CFS runqueue.
 *
 * cfs_rq->avg.util_avg is the PELT-tracked utilization of the runnable and
 * recently-ran tasks of the CPU, in the range [0..capacity_orig]. With
 * UTIL_EST it is combined with util_est.enqueued, which remembers the peak
 * utilization of tasks while they were last runnable and therefore reacts
 * faster after a wakeup. The result is clamped to capacity_orig_of(@cpu) so
 * that it can be compared directly against CPU capacities.
 */
5971static inline unsigned long cpu_util(int cpu)
5972{
5973 struct cfs_rq *cfs_rq;
5974 unsigned int util;
5975
5976 cfs_rq = &cpu_rq(cpu)->cfs;
5977 util = READ_ONCE(cfs_rq->avg.util_avg);
5978
5979 if (sched_feat(UTIL_EST))
5980 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
5981
5982 return min_t(unsigned long, util, capacity_orig_of(cpu));
5983}
5984
/*
 * cpu_util_without - compute CPU utilization without any contribution from *p
 * @cpu: the CPU whose utilization is requested
 * @p:   the task whose utilization should be discounted
 *
 * The utilization of a CPU is defined by the utilization of tasks currently
 * enqueued on that CPU as well as tasks which are currently sleeping after an
 * execution on that CPU. This helper discounts @p's (estimated) utilization
 * whenever @p is currently contributing to @cpu.
 */
5998static unsigned long cpu_util_without(int cpu, struct task_struct *p)
5999{
6000 struct cfs_rq *cfs_rq;
6001 unsigned int util;
6002
6003
6004 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6005 return cpu_util(cpu);
6006
6007 cfs_rq = &cpu_rq(cpu)->cfs;
6008 util = READ_ONCE(cfs_rq->avg.util_avg);
6009
6010
6011 lsub_positive(&util, task_util(p));
6012
 /*
  * The subtraction above handles the case where @p's (blocked)
  * utilization is still tracked by this CPU's util_avg. util_est,
  * handled below, only accounts for currently enqueued tasks and so
  * usually gives a tighter estimate of the CPU's remaining capacity
  * when other tasks are runnable here.
  */
6039 if (sched_feat(UTIL_EST)) {
6040 unsigned int estimated =
6041 READ_ONCE(cfs_rq->avg.util_est.enqueued);
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060 if (unlikely(task_on_rq_queued(p) || current == p))
6061 lsub_positive(&estimated, _task_util_est(p));
6062
6063 util = max(util, estimated);
6064 }
6065
6066
6067
6068
6069
6070
6071 return min_t(unsigned long, util, capacity_orig_of(cpu));
6072}
6073
/*
 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
 * capacity of either the waker CPU @cpu or the previous CPU @prev_cpu.
 *
 * In that case an affine wakeup makes no sense and we let the slow path
 * (find_idlest_cpu) sort things out.
 */
6081static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6082{
6083 long min_cap, max_cap;
6084
6085 if (!static_branch_unlikely(&sched_asym_cpucapacity))
6086 return 0;
6087
6088 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6089 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6090
6091
6092 if (max_cap - min_cap < max_cap >> 3)
6093 return 0;
6094
6095
6096 sync_entity_load_avg(&p->se);
6097
6098 return !task_fits_capacity(p, min_cap);
6099}
6100
/*
 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
 * to @dst_cpu.
 */
6105static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6106{
6107 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6108 unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6109
6110
6111
6112
6113
6114
6115
6116 if (task_cpu(p) == cpu && dst_cpu != cpu)
6117 sub_positive(&util, task_util(p));
6118 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6119 util += task_util(p);
6120
6121 if (sched_feat(UTIL_EST)) {
6122 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6123
6124
6125
6126
6127
6128
6129
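 /*
  * During wake-up the task isn't enqueued yet, so only count its
  * estimated utilization when @dst_cpu is the CPU being evaluated.
  */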
6130 if (dst_cpu == cpu)
6131 util_est += _task_util_est(p);
6132
6133 util = max(util, util_est);
6134 }
6135
6136 return min(util, capacity_orig_of(cpu));
6137}
6138
/*
 * compute_energy(): Estimates the energy that @pd would consume if @p was
 * migrated to @dst_cpu. It predicts the utilization landscape of @pd's CPUs
 * after the migration and feeds it to the Energy Model to obtain the
 * corresponding energy estimate.
 */
6146static long
6147compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6148{
6149 struct cpumask *pd_mask = perf_domain_span(pd);
6150 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6151 unsigned long max_util = 0, sum_util = 0;
6152 int cpu;
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6164 unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6165 struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
6166
6167
6168
6169
6170
6171
6172
6173 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6174 ENERGY_UTIL, NULL);
6175
6176
6177
6178
6179
6180
6181
6182
6183 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6184 FREQUENCY_UTIL, tsk);
6185 max_util = max(max_util, cpu_util);
6186 }
6187
6188 return em_pd_energy(pd->em_pd, max_util, sum_util);
6189}
6190
/*
 * find_energy_efficient_cpu(): Find the most energy-efficient target CPU for
 * the waking task.
 *
 * For each performance domain spanned by the root domain, the CPU with the
 * highest spare capacity is considered as a candidate, since it minimizes the
 * impact on the domain's frequency. compute_energy() is then used to estimate
 * the system energy with the task placed on each candidate and on @prev_cpu,
 * and the placement with the lowest energy is chosen, provided it saves at
 * least ~6% of the energy estimated for keeping the task on @prev_cpu.
 *
 * This only makes sense while the root domain is not overutilized, i.e. while
 * utilization values remain a meaningful proxy for CPU bandwidth consumption.
 * Otherwise we fail and let the regular wakeup path place the task.
 */
6230static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6231{
6232 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6233 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6234 unsigned long cpu_cap, util, base_energy = 0;
6235 int cpu, best_energy_cpu = prev_cpu;
6236 struct sched_domain *sd;
6237 struct perf_domain *pd;
6238
6239 rcu_read_lock();
6240 pd = rcu_dereference(rd->pd);
6241 if (!pd || READ_ONCE(rd->overutilized))
6242 goto fail;
6243
6244
6245
6246
6247
6248 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6249 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6250 sd = sd->parent;
6251 if (!sd)
6252 goto fail;
6253
6254 sync_entity_load_avg(&p->se);
6255 if (!task_util_est(p))
6256 goto unlock;
6257
6258 for (; pd; pd = pd->next) {
6259 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6260 unsigned long base_energy_pd;
6261 int max_spare_cap_cpu = -1;
6262
6263
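 /* Compute the 'base' energy of the pd, i.e. without @p */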
6264 base_energy_pd = compute_energy(p, -1, pd);
6265 base_energy += base_energy_pd;
6266
6267 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6268 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6269 continue;
6270
6271
6272 util = cpu_util_next(cpu, p, cpu);
6273 cpu_cap = capacity_of(cpu);
6274 if (!fits_capacity(util, cpu_cap))
6275 continue;
6276
6277
6278 if (cpu == prev_cpu) {
6279 prev_delta = compute_energy(p, prev_cpu, pd);
6280 prev_delta -= base_energy_pd;
6281 best_delta = min(best_delta, prev_delta);
6282 }
6283
6284
6285
6286
6287
6288 spare_cap = cpu_cap - util;
6289 if (spare_cap > max_spare_cap) {
6290 max_spare_cap = spare_cap;
6291 max_spare_cap_cpu = cpu;
6292 }
6293 }
6294
6295
6296 if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
6297 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6298 cur_delta -= base_energy_pd;
6299 if (cur_delta < best_delta) {
6300 best_delta = cur_delta;
6301 best_energy_cpu = max_spare_cap_cpu;
6302 }
6303 }
6304 }
6305unlock:
6306 rcu_read_unlock();
6307
6308
6309
6310
6311
6312 if (prev_delta == ULONG_MAX)
6313 return best_energy_cpu;
6314
6315 if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6316 return best_energy_cpu;
6317
6318 return prev_cpu;
6319
6320fail:
6321 rcu_read_unlock();
6322
6323 return -1;
6324}
6325
/*
 * select_task_rq_fair: Select target runqueue for the waking task in domains
 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or under
 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.
 *
 * preempt must be disabled.
 */
6338static int
6339select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6340{
6341 struct sched_domain *tmp, *sd = NULL;
6342 int cpu = smp_processor_id();
6343 int new_cpu = prev_cpu;
6344 int want_affine = 0;
6345 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6346
6347 if (sd_flag & SD_BALANCE_WAKE) {
6348 record_wakee(p);
6349
6350 if (sched_energy_enabled()) {
6351 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6352 if (new_cpu >= 0)
6353 return new_cpu;
6354 new_cpu = prev_cpu;
6355 }
6356
6357 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6358 cpumask_test_cpu(cpu, p->cpus_ptr);
6359 }
6360
6361 rcu_read_lock();
6362 for_each_domain(cpu, tmp) {
6363 if (!(tmp->flags & SD_LOAD_BALANCE))
6364 break;
6365
6366
6367
6368
6369
6370 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6371 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6372 if (cpu != prev_cpu)
6373 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6374
6375 sd = NULL;
6376 break;
6377 }
6378
6379 if (tmp->flags & sd_flag)
6380 sd = tmp;
6381 else if (!want_affine)
6382 break;
6383 }
6384
6385 if (unlikely(sd)) {
6386
6387 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6388 } else if (sd_flag & SD_BALANCE_WAKE) {
6389
6390
6391 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6392
6393 if (want_affine)
6394 current->recent_used_cpu = cpu;
6395 }
6396 rcu_read_unlock();
6397
6398 return new_cpu;
6399}
6400
6401static void detach_entity_cfs_rq(struct sched_entity *se);
6402
/*
 * Called immediately before a task is migrated to a new CPU. Adjusts the
 * task's vruntime relative to the old cfs_rq when it is mid-wakeup, and
 * detaches (or lazily removes) its PELT contribution from the old runqueue.
 */
6408static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6409{
6410
6411
6412
6413
6414
6415
6416 if (p->state == TASK_WAKING) {
6417 struct sched_entity *se = &p->se;
6418 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6419 u64 min_vruntime;
6420
6421#ifndef CONFIG_64BIT
6422 u64 min_vruntime_copy;
6423
6424 do {
6425 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6426 smp_rmb();
6427 min_vruntime = cfs_rq->min_vruntime;
6428 } while (min_vruntime != min_vruntime_copy);
6429#else
6430 min_vruntime = cfs_rq->min_vruntime;
6431#endif
6432
6433 se->vruntime -= min_vruntime;
6434 }
6435
6436 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6437
6438
6439
6440
6441 lockdep_assert_held(&task_rq(p)->lock);
6442 detach_entity_cfs_rq(&p->se);
6443
6444 } else {
6445
6446
6447
6448
6449
6450
6451
6452
6453 remove_entity_load_avg(&p->se);
6454 }
6455
6456
6457 p->se.avg.last_update_time = 0;
6458
6459
6460 p->se.exec_start = 0;
6461
6462 update_scan_period(p, new_cpu);
6463}
6464
6465static void task_dead_fair(struct task_struct *p)
6466{
6467 remove_entity_load_avg(&p->se);
6468}
6469
6470static int
6471balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6472{
6473 if (rq->nr_running)
6474 return 1;
6475
6476 return newidle_balance(rq, rf) != 0;
6477}
6478#endif
6479
6480static unsigned long wakeup_gran(struct sched_entity *se)
6481{
6482 unsigned long gran = sysctl_sched_wakeup_granularity;
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497 return calc_delta_fair(gran, se);
6498}
6499
/*
 * Should 'se' preempt 'curr'?
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 * i.e. preempt (return 1) only when se's vruntime lags curr's by more than
 * the wakeup granularity.
 */
6514static int
6515wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6516{
6517 s64 gran, vdiff = curr->vruntime - se->vruntime;
6518
6519 if (vdiff <= 0)
6520 return -1;
6521
6522 gran = wakeup_gran(se);
6523 if (vdiff > gran)
6524 return 1;
6525
6526 return 0;
6527}
6528
6529static void set_last_buddy(struct sched_entity *se)
6530{
6531 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6532 return;
6533
6534 for_each_sched_entity(se) {
6535 if (SCHED_WARN_ON(!se->on_rq))
6536 return;
6537 cfs_rq_of(se)->last = se;
6538 }
6539}
6540
6541static void set_next_buddy(struct sched_entity *se)
6542{
6543 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6544 return;
6545
6546 for_each_sched_entity(se) {
6547 if (SCHED_WARN_ON(!se->on_rq))
6548 return;
6549 cfs_rq_of(se)->next = se;
6550 }
6551}
6552
6553static void set_skip_buddy(struct sched_entity *se)
6554{
6555 for_each_sched_entity(se)
6556 cfs_rq_of(se)->skip = se;
6557}
6558
/*
 * Preempt the current task with a newly woken task if needed:
 */
6562static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6563{
6564 struct task_struct *curr = rq->curr;
6565 struct sched_entity *se = &curr->se, *pse = &p->se;
6566 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6567 int scale = cfs_rq->nr_running >= sched_nr_latency;
6568 int next_buddy_marked = 0;
6569
6570 if (unlikely(se == pse))
6571 return;
6572
6573
6574
6575
6576
6577
6578
6579 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6580 return;
6581
6582 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6583 set_next_buddy(pse);
6584 next_buddy_marked = 1;
6585 }
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597 if (test_tsk_need_resched(curr))
6598 return;
6599
6600
6601 if (unlikely(task_has_idle_policy(curr)) &&
6602 likely(!task_has_idle_policy(p)))
6603 goto preempt;
6604
6605
6606
6607
6608
6609 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6610 return;
6611
6612 find_matching_se(&se, &pse);
6613 update_curr(cfs_rq_of(se));
6614 BUG_ON(!pse);
6615 if (wakeup_preempt_entity(se, pse) == 1) {
6616
6617
6618
6619
6620 if (!next_buddy_marked)
6621 set_next_buddy(pse);
6622 goto preempt;
6623 }
6624
6625 return;
6626
6627preempt:
6628 resched_curr(rq);
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638 if (unlikely(!se->on_rq || curr == rq->idle))
6639 return;
6640
6641 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6642 set_last_buddy(se);
6643}
6644
6645struct task_struct *
6646pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6647{
6648 struct cfs_rq *cfs_rq = &rq->cfs;
6649 struct sched_entity *se;
6650 struct task_struct *p;
6651 int new_tasks;
6652
6653again:
6654 if (!sched_fair_runnable(rq))
6655 goto idle;
6656
6657#ifdef CONFIG_FAIR_GROUP_SCHED
6658 if (!prev || prev->sched_class != &fair_sched_class)
6659 goto simple;
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669 do {
6670 struct sched_entity *curr = cfs_rq->curr;
6671
6672
6673
6674
6675
6676
6677
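 /*
  * Since we haven't yet done put_prev_entity() we also have to
  * consider cfs_rq->curr: if it is still runnable, update its
  * accounting, otherwise forget we've ever seen it.
  */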
6678 if (curr) {
6679 if (curr->on_rq)
6680 update_curr(cfs_rq);
6681 else
6682 curr = NULL;
6683
6684
6685
6686
6687
6688
6689
6690 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6691 cfs_rq = &rq->cfs;
6692
6693 if (!cfs_rq->nr_running)
6694 goto idle;
6695
6696 goto simple;
6697 }
6698 }
6699
6700 se = pick_next_entity(cfs_rq, curr);
6701 cfs_rq = group_cfs_rq(se);
6702 } while (cfs_rq);
6703
6704 p = task_of(se);
6705
6706
6707
6708
6709
6710
6711 if (prev != p) {
6712 struct sched_entity *pse = &prev->se;
6713
6714 while (!(cfs_rq = is_same_group(se, pse))) {
6715 int se_depth = se->depth;
6716 int pse_depth = pse->depth;
6717
6718 if (se_depth <= pse_depth) {
6719 put_prev_entity(cfs_rq_of(pse), pse);
6720 pse = parent_entity(pse);
6721 }
6722 if (se_depth >= pse_depth) {
6723 set_next_entity(cfs_rq_of(se), se);
6724 se = parent_entity(se);
6725 }
6726 }
6727
6728 put_prev_entity(cfs_rq, pse);
6729 set_next_entity(cfs_rq, se);
6730 }
6731
6732 goto done;
6733simple:
6734#endif
6735 if (prev)
6736 put_prev_task(rq, prev);
6737
6738 do {
6739 se = pick_next_entity(cfs_rq, NULL);
6740 set_next_entity(cfs_rq, se);
6741 cfs_rq = group_cfs_rq(se);
6742 } while (cfs_rq);
6743
6744 p = task_of(se);
6745
6746done: __maybe_unused;
6747#ifdef CONFIG_SMP
6748
6749
6750
6751
6752
6753 list_move(&p->se.group_node, &rq->cfs_tasks);
6754#endif
6755
6756 if (hrtick_enabled(rq))
6757 hrtick_start_fair(rq, p);
6758
6759 update_misfit_status(p, rq);
6760
6761 return p;
6762
6763idle:
6764 if (!rf)
6765 return NULL;
6766
6767 new_tasks = newidle_balance(rq, rf);
6768
6769
6770
6771
6772
6773
6774 if (new_tasks < 0)
6775 return RETRY_TASK;
6776
6777 if (new_tasks > 0)
6778 goto again;
6779
6780
6781
6782
6783
6784 update_idle_rq_clock_pelt(rq);
6785
6786 return NULL;
6787}
6788
6789static struct task_struct *__pick_next_task_fair(struct rq *rq)
6790{
6791 return pick_next_task_fair(rq, NULL, NULL);
6792}
6793
6794
6795
6796
6797static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
6798{
6799 struct sched_entity *se = &prev->se;
6800 struct cfs_rq *cfs_rq;
6801
6802 for_each_sched_entity(se) {
6803 cfs_rq = cfs_rq_of(se);
6804 put_prev_entity(cfs_rq, se);
6805 }
6806}
6807
6808
6809
6810
6811
6812
6813static void yield_task_fair(struct rq *rq)
6814{
6815 struct task_struct *curr = rq->curr;
6816 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6817 struct sched_entity *se = &curr->se;
6818
6819
6820
6821
6822 if (unlikely(rq->nr_running == 1))
6823 return;
6824
6825 clear_buddies(cfs_rq, se);
6826
6827 if (curr->policy != SCHED_BATCH) {
6828 update_rq_clock(rq);
6829
6830
6831
6832 update_curr(cfs_rq);
6833
6834
6835
6836
6837
6838 rq_clock_skip_update(rq);
6839 }
6840
6841 set_skip_buddy(se);
6842}
6843
6844static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6845{
6846 struct sched_entity *se = &p->se;
6847
6848
6849 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
6850 return false;
6851
6852
6853 set_next_buddy(se);
6854
6855 yield_task_fair(rq);
6856
6857 return true;
6858}
6859
6860#ifdef CONFIG_SMP
/**************************************************
 * Fair scheduling class load-balancing methods.
 *
 * The code below implements periodic, newly-idle and NOHZ-idle load balancing
 * for SCHED_OTHER tasks. Load is balanced hierarchically along the
 * sched_domain tree: each CPU pulls work from the busiest group of its
 * domains, using the per-entity load tracking (PELT) averages as the measure
 * of load, while respecting task affinities, cache hotness and NUMA
 * preferences.
 **************************************************/
6979static unsigned long __read_mostly max_load_balance_interval = HZ/10;
6980
6981enum fbq_type { regular, remote, all };
6982
/*
 * 'group_type' describes the group of CPUs at the moment of load balancing.
 *
 * The enum is ordered by pulling priority, with the group of lowest priority
 * first, so that group_type values can simply be compared when selecting the
 * busiest group.
 */
6990enum group_type {
6991
6992 group_has_spare = 0,
6993
6994
6995
6996
6997 group_fully_busy,
6998
6999
7000
7001
7002 group_misfit_task,
7003
7004
7005
7006
7007
7008 group_asym_packing,
7009
7010
7011
7012
7013 group_imbalanced,
7014
7015
7016
7017
7018 group_overloaded
7019};
7020
7021enum migration_type {
7022 migrate_load = 0,
7023 migrate_util,
7024 migrate_task,
7025 migrate_misfit
7026};
7027
7028#define LBF_ALL_PINNED 0x01
7029#define LBF_NEED_BREAK 0x02
7030#define LBF_DST_PINNED 0x04
7031#define LBF_SOME_PINNED 0x08
7032#define LBF_NOHZ_STATS 0x10
7033#define LBF_NOHZ_AGAIN 0x20
7034
7035struct lb_env {
7036 struct sched_domain *sd;
7037
7038 struct rq *src_rq;
7039 int src_cpu;
7040
7041 int dst_cpu;
7042 struct rq *dst_rq;
7043
7044 struct cpumask *dst_grpmask;
7045 int new_dst_cpu;
7046 enum cpu_idle_type idle;
7047 long imbalance;
7048
7049 struct cpumask *cpus;
7050
7051 unsigned int flags;
7052
7053 unsigned int loop;
7054 unsigned int loop_break;
7055 unsigned int loop_max;
7056
7057 enum fbq_type fbq_type;
7058 enum migration_type migration_type;
7059 struct list_head tasks;
7060};
7061
/*
 * Is this task likely cache-hot on its current CPU?
 */
7065static int task_hot(struct task_struct *p, struct lb_env *env)
7066{
7067 s64 delta;
7068
7069 lockdep_assert_held(&env->src_rq->lock);
7070
7071 if (p->sched_class != &fair_sched_class)
7072 return 0;
7073
7074 if (unlikely(task_has_idle_policy(p)))
7075 return 0;
7076
7077
7078
7079
7080 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7081 (&p->se == cfs_rq_of(&p->se)->next ||
7082 &p->se == cfs_rq_of(&p->se)->last))
7083 return 1;
7084
7085 if (sysctl_sched_migration_cost == -1)
7086 return 1;
7087 if (sysctl_sched_migration_cost == 0)
7088 return 0;
7089
7090 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7091
7092 return delta < (s64)sysctl_sched_migration_cost;
7093}
7094
7095#ifdef CONFIG_NUMA_BALANCING
7096
7097
7098
7099
7100
7101static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7102{
7103 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7104 unsigned long src_weight, dst_weight;
7105 int src_nid, dst_nid, dist;
7106
7107 if (!static_branch_likely(&sched_numa_balancing))
7108 return -1;
7109
7110 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7111 return -1;
7112
7113 src_nid = cpu_to_node(env->src_cpu);
7114 dst_nid = cpu_to_node(env->dst_cpu);
7115
7116 if (src_nid == dst_nid)
7117 return -1;
7118
7119
7120 if (src_nid == p->numa_preferred_nid) {
7121 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7122 return 1;
7123 else
7124 return -1;
7125 }
7126
7127
7128 if (dst_nid == p->numa_preferred_nid)
7129 return 0;
7130
7131
7132 if (env->idle == CPU_IDLE)
7133 return -1;
7134
7135 dist = node_distance(src_nid, dst_nid);
7136 if (numa_group) {
7137 src_weight = group_weight(p, src_nid, dist);
7138 dst_weight = group_weight(p, dst_nid, dist);
7139 } else {
7140 src_weight = task_weight(p, src_nid, dist);
7141 dst_weight = task_weight(p, dst_nid, dist);
7142 }
7143
7144 return dst_weight < src_weight;
7145}
7146
7147#else
7148static inline int migrate_degrades_locality(struct task_struct *p,
7149 struct lb_env *env)
7150{
7151 return -1;
7152}
7153#endif
7154
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
7158static
7159int can_migrate_task(struct task_struct *p, struct lb_env *env)
7160{
7161 int tsk_cache_hot;
7162
7163 lockdep_assert_held(&env->src_rq->lock);
7164
7165
7166
7167
7168
7169
7170
7171
7172 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7173 return 0;
7174
7175 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7176 int cpu;
7177
7178 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7179
7180 env->flags |= LBF_SOME_PINNED;
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7191 return 0;
7192
7193
7194 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7195 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7196 env->flags |= LBF_DST_PINNED;
7197 env->new_dst_cpu = cpu;
7198 break;
7199 }
7200 }
7201
7202 return 0;
7203 }
7204
7205
7206 env->flags &= ~LBF_ALL_PINNED;
7207
7208 if (task_running(env->src_rq, p)) {
7209 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7210 return 0;
7211 }
7212
7213
7214
7215
7216
7217
7218
7219 tsk_cache_hot = migrate_degrades_locality(p, env);
7220 if (tsk_cache_hot == -1)
7221 tsk_cache_hot = task_hot(p, env);
7222
7223 if (tsk_cache_hot <= 0 ||
7224 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7225 if (tsk_cache_hot == 1) {
7226 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7227 schedstat_inc(p->se.statistics.nr_forced_migrations);
7228 }
7229 return 1;
7230 }
7231
7232 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7233 return 0;
7234}
7235
7236
7237
7238
7239static void detach_task(struct task_struct *p, struct lb_env *env)
7240{
7241 lockdep_assert_held(&env->src_rq->lock);
7242
7243 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7244 set_task_cpu(p, env->dst_cpu);
7245}
7246
7247
7248
7249
7250
7251
7252
7253static struct task_struct *detach_one_task(struct lb_env *env)
7254{
7255 struct task_struct *p;
7256
7257 lockdep_assert_held(&env->src_rq->lock);
7258
7259 list_for_each_entry_reverse(p,
7260 &env->src_rq->cfs_tasks, se.group_node) {
7261 if (!can_migrate_task(p, env))
7262 continue;
7263
7264 detach_task(p, env);
7265
7266
7267
7268
7269
7270
7271
7272 schedstat_inc(env->sd->lb_gained[env->idle]);
7273 return p;
7274 }
7275 return NULL;
7276}
7277
7278static const unsigned int sched_nr_migrate_break = 32;
7279
/*
 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
 * busiest_rq, as part of a balancing operation within domain "sd".
 *
 * Returns the number of detached tasks, or 0 if none could be moved.
 */
7286static int detach_tasks(struct lb_env *env)
7287{
7288 struct list_head *tasks = &env->src_rq->cfs_tasks;
7289 unsigned long util, load;
7290 struct task_struct *p;
7291 int detached = 0;
7292
7293 lockdep_assert_held(&env->src_rq->lock);
7294
7295 if (env->imbalance <= 0)
7296 return 0;
7297
7298 while (!list_empty(tasks)) {
7299
7300
7301
7302
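 /*
  * Don't pull the last running task off a non-idle source rq:
  * stealing everything could just make the imbalance ping-pong.
  */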
7303 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7304 break;
7305
7306 p = list_last_entry(tasks, struct task_struct, se.group_node);
7307
7308 env->loop++;
7309
7310 if (env->loop > env->loop_max)
7311 break;
7312
7313
7314 if (env->loop > env->loop_break) {
7315 env->loop_break += sched_nr_migrate_break;
7316 env->flags |= LBF_NEED_BREAK;
7317 break;
7318 }
7319
7320 if (!can_migrate_task(p, env))
7321 goto next;
7322
7323 switch (env->migration_type) {
7324 case migrate_load:
7325 load = task_h_load(p);
7326
7327 if (sched_feat(LB_MIN) &&
7328 load < 16 && !env->sd->nr_balance_failed)
7329 goto next;
7330
7331
7332
7333
7334
7335
7336
7337 if (load/2 > env->imbalance &&
7338 env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
7339 goto next;
7340
7341 env->imbalance -= load;
7342 break;
7343
7344 case migrate_util:
7345 util = task_util_est(p);
7346
7347 if (util > env->imbalance)
7348 goto next;
7349
7350 env->imbalance -= util;
7351 break;
7352
7353 case migrate_task:
7354 env->imbalance--;
7355 break;
7356
7357 case migrate_misfit:
7358
7359 if (task_fits_capacity(p, capacity_of(env->src_cpu)))
7360 goto next;
7361
7362 env->imbalance = 0;
7363 break;
7364 }
7365
7366 detach_task(p, env);
7367 list_add(&p->se.group_node, &env->tasks);
7368
7369 detached++;
7370
7371#ifdef CONFIG_PREEMPTION
7372
7373
7374
7375
7376
7377 if (env->idle == CPU_NEWLY_IDLE)
7378 break;
7379#endif
7380
7381
7382
7383
7384
7385 if (env->imbalance <= 0)
7386 break;
7387
7388 continue;
7389next:
7390 list_move(&p->se.group_node, tasks);
7391 }
7392
7393
7394
7395
7396
7397
7398 schedstat_add(env->sd->lb_gained[env->idle], detached);
7399
7400 return detached;
7401}
7402
7403
7404
7405
7406static void attach_task(struct rq *rq, struct task_struct *p)
7407{
7408 lockdep_assert_held(&rq->lock);
7409
7410 BUG_ON(task_rq(p) != rq);
7411 activate_task(rq, p, ENQUEUE_NOCLOCK);
7412 check_preempt_curr(rq, p, 0);
7413}
7414
7415
7416
7417
7418
7419static void attach_one_task(struct rq *rq, struct task_struct *p)
7420{
7421 struct rq_flags rf;
7422
7423 rq_lock(rq, &rf);
7424 update_rq_clock(rq);
7425 attach_task(rq, p);
7426 rq_unlock(rq, &rf);
7427}
7428
7429
7430
7431
7432
7433static void attach_tasks(struct lb_env *env)
7434{
7435 struct list_head *tasks = &env->tasks;
7436 struct task_struct *p;
7437 struct rq_flags rf;
7438
7439 rq_lock(env->dst_rq, &rf);
7440 update_rq_clock(env->dst_rq);
7441
7442 while (!list_empty(tasks)) {
7443 p = list_first_entry(tasks, struct task_struct, se.group_node);
7444 list_del_init(&p->se.group_node);
7445
7446 attach_task(env->dst_rq, p);
7447 }
7448
7449 rq_unlock(env->dst_rq, &rf);
7450}
7451
7452#ifdef CONFIG_NO_HZ_COMMON
7453static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7454{
7455 if (cfs_rq->avg.load_avg)
7456 return true;
7457
7458 if (cfs_rq->avg.util_avg)
7459 return true;
7460
7461 return false;
7462}
7463
7464static inline bool others_have_blocked(struct rq *rq)
7465{
7466 if (READ_ONCE(rq->avg_rt.util_avg))
7467 return true;
7468
7469 if (READ_ONCE(rq->avg_dl.util_avg))
7470 return true;
7471
7472#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
7473 if (READ_ONCE(rq->avg_irq.util_avg))
7474 return true;
7475#endif
7476
7477 return false;
7478}
7479
7480static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7481{
7482 rq->last_blocked_load_update_tick = jiffies;
7483
7484 if (!has_blocked)
7485 rq->has_blocked_load = 0;
7486}
7487#else
7488static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7489static inline bool others_have_blocked(struct rq *rq) { return false; }
7490static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7491#endif
7492
7493static bool __update_blocked_others(struct rq *rq, bool *done)
7494{
7495 const struct sched_class *curr_class;
7496 u64 now = rq_clock_pelt(rq);
7497 bool decayed;
7498
7499
7500
7501
7502
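 /*
  * Decay the RT, DL and IRQ PELT signals; the class currently running
  * on this rq keeps accruing instead of decaying.
  */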
7503 curr_class = rq->curr->sched_class;
7504
7505 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
7506 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
7507 update_irq_load_avg(rq, 0);
7508
7509 if (others_have_blocked(rq))
7510 *done = false;
7511
7512 return decayed;
7513}
7514
7515#ifdef CONFIG_FAIR_GROUP_SCHED
7516
7517static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7518{
7519 if (cfs_rq->load.weight)
7520 return false;
7521
7522 if (cfs_rq->avg.load_sum)
7523 return false;
7524
7525 if (cfs_rq->avg.util_sum)
7526 return false;
7527
7528 if (cfs_rq->avg.runnable_load_sum)
7529 return false;
7530
7531 return true;
7532}
7533
7534static bool __update_blocked_fair(struct rq *rq, bool *done)
7535{
7536 struct cfs_rq *cfs_rq, *pos;
7537 bool decayed = false;
7538 int cpu = cpu_of(rq);
7539
7540
7541
7542
7543
7544 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7545 struct sched_entity *se;
7546
7547 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
7548 update_tg_load_avg(cfs_rq, 0);
7549
7550 if (cfs_rq == &rq->cfs)
7551 decayed = true;
7552 }
7553
7554
7555 se = cfs_rq->tg->se[cpu];
7556 if (se && !skip_blocked_update(se))
7557 update_load_avg(cfs_rq_of(se), se, 0);
7558
7559
7560
7561
7562
7563 if (cfs_rq_is_decayed(cfs_rq))
7564 list_del_leaf_cfs_rq(cfs_rq);
7565
7566
7567 if (cfs_rq_has_blocked(cfs_rq))
7568 *done = false;
7569 }
7570
7571 return decayed;
7572}
7573
7574
7575
7576
7577
7578
7579static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7580{
7581 struct rq *rq = rq_of(cfs_rq);
7582 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7583 unsigned long now = jiffies;
7584 unsigned long load;
7585
7586 if (cfs_rq->last_h_load_update == now)
7587 return;
7588
7589 WRITE_ONCE(cfs_rq->h_load_next, NULL);
7590 for_each_sched_entity(se) {
7591 cfs_rq = cfs_rq_of(se);
7592 WRITE_ONCE(cfs_rq->h_load_next, se);
7593 if (cfs_rq->last_h_load_update == now)
7594 break;
7595 }
7596
7597 if (!se) {
7598 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7599 cfs_rq->last_h_load_update = now;
7600 }
7601
7602 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7603 load = cfs_rq->h_load;
7604 load = div64_ul(load * se->avg.load_avg,
7605 cfs_rq_load_avg(cfs_rq) + 1);
7606 cfs_rq = group_cfs_rq(se);
7607 cfs_rq->h_load = load;
7608 cfs_rq->last_h_load_update = now;
7609 }
7610}
7611
7612static unsigned long task_h_load(struct task_struct *p)
7613{
7614 struct cfs_rq *cfs_rq = task_cfs_rq(p);
7615
7616 update_cfs_rq_h_load(cfs_rq);
7617 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7618 cfs_rq_load_avg(cfs_rq) + 1);
7619}
7620#else
7621static bool __update_blocked_fair(struct rq *rq, bool *done)
7622{
7623 struct cfs_rq *cfs_rq = &rq->cfs;
7624 bool decayed;
7625
7626 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
7627 if (cfs_rq_has_blocked(cfs_rq))
7628 *done = false;
7629
7630 return decayed;
7631}
7632
7633static unsigned long task_h_load(struct task_struct *p)
7634{
7635 return p->se.avg.load_avg;
7636}
7637#endif
7638
7639static void update_blocked_averages(int cpu)
7640{
7641 bool decayed = false, done = true;
7642 struct rq *rq = cpu_rq(cpu);
7643 struct rq_flags rf;
7644
7645 rq_lock_irqsave(rq, &rf);
7646 update_rq_clock(rq);
7647
7648 decayed |= __update_blocked_others(rq, &done);
7649 decayed |= __update_blocked_fair(rq, &done);
7650
7651 update_blocked_load_status(rq, !done);
7652 if (decayed)
7653 cpufreq_update_util(rq, 0);
7654 rq_unlock_irqrestore(rq, &rf);
7655}
7656
7657
7658
7659
7660
7661
7662struct sg_lb_stats {
7663 unsigned long avg_load;
7664 unsigned long group_load;
7665 unsigned long group_capacity;
7666 unsigned long group_util;
7667 unsigned int sum_nr_running;
7668 unsigned int sum_h_nr_running;
7669 unsigned int idle_cpus;
7670 unsigned int group_weight;
7671 enum group_type group_type;
7672 unsigned int group_asym_packing;
7673 unsigned long group_misfit_task_load;
7674#ifdef CONFIG_NUMA_BALANCING
7675 unsigned int nr_numa_running;
7676 unsigned int nr_preferred_running;
7677#endif
7678};
7679
7680
7681
7682
7683
7684struct sd_lb_stats {
7685 struct sched_group *busiest;
7686 struct sched_group *local;
7687 unsigned long total_load;
7688 unsigned long total_capacity;
7689 unsigned long avg_load;
7690 unsigned int prefer_sibling;
7691
7692 struct sg_lb_stats busiest_stat;
7693 struct sg_lb_stats local_stat;
7694};
7695
7696static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7697{
7698
7699
7700
7701
7702
7703
7704
7705 *sds = (struct sd_lb_stats){
7706 .busiest = NULL,
7707 .local = NULL,
7708 .total_load = 0UL,
7709 .total_capacity = 0UL,
7710 .busiest_stat = {
7711 .idle_cpus = UINT_MAX,
7712 .group_type = group_has_spare,
7713 },
7714 };
7715}
7716
7717static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7718{
7719 struct rq *rq = cpu_rq(cpu);
7720 unsigned long max = arch_scale_cpu_capacity(cpu);
7721 unsigned long used, free;
7722 unsigned long irq;
7723
7724 irq = cpu_util_irq(rq);
7725
7726 if (unlikely(irq >= max))
7727 return 1;
7728
7729 used = READ_ONCE(rq->avg_rt.util_avg);
7730 used += READ_ONCE(rq->avg_dl.util_avg);
7731
7732 if (unlikely(used >= max))
7733 return 1;
7734
7735 free = max - used;
7736
7737 return scale_irq_capacity(free, irq, max);
7738}
7739
7740static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7741{
7742 unsigned long capacity = scale_rt_capacity(sd, cpu);
7743 struct sched_group *sdg = sd->groups;
7744
7745 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
7746
7747 if (!capacity)
7748 capacity = 1;
7749
7750 cpu_rq(cpu)->cpu_capacity = capacity;
7751 sdg->sgc->capacity = capacity;
7752 sdg->sgc->min_capacity = capacity;
7753 sdg->sgc->max_capacity = capacity;
7754}
7755
7756void update_group_capacity(struct sched_domain *sd, int cpu)
7757{
7758 struct sched_domain *child = sd->child;
7759 struct sched_group *group, *sdg = sd->groups;
7760 unsigned long capacity, min_capacity, max_capacity;
7761 unsigned long interval;
7762
7763 interval = msecs_to_jiffies(sd->balance_interval);
7764 interval = clamp(interval, 1UL, max_load_balance_interval);
7765 sdg->sgc->next_update = jiffies + interval;
7766
7767 if (!child) {
7768 update_cpu_capacity(sd, cpu);
7769 return;
7770 }
7771
7772 capacity = 0;
7773 min_capacity = ULONG_MAX;
7774 max_capacity = 0;
7775
7776 if (child->flags & SD_OVERLAP) {
7777
7778
7779
7780
7781
7782 for_each_cpu(cpu, sched_group_span(sdg)) {
7783 struct sched_group_capacity *sgc;
7784 struct rq *rq = cpu_rq(cpu);
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797 if (unlikely(!rq->sd)) {
7798 capacity += capacity_of(cpu);
7799 } else {
7800 sgc = rq->sd->groups->sgc;
7801 capacity += sgc->capacity;
7802 }
7803
7804 min_capacity = min(capacity, min_capacity);
7805 max_capacity = max(capacity, max_capacity);
7806 }
7807 } else {
7808
7809
7810
7811
7812
7813 group = child->groups;
7814 do {
7815 struct sched_group_capacity *sgc = group->sgc;
7816
7817 capacity += sgc->capacity;
7818 min_capacity = min(sgc->min_capacity, min_capacity);
7819 max_capacity = max(sgc->max_capacity, max_capacity);
7820 group = group->next;
7821 } while (group != child->groups);
7822 }
7823
7824 sdg->sgc->capacity = capacity;
7825 sdg->sgc->min_capacity = min_capacity;
7826 sdg->sgc->max_capacity = max_capacity;
7827}
7828
7829
7830
7831
7832
7833
7834static inline int
7835check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7836{
7837 return ((rq->cpu_capacity * sd->imbalance_pct) <
7838 (rq->cpu_capacity_orig * 100));
7839}
7840
7841
7842
7843
7844
7845
7846static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
7847{
7848 return rq->misfit_task_load &&
7849 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
7850 check_cpu_capacity(rq, sd));
7851}
7852
/*
 * Group imbalance: a group is considered 'imbalanced' when task affinities
 * prevented the lower-level balance pass from evening out the load inside it.
 * The condition is recorded in sgc->imbalance and makes the group eligible
 * for being picked as busiest even when its raw load figures alone would not
 * justify it.
 */
7882static inline int sg_imbalanced(struct sched_group *group)
7883{
7884 return group->sgc->imbalance;
7885}
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899static inline bool
7900group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
7901{
7902 if (sgs->sum_nr_running < sgs->group_weight)
7903 return true;
7904
7905 if ((sgs->group_capacity * 100) >
7906 (sgs->group_util * imbalance_pct))
7907 return true;
7908
7909 return false;
7910}
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920static inline bool
7921group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
7922{
7923 if (sgs->sum_nr_running <= sgs->group_weight)
7924 return false;
7925
7926 if ((sgs->group_capacity * 100) <
7927 (sgs->group_util * imbalance_pct))
7928 return true;
7929
7930 return false;
7931}
7932
7933
7934
7935
7936
7937static inline bool
7938group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7939{
7940 return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
7941}
7942
7943
7944
7945
7946
7947static inline bool
7948group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7949{
7950 return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
7951}
7952
7953static inline enum
7954group_type group_classify(unsigned int imbalance_pct,
7955 struct sched_group *group,
7956 struct sg_lb_stats *sgs)
7957{
7958 if (group_is_overloaded(imbalance_pct, sgs))
7959 return group_overloaded;
7960
7961 if (sg_imbalanced(group))
7962 return group_imbalanced;
7963
7964 if (sgs->group_asym_packing)
7965 return group_asym_packing;
7966
7967 if (sgs->group_misfit_task_load)
7968 return group_misfit_task;
7969
7970 if (!group_has_capacity(imbalance_pct, sgs))
7971 return group_fully_busy;
7972
7973 return group_has_spare;
7974}
7975
7976static bool update_nohz_stats(struct rq *rq, bool force)
7977{
7978#ifdef CONFIG_NO_HZ_COMMON
7979 unsigned int cpu = rq->cpu;
7980
7981 if (!rq->has_blocked_load)
7982 return false;
7983
7984 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
7985 return false;
7986
7987 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
7988 return true;
7989
7990 update_blocked_averages(cpu);
7991
7992 return rq->has_blocked_load;
7993#else
7994 return false;
7995#endif
7996}
7997
7998
7999
8000
8001
8002
8003
8004
8005static inline void update_sg_lb_stats(struct lb_env *env,
8006 struct sched_group *group,
8007 struct sg_lb_stats *sgs,
8008 int *sg_status)
8009{
8010 int i, nr_running, local_group;
8011
8012 memset(sgs, 0, sizeof(*sgs));
8013
8014 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8015
8016 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8017 struct rq *rq = cpu_rq(i);
8018
8019 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8020 env->flags |= LBF_NOHZ_AGAIN;
8021
8022 sgs->group_load += cpu_load(rq);
8023 sgs->group_util += cpu_util(i);
8024 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
8025
8026 nr_running = rq->nr_running;
8027 sgs->sum_nr_running += nr_running;
8028
8029 if (nr_running > 1)
8030 *sg_status |= SG_OVERLOAD;
8031
8032 if (cpu_overutilized(i))
8033 *sg_status |= SG_OVERUTILIZED;
8034
8035#ifdef CONFIG_NUMA_BALANCING
8036 sgs->nr_numa_running += rq->nr_numa_running;
8037 sgs->nr_preferred_running += rq->nr_preferred_running;
8038#endif
8039
8040
8041
8042 if (!nr_running && idle_cpu(i)) {
8043 sgs->idle_cpus++;
8044
8045 continue;
8046 }
8047
8048 if (local_group)
8049 continue;
8050
8051
8052 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8053 sgs->group_misfit_task_load < rq->misfit_task_load) {
8054 sgs->group_misfit_task_load = rq->misfit_task_load;
8055 *sg_status |= SG_OVERLOAD;
8056 }
8057 }
8058
8059
8060 if (env->sd->flags & SD_ASYM_PACKING &&
8061 env->idle != CPU_NOT_IDLE &&
8062 sgs->sum_h_nr_running &&
8063 sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8064 sgs->group_asym_packing = 1;
8065 }
8066
8067 sgs->group_capacity = group->sgc->capacity;
8068
8069 sgs->group_weight = group->group_weight;
8070
8071 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8072
8073
8074 if (sgs->group_type == group_overloaded)
8075 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8076 sgs->group_capacity;
8077}
8078
/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected
 * busiest group, %false otherwise.
 */
8092static bool update_sd_pick_busiest(struct lb_env *env,
8093 struct sd_lb_stats *sds,
8094 struct sched_group *sg,
8095 struct sg_lb_stats *sgs)
8096{
8097 struct sg_lb_stats *busiest = &sds->busiest_stat;
8098
8099
8100 if (!sgs->sum_h_nr_running)
8101 return false;
8102
8103
8104
8105
8106
8107
8108
8109 if (sgs->group_type == group_misfit_task &&
8110 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
8111 sds->local_stat.group_type != group_has_spare))
8112 return false;
8113
8114 if (sgs->group_type > busiest->group_type)
8115 return true;
8116
8117 if (sgs->group_type < busiest->group_type)
8118 return false;
8119
8120
8121
8122
8123
8124
8125 switch (sgs->group_type) {
8126 case group_overloaded:
8127
8128 if (sgs->avg_load <= busiest->avg_load)
8129 return false;
8130 break;
8131
8132 case group_imbalanced:
8133
8134
8135
8136
8137 return false;
8138
8139 case group_asym_packing:
8140
8141 if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8142 return false;
8143 break;
8144
8145 case group_misfit_task:
8146
8147
8148
8149
8150 if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8151 return false;
8152 break;
8153
8154 case group_fully_busy:
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165 if (sgs->avg_load <= busiest->avg_load)
8166 return false;
8167 break;
8168
8169 case group_has_spare:
8170
8171
8172
8173
8174
8175
8176
8177 if (sgs->idle_cpus >= busiest->idle_cpus)
8178 return false;
8179 break;
8180 }
8181
8182
8183
8184
8185
8186
8187
8188 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8189 (sgs->group_type <= group_fully_busy) &&
8190 (group_smaller_min_cpu_capacity(sds->local, sg)))
8191 return false;
8192
8193 return true;
8194}
8195
8196#ifdef CONFIG_NUMA_BALANCING
8197static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8198{
8199 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
8200 return regular;
8201 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
8202 return remote;
8203 return all;
8204}
8205
8206static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8207{
8208 if (rq->nr_running > rq->nr_numa_running)
8209 return regular;
8210 if (rq->nr_running > rq->nr_preferred_running)
8211 return remote;
8212 return all;
8213}
8214#else
8215static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8216{
8217 return all;
8218}
8219
8220static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8221{
8222 return regular;
8223}
8224#endif
8225
8226
8227struct sg_lb_stats;
8228
8229
8230
8231
8232
8233static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8234{
8235
8236 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8237 return 0;
8238
8239 if (task_on_rq_queued(p))
8240 return 1;
8241
8242 return 0;
8243}
8244
8245
8246
8247
8248
8249
8250
8251
8252static int idle_cpu_without(int cpu, struct task_struct *p)
8253{
8254 struct rq *rq = cpu_rq(cpu);
8255
8256 if (rq->curr != rq->idle && rq->curr != p)
8257 return 0;
8258
8259
8260
8261
8262
8263
8264
8265#ifdef CONFIG_SMP
8266 if (!llist_empty(&rq->wake_list))
8267 return 0;
8268#endif
8269
8270 return 1;
8271}
8272
8273
8274
8275
8276
8277
8278
8279
8280static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8281 struct sched_group *group,
8282 struct sg_lb_stats *sgs,
8283 struct task_struct *p)
8284{
8285 int i, nr_running;
8286
8287 memset(sgs, 0, sizeof(*sgs));
8288
8289 for_each_cpu(i, sched_group_span(group)) {
8290 struct rq *rq = cpu_rq(i);
8291 unsigned int local;
8292
8293 sgs->group_load += cpu_load_without(rq, p);
8294 sgs->group_util += cpu_util_without(i, p);
8295 local = task_running_on_cpu(i, p);
8296 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8297
8298 nr_running = rq->nr_running - local;
8299 sgs->sum_nr_running += nr_running;
8300
8301
8302
8303
8304 if (!nr_running && idle_cpu_without(i, p))
8305 sgs->idle_cpus++;
8306
8307 }
8308
8309
8310 if (sd->flags & SD_ASYM_CPUCAPACITY &&
8311 !task_fits_capacity(p, group->sgc->max_capacity)) {
8312 sgs->group_misfit_task_load = 1;
8313 }
8314
8315 sgs->group_capacity = group->sgc->capacity;
8316
8317 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8318
8319
8320
8321
8322
8323 if (sgs->group_type < group_fully_busy)
8324 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8325 sgs->group_capacity;
8326}
8327
8328static bool update_pick_idlest(struct sched_group *idlest,
8329 struct sg_lb_stats *idlest_sgs,
8330 struct sched_group *group,
8331 struct sg_lb_stats *sgs)
8332{
8333 if (sgs->group_type < idlest_sgs->group_type)
8334 return true;
8335
8336 if (sgs->group_type > idlest_sgs->group_type)
8337 return false;
8338
8339
8340
8341
8342
8343
8344 switch (sgs->group_type) {
8345 case group_overloaded:
8346 case group_fully_busy:
8347
8348 if (idlest_sgs->avg_load <= sgs->avg_load)
8349 return false;
8350 break;
8351
8352 case group_imbalanced:
8353 case group_asym_packing:
8354
8355 return false;
8356
8357 case group_misfit_task:
8358
8359 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
8360 return false;
8361 break;
8362
8363 case group_has_spare:
8364
8365 if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
8366 return false;
8367 break;
8368 }
8369
8370 return true;
8371}
8372
/*
 * find_idlest_group() finds and returns the least busy CPU group within the
 * domain.
 *
 * Assumes p is allowed on at least one CPU in sd.
 */
8379static struct sched_group *
8380find_idlest_group(struct sched_domain *sd, struct task_struct *p,
8381 int this_cpu, int sd_flag)
8382{
8383 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
8384 struct sg_lb_stats local_sgs, tmp_sgs;
8385 struct sg_lb_stats *sgs;
8386 unsigned long imbalance;
8387 struct sg_lb_stats idlest_sgs = {
8388 .avg_load = UINT_MAX,
8389 .group_type = group_overloaded,
8390 };
8391
8392 imbalance = scale_load_down(NICE_0_LOAD) *
8393 (sd->imbalance_pct-100) / 100;
8394
8395 do {
8396 int local_group;
8397
8398
8399 if (!cpumask_intersects(sched_group_span(group),
8400 p->cpus_ptr))
8401 continue;
8402
8403 local_group = cpumask_test_cpu(this_cpu,
8404 sched_group_span(group));
8405
8406 if (local_group) {
8407 sgs = &local_sgs;
8408 local = group;
8409 } else {
8410 sgs = &tmp_sgs;
8411 }
8412
8413 update_sg_wakeup_stats(sd, group, sgs, p);
8414
8415 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
8416 idlest = group;
8417 idlest_sgs = *sgs;
8418 }
8419
8420 } while (group = group->next, group != sd->groups);
8421
8422
8423
8424 if (!idlest)
8425 return NULL;
8426
8427
8428 if (!local)
8429 return idlest;
8430
8431
8432
8433
8434
8435 if (local_sgs.group_type < idlest_sgs.group_type)
8436 return NULL;
8437
8438
8439
8440
8441
8442 if (local_sgs.group_type > idlest_sgs.group_type)
8443 return idlest;
8444
8445 switch (local_sgs.group_type) {
8446 case group_overloaded:
8447 case group_fully_busy:
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457 if ((sd->flags & SD_NUMA) &&
8458 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
8459 return NULL;
8460
8461
8462
8463
8464
8465 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
8466 return NULL;
8467
8468 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
8469 return NULL;
8470 break;
8471
8472 case group_imbalanced:
8473 case group_asym_packing:
8474
8475 return NULL;
8476
8477 case group_misfit_task:
8478
8479 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
8480 return NULL;
8481 break;
8482
8483 case group_has_spare:
8484 if (sd->flags & SD_NUMA) {
8485#ifdef CONFIG_NUMA_BALANCING
8486 int idlest_cpu;
8487
8488
8489
8490
8491 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
8492 return NULL;
8493
8494 idlest_cpu = cpumask_first(sched_group_span(idlest));
8495 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
8496 return idlest;
8497#endif
8498
8499
8500
8501
8502
8503
8504 if (local_sgs.idle_cpus)
8505 return NULL;
8506 }
8507
8508
8509
8510
8511
8512
8513
8514 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
8515 return NULL;
8516 break;
8517 }
8518
8519 return idlest;
8520}
8521
8522
8523
8524
8525
8526
8527
8528static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8529{
8530 struct sched_domain *child = env->sd->child;
8531 struct sched_group *sg = env->sd->groups;
8532 struct sg_lb_stats *local = &sds->local_stat;
8533 struct sg_lb_stats tmp_sgs;
8534 int sg_status = 0;
8535
8536#ifdef CONFIG_NO_HZ_COMMON
8537 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8538 env->flags |= LBF_NOHZ_STATS;
8539#endif
8540
8541 do {
8542 struct sg_lb_stats *sgs = &tmp_sgs;
8543 int local_group;
8544
8545 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
8546 if (local_group) {
8547 sds->local = sg;
8548 sgs = local;
8549
8550 if (env->idle != CPU_NEWLY_IDLE ||
8551 time_after_eq(jiffies, sg->sgc->next_update))
8552 update_group_capacity(env->sd, env->dst_cpu);
8553 }
8554
8555 update_sg_lb_stats(env, sg, sgs, &sg_status);
8556
8557 if (local_group)
8558 goto next_group;
8559
8560
8561 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8562 sds->busiest = sg;
8563 sds->busiest_stat = *sgs;
8564 }
8565
8566next_group:
8567
8568 sds->total_load += sgs->group_load;
8569 sds->total_capacity += sgs->group_capacity;
8570
8571 sg = sg->next;
8572 } while (sg != env->sd->groups);
8573
8574
8575 sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
8576
8577#ifdef CONFIG_NO_HZ_COMMON
8578 if ((env->flags & LBF_NOHZ_AGAIN) &&
8579 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8580
8581 WRITE_ONCE(nohz.next_blocked,
8582 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8583 }
8584#endif
8585
8586 if (env->sd->flags & SD_NUMA)
8587 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8588
8589 if (!env->sd->parent) {
8590 struct root_domain *rd = env->dst_rq->rd;
8591
8592
8593 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
8594
8595
8596 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8597 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
8598 } else if (sg_status & SG_OVERUTILIZED) {
8599 struct root_domain *rd = env->dst_rq->rd;
8600
8601 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
8602 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
8603 }
8604}
8605
8606
8607
8608
8609
8610
8611
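/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *                       groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 */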
8612static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8613{
8614 struct sg_lb_stats *local, *busiest;
8615
8616 local = &sds->local_stat;
8617 busiest = &sds->busiest_stat;
8618
8619 if (busiest->group_type == group_misfit_task) {
8620
8621 env->migration_type = migrate_misfit;
8622 env->imbalance = 1;
8623 return;
8624 }
8625
8626 if (busiest->group_type == group_asym_packing) {
8627
8628
8629
8630
8631 env->migration_type = migrate_task;
8632 env->imbalance = busiest->sum_h_nr_running;
8633 return;
8634 }
8635
8636 if (busiest->group_type == group_imbalanced) {
8637
8638
8639
8640
8641
8642
8643 env->migration_type = migrate_task;
8644 env->imbalance = 1;
8645 return;
8646 }
8647
8648
8649
8650
8651
8652
8653
8654
8655
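 /*
  * If the local group has spare capacity, try to use it without
  * overloading the local group or completely emptying the busiest one.
  */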
8656 if (local->group_type == group_has_spare) {
8657 if (busiest->group_type > group_fully_busy) {
8658
8659
8660
8661
8662
8663
8664
8665
8666 env->migration_type = migrate_util;
8667 env->imbalance = max(local->group_capacity, local->group_util) -
8668 local->group_util;
8669
8670
8671
8672
8673
8674
8675
8676
8677 if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
8678 env->migration_type = migrate_task;
8679 env->imbalance = 1;
8680 }
8681
8682 return;
8683 }
8684
8685 if (busiest->group_weight == 1 || sds->prefer_sibling) {
8686 unsigned int nr_diff = busiest->sum_nr_running;
8687
8688
8689
8690
8691 env->migration_type = migrate_task;
8692 lsub_positive(&nr_diff, local->sum_nr_running);
8693 env->imbalance = nr_diff >> 1;
8694 return;
8695 }
8696
8697
8698
8699
8700
8701 env->migration_type = migrate_task;
8702 env->imbalance = max_t(long, 0, (local->idle_cpus -
8703 busiest->idle_cpus) >> 1);
8704 return;
8705 }
8706
8707
8708
8709
8710
8711 if (local->group_type < group_overloaded) {
8712
8713
8714
8715
8716
8717 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
8718 local->group_capacity;
8719
8720 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
8721 sds->total_capacity;
8722 }
8723
8724
8725
8726
8727
8728
8729
8730
8731
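 /*
  * Both groups are (or will become) overloaded, so pull load towards the
  * domain average without pushing the local group above it or dropping
  * the busiest group below it:
  *   imbalance = min((busiest - avg) * busiest_cap,
  *                   (avg - local) * local_cap) / SCHED_CAPACITY_SCALE
  * Rough illustration: busiest avg_load 1200, local 600, domain avg 900
  * and both capacities at 1024 give min(300, 300) = 300 load units to move.
  */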
8732 env->migration_type = migrate_load;
8733 env->imbalance = min(
8734 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
8735 (sds->avg_load - local->avg_load) * local->group_capacity
8736 ) / SCHED_CAPACITY_SCALE;
8737}
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
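/*
 * Decision flow (roughly, as implemented below): a misfit, asym-packing or
 * imbalanced busiest group forces a balance; a local group that is more
 * stressed than the busiest one means we are already balanced; when both
 * are overloaded, the avg_load comparison (with the sd->imbalance_pct
 * margin) decides; when the busiest group is not overloaded, only an idle
 * dst_cpu with clearly fewer busy CPUs pulls tasks.
 */

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance.
 *
 * Also calculates the amount of runnable load which should be moved to
 * restore balance, via calculate_imbalance().
 *
 * @env: The load balancing environment.
 *
 * Return: The busiest group if an imbalance exists, NULL otherwise.
 */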
8772static struct sched_group *find_busiest_group(struct lb_env *env)
8773{
8774 struct sg_lb_stats *local, *busiest;
8775 struct sd_lb_stats sds;
8776
8777 init_sd_lb_stats(&sds);
8778
8779
8780
8781
8782
8783 update_sd_lb_stats(env, &sds);
8784
8785 if (sched_energy_enabled()) {
8786 struct root_domain *rd = env->dst_rq->rd;
8787
8788 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
8789 goto out_balanced;
8790 }
8791
8792 local = &sds.local_stat;
8793 busiest = &sds.busiest_stat;
8794
8795
8796 if (!sds.busiest)
8797 goto out_balanced;
8798
8799
8800 if (busiest->group_type == group_misfit_task)
8801 goto force_balance;
8802
8803
8804 if (busiest->group_type == group_asym_packing)
8805 goto force_balance;
8806
8807
8808
8809
8810
8811
8812 if (busiest->group_type == group_imbalanced)
8813 goto force_balance;
8814
8815
8816
8817
8818
8819 if (local->group_type > busiest->group_type)
8820 goto out_balanced;
8821
8822
8823
8824
8825
8826 if (local->group_type == group_overloaded) {
8827
8828
8829
8830
8831 if (local->avg_load >= busiest->avg_load)
8832 goto out_balanced;
8833
8834
8835 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
8836 sds.total_capacity;
8837
8838
8839
8840
8841
8842 if (local->avg_load >= sds.avg_load)
8843 goto out_balanced;
8844
8845
8846
8847
8848
8849 if (100 * busiest->avg_load <=
8850 env->sd->imbalance_pct * local->avg_load)
8851 goto out_balanced;
8852 }
8853
8854
8855 if (sds.prefer_sibling && local->group_type == group_has_spare &&
8856 busiest->sum_nr_running > local->sum_nr_running + 1)
8857 goto force_balance;
8858
8859 if (busiest->group_type != group_overloaded) {
8860 if (env->idle == CPU_NOT_IDLE)
8861
8862
8863
8864
8865
8866 goto out_balanced;
8867
8868 if (busiest->group_weight > 1 &&
8869 local->idle_cpus <= (busiest->idle_cpus + 1))
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879 goto out_balanced;
8880
8881 if (busiest->sum_h_nr_running == 1)
8882
8883
8884
8885 goto out_balanced;
8886 }
8887
8888force_balance:
8889
8890 calculate_imbalance(env, &sds);
8891 return env->imbalance ? sds.busiest : NULL;
8892
8893out_balanced:
8894 env->imbalance = 0;
8895 return NULL;
8896}
8897
8898
8899
8900
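/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the
 * group, according to the migration type picked by calculate_imbalance().
 */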
8901static struct rq *find_busiest_queue(struct lb_env *env,
8902 struct sched_group *group)
8903{
8904 struct rq *busiest = NULL, *rq;
8905 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
8906 unsigned int busiest_nr = 0;
8907 int i;
8908
8909 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8910 unsigned long capacity, load, util;
8911 unsigned int nr_running;
8912 enum fbq_type rt;
8913
8914 rq = cpu_rq(i);
8915 rt = fbq_classify_rq(rq);
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936 if (rt > env->fbq_type)
8937 continue;
8938
8939 capacity = capacity_of(i);
8940 nr_running = rq->cfs.h_nr_running;
8941
8942
8943
8944
8945
8946
8947
8948 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8949 capacity_of(env->dst_cpu) < capacity &&
8950 nr_running == 1)
8951 continue;
8952
8953 switch (env->migration_type) {
8954 case migrate_load:
8955
8956
8957
8958
8959 load = cpu_load(rq);
8960
8961 if (nr_running == 1 && load > env->imbalance &&
8962 !check_cpu_capacity(rq, env->sd))
8963 break;
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978 if (load * busiest_capacity > busiest_load * capacity) {
8979 busiest_load = load;
8980 busiest_capacity = capacity;
8981 busiest = rq;
8982 }
8983 break;
8984
8985 case migrate_util:
8986 util = cpu_util(cpu_of(rq));
8987
8988 if (busiest_util < util) {
8989 busiest_util = util;
8990 busiest = rq;
8991 }
8992 break;
8993
8994 case migrate_task:
8995 if (busiest_nr < nr_running) {
8996 busiest_nr = nr_running;
8997 busiest = rq;
8998 }
8999 break;
9000
9001 case migrate_misfit:
9002
9003
9004
9005
9006 if (rq->misfit_task_load > busiest_load) {
9007 busiest_load = rq->misfit_task_load;
9008 busiest = rq;
9009 }
9010
9011 break;
9012
9013 }
9014 }
9015
9016 return busiest;
9017}
9018
9019
9020
9021
9022
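/*
 * Max backoff for the balance interval when we keep running into pinned
 * tasks (LBF_ALL_PINNED); fairly arbitrary, it just needs to be large
 * enough to stop us hammering a domain we cannot balance.
 */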
9023#define MAX_PINNED_INTERVAL 512
9024
9025static inline bool
9026asym_active_balance(struct lb_env *env)
9027{
9028
9029
9030
9031
9032
9033 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9034 sched_asym_prefer(env->dst_cpu, env->src_cpu);
9035}
9036
9037static inline bool
9038voluntary_active_balance(struct lb_env *env)
9039{
9040 struct sched_domain *sd = env->sd;
9041
9042 if (asym_active_balance(env))
9043 return 1;
9044
9045
9046
9047
9048
9049
9050
9051 if ((env->idle != CPU_NOT_IDLE) &&
9052 (env->src_rq->cfs.h_nr_running == 1)) {
9053 if ((check_cpu_capacity(env->src_rq, sd)) &&
9054 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9055 return 1;
9056 }
9057
9058 if (env->migration_type == migrate_misfit)
9059 return 1;
9060
9061 return 0;
9062}
9063
9064static int need_active_balance(struct lb_env *env)
9065{
9066 struct sched_domain *sd = env->sd;
9067
9068 if (voluntary_active_balance(env))
9069 return 1;
9070
9071 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
9072}
9073
9074static int active_load_balance_cpu_stop(void *data);
9075
9076static int should_we_balance(struct lb_env *env)
9077{
9078 struct sched_group *sg = env->sd->groups;
9079 int cpu, balance_cpu = -1;
9080
9081
9082
9083
9084
9085 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
9086 return 0;
9087
9088
9089
9090
9091
9092 if (env->idle == CPU_NEWLY_IDLE)
9093 return 1;
9094
9095
9096 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
9097 if (!idle_cpu(cpu))
9098 continue;
9099
9100 balance_cpu = cpu;
9101 break;
9102 }
9103
9104 if (balance_cpu == -1)
9105 balance_cpu = group_balance_cpu(sg);
9106
9107
9108
9109
9110
9111 return balance_cpu == env->dst_cpu;
9112}
9113
9114
9115
9116
9117
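/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */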
9118static int load_balance(int this_cpu, struct rq *this_rq,
9119 struct sched_domain *sd, enum cpu_idle_type idle,
9120 int *continue_balancing)
9121{
9122 int ld_moved, cur_ld_moved, active_balance = 0;
9123 struct sched_domain *sd_parent = sd->parent;
9124 struct sched_group *group;
9125 struct rq *busiest;
9126 struct rq_flags rf;
9127 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
9128
9129 struct lb_env env = {
9130 .sd = sd,
9131 .dst_cpu = this_cpu,
9132 .dst_rq = this_rq,
9133 .dst_grpmask = sched_group_span(sd->groups),
9134 .idle = idle,
9135 .loop_break = sched_nr_migrate_break,
9136 .cpus = cpus,
9137 .fbq_type = all,
9138 .tasks = LIST_HEAD_INIT(env.tasks),
9139 };
9140
9141 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
9142
9143 schedstat_inc(sd->lb_count[idle]);
9144
9145redo:
9146 if (!should_we_balance(&env)) {
9147 *continue_balancing = 0;
9148 goto out_balanced;
9149 }
9150
9151 group = find_busiest_group(&env);
9152 if (!group) {
9153 schedstat_inc(sd->lb_nobusyg[idle]);
9154 goto out_balanced;
9155 }
9156
9157 busiest = find_busiest_queue(&env, group);
9158 if (!busiest) {
9159 schedstat_inc(sd->lb_nobusyq[idle]);
9160 goto out_balanced;
9161 }
9162
9163 BUG_ON(busiest == env.dst_rq);
9164
9165 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
9166
9167 env.src_cpu = busiest->cpu;
9168 env.src_rq = busiest;
9169
9170 ld_moved = 0;
9171 if (busiest->nr_running > 1) {
9172
9173
9174
9175
9176
9177
9178 env.flags |= LBF_ALL_PINNED;
9179 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
9180
9181more_balance:
9182 rq_lock_irqsave(busiest, &rf);
9183 update_rq_clock(busiest);
9184
9185
9186
9187
9188
9189 cur_ld_moved = detach_tasks(&env);
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199 rq_unlock(busiest, &rf);
9200
9201 if (cur_ld_moved) {
9202 attach_tasks(&env);
9203 ld_moved += cur_ld_moved;
9204 }
9205
9206 local_irq_restore(rf.flags);
9207
9208 if (env.flags & LBF_NEED_BREAK) {
9209 env.flags &= ~LBF_NEED_BREAK;
9210 goto more_balance;
9211 }
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9233
9234
9235 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
9236
9237 env.dst_rq = cpu_rq(env.new_dst_cpu);
9238 env.dst_cpu = env.new_dst_cpu;
9239 env.flags &= ~LBF_DST_PINNED;
9240 env.loop = 0;
9241 env.loop_break = sched_nr_migrate_break;
9242
9243
9244
9245
9246
9247 goto more_balance;
9248 }
9249
9250
9251
9252
9253 if (sd_parent) {
9254 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9255
9256 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
9257 *group_imbalance = 1;
9258 }
9259
9260
9261 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9262 __cpumask_clear_cpu(cpu_of(busiest), cpus);
9263
9264
9265
9266
9267
9268
9269
9270
9271 if (!cpumask_subset(cpus, env.dst_grpmask)) {
9272 env.loop = 0;
9273 env.loop_break = sched_nr_migrate_break;
9274 goto redo;
9275 }
9276 goto out_all_pinned;
9277 }
9278 }
9279
9280 if (!ld_moved) {
9281 schedstat_inc(sd->lb_failed[idle]);
9282
9283
9284
9285
9286
9287
9288 if (idle != CPU_NEWLY_IDLE)
9289 sd->nr_balance_failed++;
9290
9291 if (need_active_balance(&env)) {
9292 unsigned long flags;
9293
9294 raw_spin_lock_irqsave(&busiest->lock, flags);
9295
9296
9297
9298
9299
9300
9301 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9302 raw_spin_unlock_irqrestore(&busiest->lock,
9303 flags);
9304 env.flags |= LBF_ALL_PINNED;
9305 goto out_one_pinned;
9306 }
9307
9308
9309
9310
9311
9312
9313 if (!busiest->active_balance) {
9314 busiest->active_balance = 1;
9315 busiest->push_cpu = this_cpu;
9316 active_balance = 1;
9317 }
9318 raw_spin_unlock_irqrestore(&busiest->lock, flags);
9319
9320 if (active_balance) {
9321 stop_one_cpu_nowait(cpu_of(busiest),
9322 active_load_balance_cpu_stop, busiest,
9323 &busiest->active_balance_work);
9324 }
9325
9326
9327 sd->nr_balance_failed = sd->cache_nice_tries+1;
9328 }
9329 } else
9330 sd->nr_balance_failed = 0;
9331
9332 if (likely(!active_balance) || voluntary_active_balance(&env)) {
9333
9334 sd->balance_interval = sd->min_interval;
9335 } else {
9336
9337
9338
9339
9340
9341
9342 if (sd->balance_interval < sd->max_interval)
9343 sd->balance_interval *= 2;
9344 }
9345
9346 goto out;
9347
9348out_balanced:
9349
9350
9351
9352
9353
9354 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
9355 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9356
9357 if (*group_imbalance)
9358 *group_imbalance = 0;
9359 }
9360
9361out_all_pinned:
9362
9363
9364
9365
9366
9367 schedstat_inc(sd->lb_balanced[idle]);
9368
9369 sd->nr_balance_failed = 0;
9370
9371out_one_pinned:
9372 ld_moved = 0;
9373
9374
9375
9376
9377
9378
9379
9380 if (env.idle == CPU_NEWLY_IDLE)
9381 goto out;
9382
9383
9384 if ((env.flags & LBF_ALL_PINNED &&
9385 sd->balance_interval < MAX_PINNED_INTERVAL) ||
9386 sd->balance_interval < sd->max_interval)
9387 sd->balance_interval *= 2;
9388out:
9389 return ld_moved;
9390}
9391
9392static inline unsigned long
9393get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9394{
9395 unsigned long interval = sd->balance_interval;
9396
9397 if (cpu_busy)
9398 interval *= sd->busy_factor;
9399
9400
9401 interval = msecs_to_jiffies(interval);
9402 interval = clamp(interval, 1UL, max_load_balance_interval);
9403
9404 return interval;
9405}
9406
9407static inline void
9408update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
9409{
9410 unsigned long interval, next;
9411
9412
9413 interval = get_sd_balance_interval(sd, 0);
9414 next = sd->last_balance + interval;
9415
9416 if (time_after(*next_balance, next))
9417 *next_balance = next;
9418}
9419
9420
9421
9422
9423
9424
9425
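/*
 * active_load_balance_cpu_stop is run by the CPU stopper on the busiest
 * CPU. It pushes a currently running task off that CPU onto the target
 * CPU that requested the active balance (busiest_rq->push_cpu).
 */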
9426static int active_load_balance_cpu_stop(void *data)
9427{
9428 struct rq *busiest_rq = data;
9429 int busiest_cpu = cpu_of(busiest_rq);
9430 int target_cpu = busiest_rq->push_cpu;
9431 struct rq *target_rq = cpu_rq(target_cpu);
9432 struct sched_domain *sd;
9433 struct task_struct *p = NULL;
9434 struct rq_flags rf;
9435
9436 rq_lock_irq(busiest_rq, &rf);
9437
9438
9439
9440
9441
9442 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
9443 goto out_unlock;
9444
9445
9446 if (unlikely(busiest_cpu != smp_processor_id() ||
9447 !busiest_rq->active_balance))
9448 goto out_unlock;
9449
9450
9451 if (busiest_rq->nr_running <= 1)
9452 goto out_unlock;
9453
9454
9455
9456
9457
9458
9459 BUG_ON(busiest_rq == target_rq);
9460
9461
9462 rcu_read_lock();
9463 for_each_domain(target_cpu, sd) {
9464 if ((sd->flags & SD_LOAD_BALANCE) &&
9465 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9466 break;
9467 }
9468
9469 if (likely(sd)) {
9470 struct lb_env env = {
9471 .sd = sd,
9472 .dst_cpu = target_cpu,
9473 .dst_rq = target_rq,
9474 .src_cpu = busiest_rq->cpu,
9475 .src_rq = busiest_rq,
9476 .idle = CPU_IDLE,
9477
9478
9479
9480
9481
9482
9483 .flags = LBF_DST_PINNED,
9484 };
9485
9486 schedstat_inc(sd->alb_count);
9487 update_rq_clock(busiest_rq);
9488
9489 p = detach_one_task(&env);
9490 if (p) {
9491 schedstat_inc(sd->alb_pushed);
9492
9493 sd->nr_balance_failed = 0;
9494 } else {
9495 schedstat_inc(sd->alb_failed);
9496 }
9497 }
9498 rcu_read_unlock();
9499out_unlock:
9500 busiest_rq->active_balance = 0;
9501 rq_unlock(busiest_rq, &rf);
9502
9503 if (p)
9504 attach_one_task(target_rq, p);
9505
9506 local_irq_enable();
9507
9508 return 0;
9509}
9510
9511static DEFINE_SPINLOCK(balancing);
9512
9513
9514
9515
9516
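/*
 * Scale the max load_balance interval with the number of online CPUs;
 * e.g. with 8 CPUs and HZ=250 this allows intervals of up to 200 jiffies
 * (~800ms). Larger machines trade balance latency for less cross talk.
 */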
9517void update_max_interval(void)
9518{
9519 max_load_balance_interval = HZ*num_online_cpus()/10;
9520}
9521
9522
9523
9524
9525
9526
9527
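/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters (intervals, busy_factor) come from the sched
 * domain setup.
 */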
9528static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9529{
9530 int continue_balancing = 1;
9531 int cpu = rq->cpu;
9532 unsigned long interval;
9533 struct sched_domain *sd;
9534
9535 unsigned long next_balance = jiffies + 60*HZ;
9536 int update_next_balance = 0;
9537 int need_serialize, need_decay = 0;
9538 u64 max_cost = 0;
9539
9540 rcu_read_lock();
9541 for_each_domain(cpu, sd) {
9542
9543
9544
9545
9546 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9547 sd->max_newidle_lb_cost =
9548 (sd->max_newidle_lb_cost * 253) / 256;
9549 sd->next_decay_max_lb_cost = jiffies + HZ;
9550 need_decay = 1;
9551 }
9552 max_cost += sd->max_newidle_lb_cost;
9553
9554 if (!(sd->flags & SD_LOAD_BALANCE))
9555 continue;
9556
9557
9558
9559
9560
9561
9562 if (!continue_balancing) {
9563 if (need_decay)
9564 continue;
9565 break;
9566 }
9567
9568 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9569
9570 need_serialize = sd->flags & SD_SERIALIZE;
9571 if (need_serialize) {
9572 if (!spin_trylock(&balancing))
9573 goto out;
9574 }
9575
9576 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9577 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9578
9579
9580
9581
9582
9583 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9584 }
9585 sd->last_balance = jiffies;
9586 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9587 }
9588 if (need_serialize)
9589 spin_unlock(&balancing);
9590out:
9591 if (time_after(next_balance, sd->last_balance + interval)) {
9592 next_balance = sd->last_balance + interval;
9593 update_next_balance = 1;
9594 }
9595 }
9596 if (need_decay) {
9597
9598
9599
9600
9601 rq->max_idle_balance_cost =
9602 max((u64)sysctl_sched_migration_cost, max_cost);
9603 }
9604 rcu_read_unlock();
9605
9606
9607
9608
9609
9610
9611 if (likely(update_next_balance)) {
9612 rq->next_balance = next_balance;
9613
9614#ifdef CONFIG_NO_HZ_COMMON
9615
9616
9617
9618
9619
9620
9621
9622
9623 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9624 nohz.next_balance = rq->next_balance;
9625#endif
9626 }
9627}
9628
9629static inline int on_null_domain(struct rq *rq)
9630{
9631 return unlikely(!rcu_dereference_sched(rq->sd));
9632}
9633
9634#ifdef CONFIG_NO_HZ_COMMON
9635
9636
9637
9638
9639
9640
9641
9642
9643
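/*
 * Idle load balancing details:
 * - When one of the busy CPUs notices that there may be an idle rebalancing
 *   needed, it kicks the idle load balancer, which then does idle load
 *   balancing on behalf of all the idle CPUs.
 * - The ILB is chosen from the HK_FLAG_MISC housekeeping CPUs that are
 *   currently idle; if none is found, nr_cpu_ids is returned and no kick
 *   happens.
 */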
9644static inline int find_new_ilb(void)
9645{
9646 int ilb;
9647
9648 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
9649 housekeeping_cpumask(HK_FLAG_MISC)) {
9650 if (idle_cpu(ilb))
9651 return ilb;
9652 }
9653
9654 return nr_cpu_ids;
9655}
9656
9657
9658
9659
9660
9661static void kick_ilb(unsigned int flags)
9662{
9663 int ilb_cpu;
9664
9665 nohz.next_balance++;
9666
9667 ilb_cpu = find_new_ilb();
9668
9669 if (ilb_cpu >= nr_cpu_ids)
9670 return;
9671
9672 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9673 if (flags & NOHZ_KICK_MASK)
9674 return;
9675
9676
9677
9678
9679
9680
9681
9682 smp_send_reschedule(ilb_cpu);
9683}
9684
9685
9686
9687
9688
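/*
 * Heuristic for kicking the idle load balancer from a busy CPU's tick:
 * kick when this rq has more than one runnable task, when its CFS capacity
 * is significantly reduced by RT/IRQ pressure, when an idle CPU of higher
 * priority (SD_ASYM_PACKING) or higher capacity (misfit) could take over,
 * or when the LLC domain has more than one busy CPU. A stats-only kick is
 * also sent when the blocked load of the idle CPUs has gone stale.
 */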
9689static void nohz_balancer_kick(struct rq *rq)
9690{
9691 unsigned long now = jiffies;
9692 struct sched_domain_shared *sds;
9693 struct sched_domain *sd;
9694 int nr_busy, i, cpu = rq->cpu;
9695 unsigned int flags = 0;
9696
9697 if (unlikely(rq->idle_balance))
9698 return;
9699
9700
9701
9702
9703
9704 nohz_balance_exit_idle(rq);
9705
9706
9707
9708
9709
9710 if (likely(!atomic_read(&nohz.nr_cpus)))
9711 return;
9712
9713 if (READ_ONCE(nohz.has_blocked) &&
9714 time_after(now, READ_ONCE(nohz.next_blocked)))
9715 flags = NOHZ_STATS_KICK;
9716
9717 if (time_before(now, nohz.next_balance))
9718 goto out;
9719
9720 if (rq->nr_running >= 2) {
9721 flags = NOHZ_KICK_MASK;
9722 goto out;
9723 }
9724
9725 rcu_read_lock();
9726
9727 sd = rcu_dereference(rq->sd);
9728 if (sd) {
9729
9730
9731
9732
9733
9734 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
9735 flags = NOHZ_KICK_MASK;
9736 goto unlock;
9737 }
9738 }
9739
9740 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
9741 if (sd) {
9742
9743
9744
9745
9746
9747 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
9748 if (sched_asym_prefer(i, cpu)) {
9749 flags = NOHZ_KICK_MASK;
9750 goto unlock;
9751 }
9752 }
9753 }
9754
9755 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
9756 if (sd) {
9757
9758
9759
9760
9761 if (check_misfit_status(rq, sd)) {
9762 flags = NOHZ_KICK_MASK;
9763 goto unlock;
9764 }
9765
9766
9767
9768
9769
9770
9771
9772
9773 goto unlock;
9774 }
9775
9776 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9777 if (sds) {
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787 nr_busy = atomic_read(&sds->nr_busy_cpus);
9788 if (nr_busy > 1) {
9789 flags = NOHZ_KICK_MASK;
9790 goto unlock;
9791 }
9792 }
9793unlock:
9794 rcu_read_unlock();
9795out:
9796 if (flags)
9797 kick_ilb(flags);
9798}
9799
9800static void set_cpu_sd_state_busy(int cpu)
9801{
9802 struct sched_domain *sd;
9803
9804 rcu_read_lock();
9805 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9806
9807 if (!sd || !sd->nohz_idle)
9808 goto unlock;
9809 sd->nohz_idle = 0;
9810
9811 atomic_inc(&sd->shared->nr_busy_cpus);
9812unlock:
9813 rcu_read_unlock();
9814}
9815
9816void nohz_balance_exit_idle(struct rq *rq)
9817{
9818 SCHED_WARN_ON(rq != this_rq());
9819
9820 if (likely(!rq->nohz_tick_stopped))
9821 return;
9822
9823 rq->nohz_tick_stopped = 0;
9824 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9825 atomic_dec(&nohz.nr_cpus);
9826
9827 set_cpu_sd_state_busy(rq->cpu);
9828}
9829
9830static void set_cpu_sd_state_idle(int cpu)
9831{
9832 struct sched_domain *sd;
9833
9834 rcu_read_lock();
9835 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9836
9837 if (!sd || sd->nohz_idle)
9838 goto unlock;
9839 sd->nohz_idle = 1;
9840
9841 atomic_dec(&sd->shared->nr_busy_cpus);
9842unlock:
9843 rcu_read_unlock();
9844}
9845
9846
9847
9848
9849
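/*
 * This routine records that the CPU is going idle with the tick stopped,
 * so that it can be included in the next idle load balancing pass.
 */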
9850void nohz_balance_enter_idle(int cpu)
9851{
9852 struct rq *rq = cpu_rq(cpu);
9853
9854 SCHED_WARN_ON(cpu != smp_processor_id());
9855
9856
9857 if (!cpu_active(cpu))
9858 return;
9859
9860
9861 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9862 return;
9863
9864
9865
9866
9867
9868
9869 rq->has_blocked_load = 1;
9870
9871
9872
9873
9874
9875
9876
9877 if (rq->nohz_tick_stopped)
9878 goto out;
9879
9880
9881 if (on_null_domain(rq))
9882 return;
9883
9884 rq->nohz_tick_stopped = 1;
9885
9886 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9887 atomic_inc(&nohz.nr_cpus);
9888
9889
9890
9891
9892
9893
9894 smp_mb__after_atomic();
9895
9896 set_cpu_sd_state_idle(cpu);
9897
9898out:
9899
9900
9901
9902
9903 WRITE_ONCE(nohz.has_blocked, 1);
9904}
9905
9906
9907
9908
9909
9910
9911
9912
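/*
 * Internal function that runs load balance for all idle CPUs. Depending on
 * @flags this is either a simple update of their blocked load or a complete
 * load balance with task movement. Returns false if the loop was aborted
 * (need_resched()) before running through all idle CPUs.
 */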
9913static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9914 enum cpu_idle_type idle)
9915{
9916
9917 unsigned long now = jiffies;
9918 unsigned long next_balance = now + 60*HZ;
9919 bool has_blocked_load = false;
9920 int update_next_balance = 0;
9921 int this_cpu = this_rq->cpu;
9922 int balance_cpu;
9923 int ret = false;
9924 struct rq *rq;
9925
9926 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936 WRITE_ONCE(nohz.has_blocked, 0);
9937
9938
9939
9940
9941
9942 smp_mb();
9943
9944 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9945 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9946 continue;
9947
9948
9949
9950
9951
9952
9953 if (need_resched()) {
9954 has_blocked_load = true;
9955 goto abort;
9956 }
9957
9958 rq = cpu_rq(balance_cpu);
9959
9960 has_blocked_load |= update_nohz_stats(rq, true);
9961
9962
9963
9964
9965
9966 if (time_after_eq(jiffies, rq->next_balance)) {
9967 struct rq_flags rf;
9968
9969 rq_lock_irqsave(rq, &rf);
9970 update_rq_clock(rq);
9971 rq_unlock_irqrestore(rq, &rf);
9972
9973 if (flags & NOHZ_BALANCE_KICK)
9974 rebalance_domains(rq, CPU_IDLE);
9975 }
9976
9977 if (time_after(next_balance, rq->next_balance)) {
9978 next_balance = rq->next_balance;
9979 update_next_balance = 1;
9980 }
9981 }
9982
9983
9984 if (idle != CPU_NEWLY_IDLE) {
9985 update_blocked_averages(this_cpu);
9986 has_blocked_load |= this_rq->has_blocked_load;
9987 }
9988
9989 if (flags & NOHZ_BALANCE_KICK)
9990 rebalance_domains(this_rq, CPU_IDLE);
9991
9992 WRITE_ONCE(nohz.next_blocked,
9993 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9994
9995
9996 ret = true;
9997
9998abort:
9999
10000 if (has_blocked_load)
10001 WRITE_ONCE(nohz.has_blocked, 1);
10002
10003
10004
10005
10006
10007
10008 if (likely(update_next_balance))
10009 nohz.next_balance = next_balance;
10010
10011 return ret;
10012}
10013
10014
10015
10016
10017
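/*
 * In the CONFIG_NO_HZ_COMMON case, the kicked idle CPU does the idle load
 * balancing on behalf of all the CPUs whose tick is stopped.
 */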
10018static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10019{
10020 int this_cpu = this_rq->cpu;
10021 unsigned int flags;
10022
10023 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
10024 return false;
10025
10026 if (idle != CPU_IDLE) {
10027 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10028 return false;
10029 }
10030
10031
10032 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10033 if (!(flags & NOHZ_KICK_MASK))
10034 return false;
10035
10036 _nohz_idle_balance(this_rq, flags, idle);
10037
10038 return true;
10039}
10040
10041static void nohz_newidle_balance(struct rq *this_rq)
10042{
10043 int this_cpu = this_rq->cpu;
10044
10045
10046
10047
10048
10049 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
10050 return;
10051
10052
10053 if (this_rq->avg_idle < sysctl_sched_migration_cost)
10054 return;
10055
10056
10057 if (!READ_ONCE(nohz.has_blocked) ||
10058 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
10059 return;
10060
10061 raw_spin_unlock(&this_rq->lock);
10062
10063
10064
10065
10066
10067
10068 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
10069 kick_ilb(NOHZ_STATS_KICK);
10070 raw_spin_lock(&this_rq->lock);
10071}
10072
10073#else
10074static inline void nohz_balancer_kick(struct rq *rq) { }
10075
10076static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10077{
10078 return false;
10079}
10080
10081static inline void nohz_newidle_balance(struct rq *this_rq) { }
10082#endif
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
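/*
 * newidle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 *
 * Returns:
 *   < 0 - we released the lock and there are !fair tasks present
 *     0 - failed, no new tasks
 *   > 0 - success, new (fair) tasks present
 */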
10093int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
10094{
10095 unsigned long next_balance = jiffies + HZ;
10096 int this_cpu = this_rq->cpu;
10097 struct sched_domain *sd;
10098 int pulled_task = 0;
10099 u64 curr_cost = 0;
10100
10101 update_misfit_status(NULL, this_rq);
10102
10103
10104
10105
10106 this_rq->idle_stamp = rq_clock(this_rq);
10107
10108
10109
10110
10111 if (!cpu_active(this_cpu))
10112 return 0;
10113
10114
10115
10116
10117
10118
10119
10120 rq_unpin_lock(this_rq, rf);
10121
10122 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
10123 !READ_ONCE(this_rq->rd->overload)) {
10124
10125 rcu_read_lock();
10126 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10127 if (sd)
10128 update_next_balance(sd, &next_balance);
10129 rcu_read_unlock();
10130
10131 nohz_newidle_balance(this_rq);
10132
10133 goto out;
10134 }
10135
10136 raw_spin_unlock(&this_rq->lock);
10137
10138 update_blocked_averages(this_cpu);
10139 rcu_read_lock();
10140 for_each_domain(this_cpu, sd) {
10141 int continue_balancing = 1;
10142 u64 t0, domain_cost;
10143
10144 if (!(sd->flags & SD_LOAD_BALANCE))
10145 continue;
10146
10147 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10148 update_next_balance(sd, &next_balance);
10149 break;
10150 }
10151
10152 if (sd->flags & SD_BALANCE_NEWIDLE) {
10153 t0 = sched_clock_cpu(this_cpu);
10154
10155 pulled_task = load_balance(this_cpu, this_rq,
10156 sd, CPU_NEWLY_IDLE,
10157 &continue_balancing);
10158
10159 domain_cost = sched_clock_cpu(this_cpu) - t0;
10160 if (domain_cost > sd->max_newidle_lb_cost)
10161 sd->max_newidle_lb_cost = domain_cost;
10162
10163 curr_cost += domain_cost;
10164 }
10165
10166 update_next_balance(sd, &next_balance);
10167
10168
10169
10170
10171
10172 if (pulled_task || this_rq->nr_running > 0)
10173 break;
10174 }
10175 rcu_read_unlock();
10176
10177 raw_spin_lock(&this_rq->lock);
10178
10179 if (curr_cost > this_rq->max_idle_balance_cost)
10180 this_rq->max_idle_balance_cost = curr_cost;
10181
10182out:
10183
10184
10185
10186
10187
10188 if (this_rq->cfs.h_nr_running && !pulled_task)
10189 pulled_task = 1;
10190
10191
10192 if (time_after(this_rq->next_balance, next_balance))
10193 this_rq->next_balance = next_balance;
10194
10195
10196 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10197 pulled_task = -1;
10198
10199 if (pulled_task)
10200 this_rq->idle_stamp = 0;
10201
10202 rq_repin_lock(this_rq, rf);
10203
10204 return pulled_task;
10205}
10206
10207
10208
10209
10210
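/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with NOHZ_BALANCE_KICK set).
 */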
10211static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
10212{
10213 struct rq *this_rq = this_rq();
10214 enum cpu_idle_type idle = this_rq->idle_balance ?
10215 CPU_IDLE : CPU_NOT_IDLE;
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225 if (nohz_idle_balance(this_rq, idle))
10226 return;
10227
10228
10229 update_blocked_averages(this_rq->cpu);
10230 rebalance_domains(this_rq, idle);
10231}
10232
10233
10234
10235
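/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing,
 * and possibly kick the nohz idle load balancer.
 */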
10236void trigger_load_balance(struct rq *rq)
10237{
10238
10239 if (unlikely(on_null_domain(rq)))
10240 return;
10241
10242 if (time_after_eq(jiffies, rq->next_balance))
10243 raise_softirq(SCHED_SOFTIRQ);
10244
10245 nohz_balancer_kick(rq);
10246}
10247
10248static void rq_online_fair(struct rq *rq)
10249{
10250 update_sysctl();
10251
10252 update_runtime_enabled(rq);
10253}
10254
10255static void rq_offline_fair(struct rq *rq)
10256{
10257 update_sysctl();
10258
10259
10260 unthrottle_offline_cfs_rqs(rq);
10261}
10262
10263#endif
10264
10265
10266
10267
10268
10269
10270
10271
10272
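/*
 * Scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: this can be called remotely by the tick offload that goes along
 * with full dynticks, so no per-CPU assumptions: everything is accessed
 * through the @rq and @curr arguments.
 */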
10273static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
10274{
10275 struct cfs_rq *cfs_rq;
10276 struct sched_entity *se = &curr->se;
10277
10278 for_each_sched_entity(se) {
10279 cfs_rq = cfs_rq_of(se);
10280 entity_tick(cfs_rq, se, queued);
10281 }
10282
10283 if (static_branch_unlikely(&sched_numa_balancing))
10284 task_tick_numa(rq, curr);
10285
10286 update_misfit_status(curr, rq);
10287 update_overutilized_status(task_rq(curr));
10288}
10289
10290
10291
10292
10293
10294
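/*
 * Called on fork with the child task as argument from the parent's context:
 * the child is not yet on the task list and preemption is disabled.
 */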
10295static void task_fork_fair(struct task_struct *p)
10296{
10297 struct cfs_rq *cfs_rq;
10298 struct sched_entity *se = &p->se, *curr;
10299 struct rq *rq = this_rq();
10300 struct rq_flags rf;
10301
10302 rq_lock(rq, &rf);
10303 update_rq_clock(rq);
10304
10305 cfs_rq = task_cfs_rq(current);
10306 curr = cfs_rq->curr;
10307 if (curr) {
10308 update_curr(cfs_rq);
10309 se->vruntime = curr->vruntime;
10310 }
10311 place_entity(cfs_rq, se, 1);
10312
10313 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
10314
10315
10316
10317
10318 swap(curr->vruntime, se->vruntime);
10319 resched_curr(rq);
10320 }
10321
10322 se->vruntime -= cfs_rq->min_vruntime;
10323 rq_unlock(rq, &rf);
10324}
10325
10326
10327
10328
10329
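/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */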
10330static void
10331prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10332{
10333 if (!task_on_rq_queued(p))
10334 return;
10335
10336
10337
10338
10339
10340
10341 if (rq->curr == p) {
10342 if (p->prio > oldprio)
10343 resched_curr(rq);
10344 } else
10345 check_preempt_curr(rq, p, 0);
10346}
10347
10348static inline bool vruntime_normalized(struct task_struct *p)
10349{
10350 struct sched_entity *se = &p->se;
10351
10352
10353
10354
10355
10356
10357 if (p->on_rq)
10358 return true;
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369 if (!se->sum_exec_runtime ||
10370 (p->state == TASK_WAKING && p->sched_remote_wakeup))
10371 return true;
10372
10373 return false;
10374}
10375
10376#ifdef CONFIG_FAIR_GROUP_SCHED
10377
10378
10379
10380
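/*
 * Propagate the changes of the sched_entity across the tg tree to make it
 * visible to the root.
 */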
10381static void propagate_entity_cfs_rq(struct sched_entity *se)
10382{
10383 struct cfs_rq *cfs_rq;
10384
10385
10386 se = se->parent;
10387
10388 for_each_sched_entity(se) {
10389 cfs_rq = cfs_rq_of(se);
10390
10391 if (cfs_rq_throttled(cfs_rq))
10392 break;
10393
10394 update_load_avg(cfs_rq, se, UPDATE_TG);
10395 }
10396}
10397#else
10398static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10399#endif
10400
10401static void detach_entity_cfs_rq(struct sched_entity *se)
10402{
10403 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10404
10405
10406 update_load_avg(cfs_rq, se, 0);
10407 detach_entity_load_avg(cfs_rq, se);
10408 update_tg_load_avg(cfs_rq, false);
10409 propagate_entity_cfs_rq(se);
10410}
10411
10412static void attach_entity_cfs_rq(struct sched_entity *se)
10413{
10414 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10415
10416#ifdef CONFIG_FAIR_GROUP_SCHED
10417
10418
10419
10420
10421 se->depth = se->parent ? se->parent->depth + 1 : 0;
10422#endif
10423
10424
10425 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10426 attach_entity_load_avg(cfs_rq, se, 0);
10427 update_tg_load_avg(cfs_rq, false);
10428 propagate_entity_cfs_rq(se);
10429}
10430
10431static void detach_task_cfs_rq(struct task_struct *p)
10432{
10433 struct sched_entity *se = &p->se;
10434 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10435
10436 if (!vruntime_normalized(p)) {
10437
10438
10439
10440
10441 place_entity(cfs_rq, se, 0);
10442 se->vruntime -= cfs_rq->min_vruntime;
10443 }
10444
10445 detach_entity_cfs_rq(se);
10446}
10447
10448static void attach_task_cfs_rq(struct task_struct *p)
10449{
10450 struct sched_entity *se = &p->se;
10451 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10452
10453 attach_entity_cfs_rq(se);
10454
10455 if (!vruntime_normalized(p))
10456 se->vruntime += cfs_rq->min_vruntime;
10457}
10458
10459static void switched_from_fair(struct rq *rq, struct task_struct *p)
10460{
10461 detach_task_cfs_rq(p);
10462}
10463
10464static void switched_to_fair(struct rq *rq, struct task_struct *p)
10465{
10466 attach_task_cfs_rq(p);
10467
10468 if (task_on_rq_queued(p)) {
10469
10470
10471
10472
10473
10474 if (rq->curr == p)
10475 resched_curr(rq);
10476 else
10477 check_preempt_curr(rq, p, 0);
10478 }
10479}
10480
10481
10482
10483
10484
10485
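/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */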
10486static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
10487{
10488 struct sched_entity *se = &p->se;
10489
10490#ifdef CONFIG_SMP
10491 if (task_on_rq_queued(p)) {
10492
10493
10494
10495
10496 list_move(&se->group_node, &rq->cfs_tasks);
10497 }
10498#endif
10499
10500 for_each_sched_entity(se) {
10501 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10502
10503 set_next_entity(cfs_rq, se);
10504
10505 account_cfs_rq_runtime(cfs_rq, 0);
10506 }
10507}
10508
10509void init_cfs_rq(struct cfs_rq *cfs_rq)
10510{
10511 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
10512 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10513#ifndef CONFIG_64BIT
10514 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10515#endif
10516#ifdef CONFIG_SMP
10517 raw_spin_lock_init(&cfs_rq->removed.lock);
10518#endif
10519}
10520
10521#ifdef CONFIG_FAIR_GROUP_SCHED
10522static void task_set_group_fair(struct task_struct *p)
10523{
10524 struct sched_entity *se = &p->se;
10525
10526 set_task_rq(p, task_cpu(p));
10527 se->depth = se->parent ? se->parent->depth + 1 : 0;
10528}
10529
10530static void task_move_group_fair(struct task_struct *p)
10531{
10532 detach_task_cfs_rq(p);
10533 set_task_rq(p, task_cpu(p));
10534
10535#ifdef CONFIG_SMP
10536
10537 p->se.avg.last_update_time = 0;
10538#endif
10539 attach_task_cfs_rq(p);
10540}
10541
10542static void task_change_group_fair(struct task_struct *p, int type)
10543{
10544 switch (type) {
10545 case TASK_SET_GROUP:
10546 task_set_group_fair(p);
10547 break;
10548
10549 case TASK_MOVE_GROUP:
10550 task_move_group_fair(p);
10551 break;
10552 }
10553}
10554
10555void free_fair_sched_group(struct task_group *tg)
10556{
10557 int i;
10558
10559 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10560
10561 for_each_possible_cpu(i) {
10562 if (tg->cfs_rq)
10563 kfree(tg->cfs_rq[i]);
10564 if (tg->se)
10565 kfree(tg->se[i]);
10566 }
10567
10568 kfree(tg->cfs_rq);
10569 kfree(tg->se);
10570}
10571
10572int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10573{
10574 struct sched_entity *se;
10575 struct cfs_rq *cfs_rq;
10576 int i;
10577
10578 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
10579 if (!tg->cfs_rq)
10580 goto err;
10581 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
10582 if (!tg->se)
10583 goto err;
10584
10585 tg->shares = NICE_0_LOAD;
10586
10587 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10588
10589 for_each_possible_cpu(i) {
10590 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10591 GFP_KERNEL, cpu_to_node(i));
10592 if (!cfs_rq)
10593 goto err;
10594
10595 se = kzalloc_node(sizeof(struct sched_entity),
10596 GFP_KERNEL, cpu_to_node(i));
10597 if (!se)
10598 goto err_free_rq;
10599
10600 init_cfs_rq(cfs_rq);
10601 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10602 init_entity_runnable_average(se);
10603 }
10604
10605 return 1;
10606
10607err_free_rq:
10608 kfree(cfs_rq);
10609err:
10610 return 0;
10611}
10612
10613void online_fair_sched_group(struct task_group *tg)
10614{
10615 struct sched_entity *se;
10616 struct rq_flags rf;
10617 struct rq *rq;
10618 int i;
10619
10620 for_each_possible_cpu(i) {
10621 rq = cpu_rq(i);
10622 se = tg->se[i];
10623 rq_lock_irq(rq, &rf);
10624 update_rq_clock(rq);
10625 attach_entity_cfs_rq(se);
10626 sync_throttle(tg, i);
10627 rq_unlock_irq(rq, &rf);
10628 }
10629}
10630
10631void unregister_fair_sched_group(struct task_group *tg)
10632{
10633 unsigned long flags;
10634 struct rq *rq;
10635 int cpu;
10636
10637 for_each_possible_cpu(cpu) {
10638 if (tg->se[cpu])
10639 remove_entity_load_avg(tg->se[cpu]);
10640
10641
10642
10643
10644
10645 if (!tg->cfs_rq[cpu]->on_list)
10646 continue;
10647
10648 rq = cpu_rq(cpu);
10649
10650 raw_spin_lock_irqsave(&rq->lock, flags);
10651 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10652 raw_spin_unlock_irqrestore(&rq->lock, flags);
10653 }
10654}
10655
10656void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10657 struct sched_entity *se, int cpu,
10658 struct sched_entity *parent)
10659{
10660 struct rq *rq = cpu_rq(cpu);
10661
10662 cfs_rq->tg = tg;
10663 cfs_rq->rq = rq;
10664 init_cfs_rq_runtime(cfs_rq);
10665
10666 tg->cfs_rq[cpu] = cfs_rq;
10667 tg->se[cpu] = se;
10668
10669
10670 if (!se)
10671 return;
10672
10673 if (!parent) {
10674 se->cfs_rq = &rq->cfs;
10675 se->depth = 0;
10676 } else {
10677 se->cfs_rq = parent->my_q;
10678 se->depth = parent->depth + 1;
10679 }
10680
10681 se->my_q = cfs_rq;
10682
10683 update_load_set(&se->load, NICE_0_LOAD);
10684 se->parent = parent;
10685}
10686
10687static DEFINE_MUTEX(shares_mutex);
10688
10689int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10690{
10691 int i;
10692
10693
10694
10695
10696 if (!tg->se[0])
10697 return -EINVAL;
10698
10699 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10700
10701 mutex_lock(&shares_mutex);
10702 if (tg->shares == shares)
10703 goto done;
10704
10705 tg->shares = shares;
10706 for_each_possible_cpu(i) {
10707 struct rq *rq = cpu_rq(i);
10708 struct sched_entity *se = tg->se[i];
10709 struct rq_flags rf;
10710
10711
10712 rq_lock_irqsave(rq, &rf);
10713 update_rq_clock(rq);
10714 for_each_sched_entity(se) {
10715 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
10716 update_cfs_group(se);
10717 }
10718 rq_unlock_irqrestore(rq, &rf);
10719 }
10720
10721done:
10722 mutex_unlock(&shares_mutex);
10723 return 0;
10724}
10725#else
10726
10727void free_fair_sched_group(struct task_group *tg) { }
10728
10729int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10730{
10731 return 1;
10732}
10733
10734void online_fair_sched_group(struct task_group *tg) { }
10735
10736void unregister_fair_sched_group(struct task_group *tg) { }
10737
10738#endif
10739
10740
10741static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10742{
10743 struct sched_entity *se = &task->se;
10744 unsigned int rr_interval = 0;
10745
10746
10747
10748
10749
10750 if (rq->cfs.load.weight)
10751 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10752
10753 return rr_interval;
10754}
10755
10756
10757
10758
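/*
 * All the scheduling class methods:
 */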
10759const struct sched_class fair_sched_class = {
10760 .next = &idle_sched_class,
10761 .enqueue_task = enqueue_task_fair,
10762 .dequeue_task = dequeue_task_fair,
10763 .yield_task = yield_task_fair,
10764 .yield_to_task = yield_to_task_fair,
10765
10766 .check_preempt_curr = check_preempt_wakeup,
10767
10768 .pick_next_task = __pick_next_task_fair,
10769 .put_prev_task = put_prev_task_fair,
10770 .set_next_task = set_next_task_fair,
10771
10772#ifdef CONFIG_SMP
10773 .balance = balance_fair,
10774 .select_task_rq = select_task_rq_fair,
10775 .migrate_task_rq = migrate_task_rq_fair,
10776
10777 .rq_online = rq_online_fair,
10778 .rq_offline = rq_offline_fair,
10779
10780 .task_dead = task_dead_fair,
10781 .set_cpus_allowed = set_cpus_allowed_common,
10782#endif
10783
10784 .task_tick = task_tick_fair,
10785 .task_fork = task_fork_fair,
10786
10787 .prio_changed = prio_changed_fair,
10788 .switched_from = switched_from_fair,
10789 .switched_to = switched_to_fair,
10790
10791 .get_rr_interval = get_rr_interval_fair,
10792
10793 .update_curr = update_curr_fair,
10794
10795#ifdef CONFIG_FAIR_GROUP_SCHED
10796 .task_change_group = task_change_group_fair,
10797#endif
10798
10799#ifdef CONFIG_UCLAMP_TASK
10800 .uclamp_enabled = 1,
10801#endif
10802};
10803
10804#ifdef CONFIG_SCHED_DEBUG
10805void print_cfs_stats(struct seq_file *m, int cpu)
10806{
10807 struct cfs_rq *cfs_rq, *pos;
10808
10809 rcu_read_lock();
10810 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10811 print_cfs_rq(m, cpu, cfs_rq);
10812 rcu_read_unlock();
10813}
10814
10815#ifdef CONFIG_NUMA_BALANCING
10816void show_numa_stats(struct task_struct *p, struct seq_file *m)
10817{
10818 int node;
10819 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10820 struct numa_group *ng;
10821
10822 rcu_read_lock();
10823 ng = rcu_dereference(p->numa_group);
10824 for_each_online_node(node) {
10825 if (p->numa_faults) {
10826 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10827 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10828 }
10829 if (ng) {
10830 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
10831 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
10832 }
10833 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10834 }
10835 rcu_read_unlock();
10836}
10837#endif
10838#endif
10839
10840__init void init_sched_fair_class(void)
10841{
10842#ifdef CONFIG_SMP
10843 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10844
10845#ifdef CONFIG_NO_HZ_COMMON
10846 nohz.next_balance = jiffies;
10847 nohz.next_blocked = jiffies;
10848 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10849#endif
10850#endif
10851
10852}
10853
10854
10855
10856
10857
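/*
 * Helper functions to facilitate extracting info from tracepoints.
 */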
10858const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10859{
10860#ifdef CONFIG_SMP
10861 return cfs_rq ? &cfs_rq->avg : NULL;
10862#else
10863 return NULL;
10864#endif
10865}
10866EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10867
10868char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10869{
10870 if (!cfs_rq) {
10871 if (str)
10872 strlcpy(str, "(null)", len);
10873 /* nothing more to resolve for a NULL cfs_rq; don't fall through */
10874 return str;
10875 }
10876
10877 cfs_rq_tg_path(cfs_rq, str, len);
10878 return str;
10879}
10880EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10881
10882int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
10883{
10884 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
10885}
10886EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10887
10888const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10889{
10890#ifdef CONFIG_SMP
10891 return rq ? &rq->avg_rt : NULL;
10892#else
10893 return NULL;
10894#endif
10895}
10896EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10897
10898const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10899{
10900#ifdef CONFIG_SMP
10901 return rq ? &rq->avg_dl : NULL;
10902#else
10903 return NULL;
10904#endif
10905}
10906EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10907
10908const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10909{
10910#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10911 return rq ? &rq->avg_irq : NULL;
10912#else
10913 return NULL;
10914#endif
10915}
10916EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10917
10918int sched_trace_rq_cpu(struct rq *rq)
10919{
10920 return rq ? cpu_of(rq) : -1;
10921}
10922EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10923
10924const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10925{
10926#ifdef CONFIG_SMP
10927 return rd ? rd->span : NULL;
10928#else
10929 return NULL;
10930#endif
10931}
10932EXPORT_SYMBOL_GPL(sched_trace_rd_span);
10933