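// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 */
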
#include "sched.h"

#include <trace/events/sched.h>
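
/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */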
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
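
/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG)
 */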
enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
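
/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */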
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
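
/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
 */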
static unsigned int sched_nr_latency = 8;
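
/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */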
unsigned int sysctl_sched_child_runs_first __read_mostly;
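
/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */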
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

#ifdef CONFIG_SMP
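/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */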
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}
#endif

#ifdef CONFIG_CFS_BANDWIDTH
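/*
 * Amount of runtime to allocate from global (tg) to local (per-CPU) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */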
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
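
/*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
 *
 * (default: ~20%)
 */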
unsigned int capacity_margin = 1280;

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}
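
/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 */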
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}
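
/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */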
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64-bit multiply */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

const struct sched_class fair_sched_class;
#ifdef CONFIG_FAIR_GROUP_SCHED

/* CPU runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

static inline struct task_struct *task_of(struct sched_entity *se)
{
	SCHED_WARN_ON(!entity_is_task(se));
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);
		int cpu = cpu_of(rq);

		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued. The fact that we always enqueue bottom-up
		 * reduces this to two cases and a special case for the root
		 * cfs_rq.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
			/*
			 * If parent is already on the list, we add the child
			 * just before. Thanks to circular linked property of
			 * the list, this means to put the child at the tail
			 * of the list that starts by parent.
			 */
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
			/*
			 * The branch is now connected to its tree so we can
			 * reset tmp_alone_branch to the beginning of the
			 * list.
			 */
			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		} else if (!cfs_rq->tg->parent) {
			/*
			 * cfs_rq without parent should be put
			 * at the tail of the list.
			 */
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq->leaf_cfs_rq_list);
			/*
			 * We have reached the top of the tree so we can
			 * reset tmp_alone_branch to the beginning of the
			 * list.
			 */
			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		} else {
			/*
			 * The parent has not already been added so we want to
			 * make sure that it will be put after us.
			 * tmp_alone_branch points to the begin of the branch
			 * where we will add parent.
			 */
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				rq->tmp_alone_branch);
			/*
			 * update tmp_alone_branch to points to the new begin
			 * of the branch
			 */
			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
		}

		cfs_rq->on_list = 1;
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
				 leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

	u64 vruntime = cfs_rq->min_vruntime;

	if (curr) {
		if (curr->on_rq)
			vruntime = curr->vruntime;
		else
			curr = NULL;
	}

	if (leftmost) {
		struct sched_entity *se;
		se = rb_entry(leftmost, struct sched_entity, run_node);

		if (!curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	bool leftmost = true;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);

		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color_cached(&se->run_node,
			       &cfs_rq->tasks_timeline, leftmost);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
			      void __user *buffer, size_t *lenp,
			      loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}
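
/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */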
static u64 __sched_period(unsigned long nr_running)
{
	if (unlikely(nr_running > sched_nr_latency))
		return nr_running * sysctl_sched_min_granularity;
	else
		return sysctl_sched_latency;
}
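
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */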
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}
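
/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */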
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#ifdef CONFIG_SMP
#include "pelt.h"
#include "sched-pelt.h"

static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
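
/* Give new sched_entity start runnable values to heavy its load in infant time */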
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact
	 * that nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);

	se->runnable_weight = se->load.weight;

	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static void attach_entity_cfs_rq(struct sched_entity *se);
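
/*
 * With new tasks being created, their initial util_avgs are extrapolated
 * based on the cfs_rq's current util_avg:
 *
 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 *
 * However, in many cases, the above util_avg does not give a desired
 * value. Moreover, the sum of the util_avgs may be divergent, such
 * as when the series is a harmonic series.
 *
 * To solve this problem, we also cap the util_avg of successive tasks to
 * only 1/2 of the left utilization budget:
 *
 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 *
 * where n denotes the nth task and cpu_scale the CPU capacity.
 */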
void post_init_entity_util_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_avg *sa = &se->avg;
	long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;

	if (cap > 0) {
		if (cfs_rq->avg.util_avg != 0) {
			sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
			sa->util_avg /= (cfs_rq->avg.load_avg + 1);

			if (sa->util_avg > cap)
				sa->util_avg = cap;
		} else {
			sa->util_avg = cap;
		}
	}

	if (entity_is_task(se)) {
		struct task_struct *p = task_of(se);
		if (p->sched_class != &fair_sched_class) {
			/*
			 * For !fair tasks do:
			 *
			 * update_cfs_rq_load_avg(now, cfs_rq);
			 * attach_entity_load_avg(cfs_rq, se, 0);
			 * switched_from_fair(rq, p);
			 *
			 * such that the next switched_to_fair() has the
			 * expected state.
			 */
			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
			return;
		}
	}

	attach_entity_cfs_rq(se);
}

#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct sched_entity *se)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}
#endif /* CONFIG_SMP */
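
/*
 * Update the current task's runtime statistics.
 */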
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 wait_start, prev_wait_start;

	if (!schedstat_enabled())
		return;

	wait_start = rq_clock(rq_of(cfs_rq));
	prev_wait_start = schedstat_val(se->statistics.wait_start);

	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
	    likely(wait_start > prev_wait_start))
		wait_start -= prev_wait_start;

	__schedstat_set(se->statistics.wait_start, wait_start);
}

static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *p;
	u64 delta;

	if (!schedstat_enabled())
		return;

	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);

	if (entity_is_task(se)) {
		p = task_of(se);
		if (task_on_rq_migrating(p)) {
			/*
			 * Preserve migrating task's wait time so wait_start
			 * time stamp can be adjusted to accumulate wait time
			 * prior to migration.
			 */
			__schedstat_set(se->statistics.wait_start, delta);
			return;
		}
		trace_sched_stat_wait(p, delta);
	}

	__schedstat_set(se->statistics.wait_max,
			max(schedstat_val(se->statistics.wait_max), delta));
	__schedstat_inc(se->statistics.wait_count);
	__schedstat_add(se->statistics.wait_sum, delta);
	__schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *tsk = NULL;
	u64 sleep_start, block_start;

	if (!schedstat_enabled())
		return;

	sleep_start = schedstat_val(se->statistics.sleep_start);
	block_start = schedstat_val(se->statistics.block_start);

	if (entity_is_task(se))
		tsk = task_of(se);

	if (sleep_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
			__schedstat_set(se->statistics.sleep_max, delta);

		__schedstat_set(se->statistics.sleep_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (block_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
			__schedstat_set(se->statistics.block_max, delta);

		__schedstat_set(se->statistics.block_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			if (tsk->in_iowait) {
				__schedstat_add(se->statistics.iowait_sum, delta);
				__schedstat_inc(se->statistics.iowait_count);
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
					     (void *)get_wchan(tsk),
					     delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
}

/*
 * Task is being enqueued - update stats:
 */
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper(cfs_rq, se);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);

	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
		struct task_struct *tsk = task_of(se);

		if (tsk->state & TASK_INTERRUPTIBLE)
			__schedstat_set(se->statistics.sleep_start,
					rq_clock(rq_of(cfs_rq)));
		if (tsk->state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(se->statistics.block_start,
					rq_clock(rq_of(cfs_rq)));
	}
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
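
/**************************************************
 * Scheduling class queueing methods:
 */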
#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the tasks virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

struct numa_group {
	atomic_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;
	int active_nodes;

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long max_faults_cpu;
	/*
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long *faults_cpu;
	unsigned long faults[0];
};

static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Calculations based on RSS as non-present and empty pages are skipped
	 * by the PTE scanner and NUMA hinting faults should be trapped based
	 * on resident pages
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}

/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560

static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}

static unsigned int task_scan_start(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long period = smin;

	/* Scale the maximum scan period with the amount of shared memory. */
	if (p->numa_group) {
		struct numa_group *ng = p->numa_group;
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);

		period *= atomic_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;
	}

	return max(smin, period);
}

static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long smax;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

	/* Scale the maximum scan period with the amount of shared memory. */
	if (p->numa_group) {
		struct numa_group *ng = p->numa_group;
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);
		unsigned long period = smax;

		period *= atomic_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;

		smax = max(smax, period);
	}

	return max(smin, smax);
}

void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
	int mm_users = 0;
	struct mm_struct *mm = p->mm;

	if (mm) {
		mm_users = atomic_read(&mm->mm_users);
		if (mm_users == 1) {
			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
			mm->numa_scan_seq = 0;
		}
	}
	p->node_stamp = 0;
	p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
	p->numa_faults = NULL;
	p->numa_group = NULL;
	p->last_task_numa_placement = 0;
	p->last_sum_exec_runtime = 0;

	/* New address space, reset the preferred nid */
	if (!(clone_flags & CLONE_VM)) {
		p->numa_preferred_nid = -1;
		return;
	}

	/*
	 * New thread, keep existing numa_preferred_nid which should be copied
	 * already by arch_dup_task_struct but stagger when scans start.
	 */
	if (mm) {
		unsigned int delay;

		delay = min_t(unsigned int, task_scan_max(current),
			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
		delay += 2 * TICK_NSEC;
		p->node_stamp = delay;
	}
}

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != -1);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

pid_t task_numa_group_id(struct task_struct *p)
{
	return p->numa_group ? p->numa_group->gid : 0;
}

/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
	if (!p->numa_faults)
		return 0;

	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults(struct task_struct *p, int nid)
{
	if (!p->numa_group)
		return 0;

	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_priv(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
	}

	return faults;
}

static inline unsigned long group_faults_shared(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
	}

	return faults;
}

/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3

static bool numa_is_active_node(int nid, struct numa_group *ng)
{
	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}

/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
					int maxdist, bool task)
{
	unsigned long score = 0;
	int node;

	/*
	 * All nodes are directly connected, and the same distance
	 * from each other. No need for fancy placement algorithms.
	 */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return 0;

	/*
	 * This code is called for each node, introducing N^2 complexity,
	 * which should be ok given the number of nodes rarely exceeds 8.
	 */
	for_each_online_node(node) {
		unsigned long faults;
		int dist = node_distance(nid, node);

		/*
		 * The furthest away nodes in the system are not interesting
		 * for placement; nid was already counted.
		 */
		if (dist == sched_max_numa_distance || node == nid)
			continue;

		/*
		 * On systems with a backplane NUMA topology, compare groups
		 * of nodes, and move tasks towards the group with the most
		 * memory accesses. When comparing two nodes at distance
		 * "hoplimit", only nodes closer by than "hoplimit" are part
		 * of each group. Skip other nodes.
		 */
		if (sched_numa_topology_type == NUMA_BACKPLANE &&
			dist >= maxdist)
			continue;

		/* Add up the faults from nearby nodes. */
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; the further away a node is, the less the faults
		 * count. This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (sched_max_numa_distance - dist);
			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
		}

		score += faults;
	}

	return score;
}

/*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node. The group weight is given a
 * larger multiplier, in order to group tasks together that are almost
 * evenly spread out between numa nodes.
 */
static inline unsigned long task_weight(struct task_struct *p, int nid,
					int dist)
{
	unsigned long faults, total_faults;

	if (!p->numa_faults)
		return 0;

	total_faults = p->total_numa_faults;

	if (!total_faults)
		return 0;

	faults = task_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, true);

	return 1000 * faults / total_faults;
}

static inline unsigned long group_weight(struct task_struct *p, int nid,
					 int dist)
{
	unsigned long faults, total_faults;

	if (!p->numa_group)
		return 0;

	total_faults = p->numa_group->total_faults;

	if (!total_faults)
		return 0;

	faults = group_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, false);

	return 1000 * faults / total_faults;
}

bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
				int src_nid, int dst_cpu)
{
	struct numa_group *ng = p->numa_group;
	int dst_nid = cpu_to_node(dst_cpu);
	int last_cpupid, this_cpupid;

	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

	/*
	 * Allow first faults or private faults to migrate immediately early in
	 * the lifetime of a task. The magic number 4 is based on waiting for
	 * two full passes of the "multi-stage node selection" test that is
	 * executed below.
	 */
	if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
		return true;

	/*
	 * Multi-stage node selection is used in conjunction with a periodic
	 * scheme to drive towards a lower rate of remote NUMA hinting faults:
	 * a page is only migrated if both the last fault and the current
	 * fault came from the node the page currently resides on. This
	 * filters out pages that bounce between nodes.
	 */
	if (!cpupid_pid_unset(last_cpupid) &&
				cpupid_to_nid(last_cpupid) != dst_nid)
		return false;

	/* Always allow migrate on private faults */
	if (cpupid_match_pid(p, last_cpupid))
		return true;

	/* A shared fault, but p->numa_group has not been set up yet. */
	if (!ng)
		return true;

	/*
	 * Destination node is much more heavily used than the source
	 * node? Allow migration.
	 */
	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
					ACTIVE_NODE_FRACTION)
		return true;

	/*
	 * Distribute memory according to CPU & memory use on each node,
	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
	 *
	 * faults_cpu(dst)   3   faults_cpu(src)
	 * --------------- * - > ---------------
	 * faults_mem(dst)   4   faults_mem(src)
	 */
	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}

static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);

/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long load;

	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;

	unsigned int nr_running;
};

/*
 * XXX borrowed from update_sg_lb_stats
 */
static void update_numa_stats(struct numa_stats *ns, int nid)
{
	int smt, cpu, cpus = 0;
	unsigned long capacity;

	memset(ns, 0, sizeof(*ns));
	for_each_cpu(cpu, cpumask_of_node(nid)) {
		struct rq *rq = cpu_rq(cpu);

		ns->nr_running += rq->nr_running;
		ns->load += weighted_cpuload(rq);
		ns->compute_capacity += capacity_of(cpu);

		cpus++;
	}

	/*
	 * If we raced with hotplug and there are no CPUs left in our mask
	 * the @ns structure is NULL'ed and task_numa_compare() will
	 * not find this node attractive.
	 *
	 * We'll detect a huge imbalance and bail there.
	 */
	if (!cpus)
		return;

	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
	capacity = cpus / smt; /* cores */

	capacity = min_t(unsigned, capacity,
		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
}

struct task_numa_env {
	struct task_struct *p;

	int src_cpu, src_nid;
	int dst_cpu, dst_nid;

	struct numa_stats src_stats, dst_stats;

	int imbalance_pct;
	int dist;

	struct task_struct *best_task;
	long best_imp;
	int best_cpu;
};

static void task_numa_assign(struct task_numa_env *env,
			     struct task_struct *p, long imp)
{
	struct rq *rq = cpu_rq(env->dst_cpu);

	/* Bail out if run-queue part of active NUMA balance. */
	if (xchg(&rq->numa_migrate_on, 1))
		return;

	/*
	 * Clear previous best_cpu/rq numa-migrate flag, since task now
	 * found a better CPU to move/swap.
	 */
	if (env->best_cpu != -1) {
		rq = cpu_rq(env->best_cpu);
		WRITE_ONCE(rq->numa_migrate_on, 0);
	}

	if (env->best_task)
		put_task_struct(env->best_task);
	if (p)
		get_task_struct(p);

	env->best_task = p;
	env->best_imp = imp;
	env->best_cpu = env->dst_cpu;
}

static bool load_too_imbalanced(long src_load, long dst_load,
				struct task_numa_env *env)
{
	long imb, old_imb;
	long orig_src_load, orig_dst_load;
	long src_capacity, dst_capacity;

	/*
	 * The load is corrected for the CPU capacity available on each node.
	 *
	 * src_load        dst_load
	 * ------------ vs ---------
	 * src_capacity    dst_capacity
	 */
	src_capacity = env->src_stats.compute_capacity;
	dst_capacity = env->dst_stats.compute_capacity;

	imb = abs(dst_load * src_capacity - src_load * dst_capacity);

	orig_src_load = env->src_stats.load;
	orig_dst_load = env->dst_stats.load;

	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);

	/* Would this change make things worse? */
	return (imb > old_imb);
}

/*
 * Maximum NUMA importance can be 1998 (2*999);
 * SMALLIMP @ 30 would be close to 1998/64.
 * Used to deter task migration.
 */
#define SMALLIMP	30
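
/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source tasks was migrated to the target dst_cpu taking
 * into account that it might be best if task running on the dst_cpu should
 * be exchanged with the source task
 */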
static void task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp, bool maymove)
{
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	struct task_struct *cur;
	long src_load, dst_load;
	long load;
	long imp = env->p->numa_group ? groupimp : taskimp;
	long moveimp = imp;
	int dist = env->dist;

	if (READ_ONCE(dst_rq->numa_migrate_on))
		return;

	rcu_read_lock();
	cur = task_rcu_dereference(&dst_rq->curr);
	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
		cur = NULL;

	/*
	 * Because we have preemption enabled we can get migrated around and
	 * end up trying to select ourselves (current == env->p) as a swap
	 * candidate.
	 */
	if (cur == env->p)
		goto unlock;

	if (!cur) {
		if (maymove && moveimp >= env->best_imp)
			goto assign;
		else
			goto unlock;
	}

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 */
	/* Skip this swap candidate if it cannot move to the source CPU: */
	if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
		goto unlock;

	/*
	 * If dst and source tasks are in the same NUMA group, or not
	 * in any group then look only at task weights.
	 */
	if (cur->numa_group == env->p->numa_group) {
		imp = taskimp + task_weight(cur, env->src_nid, dist) -
		      task_weight(cur, env->dst_nid, dist);
		/*
		 * Add some hysteresis to prevent swapping the
		 * tasks within a group over tiny differences.
		 */
		if (cur->numa_group)
			imp -= imp / 16;
	} else {
		/*
		 * Compare the group weights. If a task is all by itself
		 * (not part of a group), use the task weight instead.
		 */
		if (cur->numa_group && env->p->numa_group)
			imp += group_weight(cur, env->src_nid, dist) -
			       group_weight(cur, env->dst_nid, dist);
		else
			imp += task_weight(cur, env->src_nid, dist) -
			       task_weight(cur, env->dst_nid, dist);
	}

	if (maymove && moveimp > imp && moveimp > env->best_imp) {
		imp = moveimp;
		cur = NULL;
		goto assign;
	}

	/*
	 * If the NUMA importance is less than SMALLIMP,
	 * task migration might only result in ping pong
	 * of tasks and also hurt performance due to cache
	 * misses.
	 */
	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
		goto unlock;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
	load = task_h_load(env->p) - task_h_load(cur);
	if (!load)
		goto assign;

	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	if (load_too_imbalanced(src_load, dst_load, env))
		goto unlock;

assign:
	/*
	 * One idle CPU per node is evaluated for a task numa move.
	 * Call select_idle_sibling to maybe find a better one.
	 */
	if (!cur) {
		/*
		 * select_idle_siblings() uses a per-CPU cpumask that
		 * can be used from IRQ context.
		 */
		local_irq_disable();
		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
						   env->dst_cpu);
		local_irq_enable();
	}

	task_numa_assign(env, cur, imp);
unlock:
	rcu_read_unlock();
}

static void task_numa_find_cpu(struct task_numa_env *env,
				long taskimp, long groupimp)
{
	long src_load, dst_load, load;
	bool maymove = false;
	int cpu;

	load = task_h_load(env->p);
	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	/*
	 * If the improvement from just moving env->p direction is better
	 * than swapping tasks around, check if a move is possible.
	 */
	maymove = !load_too_imbalanced(src_load, dst_load, env);

	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
		/* Skip this CPU if the source task cannot migrate */
		if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
			continue;

		env->dst_cpu = cpu;
		task_numa_compare(env, taskimp, groupimp, maymove);
	}
}

static int task_numa_migrate(struct task_struct *p)
{
	struct task_numa_env env = {
		.p = p,

		.src_cpu = task_cpu(p),
		.src_nid = task_node(p),

		.imbalance_pct = 112,

		.best_task = NULL,
		.best_imp = 0,
		.best_cpu = -1,
	};
	struct sched_domain *sd;
	struct rq *best_rq;
	unsigned long taskweight, groupweight;
	int nid, ret, dist;
	long taskimp, groupimp;

	/*
	 * Pick the lowest SD_NUMA domain, as that would have the smallest
	 * imbalance and would be the first to start moving tasks about.
	 *
	 * And we want to avoid any moving of tasks about, as that would create
	 * random movement of tasks -- counter the numa conditions we're trying
	 * to satisfy here.
	 */
	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
	if (sd)
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
	rcu_read_unlock();

	/*
	 * Cpusets can break the scheduler domain tree into smaller
	 * balance domains, some of which do not cross NUMA boundaries.
	 * Tasks that are "trapped" in such domains cannot be migrated
	 * elsewhere, so there is no point in (re)trying.
	 */
	if (unlikely(!sd)) {
		sched_setnuma(p, task_node(p));
		return -EINVAL;
	}

	env.dst_nid = p->numa_preferred_nid;
	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
	taskweight = task_weight(p, env.src_nid, dist);
	groupweight = group_weight(p, env.src_nid, dist);
	update_numa_stats(&env.src_stats, env.src_nid);
	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
	update_numa_stats(&env.dst_stats, env.dst_nid);

	/* Try to find a spot on the preferred nid. */
	task_numa_find_cpu(&env, taskimp, groupimp);

	/*
	 * Look at other nodes in these cases:
	 * - there is no space available on the preferred_nid
	 * - the task is part of a numa_group that is interleaved across
	 *   multiple NUMA nodes; in order to better consolidate the group,
	 *   we need to check other locations.
	 */
	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
		for_each_online_node(nid) {
			if (nid == env.src_nid || nid == p->numa_preferred_nid)
				continue;

			dist = node_distance(env.src_nid, env.dst_nid);
			if (sched_numa_topology_type == NUMA_BACKPLANE &&
						dist != env.dist) {
				taskweight = task_weight(p, env.src_nid, dist);
				groupweight = group_weight(p, env.src_nid, dist);
			}

			/* Only consider nodes where both task and groups benefit */
			taskimp = task_weight(p, nid, dist) - taskweight;
			groupimp = group_weight(p, nid, dist) - groupweight;
			if (taskimp < 0 && groupimp < 0)
				continue;

			env.dist = dist;
			env.dst_nid = nid;
			update_numa_stats(&env.dst_stats, env.dst_nid);
			task_numa_find_cpu(&env, taskimp, groupimp);
		}
	}

	/*
	 * If the task is part of a workload that spans multiple NUMA nodes,
	 * and is migrating into one of the workload's active nodes, remember
	 * this node as the task's preferred numa node, so the workload can
	 * settle down.
	 * A task that migrated to a second choice node will be better off
	 * trying for a better one later. Do not set the preferred node here.
	 */
	if (p->numa_group) {
		if (env.best_cpu == -1)
			nid = env.src_nid;
		else
			nid = cpu_to_node(env.best_cpu);

		if (nid != p->numa_preferred_nid)
			sched_setnuma(p, nid);
	}

	/* No better CPU than the current one was found. */
	if (env.best_cpu == -1)
		return -EAGAIN;

	best_rq = cpu_rq(env.best_cpu);
	if (env.best_task == NULL) {
		ret = migrate_task_to(p, env.best_cpu);
		WRITE_ONCE(best_rq->numa_migrate_on, 0);
		if (ret != 0)
			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
		return ret;
	}

	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
	WRITE_ONCE(best_rq->numa_migrate_on, 0);

	if (ret != 0)
		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
	put_task_struct(env.best_task);
	return ret;
}

/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
	unsigned long interval = HZ;

	/* This task has no NUMA fault statistics yet */
	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
		return;

	/* Periodically retry migrating the task to the preferred node */
	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
	p->numa_migrate_retry = jiffies + interval;

	/* Success if task is already running on preferred CPU */
	if (task_node(p) == p->numa_preferred_nid)
		return;

	/* Otherwise, try migrate to a CPU on the preferred node */
	task_numa_migrate(p);
}

/*
 * Find out how many nodes the workload is actively running on. Do this by
 * tracking the nodes from which NUMA hinting faults are triggered. This can
 * be different from the set of nodes where the workload's memory is currently
 * located.
 */
static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
	unsigned long faults, max_faults = 0;
	int nid, active_nodes = 0;

	for_each_online_node(nid) {
		faults = group_faults_cpu(numa_group, nid);
		if (faults > max_faults)
			max_faults = faults;
	}

	for_each_online_node(nid) {
		faults = group_faults_cpu(numa_group, nid);
		if (faults * ACTIVE_NODE_FRACTION > max_faults)
			active_nodes++;
	}

	numa_group->max_faults_cpu = max_faults;
	numa_group->active_nodes = active_nodes;
}

/*
 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 * increments. The more local the fault statistics are, the higher the scan
 * period will be for the next scan window. If local/(local+remote) ratio is
 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
 * the scan period will decrease. Aim for 70% local accesses.
 */
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 7

/*
 * Increase the scan period (slow down scanning) if the majority of
 * our memory is already on our local node, or if the majority of
 * the page accesses are shared with other processes.
 * Otherwise, decrease the scan period.
 */
static void update_task_scan_period(struct task_struct *p,
			unsigned long shared, unsigned long private)
{
	unsigned int period_slot;
	int lr_ratio, ps_ratio;
	int diff;

	unsigned long remote = p->numa_faults_locality[0];
	unsigned long local = p->numa_faults_locality[1];

	/*
	 * If there were no record hinting faults then either the task is
	 * completely idle or all activity is in areas that are not of interest
	 * to automatic numa balancing. Related to that, if there were failed
	 * migrations then it implies we are migrating too quickly or the local
	 * node is overloaded. In either case, scan slower.
	 */
	if (local + shared == 0 || p->numa_faults_locality[2]) {
		p->numa_scan_period = min(p->numa_scan_period_max,
			p->numa_scan_period << 1);

		p->mm->numa_next_scan = jiffies +
			msecs_to_jiffies(p->numa_scan_period);

		return;
	}

	/*
	 * Prepare to scale scan period relative to the current period.
	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
	 *	 <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
	 */
	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);

	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most memory accesses are local. There is no need to
		 * do fast NUMA scanning, since memory is already local.
		 */
		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most memory accesses are shared with other tasks.
		 * There is no point in continuing fast NUMA scanning,
		 * since other tasks may just move the memory elsewhere.
		 */
		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/*
		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
		 * yet they are not on the local NUMA node. Speed up
		 * NUMA scanning to get the memory moved over.
		 */
		int ratio = max(lr_ratio, ps_ratio);
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
	}

	p->numa_scan_period = clamp(p->numa_scan_period + diff,
			task_scan_min(p), task_scan_max(p));
	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
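
/*
 * Get the fraction of time the task has been running since the last
 * NUMA placement cycle. The scheduler keeps similar statistics, but
 * decays those on a 32ms period, which is orders of magnitude off
 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
 * stats only if the task is so new there are no NUMA statistics yet.
 */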
static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
{
	u64 runtime, delta, now;

	/* Use the start of this time slice to avoid calculations. */
	now = p->se.exec_start;
	runtime = p->se.sum_exec_runtime;

	if (p->last_task_numa_placement) {
		delta = runtime - p->last_sum_exec_runtime;
		*period = now - p->last_task_numa_placement;
	} else {
		delta = p->se.avg.load_sum;
		*period = LOAD_AVG_MAX;
	}

	p->last_sum_exec_runtime = runtime;
	p->last_task_numa_placement = now;

	return delta;
}
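
/*
 * Determine the preferred nid for a task in a numa_group. This needs to
 * be done in a way that produces consistent results with group_weight,
 * otherwise workloads might not converge.
 */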
static int preferred_group_nid(struct task_struct *p, int nid)
{
	nodemask_t nodes;
	int dist;

	/* Direct connections between all NUMA nodes. */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return nid;

	/*
	 * On a system with glueless mesh NUMA topology, group_weight
	 * scores nodes according to the number of NUMA hinting faults on
	 * both the node itself, and on nearby nodes.
	 */
	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
		unsigned long score, max_score = 0;
		int node, max_node = nid;

		dist = sched_max_numa_distance;

		for_each_online_node(node) {
			score = group_weight(p, node, dist);
			if (score > max_score) {
				max_score = score;
				max_node = node;
			}
		}
		return max_node;
	}

	/*
	 * Finding the preferred nid in a system with NUMA backplane
	 * interconnect topology is more involved. The goal is to locate
	 * tasks from numa_groups near each other in the system, and
	 * untangle workloads from different sides of the system. This requires
	 * searching down the hierarchy of node groups, recursively searching
	 * inside the highest scoring group of nodes. The nodemask tricks
	 * keep the complexity of the search down.
	 */
	nodes = node_online_map;
	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		nodemask_t max_group = NODE_MASK_NONE;
		int a, b;

		/* Are there nodes at this distance from each other? */
		if (!find_numa_distance(dist))
			continue;

		for_each_node_mask(a, nodes) {
			unsigned long faults = 0;
			nodemask_t this_group;
			nodes_clear(this_group);

			/* Sum group's NUMA faults; includes a==b case. */
			for_each_node_mask(b, nodes) {
				if (node_distance(a, b) < dist) {
					faults += group_faults(p, b);
					node_set(b, this_group);
					node_clear(b, nodes);
				}
			}

			/* Remember the top group. */
			if (faults > max_faults) {
				max_faults = faults;
				max_group = this_group;
				/*
				 * subtle: at the smallest distance there is
				 * just one node left in each "group", the
				 * winner is the preferred nid.
				 */
				nid = a;
			}
		}
		/* Next round, evaluate the nodes within max_group. */
		if (!max_faults)
			break;
		nodes = max_group;
	}
	return nid;
}

static void task_numa_placement(struct task_struct *p)
{
	int seq, nid, max_nid = -1;
	unsigned long max_faults = 0;
	unsigned long fault_types[2] = { 0, 0 };
	unsigned long total_faults;
	u64 runtime, period;
	spinlock_t *group_lock = NULL;

	/*
	 * The p->mm->numa_scan_seq field gets updated without
	 * exclusive access. Use READ_ONCE() here to ensure
	 * that the field is read in a single access:
	 */
	seq = READ_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	total_faults = p->numa_faults_locality[0] +
		       p->numa_faults_locality[1];
	runtime = numa_get_avg_runtime(p, &period);

	/* If the task is part of a group prevent parallel updates to group stats */
	if (p->numa_group) {
		group_lock = &p->numa_group->lock;
		spin_lock_irq(group_lock);
	}

	/* Find the node with the highest number of faults */
	for_each_online_node(nid) {
		/* Keep track of the offsets in numa_faults array */
		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
		unsigned long faults = 0, group_faults = 0;
		int priv;

		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
			long diff, f_diff, f_weight;

			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

			/* Decay existing window, copy faults since last scan */
			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
			fault_types[priv] += p->numa_faults[membuf_idx];
			p->numa_faults[membuf_idx] = 0;

			/*
			 * Normalize the faults_from, so all tasks in a group
			 * count according to CPU use, instead of by the raw
			 * number of faults. Tasks with little runtime have
			 * little over-all impact on throughput, and thus their
			 * faults are less important.
			 */
			f_weight = div64_u64(runtime << 16, period + 1);
			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				   (total_faults + 1);
			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
			p->numa_faults[cpubuf_idx] = 0;

			p->numa_faults[mem_idx] += diff;
			p->numa_faults[cpu_idx] += f_diff;
			faults += p->numa_faults[mem_idx];
			p->total_numa_faults += diff;
			if (p->numa_group) {
				/*
				 * safe because we can only change our own group
				 *
				 * mem_idx represents the offset for a given
				 * nid and priv in a specific region because it
				 * is at the beginning of the numa_faults array.
				 */
				p->numa_group->faults[mem_idx] += diff;
				p->numa_group->faults_cpu[mem_idx] += f_diff;
				p->numa_group->total_faults += diff;
				group_faults += p->numa_group->faults[mem_idx];
			}
		}

		if (!p->numa_group) {
			if (faults > max_faults) {
				max_faults = faults;
				max_nid = nid;
			}
		} else if (group_faults > max_faults) {
			max_faults = group_faults;
			max_nid = nid;
		}
	}

	if (p->numa_group) {
		numa_group_count_active_nodes(p->numa_group);
		spin_unlock_irq(group_lock);
		max_nid = preferred_group_nid(p, max_nid);
	}

	if (max_faults) {
		/* Set the new preferred node */
		if (max_nid != p->numa_preferred_nid)
			sched_setnuma(p, max_nid);
	}

	update_task_scan_period(p, fault_types[0], fault_types[1]);
}

static inline int get_numa_group(struct numa_group *grp)
{
	return atomic_inc_not_zero(&grp->refcount);
}

static inline void put_numa_group(struct numa_group *grp)
{
	if (atomic_dec_and_test(&grp->refcount))
		kfree_rcu(grp, rcu);
}

static void task_numa_group(struct task_struct *p, int cpupid, int flags,
			int *priv)
{
	struct numa_group *grp, *my_grp;
	struct task_struct *tsk;
	bool join = false;
	int cpu = cpupid_to_cpu(cpupid);
	int i;

	if (unlikely(!p->numa_group)) {
		unsigned int size = sizeof(struct numa_group) +
				    4*nr_node_ids*sizeof(unsigned long);

		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
		if (!grp)
			return;

		atomic_set(&grp->refcount, 1);
		grp->active_nodes = 1;
		grp->max_faults_cpu = 0;
		spin_lock_init(&grp->lock);
		grp->gid = p->pid;
		/* Second half of the array tracks nids where faults happen */
		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
						nr_node_ids;

		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] = p->numa_faults[i];

		grp->total_faults = p->total_numa_faults;

		grp->nr_tasks++;
		rcu_assign_pointer(p->numa_group, grp);
	}

	rcu_read_lock();
	tsk = READ_ONCE(cpu_rq(cpu)->curr);

	if (!cpupid_match_pid(tsk, cpupid))
		goto no_join;

	grp = rcu_dereference(tsk->numa_group);
	if (!grp)
		goto no_join;

	my_grp = p->numa_group;
	if (grp == my_grp)
		goto no_join;

	/*
	 * Only join the other group if its bigger; if we're the bigger group,
	 * the other task will join us.
	 */
	if (my_grp->nr_tasks > grp->nr_tasks)
		goto no_join;

	/*
	 * Tie-break on the grp address.
	 */
	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
		goto no_join;

	/* Always join threads in the same process. */
	if (tsk->mm == current->mm)
		join = true;

	/* Simple filter to avoid false positives due to PID collisions */
	if (flags & TNF_SHARED)
		join = true;

	/* Update priv based on whether false sharing was detected */
	*priv = !join;

	if (join && !get_numa_group(grp))
		goto no_join;

	rcu_read_unlock();

	if (!join)
		return;

	BUG_ON(irqs_disabled());
	double_lock_irq(&my_grp->lock, &grp->lock);

	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
		my_grp->faults[i] -= p->numa_faults[i];
		grp->faults[i] += p->numa_faults[i];
	}
	my_grp->total_faults -= p->total_numa_faults;
	grp->total_faults += p->total_numa_faults;

	my_grp->nr_tasks--;
	grp->nr_tasks++;

	spin_unlock(&my_grp->lock);
	spin_unlock_irq(&grp->lock);

	rcu_assign_pointer(p->numa_group, grp);

	put_numa_group(my_grp);
	return;

no_join:
	rcu_read_unlock();
	return;
}

void task_numa_free(struct task_struct *p)
{
	struct numa_group *grp = p->numa_group;
	void *numa_faults = p->numa_faults;
	unsigned long flags;
	int i;

	if (grp) {
		spin_lock_irqsave(&grp->lock, flags);
		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] -= p->numa_faults[i];
		grp->total_faults -= p->total_numa_faults;

		grp->nr_tasks--;
		spin_unlock_irqrestore(&grp->lock, flags);
		RCU_INIT_POINTER(p->numa_group, NULL);
		put_numa_group(grp);
	}

	p->numa_faults = NULL;
	kfree(numa_faults);
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
	struct task_struct *p = current;
	bool migrated = flags & TNF_MIGRATED;
	int cpu_node = task_node(current);
	int local = !!(flags & TNF_FAULT_LOCAL);
	struct numa_group *ng;
	int priv;

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	/* for example, ksmd faulting in a user's mm */
	if (!p->mm)
		return;

	/* Allocate buffer to track faults on a per-node basis */
	if (unlikely(!p->numa_faults)) {
		int size = sizeof(*p->numa_faults) *
			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
		if (!p->numa_faults)
			return;

		p->total_numa_faults = 0;
		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
	}

	/*
	 * First accesses are treated as private, otherwise consider accesses
	 * to be private if the accessing pid has not changed
	 */
	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
		priv = 1;
	} else {
		priv = cpupid_match_pid(p, last_cpupid);
		if (!priv && !(flags & TNF_NO_GROUP))
			task_numa_group(p, last_cpupid, flags, &priv);
	}

	/*
	 * If a workload spans multiple NUMA nodes, a shared fault that
	 * occurs wholly within the set of nodes that the workload is
	 * actively using should be counted as local. This allows the
	 * scan rate to slow down when a workload has settled down.
	 */
	ng = p->numa_group;
	if (!priv && !local && ng && ng->active_nodes > 1 &&
				numa_is_active_node(cpu_node, ng) &&
				numa_is_active_node(mem_node, ng))
		local = 1;

	/*
	 * Retry to migrate task to preferred node periodically, in case it
	 * previously failed, or the scheduler moved us.
	 */
	if (time_after(jiffies, p->numa_migrate_retry)) {
		task_numa_placement(p);
		numa_migrate_preferred(p);
	}

	if (migrated)
		p->numa_pages_migrated += pages;
	if (flags & TNF_MIGRATE_FAIL)
		p->numa_faults_locality[2] += pages;

	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
	p->numa_faults_locality[local] += pages;
}

static void reset_ptenuma_scan(struct task_struct *p)
{
	/*
	 * We only did a read acquisition of the mmap sem, so
	 * p->mm->numa_scan_seq is written to without exclusive access
	 * and the update is not guaranteed to be atomic. That's not
	 * much of an issue though, since this is just used for
	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
	 * expensive, to avoid any form of compiler optimizations:
	 */
	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
	p->mm->numa_scan_offset = 0;
}
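
/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */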
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	u64 runtime = p->se.sum_exec_runtime;
	struct vm_area_struct *vma;
	unsigned long start, end;
	unsigned long nr_pte_updates = 0;
	long pages, virtpages;

	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));

	work->next = work;
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	if (!mm->numa_next_scan) {
		mm->numa_next_scan = now +
			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0) {
		p->numa_scan_period_max = task_scan_max(p);
		p->numa_scan_period = task_scan_start(p);
	}

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Delay this task enough that another task of this mm will likely win
	 * the next time around.
	 */
	p->node_stamp += 2 * TICK_NSEC;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	virtpages = pages * 8;	   /* Scan up to this much virtual space */
	if (!pages)
		return;

	if (!down_read_trylock(&mm->mmap_sem))
		return;
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
			continue;
		}

		/*
		 * Shared library pages mapped by multiple processes are not
		 * migrated as it is expected they are cache replicated. Avoid
		 * hinting faults in read-only file-backed mappings or the vdso
		 * as migrating the pages will be of marginal benefit.
		 */
		if (!vma->vm_mm ||
		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
			continue;

		/*
		 * Skip inaccessible VMAs to avoid any confusion between
		 * PROT_NONE and NUMA hinting ptes
		 */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			nr_pte_updates = change_prot_numa(vma, start, end);

			/*
			 * Try to scan sysctl_numa_balancing_size worth of
			 * hpages that have at least one present PTE that
			 * is not already pte-numa. If the VMA contains
			 * areas that are unused or already full of prot_numa
			 * PTEs, scan up to virtpages, to skip through those
			 * areas faster.
			 */
			if (nr_pte_updates)
				pages -= (end - start) >> PAGE_SHIFT;
			virtpages -= (end - start) >> PAGE_SHIFT;

			start = end;
			if (pages <= 0 || virtpages <= 0)
				goto out;

			cond_resched();
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);

	/*
	 * Make sure tasks use at least 32x as much time to run other code
	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
	 * Usually update_task_scan_period slows down scanning enough; on an
	 * overloaded system we need to limit overhead on a per task basis.
	 */
	if (unlikely(p->se.sum_exec_runtime != runtime)) {
		u64 diff = p->se.sum_exec_runtime - runtime;
		p->node_stamp += 32 * diff;
	}
}
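
/*
 * Drive the periodic memory faults..
 */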
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now > curr->node_stamp + period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = task_scan_start(curr);
		curr->node_stamp += period;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work);
			task_work_add(curr, work, true);
		}
	}
}

static void update_scan_period(struct task_struct *p, int new_cpu)
{
	int src_nid = cpu_to_node(task_cpu(p));
	int dst_nid = cpu_to_node(new_cpu);

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
		return;

	if (src_nid == dst_nid)
		return;

	/*
	 * Allow resets if faults have been trapped before one scan
	 * has completed. This is most likely due to a new task that
	 * is pulled cross-node due to wakeups or load balancing.
	 */
	if (p->numa_scan_seq) {
		/*
		 * Avoid scan adjustments if moving to the preferred
		 * node or if the task was not previously running on
		 * the preferred node.
		 */
		if (dst_nid == p->numa_preferred_nid ||
		    (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
			return;
	}

	p->numa_scan_period = task_scan_start(p);
}

#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}

static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}

static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}

static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}

#endif /* CONFIG_NUMA_BALANCING */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se)) {
		struct rq *rq = rq_of(cfs_rq);

		account_numa_enqueue(rq, task_of(se));
		list_add(&se->group_node, &rq->cfs_tasks);
	}
#endif
	cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se)) {
		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
		list_del_init(&se->group_node);
	}
#endif
	cfs_rq->nr_running--;
}
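
/*
 * Signed add and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */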
2729#define add_positive(_ptr, _val) do { \
2730 typeof(_ptr) ptr = (_ptr); \
2731 typeof(_val) val = (_val); \
2732 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2733 \
2734 res = var + val; \
2735 \
2736 if (val < 0 && res > var) \
2737 res = 0; \
2738 \
2739 WRITE_ONCE(*ptr, res); \
2740} while (0)
2741
2742
2743
2744
2745
2746
2747
2748
2749#define sub_positive(_ptr, _val) do { \
2750 typeof(_ptr) ptr = (_ptr); \
2751 typeof(*ptr) val = (_val); \
2752 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2753 res = var - val; \
2754 if (res > var) \
2755 res = 0; \
2756 WRITE_ONCE(*ptr, res); \
2757} while (0)
2758
2759#ifdef CONFIG_SMP
2760static inline void
2761enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
2763 cfs_rq->runnable_weight += se->runnable_weight;
2764
2765 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2766 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2767}
2768
2769static inline void
2770dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2771{
2772 cfs_rq->runnable_weight -= se->runnable_weight;
2773
2774 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2775 sub_positive(&cfs_rq->avg.runnable_load_sum,
2776 se_runnable(se) * se->avg.runnable_load_sum);
2777}
2778
2779static inline void
2780enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2781{
2782 cfs_rq->avg.load_avg += se->avg.load_avg;
2783 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2784}
2785
2786static inline void
2787dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2788{
2789 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2790 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2791}
2792#else
2793static inline void
2794enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2795static inline void
2796dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2797static inline void
2798enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2799static inline void
2800dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2801#endif
2802
2803static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2804 unsigned long weight, unsigned long runnable)
2805{
2806 if (se->on_rq) {
 /* commit outstanding execution time */
2808 if (cfs_rq->curr == se)
2809 update_curr(cfs_rq);
2810 account_entity_dequeue(cfs_rq, se);
2811 dequeue_runnable_load_avg(cfs_rq, se);
2812 }
2813 dequeue_load_avg(cfs_rq, se);
2814
2815 se->runnable_weight = runnable;
2816 update_load_set(&se->load, weight);
2817
2818#ifdef CONFIG_SMP
2819 do {
2820 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2821
2822 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2823 se->avg.runnable_load_avg =
2824 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2825 } while (0);
2826#endif
2827
2828 enqueue_load_avg(cfs_rq, se);
2829 if (se->on_rq) {
2830 account_entity_enqueue(cfs_rq, se);
2831 enqueue_runnable_load_avg(cfs_rq, se);
2832 }
2833}
2834
2835void reweight_task(struct task_struct *p, int prio)
2836{
2837 struct sched_entity *se = &p->se;
2838 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2839 struct load_weight *load = &se->load;
2840 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2841
2842 reweight_entity(cfs_rq, se, weight, weight);
2843 load->inv_weight = sched_prio_to_wmult[prio];
2844}
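
/*
 * Illustration (values from the default sched_prio_to_weight[] table, not
 * original source text): reweight_task() is how a nice-level change reaches
 * CFS accounting. Nice 0 maps to weight 1024, nice -20 to 88761 and nice 19
 * to 15, so renicing a task from 0 to 19 shrinks its share of CPU time to
 * roughly 15/1024 of a nice-0 task on the same runqueue.
 */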
2845
2846#ifdef CONFIG_FAIR_GROUP_SCHED
2847#ifdef CONFIG_SMP
/*
 * All this does is approximate the hierarchical proportion which includes
 * that global sum we all love to hate.
 *
 * That is, the weight of a group entity is the proportional share of the
 * group weight based on the group runqueue weights:
 *
 *                     tg->weight * grq->load.weight
 *   ge->load.weight = -----------------------------               (1)
 *                       \Sum grq->load.weight
 *
 * Because computing that sum is prohibitively expensive, we approximate it
 * with the averages, which move slower and are therefore cheaper and more
 * stable:
 *
 *   grq->load.weight -> grq->avg.load_avg                          (2)
 *
 * which yields:
 *
 *                     tg->weight * grq->avg.load_avg
 *   ge->load.weight = ------------------------------              (3)
 *                             tg->load_avg
 *
 * where: tg->load_avg ~= \Sum grq->avg.load_avg
 *
 * The problem is that, because the average is slow -- by design -- this
 * leads to transients in boundary conditions, notably when the group was
 * idle and we start one task: it takes time for our CPU's grq->avg.load_avg
 * to build up, yielding bad latency. In that special case (1) collapses to
 * tg->weight (the UP scenario), so we modify (3) to approach that in the
 * (near) UP case:
 *
 *                           tg->weight * grq->load.weight
 *   ge->load.weight = ---------------------------------------------------
 *                     tg->load_avg - grq->avg.load_avg + grq->load.weight
 *
 * And because grq->load.weight can drop to 0, resulting in a divide by
 * zero, we use grq->avg.load_avg as its lower bound. The result
 * consistently misestimates the absolute value of the ratio, but it
 * converges to (3) in the normal case and to tg->weight in the (near) UP
 * case, which is more than likely a better ratio.
 */
2921static long calc_group_shares(struct cfs_rq *cfs_rq)
2922{
2923 long tg_weight, tg_shares, load, shares;
2924 struct task_group *tg = cfs_rq->tg;
2925
2926 tg_shares = READ_ONCE(tg->shares);
2927
2928 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
2929
2930 tg_weight = atomic_long_read(&tg->load_avg);

 /* Ensure tg_weight >= load */
2933 tg_weight -= cfs_rq->tg_load_avg_contrib;
2934 tg_weight += load;
2935
2936 shares = (tg_shares * load);
2937 if (tg_weight)
2938 shares /= tg_weight;
2939
 /*
  * MIN_SHARES has to be unscaled here to support per-CPU partitioning
  * of a group with small tg->shares value. It is a floor value which is
  * assigned as a minimum load.weight to the sched_entity representing
  * the group on a CPU.
  *
  * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
  * on an 8-core system with 8 tasks each runnable on one CPU, shares
  * have to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024.
  * In case no task is runnable on a CPU, MIN_SHARES=2 should be returned
  * instead of 0.
  */
2952 return clamp_t(long, shares, MIN_SHARES, tg_shares);
2953}
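
/*
 * Worked example (illustrative numbers, not original source text): with
 * tg->shares = 1024 and two CPUs carrying equal load, tg_weight ~= 2 * load,
 * so each group entity gets shares = 1024 * load / (2 * load) = 512, i.e.
 * the group's weight is split evenly across the CPUs it runs on.
 */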
2954
/*
 * Compute the effective runnable weight of a group entity: scale the shares
 * computed above by the fraction of the group runqueue's load that is
 * actually runnable (as opposed to blocked):
 *
 *                                      grq->avg.runnable_load_avg
 *   ge->runnable_weight = ge->shares * --------------------------
 *                                          grq->avg.load_avg
 *
 * Using the max() of the instantaneous weights and the averages on both
 * sides of the ratio keeps it stable through the same boundary conditions
 * described for calc_group_shares() above.
 */
2982static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2983{
2984 long runnable, load_avg;
2985
2986 load_avg = max(cfs_rq->avg.load_avg,
2987 scale_load_down(cfs_rq->load.weight));
2988
2989 runnable = max(cfs_rq->avg.runnable_load_avg,
2990 scale_load_down(cfs_rq->runnable_weight));
2991
2992 runnable *= shares;
2993 if (load_avg)
2994 runnable /= load_avg;
2995
2996 return clamp_t(long, runnable, MIN_SHARES, shares);
2997}
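
/*
 * Illustration (made-up numbers): if shares = 512 and only half of the
 * group runqueue's load is runnable (runnable_load_avg = 100 against
 * load_avg = 200), the group entity's runnable weight becomes
 * 512 * 100 / 200 = 256, clamped to [MIN_SHARES, shares].
 */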
2998#endif
2999
3000static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3001
/*
 * Recomputes the group entity based on the current state of its group
 * runqueue.
 */
3006static void update_cfs_group(struct sched_entity *se)
3007{
3008 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3009 long shares, runnable;
3010
3011 if (!gcfs_rq)
3012 return;
3013
3014 if (throttled_hierarchy(gcfs_rq))
3015 return;
3016
3017#ifndef CONFIG_SMP
3018 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3019
3020 if (likely(se->load.weight == shares))
3021 return;
3022#else
3023 shares = calc_group_shares(gcfs_rq);
3024 runnable = calc_group_runnable(gcfs_rq, shares);
3025#endif
3026
3027 reweight_entity(cfs_rq_of(se), se, shares, runnable);
3028}
3029
3030#else
3031static inline void update_cfs_group(struct sched_entity *se)
3032{
3033}
3034#endif
3035
3036static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3037{
3038 struct rq *rq = rq_of(cfs_rq);
3039
3040 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
 /*
  * There are a few boundary cases this might miss but it should
  * get called often enough that that should (hopefully) not be
  * a real problem.
  *
  * It will not get called when we go idle, because the idle
  * thread is a different class (!fair), nor will the utilization
  * number include things like RT tasks.
  *
  * As is, the util number is not freq-invariant (we'd have to
  * implement arch_scale_freq_capacity() for that).
  *
  * See cpu_util().
  */
3055 cpufreq_update_util(rq, flags);
3056 }
3057}
3058
3059#ifdef CONFIG_SMP
3060#ifdef CONFIG_FAIR_GROUP_SCHED
/**
 * update_tg_load_avg - update the tg's load avg
 * @cfs_rq: the cfs_rq whose avg changed
 * @force: update regardless of how small the difference
 *
 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
 * However, because tg->load_avg is a global value there are performance
 * considerations.
 *
 * In order to avoid having to look at the other cfs_rq's, we use a
 * differential update where we store the last value we propagated. This in
 * turn allows skipping updates if the differential is 'small'.
 *
 * Updating tg's load_avg is necessary before update_cfs_group().
 */
3076static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3077{
3078 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3079
 /*
  * No need to update load_avg for root_task_group as it is not used.
  */
3083 if (cfs_rq->tg == &root_task_group)
3084 return;
3085
3086 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3087 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3088 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3089 }
3090}
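
/*
 * Illustration (made-up numbers): with tg_load_avg_contrib == 6400, the
 * differential filter above only touches the contended global tg->load_avg
 * once this cfs_rq's load_avg drifts by more than 6400 / 64 == 100, trading
 * a little accuracy for far fewer atomic read-modify-writes on the shared
 * cacheline.
 */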
3091
/*
 * Called within set_task_rq() right before setting a task's CPU. The
 * caller only guarantees p->pi_lock is held; no other assumptions,
 * including the state of rq->lock, should be made.
 */
3097void set_task_rq_fair(struct sched_entity *se,
3098 struct cfs_rq *prev, struct cfs_rq *next)
3099{
3100 u64 p_last_update_time;
3101 u64 n_last_update_time;
3102
3103 if (!sched_feat(ATTACH_AGE_LOAD))
3104 return;
3105
 /*
  * We are supposed to update the task to "current" time, then its up to
  * date and ready to go to new CPU/cfs_rq. But we have difficulty in
  * getting what current time is, so simply throw away the out-of-date
  * time. This will result in the wakee task is less decayed, but giving
  * the wakee more load sounds not bad.
  */
3113 if (!(se->avg.last_update_time && prev))
3114 return;
3115
3116#ifndef CONFIG_64BIT
3117 {
3118 u64 p_last_update_time_copy;
3119 u64 n_last_update_time_copy;
3120
3121 do {
3122 p_last_update_time_copy = prev->load_last_update_time_copy;
3123 n_last_update_time_copy = next->load_last_update_time_copy;
3124
3125 smp_rmb();
3126
3127 p_last_update_time = prev->avg.last_update_time;
3128 n_last_update_time = next->avg.last_update_time;
3129
3130 } while (p_last_update_time != p_last_update_time_copy ||
3131 n_last_update_time != n_last_update_time_copy);
3132 }
3133#else
3134 p_last_update_time = prev->avg.last_update_time;
3135 n_last_update_time = next->avg.last_update_time;
3136#endif
3137 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3138 se->avg.last_update_time = n_last_update_time;
3139}
3140
/*
 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need
 * to propagate its contribution. The key to this propagation is the
 * invariant that for each group:
 *
 *   ge->avg == grq->avg                                            (1)
 *
 * _IFF_ we look at the pure running and runnable sums, because they
 * represent the very same entity, just at different points in the
 * hierarchy.
 *
 * Per the above, update_tg_cfs_util() is trivial and simply copies the
 * running sum over (but still wrong, because the group entity and group rq
 * do not have their PELT windows aligned).
 *
 * update_tg_cfs_runnable() is more complex. Since
 *
 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg      (2)
 *
 * one might expect, per (1):
 *
 *                      ge->load.weight * grq->avg.load_avg
 *   ge->avg.load_avg = -----------------------------------        (3)
 *                               grq->load.weight
 *
 * Except that is wrong! Runqueues, unlike entities, need a load_avg that
 * includes the historical weights: those represent the blocked load, the
 * load we expect to (shortly) return to us. That only works by keeping the
 * weights an integral part of the sum, so we cannot decompose as per (3).
 *
 * Another reason is that runnable isn't a 0-sum entity: with two tasks
 * each runnable 2/3 of the time, the rq as a whole is runnable anywhere
 * between 2/3 and 1, depending on how their runnable sections overlap.
 *
 * So we approximate. Given the constraint:
 *
 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
 *
 * we add runnable to an rq assuming minimal overlap, and on removal we
 * assume each task is equally runnable, which yields:
 *
 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
 */
3210static inline void
3211update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3212{
3213 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3214
 /* Nothing to update */
3216 if (!delta)
3217 return;
3218
 /*
  * The relation between sum and avg is:
  *
  *   LOAD_AVG_MAX - 1024 + sa->period_contrib
  *
  * however, the PELT windows are not aligned between grq and gse.
  */

 /* Set new sched_entity's utilization */
3228 se->avg.util_avg = gcfs_rq->avg.util_avg;
3229 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3230
 /* Update parent cfs_rq utilization */
3232 add_positive(&cfs_rq->avg.util_avg, delta);
3233 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3234}
3235
3236static inline void
3237update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3238{
3239 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3240 unsigned long runnable_load_avg, load_avg;
3241 u64 runnable_load_sum, load_sum = 0;
3242 s64 delta_sum;
3243
3244 if (!runnable_sum)
3245 return;
3246
3247 gcfs_rq->prop_runnable_sum = 0;
3248
3249 if (runnable_sum >= 0) {
 /*
  * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
  * the CPU is saturated running == runnable.
  */
3254 runnable_sum += se->avg.load_sum;
3255 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3256 } else {
 /*
  * Estimate the new unweighted runnable_sum of the gcfs_rq by
  * assuming all tasks are equally runnable.
  */
3261 if (scale_load_down(gcfs_rq->load.weight)) {
3262 load_sum = div_s64(gcfs_rq->avg.load_sum,
3263 scale_load_down(gcfs_rq->load.weight));
3264 }
3265
 /* But make sure to not inflate se's runnable */
3267 runnable_sum = min(se->avg.load_sum, load_sum);
3268 }
3269
 /*
  * runnable_sum can't be lower than running_sum. The running sum is
  * scaled with CPU capacity whereas the runnable sum is not, so
  * rescale the running sum first.
  */
3275 running_sum = se->avg.util_sum /
3276 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3277 runnable_sum = max(runnable_sum, running_sum);
3278
3279 load_sum = (s64)se_weight(se) * runnable_sum;
3280 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3281
3282 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3283 delta_avg = load_avg - se->avg.load_avg;
3284
3285 se->avg.load_sum = runnable_sum;
3286 se->avg.load_avg = load_avg;
3287 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3288 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3289
3290 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3291 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3292 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3293 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3294
3295 se->avg.runnable_load_sum = runnable_sum;
3296 se->avg.runnable_load_avg = runnable_load_avg;
3297
3298 if (se->on_rq) {
3299 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3300 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3301 }
3302}
3303
3304static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3305{
3306 cfs_rq->propagate = 1;
3307 cfs_rq->prop_runnable_sum += runnable_sum;
3308}
3309
/* Update task and its cfs_rq load average */
3311static inline int propagate_entity_load_avg(struct sched_entity *se)
3312{
3313 struct cfs_rq *cfs_rq, *gcfs_rq;
3314
3315 if (entity_is_task(se))
3316 return 0;
3317
3318 gcfs_rq = group_cfs_rq(se);
3319 if (!gcfs_rq->propagate)
3320 return 0;
3321
3322 gcfs_rq->propagate = 0;
3323
3324 cfs_rq = cfs_rq_of(se);
3325
3326 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3327
3328 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3329 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3330
3331 return 1;
3332}
3333
/*
 * Check if we need to update the load and the utilization of a blocked
 * load_avg which can be propagated in the hierarchy.
 */
3338static inline bool skip_blocked_update(struct sched_entity *se)
3339{
3340 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3341
 /*
  * If sched_entity still have not zero load or utilization, we have to
  * decay it:
  */
3346 if (se->avg.load_avg || se->avg.util_avg)
3347 return false;
3348
 /*
  * If there is a pending propagation, we have to update the load and
  * the utilization of the sched_entity:
  */
3353 if (gcfs_rq->propagate)
3354 return false;
3355
 /*
  * Otherwise, the load and the utilization of the sched_entity is
  * already zero and there is no pending propagation, so it will be a
  * waste of time to try to decay it:
  */
3361 return true;
3362}
3363
3364#else
3365
3366static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3367
3368static inline int propagate_entity_load_avg(struct sched_entity *se)
3369{
3370 return 0;
3371}
3372
3373static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3374
3375#endif
3376
/**
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_task()
 * @cfs_rq: cfs_rq to update
 *
 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 * avg. The immediate corollary is that all (fair) tasks must be attached, see
 * post_init_entity_util_avg().
 *
 * cfs_rq->avg is used for task_h_load() and update_cfs_group() for example.
 *
 * Returns true if the load decayed or we removed load.
 *
 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 * call update_tg_load_avg() when this function returns true.
 */
3393static inline int
3394update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3395{
3396 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3397 struct sched_avg *sa = &cfs_rq->avg;
3398 int decayed = 0;
3399
3400 if (cfs_rq->removed.nr) {
3401 unsigned long r;
3402 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3403
3404 raw_spin_lock(&cfs_rq->removed.lock);
3405 swap(cfs_rq->removed.util_avg, removed_util);
3406 swap(cfs_rq->removed.load_avg, removed_load);
3407 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3408 cfs_rq->removed.nr = 0;
3409 raw_spin_unlock(&cfs_rq->removed.lock);
3410
3411 r = removed_load;
3412 sub_positive(&sa->load_avg, r);
3413 sub_positive(&sa->load_sum, r * divider);
3414
3415 r = removed_util;
3416 sub_positive(&sa->util_avg, r);
3417 sub_positive(&sa->util_sum, r * divider);
3418
3419 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3420
3421 decayed = 1;
3422 }
3423
3424 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
3425
3426#ifndef CONFIG_64BIT
3427 smp_wmb();
3428 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3429#endif
3430
3431 if (decayed)
3432 cfs_rq_util_change(cfs_rq, 0);
3433
3434 return decayed;
3435}
3436
/**
 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 * @cfs_rq: cfs_rq to attach to
 * @se: sched_entity to attach
 * @flags: migration hints
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
3446static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3447{
3448 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3449
 /*
  * When we attach the @se to the @cfs_rq, we must align the decay
  * window because without that, really weird and wonderful things can
  * happen.
  */
3457 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3458 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3459
 /*
  * We need to recompute the _sum values based on the new
  * period_contrib. This isn't strictly correct, but since we're
  * entirely outside of the PELT hierarchy, nobody cares if we
  * truncate a bit.
  */
3466 se->avg.util_sum = se->avg.util_avg * divider;
3467
3468 se->avg.load_sum = divider;
3469 if (se_weight(se)) {
3470 se->avg.load_sum =
3471 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3472 }
3473
3474 se->avg.runnable_load_sum = se->avg.load_sum;
3475
3476 enqueue_load_avg(cfs_rq, se);
3477 cfs_rq->avg.util_avg += se->avg.util_avg;
3478 cfs_rq->avg.util_sum += se->avg.util_sum;
3479
3480 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3481
3482 cfs_rq_util_change(cfs_rq, flags);
3483}
3484
/**
 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 * @cfs_rq: cfs_rq to detach from
 * @se: sched_entity to detach
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
3493static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3494{
3495 dequeue_load_avg(cfs_rq, se);
3496 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3497 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3498
3499 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3500
3501 cfs_rq_util_change(cfs_rq, 0);
3502}
3503
/*
 * Optional action to be done while updating the load average
 */
3507#define UPDATE_TG 0x1
3508#define SKIP_AGE_LOAD 0x2
3509#define DO_ATTACH 0x4
3510
/* Update task and its cfs_rq load average */
3512static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3513{
3514 u64 now = cfs_rq_clock_task(cfs_rq);
3515 struct rq *rq = rq_of(cfs_rq);
3516 int cpu = cpu_of(rq);
3517 int decayed;
3518
 /*
  * Track task load average for carrying it to new CPU after migrated, and
  * track group sched_entity load average for task_h_load calc in migration
  */
3523 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3524 __update_load_avg_se(now, cpu, cfs_rq, se);
3525
3526 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3527 decayed |= propagate_entity_load_avg(se);
3528
3529 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
 /*
  * DO_ATTACH means we're here from enqueue_entity().
  * !last_update_time means we've passed through
  * migrate_task_rq_fair() indicating we migrated.
  *
  * IOW we're enqueueing a task on a new CPU.
  */
3538 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3539 update_tg_load_avg(cfs_rq, 0);
3540
3541 } else if (decayed && (flags & UPDATE_TG))
3542 update_tg_load_avg(cfs_rq, 0);
3543}
3544
3545#ifndef CONFIG_64BIT
3546static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3547{
3548 u64 last_update_time_copy;
3549 u64 last_update_time;
3550
3551 do {
3552 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3553 smp_rmb();
3554 last_update_time = cfs_rq->avg.last_update_time;
3555 } while (last_update_time != last_update_time_copy);
3556
3557 return last_update_time;
3558}
3559#else
3560static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3561{
3562 return cfs_rq->avg.last_update_time;
3563}
3564#endif
3565
/*
 * Synchronize entity load avg of dequeued entity without locking
 * the previous rq.
 */
3570void sync_entity_load_avg(struct sched_entity *se)
3571{
3572 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3573 u64 last_update_time;
3574
3575 last_update_time = cfs_rq_last_update_time(cfs_rq);
3576 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
3577}
3578
/*
 * Task first catches up with cfs_rq, and then subtract
 * itself from the cfs_rq (task must be off the queue now).
 */
3583void remove_entity_load_avg(struct sched_entity *se)
3584{
3585 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3586 unsigned long flags;
3587
 /*
  * tasks cannot exit without having gone through wake_up_new_task() ->
  * post_init_entity_util_avg() which will have added things to the
  * cfs_rq, so we can remove unconditionally.
  *
  * Similarly for groups, they will have passed through
  * post_init_entity_util_avg() before unregister_sched_fair_group()
  * calls this.
  */
3598 sync_entity_load_avg(se);
3599
3600 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3601 ++cfs_rq->removed.nr;
3602 cfs_rq->removed.util_avg += se->avg.util_avg;
3603 cfs_rq->removed.load_avg += se->avg.load_avg;
3604 cfs_rq->removed.runnable_sum += se->avg.load_sum;
3605 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3606}
3607
3608static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3609{
3610 return cfs_rq->avg.runnable_load_avg;
3611}
3612
3613static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3614{
3615 return cfs_rq->avg.load_avg;
3616}
3617
3618static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3619
3620static inline unsigned long task_util(struct task_struct *p)
3621{
3622 return READ_ONCE(p->se.avg.util_avg);
3623}
3624
3625static inline unsigned long _task_util_est(struct task_struct *p)
3626{
3627 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3628
3629 return max(ue.ewma, ue.enqueued);
3630}
3631
3632static inline unsigned long task_util_est(struct task_struct *p)
3633{
3634 return max(task_util(p), _task_util_est(p));
3635}
3636
3637static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3638 struct task_struct *p)
3639{
3640 unsigned int enqueued;
3641
3642 if (!sched_feat(UTIL_EST))
3643 return;
3644
 /* Update root cfs_rq's estimated utilization */
3646 enqueued = cfs_rq->avg.util_est.enqueued;
3647 enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3648 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3649}
3650
/*
 * Check if a (signed) value is within a specified (unsigned) margin,
 * based on the observation that:
 *
 *	abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 *
 * NOTE: this only works when value + margin < INT_MAX.
 */
3659static inline bool within_margin(int value, int margin)
3660{
3661 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3662}
3663
3664static void
3665util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3666{
3667 long last_ewma_diff;
3668 struct util_est ue;
3669
3670 if (!sched_feat(UTIL_EST))
3671 return;
3672
 /* Update root cfs_rq's estimated utilization */
3674 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3675 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3676 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3677 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3678
 /*
  * Skip update of task's estimated utilization when the task has not
  * yet completed an activation, e.g. being migrated.
  */
3683 if (!task_sleep)
3684 return;
3685
 /*
  * If the PELT values haven't changed since enqueue time,
  * skip the util_est update.
  */
3690 ue = p->se.avg.util_est;
3691 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3692 return;
3693
 /*
  * Skip update of task's estimated utilization when its EWMA is
  * already ~1% close to its last activation value.
  */
3698 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3699 last_ewma_diff = ue.enqueued - ue.ewma;
3700 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3701 return;
3702
 /*
  * Update Task's estimated utilization
  *
  * When *p completes an activation we can consolidate another sample
  * about the task size. This is done by storing the current PELT value
  * as ue.enqueued and by using this value to update the Exponential
  * Weighted Moving Average (EWMA):
  *
  *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
  *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
  *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
  *          = w * (      last_ewma_diff            ) +     ewma(t-1)
  *          = w * (last_ewma_diff  +  ewma(t-1) / w)
  *
  * Where 'w' is the weight of new samples, which is configured to be
  * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
  */
3720 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3721 ue.ewma += last_ewma_diff;
3722 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3723 WRITE_ONCE(p->se.avg.util_est, ue);
3724}
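
/*
 * Worked example (illustrative numbers): with w = 1/4, ewma = 400 and a new
 * sample of 480, last_ewma_diff = 80 and the shift sequence in
 * util_est_dequeue() above computes ((400 << 2) + 80) >> 2 = 1680 >> 2 = 420,
 * i.e. 0.75 * 400 + 0.25 * 480.
 */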
3725
3726#else
3727
3728#define UPDATE_TG 0x0
3729#define SKIP_AGE_LOAD 0x0
3730#define DO_ATTACH 0x0
3731
3732static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3733{
3734 cfs_rq_util_change(cfs_rq, 0);
3735}
3736
3737static inline void remove_entity_load_avg(struct sched_entity *se) {}
3738
3739static inline void
3740attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3741static inline void
3742detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3743
3744static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3745{
3746 return 0;
3747}
3748
3749static inline void
3750util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3751
3752static inline void
3753util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3754 bool task_sleep) {}
3755
3756#endif
3757
3758static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3759{
3760#ifdef CONFIG_SCHED_DEBUG
3761 s64 d = se->vruntime - cfs_rq->min_vruntime;
3762
3763 if (d < 0)
3764 d = -d;
3765
3766 if (d > 3*sysctl_sched_latency)
3767 schedstat_inc(cfs_rq->nr_spread_over);
3768#endif
3769}
3770
3771static void
3772place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3773{
3774 u64 vruntime = cfs_rq->min_vruntime;
3775
 /*
  * The 'current' period is already promised to the current tasks,
  * however the extra weight of the new task will slow them down a
  * little, place the new task so that it fits in the slot that
  * stays open at the end.
  */
3782 if (initial && sched_feat(START_DEBIT))
3783 vruntime += sched_vslice(cfs_rq, se);
3784
 /* sleeps up to a single latency don't count. */
3786 if (!initial) {
3787 unsigned long thresh = sysctl_sched_latency;
3788
 /*
  * Halve their sleep time's effect, to allow
  * for a gentler effect of sleepers:
  */
3793 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3794 thresh >>= 1;
3795
3796 vruntime -= thresh;
3797 }
3798
 /* ensure we never gain time by being placed backwards. */
3800 se->vruntime = max_vruntime(se->vruntime, vruntime);
3801}
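
/*
 * Illustration (default tunables, no SMP scaling; not original source text):
 * a task waking from sleep is placed at min_vruntime - sched_latency/2 =
 * min_vruntime - 3ms with GENTLE_FAIR_SLEEPERS, giving it a modest head
 * start over already-queued tasks; the final max_vruntime() keeps a
 * briefly-slept task from banking more credit than it is owed.
 */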
3802
3803static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3804
3805static inline void check_schedstat_required(void)
3806{
3807#ifdef CONFIG_SCHEDSTATS
3808 if (schedstat_enabled())
3809 return;
3810
 /* Warn once if schedstats tracepoints are active without schedstats */
3812 if (trace_sched_stat_wait_enabled() ||
3813 trace_sched_stat_sleep_enabled() ||
3814 trace_sched_stat_iowait_enabled() ||
3815 trace_sched_stat_blocked_enabled() ||
3816 trace_sched_stat_runtime_enabled()) {
3817 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3818 "stat_blocked and stat_runtime require the "
3819 "kernel parameter schedstats=enable or "
3820 "kernel.sched_schedstats=1\n");
3821 }
3822#endif
3823}
3824
/*
 * MIGRATION
 *
 *	dequeue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way the vruntime transition between RQs is done when both
 * min_vruntime are up-to-date.
 *
 * WAKEUP (remote)
 *
 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way we don't have the most up-to-date min_vruntime on the originating
 * CPU and an up-to-date min_vruntime on the destination CPU.
 */
3856static void
3857enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3858{
3859 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3860 bool curr = cfs_rq->curr == se;
3861
 /*
  * If we're the current task, we must renormalise before calling
  * update_curr().
  */
3866 if (renorm && curr)
3867 se->vruntime += cfs_rq->min_vruntime;
3868
3869 update_curr(cfs_rq);
3870
 /*
  * Otherwise, renormalise after, such that we're placed at the current
  * moment in time, instead of some random moment in the past. Being
  * placed in the past could significantly boost this task to the
  * fairness detriment of existing tasks.
  */
3877 if (renorm && !curr)
3878 se->vruntime += cfs_rq->min_vruntime;
3879
 /*
  * When enqueuing a sched_entity, we must:
  *   - Update loads to have both entity and cfs_rq synced with now.
  *   - Add its load to cfs_rq->runnable_avg.
  *   - For group_entity, update its weight to reflect the new share of
  *     its group cfs_rq
  *   - Add its new weight to cfs_rq->load.weight
  */
3888 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
3889 update_cfs_group(se);
3890 enqueue_runnable_load_avg(cfs_rq, se);
3891 account_entity_enqueue(cfs_rq, se);
3892
3893 if (flags & ENQUEUE_WAKEUP)
3894 place_entity(cfs_rq, se, 0);
3895
3896 check_schedstat_required();
3897 update_stats_enqueue(cfs_rq, se, flags);
3898 check_spread(cfs_rq, se);
3899 if (!curr)
3900 __enqueue_entity(cfs_rq, se);
3901 se->on_rq = 1;
3902
3903 if (cfs_rq->nr_running == 1) {
3904 list_add_leaf_cfs_rq(cfs_rq);
3905 check_enqueue_throttle(cfs_rq);
3906 }
3907}
3908
3909static void __clear_buddies_last(struct sched_entity *se)
3910{
3911 for_each_sched_entity(se) {
3912 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3913 if (cfs_rq->last != se)
3914 break;
3915
3916 cfs_rq->last = NULL;
3917 }
3918}
3919
3920static void __clear_buddies_next(struct sched_entity *se)
3921{
3922 for_each_sched_entity(se) {
3923 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3924 if (cfs_rq->next != se)
3925 break;
3926
3927 cfs_rq->next = NULL;
3928 }
3929}
3930
3931static void __clear_buddies_skip(struct sched_entity *se)
3932{
3933 for_each_sched_entity(se) {
3934 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3935 if (cfs_rq->skip != se)
3936 break;
3937
3938 cfs_rq->skip = NULL;
3939 }
3940}
3941
3942static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3943{
3944 if (cfs_rq->last == se)
3945 __clear_buddies_last(se);
3946
3947 if (cfs_rq->next == se)
3948 __clear_buddies_next(se);
3949
3950 if (cfs_rq->skip == se)
3951 __clear_buddies_skip(se);
3952}
3953
3954static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3955
3956static void
3957dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3958{
 /*
  * Update run-time statistics of the 'current'.
  */
3962 update_curr(cfs_rq);
3963
 /*
  * When dequeuing a sched_entity, we must:
  *   - Update loads to have both entity and cfs_rq synced with now.
  *   - Subtract its load from the cfs_rq->runnable_avg.
  *   - Subtract its previous weight from cfs_rq->load.weight.
  *   - For group entity, update its weight to reflect the new share
  *     of its group cfs_rq.
  */
3972 update_load_avg(cfs_rq, se, UPDATE_TG);
3973 dequeue_runnable_load_avg(cfs_rq, se);
3974
3975 update_stats_dequeue(cfs_rq, se, flags);
3976
3977 clear_buddies(cfs_rq, se);
3978
3979 if (se != cfs_rq->curr)
3980 __dequeue_entity(cfs_rq, se);
3981 se->on_rq = 0;
3982 account_entity_dequeue(cfs_rq, se);
3983
 /*
  * Normalize after update_curr(); which will also have moved
  * min_vruntime if @se is the one holding it back. But before doing
  * update_min_vruntime() again, which will discount @se's position and
  * can move min_vruntime forward still more.
  */
3990 if (!(flags & DEQUEUE_SLEEP))
3991 se->vruntime -= cfs_rq->min_vruntime;
3992
 /* return excess runtime on last dequeue */
3994 return_cfs_rq_runtime(cfs_rq);
3995
3996 update_cfs_group(se);
3997
 /*
  * Now advance min_vruntime if @se was the entity holding it back,
  * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
  * put back on, and if we advance min_vruntime, we'll be placed back
  * further than we started -- ie. we'll be penalized.
  */
4004 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4005 update_min_vruntime(cfs_rq);
4006}
4007
/*
 * Preempt the current task with a newly woken task if needed:
 */
4011static void
4012check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4013{
4014 unsigned long ideal_runtime, delta_exec;
4015 struct sched_entity *se;
4016 s64 delta;
4017
4018 ideal_runtime = sched_slice(cfs_rq, curr);
4019 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4020 if (delta_exec > ideal_runtime) {
4021 resched_curr(rq_of(cfs_rq));
 /*
  * The current task ran long enough, ensure it doesn't get
  * re-elected due to buddy favours.
  */
4026 clear_buddies(cfs_rq, curr);
4027 return;
4028 }
4029
 /*
  * Ensure that a task that missed wakeup preemption by a
  * narrow margin doesn't have to wait for a full slice.
  * This also mitigates buddy induced latencies under load.
  */
4035 if (delta_exec < sysctl_sched_min_granularity)
4036 return;
4037
4038 se = __pick_first_entity(cfs_rq);
4039 delta = curr->vruntime - se->vruntime;
4040
4041 if (delta < 0)
4042 return;
4043
4044 if (delta > ideal_runtime)
4045 resched_curr(rq_of(cfs_rq));
4046}
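
/*
 * Worked example (default tunables, not original source text): with two
 * nice-0 tasks on one CPU, each task's ideal slice is sched_latency /
 * nr_running = 6ms / 2 = 3ms, so the tick preempts current once it has run
 * 3ms beyond prev_sum_exec_runtime. The 0.75ms sysctl_sched_min_granularity
 * check above keeps vruntime-based preemption from thrashing between
 * nearly-tied entities.
 */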
4047
4048static void
4049set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4050{
 /* 'current' is not kept within the tree. */
4052 if (se->on_rq) {
 /*
  * Any task has to be enqueued before it get to execute on
  * a CPU. So account for the time it spent waiting on the
  * runqueue.
  */
4058 update_stats_wait_end(cfs_rq, se);
4059 __dequeue_entity(cfs_rq, se);
4060 update_load_avg(cfs_rq, se, UPDATE_TG);
4061 }
4062
4063 update_stats_curr_start(cfs_rq, se);
4064 cfs_rq->curr = se;
4065
 /*
  * Track our maximum slice length, if the CPU's load is at
  * least twice that of our own weight (ie. dont track it
  * when there are only lesser-weight tasks around):
  */
4071 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4072 schedstat_set(se->statistics.slice_max,
4073 max((u64)schedstat_val(se->statistics.slice_max),
4074 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4075 }
4076
4077 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4078}
4079
4080static int
4081wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4082
/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
4090static struct sched_entity *
4091pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4092{
4093 struct sched_entity *left = __pick_first_entity(cfs_rq);
4094 struct sched_entity *se;
4095
 /*
  * If curr is set we have to see if it is left of the leftmost
  * entity still in the tree; if so it is the most eligible pick.
  */
4100 if (!left || (curr && entity_before(curr, left)))
4101 left = curr;
4102
4103 se = left;
4104
 /*
  * Avoid running the skip buddy, if running something else can
  * be done without getting too unfair.
  */
4109 if (cfs_rq->skip == se) {
4110 struct sched_entity *second;
4111
4112 if (se == curr) {
4113 second = __pick_first_entity(cfs_rq);
4114 } else {
4115 second = __pick_next_entity(se);
4116 if (!second || (curr && entity_before(curr, second)))
4117 second = curr;
4118 }
4119
4120 if (second && wakeup_preempt_entity(second, left) < 1)
4121 se = second;
4122 }
4123
 /*
  * Prefer last buddy, try to return the CPU to a preempted task.
  */
4127 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4128 se = cfs_rq->last;
4129
 /*
  * Someone really wants this to run. If it's not unfair, run it.
  */
4133 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4134 se = cfs_rq->next;
4135
4136 clear_buddies(cfs_rq, se);
4137
4138 return se;
4139}
4140
4141static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4142
4143static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4144{
 /*
  * If still on the runqueue then deactivate_task()
  * was not called and update_curr() has to be done:
  */
4149 if (prev->on_rq)
4150 update_curr(cfs_rq);
4151
 /* throttle cfs_rqs exceeding runtime */
4153 check_cfs_rq_runtime(cfs_rq);
4154
4155 check_spread(cfs_rq, prev);
4156
4157 if (prev->on_rq) {
4158 update_stats_wait_start(cfs_rq, prev);
 /* Put 'current' back into the tree. */
4160 __enqueue_entity(cfs_rq, prev);
 /* in !on_rq case, update occurred at dequeue */
4162 update_load_avg(cfs_rq, prev, 0);
4163 }
4164 cfs_rq->curr = NULL;
4165}
4166
4167static void
4168entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4169{
 /*
  * Update run-time statistics of the 'current'.
  */
4173 update_curr(cfs_rq);
4174
 /*
  * Ensure that runnable average is periodically updated.
  */
4178 update_load_avg(cfs_rq, curr, UPDATE_TG);
4179 update_cfs_group(curr);
4180
4181#ifdef CONFIG_SCHED_HRTICK
 /*
  * queued ticks are scheduled to match the slice, so don't bother
  * validating it and just reschedule.
  */
4186 if (queued) {
4187 resched_curr(rq_of(cfs_rq));
4188 return;
4189 }
4190
 /*
  * don't let the period tick interfere with the hrtick preemption
  */
4193 if (!sched_feat(DOUBLE_TICK) &&
4194 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4195 return;
4196#endif
4197
4198 if (cfs_rq->nr_running > 1)
4199 check_preempt_tick(cfs_rq, curr);
4200}
4201
/**************************************************
 * CFS bandwidth control machinery
 */
4207#ifdef CONFIG_CFS_BANDWIDTH
4208
4209#ifdef HAVE_JUMP_LABEL
4210static struct static_key __cfs_bandwidth_used;
4211
4212static inline bool cfs_bandwidth_used(void)
4213{
4214 return static_key_false(&__cfs_bandwidth_used);
4215}
4216
4217void cfs_bandwidth_usage_inc(void)
4218{
4219 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4220}
4221
4222void cfs_bandwidth_usage_dec(void)
4223{
4224 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4225}
4226#else
4227static bool cfs_bandwidth_used(void)
4228{
4229 return true;
4230}
4231
4232void cfs_bandwidth_usage_inc(void) {}
4233void cfs_bandwidth_usage_dec(void) {}
4234#endif
4235
/*
 * default period for cfs group bandwidth.
 * default: 0.1s, units: nanoseconds
 */
4240static inline u64 default_cfs_period(void)
4241{
4242 return 100000000ULL;
4243}
4244
4245static inline u64 sched_cfs_bandwidth_slice(void)
4246{
4247 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4248}
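
/*
 * Illustration (default values, not original source text): each cfs_rq pulls
 * quota from the global pool in 5ms slices (sysctl_sched_cfs_bandwidth_slice
 * = 5000us) out of a 100ms period, so a group capped at 50ms/100ms can be
 * drained by at most ten local grabs before its runqueues start throttling.
 */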
4249
/*
 * Replenish runtime according to assigned quota and update expiration time.
 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
 * additional synchronization around rq->lock.
 *
 * requires cfs_b->lock
 */
4257void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4258{
4259 u64 now;
4260
4261 if (cfs_b->quota == RUNTIME_INF)
4262 return;
4263
4264 now = sched_clock_cpu(smp_processor_id());
4265 cfs_b->runtime = cfs_b->quota;
4266 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4267 cfs_b->expires_seq++;
4268}
4269
4270static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4271{
4272 return &tg->cfs_bandwidth;
4273}
4274
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4276static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4277{
4278 if (unlikely(cfs_rq->throttle_count))
4279 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4280
4281 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4282}
4283
/* returns 0 on failure to allocate runtime */
4285static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4286{
4287 struct task_group *tg = cfs_rq->tg;
4288 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4289 u64 amount = 0, min_amount, expires;
4290 int expires_seq;
4291
 /* note: this is a positive sum as runtime_remaining <= 0 */
4293 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4294
4295 raw_spin_lock(&cfs_b->lock);
4296 if (cfs_b->quota == RUNTIME_INF)
4297 amount = min_amount;
4298 else {
4299 start_cfs_bandwidth(cfs_b);
4300
4301 if (cfs_b->runtime > 0) {
4302 amount = min(cfs_b->runtime, min_amount);
4303 cfs_b->runtime -= amount;
4304 cfs_b->idle = 0;
4305 }
4306 }
4307 expires_seq = cfs_b->expires_seq;
4308 expires = cfs_b->runtime_expires;
4309 raw_spin_unlock(&cfs_b->lock);
4310
4311 cfs_rq->runtime_remaining += amount;
4312
 /*
  * we may have advanced our local expiration to account for allowed
  * spread between our sched_clock and the one on which runtime was
  * issued.
  */
4317 if (cfs_rq->expires_seq != expires_seq) {
4318 cfs_rq->expires_seq = expires_seq;
4319 cfs_rq->runtime_expires = expires;
4320 }
4321
4322 return cfs_rq->runtime_remaining > 0;
4323}
4324
/*
 * Note: This depends on the synchronization provided by sched_clock and the
 * fact that rq->clock snapshots this value.
 */
4329static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4330{
4331 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

 /* if the deadline is ahead of our clock, nothing to do */
4334 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4335 return;
4336
4337 if (cfs_rq->runtime_remaining < 0)
4338 return;
4339
 /*
  * If the local deadline has passed we have to consider the
  * possibility that our sched_clock is 'fast' and the global deadline
  * has not truly expired.
  *
  * Fortunately we can determine whether this is the case by checking
  * whether the global deadline (cfs_b->expires_seq) has advanced.
  */
4348 if (cfs_rq->expires_seq == cfs_b->expires_seq) {
 /* extend local deadline, drift is bounded above by 2 ticks */
4350 cfs_rq->runtime_expires += TICK_NSEC;
4351 } else {
 /* global deadline is ahead, expiration has passed */
4353 cfs_rq->runtime_remaining = 0;
4354 }
4355}
4356
4357static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4358{
 /* dock delta_exec before expiring quota (as it could span periods) */
4360 cfs_rq->runtime_remaining -= delta_exec;
4361 expire_cfs_rq_runtime(cfs_rq);
4362
4363 if (likely(cfs_rq->runtime_remaining > 0))
4364 return;
4365
 /*
  * if we're unable to extend our runtime we resched so that the active
  * hierarchy can be throttled
  */
4370 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4371 resched_curr(rq_of(cfs_rq));
4372}
4373
4374static __always_inline
4375void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4376{
4377 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4378 return;
4379
4380 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4381}
4382
4383static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4384{
4385 return cfs_bandwidth_used() && cfs_rq->throttled;
4386}
4387
/* check whether cfs_rq, or any parent, is throttled */
4389static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4390{
4391 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4392}
4393
/*
 * Ensure that neither of the group entities corresponding to src_cpu or
 * dest_cpu are members of a throttled hierarchy when performing group
 * load-balance operations.
 */
4399static inline int throttled_lb_pair(struct task_group *tg,
4400 int src_cpu, int dest_cpu)
4401{
4402 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4403
4404 src_cfs_rq = tg->cfs_rq[src_cpu];
4405 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4406
4407 return throttled_hierarchy(src_cfs_rq) ||
4408 throttled_hierarchy(dest_cfs_rq);
4409}
4410
4411static int tg_unthrottle_up(struct task_group *tg, void *data)
4412{
4413 struct rq *rq = data;
4414 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4415
4416 cfs_rq->throttle_count--;
4417 if (!cfs_rq->throttle_count) {
 /* adjust cfs_rq_clock_task() */
4419 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4420 cfs_rq->throttled_clock_task;
4421 }
4422
4423 return 0;
4424}
4425
4426static int tg_throttle_down(struct task_group *tg, void *data)
4427{
4428 struct rq *rq = data;
4429 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4430
 /* group is entering throttled state, stop time */
4432 if (!cfs_rq->throttle_count)
4433 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4434 cfs_rq->throttle_count++;
4435
4436 return 0;
4437}
4438
4439static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4440{
4441 struct rq *rq = rq_of(cfs_rq);
4442 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4443 struct sched_entity *se;
4444 long task_delta, dequeue = 1;
4445 bool empty;
4446
4447 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4448
 /* freeze hierarchy runnable averages while throttled */
4450 rcu_read_lock();
4451 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4452 rcu_read_unlock();
4453
4454 task_delta = cfs_rq->h_nr_running;
4455 for_each_sched_entity(se) {
4456 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4457
4458 if (!se->on_rq)
4459 break;
4460
4461 if (dequeue)
4462 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4463 qcfs_rq->h_nr_running -= task_delta;
4464
4465 if (qcfs_rq->load.weight)
4466 dequeue = 0;
4467 }
4468
4469 if (!se)
4470 sub_nr_running(rq, task_delta);
4471
4472 cfs_rq->throttled = 1;
4473 cfs_rq->throttled_clock = rq_clock(rq);
4474 raw_spin_lock(&cfs_b->lock);
4475 empty = list_empty(&cfs_b->throttled_cfs_rq);
4476
 /*
  * Add to the _head_ of the list, so that an already-started
  * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
  * not running, add to the tail so that later runqueues don't get
  * starved.
  */
4482 if (cfs_b->distribute_running)
4483 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4484 else
4485 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4486
 /*
  * If we're the first throttled task, make sure the bandwidth
  * timer is running.
  */
4491 if (empty)
4492 start_cfs_bandwidth(cfs_b);
4493
4494 raw_spin_unlock(&cfs_b->lock);
4495}
4496
4497void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4498{
4499 struct rq *rq = rq_of(cfs_rq);
4500 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4501 struct sched_entity *se;
4502 int enqueue = 1;
4503 long task_delta;
4504
4505 se = cfs_rq->tg->se[cpu_of(rq)];
4506
4507 cfs_rq->throttled = 0;
4508
4509 update_rq_clock(rq);
4510
4511 raw_spin_lock(&cfs_b->lock);
4512 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4513 list_del_rcu(&cfs_rq->throttled_list);
4514 raw_spin_unlock(&cfs_b->lock);
4515
 /* update hierarchical throttle state */
4517 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4518
4519 if (!cfs_rq->load.weight)
4520 return;
4521
4522 task_delta = cfs_rq->h_nr_running;
4523 for_each_sched_entity(se) {
4524 if (se->on_rq)
4525 enqueue = 0;
4526
4527 cfs_rq = cfs_rq_of(se);
4528 if (enqueue)
4529 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4530 cfs_rq->h_nr_running += task_delta;
4531
4532 if (cfs_rq_throttled(cfs_rq))
4533 break;
4534 }
4535
4536 if (!se)
4537 add_nr_running(rq, task_delta);
4538
 /* Determine whether we need to wake up potentially idle CPU: */
4540 if (rq->curr == rq->idle && rq->cfs.nr_running)
4541 resched_curr(rq);
4542}
4543
4544static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4545 u64 remaining, u64 expires)
4546{
4547 struct cfs_rq *cfs_rq;
4548 u64 runtime;
4549 u64 starting_runtime = remaining;
4550
4551 rcu_read_lock();
4552 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4553 throttled_list) {
4554 struct rq *rq = rq_of(cfs_rq);
4555 struct rq_flags rf;
4556
4557 rq_lock(rq, &rf);
4558 if (!cfs_rq_throttled(cfs_rq))
4559 goto next;
4560
4561 runtime = -cfs_rq->runtime_remaining + 1;
4562 if (runtime > remaining)
4563 runtime = remaining;
4564 remaining -= runtime;
4565
4566 cfs_rq->runtime_remaining += runtime;
4567 cfs_rq->runtime_expires = expires;
4568
4569
4570 if (cfs_rq->runtime_remaining > 0)
4571 unthrottle_cfs_rq(cfs_rq);
4572
4573next:
4574 rq_unlock(rq, &rf);
4575
4576 if (!remaining)
4577 break;
4578 }
4579 rcu_read_unlock();
4580
4581 return starting_runtime - remaining;
4582}
4583
/*
 * Responsible for refilling a task_group's bandwidth and unthrottling its
 * cfs_rqs as appropriate. If there has been no activity within the last
 * period the timer is deactivated.
 */
4590static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4591{
4592 u64 runtime, runtime_expires;
4593 int throttled;
4594
 /* no need to continue the timer with no bandwidth constraint */
4596 if (cfs_b->quota == RUNTIME_INF)
4597 goto out_deactivate;
4598
4599 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4600 cfs_b->nr_periods += overrun;
4601
 /*
  * idle depends on !throttled (for the case of a large deficit), and if
  * we're going inactive then everything else can be deferred
  */
4606 if (cfs_b->idle && !throttled)
4607 goto out_deactivate;
4608
4609 __refill_cfs_bandwidth_runtime(cfs_b);
4610
4611 if (!throttled) {
 /* mark as potentially idle for the upcoming period */
4613 cfs_b->idle = 1;
4614 return 0;
4615 }
4616
 /* account preceding periods in which throttling occurred */
4618 cfs_b->nr_throttled += overrun;
4619
4620 runtime_expires = cfs_b->runtime_expires;
4621
 /*
  * This check is repeated as we're holding onto the new bandwidth while
  * we unthrottle. This can potentially race with an unthrottled group
  * trying to acquire new bandwidth from the global pool. This can result
  * in us over-using our runtime if it is all used during this loop, but
  * only by limited amounts in that extreme case.
  */
4629 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4630 runtime = cfs_b->runtime;
4631 cfs_b->distribute_running = 1;
4632 raw_spin_unlock(&cfs_b->lock);
4633
4634 runtime = distribute_cfs_runtime(cfs_b, runtime,
4635 runtime_expires);
4636 raw_spin_lock(&cfs_b->lock);
4637
4638 cfs_b->distribute_running = 0;
4639 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4640
4641 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4642 }
4643
 /*
  * While we are ensured activity in the period following an
  * unthrottle, this also covers the case in which the new bandwidth is
  * insufficient to cover the existing bandwidth deficit. (Forcing the
  * period timer to remain active while there are any throttled
  * entities.)
  */
4650 cfs_b->idle = 0;
4651
4652 return 0;
4653
4654out_deactivate:
4655 return 1;
4656}
4657
/* a cfs_rq won't donate quota below this amount */
4659static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
/* minimum remaining period time to redistribute slack quota */
4661static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
4663static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4664
/*
 * Are we near the end of the current quota period?
 *
 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
 * hrtimer base being cleared by hrtimer_start. In the case of
 * migrate_hrtimers, base is never cleared, so we are fine.
 */
4672static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4673{
4674 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4675 u64 remaining;
4676
 /* if the call-back is running a quota refresh is already occurring */
4678 if (hrtimer_callback_running(refresh_timer))
4679 return 1;
4680
 /* is a quota refresh about to occur? */
4682 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4683 if (remaining < min_expire)
4684 return 1;
4685
4686 return 0;
4687}
4688
4689static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4690{
4691 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;

 /* if there's a quota refresh soon don't bother with slack */
4694 if (runtime_refresh_within(cfs_b, min_left))
4695 return;
4696
4697 hrtimer_start(&cfs_b->slack_timer,
4698 ns_to_ktime(cfs_bandwidth_slack_period),
4699 HRTIMER_MODE_REL);
4700}
4701
/* we know any runtime found here is valid as update_curr() precedes return */
4703static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4704{
4705 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4706 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4707
4708 if (slack_runtime <= 0)
4709 return;
4710
4711 raw_spin_lock(&cfs_b->lock);
4712 if (cfs_b->quota != RUNTIME_INF &&
4713 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4714 cfs_b->runtime += slack_runtime;

 /* we are under rq->lock, defer unthrottling using a timer */
4717 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4718 !list_empty(&cfs_b->throttled_cfs_rq))
4719 start_cfs_slack_bandwidth(cfs_b);
4720 }
4721 raw_spin_unlock(&cfs_b->lock);

 /* even if it's not valid for return we don't want to try again */
4724 cfs_rq->runtime_remaining -= slack_runtime;
4725}
4726
4727static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4728{
4729 if (!cfs_bandwidth_used())
4730 return;
4731
4732 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4733 return;
4734
4735 __return_cfs_rq_runtime(cfs_rq);
4736}
4737
/*
 * This is done with a timer (instead of inline with bandwidth return) since
 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
 */
4742static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4743{
4744 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4745 u64 expires;
4746
 /* confirm we're still not at a refresh boundary */
4748 raw_spin_lock(&cfs_b->lock);
4749 if (cfs_b->distribute_running) {
4750 raw_spin_unlock(&cfs_b->lock);
4751 return;
4752 }
4753
4754 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4755 raw_spin_unlock(&cfs_b->lock);
4756 return;
4757 }
4758
4759 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4760 runtime = cfs_b->runtime;
4761
4762 expires = cfs_b->runtime_expires;
4763 if (runtime)
4764 cfs_b->distribute_running = 1;
4765
4766 raw_spin_unlock(&cfs_b->lock);
4767
4768 if (!runtime)
4769 return;
4770
4771 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4772
4773 raw_spin_lock(&cfs_b->lock);
4774 if (expires == cfs_b->runtime_expires)
4775 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4776 cfs_b->distribute_running = 0;
4777 raw_spin_unlock(&cfs_b->lock);
4778}
4779
/*
 * When a group wakes up we want to make sure that its quota is not already
 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime as update_curr() throttling can not trigger until it's on-rq.
 */
4785static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4786{
4787 if (!cfs_bandwidth_used())
4788 return;
4789
 /* an active group must be handled by the update_curr()->put() path */
4791 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4792 return;
4793
 /* ensure the group is not already throttled */
4795 if (cfs_rq_throttled(cfs_rq))
4796 return;
4797
 /* update runtime allocation */
4799 account_cfs_rq_runtime(cfs_rq, 0);
4800 if (cfs_rq->runtime_remaining <= 0)
4801 throttle_cfs_rq(cfs_rq);
4802}
4803
4804static void sync_throttle(struct task_group *tg, int cpu)
4805{
4806 struct cfs_rq *pcfs_rq, *cfs_rq;
4807
4808 if (!cfs_bandwidth_used())
4809 return;
4810
4811 if (!tg->parent)
4812 return;
4813
4814 cfs_rq = tg->cfs_rq[cpu];
4815 pcfs_rq = tg->parent->cfs_rq[cpu];
4816
4817 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4818 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4819}
4820
/* conditionally throttle active cfs_rq's from put_prev_entity() */
4822static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4823{
4824 if (!cfs_bandwidth_used())
4825 return false;
4826
4827 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4828 return false;
4829
 /*
  * it's possible for a throttled entity to be forced into a running
  * state (e.g. set_curr_task), in this case we're finished.
  */
4834 if (cfs_rq_throttled(cfs_rq))
4835 return true;
4836
4837 throttle_cfs_rq(cfs_rq);
4838 return true;
4839}
4840
4841static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4842{
4843 struct cfs_bandwidth *cfs_b =
4844 container_of(timer, struct cfs_bandwidth, slack_timer);
4845
4846 do_sched_cfs_slack_timer(cfs_b);
4847
4848 return HRTIMER_NORESTART;
4849}
4850
4851static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4852{
4853 struct cfs_bandwidth *cfs_b =
4854 container_of(timer, struct cfs_bandwidth, period_timer);
4855 int overrun;
4856 int idle = 0;
4857
4858 raw_spin_lock(&cfs_b->lock);
4859 for (;;) {
4860 overrun = hrtimer_forward_now(timer, cfs_b->period);
4861 if (!overrun)
4862 break;
4863
4864 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4865 }
4866 if (idle)
4867 cfs_b->period_active = 0;
4868 raw_spin_unlock(&cfs_b->lock);
4869
4870 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4871}
4872
4873void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4874{
4875 raw_spin_lock_init(&cfs_b->lock);
4876 cfs_b->runtime = 0;
4877 cfs_b->quota = RUNTIME_INF;
4878 cfs_b->period = ns_to_ktime(default_cfs_period());
4879
4880 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4881 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4882 cfs_b->period_timer.function = sched_cfs_period_timer;
4883 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4884 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4885 cfs_b->distribute_running = 0;
4886}
4887
4888static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4889{
4890 cfs_rq->runtime_enabled = 0;
4891 INIT_LIST_HEAD(&cfs_rq->throttled_list);
4892}
4893
4894void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4895{
4896 u64 overrun;
4897
4898 lockdep_assert_held(&cfs_b->lock);
4899
4900 if (cfs_b->period_active)
4901 return;
4902
4903 cfs_b->period_active = 1;
4904 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4905 cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
4906 cfs_b->expires_seq++;
4907 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4908}
4909
4910static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4911{
 /* init_cfs_bandwidth() was not called */
4913 if (!cfs_b->throttled_cfs_rq.next)
4914 return;
4915
4916 hrtimer_cancel(&cfs_b->period_timer);
4917 hrtimer_cancel(&cfs_b->slack_timer);
4918}
4919
/*
 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
 *
 * The race is harmless, since modifying bandwidth settings of unhooked group
 * bits doesn't do much.
 */

/* cpu online callback */
4928static void __maybe_unused update_runtime_enabled(struct rq *rq)
4929{
4930 struct task_group *tg;
4931
4932 lockdep_assert_held(&rq->lock);
4933
4934 rcu_read_lock();
4935 list_for_each_entry_rcu(tg, &task_groups, list) {
4936 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
4937 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4938
4939 raw_spin_lock(&cfs_b->lock);
4940 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4941 raw_spin_unlock(&cfs_b->lock);
4942 }
4943 rcu_read_unlock();
4944}
4945
/* cpu offline callback */
4947static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4948{
4949 struct task_group *tg;
4950
4951 lockdep_assert_held(&rq->lock);
4952
4953 rcu_read_lock();
4954 list_for_each_entry_rcu(tg, &task_groups, list) {
4955 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4956
4957 if (!cfs_rq->runtime_enabled)
4958 continue;
4959
 /*
  * clock_task is not advancing so we just need to make sure
  * there's some valid quota amount
  */
4964 cfs_rq->runtime_remaining = 1;
4965
 /*
  * Offline rq is schedulable till CPU is completely disabled
  * in take_cpu_down().
  */
4969 cfs_rq->runtime_enabled = 0;
4970
4971 if (cfs_rq_throttled(cfs_rq))
4972 unthrottle_cfs_rq(cfs_rq);
4973 }
4974 rcu_read_unlock();
4975}
4976
4977#else
4978static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4979{
4980 return rq_clock_task(rq_of(cfs_rq));
4981}
4982
4983static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4984static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4985static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4986static inline void sync_throttle(struct task_group *tg, int cpu) {}
4987static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4988
4989static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4990{
4991 return 0;
4992}
4993
4994static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4995{
4996 return 0;
4997}
4998
4999static inline int throttled_lb_pair(struct task_group *tg,
5000 int src_cpu, int dest_cpu)
5001{
5002 return 0;
5003}
5004
5005void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5006
5007#ifdef CONFIG_FAIR_GROUP_SCHED
5008static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5009#endif
5010
5011static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5012{
5013 return NULL;
5014}
5015static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5016static inline void update_runtime_enabled(struct rq *rq) {}
5017static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5018
5019#endif
5020
/**************************************************
 * CFS operations on tasks:
 */
5025#ifdef CONFIG_SCHED_HRTICK
5026static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5027{
5028 struct sched_entity *se = &p->se;
5029 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5030
5031 SCHED_WARN_ON(task_rq(p) != rq);
5032
5033 if (rq->cfs.h_nr_running > 1) {
5034 u64 slice = sched_slice(cfs_rq, se);
5035 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5036 s64 delta = slice - ran;
5037
5038 if (delta < 0) {
5039 if (rq->curr == p)
5040 resched_curr(rq);
5041 return;
5042 }
5043 hrtick_start(rq, delta);
5044 }
5045}
5046
/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
5052static void hrtick_update(struct rq *rq)
5053{
5054 struct task_struct *curr = rq->curr;
5055
5056 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5057 return;
5058
5059 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5060 hrtick_start_fair(rq, curr);
5061}
5062#else
5063static inline void
5064hrtick_start_fair(struct rq *rq, struct task_struct *p)
5065{
5066}
5067
5068static inline void hrtick_update(struct rq *rq)
5069{
5070}
5071#endif
5072
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
5078static void
5079enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5080{
5081 struct cfs_rq *cfs_rq;
5082 struct sched_entity *se = &p->se;
5083
 /*
  * The code below (indirectly) updates schedutil which looks at
  * the cfs_rq utilization to select a frequency.
  * Let's add the task's estimated utilization to the cfs_rq's
  * estimated utilization, before we update schedutil.
  */
5090 util_est_enqueue(&rq->cfs, p);
5091
 /*
  * If in_iowait is set, the code below may not trigger any cpufreq
  * utilization updates, so do it here explicitly with the IOWAIT flag
  * passed.
  */
5097 if (p->in_iowait)
5098 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5099
5100 for_each_sched_entity(se) {
5101 if (se->on_rq)
5102 break;
5103 cfs_rq = cfs_rq_of(se);
5104 enqueue_entity(cfs_rq, se, flags);
5105
 /*
  * end evaluation on encountering a throttled cfs_rq
  *
  * note: in the case of encountering a throttled cfs_rq we will
  * post the final h_nr_running increment below.
  */
5112 if (cfs_rq_throttled(cfs_rq))
5113 break;
5114 cfs_rq->h_nr_running++;
5115
5116 flags = ENQUEUE_WAKEUP;
5117 }
5118
5119 for_each_sched_entity(se) {
5120 cfs_rq = cfs_rq_of(se);
5121 cfs_rq->h_nr_running++;
5122
5123 if (cfs_rq_throttled(cfs_rq))
5124 break;
5125
5126 update_load_avg(cfs_rq, se, UPDATE_TG);
5127 update_cfs_group(se);
5128 }
5129
5130 if (!se)
5131 add_nr_running(rq, 1);
5132
5133 hrtick_update(rq);
5134}
5135
5136static void set_next_buddy(struct sched_entity *se);
5137
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the tree and
 * update the fair scheduling stats:
 */
5143static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5144{
5145 struct cfs_rq *cfs_rq;
5146 struct sched_entity *se = &p->se;
5147 int task_sleep = flags & DEQUEUE_SLEEP;
5148
5149 for_each_sched_entity(se) {
5150 cfs_rq = cfs_rq_of(se);
5151 dequeue_entity(cfs_rq, se, flags);
5152
 /*
  * end evaluation on encountering a throttled cfs_rq
  *
  * note: in the case of encountering a throttled cfs_rq we will
  * post the final h_nr_running decrement below.
  */
5159 if (cfs_rq_throttled(cfs_rq))
5160 break;
5161 cfs_rq->h_nr_running--;
5162
 /* Don't dequeue parent if it has other entities besides us */
5164 if (cfs_rq->load.weight) {
 /* Avoid re-evaluating load for this entity: */
5166 se = parent_entity(se);
 /*
  * Bias pick_next to pick a task from this cfs_rq, as
  * p is sleeping when it is within its sched_slice.
  */
5171 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5172 set_next_buddy(se);
5173 break;
5174 }
5175 flags |= DEQUEUE_SLEEP;
5176 }
5177
5178 for_each_sched_entity(se) {
5179 cfs_rq = cfs_rq_of(se);
5180 cfs_rq->h_nr_running--;
5181
5182 if (cfs_rq_throttled(cfs_rq))
5183 break;
5184
5185 update_load_avg(cfs_rq, se, UPDATE_TG);
5186 update_cfs_group(se);
5187 }
5188
5189 if (!se)
5190 sub_nr_running(rq, 1);
5191
5192 util_est_dequeue(&rq->cfs, p, task_sleep);
5193 hrtick_update(rq);
5194}
5195
5196#ifdef CONFIG_SMP
5197
/* Working cpumask for: load_balance, load_balance_newidle. */
5199DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5200DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5201
5202#ifdef CONFIG_NO_HZ_COMMON
/*
 * The exact cpuload calculated at every tick would be:
 *
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
 *
 * If a CPU misses updates for n ticks (as it was idle) and an update gets
 * called on the n+1-th tick when the CPU may be busy, then we have:
 *
 *   load' = (1 - 1/2^i)^n * load
 *
 * avoiding serious skew of the load statistics. decay_load_missed() below
 * does an efficient calculation of that term: because x^(n+m) = x^n * x^m,
 * any x^n can be decomposed into power-of-2 factors, which are precomputed
 * in degrade_factor[], reducing an arbitrary n to O(log_2 n)
 * multiplications. The degrade_zero_ticks array gives, per index, the
 * number of missed ticks after which the decayed load is simply truncated
 * to zero.
 */
5228
5229#define DEGRADE_SHIFT 7
5230
5231static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5232static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5233 { 0, 0, 0, 0, 0, 0, 0, 0 },
5234 { 64, 32, 8, 0, 0, 0, 0, 0 },
5235 { 96, 72, 40, 12, 1, 0, 0, 0 },
5236 { 112, 98, 75, 43, 15, 1, 0, 0 },
5237 { 120, 112, 98, 76, 45, 16, 2, 0 }
5238};
5239
/*
 * Compute (1 - 1/2^idx)^missed_updates * load using the precomputed
 * degrade_factor[] table, walking the binary representation of
 * missed_updates so the decay costs O(log_2 n) multiplications.
 */
5245static unsigned long
5246decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5247{
5248 int j = 0;
5249
5250 if (!missed_updates)
5251 return load;
5252
5253 if (missed_updates >= degrade_zero_ticks[idx])
5254 return 0;
5255
5256 if (idx == 1)
5257 return load >> missed_updates;
5258
5259 while (missed_updates) {
5260 if (missed_updates % 2)
5261 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5262
5263 missed_updates >>= 1;
5264 j++;
5265 }
5266 return load;
5267}
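
/*
 * Worked example (illustrative numbers): for idx = 1 the per-tick factor is
 * 1/2, so missing 3 updates simply shifts: load = 128 becomes 128 >> 3 = 16
 * (the idx == 1 fast path above). For idx = 2, missing 2 ticks multiplies by
 * degrade_factor[2][1] = 72, i.e. 128 * 72 >> 7 = 72, matching
 * (3/4)^2 * 128 = 72.
 */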
5268
5269static struct {
5270 cpumask_var_t idle_cpus_mask;
5271 atomic_t nr_cpus;
5272 int has_blocked;
5273 unsigned long next_balance;
5274 unsigned long next_blocked;
5275} nohz ____cacheline_aligned;
5276
5277#endif
5278
/**
 * cpu_load_update - update the rq->cpu_load[] statistics
 * @this_rq: The rq to update statistics for
 * @this_load: The current load
 * @pending_updates: The number of missed updates
 *
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 *
 * This function computes a decaying average:
 *
 *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
 *
 * Because of NOHZ it might not get called on every tick which gives need for
 * the @pending_updates argument:
 *
 *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
 *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
 *             = A^2 * load[i]_n-2 + B * (A + 1)
 *             = ...
 *             = A^n * load[i]_0 + B * (A^(n-1) + ... + 1)
 *
 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
 * any change in load would have resulted in the tick being turned back on.
 *
 * For regular NOHZ idle, load is zero while the CPU sleeps, and the above
 * reduces to:
 *
 *   load[i]_n = A^n * load[i]_0
 *
 * which is computed by decay_load_missed().
 */
5314static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5315 unsigned long pending_updates)
5316{
5317 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5318 int i, scale;
5319
5320 this_rq->nr_load_updates++;
5321
 /* Update our load: */
5323 this_rq->cpu_load[0] = this_load;
5324 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5325 unsigned long old_load, new_load;
5326
 /* scale is effectively 1 << i now, and >> i divides by scale */

5329 old_load = this_rq->cpu_load[i];
5330#ifdef CONFIG_NO_HZ_COMMON
5331 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5332 if (tickless_load) {
5333 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
 /*
  * old_load can never be a negative value because a
  * decayed tickless_load cannot be greater than the
  * original tickless_load.
  */
5339 old_load += tickless_load;
5340 }
5341#endif
5342 new_load = this_load;
5343
 /*
  * Round up the averaging division if load is increasing. This
  * prevents us from getting stuck on 9 if the load is 10, for
  * example.
  */
5348 if (new_load > old_load)
5349 new_load += scale - 1;
5350
5351 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5352 }
5353}
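
/*
 * Illustration (made-up numbers): for i = 1, load[1]' = (old + new) / 2, so
 * an idle CPU (old = 0) that suddenly runs at load 10 reports cpu_load[1] =
 * 5 after one tick and 8 after two (rounding up per the comment above),
 * while cpu_load[4] climbs far more slowly - the higher indexes trade
 * responsiveness for stability.
 */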
5354
/* Used instead of source_load when we know the type == 0 */
5356static unsigned long weighted_cpuload(struct rq *rq)
5357{
5358 return cfs_rq_runnable_load_avg(&rq->cfs);
5359}
5360
5361#ifdef CONFIG_NO_HZ_COMMON
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we need to avoid the delta approach from the regular tick when
 * possible since that would seriously skew the load calculation. This is why we
 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
 * loop exit, nohz_idle_balance, nohz full exit...)
 *
 * This means we might still be one tick off for nohz periods.
 */
5376static void cpu_load_update_nohz(struct rq *this_rq,
5377 unsigned long curr_jiffies,
5378 unsigned long load)
5379{
5380 unsigned long pending_updates;
5381
5382 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5383 if (pending_updates) {
5384 this_rq->last_load_update_tick = curr_jiffies;
 /*
  * In the regular NOHZ case, we were idle, this means load 0.
  * In the NOHZ_FULL case, we were non-idle, we should consider
  * its weighted load.
  */
5390 cpu_load_update(this_rq, load, pending_updates);
5391 }
5392}
5393
/*
 * Called from nohz_idle_balance() to update the load ratings before doing the
 * idle balance.
 */
5398static void cpu_load_update_idle(struct rq *this_rq)
5399{
 /*
  * bail if there's load or we're actually up-to-date.
  */
5403 if (weighted_cpuload(this_rq))
5404 return;
5405
5406 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5407}
5408
/*
 * Record CPU load on nohz entry so we know the tickless load to account
 * on nohz exit.
 */
5415void cpu_load_update_nohz_start(void)
5416{
5417 struct rq *this_rq = this_rq();
5418
 /*
  * This is all lockless but should be fine. If weighted_cpuload changes
  * concurrently we'll exit nohz and the recorded tickless load will
  * simply be refreshed on the next update.
  */
5424 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5425}
5426
/*
 * Account the tickless load in the end of a nohz frame.
 */
5430void cpu_load_update_nohz_stop(void)
5431{
5432 unsigned long curr_jiffies = READ_ONCE(jiffies);
5433 struct rq *this_rq = this_rq();
5434 unsigned long load;
5435 struct rq_flags rf;
5436
5437 if (curr_jiffies == this_rq->last_load_update_tick)
5438 return;
5439
5440 load = weighted_cpuload(this_rq);
5441 rq_lock(this_rq, &rf);
5442 update_rq_clock(this_rq);
5443 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5444 rq_unlock(this_rq, &rf);
5445}
5446#else
5447static inline void cpu_load_update_nohz(struct rq *this_rq,
5448 unsigned long curr_jiffies,
5449 unsigned long load) { }
5450#endif
5451
5452static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5453{
5454#ifdef CONFIG_NO_HZ_COMMON
 /* See the mess around cpu_load_update_nohz(). */
5456 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5457#endif
5458 cpu_load_update(this_rq, load, 1);
5459}
5460
/*
 * Called from scheduler_tick()
 */
5464void cpu_load_update_active(struct rq *this_rq)
5465{
5466 unsigned long load = weighted_cpuload(this_rq);
5467
5468 if (tick_nohz_tick_stopped())
5469 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5470 else
5471 cpu_load_update_periodic(this_rq, load);
5472}
5473
/*
 * Return a low guess at the load of a migration-source CPU weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
5481static unsigned long source_load(int cpu, int type)
5482{
5483 struct rq *rq = cpu_rq(cpu);
5484 unsigned long total = weighted_cpuload(rq);
5485
5486 if (type == 0 || !sched_feat(LB_BIAS))
5487 return total;
5488
5489 return min(rq->cpu_load[type-1], total);
5490}
5491
/*
 * Return a high guess at the load of a migration-target CPU weighted
 * according to the scheduling class and "nice" value.
 */
5496static unsigned long target_load(int cpu, int type)
5497{
5498 struct rq *rq = cpu_rq(cpu);
5499 unsigned long total = weighted_cpuload(rq);
5500
5501 if (type == 0 || !sched_feat(LB_BIAS))
5502 return total;
5503
5504 return max(rq->cpu_load[type-1], total);
5505}
5506
5507static unsigned long capacity_of(int cpu)
5508{
5509 return cpu_rq(cpu)->cpu_capacity;
5510}
5511
5512static unsigned long capacity_orig_of(int cpu)
5513{
5514 return cpu_rq(cpu)->cpu_capacity_orig;
5515}
5516
5517static unsigned long cpu_avg_load_per_task(int cpu)
5518{
5519 struct rq *rq = cpu_rq(cpu);
5520 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5521 unsigned long load_avg = weighted_cpuload(rq);
5522
5523 if (nr_running)
5524 return load_avg / nr_running;
5525
5526 return 0;
5527}
5528
5529static void record_wakee(struct task_struct *p)
5530{
 /*
  * Only decay a single time; tasks that have less than 1 wakeup per
  * jiffy will not truncate more than once.
  */
5535 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5536 current->wakee_flips >>= 1;
5537 current->wakee_flip_decay_ts = jiffies;
5538 }
5539
5540 if (current->last_wakee != p) {
5541 current->last_wakee = p;
5542 current->wakee_flips++;
5543 }
5544}
5545
/*
 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
 *
 * A waker of many should wake a different task than the one last awakened
 * at a frequency roughly N times higher than the frequency of the wakee.
 *
 * In order to determine whether we should let the load spread vs
 * consolidating to shared cache, we look for a minimum 'flip' frequency of
 * llc_size in one partner, and a factor of llc_size higher frequency in
 * the other.
 *
 * With both conditions met, we can be relatively sure that the relationship
 * is non-monogamous, with partner count exceeding socket size.
 *
 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
 * whatever is irrelevant, spread criteria is apparent partner count exceeds
 * socket size.
 */
5563static int wake_wide(struct task_struct *p)
5564{
5565 unsigned int master = current->wakee_flips;
5566 unsigned int slave = p->wakee_flips;
5567 int factor = this_cpu_read(sd_llc_size);
5568
5569 if (master < slave)
5570 swap(master, slave);
5571 if (slave < factor || master < slave * factor)
5572 return 0;
5573 return 1;
5574}
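
/*
 * Worked example (hypothetical numbers, llc_size = 4): a server with
 * wakee_flips = 20 waking a client with wakee_flips = 5 gives master = 20,
 * slave = 5; since slave >= factor (5 >= 4) and master >= slave * factor
 * (20 >= 20), wake_wide() returns 1 and the wakeup is spread rather than
 * pulled onto the waker's LLC.
 */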
5575
/*
 * The purpose of wake_affine() is to quickly determine on which CPU we can
 * run soonest. For the purpose of speed we only consider the waking and
 * previous CPU.
 *
 * wake_affine_idle() - only considers 'now', it checks if the waking CPU is
 *			cache-affine and is (or will be) idle.
 *
 * wake_affine_weight() - considers the weight to reflect the average
 *			  scheduling latency of the CPUs. This seems to work
 *			  for the overloaded case.
 */
5588static int
5589wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5590{
 /*
  * If this_cpu is idle, it implies the wakeup is from interrupt
  * context. Only allow the move if cache is shared. Otherwise the
  * interrupt intensive workload could force all tasks onto one
  * node depending on the IO topology or IRQ affinity settings.
  *
  * If the prev_cpu is idle and cache affine then avoid a migration.
  * There is no guarantee that the cache hot data from an interrupt
  * is more important than cache hot data on the prev_cpu and from
  * a cpufreq perspective, it's better to have higher utilisation
  * on one CPU.
  */
5603 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5604 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5605
5606 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5607 return this_cpu;
5608
5609 return nr_cpumask_bits;
5610}
5611
5612static int
5613wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5614 int this_cpu, int prev_cpu, int sync)
5615{
5616 s64 this_eff_load, prev_eff_load;
5617 unsigned long task_load;
5618
5619 this_eff_load = target_load(this_cpu, sd->wake_idx);
5620
5621 if (sync) {
5622 unsigned long current_load = task_h_load(current);
5623
5624 if (current_load > this_eff_load)
5625 return this_cpu;
5626
5627 this_eff_load -= current_load;
5628 }
5629
5630 task_load = task_h_load(p);
5631
5632 this_eff_load += task_load;
5633 if (sched_feat(WA_BIAS))
5634 this_eff_load *= 100;
5635 this_eff_load *= capacity_of(prev_cpu);
5636
5637 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5638 prev_eff_load -= task_load;
5639 if (sched_feat(WA_BIAS))
5640 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5641 prev_eff_load *= capacity_of(this_cpu);
5642
5643
5644
5645
5646
5647
5648
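	/*
	 * For sync wakeups, bias toward this_cpu on ties: the +1 makes
	 * the strictly-less-than comparison below favour the waking CPU
	 * when the effective loads come out equal.
	 */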
5649 if (sync)
5650 prev_eff_load += 1;
5651
5652 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5653}
5654
5655static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5656 int this_cpu, int prev_cpu, int sync)
5657{
5658 int target = nr_cpumask_bits;
5659
5660 if (sched_feat(WA_IDLE))
5661 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5662
5663 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5664 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5665
5666 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5667 if (target == nr_cpumask_bits)
5668 return prev_cpu;
5669
5670 schedstat_inc(sd->ttwu_move_affine);
5671 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5672 return target;
5673}
5674
5675static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
5676
5677static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
5678{
5679 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
5680}
5681
5682
5683
5684
5685
5686
5687
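/*
 * find_idlest_group: find the least-busy / most-spare-capacity group
 * within the sched_domain, or NULL when the local group is at least
 * as good a choice.
 */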
5688static struct sched_group *
5689find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5690 int this_cpu, int sd_flag)
5691{
5692 struct sched_group *idlest = NULL, *group = sd->groups;
5693 struct sched_group *most_spare_sg = NULL;
5694 unsigned long min_runnable_load = ULONG_MAX;
5695 unsigned long this_runnable_load = ULONG_MAX;
5696 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5697 unsigned long most_spare = 0, this_spare = 0;
5698 int load_idx = sd->forkexec_idx;
5699 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5700 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5701 (sd->imbalance_pct-100) / 100;
5702
5703 if (sd_flag & SD_BALANCE_WAKE)
5704 load_idx = sd->wake_idx;
5705
5706 do {
5707 unsigned long load, avg_load, runnable_load;
5708 unsigned long spare_cap, max_spare_cap;
5709 int local_group;
5710 int i;
5711
5712
5713 if (!cpumask_intersects(sched_group_span(group),
5714 &p->cpus_allowed))
5715 continue;
5716
5717 local_group = cpumask_test_cpu(this_cpu,
5718 sched_group_span(group));
5719
5720
5721
5722
5723
5724 avg_load = 0;
5725 runnable_load = 0;
5726 max_spare_cap = 0;
5727
5728 for_each_cpu(i, sched_group_span(group)) {
5729
5730 if (local_group)
5731 load = source_load(i, load_idx);
5732 else
5733 load = target_load(i, load_idx);
5734
5735 runnable_load += load;
5736
5737 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5738
5739 spare_cap = capacity_spare_wake(i, p);
5740
5741 if (spare_cap > max_spare_cap)
5742 max_spare_cap = spare_cap;
5743 }
5744
5745
5746 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5747 group->sgc->capacity;
5748 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5749 group->sgc->capacity;
5750
5751 if (local_group) {
5752 this_runnable_load = runnable_load;
5753 this_avg_load = avg_load;
5754 this_spare = max_spare_cap;
5755 } else {
5756 if (min_runnable_load > (runnable_load + imbalance)) {
5757
5758
5759
5760
5761 min_runnable_load = runnable_load;
5762 min_avg_load = avg_load;
5763 idlest = group;
5764 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5765 (100*min_avg_load > imbalance_scale*avg_load)) {
5766
5767
5768
5769
5770 min_avg_load = avg_load;
5771 idlest = group;
5772 }
5773
5774 if (most_spare < max_spare_cap) {
5775 most_spare = max_spare_cap;
5776 most_spare_sg = group;
5777 }
5778 }
5779 } while (group = group->next, group != sd->groups);
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792 if (sd_flag & SD_BALANCE_FORK)
5793 goto skip_spare;
5794
5795 if (this_spare > task_util(p) / 2 &&
5796 imbalance_scale*this_spare > 100*most_spare)
5797 return NULL;
5798
5799 if (most_spare > task_util(p) / 2)
5800 return most_spare_sg;
5801
5802skip_spare:
5803 if (!idlest)
5804 return NULL;
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814 if ((sd->flags & SD_NUMA) &&
5815 min_runnable_load + imbalance >= this_runnable_load)
5816 return NULL;
5817
5818 if (min_runnable_load > (this_runnable_load + imbalance))
5819 return NULL;
5820
5821 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5822 (100*this_avg_load < imbalance_scale*min_avg_load))
5823 return NULL;
5824
5825 return idlest;
5826}
5827
5828
5829
5830
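/*
 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the
 * group, preferring an idle CPU whose idle state is shallowest
 * (cheapest to exit) and, on ties, most recently entered.
 */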
5831static int
5832find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5833{
5834 unsigned long load, min_load = ULONG_MAX;
5835 unsigned int min_exit_latency = UINT_MAX;
5836 u64 latest_idle_timestamp = 0;
5837 int least_loaded_cpu = this_cpu;
5838 int shallowest_idle_cpu = -1;
5839 int i;
5840
5841
5842 if (group->group_weight == 1)
5843 return cpumask_first(sched_group_span(group));
5844
5845
5846 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
5847 if (available_idle_cpu(i)) {
5848 struct rq *rq = cpu_rq(i);
5849 struct cpuidle_state *idle = idle_get_state(rq);
5850 if (idle && idle->exit_latency < min_exit_latency) {
5851
5852
5853
5854
5855
5856 min_exit_latency = idle->exit_latency;
5857 latest_idle_timestamp = rq->idle_stamp;
5858 shallowest_idle_cpu = i;
5859 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5860 rq->idle_stamp > latest_idle_timestamp) {
5861
5862
5863
5864
5865
5866 latest_idle_timestamp = rq->idle_stamp;
5867 shallowest_idle_cpu = i;
5868 }
5869 } else if (shallowest_idle_cpu == -1) {
5870 load = weighted_cpuload(cpu_rq(i));
5871 if (load < min_load) {
5872 min_load = load;
5873 least_loaded_cpu = i;
5874 }
5875 }
5876 }
5877
5878 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5879}
5880
5881static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5882 int cpu, int prev_cpu, int sd_flag)
5883{
5884 int new_cpu = cpu;
5885
5886 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
5887 return prev_cpu;
5888
5889
5890
5891
5892
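	/*
	 * A forked task has no utilization history yet; for all other
	 * balance types, sync the entity's load average first so the
	 * placement decisions below see up-to-date numbers.
	 */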
5893 if (!(sd_flag & SD_BALANCE_FORK))
5894 sync_entity_load_avg(&p->se);
5895
5896 while (sd) {
5897 struct sched_group *group;
5898 struct sched_domain *tmp;
5899 int weight;
5900
5901 if (!(sd->flags & sd_flag)) {
5902 sd = sd->child;
5903 continue;
5904 }
5905
5906 group = find_idlest_group(sd, p, cpu, sd_flag);
5907 if (!group) {
5908 sd = sd->child;
5909 continue;
5910 }
5911
5912 new_cpu = find_idlest_group_cpu(group, p, cpu);
5913 if (new_cpu == cpu) {
5914
5915 sd = sd->child;
5916 continue;
5917 }
5918
5919
5920 cpu = new_cpu;
5921 weight = sd->span_weight;
5922 sd = NULL;
5923 for_each_domain(cpu, tmp) {
5924 if (weight <= tmp->span_weight)
5925 break;
5926 if (tmp->flags & sd_flag)
5927 sd = tmp;
5928 }
5929 }
5930
5931 return new_cpu;
5932}
5933
5934#ifdef CONFIG_SCHED_SMT
5935DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5936
5937static inline void set_idle_cores(int cpu, int val)
5938{
5939 struct sched_domain_shared *sds;
5940
5941 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5942 if (sds)
5943 WRITE_ONCE(sds->has_idle_cores, val);
5944}
5945
5946static inline bool test_idle_cores(int cpu, bool def)
5947{
5948 struct sched_domain_shared *sds;
5949
5950 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5951 if (sds)
5952 return READ_ONCE(sds->has_idle_cores);
5953
5954 return def;
5955}
5956
5957
5958
5959
5960
5961
5962
5963
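/*
 * Scan the SMT siblings of a CPU that is entering idle; if every
 * sibling is now idle, flag the core as idle in the shared LLC state
 * so that select_idle_core() has something to find.
 */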
5964void __update_idle_core(struct rq *rq)
5965{
5966 int core = cpu_of(rq);
5967 int cpu;
5968
5969 rcu_read_lock();
5970 if (test_idle_cores(core, true))
5971 goto unlock;
5972
5973 for_each_cpu(cpu, cpu_smt_mask(core)) {
5974 if (cpu == core)
5975 continue;
5976
5977 if (!available_idle_cpu(cpu))
5978 goto unlock;
5979 }
5980
5981 set_idle_cores(core, 1);
5982unlock:
5983 rcu_read_unlock();
5984}
5985
5986
5987
5988
5989
5990
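/*
 * Scan the LLC domain for a fully idle core, clearing SMT siblings
 * from the candidate mask as they are visited.  When no idle core is
 * found, clear has_idle_cores so future wakeups can skip this
 * comparatively expensive pass.
 */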
5991static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5992{
5993 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5994 int core, cpu;
5995
5996 if (!static_branch_likely(&sched_smt_present))
5997 return -1;
5998
5999 if (!test_idle_cores(target, false))
6000 return -1;
6001
6002 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6003
6004 for_each_cpu_wrap(core, cpus, target) {
6005 bool idle = true;
6006
6007 for_each_cpu(cpu, cpu_smt_mask(core)) {
6008 cpumask_clear_cpu(cpu, cpus);
6009 if (!available_idle_cpu(cpu))
6010 idle = false;
6011 }
6012
6013 if (idle)
6014 return core;
6015 }
6016
6017
6018
6019
6020 set_idle_cores(target, 0);
6021
6022 return -1;
6023}
6024
6025
6026
6027
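/*
 * Scan the local SMT mask for an idle CPU the task is allowed on.
 */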
6028static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6029{
6030 int cpu;
6031
6032 if (!static_branch_likely(&sched_smt_present))
6033 return -1;
6034
6035 for_each_cpu(cpu, cpu_smt_mask(target)) {
6036 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6037 continue;
6038 if (available_idle_cpu(cpu))
6039 return cpu;
6040 }
6041
6042 return -1;
6043}
6044
6045#else
6046
6047static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6048{
6049 return -1;
6050}
6051
6052static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6053{
6054 return -1;
6055}
6056
6057#endif
6058
6059
6060
6061
6062
6063
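/*
 * Scan the LLC domain for an idle CPU.  The scan costs real time, so
 * it is throttled by comparing this rq's average idle time against
 * the domain's average scan cost, and optionally bounded to a number
 * of CPUs proportional to that ratio (SIS_PROP).
 */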
6064static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6065{
6066 struct sched_domain *this_sd;
6067 u64 avg_cost, avg_idle;
6068 u64 time, cost;
6069 s64 delta;
6070 int cpu, nr = INT_MAX;
6071
6072 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6073 if (!this_sd)
6074 return -1;
6075
6076
6077
6078
6079
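	/*
	 * The 512 divisor is a deliberately large fuzz factor: only
	 * scan when the expected idle time comfortably exceeds the
	 * average cost of a scan, since the cost estimate has high
	 * variance.
	 */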
6080 avg_idle = this_rq()->avg_idle / 512;
6081 avg_cost = this_sd->avg_scan_cost + 1;
6082
6083 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
6084 return -1;
6085
6086 if (sched_feat(SIS_PROP)) {
6087 u64 span_avg = sd->span_weight * avg_idle;
6088 if (span_avg > 4*avg_cost)
6089 nr = div_u64(span_avg, avg_cost);
6090 else
6091 nr = 4;
6092 }
6093
6094 time = local_clock();
6095
6096 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6097 if (!--nr)
6098 return -1;
6099 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6100 continue;
6101 if (available_idle_cpu(cpu))
6102 break;
6103 }
6104
6105 time = local_clock() - time;
6106 cost = this_sd->avg_scan_cost;
6107 delta = (s64)(time - cost) / 8;
6108 this_sd->avg_scan_cost += delta;
6109
6110 return cpu;
6111}
6112
6113
6114
6115
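/*
 * Try to find an idle CPU near @target: target itself, then prev,
 * then a recently used CPU, then an idle core / idle CPU / idle SMT
 * sibling within the LLC domain.
 */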
6116static int select_idle_sibling(struct task_struct *p, int prev, int target)
6117{
6118 struct sched_domain *sd;
6119 int i, recent_used_cpu;
6120
6121 if (available_idle_cpu(target))
6122 return target;
6123
6124
6125
6126
6127 if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6128 return prev;
6129
6130
6131 recent_used_cpu = p->recent_used_cpu;
6132 if (recent_used_cpu != prev &&
6133 recent_used_cpu != target &&
6134 cpus_share_cache(recent_used_cpu, target) &&
6135 available_idle_cpu(recent_used_cpu) &&
6136 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6137
6138
6139
6140
6141 p->recent_used_cpu = prev;
6142 return recent_used_cpu;
6143 }
6144
6145 sd = rcu_dereference(per_cpu(sd_llc, target));
6146 if (!sd)
6147 return target;
6148
6149 i = select_idle_core(p, sd, target);
6150 if ((unsigned)i < nr_cpumask_bits)
6151 return i;
6152
6153 i = select_idle_cpu(p, sd, target);
6154 if ((unsigned)i < nr_cpumask_bits)
6155 return i;
6156
6157 i = select_idle_smt(p, sd, target);
6158 if ((unsigned)i < nr_cpumask_bits)
6159 return i;
6160
6161 return target;
6162}
6163
/*
 * cpu_util() returns the amount of capacity of a CPU that is currently
 * used by CFS tasks, in the range [0..capacity_orig], where
 * capacity_orig is the CPU's maximum possible capacity.
 *
 * The estimate is based on the PELT util_avg of the root cfs_rq.
 * Because of rounding, and because a CPU may temporarily run at less
 * than its original capacity, util_avg can transiently exceed
 * capacity_orig; hence the clamp at the end.
 *
 * With UTIL_EST enabled we also consider util_est.enqueued, the sum of
 * the estimated utilization of the currently enqueued tasks; taking
 * the max of the two gives a more stable figure when tasks have
 * recently slept and their util_avg has decayed.
 */
6202static inline unsigned long cpu_util(int cpu)
6203{
6204 struct cfs_rq *cfs_rq;
6205 unsigned int util;
6206
6207 cfs_rq = &cpu_rq(cpu)->cfs;
6208 util = READ_ONCE(cfs_rq->avg.util_avg);
6209
6210 if (sched_feat(UTIL_EST))
6211 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6212
6213 return min_t(unsigned long, util, capacity_orig_of(cpu));
6214}
6215
6216
6217
6218
6219
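/*
 * cpu_util_wake: compute CPU utilization with the contribution of the
 * waking task @p removed; p may still be accounted on its previous
 * CPU at this point.
 */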
6220static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
6221{
6222 struct cfs_rq *cfs_rq;
6223 unsigned int util;
6224
6225
6226 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6227 return cpu_util(cpu);
6228
6229 cfs_rq = &cpu_rq(cpu)->cfs;
6230 util = READ_ONCE(cfs_rq->avg.util_avg);
6231
6232
6233 util -= min_t(unsigned int, util, task_util(p));
	/*
	 * With UTIL_EST, also consider the estimated utilization of the
	 * enqueued tasks.  A waking @p has already been dequeued from
	 * this rq, so util_est.enqueued should no longer include its
	 * contribution and needs no correction here; the max() guards
	 * against util_avg having decayed while tasks slept.
	 */
6261 if (sched_feat(UTIL_EST))
6262 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6263
6264
6265
6266
6267
6268
6269 return min_t(unsigned long, util, capacity_orig_of(cpu));
6270}
6271
6272
6273
6274
6275
6276
6277
6278
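/*
 * Disable affine wakeups (return true) on asymmetric-capacity systems
 * when the task's utilization would not fit comfortably, within
 * capacity_margin, on the smaller of the two candidate CPUs.
 */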
6279static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6280{
6281 long min_cap, max_cap;
6282
6283 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6284 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6285
6286
6287 if (max_cap - min_cap < max_cap >> 3)
6288 return 0;
6289
6290
6291 sync_entity_load_avg(&p->se);
6292
6293 return min_cap * 1024 < task_util(p) * capacity_margin;
6294}
6295
/*
 * select_task_rq_fair: Select the target run queue for a waking task
 * in domains that have the relevant sd_flag set; in practice this is
 * SD_BALANCE_WAKE, SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or,
 * under certain conditions, an idle sibling CPU if the domain has
 * SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.  Preemption must be disabled.
 */
6308static int
6309select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6310{
6311 struct sched_domain *tmp, *sd = NULL;
6312 int cpu = smp_processor_id();
6313 int new_cpu = prev_cpu;
6314 int want_affine = 0;
6315 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6316
6317 if (sd_flag & SD_BALANCE_WAKE) {
6318 record_wakee(p);
6319 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
6320 && cpumask_test_cpu(cpu, &p->cpus_allowed);
6321 }
6322
6323 rcu_read_lock();
6324 for_each_domain(cpu, tmp) {
6325 if (!(tmp->flags & SD_LOAD_BALANCE))
6326 break;
6327
6328
6329
6330
6331
6332 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6333 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6334 if (cpu != prev_cpu)
6335 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6336
6337 sd = NULL;
6338 break;
6339 }
6340
6341 if (tmp->flags & sd_flag)
6342 sd = tmp;
6343 else if (!want_affine)
6344 break;
6345 }
6346
6347 if (unlikely(sd)) {
6348
6349 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6350 } else if (sd_flag & SD_BALANCE_WAKE) {
6351
6352
6353 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6354
6355 if (want_affine)
6356 current->recent_used_cpu = cpu;
6357 }
6358 rcu_read_unlock();
6359
6360 return new_cpu;
6361}
6362
6363static void detach_entity_cfs_rq(struct sched_entity *se);
6364
6365
6366
6367
6368
6369
6370static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6371{
6372
6373
6374
6375
6376
6377
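	/*
	 * A waking task's vruntime must be renormalized for its new
	 * cfs_rq: subtract the old cfs_rq's min_vruntime here; the new
	 * cfs_rq's min_vruntime is added back at enqueue time.  On
	 * 32-bit, min_vruntime is read with a copy/retry loop to obtain
	 * a consistent 64-bit value without taking the rq lock.
	 */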
6378 if (p->state == TASK_WAKING) {
6379 struct sched_entity *se = &p->se;
6380 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6381 u64 min_vruntime;
6382
6383#ifndef CONFIG_64BIT
6384 u64 min_vruntime_copy;
6385
6386 do {
6387 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6388 smp_rmb();
6389 min_vruntime = cfs_rq->min_vruntime;
6390 } while (min_vruntime != min_vruntime_copy);
6391#else
6392 min_vruntime = cfs_rq->min_vruntime;
6393#endif
6394
6395 se->vruntime -= min_vruntime;
6396 }
6397
6398 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6399
6400
6401
6402
6403 lockdep_assert_held(&task_rq(p)->lock);
6404 detach_entity_cfs_rq(&p->se);
6405
6406 } else {
6407
6408
6409
6410
6411
6412
6413
6414
6415 remove_entity_load_avg(&p->se);
6416 }
6417
6418
6419 p->se.avg.last_update_time = 0;
6420
6421
6422 p->se.exec_start = 0;
6423
6424 update_scan_period(p, new_cpu);
6425}
6426
6427static void task_dead_fair(struct task_struct *p)
6428{
6429 remove_entity_load_avg(&p->se);
6430}
6431#endif
6432
6433static unsigned long wakeup_gran(struct sched_entity *se)
6434{
6435 unsigned long gran = sysctl_sched_wakeup_granularity;
	/*
	 * Convert the wakeup granularity from real time into virtual
	 * time using the *wakee's* weight.  A heavier se burns through
	 * vruntime more slowly and so needs a smaller virtual gran for
	 * the same real-time protection, while a light se is penalized
	 * with a larger one.
	 */
6450 return calc_delta_fair(gran, se);
6451}
6452
/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 * 'se' preempts 'curr' only when it leads in virtual time by more
 * than the (weight-scaled) wakeup granularity 'g'.
 */
6467static int
6468wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6469{
6470 s64 gran, vdiff = curr->vruntime - se->vruntime;
6471
6472 if (vdiff <= 0)
6473 return -1;
6474
6475 gran = wakeup_gran(se);
6476 if (vdiff > gran)
6477 return 1;
6478
6479 return 0;
6480}
6481
6482static void set_last_buddy(struct sched_entity *se)
6483{
6484 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6485 return;
6486
6487 for_each_sched_entity(se) {
6488 if (SCHED_WARN_ON(!se->on_rq))
6489 return;
6490 cfs_rq_of(se)->last = se;
6491 }
6492}
6493
6494static void set_next_buddy(struct sched_entity *se)
6495{
6496 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6497 return;
6498
6499 for_each_sched_entity(se) {
6500 if (SCHED_WARN_ON(!se->on_rq))
6501 return;
6502 cfs_rq_of(se)->next = se;
6503 }
6504}
6505
6506static void set_skip_buddy(struct sched_entity *se)
6507{
6508 for_each_sched_entity(se)
6509 cfs_rq_of(se)->skip = se;
6510}
6511
6512
6513
6514
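/*
 * Preempt the current task with a newly woken task if needed:
 */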
6515static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6516{
6517 struct task_struct *curr = rq->curr;
6518 struct sched_entity *se = &curr->se, *pse = &p->se;
6519 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6520 int scale = cfs_rq->nr_running >= sched_nr_latency;
6521 int next_buddy_marked = 0;
6522
6523 if (unlikely(se == pse))
6524 return;
6525
6526
6527
6528
6529
6530
6531
6532 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6533 return;
6534
6535 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6536 set_next_buddy(pse);
6537 next_buddy_marked = 1;
6538 }
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550 if (test_tsk_need_resched(curr))
6551 return;
6552
6553
6554 if (unlikely(curr->policy == SCHED_IDLE) &&
6555 likely(p->policy != SCHED_IDLE))
6556 goto preempt;
6557
6558
6559
6560
6561
6562 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6563 return;
6564
6565 find_matching_se(&se, &pse);
6566 update_curr(cfs_rq_of(se));
6567 BUG_ON(!pse);
6568 if (wakeup_preempt_entity(se, pse) == 1) {
6569
6570
6571
6572
6573 if (!next_buddy_marked)
6574 set_next_buddy(pse);
6575 goto preempt;
6576 }
6577
6578 return;
6579
6580preempt:
6581 resched_curr(rq);
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591 if (unlikely(!se->on_rq || curr == rq->idle))
6592 return;
6593
6594 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6595 set_last_buddy(se);
6596}
6597
6598static struct task_struct *
6599pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6600{
6601 struct cfs_rq *cfs_rq = &rq->cfs;
6602 struct sched_entity *se;
6603 struct task_struct *p;
6604 int new_tasks;
6605
6606again:
6607 if (!cfs_rq->nr_running)
6608 goto idle;
6609
6610#ifdef CONFIG_FAIR_GROUP_SCHED
6611 if (prev->sched_class != &fair_sched_class)
6612 goto simple;
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622 do {
6623 struct sched_entity *curr = cfs_rq->curr;
6624
6625
6626
6627
6628
6629
6630
6631 if (curr) {
6632 if (curr->on_rq)
6633 update_curr(cfs_rq);
6634 else
6635 curr = NULL;
6636
6637
6638
6639
6640
6641
6642
6643 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6644 cfs_rq = &rq->cfs;
6645
6646 if (!cfs_rq->nr_running)
6647 goto idle;
6648
6649 goto simple;
6650 }
6651 }
6652
6653 se = pick_next_entity(cfs_rq, curr);
6654 cfs_rq = group_cfs_rq(se);
6655 } while (cfs_rq);
6656
6657 p = task_of(se);
6658
6659
6660
6661
6662
6663
6664 if (prev != p) {
6665 struct sched_entity *pse = &prev->se;
6666
6667 while (!(cfs_rq = is_same_group(se, pse))) {
6668 int se_depth = se->depth;
6669 int pse_depth = pse->depth;
6670
6671 if (se_depth <= pse_depth) {
6672 put_prev_entity(cfs_rq_of(pse), pse);
6673 pse = parent_entity(pse);
6674 }
6675 if (se_depth >= pse_depth) {
6676 set_next_entity(cfs_rq_of(se), se);
6677 se = parent_entity(se);
6678 }
6679 }
6680
6681 put_prev_entity(cfs_rq, pse);
6682 set_next_entity(cfs_rq, se);
6683 }
6684
6685 goto done;
6686simple:
6687#endif
6688
6689 put_prev_task(rq, prev);
6690
6691 do {
6692 se = pick_next_entity(cfs_rq, NULL);
6693 set_next_entity(cfs_rq, se);
6694 cfs_rq = group_cfs_rq(se);
6695 } while (cfs_rq);
6696
6697 p = task_of(se);
6698
6699done: __maybe_unused;
6700#ifdef CONFIG_SMP
6701
6702
6703
6704
6705
6706 list_move(&p->se.group_node, &rq->cfs_tasks);
6707#endif
6708
6709 if (hrtick_enabled(rq))
6710 hrtick_start_fair(rq, p);
6711
6712 return p;
6713
6714idle:
6715 new_tasks = idle_balance(rq, rf);
6716
6717
6718
6719
6720
6721
6722 if (new_tasks < 0)
6723 return RETRY_TASK;
6724
6725 if (new_tasks > 0)
6726 goto again;
6727
6728 return NULL;
6729}
6730
6731
6732
6733
6734static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
6735{
6736 struct sched_entity *se = &prev->se;
6737 struct cfs_rq *cfs_rq;
6738
6739 for_each_sched_entity(se) {
6740 cfs_rq = cfs_rq_of(se);
6741 put_prev_entity(cfs_rq, se);
6742 }
6743}
6744
6745
6746
6747
6748
6749
6750static void yield_task_fair(struct rq *rq)
6751{
6752 struct task_struct *curr = rq->curr;
6753 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6754 struct sched_entity *se = &curr->se;
6755
6756
6757
6758
6759 if (unlikely(rq->nr_running == 1))
6760 return;
6761
6762 clear_buddies(cfs_rq, se);
6763
6764 if (curr->policy != SCHED_BATCH) {
6765 update_rq_clock(rq);
6766
6767
6768
6769 update_curr(cfs_rq);
6770
6771
6772
6773
6774
6775 rq_clock_skip_update(rq);
6776 }
6777
6778 set_skip_buddy(se);
6779}
6780
6781static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6782{
6783 struct sched_entity *se = &p->se;
6784
6785
6786 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
6787 return false;
6788
6789
6790 set_next_buddy(se);
6791
6792 yield_task_fair(rq);
6793
6794 return true;
6795}
6796
6797#ifdef CONFIG_SMP
/*
 * Fair scheduling class load-balancing.
 *
 * The purpose of load-balancing is to extend the fairness the per-CPU
 * scheduler provides across CPUs: ideally each CPU carries an equal
 * share of the total weighted load.  A perfect global solution is too
 * expensive, so balancing is done hierarchically along the
 * sched_domain topology: small, cache-friendly domains are balanced
 * often, larger domains progressively less often, and each level only
 * needs to even out load between its groups.
 *
 * The load values used are time-averaged (the cpu_load[] rings and the
 * PELT averages) so that transient spikes do not make tasks ping-pong
 * between CPUs.
 *
 * With cgroup group scheduling, a task's contribution to its CPU's
 * load depends on its position in the task_group hierarchy; this is
 * approximated by task_h_load(), which scales the task's load by the
 * ratio of each parent group's share along the path to the root.
 */
6916static unsigned long __read_mostly max_load_balance_interval = HZ/10;
6917
6918enum fbq_type { regular, remote, all };
6919
6920#define LBF_ALL_PINNED 0x01
6921#define LBF_NEED_BREAK 0x02
6922#define LBF_DST_PINNED 0x04
6923#define LBF_SOME_PINNED 0x08
6924#define LBF_NOHZ_STATS 0x10
6925#define LBF_NOHZ_AGAIN 0x20
6926
6927struct lb_env {
6928 struct sched_domain *sd;
6929
6930 struct rq *src_rq;
6931 int src_cpu;
6932
6933 int dst_cpu;
6934 struct rq *dst_rq;
6935
6936 struct cpumask *dst_grpmask;
6937 int new_dst_cpu;
6938 enum cpu_idle_type idle;
6939 long imbalance;
6940
6941 struct cpumask *cpus;
6942
6943 unsigned int flags;
6944
6945 unsigned int loop;
6946 unsigned int loop_break;
6947 unsigned int loop_max;
6948
6949 enum fbq_type fbq_type;
6950 struct list_head tasks;
6951};
6952
6953
6954
6955
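/*
 * Is this task likely cache-hot on its current CPU?
 */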
6956static int task_hot(struct task_struct *p, struct lb_env *env)
6957{
6958 s64 delta;
6959
6960 lockdep_assert_held(&env->src_rq->lock);
6961
6962 if (p->sched_class != &fair_sched_class)
6963 return 0;
6964
6965 if (unlikely(p->policy == SCHED_IDLE))
6966 return 0;
6967
6968
6969
6970
6971 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
6972 (&p->se == cfs_rq_of(&p->se)->next ||
6973 &p->se == cfs_rq_of(&p->se)->last))
6974 return 1;
6975
6976 if (sysctl_sched_migration_cost == -1)
6977 return 1;
6978 if (sysctl_sched_migration_cost == 0)
6979 return 0;
6980
6981 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
6982
6983 return delta < (s64)sysctl_sched_migration_cost;
6984}
6985
6986#ifdef CONFIG_NUMA_BALANCING
6987
6988
6989
6990
6991
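/*
 * Returns 1 if migrating @p to env->dst_cpu would degrade NUMA
 * locality, 0 if it would improve it, and -1 when locality does not
 * apply (NUMA balancing disabled, no fault data, or same node).
 */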
6992static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
6993{
6994 struct numa_group *numa_group = rcu_dereference(p->numa_group);
6995 unsigned long src_weight, dst_weight;
6996 int src_nid, dst_nid, dist;
6997
6998 if (!static_branch_likely(&sched_numa_balancing))
6999 return -1;
7000
7001 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7002 return -1;
7003
7004 src_nid = cpu_to_node(env->src_cpu);
7005 dst_nid = cpu_to_node(env->dst_cpu);
7006
7007 if (src_nid == dst_nid)
7008 return -1;
7009
7010
7011 if (src_nid == p->numa_preferred_nid) {
7012 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7013 return 1;
7014 else
7015 return -1;
7016 }
7017
7018
7019 if (dst_nid == p->numa_preferred_nid)
7020 return 0;
7021
7022
7023 if (env->idle == CPU_IDLE)
7024 return -1;
7025
7026 dist = node_distance(src_nid, dst_nid);
7027 if (numa_group) {
7028 src_weight = group_weight(p, src_nid, dist);
7029 dst_weight = group_weight(p, dst_nid, dist);
7030 } else {
7031 src_weight = task_weight(p, src_nid, dist);
7032 dst_weight = task_weight(p, dst_nid, dist);
7033 }
7034
7035 return dst_weight < src_weight;
7036}
7037
7038#else
7039static inline int migrate_degrades_locality(struct task_struct *p,
7040 struct lb_env *env)
7041{
7042 return -1;
7043}
7044#endif
7045
7046
7047
7048
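/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */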
7049static
7050int can_migrate_task(struct task_struct *p, struct lb_env *env)
7051{
7052 int tsk_cache_hot;
7053
7054 lockdep_assert_held(&env->src_rq->lock);
7055
7056
7057
7058
7059
7060
7061
7062
7063 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7064 return 0;
7065
7066 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
7067 int cpu;
7068
7069 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7070
7071 env->flags |= LBF_SOME_PINNED;
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7082 return 0;
7083
7084
7085 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7086 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7087 env->flags |= LBF_DST_PINNED;
7088 env->new_dst_cpu = cpu;
7089 break;
7090 }
7091 }
7092
7093 return 0;
7094 }
7095
7096
7097 env->flags &= ~LBF_ALL_PINNED;
7098
7099 if (task_running(env->src_rq, p)) {
7100 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7101 return 0;
7102 }
7103
7104
7105
7106
7107
7108
7109
7110 tsk_cache_hot = migrate_degrades_locality(p, env);
7111 if (tsk_cache_hot == -1)
7112 tsk_cache_hot = task_hot(p, env);
7113
7114 if (tsk_cache_hot <= 0 ||
7115 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7116 if (tsk_cache_hot == 1) {
7117 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7118 schedstat_inc(p->se.statistics.nr_forced_migrations);
7119 }
7120 return 1;
7121 }
7122
7123 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7124 return 0;
7125}
7126
7127
7128
7129
7130static void detach_task(struct task_struct *p, struct lb_env *env)
7131{
7132 lockdep_assert_held(&env->src_rq->lock);
7133
7134 p->on_rq = TASK_ON_RQ_MIGRATING;
7135 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7136 set_task_cpu(p, env->dst_cpu);
7137}
7138
7139
7140
7141
7142
7143
7144
7145static struct task_struct *detach_one_task(struct lb_env *env)
7146{
7147 struct task_struct *p;
7148
7149 lockdep_assert_held(&env->src_rq->lock);
7150
7151 list_for_each_entry_reverse(p,
7152 &env->src_rq->cfs_tasks, se.group_node) {
7153 if (!can_migrate_task(p, env))
7154 continue;
7155
7156 detach_task(p, env);
7157
7158
7159
7160
7161
7162
7163
7164 schedstat_inc(env->sd->lb_gained[env->idle]);
7165 return p;
7166 }
7167 return NULL;
7168}
7169
7170static const unsigned int sched_nr_migrate_break = 32;
7171
7172
7173
7174
7175
7176
7177
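/*
 * detach_tasks() -- tries to detach up to env->imbalance weighted load
 * from the busiest runqueue, as part of a balance operation within the
 * domain.  Returns the number of detached tasks, 0 otherwise.
 */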
7178static int detach_tasks(struct lb_env *env)
7179{
7180 struct list_head *tasks = &env->src_rq->cfs_tasks;
7181 struct task_struct *p;
7182 unsigned long load;
7183 int detached = 0;
7184
7185 lockdep_assert_held(&env->src_rq->lock);
7186
7187 if (env->imbalance <= 0)
7188 return 0;
7189
7190 while (!list_empty(tasks)) {
7191
7192
7193
7194
7195 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7196 break;
7197
7198 p = list_last_entry(tasks, struct task_struct, se.group_node);
7199
7200 env->loop++;
7201
7202 if (env->loop > env->loop_max)
7203 break;
7204
7205
7206 if (env->loop > env->loop_break) {
7207 env->loop_break += sched_nr_migrate_break;
7208 env->flags |= LBF_NEED_BREAK;
7209 break;
7210 }
7211
7212 if (!can_migrate_task(p, env))
7213 goto next;
7214
7215 load = task_h_load(p);
7216
7217 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7218 goto next;
7219
7220 if ((load / 2) > env->imbalance)
7221 goto next;
7222
7223 detach_task(p, env);
7224 list_add(&p->se.group_node, &env->tasks);
7225
7226 detached++;
7227 env->imbalance -= load;
7228
7229#ifdef CONFIG_PREEMPT
7230
7231
7232
7233
7234
7235 if (env->idle == CPU_NEWLY_IDLE)
7236 break;
7237#endif
7238
7239
7240
7241
7242
7243 if (env->imbalance <= 0)
7244 break;
7245
7246 continue;
7247next:
7248 list_move(&p->se.group_node, tasks);
7249 }
7250
7251
7252
7253
7254
7255
7256 schedstat_add(env->sd->lb_gained[env->idle], detached);
7257
7258 return detached;
7259}
7260
7261
7262
7263
7264static void attach_task(struct rq *rq, struct task_struct *p)
7265{
7266 lockdep_assert_held(&rq->lock);
7267
7268 BUG_ON(task_rq(p) != rq);
7269 activate_task(rq, p, ENQUEUE_NOCLOCK);
7270 p->on_rq = TASK_ON_RQ_QUEUED;
7271 check_preempt_curr(rq, p, 0);
7272}
7273
7274
7275
7276
7277
7278static void attach_one_task(struct rq *rq, struct task_struct *p)
7279{
7280 struct rq_flags rf;
7281
7282 rq_lock(rq, &rf);
7283 update_rq_clock(rq);
7284 attach_task(rq, p);
7285 rq_unlock(rq, &rf);
7286}
7287
7288
7289
7290
7291
7292static void attach_tasks(struct lb_env *env)
7293{
7294 struct list_head *tasks = &env->tasks;
7295 struct task_struct *p;
7296 struct rq_flags rf;
7297
7298 rq_lock(env->dst_rq, &rf);
7299 update_rq_clock(env->dst_rq);
7300
7301 while (!list_empty(tasks)) {
7302 p = list_first_entry(tasks, struct task_struct, se.group_node);
7303 list_del_init(&p->se.group_node);
7304
7305 attach_task(env->dst_rq, p);
7306 }
7307
7308 rq_unlock(env->dst_rq, &rf);
7309}
7310
7311static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7312{
7313 if (cfs_rq->avg.load_avg)
7314 return true;
7315
7316 if (cfs_rq->avg.util_avg)
7317 return true;
7318
7319 return false;
7320}
7321
7322static inline bool others_have_blocked(struct rq *rq)
7323{
7324 if (READ_ONCE(rq->avg_rt.util_avg))
7325 return true;
7326
7327 if (READ_ONCE(rq->avg_dl.util_avg))
7328 return true;
7329
7330#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
7331 if (READ_ONCE(rq->avg_irq.util_avg))
7332 return true;
7333#endif
7334
7335 return false;
7336}
7337
7338#ifdef CONFIG_FAIR_GROUP_SCHED
7339
7340static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7341{
7342 if (cfs_rq->load.weight)
7343 return false;
7344
7345 if (cfs_rq->avg.load_sum)
7346 return false;
7347
7348 if (cfs_rq->avg.util_sum)
7349 return false;
7350
7351 if (cfs_rq->avg.runnable_load_sum)
7352 return false;
7353
7354 return true;
7355}
7356
7357static void update_blocked_averages(int cpu)
7358{
7359 struct rq *rq = cpu_rq(cpu);
7360 struct cfs_rq *cfs_rq, *pos;
7361 const struct sched_class *curr_class;
7362 struct rq_flags rf;
7363 bool done = true;
7364
7365 rq_lock_irqsave(rq, &rf);
7366 update_rq_clock(rq);
7367
7368
7369
7370
7371
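	/*
	 * Iterate the leaf cfs_rq list bottom-up, so that a parent
	 * group's averages are updated after its children's fresh
	 * contributions have been folded in.
	 */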
7372 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7373 struct sched_entity *se;
7374
7375
7376 if (throttled_hierarchy(cfs_rq))
7377 continue;
7378
7379 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
7380 update_tg_load_avg(cfs_rq, 0);
7381
7382
7383 se = cfs_rq->tg->se[cpu];
7384 if (se && !skip_blocked_update(se))
7385 update_load_avg(cfs_rq_of(se), se, 0);
7386
7387
7388
7389
7390
7391 if (cfs_rq_is_decayed(cfs_rq))
7392 list_del_leaf_cfs_rq(cfs_rq);
7393
7394
7395 if (cfs_rq_has_blocked(cfs_rq))
7396 done = false;
7397 }
7398
7399 curr_class = rq->curr->sched_class;
7400 update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
7401 update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
7402 update_irq_load_avg(rq, 0);
7403
7404 if (others_have_blocked(rq))
7405 done = false;
7406
7407#ifdef CONFIG_NO_HZ_COMMON
7408 rq->last_blocked_load_update_tick = jiffies;
7409 if (done)
7410 rq->has_blocked_load = 0;
7411#endif
7412 rq_unlock_irqrestore(rq, &rf);
7413}
7414
7415
7416
7417
7418
7419
7420static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7421{
7422 struct rq *rq = rq_of(cfs_rq);
7423 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7424 unsigned long now = jiffies;
7425 unsigned long load;
7426
7427 if (cfs_rq->last_h_load_update == now)
7428 return;
7429
7430 cfs_rq->h_load_next = NULL;
7431 for_each_sched_entity(se) {
7432 cfs_rq = cfs_rq_of(se);
7433 cfs_rq->h_load_next = se;
7434 if (cfs_rq->last_h_load_update == now)
7435 break;
7436 }
7437
7438 if (!se) {
7439 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7440 cfs_rq->last_h_load_update = now;
7441 }
7442
7443 while ((se = cfs_rq->h_load_next) != NULL) {
7444 load = cfs_rq->h_load;
7445 load = div64_ul(load * se->avg.load_avg,
7446 cfs_rq_load_avg(cfs_rq) + 1);
7447 cfs_rq = group_cfs_rq(se);
7448 cfs_rq->h_load = load;
7449 cfs_rq->last_h_load_update = now;
7450 }
7451}
7452
7453static unsigned long task_h_load(struct task_struct *p)
7454{
7455 struct cfs_rq *cfs_rq = task_cfs_rq(p);
7456
7457 update_cfs_rq_h_load(cfs_rq);
7458 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7459 cfs_rq_load_avg(cfs_rq) + 1);
7460}
7461#else
7462static inline void update_blocked_averages(int cpu)
7463{
7464 struct rq *rq = cpu_rq(cpu);
7465 struct cfs_rq *cfs_rq = &rq->cfs;
7466 const struct sched_class *curr_class;
7467 struct rq_flags rf;
7468
7469 rq_lock_irqsave(rq, &rf);
7470 update_rq_clock(rq);
7471 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7472
7473 curr_class = rq->curr->sched_class;
7474 update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
7475 update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
7476 update_irq_load_avg(rq, 0);
7477#ifdef CONFIG_NO_HZ_COMMON
7478 rq->last_blocked_load_update_tick = jiffies;
7479 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7480 rq->has_blocked_load = 0;
7481#endif
7482 rq_unlock_irqrestore(rq, &rf);
7483}
7484
7485static unsigned long task_h_load(struct task_struct *p)
7486{
7487 return p->se.avg.load_avg;
7488}
7489#endif
7490
7491
7492
7493enum group_type {
7494 group_other = 0,
7495 group_imbalanced,
7496 group_overloaded,
7497};
7498
7499
7500
7501
7502struct sg_lb_stats {
7503 unsigned long avg_load;
7504 unsigned long group_load;
7505 unsigned long sum_weighted_load;
7506 unsigned long load_per_task;
7507 unsigned long group_capacity;
7508 unsigned long group_util;
7509 unsigned int sum_nr_running;
7510 unsigned int idle_cpus;
7511 unsigned int group_weight;
7512 enum group_type group_type;
7513 int group_no_capacity;
7514#ifdef CONFIG_NUMA_BALANCING
7515 unsigned int nr_numa_running;
7516 unsigned int nr_preferred_running;
7517#endif
7518};
7519
7520
7521
7522
7523
7524struct sd_lb_stats {
7525 struct sched_group *busiest;
7526 struct sched_group *local;
7527 unsigned long total_running;
7528 unsigned long total_load;
7529 unsigned long total_capacity;
7530 unsigned long avg_load;
7531
7532 struct sg_lb_stats busiest_stat;
7533 struct sg_lb_stats local_stat;
7534};
7535
7536static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7537{
7538
7539
7540
7541
7542
7543
7544 *sds = (struct sd_lb_stats){
7545 .busiest = NULL,
7546 .local = NULL,
7547 .total_running = 0UL,
7548 .total_load = 0UL,
7549 .total_capacity = 0UL,
7550 .busiest_stat = {
7551 .avg_load = 0UL,
7552 .sum_nr_running = 0,
7553 .group_type = group_other,
7554 },
7555 };
7556}
7557
7558
7559
7560
7561
7562
7563
7564
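/*
 * get_sd_load_idx - obtain the cpu_load[] index appropriate for the
 * balancing context: busy, newly idle, or idle.
 */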
7565static inline int get_sd_load_idx(struct sched_domain *sd,
7566 enum cpu_idle_type idle)
7567{
7568 int load_idx;
7569
7570 switch (idle) {
7571 case CPU_NOT_IDLE:
7572 load_idx = sd->busy_idx;
7573 break;
7574
7575 case CPU_NEWLY_IDLE:
7576 load_idx = sd->newidle_idx;
7577 break;
7578 default:
7579 load_idx = sd->idle_idx;
7580 break;
7581 }
7582
7583 return load_idx;
7584}
7585
7586static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7587{
7588 struct rq *rq = cpu_rq(cpu);
7589 unsigned long max = arch_scale_cpu_capacity(sd, cpu);
7590 unsigned long used, free;
7591 unsigned long irq;
7592
7593 irq = cpu_util_irq(rq);
7594
7595 if (unlikely(irq >= max))
7596 return 1;
7597
7598 used = READ_ONCE(rq->avg_rt.util_avg);
7599 used += READ_ONCE(rq->avg_dl.util_avg);
7600
7601 if (unlikely(used >= max))
7602 return 1;
7603
7604 free = max - used;
7605
7606 return scale_irq_capacity(free, irq, max);
7607}
7608
7609static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7610{
7611 unsigned long capacity = scale_rt_capacity(sd, cpu);
7612 struct sched_group *sdg = sd->groups;
7613
7614 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
7615
7616 if (!capacity)
7617 capacity = 1;
7618
7619 cpu_rq(cpu)->cpu_capacity = capacity;
7620 sdg->sgc->capacity = capacity;
7621 sdg->sgc->min_capacity = capacity;
7622}
7623
7624void update_group_capacity(struct sched_domain *sd, int cpu)
7625{
7626 struct sched_domain *child = sd->child;
7627 struct sched_group *group, *sdg = sd->groups;
7628 unsigned long capacity, min_capacity;
7629 unsigned long interval;
7630
7631 interval = msecs_to_jiffies(sd->balance_interval);
7632 interval = clamp(interval, 1UL, max_load_balance_interval);
7633 sdg->sgc->next_update = jiffies + interval;
7634
7635 if (!child) {
7636 update_cpu_capacity(sd, cpu);
7637 return;
7638 }
7639
7640 capacity = 0;
7641 min_capacity = ULONG_MAX;
7642
7643 if (child->flags & SD_OVERLAP) {
7644
7645
7646
7647
7648
7649 for_each_cpu(cpu, sched_group_span(sdg)) {
7650 struct sched_group_capacity *sgc;
7651 struct rq *rq = cpu_rq(cpu);
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664 if (unlikely(!rq->sd)) {
7665 capacity += capacity_of(cpu);
7666 } else {
7667 sgc = rq->sd->groups->sgc;
7668 capacity += sgc->capacity;
7669 }
7670
7671 min_capacity = min(capacity, min_capacity);
7672 }
7673 } else {
7674
7675
7676
7677
7678
7679 group = child->groups;
7680 do {
7681 struct sched_group_capacity *sgc = group->sgc;
7682
7683 capacity += sgc->capacity;
7684 min_capacity = min(sgc->min_capacity, min_capacity);
7685 group = group->next;
7686 } while (group != child->groups);
7687 }
7688
7689 sdg->sgc->capacity = capacity;
7690 sdg->sgc->min_capacity = min_capacity;
7691}
7692
7693
7694
7695
7696
7697
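/*
 * Check whether the rq's remaining capacity is noticeably lower than
 * its original capacity (e.g. due to RT/DL/IRQ pressure), using
 * sd->imbalance_pct as the threshold.  For example, with
 * imbalance_pct = 117 the rq is flagged once roughly 15% of its
 * original capacity is consumed by non-CFS activity.
 */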
7698static inline int
7699check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7700{
7701 return ((rq->cpu_capacity * sd->imbalance_pct) <
7702 (rq->cpu_capacity_orig * 100));
7703}
7704
/*
 * Group imbalance indicates that a lower sched_domain level could not
 * spread load evenly because of task affinity constraints.  Consider
 * two groups { 0 1 } { 2 3 } with tasks allowed only on CPUs 0 and 3:
 * the lower domains can never repair the resulting imbalance.  So when
 * a balance attempt fails because of pinned tasks (LBF_SOME_PINNED),
 * load_balance() sets the parent group's sgc->imbalance flag, letting
 * the next level up treat the group as busiest even when the load
 * averages alone would not justify it.
 */
7734static inline int sg_imbalanced(struct sched_group *group)
7735{
7736 return group->sgc->imbalance;
7737}
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
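/*
 * group_has_capacity: returns true if the group has spare capacity:
 * fewer running tasks than CPUs, or utilization comfortably below the
 * group's capacity given sd->imbalance_pct.
 */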
7751static inline bool
7752group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
7753{
7754 if (sgs->sum_nr_running < sgs->group_weight)
7755 return true;
7756
7757 if ((sgs->group_capacity * 100) >
7758 (sgs->group_util * env->sd->imbalance_pct))
7759 return true;
7760
7761 return false;
7762}
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772static inline bool
7773group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7774{
7775 if (sgs->sum_nr_running <= sgs->group_weight)
7776 return false;
7777
7778 if ((sgs->group_capacity * 100) <
7779 (sgs->group_util * env->sd->imbalance_pct))
7780 return true;
7781
7782 return false;
7783}
7784
7785
7786
7787
7788
7789static inline bool
7790group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7791{
7792 return sg->sgc->min_capacity * capacity_margin <
7793 ref->sgc->min_capacity * 1024;
7794}
7795
7796static inline enum
7797group_type group_classify(struct sched_group *group,
7798 struct sg_lb_stats *sgs)
7799{
7800 if (sgs->group_no_capacity)
7801 return group_overloaded;
7802
7803 if (sg_imbalanced(group))
7804 return group_imbalanced;
7805
7806 return group_other;
7807}
7808
7809static bool update_nohz_stats(struct rq *rq, bool force)
7810{
7811#ifdef CONFIG_NO_HZ_COMMON
7812 unsigned int cpu = rq->cpu;
7813
7814 if (!rq->has_blocked_load)
7815 return false;
7816
7817 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
7818 return false;
7819
7820 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
7821 return true;
7822
7823 update_blocked_averages(cpu);
7824
7825 return rq->has_blocked_load;
7826#else
7827 return false;
7828#endif
7829}
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
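/*
 * update_sg_lb_stats - update sched_group's statistics for load
 * balancing.  @sgs receives the statistics for @group; *@overload is
 * set when any CPU in the group has more than one runnable task.
 */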
7840static inline void update_sg_lb_stats(struct lb_env *env,
7841 struct sched_group *group, int load_idx,
7842 int local_group, struct sg_lb_stats *sgs,
7843 bool *overload)
7844{
7845 unsigned long load;
7846 int i, nr_running;
7847
7848 memset(sgs, 0, sizeof(*sgs));
7849
7850 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7851 struct rq *rq = cpu_rq(i);
7852
7853 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
7854 env->flags |= LBF_NOHZ_AGAIN;
7855
7856
7857 if (local_group)
7858 load = target_load(i, load_idx);
7859 else
7860 load = source_load(i, load_idx);
7861
7862 sgs->group_load += load;
7863 sgs->group_util += cpu_util(i);
7864 sgs->sum_nr_running += rq->cfs.h_nr_running;
7865
7866 nr_running = rq->nr_running;
7867 if (nr_running > 1)
7868 *overload = true;
7869
7870#ifdef CONFIG_NUMA_BALANCING
7871 sgs->nr_numa_running += rq->nr_numa_running;
7872 sgs->nr_preferred_running += rq->nr_preferred_running;
7873#endif
7874 sgs->sum_weighted_load += weighted_cpuload(rq);
7875
7876
7877
7878 if (!nr_running && idle_cpu(i))
7879 sgs->idle_cpus++;
7880 }
7881
7882
7883 sgs->group_capacity = group->sgc->capacity;
7884 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
7885
7886 if (sgs->sum_nr_running)
7887 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
7888
7889 sgs->group_weight = group->group_weight;
7890
7891 sgs->group_no_capacity = group_is_overloaded(env, sgs);
7892 sgs->group_type = group_classify(group, sgs);
7893}
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
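/*
 * update_sd_pick_busiest - decide whether @sg is the busiest group
 * seen so far, i.e. whether it should replace sds->busiest, taking
 * group type, average load, capacity asymmetry and ASYM_PACKING
 * priority into account.
 */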
7908static bool update_sd_pick_busiest(struct lb_env *env,
7909 struct sd_lb_stats *sds,
7910 struct sched_group *sg,
7911 struct sg_lb_stats *sgs)
7912{
7913 struct sg_lb_stats *busiest = &sds->busiest_stat;
7914
7915 if (sgs->group_type > busiest->group_type)
7916 return true;
7917
7918 if (sgs->group_type < busiest->group_type)
7919 return false;
7920
7921 if (sgs->avg_load <= busiest->avg_load)
7922 return false;
7923
7924 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
7925 goto asym_packing;
7926
7927
7928
7929
7930
7931
7932
7933 if (sgs->sum_nr_running <= sgs->group_weight &&
7934 group_smaller_cpu_capacity(sds->local, sg))
7935 return false;
7936
7937asym_packing:
7938
7939 if (!(env->sd->flags & SD_ASYM_PACKING))
7940 return true;
7941
7942
7943 if (env->idle == CPU_NOT_IDLE)
7944 return true;
7945
7946
7947
7948
7949
7950 if (sgs->sum_nr_running &&
7951 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
7952 if (!sds->busiest)
7953 return true;
7954
7955
7956 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7957 sg->asym_prefer_cpu))
7958 return true;
7959 }
7960
7961 return false;
7962}
7963
7964#ifdef CONFIG_NUMA_BALANCING
7965static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7966{
7967 if (sgs->sum_nr_running > sgs->nr_numa_running)
7968 return regular;
7969 if (sgs->sum_nr_running > sgs->nr_preferred_running)
7970 return remote;
7971 return all;
7972}
7973
7974static inline enum fbq_type fbq_classify_rq(struct rq *rq)
7975{
7976 if (rq->nr_running > rq->nr_numa_running)
7977 return regular;
7978 if (rq->nr_running > rq->nr_preferred_running)
7979 return remote;
7980 return all;
7981}
7982#else
7983static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7984{
7985 return all;
7986}
7987
7988static inline enum fbq_type fbq_classify_rq(struct rq *rq)
7989{
7990 return regular;
7991}
7992#endif
7993
7994
7995
7996
7997
7998
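/*
 * update_sd_lb_stats - update sched_domain's statistics for load
 * balancing, visiting every group in the domain once.
 */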
7999static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8000{
8001 struct sched_domain *child = env->sd->child;
8002 struct sched_group *sg = env->sd->groups;
8003 struct sg_lb_stats *local = &sds->local_stat;
8004 struct sg_lb_stats tmp_sgs;
8005 int load_idx, prefer_sibling = 0;
8006 bool overload = false;
8007
8008 if (child && child->flags & SD_PREFER_SIBLING)
8009 prefer_sibling = 1;
8010
8011#ifdef CONFIG_NO_HZ_COMMON
8012 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8013 env->flags |= LBF_NOHZ_STATS;
8014#endif
8015
8016 load_idx = get_sd_load_idx(env->sd, env->idle);
8017
8018 do {
8019 struct sg_lb_stats *sgs = &tmp_sgs;
8020 int local_group;
8021
8022 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
8023 if (local_group) {
8024 sds->local = sg;
8025 sgs = local;
8026
8027 if (env->idle != CPU_NEWLY_IDLE ||
8028 time_after_eq(jiffies, sg->sgc->next_update))
8029 update_group_capacity(env->sd, env->dst_cpu);
8030 }
8031
8032 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
8033 &overload);
8034
8035 if (local_group)
8036 goto next_group;
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048 if (prefer_sibling && sds->local &&
8049 group_has_capacity(env, local) &&
8050 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
8051 sgs->group_no_capacity = 1;
8052 sgs->group_type = group_classify(sg, sgs);
8053 }
8054
8055 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8056 sds->busiest = sg;
8057 sds->busiest_stat = *sgs;
8058 }
8059
8060next_group:
8061
8062 sds->total_running += sgs->sum_nr_running;
8063 sds->total_load += sgs->group_load;
8064 sds->total_capacity += sgs->group_capacity;
8065
8066 sg = sg->next;
8067 } while (sg != env->sd->groups);
8068
8069#ifdef CONFIG_NO_HZ_COMMON
8070 if ((env->flags & LBF_NOHZ_AGAIN) &&
8071 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8072
8073 WRITE_ONCE(nohz.next_blocked,
8074 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8075 }
8076#endif
8077
8078 if (env->sd->flags & SD_NUMA)
8079 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8080
8081 if (!env->sd->parent) {
8082
8083 if (env->dst_rq->rd->overload != overload)
8084 env->dst_rq->rd->overload = overload;
8085 }
8086}
8087
/*
 * check_asym_packing - check whether work should be packed onto a
 * higher-priority CPU.
 *
 * With SD_ASYM_PACKING, work is moved toward CPUs that
 * sched_asym_prefer() ranks higher (e.g. lower-numbered SMT threads on
 * POWER7).  If the balancing CPU outranks the busiest group's
 * preferred CPU, force a move: env->imbalance is set to the busiest
 * group's average load and 1 is returned; otherwise return 0.
 */
8111static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8112{
8113 int busiest_cpu;
8114
8115 if (!(env->sd->flags & SD_ASYM_PACKING))
8116 return 0;
8117
8118 if (env->idle == CPU_NOT_IDLE)
8119 return 0;
8120
8121 if (!sds->busiest)
8122 return 0;
8123
8124 busiest_cpu = sds->busiest->asym_prefer_cpu;
8125 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
8126 return 0;
8127
8128 env->imbalance = DIV_ROUND_CLOSEST(
8129 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
8130 SCHED_CAPACITY_SCALE);
8131
8132 return 1;
8133}
8134
8135
8136
8137
8138
8139
8140
8141
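/*
 * fix_small_imbalance - when the computed imbalance is smaller than
 * one task's load, moving an "average" task could overshoot.  Estimate
 * whether moving a single busiest-group task would actually increase
 * the total capacity put to use, and set env->imbalance to one task's
 * load only if so.
 */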
8142static inline
8143void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8144{
8145 unsigned long tmp, capa_now = 0, capa_move = 0;
8146 unsigned int imbn = 2;
8147 unsigned long scaled_busy_load_per_task;
8148 struct sg_lb_stats *local, *busiest;
8149
8150 local = &sds->local_stat;
8151 busiest = &sds->busiest_stat;
8152
8153 if (!local->sum_nr_running)
8154 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8155 else if (busiest->load_per_task > local->load_per_task)
8156 imbn = 1;
8157
8158 scaled_busy_load_per_task =
8159 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8160 busiest->group_capacity;
8161
8162 if (busiest->avg_load + scaled_busy_load_per_task >=
8163 local->avg_load + (scaled_busy_load_per_task * imbn)) {
8164 env->imbalance = busiest->load_per_task;
8165 return;
8166 }
8167
8168
8169
8170
8171
8172
8173
8174 capa_now += busiest->group_capacity *
8175 min(busiest->load_per_task, busiest->avg_load);
8176 capa_now += local->group_capacity *
8177 min(local->load_per_task, local->avg_load);
8178 capa_now /= SCHED_CAPACITY_SCALE;
8179
8180
8181 if (busiest->avg_load > scaled_busy_load_per_task) {
8182 capa_move += busiest->group_capacity *
8183 min(busiest->load_per_task,
8184 busiest->avg_load - scaled_busy_load_per_task);
8185 }
8186
8187
8188 if (busiest->avg_load * busiest->group_capacity <
8189 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8190 tmp = (busiest->avg_load * busiest->group_capacity) /
8191 local->group_capacity;
8192 } else {
8193 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8194 local->group_capacity;
8195 }
8196 capa_move += local->group_capacity *
8197 min(local->load_per_task, local->avg_load + tmp);
8198 capa_move /= SCHED_CAPACITY_SCALE;
8199
8200
8201 if (capa_move > capa_now)
8202 env->imbalance = busiest->load_per_task;
8203}
8204
8205
8206
8207
8208
8209
8210
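/*
 * calculate_imbalance - calculate the amount of weighted load that
 * should be moved to equalize the busiest and local groups, storing
 * the result in env->imbalance.
 */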
8211static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8212{
8213 unsigned long max_pull, load_above_capacity = ~0UL;
8214 struct sg_lb_stats *local, *busiest;
8215
8216 local = &sds->local_stat;
8217 busiest = &sds->busiest_stat;
8218
8219 if (busiest->group_type == group_imbalanced) {
8220
8221
8222
8223
8224 busiest->load_per_task =
8225 min(busiest->load_per_task, sds->avg_load);
8226 }
8227
8228
8229
8230
8231
8232
8233
8234 if (busiest->avg_load <= sds->avg_load ||
8235 local->avg_load >= sds->avg_load) {
8236 env->imbalance = 0;
8237 return fix_small_imbalance(env, sds);
8238 }
8239
8240
8241
8242
8243 if (busiest->group_type == group_overloaded &&
8244 local->group_type == group_overloaded) {
8245 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
8246 if (load_above_capacity > busiest->group_capacity) {
8247 load_above_capacity -= busiest->group_capacity;
8248 load_above_capacity *= scale_load_down(NICE_0_LOAD);
8249 load_above_capacity /= busiest->group_capacity;
8250 } else
8251 load_above_capacity = ~0UL;
8252 }
8253
8254
8255
8256
8257
8258
8259
8260
8261 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8262
8263
8264 env->imbalance = min(
8265 max_pull * busiest->group_capacity,
8266 (sds->avg_load - local->avg_load) * local->group_capacity
8267 ) / SCHED_CAPACITY_SCALE;
8268
8269
8270
8271
8272
8273
8274
8275 if (env->imbalance < busiest->load_per_task)
8276 return fix_small_imbalance(env, sds);
8277}
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
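/*
 * find_busiest_group - return the busiest group within the
 * sched_domain if there is an imbalance worth correcting, and compute
 * the amount of weighted load to move (env->imbalance).  Returns NULL
 * when the domain is balanced.
 */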
8292static struct sched_group *find_busiest_group(struct lb_env *env)
8293{
8294 struct sg_lb_stats *local, *busiest;
8295 struct sd_lb_stats sds;
8296
8297 init_sd_lb_stats(&sds);
8298
8299
8300
8301
8302
8303 update_sd_lb_stats(env, &sds);
8304 local = &sds.local_stat;
8305 busiest = &sds.busiest_stat;
8306
8307
8308 if (check_asym_packing(env, &sds))
8309 return sds.busiest;
8310
8311
8312 if (!sds.busiest || busiest->sum_nr_running == 0)
8313 goto out_balanced;
8314
8315
8316 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8317 / sds.total_capacity;
8318
8319
8320
8321
8322
8323
8324 if (busiest->group_type == group_imbalanced)
8325 goto force_balance;
8326
8327
8328
8329
8330
8331 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8332 busiest->group_no_capacity)
8333 goto force_balance;
8334
8335
8336
8337
8338
8339 if (local->avg_load >= busiest->avg_load)
8340 goto out_balanced;
8341
8342
8343
8344
8345
8346 if (local->avg_load >= sds.avg_load)
8347 goto out_balanced;
8348
8349 if (env->idle == CPU_IDLE) {
8350
8351
8352
8353
8354
8355
8356
8357 if ((busiest->group_type != group_overloaded) &&
8358 (local->idle_cpus <= (busiest->idle_cpus + 1)))
8359 goto out_balanced;
8360 } else {
8361
8362
8363
8364
8365 if (100 * busiest->avg_load <=
8366 env->sd->imbalance_pct * local->avg_load)
8367 goto out_balanced;
8368 }
8369
8370force_balance:
8371
8372 calculate_imbalance(env, &sds);
8373 return env->imbalance ? sds.busiest : NULL;
8374
8375out_balanced:
8376 env->imbalance = 0;
8377 return NULL;
8378}
8379
8380
8381
8382
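/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the
 * group, weighting each CPU's load by its capacity.
 */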
8383static struct rq *find_busiest_queue(struct lb_env *env,
8384 struct sched_group *group)
8385{
8386 struct rq *busiest = NULL, *rq;
8387 unsigned long busiest_load = 0, busiest_capacity = 1;
8388 int i;
8389
8390 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8391 unsigned long capacity, wl;
8392 enum fbq_type rt;
8393
8394 rq = cpu_rq(i);
8395 rt = fbq_classify_rq(rq);
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416 if (rt > env->fbq_type)
8417 continue;
8418
8419 capacity = capacity_of(i);
8420
8421 wl = weighted_cpuload(rq);
8422
8423
8424
8425
8426
8427
8428 if (rq->nr_running == 1 && wl > env->imbalance &&
8429 !check_cpu_capacity(rq, env->sd))
8430 continue;
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443 if (wl * busiest_capacity > busiest_load * capacity) {
8444 busiest_load = wl;
8445 busiest_capacity = capacity;
8446 busiest = rq;
8447 }
8448 }
8449
8450 return busiest;
8451}
8452
8453
8454
8455
8456
8457#define MAX_PINNED_INTERVAL 512
8458
8459static int need_active_balance(struct lb_env *env)
8460{
8461 struct sched_domain *sd = env->sd;
8462
8463 if (env->idle == CPU_NEWLY_IDLE) {
8464
8465
8466
8467
8468
8469
8470 if ((sd->flags & SD_ASYM_PACKING) &&
8471 sched_asym_prefer(env->dst_cpu, env->src_cpu))
8472 return 1;
8473 }
8474
8475
8476
8477
8478
8479
8480
8481 if ((env->idle != CPU_NOT_IDLE) &&
8482 (env->src_rq->cfs.h_nr_running == 1)) {
8483 if ((check_cpu_capacity(env->src_rq, sd)) &&
8484 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8485 return 1;
8486 }
8487
8488 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8489}
8490
8491static int active_load_balance_cpu_stop(void *data);
8492
8493static int should_we_balance(struct lb_env *env)
8494{
8495 struct sched_group *sg = env->sd->groups;
8496 int cpu, balance_cpu = -1;
8497
8498
8499
8500
8501
8502 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8503 return 0;
8504
8505
8506
8507
8508
8509 if (env->idle == CPU_NEWLY_IDLE)
8510 return 1;
8511
8512
8513 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8514 if (!idle_cpu(cpu))
8515 continue;
8516
8517 balance_cpu = cpu;
8518 break;
8519 }
8520
8521 if (balance_cpu == -1)
8522 balance_cpu = group_balance_cpu(sg);
8523
8524
8525
8526
8527
8528 return balance_cpu == env->dst_cpu;
8529}
8530
8531
8532
8533
8534
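/*
 * Check this_cpu to ensure it is balanced within its domain.  Attempt
 * to move tasks from the busiest group/queue if there is an imbalance.
 */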
8535static int load_balance(int this_cpu, struct rq *this_rq,
8536 struct sched_domain *sd, enum cpu_idle_type idle,
8537 int *continue_balancing)
8538{
8539 int ld_moved, cur_ld_moved, active_balance = 0;
8540 struct sched_domain *sd_parent = sd->parent;
8541 struct sched_group *group;
8542 struct rq *busiest;
8543 struct rq_flags rf;
8544 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8545
8546 struct lb_env env = {
8547 .sd = sd,
8548 .dst_cpu = this_cpu,
8549 .dst_rq = this_rq,
8550 .dst_grpmask = sched_group_span(sd->groups),
8551 .idle = idle,
8552 .loop_break = sched_nr_migrate_break,
8553 .cpus = cpus,
8554 .fbq_type = all,
8555 .tasks = LIST_HEAD_INIT(env.tasks),
8556 };
8557
8558 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);

	schedstat_inc(sd->lb_count[idle]);

redo:
	if (!should_we_balance(&env)) {
		*continue_balancing = 0;
		goto out_balanced;
	}

	group = find_busiest_group(&env);
	if (!group) {
		schedstat_inc(sd->lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(&env, group);
	if (!busiest) {
		schedstat_inc(sd->lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == env.dst_rq);

	schedstat_add(sd->lb_imbalance[idle], env.imbalance);

	env.src_cpu = busiest->cpu;
	env.src_rq = busiest;

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
		env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
		rq_lock_irqsave(busiest, &rf);
		update_rq_clock(busiest);

		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved    - cumulative load moved across iterations
		 */
		cur_ld_moved = detach_tasks(&env);

		/*
		 * We've detached some tasks from busiest_rq. Every
		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
		 * unlock busiest->lock, and we are able to be sure
		 * that nobody can manipulate the tasks in parallel.
		 * See task_rq_lock() family for the details.
		 */

		rq_unlock(busiest, &rf);

		if (cur_ld_moved) {
			attach_tasks(&env);
			ld_moved += cur_ld_moved;
		}

		local_irq_restore(rf.flags);

		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}

		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of CPUs in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or a ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * independently and at same time to move some load to
		 * given_cpu) causing excess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

			/* Prevent to re-select dst_cpu via env's CPUs */
			cpumask_clear_cpu(env.dst_cpu, env.cpus);

			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
			env.dst_cpu	 = env.new_dst_cpu;
			env.flags	&= ~LBF_DST_PINNED;
			env.loop	 = 0;
			env.loop_break	 = sched_nr_migrate_break;

			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}

		/*
		 * We failed to reach balance because of affinity.
		 */
		if (sd_parent) {
			int *group_imbalance = &sd_parent->groups->sgc->imbalance;

			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
				*group_imbalance = 1;
		}

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			/*
			 * Attempting to continue load balancing at the current
			 * sched_domain level only makes sense if there are
			 * active CPUs remaining as possible busiest CPUs to
			 * pull load from which are not contained within the
			 * destination group that is receiving any migrated
			 * load.
			 */
			if (!cpumask_subset(cpus, env.dst_grpmask)) {
				env.loop = 0;
				env.loop_break = sched_nr_migrate_break;
				goto redo;
			}
			goto out_all_pinned;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd->lb_failed[idle]);

		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
			sd->nr_balance_failed++;

		if (need_active_balance(&env)) {
			unsigned long flags;

			raw_spin_lock_irqsave(&busiest->lock, flags);

			/*
			 * Don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest CPU can't be
			 * moved to this_cpu:
			 */
			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							   flags);
				env.flags |= LBF_ALL_PINNED;
				goto out_one_pinned;
			}

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work.  Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);

			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
			}

			/* We've kicked active balancing, force task migration. */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * detach_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	goto out;

out_balanced:
	/*
	 * We reach balance although we may have faced some affinity
	 * constraints. Clear the imbalance flag only if other tasks got
	 * a chance to move and fix the imbalance.
	 */
	if (sd_parent) {
		int *group_imbalance = &sd_parent->groups->sgc->imbalance;

		if (*group_imbalance)
			*group_imbalance = 0;
	}

out_all_pinned:
	/*
	 * We reach balance because all tasks are pinned at this level so
	 * we can't migrate them. Let the imbalance flag set so parent level
	 * can try to migrate them.
	 */
	schedstat_inc(sd->lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if (((env.flags & LBF_ALL_PINNED) &&
			sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	ld_moved = 0;
out:
	return ld_moved;
}

static inline unsigned long
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
{
	unsigned long interval = sd->balance_interval;

	if (cpu_busy)
		interval *= sd->busy_factor;

	/* scale ms to jiffies */
	interval = msecs_to_jiffies(interval);
	interval = clamp(interval, 1UL, max_load_balance_interval);

	return interval;
}

static inline void
update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
	unsigned long interval, next;

	/* used by idle balance, so cpu_busy = 0 */
	interval = get_sd_balance_interval(sd, 0);
	next = sd->last_balance + interval;

	if (time_after(*next_balance, next))
		*next_balance = next;
}

/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	struct task_struct *p = NULL;
	struct rq_flags rf;

	rq_lock_irq(busiest_rq, &rf);
	/*
	 * Between queueing the stop-work and running it is a hole in which
	 * CPUs can become inactive. We should not move tasks from or to
	 * inactive CPUs.
	 */
	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
		goto out_unlock;

	/* Make sure the requested CPU hasn't gone down in the meantime: */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-CPU setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
			break;
	}

	if (likely(sd)) {
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= target_cpu,
			.dst_rq		= target_rq,
			.src_cpu	= busiest_rq->cpu,
			.src_rq		= busiest_rq,
			.idle		= CPU_IDLE,
			/*
			 * can_migrate_task() doesn't need to compute new_dst_cpu
			 * for active balancing. Since we have CPU_IDLE, but no
			 * @dst_grpmask we need to make that test go away with lying
			 * about DST_PINNED.
			 */
			.flags		= LBF_DST_PINNED,
		};

		schedstat_inc(sd->alb_count);
		update_rq_clock(busiest_rq);

		p = detach_one_task(&env);
		if (p) {
			schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
			sd->nr_balance_failed = 0;
		} else {
			schedstat_inc(sd->alb_failed);
		}
	}
	rcu_read_unlock();
out_unlock:
	busiest_rq->active_balance = 0;
	rq_unlock(busiest_rq, &rf);

	if (p)
		attach_one_task(target_rq, p);

	local_irq_enable();

	return 0;
}

static DEFINE_SPINLOCK(balancing);

/*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross socket
 * load distribution.
 */
void update_max_interval(void)
{
	max_load_balance_interval = HZ*num_online_cpus()/10;
}

/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 */
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
	int continue_balancing = 1;
	int cpu = rq->cpu;
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize, need_decay = 0;
	u64 max_cost = 0;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		/*
		 * Decay the newidle max times here because this is a
		 * regular visit to all the domains. Decay ~1% per second.
		 */
		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
			sd->max_newidle_lb_cost =
				(sd->max_newidle_lb_cost * 253) / 256;
			sd->next_decay_max_lb_cost = jiffies + HZ;
			need_decay = 1;
		}
		max_cost += sd->max_newidle_lb_cost;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!continue_balancing) {
			if (need_decay)
				continue;
			break;
		}

		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

		need_serialize = sd->flags & SD_SERIALIZE;
		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				/*
				 * The LBF_DST_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}
	}
	if (need_decay) {
		/*
		 * Ensure the rq-wide value also decays but keep it at a
		 * reasonable floor to avoid funnies with rq->avg_idle.
		 */
		rq->max_idle_balance_cost =
			max((u64)sysctl_sched_migration_cost, max_cost);
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance)) {
		rq->next_balance = next_balance;

#ifdef CONFIG_NO_HZ_COMMON
		/*
		 * If this CPU is going idle, pull nohz.next_balance forward
		 * so the next nohz idle balance happens no later than this
		 * rq's own next balance point.
		 */
		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
			nohz.next_balance = rq->next_balance;
#endif
	}
}

static inline int on_null_domain(struct rq *rq)
{
	return unlikely(!rcu_dereference_sched(rq->sd));
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * idle load balancing details
 * - When one of the busy CPUs notices that there may be an idle rebalancing
 *   needed, they will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
static inline int find_new_ilb(void)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);

	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;

	return nr_cpu_ids;
}

/*
 * Kick an idle CPU in nohz.idle_cpus_mask (if there is one) to run the
 * nohz idle load balance on behalf of all the tickless-idle CPUs.
 */
static void kick_ilb(unsigned int flags)
{
	int ilb_cpu;

	nohz.next_balance++;

	ilb_cpu = find_new_ilb();

	if (ilb_cpu >= nr_cpu_ids)
		return;

	/* If a kick of this kind is already pending, we're done. */
	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
	if (flags & NOHZ_KICK_MASK)
		return;

	/*
	 * Use smp_send_reschedule() instead of resched_cpu().
	 * This way we generate a sched IPI on the target CPU which
	 * is idle. And the softirq performing nohz idle load balance
	 * will be run before returning from the IPI.
	 */
	smp_send_reschedule(ilb_cpu);
}

/*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle CPU in the system.
 *   - This rq has more than one task.
 *   - This rq has at least one CFS task and the capacity of the CPU is
 *     significantly reduced because of RT tasks or IRQs.
 *   - At parent of LLC scheduler domain level, this CPU's scheduler group has
 *     multiple busy CPUs.
 *   - For SD_ASYM_PACKING, if the lower numbered CPUs in the scheduler
 *     domain span are idle.
 */
static void nohz_balancer_kick(struct rq *rq)
{
	unsigned long now = jiffies;
	struct sched_domain_shared *sds;
	struct sched_domain *sd;
	int nr_busy, i, cpu = rq->cpu;
	unsigned int flags = 0;

	if (unlikely(rq->idle_balance))
		return;

	/*
	 * We may be recently in ticked or tickless idle mode. At the first
	 * busy tick after returning from idle, we will update the busy stats.
	 */
	nohz_balance_exit_idle(rq);

	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
	if (likely(!atomic_read(&nohz.nr_cpus)))
		return;

	if (READ_ONCE(nohz.has_blocked) &&
	    time_after(now, READ_ONCE(nohz.next_blocked)))
		flags = NOHZ_STATS_KICK;

	if (time_before(now, nohz.next_balance))
		goto out;

	if (rq->nr_running >= 2) {
		flags = NOHZ_KICK_MASK;
		goto out;
	}

	rcu_read_lock();
	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds) {
		/*
		 * If more than one CPU sharing this LLC domain is busy,
		 * kick the idle load balancer so that an idle sibling
		 * can pull some of that work.
		 */
		nr_busy = atomic_read(&sds->nr_busy_cpus);
		if (nr_busy > 1) {
			flags = NOHZ_KICK_MASK;
			goto unlock;
		}
	}

	sd = rcu_dereference(rq->sd);
	if (sd) {
		if ((rq->cfs.h_nr_running >= 1) &&
		    check_cpu_capacity(rq, sd)) {
			flags = NOHZ_KICK_MASK;
			goto unlock;
		}
	}

	sd = rcu_dereference(per_cpu(sd_asym, cpu));
	if (sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (i == cpu ||
			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
				continue;

			if (sched_asym_prefer(i, cpu)) {
				flags = NOHZ_KICK_MASK;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
out:
	if (flags)
		kick_ilb(flags);
}

static void set_cpu_sd_state_busy(int cpu)
{
	struct sched_domain *sd;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_llc, cpu));

	if (!sd || !sd->nohz_idle)
		goto unlock;
	sd->nohz_idle = 0;

	atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}

void nohz_balance_exit_idle(struct rq *rq)
{
	SCHED_WARN_ON(rq != this_rq());

	if (likely(!rq->nohz_tick_stopped))
		return;

	rq->nohz_tick_stopped = 0;
	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
	atomic_dec(&nohz.nr_cpus);

	set_cpu_sd_state_busy(rq->cpu);
}

static void set_cpu_sd_state_idle(int cpu)
{
	struct sched_domain *sd;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_llc, cpu));

	if (!sd || sd->nohz_idle)
		goto unlock;
	sd->nohz_idle = 1;

	atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}

/*
 * This routine will record that the CPU is going idle with tick stopped.
 * This info will be used in performing idle load balancing in the future.
 */
void nohz_balance_enter_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	SCHED_WARN_ON(cpu != smp_processor_id());

	/* If this CPU is going down, then nothing needs to be done: */
	if (!cpu_active(cpu))
		return;

	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
		return;

	/*
	 * Can be set safely without rq->lock held.
	 * If a clear happens, it will have evaluated last additions because
	 * rq->lock is held during the check and the clear.
	 */
	rq->has_blocked_load = 1;

	/*
	 * The tick is still stopped but load could have been added in the
	 * meantime. We set the nohz.has_blocked flag to trig a check of the
	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
	 * of nohz.has_blocked can only happen after checking the new load.
	 */
	if (rq->nohz_tick_stopped)
		goto out;

	/* If we're a completely isolated CPU, we don't play: */
	if (on_null_domain(rq))
		return;

	rq->nohz_tick_stopped = 1;

	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
	atomic_inc(&nohz.nr_cpus);

	/*
	 * Ensures that if nohz_idle_balance() fails to observe our
	 * @idle_cpus_mask store, it must observe the @has_blocked
	 * store.
	 */
	smp_mb__after_atomic();

	set_cpu_sd_state_idle(cpu);

out:
	/*
	 * Each time a CPU enters idle, we assume that it has blocked load
	 * and enable the periodic update of the load of idle CPUs.
	 */
	WRITE_ONCE(nohz.has_blocked, 1);
}

/*
 * Internal function that runs load balance for all idle CPUs. The load
 * balance can be a simple update of blocked load or a complete load
 * balance with task movement, depending on the idle state of the CPU
 * (CPU_NEWLY_IDLE or CPU_IDLE) and on the kick flags.
 */
static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
			       enum cpu_idle_type idle)
{
	/* Earliest time when we have to do rebalance again */
	unsigned long now = jiffies;
	unsigned long next_balance = now + 60*HZ;
	bool has_blocked_load = false;
	int update_next_balance = 0;
	int this_cpu = this_rq->cpu;
	int balance_cpu;
	int ret = false;
	struct rq *rq;

	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

	/*
	 * We assume there will be no idle load after this update and clear
	 * the has_blocked flag. If a CPU enters idle in the mean time, it
	 * will set the has_blocked flag and trigger another update of idle
	 * load. Because a CPU that becomes idle is added to idle_cpus_mask
	 * before setting the flag, we are sure to not clear the state and
	 * not check the load of an idle CPU.
	 */
	WRITE_ONCE(nohz.has_blocked, 0);

	/*
	 * Ensures that if we miss the CPU, we must see the has_blocked
	 * store from nohz_balance_enter_idle().
	 */
	smp_mb();

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/*
		 * If this CPU gets work to do, stop the load balancing
		 * work being done for other CPUs. Next load
		 * balancing owner will pick it up.
		 */
		if (need_resched()) {
			has_blocked_load = true;
			goto abort;
		}

		rq = cpu_rq(balance_cpu);

		has_blocked_load |= update_nohz_stats(rq, true);

		/*
		 * If time for next balance is due,
		 * do the balance.
		 */
		if (time_after_eq(jiffies, rq->next_balance)) {
			struct rq_flags rf;

			rq_lock_irqsave(rq, &rf);
			update_rq_clock(rq);
			cpu_load_update_idle(rq);
			rq_unlock_irqrestore(rq, &rf);

			if (flags & NOHZ_BALANCE_KICK)
				rebalance_domains(rq, CPU_IDLE);
		}

		if (time_after(next_balance, rq->next_balance)) {
			next_balance = rq->next_balance;
			update_next_balance = 1;
		}
	}

	/* Newly idle CPU doesn't need an update */
	if (idle != CPU_NEWLY_IDLE) {
		update_blocked_averages(this_cpu);
		has_blocked_load |= this_rq->has_blocked_load;
	}

	if (flags & NOHZ_BALANCE_KICK)
		rebalance_domains(this_rq, CPU_IDLE);

	WRITE_ONCE(nohz.next_blocked,
		now + msecs_to_jiffies(LOAD_AVG_PERIOD));

	/* The full idle balance loop has been done */
	ret = true;

abort:
	/* There is still blocked load, enable periodic update */
	if (has_blocked_load)
		WRITE_ONCE(nohz.has_blocked, 1);

	/*
	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		nohz.next_balance = next_balance;

	return ret;
}

/*
 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
 */
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
	int this_cpu = this_rq->cpu;
	unsigned int flags;

	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
		return false;

	if (idle != CPU_IDLE) {
		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
		return false;
	}

	/*
	 * Barrier: pairs with the smp_mb__after_atomic() in
	 * nohz_balance_enter_idle().
	 */
	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
	if (!(flags & NOHZ_KICK_MASK))
		return false;

	_nohz_idle_balance(this_rq, flags, idle);

	return true;
}

static void nohz_newidle_balance(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu;

	/*
	 * This CPU doesn't want to be disturbed by scheduler
	 * housekeeping
	 */
	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
		return;

	/* Will wake up very soon. No time for doing anything else */
	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	/* Don't need to update blocked load of idle CPUs */
	if (!READ_ONCE(nohz.has_blocked) ||
	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
		return;

	raw_spin_unlock(&this_rq->lock);
	/*
	 * This CPU is going to be idle and the blocked load of idle CPUs
	 * needs to be updated. Run the ilb locally as it is a good
	 * candidate for ilb instead of waking up another idle CPU. Try to
	 * run the ilb if it has not been done by others or try to kick
	 * the ilb if it's needed.
	 */
	if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
		kick_ilb(NOHZ_STATS_KICK);
	raw_spin_lock(&this_rq->lock);
}

#else /* !CONFIG_NO_HZ_COMMON */
static inline void nohz_balancer_kick(struct rq *rq) { }

static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
	return false;
}

static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
{
	unsigned long next_balance = jiffies + HZ;
	int this_cpu = this_rq->cpu;
	struct sched_domain *sd;
	int pulled_task = 0;
	u64 curr_cost = 0;

	/*
	 * We must set idle_stamp _before_ calling idle_balance(), such that
	 * we measure the duration of idle_balance() as idle time.
	 */
	this_rq->idle_stamp = rq_clock(this_rq);

	/*
	 * Do not pull tasks towards !active CPUs...
	 */
	if (!cpu_active(this_cpu))
		return 0;

	/*
	 * This is OK, because current is on_cpu, which avoids it being picked
	 * for load-balance and preemption/IRQs are still disabled avoiding
	 * further scheduler activity on it and we're being very careful to
	 * re-start the picking loop.
	 */
	rq_unpin_lock(this_rq, rf);

	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
	    !this_rq->rd->overload) {

		rcu_read_lock();
		sd = rcu_dereference_check_sched_domain(this_rq->sd);
		if (sd)
			update_next_balance(sd, &next_balance);
		rcu_read_unlock();

		nohz_newidle_balance(this_rq);

		goto out;
	}

	raw_spin_unlock(&this_rq->lock);

	update_blocked_averages(this_cpu);
	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		int continue_balancing = 1;
		u64 t0, domain_cost;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			update_next_balance(sd, &next_balance);
			break;
		}

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			t0 = sched_clock_cpu(this_cpu);

			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE,
						   &continue_balancing);

			domain_cost = sched_clock_cpu(this_cpu) - t0;
			if (domain_cost > sd->max_newidle_lb_cost)
				sd->max_newidle_lb_cost = domain_cost;

			curr_cost += domain_cost;
		}

		update_next_balance(sd, &next_balance);

		/*
		 * Stop searching for tasks to pull if there are
		 * now runnable tasks on this rq.
		 */
		if (pulled_task || this_rq->nr_running > 0)
			break;
	}
	rcu_read_unlock();

	raw_spin_lock(&this_rq->lock);

	if (curr_cost > this_rq->max_idle_balance_cost)
		this_rq->max_idle_balance_cost = curr_cost;

out:
	/*
	 * While browsing the domains, we released the rq lock, a task could
	 * have been enqueued in the meantime. Since we're not going idle,
	 * pretend we pulled a task.
	 */
	if (this_rq->cfs.h_nr_running && !pulled_task)
		pulled_task = 1;

	/* Move the next balance forward */
	if (time_after(this_rq->next_balance, next_balance))
		this_rq->next_balance = next_balance;

	/* Is there a task of a high priority class? */
	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
		pulled_task = -1;

	if (pulled_task)
		this_rq->idle_stamp = 0;

	rq_repin_lock(this_rq, rf);

	return pulled_task;
}

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */
static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
	struct rq *this_rq = this_rq();
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;

	/*
	 * If this CPU has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle CPUs whose ticks are
	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
	 * give the idle CPUs a chance to load balance. Else we may
	 * load balance only within the local sched_domain hierarchy
	 * and abort nohz_idle_balance altogether if we pull some load.
	 */
	if (nohz_idle_balance(this_rq, idle))
		return;

	/* normal load balance */
	update_blocked_averages(this_rq->cpu);
	rebalance_domains(this_rq, idle);
}

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void trigger_load_balance(struct rq *rq)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (unlikely(on_null_domain(rq)))
		return;

	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);

	nohz_balancer_kick(rq);
}

static void rq_online_fair(struct rq *rq)
{
	update_sysctl();

	update_runtime_enabled(rq);
}

static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}

#endif /* CONFIG_SMP */

/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (static_branch_unlikely(&sched_numa_balancing))
		task_tick_numa(rq, curr);
}

/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	struct rq *rq = this_rq();
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;
	if (curr) {
		update_curr(cfs_rq);
		se->vruntime = curr->vruntime;
	}
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_curr(rq);
	}

	/* Keep vruntime relative until the child is enqueued, possibly elsewhere */
	se->vruntime -= cfs_rq->min_vruntime;
	rq_unlock(rq, &rf);
}

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's
	 */
	if (rq->curr == p) {
		if (p->prio > oldprio)
			resched_curr(rq);
	} else
		check_preempt_curr(rq, p, 0);
}

static inline bool vruntime_normalized(struct task_struct *p)
{
	struct sched_entity *se = &p->se;

	/*
	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
	 * the dequeue_entity(.flags=0) will already have normalized the
	 * vruntime.
	 */
	if (p->on_rq)
		return true;

	/*
	 * When !on_rq, vruntime of the task has usually NOT been normalized.
	 * But there are some cases where it has already been normalized:
	 *
	 * - A forked child which is waiting for being woken up by
	 *   wake_up_new_task().
	 * - A task which has been woken up by try_to_wake_up() and
	 *   waiting for actually being woken up by sched_ttwu_pending().
	 */
	if (!se->sum_exec_runtime ||
	    (p->state == TASK_WAKING && p->sched_remote_wakeup))
		return true;

	return false;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Propagate the changes of the sched_entity across the tg tree to make it
 * visible to the root
 */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq;

	/* Start to propagate at parent */
	se = se->parent;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_load_avg(cfs_rq, se, UPDATE_TG);
	}
}
#else
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
#endif

static void detach_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/* Catch up with the cfs_rq and remove our load when we leave */
	update_load_avg(cfs_rq, se, 0);
	detach_entity_load_avg(cfs_rq, se);
	update_tg_load_avg(cfs_rq, false);
	propagate_entity_cfs_rq(se);
}

static void attach_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * Since the real-depth could have been changed (only FAIR
	 * class maintain depth value), reset depth properly.
	 */
	se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif

	/* Synchronize entity with its cfs_rq */
	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
	attach_entity_load_avg(cfs_rq, se, 0);
	update_tg_load_avg(cfs_rq, false);
	propagate_entity_cfs_rq(se);
}

static void detach_task_cfs_rq(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	if (!vruntime_normalized(p)) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */
		place_entity(cfs_rq, se, 0);
		se->vruntime -= cfs_rq->min_vruntime;
	}

	detach_entity_cfs_rq(se);
}

static void attach_task_cfs_rq(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	attach_entity_cfs_rq(se);

	if (!vruntime_normalized(p))
		se->vruntime += cfs_rq->min_vruntime;
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}

static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	attach_task_cfs_rq(p);

	if (task_on_rq_queued(p)) {
		/*
		 * We were most likely switched from sched_rt, so
		 * kick off the schedulability test.
		 */
		if (rq->curr == p)
			resched_curr(rq);
		else
			check_preempt_curr(rq, p, 0);
	}
}

/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
		/* ensure bandwidth has been allocated on our new cfs_rq */
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}

void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
	raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_set_group_fair(struct task_struct *p)
{
	struct sched_entity *se = &p->se;

	set_task_rq(p, task_cpu(p));
	se->depth = se->parent ? se->parent->depth + 1 : 0;
}

static void task_move_group_fair(struct task_struct *p)
{
	detach_task_cfs_rq(p);
	set_task_rq(p, task_cpu(p));

#ifdef CONFIG_SMP
	/* Tell se's cfs_rq has been changed -- migrated */
	p->se.avg.last_update_time = 0;
#endif
	attach_task_cfs_rq(p);
}

static void task_change_group_fair(struct task_struct *p, int type)
{
	switch (type) {
	case TASK_SET_GROUP:
		task_set_group_fair(p);
		break;

	case TASK_MOVE_GROUP:
		task_move_group_fair(p);
		break;
	}
}

void free_fair_sched_group(struct task_group *tg)
{
	int i;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
		if (tg->se)
			kfree(tg->se[i]);
	}

	kfree(tg->cfs_rq);
	kfree(tg->se);
}

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct sched_entity *se;
	struct cfs_rq *cfs_rq;
	int i;

	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
	if (!tg->se)
		goto err;

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;

		init_cfs_rq(cfs_rq);
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
		init_entity_runnable_average(se);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}

void online_fair_sched_group(struct task_group *tg)
{
	struct sched_entity *se;
	struct rq *rq;
	int i;

	for_each_possible_cpu(i) {
		rq = cpu_rq(i);
		se = tg->se[i];

		raw_spin_lock_irq(&rq->lock);
		update_rq_clock(rq);
		attach_entity_cfs_rq(se);
		sync_throttle(tg, i);
		raw_spin_unlock_irq(&rq->lock);
	}
}

void unregister_fair_sched_group(struct task_group *tg)
{
	unsigned long flags;
	struct rq *rq;
	int cpu;

	for_each_possible_cpu(cpu) {
		if (tg->se[cpu])
			remove_entity_load_avg(tg->se[cpu]);

		/*
		 * Only empty task groups can be destroyed; so we can speculatively
		 * check on_list without danger of it being re-added.
		 */
		if (!tg->cfs_rq[cpu]->on_list)
			continue;

		rq = cpu_rq(cpu);

		raw_spin_lock_irqsave(&rq->lock, flags);
		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}
}

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent) {
		se->cfs_rq = &rq->cfs;
		se->depth = 0;
	} else {
		se->cfs_rq = parent->my_q;
		se->depth = parent->depth + 1;
	}

	se->my_q = cfs_rq;

	/* guarantee group entities always have weight */
	update_load_set(&se->load, NICE_0_LOAD);
	se->parent = parent;
}

static DEFINE_MUTEX(shares_mutex);

int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se = tg->se[i];
		struct rq_flags rf;

		/* Propagate contribution to hierarchy */
		rq_lock_irqsave(rq, &rf);
		update_rq_clock(rq);
		for_each_sched_entity(se) {
			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
			update_cfs_group(se);
		}
		rq_unlock_irqrestore(rq, &rf);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
#else /* CONFIG_FAIR_GROUP_SCHED */

void free_fair_sched_group(struct task_group *tg) { }

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

void online_fair_sched_group(struct task_group *tg) { }

void unregister_fair_sched_group(struct task_group *tg) { }

#endif /* CONFIG_FAIR_GROUP_SCHED */

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

	return rr_interval;
}

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_dead		= task_dead_fair,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_change_group	= task_change_group_fair,
#endif
};

#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq, *pos;

	rcu_read_lock();
	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}

#ifdef CONFIG_NUMA_BALANCING
void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
	int node;
	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;

	for_each_online_node(node) {
		if (p->numa_faults) {
			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		if (p->numa_group) {
			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
	}
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	nohz.next_blocked = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* CONFIG_SMP */
}