// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 */
#include "sched.h"

#include <trace/events/sched.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
static unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 *
 * (default: SCHED_TUNABLESCALING_LOG)
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}

/*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
 *
 * (default: ~20%)
 */
static unsigned int capacity_margin = 1280;
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}
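
/*
 * Illustrative example (not part of the original source): with the default
 * SCHED_TUNABLESCALING_LOG policy on an 8-CPU machine,
 * get_update_sysctl_factor() returns 1 + ilog2(8) = 4, so:
 *
 *	sysctl_sched_latency            = 4 * 6ms    = 24ms
 *	sysctl_sched_min_granularity    = 4 * 0.75ms = 3ms
 *	sysctl_sched_wakeup_granularity = 4 * 1ms    = 4ms
 *
 * Machines with more than 8 online CPUs are clamped to cpus = 8 above, so
 * the factor never exceeds 4 under LOG scaling.
 */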

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 bit multiply */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
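
/*
 * Worked example (illustrative, not part of the original source): a nice +5
 * task has weight 335. Charging it delta_exec = 1000000ns of runtime against
 * NICE_0_LOAD (1024), with inv_weight = 2^32 / 335, reduces to:
 *
 *	delta ~= 1000000 * 1024 / 335 ~= 3056716ns
 *
 * i.e. its vruntime advances roughly 3x faster than wall clock, which is
 * exactly how lower-weight tasks receive proportionally less CPU time.
 */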

const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED
static inline struct task_struct *task_of(struct sched_entity *se)
{
	SCHED_WARN_ON(!entity_is_task(se));
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	int cpu = cpu_of(rq);

	if (cfs_rq->on_list)
		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

	cfs_rq->on_list = 1;

	/*
	 * Ensure we either appear before our parent (if already
	 * enqueued) or force our parent to appear after us when it is
	 * enqueued. The fact that we always enqueue bottom-up
	 * reduces this to two cases and a special case for the root
	 * cfs_rq. Furthermore, it also means that we will always reset
	 * tmp_alone_branch either when the branch is connected
	 * to a tree or when we reach the top of the tree.
	 */
	if (cfs_rq->tg->parent &&
	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
		/*
		 * If the parent is already on the list, we add the child
		 * just before. Thanks to the circular linked property of
		 * the list, this means putting the child at the tail
		 * of the list that starts at the parent.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
		/*
		 * The branch is now connected to its tree so we can
		 * reset tmp_alone_branch to the beginning of the
		 * list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	if (!cfs_rq->tg->parent) {
		/*
		 * A cfs_rq without a parent should be put
		 * at the tail of the list.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&rq->leaf_cfs_rq_list);
		/*
		 * We have reached the top of a tree so we can reset
		 * tmp_alone_branch to the beginning of the list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	/*
	 * The parent has not already been added so we want to
	 * make sure that it will be put after us.
	 * tmp_alone_branch points to the beginning of the branch
	 * where we will add the parent.
	 */
	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
	/*
	 * Update tmp_alone_branch to point to the new beginning
	 * of the branch.
	 */
	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
	return false;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);

		/*
		 * With cfs_rq being unthrottled/throttled during an enqueue,
		 * it can happen that tmp_alone_branch points to the leaf that
		 * we finally want to delete. In this case, tmp_alone_branch
		 * moves to the prev element, but it will point back to
		 * rq->leaf_cfs_rq_list at the end of the enqueue.
		 */
		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;

		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}

/* Iterate through all leaf cfs_rq's on a runqueue: */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * The preemption test can be made between sibling entities who are in
	 * the same cfs_rq, i.e. who have a common parent. Walk up the
	 * hierarchy of both tasks until we find their ancestors who are
	 * siblings of a common parent.
	 */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	return true;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}

#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}
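
/*
 * Illustrative note (not from the original source): the (s64) casts make
 * these comparisons robust across u64 wraparound. With vruntime = 10 and
 * max_vruntime = ULLONG_MAX - 10, the unsigned difference wraps to 21, so
 * delta > 0 and the numerically smaller value 10 is correctly treated as
 * the later vruntime.
 */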

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

	u64 vruntime = cfs_rq->min_vruntime;

	if (curr) {
		if (curr->on_rq)
			vruntime = curr->vruntime;
		else
			curr = NULL;
	}

	if (leftmost) {
		struct sched_entity *se;
		se = rb_entry(leftmost, struct sched_entity, run_node);

		if (!curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	bool leftmost = true;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color_cached(&se->run_node,
			       &cfs_rq->tasks_timeline, leftmost);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
			      void __user *buffer, size_t *lenp,
			      loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif
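
/*
 * Illustrative example: with the defaults above (latency = 6ms,
 * min_granularity = 0.75ms), DIV_ROUND_UP(6000000, 750000) = 8, which is
 * exactly the static initializer of sched_nr_latency; the proc handler
 * simply keeps that ratio in sync whenever either sysctl is rewritten.
 */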

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	if (unlikely(nr_running > sched_nr_latency))
		return nr_running * sysctl_sched_min_granularity;
	else
		return sysctl_sched_latency;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}
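
/*
 * Illustrative example: three runnable nice-0 tasks (weight 1024 each,
 * runqueue weight 3072) fit within sched_nr_latency, so the period is the
 * 6ms default and each slice is 6ms * 1024/3072 = 2ms. With 12 runnable
 * tasks the period stretches to 12 * 0.75ms = 9ms and each slice becomes
 * 9ms * 1024/12288 = 0.75ms, i.e. it never drops below min_granularity.
 */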

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#include "pelt.h"
#ifdef CONFIG_SMP

static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);

/* Give a new sched_entity heavy initial runnable values until it stabilizes */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact
	 * that nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);

	se->runnable_weight = se->load.weight;

	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static void attach_entity_cfs_rq(struct sched_entity *se);

/*
 * With new tasks being created, their initial util_avgs are extrapolated
 * based on the cfs_rq's current util_avg:
 *
 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 *
 * However, in many cases, the above util_avg does not give a desired
 * value. Moreover, the sum of the util_avgs may be divergent, such
 * as when the series is a harmonic series.
 *
 * To solve this problem, we also cap the util_avg of successive tasks to
 * only 1/2 of the left utilization budget:
 *
 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 *
 * where n denotes the nth task and cpu_scale the CPU capacity.
 *
 * For example, for a CPU with 1024 of capacity, a simplest series from
 * the beginning would be like:
 *
 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 *
 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 * if util_avg > util_avg_cap.
 */
void post_init_entity_util_avg(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_avg *sa = &se->avg;
	long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;

	if (cap > 0) {
		if (cfs_rq->avg.util_avg != 0) {
			sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
			sa->util_avg /= (cfs_rq->avg.load_avg + 1);

			if (sa->util_avg > cap)
				sa->util_avg = cap;
		} else {
			sa->util_avg = cap;
		}
	}

	if (p->sched_class != &fair_sched_class) {
		/*
		 * For !fair tasks do:
		 *
		update_cfs_rq_load_avg(now, cfs_rq);
		attach_entity_load_avg(cfs_rq, se, 0);
		switched_from_fair(rq, p);
		 *
		 * such that the next switched_to_fair() has the
		 * expected state.
		 */
		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
		return;
	}

	attach_entity_cfs_rq(se);
}

#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}
#endif /* CONFIG_SMP */

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 wait_start, prev_wait_start;

	if (!schedstat_enabled())
		return;

	wait_start = rq_clock(rq_of(cfs_rq));
	prev_wait_start = schedstat_val(se->statistics.wait_start);

	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
	    likely(wait_start > prev_wait_start))
		wait_start -= prev_wait_start;

	__schedstat_set(se->statistics.wait_start, wait_start);
}

static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *p;
	u64 delta;

	if (!schedstat_enabled())
		return;

	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);

	if (entity_is_task(se)) {
		p = task_of(se);
		if (task_on_rq_migrating(p)) {
			/*
			 * Preserve migrating task's wait time so wait_start
			 * time stamp can be adjusted to accumulate wait time
			 * prior to migration.
			 */
			__schedstat_set(se->statistics.wait_start, delta);
			return;
		}
		trace_sched_stat_wait(p, delta);
	}

	__schedstat_set(se->statistics.wait_max,
			max(schedstat_val(se->statistics.wait_max), delta));
	__schedstat_inc(se->statistics.wait_count);
	__schedstat_add(se->statistics.wait_sum, delta);
	__schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *tsk = NULL;
	u64 sleep_start, block_start;

	if (!schedstat_enabled())
		return;

	sleep_start = schedstat_val(se->statistics.sleep_start);
	block_start = schedstat_val(se->statistics.block_start);

	if (entity_is_task(se))
		tsk = task_of(se);

	if (sleep_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
			__schedstat_set(se->statistics.sleep_max, delta);

		__schedstat_set(se->statistics.sleep_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (block_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
			__schedstat_set(se->statistics.block_max, delta);

		__schedstat_set(se->statistics.block_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			if (tsk->in_iowait) {
				__schedstat_add(se->statistics.iowait_sum, delta);
				__schedstat_inc(se->statistics.iowait_count);
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
}

/*
 * Task is being enqueued - update stats:
 */
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper(cfs_rq, se);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);

	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
		struct task_struct *tsk = task_of(se);

		if (tsk->state & TASK_INTERRUPTIBLE)
			__schedstat_set(se->statistics.sleep_start,
				      rq_clock(rq_of(cfs_rq)));
		if (tsk->state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(se->statistics.block_start,
				      rq_clock(rq_of(cfs_rq)));
	}
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the tasks virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

struct numa_group {
	refcount_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;
	int active_nodes;

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long max_faults_cpu;
	/*
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long *faults_cpu;
	unsigned long faults[0];
};

static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);

static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Calculations based on RSS as non-present and empty pages are skipped
	 * by the PTE scanner and NUMA hinting faults should be trapped based
	 * on resident pages
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}
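
/*
 * Illustrative example: with the default scan_size of 256MB and 4KB pages,
 * nr_scan_pages = 256 << (20 - 12) = 65536 pages. A task with 1GB resident
 * (262144 pages) thus needs 262144 / 65536 = 4 scan windows to cover its
 * RSS, and task_scan_min() below divides the minimum period by that count
 * (1000ms / 4 = 250ms) so the whole RSS is covered in roughly constant time.
 */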

/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560

static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}

static unsigned int task_scan_start(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long period = smin;

	/* Scale the maximum scan period with the amount of shared memory. */
	if (p->numa_group) {
		struct numa_group *ng = p->numa_group;
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);

		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;
	}

	return max(smin, period);
}

static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long smax;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

	/* Scale the maximum scan period with the amount of shared memory. */
	if (p->numa_group) {
		struct numa_group *ng = p->numa_group;
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);
		unsigned long period = smax;

		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;

		smax = max(smax, period);
	}

	return max(smin, smax);
}

void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
	int mm_users = 0;
	struct mm_struct *mm = p->mm;

	if (mm) {
		mm_users = atomic_read(&mm->mm_users);
		if (mm_users == 1) {
			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
			mm->numa_scan_seq = 0;
		}
	}
	p->node_stamp = 0;
	p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
	p->numa_faults = NULL;
	p->numa_group = NULL;
	p->last_task_numa_placement = 0;
	p->last_sum_exec_runtime = 0;

	/* New address space, reset the preferred nid */
	if (!(clone_flags & CLONE_VM)) {
		p->numa_preferred_nid = NUMA_NO_NODE;
		return;
	}

	/*
	 * New thread, keep existing numa_preferred_nid which should be copied
	 * already by arch_dup_task_struct but stagger when scans start.
	 */
	if (mm) {
		unsigned int delay;

		delay = min_t(unsigned int, task_scan_max(current),
			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
		delay += 2 * TICK_NSEC;
		p->node_stamp = delay;
	}
}

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU, and high and low locality. */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

pid_t task_numa_group_id(struct task_struct *p)
{
	return p->numa_group ? p->numa_group->gid : 0;
}

/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
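
/*
 * Illustrative layout example (assuming NUMA_MEM is the first stat): on a
 * 2-node system each stat occupies NR_NUMA_HINT_FAULT_TYPES * nr_node_ids
 * = 4 consecutive slots, so:
 *
 *	task_faults_idx(NUMA_MEM, 0, 0) = 0	(node 0, shared)
 *	task_faults_idx(NUMA_MEM, 0, 1) = 1	(node 0, private)
 *	task_faults_idx(NUMA_MEM, 1, 0) = 2	(node 1, shared)
 *	task_faults_idx(NUMA_MEM, 1, 1) = 3	(node 1, private)
 */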

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
	if (!p->numa_faults)
		return 0;

	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults(struct task_struct *p, int nid)
{
	if (!p->numa_group)
		return 0;

	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_priv(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
	}

	return faults;
}

static inline unsigned long group_faults_shared(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
	}

	return faults;
}

/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3

static bool numa_is_active_node(int nid, struct numa_group *ng)
{
	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}

/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
					int maxdist, bool task)
{
	unsigned long score = 0;
	int node;

	/*
	 * All nodes are directly connected, and the same distance
	 * from each other. No need for fancy placement algorithms.
	 */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return 0;

	/*
	 * This code is called for each node, introducing N^2 complexity,
	 * which should be ok given the number of nodes rarely exceeds 8.
	 */
	for_each_online_node(node) {
		unsigned long faults;
		int dist = node_distance(nid, node);

		/*
		 * The furthest away nodes in the system are not interesting
		 * for placement; nid was already counted.
		 */
		if (dist == sched_max_numa_distance || node == nid)
			continue;

		/*
		 * On systems with a backplane NUMA topology, compare groups
		 * of nodes, and move tasks towards the group with the most
		 * memory accesses. When comparing two nodes at distance
		 * "hoplimit", only nodes closer by than "hoplimit" are part
		 * of each group. Skip other nodes.
		 */
		if (sched_numa_topology_type == NUMA_BACKPLANE &&
			dist >= maxdist)
			continue;

		/* Add up the faults from nearby nodes. */
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; a numa_group can occupy any set of nodes.
		 * The further away a node is, the less the faults count.
		 * This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (sched_max_numa_distance - dist);
			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
		}

		score += faults;
	}

	return score;
}

/*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node.  The group weight is given a
 * larger multiplier, in order to group tasks together that are almost
 * evenly spread out between the physical sockets.
 */
static inline unsigned long task_weight(struct task_struct *p, int nid,
					int dist)
{
	unsigned long faults, total_faults;

	if (!p->numa_faults)
		return 0;

	total_faults = p->total_numa_faults;

	if (!total_faults)
		return 0;

	faults = task_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, true);

	return 1000 * faults / total_faults;
}

static inline unsigned long group_weight(struct task_struct *p, int nid,
					 int dist)
{
	unsigned long faults, total_faults;

	if (!p->numa_group)
		return 0;

	total_faults = p->numa_group->total_faults;

	if (!total_faults)
		return 0;

	faults = group_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, false);

	return 1000 * faults / total_faults;
}

bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
				int src_nid, int dst_cpu)
{
	struct numa_group *ng = p->numa_group;
	int dst_nid = cpu_to_node(dst_cpu);
	int last_cpupid, this_cpupid;

	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

	/*
	 * Allow first faults or private faults to migrate immediately early in
	 * the lifetime of a task. The magic number 4 is based on waiting for
	 * two full passes of the "multi-stage node selection" test that is
	 * done below.
	 */
	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
		return true;

	/*
	 * Multi-stage node selection is used in conjunction with a periodic
	 * migration fault to build a temporal task<->page relation. By using
	 * a two-stage filter we remove short/unlikely relations.
	 *
	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
	 * a task's usage of a particular page (n_p) per total usage of this
	 * page (n_t) (in a given time-span) to a probability.
	 *
	 * Our periodic faults will sample this probability and getting the
	 * same result twice in a row, given these samples are fully
	 * independent, is then given by P(n)^2, provided our sample period
	 * is sufficiently short compared to the usage pattern.
	 *
	 * This quadric squishes small probabilities, making it less likely we
	 * act on an unlikely task<->page relation.
	 */
	if (!cpupid_pid_unset(last_cpupid) &&
				cpupid_to_nid(last_cpupid) != dst_nid)
		return false;

	/* Always allow migrate on private faults */
	if (cpupid_match_pid(p, last_cpupid))
		return true;

	/* A shared fault, but p->numa_group has not been set up yet. */
	if (!ng)
		return true;

	/*
	 * Destination node is much more heavily used than the source
	 * node? Allow migration.
	 */
	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
					ACTIVE_NODE_FRACTION)
		return true;

	/*
	 * Distribute memory according to CPU & memory use on each node,
	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
	 *
	 * faults_cpu(dst)   3   faults_cpu(src)
	 * --------------- * - > ---------------
	 * faults_mem(dst)   4   faults_mem(src)
	 */
	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
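
/*
 * Illustrative example of the 3/4 hysteresis above: with group CPU faults
 * of 400 on dst and 200 on src, and task memory faults of 100 on each node,
 * 400 * 100 * 3 = 120000 > 200 * 100 * 4 = 80000, so the page migrates.
 * With dst and src reversed the same test fails, so the page does not
 * immediately bounce back; that asymmetry is the point of the 3/4 factors.
 */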

static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);

/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long load;

	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;
};

/*
 * XXX borrowed from update_sg_lb_stats
 */
static void update_numa_stats(struct numa_stats *ns, int nid)
{
	int cpu;

	memset(ns, 0, sizeof(*ns));
	for_each_cpu(cpu, cpumask_of_node(nid)) {
		struct rq *rq = cpu_rq(cpu);

		ns->load += weighted_cpuload(rq);
		ns->compute_capacity += capacity_of(cpu);
	}
}

struct task_numa_env {
	struct task_struct *p;

	int src_cpu, src_nid;
	int dst_cpu, dst_nid;

	struct numa_stats src_stats, dst_stats;

	int imbalance_pct;
	int dist;

	struct task_struct *best_task;
	long best_imp;
	int best_cpu;
};

static void task_numa_assign(struct task_numa_env *env,
			     struct task_struct *p, long imp)
{
	struct rq *rq = cpu_rq(env->dst_cpu);

	/* Bail out if run-queue part of active NUMA balance. */
	if (xchg(&rq->numa_migrate_on, 1))
		return;

	/*
	 * Clear previous best_cpu/rq numa-migrate flag, since task now
	 * found a better rq to do NUMA balancing.
	 */
	if (env->best_cpu != -1) {
		rq = cpu_rq(env->best_cpu);
		WRITE_ONCE(rq->numa_migrate_on, 0);
	}

	if (env->best_task)
		put_task_struct(env->best_task);
	if (p)
		get_task_struct(p);

	env->best_task = p;
	env->best_imp = imp;
	env->best_cpu = env->dst_cpu;
}

static bool load_too_imbalanced(long src_load, long dst_load,
				struct task_numa_env *env)
{
	long imb, old_imb;
	long orig_src_load, orig_dst_load;
	long src_capacity, dst_capacity;

	/*
	 * The load is corrected for the CPU capacity available on each node.
	 *
	 * src_load        dst_load
	 * ------------ vs ---------
	 * src_capacity    dst_capacity
	 */
	src_capacity = env->src_stats.compute_capacity;
	dst_capacity = env->dst_stats.compute_capacity;

	imb = abs(dst_load * src_capacity - src_load * dst_capacity);

	orig_src_load = env->src_stats.load;
	orig_dst_load = env->dst_stats.load;

	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);

	/* Would this change make things worse? */
	return (imb > old_imb);
}
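
/*
 * Illustrative example: a src node with capacity 2048 and load 3000, and a
 * dst node with capacity 1024 and load 1000. Moving 500 load to dst gives
 * imb = |1500*2048 - 2500*1024| = 512000, against
 * old_imb = |1000*2048 - 3000*1024| = 1024000; the capacity-weighted
 * imbalance shrinks, so the move is not considered too imbalanced.
 */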

/*
 * Maximum NUMA importance can be 1998 (2*999);
 * SMALLIMP @ 30 would be close to 1.5%
 */
#define SMALLIMP	30

/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source tasks was migrated to the target dst_cpu taking
 * into account that it might be best if task running on the dst_cpu should
 * be exchanged with the source task
 */
static void task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp, bool maymove)
{
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	struct task_struct *cur;
	long src_load, dst_load;
	long load;
	long imp = env->p->numa_group ? groupimp : taskimp;
	long moveimp = imp;
	int dist = env->dist;

	if (READ_ONCE(dst_rq->numa_migrate_on))
		return;

	rcu_read_lock();
	cur = task_rcu_dereference(&dst_rq->curr);
	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
		cur = NULL;

	/*
	 * Because we have preemption enabled we can get migrated around and
	 * end up trying to select ourselves (current == env->p) as a swap
	 * candidate.
	 */
	if (cur == env->p)
		goto unlock;

	if (!cur) {
		if (maymove && moveimp >= env->best_imp)
			goto assign;
		else
			goto unlock;
	}

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 */
	/* Skip this swap candidate if it cannot move to the source CPU. */
	if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
		goto unlock;

	/*
	 * If dst and source tasks are in the same NUMA group, or not
	 * in any group then look only at task weights.
	 */
	if (cur->numa_group == env->p->numa_group) {
		imp = taskimp + task_weight(cur, env->src_nid, dist) -
		      task_weight(cur, env->dst_nid, dist);
		/*
		 * Add some hysteresis to prevent swapping task <-> task
		 * ping-pongs.
		 */
		if (cur->numa_group)
			imp -= imp / 16;
	} else {
		/*
		 * Compare the group weights of the current task and the
		 * candidate task when swapping across different NUMA groups.
		 */
		if (cur->numa_group && env->p->numa_group)
			imp += group_weight(cur, env->src_nid, dist) -
			       group_weight(cur, env->dst_nid, dist);
		else
			imp += task_weight(cur, env->src_nid, dist) -
			       task_weight(cur, env->dst_nid, dist);
	}

	if (maymove && moveimp > imp && moveimp > env->best_imp) {
		imp = moveimp;
		cur = NULL;
		goto assign;
	}

	/*
	 * If the NUMA importance is less than SMALLIMP,
	 * task migration might only result in ping pong
	 * of tasks and also hurt performance due to cache
	 * misses.
	 */
	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
		goto unlock;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
	load = task_h_load(env->p) - task_h_load(cur);
	if (!load)
		goto assign;

	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	if (load_too_imbalanced(src_load, dst_load, env))
		goto unlock;

assign:
	/*
	 * One idle CPU per node is evaluated for a task numa move.
	 * Call select_idle_sibling to maybe find a better one.
	 */
	if (!cur) {
		/*
		 * select_idle_sibling() uses a per-CPU cpumask that
		 * can be used from IRQ context.
		 */
		local_irq_disable();
		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
						   env->dst_cpu);
		local_irq_enable();
	}

	task_numa_assign(env, cur, imp);
unlock:
	rcu_read_unlock();
}

static void task_numa_find_cpu(struct task_numa_env *env,
				long taskimp, long groupimp)
{
	long src_load, dst_load, load;
	bool maymove = false;
	int cpu;

	load = task_h_load(env->p);
	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	/*
	 * If the improvement from just moving env->p direction is better
	 * than swapping tasks around, check if a move is possible.
	 */
	maymove = !load_too_imbalanced(src_load, dst_load, env);

	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
		/* Skip this CPU if the source task cannot migrate */
		if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
			continue;

		env->dst_cpu = cpu;
		task_numa_compare(env, taskimp, groupimp, maymove);
	}
}

static int task_numa_migrate(struct task_struct *p)
{
	struct task_numa_env env = {
		.p = p,

		.src_cpu = task_cpu(p),
		.src_nid = task_node(p),

		.imbalance_pct = 112,

		.best_task = NULL,
		.best_imp = 0,
		.best_cpu = -1,
	};
	struct sched_domain *sd;
	struct rq *best_rq;
	unsigned long taskweight, groupweight;
	int nid, ret, dist;
	long taskimp, groupimp;

	/*
	 * Pick the lowest SD_NUMA domain, as that would have the smallest
	 * imbalance and would be the first to start moving tasks about.
	 *
	 * And we want to avoid any moving of tasks about, as that would create
	 * random movement of tasks -- counter the numa conditions we're trying
	 * to satisfy here.
	 */
	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
	if (sd)
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
	rcu_read_unlock();

	/*
	 * Cpusets can break the scheduler domain tree into smaller
	 * balance domains, some of which do not cross NUMA boundaries.
	 * Tasks that are "trapped" in such domains cannot be migrated
	 * elsewhere, so there is no point in (re)trying.
	 */
	if (unlikely(!sd)) {
		sched_setnuma(p, task_node(p));
		return -EINVAL;
	}

	env.dst_nid = p->numa_preferred_nid;
	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
	taskweight = task_weight(p, env.src_nid, dist);
	groupweight = group_weight(p, env.src_nid, dist);
	update_numa_stats(&env.src_stats, env.src_nid);
	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
	update_numa_stats(&env.dst_stats, env.dst_nid);

	/* Try to find a spot on the preferred nid. */
	task_numa_find_cpu(&env, taskimp, groupimp);

	/*
	 * Look at other nodes in these cases:
	 * - there is no space available on the preferred_nid
	 * - the task is part of a numa_group that is interleaved across
	 *   multiple NUMA nodes; in order to better consolidate the group,
	 *   we need to check other locations.
	 */
	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
		for_each_online_node(nid) {
			if (nid == env.src_nid || nid == p->numa_preferred_nid)
				continue;

			dist = node_distance(env.src_nid, env.dst_nid);
			if (sched_numa_topology_type == NUMA_BACKPLANE &&
						dist != env.dist) {
				taskweight = task_weight(p, env.src_nid, dist);
				groupweight = group_weight(p, env.src_nid, dist);
			}

			/* Only consider nodes where both task and groups benefit */
			taskimp = task_weight(p, nid, dist) - taskweight;
			groupimp = group_weight(p, nid, dist) - groupweight;
			if (taskimp < 0 && groupimp < 0)
				continue;

			env.dist = dist;
			env.dst_nid = nid;
			update_numa_stats(&env.dst_stats, env.dst_nid);
			task_numa_find_cpu(&env, taskimp, groupimp);
		}
	}

	/*
	 * If the task is part of a workload that spans multiple NUMA nodes,
	 * and is migrating into one of the workload's active nodes, remember
	 * this node as the task's preferred numa node, so the workload can
	 * settle down.
	 * A task that migrated to a second choice node will be better off
	 * trying for a better one later. Do not set the preferred node here.
	 */
	if (p->numa_group) {
		if (env.best_cpu == -1)
			nid = env.src_nid;
		else
			nid = cpu_to_node(env.best_cpu);

		if (nid != p->numa_preferred_nid)
			sched_setnuma(p, nid);
	}

	/* No better CPU than the current one was found. */
	if (env.best_cpu == -1)
		return -EAGAIN;

	best_rq = cpu_rq(env.best_cpu);
	if (env.best_task == NULL) {
		ret = migrate_task_to(p, env.best_cpu);
		WRITE_ONCE(best_rq->numa_migrate_on, 0);
		if (ret != 0)
			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
		return ret;
	}

	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
	WRITE_ONCE(best_rq->numa_migrate_on, 0);

	if (ret != 0)
		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
	put_task_struct(env.best_task);
	return ret;
}

/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
	unsigned long interval = HZ;

	/* This task has no NUMA fault statistics yet */
	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
		return;

	/* Periodically retry migrating the task to the preferred node */
	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
	p->numa_migrate_retry = jiffies + interval;

	/* Success if task is already running on preferred CPU */
	if (task_node(p) == p->numa_preferred_nid)
		return;

	/* Otherwise, try migrate to a CPU on the preferred node */
	task_numa_migrate(p);
}

/*
 * Find out how many nodes the workload is actively running on. Do this by
 * tracking the nodes from which NUMA hinting faults are triggered. This can
 * be different from the set of nodes where the workload's memory is currently
 * located.
 */
static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
	unsigned long faults, max_faults = 0;
	int nid, active_nodes = 0;

	for_each_online_node(nid) {
		faults = group_faults_cpu(numa_group, nid);
		if (faults > max_faults)
			max_faults = faults;
	}

	for_each_online_node(nid) {
		faults = group_faults_cpu(numa_group, nid);
		if (faults * ACTIVE_NODE_FRACTION > max_faults)
			active_nodes++;
	}

	numa_group->max_faults_cpu = max_faults;
	numa_group->active_nodes = active_nodes;
}

/*
 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 * slots; the private-vs-shared and local-vs-remote fault ratios are
 * expressed in slots and compared against NUMA_PERIOD_THRESHOLD to decide
 * whether scanning should slow down or speed up.
 */
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 7

/*
 * Increase the scan period (slow down scanning) if the majority of
 * our memory is already on our local node, or if the majority of
 * the page accesses are shared with other processes.
 * Otherwise, decrease the scan period.
 */
static void update_task_scan_period(struct task_struct *p,
			unsigned long shared, unsigned long private)
{
	unsigned int period_slot;
	int lr_ratio, ps_ratio;
	int diff;

	unsigned long remote = p->numa_faults_locality[0];
	unsigned long local = p->numa_faults_locality[1];

	/*
	 * If there were no record hinting faults then either the task is
	 * completely idle or all activity is in areas that are not of interest
	 * to automatic numa balancing. Related to that, if there were failed
	 * migrations then it implies we are migrating too quickly or the local
	 * node is overloaded. In either case, scan slower.
	 */
	if (local + shared == 0 || p->numa_faults_locality[2]) {
		p->numa_scan_period = min(p->numa_scan_period_max,
			p->numa_scan_period << 1);

		p->mm->numa_next_scan = jiffies +
			msecs_to_jiffies(p->numa_scan_period);

		return;
	}

	/*
	 * Prepare to scale scan period relative to the current period.
	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
	 *	 <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
	 */
	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);

	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most faults are private: migrate-on-fault will pull the
		 * memory towards the task, so fast scanning buys little.
		 */
		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most accesses are already local; there is little to gain
		 * from scanning faster, so back off.
		 */
		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/*
		 * Accesses are mostly shared and/or remote. Speed up NUMA
		 * scanning so placement can converge.
		 */
		int ratio = max(lr_ratio, ps_ratio);
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
	}

	p->numa_scan_period = clamp(p->numa_scan_period + diff,
			task_scan_min(p), task_scan_max(p));
	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
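
/*
 * Illustrative example: with NUMA_PERIOD_SLOTS = 10 and a current scan
 * period of 1000ms (period_slot = 100ms), a task with 90% private faults
 * has ps_ratio = 9 >= NUMA_PERIOD_THRESHOLD (7), so slot = 2 and the
 * period grows by 200ms to 1200ms (slower scanning). A task with only
 * 30% private and 30% local faults takes the else branch with ratio = 3,
 * shrinking the period by (7 - 3) * 100ms = 400ms (faster scanning).
 */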

/*
 * Get the fraction of time the task has been running since the last
 * NUMA placement cycle. The scheduler keeps similar statistics, but
 * decays those on a 32ms period, which is orders of magnitude off
 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
 * stats only if the task is so new there are no NUMA statistics yet.
 */
static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
{
	u64 runtime, delta, now;

	/* Use the start of this time slice to avoid calculations. */
	now = p->se.exec_start;
	runtime = p->se.sum_exec_runtime;

	if (p->last_task_numa_placement) {
		delta = runtime - p->last_sum_exec_runtime;
		*period = now - p->last_task_numa_placement;

		/* Avoid time going backwards, prevent potential divide error: */
		if (unlikely((s64)*period < 0))
			*period = 0;
	} else {
		delta = p->se.avg.load_sum;
		*period = LOAD_AVG_MAX;
	}

	p->last_sum_exec_runtime = runtime;
	p->last_task_numa_placement = now;

	return delta;
}

/*
 * Determine the preferred nid for a task in a numa_group. This needs to
 * be done in a way that produces consistent results with group_weight,
 * otherwise workloads might not converge.
 */
static int preferred_group_nid(struct task_struct *p, int nid)
{
	nodemask_t nodes;
	int dist;

	/* Direct connections between all NUMA nodes. */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return nid;

	/*
	 * On a system with glueless mesh NUMA topology, group_weight
	 * scores nodes according to the number of NUMA hinting faults on
	 * both the node itself, and on nearby nodes.
	 */
	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
		unsigned long score, max_score = 0;
		int node, max_node = nid;

		dist = sched_max_numa_distance;

		for_each_online_node(node) {
			score = group_weight(p, node, dist);
			if (score > max_score) {
				max_score = score;
				max_node = node;
			}
		}
		return max_node;
	}

	/*
	 * Finding the preferred nid in a system with NUMA backplane
	 * interconnect topology is more involved. The goal is to locate
	 * tasks from numa_groups near each other in the system, and
	 * untangle workloads from different sides of the system. This requires
	 * searching down the hierarchy of node groups, recursively searching
	 * inside the highest scoring group of nodes. The nodemask tricks
	 * keep the complexity of the search down.
	 */
	nodes = node_online_map;
	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		nodemask_t max_group = NODE_MASK_NONE;
		int a, b;

		/* Are there nodes at this distance from each other? */
		if (!find_numa_distance(dist))
			continue;

		for_each_node_mask(a, nodes) {
			unsigned long faults = 0;
			nodemask_t this_group;
			nodes_clear(this_group);

			/* Sum group's NUMA faults; includes a==b case. */
			for_each_node_mask(b, nodes) {
				if (node_distance(a, b) < dist) {
					faults += group_faults(p, b);
					node_set(b, this_group);
					node_clear(b, nodes);
				}
			}

			/* Remember the top group. */
			if (faults > max_faults) {
				max_faults = faults;
				max_group = this_group;
				/*
				 * subtle: at the smallest distance there is
				 * just one node left in each "group", the
				 * winner is the preferred nid.
				 */
				nid = a;
			}
		}
		/* Next round, evaluate the nodes within max_group. */
		if (!max_faults)
			break;
		nodes = max_group;
	}
	return nid;
}

static void task_numa_placement(struct task_struct *p)
{
	int seq, nid, max_nid = NUMA_NO_NODE;
	unsigned long max_faults = 0;
	unsigned long fault_types[2] = { 0, 0 };
	unsigned long total_faults;
	u64 runtime, period;
	spinlock_t *group_lock = NULL;

	/*
	 * The p->mm->numa_scan_seq field gets updated without
	 * exclusive access. Use READ_ONCE() here to ensure
	 * that the field is read in a single access:
	 */
	seq = READ_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	total_faults = p->numa_faults_locality[0] +
		       p->numa_faults_locality[1];
	runtime = numa_get_avg_runtime(p, &period);

	/* If the task is part of a group prevent parallel updates to group stats */
	if (p->numa_group) {
		group_lock = &p->numa_group->lock;
		spin_lock_irq(group_lock);
	}

	/* Find the node with the highest number of faults */
	for_each_online_node(nid) {
		/* Keep track of the offsets in numa_faults array */
		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
		unsigned long faults = 0, group_faults = 0;
		int priv;

		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
			long diff, f_diff, f_weight;

			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

			/* Decay existing window, copy faults since last scan */
			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
			fault_types[priv] += p->numa_faults[membuf_idx];
			p->numa_faults[membuf_idx] = 0;

			/*
			 * Normalize the faults_from, so all tasks in a group
			 * count according to CPU use, instead of by the raw
			 * number of faults. Tasks with little runtime have
			 * little over-all impact on throughput, and thus their
			 * faults are less important.
			 */
			f_weight = div64_u64(runtime << 16, period + 1);
			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				   (total_faults + 1);
			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
			p->numa_faults[cpubuf_idx] = 0;

			p->numa_faults[mem_idx] += diff;
			p->numa_faults[cpu_idx] += f_diff;
			faults += p->numa_faults[mem_idx];
			p->total_numa_faults += diff;
			if (p->numa_group) {
				/*
				 * safe because we can only change our own group
				 *
				 * mem_idx represents the offset for a given
				 * nid and priv in a specific region because it
				 * is at the beginning of the numa_faults array.
				 */
				p->numa_group->faults[mem_idx] += diff;
				p->numa_group->faults_cpu[mem_idx] += f_diff;
				p->numa_group->total_faults += diff;
				group_faults += p->numa_group->faults[mem_idx];
			}
		}

		if (!p->numa_group) {
			if (faults > max_faults) {
				max_faults = faults;
				max_nid = nid;
			}
		} else if (group_faults > max_faults) {
			max_faults = group_faults;
			max_nid = nid;
		}
	}

	if (p->numa_group) {
		numa_group_count_active_nodes(p->numa_group);
		spin_unlock_irq(group_lock);
		max_nid = preferred_group_nid(p, max_nid);
	}

	if (max_faults) {
		/* Set the new preferred node */
		if (max_nid != p->numa_preferred_nid)
			sched_setnuma(p, max_nid);
	}

	update_task_scan_period(p, fault_types[0], fault_types[1]);
}

static inline int get_numa_group(struct numa_group *grp)
{
	return refcount_inc_not_zero(&grp->refcount);
}

static inline void put_numa_group(struct numa_group *grp)
{
	if (refcount_dec_and_test(&grp->refcount))
		kfree_rcu(grp, rcu);
}

static void task_numa_group(struct task_struct *p, int cpupid, int flags,
			int *priv)
{
	struct numa_group *grp, *my_grp;
	struct task_struct *tsk;
	bool join = false;
	int cpu = cpupid_to_cpu(cpupid);
	int i;

	if (unlikely(!p->numa_group)) {
		unsigned int size = sizeof(struct numa_group) +
				    4*nr_node_ids*sizeof(unsigned long);

		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
		if (!grp)
			return;

		refcount_set(&grp->refcount, 1);
		grp->active_nodes = 1;
		grp->max_faults_cpu = 0;
		spin_lock_init(&grp->lock);
		grp->gid = p->pid;
		/* Second half of the array tracks nids where faults happen */
		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
						nr_node_ids;

		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] = p->numa_faults[i];

		grp->total_faults = p->total_numa_faults;

		grp->nr_tasks++;
		rcu_assign_pointer(p->numa_group, grp);
	}

	rcu_read_lock();
	tsk = READ_ONCE(cpu_rq(cpu)->curr);

	if (!cpupid_match_pid(tsk, cpupid))
		goto no_join;

	grp = rcu_dereference(tsk->numa_group);
	if (!grp)
		goto no_join;

	my_grp = p->numa_group;
	if (grp == my_grp)
		goto no_join;

	/*
	 * Only join the other group if its bigger; if we're the bigger group,
	 * the other task will join us.
	 */
	if (my_grp->nr_tasks > grp->nr_tasks)
		goto no_join;

	/*
	 * Tie-break on the grp address.
	 */
	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
		goto no_join;

	/* Always join threads in the same process. */
	if (tsk->mm == current->mm)
		join = true;

	/* Simple filter to avoid false positives due to PID collisions */
	if (flags & TNF_SHARED)
		join = true;

	/* Update priv based on whether false sharing was detected */
	*priv = !join;

	if (join && !get_numa_group(grp))
		goto no_join;

	rcu_read_unlock();

	if (!join)
		return;

	BUG_ON(irqs_disabled());
	double_lock_irq(&my_grp->lock, &grp->lock);

	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
		my_grp->faults[i] -= p->numa_faults[i];
		grp->faults[i] += p->numa_faults[i];
	}
	my_grp->total_faults -= p->total_numa_faults;
	grp->total_faults += p->total_numa_faults;

	my_grp->nr_tasks--;
	grp->nr_tasks++;

	spin_unlock(&my_grp->lock);
	spin_unlock_irq(&grp->lock);

	rcu_assign_pointer(p->numa_group, grp);

	put_numa_group(my_grp);
	return;

no_join:
	rcu_read_unlock();
	return;
}

void task_numa_free(struct task_struct *p)
{
	struct numa_group *grp = p->numa_group;
	void *numa_faults = p->numa_faults;
	unsigned long flags;
	int i;

	if (grp) {
		spin_lock_irqsave(&grp->lock, flags);
		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] -= p->numa_faults[i];
		grp->total_faults -= p->total_numa_faults;

		grp->nr_tasks--;
		spin_unlock_irqrestore(&grp->lock, flags);
		RCU_INIT_POINTER(p->numa_group, NULL);
		put_numa_group(grp);
	}

	p->numa_faults = NULL;
	kfree(numa_faults);
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
	struct task_struct *p = current;
	bool migrated = flags & TNF_MIGRATED;
	int cpu_node = task_node(current);
	int local = !!(flags & TNF_FAULT_LOCAL);
	struct numa_group *ng;
	int priv;

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	/* for example, ksmd faulting in a user's mm */
	if (!p->mm)
		return;

	/* Allocate buffer to track faults on a per-node basis */
	if (unlikely(!p->numa_faults)) {
		int size = sizeof(*p->numa_faults) *
			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
		if (!p->numa_faults)
			return;

		p->total_numa_faults = 0;
		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
	}

	/*
	 * First accesses are treated as private, otherwise consider accesses
	 * to be private if the accessing pid has not changed
	 */
	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
		priv = 1;
	} else {
		priv = cpupid_match_pid(p, last_cpupid);
		if (!priv && !(flags & TNF_NO_GROUP))
			task_numa_group(p, last_cpupid, flags, &priv);
	}

	/*
	 * If a workload spans multiple NUMA nodes, a shared fault that
	 * occurs wholly within the set of nodes that the workload is
	 * actively using should be counted as local. This allows the
	 * scan rate to slow down when a workload has settled down.
	 */
	ng = p->numa_group;
	if (!priv && !local && ng && ng->active_nodes > 1 &&
				numa_is_active_node(cpu_node, ng) &&
				numa_is_active_node(mem_node, ng))
		local = 1;

	/*
	 * Retry to migrate task to preferred node periodically, in case it
	 * previously failed, or the scheduler moved us.
	 */
	if (time_after(jiffies, p->numa_migrate_retry)) {
		task_numa_placement(p);
		numa_migrate_preferred(p);
	}

	if (migrated)
		p->numa_pages_migrated += pages;
	if (flags & TNF_MIGRATE_FAIL)
		p->numa_faults_locality[2] += pages;

	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
	p->numa_faults_locality[local] += pages;
}

static void reset_ptenuma_scan(struct task_struct *p)
{
	/*
	 * We only did a read acquisition of the mmap sem, so
	 * p->mm->numa_scan_seq is written to without exclusive access
	 * and the update is not guaranteed to be atomic. That's not
	 * much of an issue though, since this is just used for
	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
	 * expensive, to avoid any form of compiler optimizations:
	 */
	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
	p->mm->numa_scan_offset = 0;
}

/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	u64 runtime = p->se.sum_exec_runtime;
	struct vm_area_struct *vma;
	unsigned long start, end;
	unsigned long nr_pte_updates = 0;
	long pages, virtpages;

	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));

	work->next = work;
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	if (!mm->numa_next_scan) {
		mm->numa_next_scan = now +
			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0) {
		p->numa_scan_period_max = task_scan_max(p);
		p->numa_scan_period = task_scan_start(p);
	}

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Delay this task enough that another task of this mm will likely win
	 * the next time around.
	 */
	p->node_stamp += 2 * TICK_NSEC;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	virtpages = pages * 8;	   /* Scan up to this much virtual space */
	if (!pages)
		return;

	if (!down_read_trylock(&mm->mmap_sem))
		return;
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
			continue;
		}

		/*
		 * Shared library pages mapped by multiple processes are not
		 * migrated as it is expected they are cache replicated. Avoid
		 * hinting faults in read-only file-backed mappings or the vdso
		 * as migrating the pages will be of marginal benefit.
		 */
		if (!vma->vm_mm ||
		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
			continue;

		/*
		 * Skip inaccessible VMAs to avoid any confusion between
		 * PROT_NONE and NUMA hinting ptes
		 */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			nr_pte_updates = change_prot_numa(vma, start, end);

			/*
			 * Try to scan sysctl_numa_balancing_scan_size worth of
			 * hpages that have at least one present PTE that
			 * is not already pte-numa. If the VMA contains
			 * areas that are unused or already full of prot_numa
			 * PTEs, scan up to virtpages, to skip through those
			 * AREAs faster.
			 */
			if (nr_pte_updates)
				pages -= (end - start) >> PAGE_SHIFT;
			virtpages -= (end - start) >> PAGE_SHIFT;

			start = end;
			if (pages <= 0 || virtpages <= 0)
				goto out;

			cond_resched();
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);

	/*
	 * Make sure tasks use at least 32x as much time to run other code
	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
	 * Usually update_task_scan_period slows down scanning enough; on an
	 * overloaded system we need to limit overhead on a per task basis.
	 */
	if (unlikely(p->se.sum_exec_runtime != runtime)) {
		u64 diff = p->se.sum_exec_runtime - runtime;
		p->node_stamp += 32 * diff;
	}
}

/*
 * Drive the periodic memory faults..
 */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now > curr->node_stamp + period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = task_scan_start(curr);
		curr->node_stamp += period;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work);
			task_work_add(curr, work, true);
		}
	}
}

static void update_scan_period(struct task_struct *p, int new_cpu)
{
	int src_nid = cpu_to_node(task_cpu(p));
	int dst_nid = cpu_to_node(new_cpu);

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
		return;

	if (src_nid == dst_nid)
		return;

	/*
	 * Allow resets if faults have been trapped before one scan
	 * completes. This is most likely due to a new task that is
	 * affine to an existing address space.
	 */
	if (p->numa_scan_seq) {
		/*
		 * Avoid scan adjustments if moving to the preferred
		 * node or if the task was not previously running on
		 * the preferred node.
		 */
		if (dst_nid == p->numa_preferred_nid ||
		    (p->numa_preferred_nid != NUMA_NO_NODE &&
			src_nid != p->numa_preferred_nid))
			return;
	}

	p->numa_scan_period = task_scan_start(p);
}

#else /* !CONFIG_NUMA_BALANCING */

static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}

static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}

static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}

static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}

#endif /* CONFIG_NUMA_BALANCING */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se)) {
		struct rq *rq = rq_of(cfs_rq);

		account_numa_enqueue(rq, task_of(se));
		list_add(&se->group_node, &rq->cfs_tasks);
	}
#endif
	cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se)) {
		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
		list_del_init(&se->group_node);
	}
#endif
	cfs_rq->nr_running--;
}

/*
 * Signed add and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define add_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(_val) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
								\
	res = var + val;					\
								\
	if (val < 0 && res > var)				\
		res = 0;					\
								\
	WRITE_ONCE(*ptr, res);					\
} while (0)

/*
 * Unsigned subtract and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
	res = var - val;					\
	if (res > var)						\
		res = 0;					\
	WRITE_ONCE(*ptr, res);					\
} while (0)

/*
 * Remove and clamp on negative, from a local variable.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates.
 */
#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)
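
/*
 * Illustrative usage sketch (hypothetical values, not from this file):
 *
 *	unsigned long avg = 100;
 *	sub_positive(&avg, 250);	// clamps to 0 instead of wrapping
 *
 *	unsigned long rem = 100;
 *	lsub_positive(&rem, 250);	// likewise clamps a local to 0
 *
 * Without the clamp, the unsigned subtraction would wrap around to a huge
 * value and corrupt the load-average accounting these macros guard.
 */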
2764
2765#ifdef CONFIG_SMP
2766static inline void
2767enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2768{
2769 cfs_rq->runnable_weight += se->runnable_weight;
2770
2771 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2772 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2773}
2774
2775static inline void
2776dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2777{
2778 cfs_rq->runnable_weight -= se->runnable_weight;
2779
2780 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2781 sub_positive(&cfs_rq->avg.runnable_load_sum,
2782 se_runnable(se) * se->avg.runnable_load_sum);
2783}
2784
2785static inline void
2786enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2787{
2788 cfs_rq->avg.load_avg += se->avg.load_avg;
2789 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2790}
2791
2792static inline void
2793dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2794{
2795 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2796 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2797}
2798#else
2799static inline void
2800enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2801static inline void
2802dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2803static inline void
2804enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2805static inline void
2806dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2807#endif
2808
2809static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2810 unsigned long weight, unsigned long runnable)
2811{
2812 if (se->on_rq) {
2813
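 /* commit outstanding execution time */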
2814 if (cfs_rq->curr == se)
2815 update_curr(cfs_rq);
2816 account_entity_dequeue(cfs_rq, se);
2817 dequeue_runnable_load_avg(cfs_rq, se);
2818 }
2819 dequeue_load_avg(cfs_rq, se);
2820
2821 se->runnable_weight = runnable;
2822 update_load_set(&se->load, weight);
2823
2824#ifdef CONFIG_SMP
2825 do {
2826 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2827
2828 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2829 se->avg.runnable_load_avg =
2830 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2831 } while (0);
2832#endif
2833
2834 enqueue_load_avg(cfs_rq, se);
2835 if (se->on_rq) {
2836 account_entity_enqueue(cfs_rq, se);
2837 enqueue_runnable_load_avg(cfs_rq, se);
2838 }
2839}
2840
2841void reweight_task(struct task_struct *p, int prio)
2842{
2843 struct sched_entity *se = &p->se;
2844 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2845 struct load_weight *load = &se->load;
2846 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2847
2848 reweight_entity(cfs_rq, se, weight, weight);
2849 load->inv_weight = sched_prio_to_wmult[prio];
2850}
2851
2852#ifdef CONFIG_FAIR_GROUP_SCHED
2853#ifdef CONFIG_SMP
/*
 * All this does is approximate the hierarchical proportion which includes that
 * global sum we all love to hate.
 *
 * That is, the weight of a group entity is the proportional share of the
 * group weight based on the group runqueue weights. That is:
 *
 *                     tg->weight * grq->load.weight
 *   ge->load.weight = -----------------------------               (1)
 *                       \Sum grq->load.weight
 *
 * Now, because computing that sum is prohibitively expensive to compute (been
 * there, done that) we approximate it with this average stuff. The average
 * moves slower and therefore the approximation is cheaper and more stable.
 *
 * So instead of the above, we substitute:
 *
 *   grq->load.weight -> grq->avg.load_avg                         (2)
 *
 * which yields the following:
 *
 *                     tg->weight * grq->avg.load_avg
 *   ge->load.weight = ------------------------------              (3)
 *                             tg->load_avg
 *
 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
 *
 * That is shares_avg, and it is right (given the approximation (2)).
 *
 * The problem with it is that because the average is slow -- it was designed
 * to be exactly that of course -- this leads to transients in boundary
 * conditions. In specific, the case where the group was idle and we start the
 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
 * yielding bad latency etc..
 *
 * Now, in that special case (1) reduces to:
 *
 *                     tg->weight * grq->load.weight
 *   ge->load.weight = ----------------------------- = tg->weight  (4)
 *                         grq->load.weight
 *
 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
 *
 * So what we do is modify our approximation (3) to approach (4) in the (near)
 * UP case, like:
 *
 *   ge->load.weight =
 *
 *              tg->weight * grq->load.weight
 *     ---------------------------------------------------         (5)
 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
 *
 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
 * we need to use grq->avg.load_avg as its lower bound, which then gives:
 *
 *                     tg->weight * grq->load.weight
 *   ge->load.weight = -----------------------------               (6)
 *                             tg_load_avg'
 *
 * Where:
 *
 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
 *                  max(grq->load.weight, grq->avg.load_avg)
 *
 * And that is shares_weight and is icky. In the (near) UP case it approaches
 * (4) while in the normal case it approaches (3). It consistently
 * behaves well from the interleaving of the two cases.
 */
2927static long calc_group_shares(struct cfs_rq *cfs_rq)
2928{
2929 long tg_weight, tg_shares, load, shares;
2930 struct task_group *tg = cfs_rq->tg;
2931
2932 tg_shares = READ_ONCE(tg->shares);
2933
2934 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
2935
2936 tg_weight = atomic_long_read(&tg->load_avg);
2937
2938
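 /* Ensure tg_weight >= load */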
2939 tg_weight -= cfs_rq->tg_load_avg_contrib;
2940 tg_weight += load;
2941
2942 shares = (tg_shares * load);
2943 if (tg_weight)
2944 shares /= tg_weight;
2945
 /*
  * MIN_SHARES has to be unscaled here to support per-CPU partitioning
  * of a group with small tg->shares value. It is a floor value which is
  * assigned as a minimum load.weight to the sched_entity representing
  * the group on a CPU.
  *
  * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
  * on an 8-core system with 8 tasks each runnable on one CPU, shares has
  * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
  * case no task is runnable on a CPU, MIN_SHARES=2 should be returned
  * instead of 0.
  */
2958 return clamp_t(long, shares, MIN_SHARES, tg_shares);
2959}
2960
/*
 * This calculates the effective runnable weight for a group entity based on
 * the group entity weight calculated above.
 *
 * Because of the above approximation (2), our group entity weight is
 * a load_avg based ratio (3). This means that it includes blocked load and
 * does not represent the runnable weight.
 *
 * Approximate the group entity's runnable weight per ratio from the group
 * runqueue:
 *
 *                                                 grq->avg.runnable_load_avg
 *   ge->avg.runnable_load_avg = ge->load.weight * --------------------------
 *                                                     grq->avg.load_avg
 *
 * However, analogous to above, since the avg numbers are slow, this leads to
 * transients in the from-idle case. Instead we use:
 *
 *   ge->avg.runnable_load_avg = ge->load.weight *
 *
 *              max(grq->avg.runnable_load_avg, grq->runnable_weight)
 *              -----------------------------------------------------
 *                      max(grq->avg.load_avg, grq->load.weight)
 *
 * Where these max() serve both to use the 'instant' values to fix the slow
 * from-idle and avoid the /0 on to-idle, as above.
 */
2988static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2989{
2990 long runnable, load_avg;
2991
2992 load_avg = max(cfs_rq->avg.load_avg,
2993 scale_load_down(cfs_rq->load.weight));
2994
2995 runnable = max(cfs_rq->avg.runnable_load_avg,
2996 scale_load_down(cfs_rq->runnable_weight));
2997
2998 runnable *= shares;
2999 if (load_avg)
3000 runnable /= load_avg;
3001
3002 return clamp_t(long, runnable, MIN_SHARES, shares);
3003}
3004#endif
3005
3006static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3007
/*
 * Recomputes the group entity based on the current state of its group
 * runqueue.
 */
3012static void update_cfs_group(struct sched_entity *se)
3013{
3014 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3015 long shares, runnable;
3016
3017 if (!gcfs_rq)
3018 return;
3019
3020 if (throttled_hierarchy(gcfs_rq))
3021 return;
3022
3023#ifndef CONFIG_SMP
3024 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3025
3026 if (likely(se->load.weight == shares))
3027 return;
3028#else
3029 shares = calc_group_shares(gcfs_rq);
3030 runnable = calc_group_runnable(gcfs_rq, shares);
3031#endif
3032
3033 reweight_entity(cfs_rq_of(se), se, shares, runnable);
3034}
3035
3036#else
3037static inline void update_cfs_group(struct sched_entity *se)
3038{
3039}
3040#endif
3041
3042static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3043{
3044 struct rq *rq = rq_of(cfs_rq);
3045
3046 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
 /*
  * There are a few boundary cases this might miss but it should
  * get called often enough that that should (hopefully) not be
  * a real problem.
  *
  * It will not get called when we go idle, because the idle
  * thread is a different class (!fair), nor will the utilization
  * number include things like RT tasks.
  *
  * As is, the util number is not freq-invariant (we'd have to
  * implement arch_scale_freq_capacity() for that).
  *
  * See cpu_util().
  */
3061 cpufreq_update_util(rq, flags);
3062 }
3063}
3064
3065#ifdef CONFIG_SMP
3066#ifdef CONFIG_FAIR_GROUP_SCHED
/**
 * update_tg_load_avg - update the tg's load avg
 * @cfs_rq: the cfs_rq whose avg changed
 * @force: update regardless of how small the difference
 *
 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
 * However, because tg->load_avg is a global value there are performance
 * considerations.
 *
 * In order to avoid having to look at the other cfs_rq's, we use a
 * differential update where we store our last propagated value
 * (tg_load_avg_contrib). This in turn allows skipping updates if the
 * differential is 'small'.
 *
 * Updating tg's load_avg is necessary before update_cfs_group().
 */
3082static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3083{
3084 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3085
3086
3087
3088
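 /*
  * No need to update load_avg for root_task_group as it is not used.
  */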
3089 if (cfs_rq->tg == &root_task_group)
3090 return;
3091
3092 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3093 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3094 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3095 }
3096}
3097
/*
 * Called within set_task_rq() right before setting a task's CPU. The
 * caller only guarantees p->pi_lock is held; no other assumptions,
 * including the state of rq->lock, should be made.
 */
3103void set_task_rq_fair(struct sched_entity *se,
3104 struct cfs_rq *prev, struct cfs_rq *next)
3105{
3106 u64 p_last_update_time;
3107 u64 n_last_update_time;
3108
3109 if (!sched_feat(ATTACH_AGE_LOAD))
3110 return;
3111
 /*
  * We are supposed to update the task to "current" time, then its up to
  * date and ready to go to new CPU/cfs_rq. But we have difficulty in
  * getting what current time is, so simply throw away the out-of-date
  * time. This will result in the wakee task being less decayed, but
  * giving the wakee more load sounds not bad.
  */
3119 if (!(se->avg.last_update_time && prev))
3120 return;
3121
3122#ifndef CONFIG_64BIT
3123 {
3124 u64 p_last_update_time_copy;
3125 u64 n_last_update_time_copy;
3126
3127 do {
3128 p_last_update_time_copy = prev->load_last_update_time_copy;
3129 n_last_update_time_copy = next->load_last_update_time_copy;
3130
3131 smp_rmb();
3132
3133 p_last_update_time = prev->avg.last_update_time;
3134 n_last_update_time = next->avg.last_update_time;
3135
3136 } while (p_last_update_time != p_last_update_time_copy ||
3137 n_last_update_time != n_last_update_time_copy);
3138 }
3139#else
3140 p_last_update_time = prev->avg.last_update_time;
3141 n_last_update_time = next->avg.last_update_time;
3142#endif
3143 __update_load_avg_blocked_se(p_last_update_time, se);
3144 se->avg.last_update_time = n_last_update_time;
3145}
3146
/*
 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
 * propagate its contribution. The key to this propagation is the invariant
 * that for each group:
 *
 *   ge->avg == grq->avg                                           (1)
 *
 * _IFF_ we look at the pure running and runnable sums. Because they
 * represent the very same entity, just at different points in the hierarchy.
 *
 * Per the above update_tg_cfs_util() is trivial and simply copies the running
 * sum over (but still wrong, because the group entity and group rq do not have
 * their PELT windows aligned).
 *
 * However, update_tg_cfs_runnable() is more complex. So we have:
 *
 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg     (2)
 *
 * And since, like util, the runnable part should be directly transferable,
 * the following would _appear_ to be the straight forward approach:
 *
 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg  (3)
 *
 * And per (1) we have:
 *
 *   ge->avg.runnable_avg == grq->avg.runnable_avg
 *
 * Which gives:
 *
 *                      ge->load.weight * grq->avg.load_avg
 *   ge->avg.load_avg = -----------------------------------        (4)
 *                               grq->load.weight
 *
 * Except that is wrong!
 *
 * Because while for entities historical weight is not important and we
 * really only care about our future and therefore can consider a pure
 * runnable sum, runqueues can NOT do this.
 *
 * We specifically want runqueues to have a load_avg that includes
 * historical weights. Those represent the blocked load, the load we expect
 * to (shortly) return to us. This only works by keeping the weights as
 * integral part of the sum. We therefore cannot decompose as per (3).
 *
 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
 * runnable sections of these tasks overlap (or not). If they were to
 * perfectly align the rq as a whole would be runnable 2/3 of the time. If
 * however we always have at least 1 runnable task, the rq as a whole is
 * always runnable.
 *
 * So we'll have to approximate.. :/
 *
 * Given the constraint:
 *
 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
 *
 * We can construct a rule that adds runnable to a rq by assuming minimal
 * overlap.
 *
 * On removal, we'll assume each task is equally runnable; which yields:
 *
 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
 *
 * XXX: only do this for the part of runnable > running ?
 */
3216static inline void
3217update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3218{
3219 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3220
3221
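 /* Nothing to update */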
3222 if (!delta)
3223 return;
3224
 /*
  * The relation between sum and avg is:
  *
  *   LOAD_AVG_MAX - 1024 + sa->period_contrib
  *
  * however, the PELT windows are not aligned between grq and gse.
  */

 /* Set new sched_entity's utilization */
3234 se->avg.util_avg = gcfs_rq->avg.util_avg;
3235 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3236
3237
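 /* Update parent cfs_rq utilization */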
3238 add_positive(&cfs_rq->avg.util_avg, delta);
3239 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3240}
3241
3242static inline void
3243update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3244{
3245 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3246 unsigned long runnable_load_avg, load_avg;
3247 u64 runnable_load_sum, load_sum = 0;
3248 s64 delta_sum;
3249
3250 if (!runnable_sum)
3251 return;
3252
3253 gcfs_rq->prop_runnable_sum = 0;
3254
3255 if (runnable_sum >= 0) {
3256
3257
3258
3259
3260 runnable_sum += se->avg.load_sum;
3261 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3262 } else {
3263
3264
3265
3266
3267 if (scale_load_down(gcfs_rq->load.weight)) {
3268 load_sum = div_s64(gcfs_rq->avg.load_sum,
3269 scale_load_down(gcfs_rq->load.weight));
3270 }
3271
3272
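 /* But make sure to not inflate se's runnable */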
3273 runnable_sum = min(se->avg.load_sum, load_sum);
3274 }
3275
 /*
  * runnable_sum can't be lower than running_sum.
  * Rescale running sum to be in the same range as runnable sum:
  * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
  * runnable_sum is in [0 : LOAD_AVG_MAX]
  */
3282 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3283 runnable_sum = max(runnable_sum, running_sum);
3284
3285 load_sum = (s64)se_weight(se) * runnable_sum;
3286 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3287
3288 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3289 delta_avg = load_avg - se->avg.load_avg;
3290
3291 se->avg.load_sum = runnable_sum;
3292 se->avg.load_avg = load_avg;
3293 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3294 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3295
3296 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3297 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3298 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3299 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3300
3301 se->avg.runnable_load_sum = runnable_sum;
3302 se->avg.runnable_load_avg = runnable_load_avg;
3303
3304 if (se->on_rq) {
3305 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3306 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3307 }
3308}
3309
3310static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3311{
3312 cfs_rq->propagate = 1;
3313 cfs_rq->prop_runnable_sum += runnable_sum;
3314}
3315
3316
3317static inline int propagate_entity_load_avg(struct sched_entity *se)
3318{
3319 struct cfs_rq *cfs_rq, *gcfs_rq;
3320
3321 if (entity_is_task(se))
3322 return 0;
3323
3324 gcfs_rq = group_cfs_rq(se);
3325 if (!gcfs_rq->propagate)
3326 return 0;
3327
3328 gcfs_rq->propagate = 0;
3329
3330 cfs_rq = cfs_rq_of(se);
3331
3332 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3333
3334 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3335 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3336
3337 return 1;
3338}
3339
/*
 * Check if we need to update the load and the utilization of a blocked
 * group_cfs_rq:
 */
3344static inline bool skip_blocked_update(struct sched_entity *se)
3345{
3346 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3347
3348
3349
3350
3351
3352 if (se->avg.load_avg || se->avg.util_avg)
3353 return false;
3354
3355
3356
3357
3358
3359 if (gcfs_rq->propagate)
3360 return false;
3361
3362
3363
3364
3365
3366
3367 return true;
3368}
3369
3370#else
3371
3372static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3373
3374static inline int propagate_entity_load_avg(struct sched_entity *se)
3375{
3376 return 0;
3377}
3378
3379static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3380
3381#endif
3382
/**
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_pelt()
 * @cfs_rq: cfs_rq to update
 *
 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 * avg. The immediate corollary is that all (fair) tasks must be attached, see
 * post_init_entity_util_avg().
 *
 * cfs_rq->avg is used for task_h_load() and update_cfs_group() for example.
 *
 * Returns true if the load decayed or we removed load.
 *
 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 * call update_tg_load_avg() when this function returns true.
 */
3399static inline int
3400update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3401{
3402 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3403 struct sched_avg *sa = &cfs_rq->avg;
3404 int decayed = 0;
3405
3406 if (cfs_rq->removed.nr) {
3407 unsigned long r;
3408 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3409
3410 raw_spin_lock(&cfs_rq->removed.lock);
3411 swap(cfs_rq->removed.util_avg, removed_util);
3412 swap(cfs_rq->removed.load_avg, removed_load);
3413 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3414 cfs_rq->removed.nr = 0;
3415 raw_spin_unlock(&cfs_rq->removed.lock);
3416
3417 r = removed_load;
3418 sub_positive(&sa->load_avg, r);
3419 sub_positive(&sa->load_sum, r * divider);
3420
3421 r = removed_util;
3422 sub_positive(&sa->util_avg, r);
3423 sub_positive(&sa->util_sum, r * divider);
3424
3425 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3426
3427 decayed = 1;
3428 }
3429
3430 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3431
3432#ifndef CONFIG_64BIT
3433 smp_wmb();
3434 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3435#endif
3436
3437 if (decayed)
3438 cfs_rq_util_change(cfs_rq, 0);
3439
3440 return decayed;
3441}
3442
/**
 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 * @cfs_rq: cfs_rq to attach to
 * @se: sched_entity to attach
 * @flags: migration hints
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
3452static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3453{
3454 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3455
 /*
  * When we attach the @se to the @cfs_rq, we must align the decay
  * window because without that, really weird and wonderful things can
  * happen.
  */
3463 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3464 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3465
 /*
  * We need to recompute the _sum values based on the new
  * period_contrib. This isn't strictly correct, but since we're
  * entirely outside of the PELT hierarchy, nobody cares if we
  * truncate here.
  */
3472 se->avg.util_sum = se->avg.util_avg * divider;
3473
3474 se->avg.load_sum = divider;
3475 if (se_weight(se)) {
3476 se->avg.load_sum =
3477 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3478 }
3479
3480 se->avg.runnable_load_sum = se->avg.load_sum;
3481
3482 enqueue_load_avg(cfs_rq, se);
3483 cfs_rq->avg.util_avg += se->avg.util_avg;
3484 cfs_rq->avg.util_sum += se->avg.util_sum;
3485
3486 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3487
3488 cfs_rq_util_change(cfs_rq, flags);
3489}
3490
/**
 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 * @cfs_rq: cfs_rq to detach from
 * @se: sched_entity to detach
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
3499static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3500{
3501 dequeue_load_avg(cfs_rq, se);
3502 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3503 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3504
3505 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3506
3507 cfs_rq_util_change(cfs_rq, 0);
3508}
3509
3510
3511
3512
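/*
 * Optional action to be done while updating the load average:
 */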
3513#define UPDATE_TG 0x1
3514#define SKIP_AGE_LOAD 0x2
3515#define DO_ATTACH 0x4
3516
3517
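/* Update task and its cfs_rq load average */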
3518static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3519{
3520 u64 now = cfs_rq_clock_pelt(cfs_rq);
3521 int decayed;
3522
 /*
  * Track task load average for carrying it to a new CPU after migration,
  * and track group sched_entity load average for task_h_load() calc in
  * migration.
  */
3527 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3528 __update_load_avg_se(now, cfs_rq, se);
3529
3530 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3531 decayed |= propagate_entity_load_avg(se);
3532
3533 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
 /*
  * DO_ATTACH means we're here from enqueue_entity().
  * !last_update_time means we've passed through
  * migrate_task_rq_fair() indicating we migrated.
  *
  * IOW we're enqueueing a task on a new CPU.
  */
3542 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3543 update_tg_load_avg(cfs_rq, 0);
3544
3545 } else if (decayed && (flags & UPDATE_TG))
3546 update_tg_load_avg(cfs_rq, 0);
3547}
3548
3549#ifndef CONFIG_64BIT
3550static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3551{
3552 u64 last_update_time_copy;
3553 u64 last_update_time;
3554
3555 do {
3556 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3557 smp_rmb();
3558 last_update_time = cfs_rq->avg.last_update_time;
3559 } while (last_update_time != last_update_time_copy);
3560
3561 return last_update_time;
3562}
3563#else
3564static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3565{
3566 return cfs_rq->avg.last_update_time;
3567}
3568#endif
3569
/*
 * Synchronize entity load avg of dequeued entity without locking
 * the rq.
 */
3574static void sync_entity_load_avg(struct sched_entity *se)
3575{
3576 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3577 u64 last_update_time;
3578
3579 last_update_time = cfs_rq_last_update_time(cfs_rq);
3580 __update_load_avg_blocked_se(last_update_time, se);
3581}
3582
/*
 * Task first catches up with cfs_rq, and then subtracts
 * itself from the cfs_rq (task must be off the queue now).
 */
3587static void remove_entity_load_avg(struct sched_entity *se)
3588{
3589 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3590 unsigned long flags;
3591
 /*
  * tasks cannot exit without having gone through wake_up_new_task() ->
  * post_init_entity_util_avg() which will have added things to the
  * cfs_rq, so we can remove unconditionally.
  *
  * Similarly for groups, see unregister_fair_sched_group().
  */
3598 sync_entity_load_avg(se);
3599
3600 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3601 ++cfs_rq->removed.nr;
3602 cfs_rq->removed.util_avg += se->avg.util_avg;
3603 cfs_rq->removed.load_avg += se->avg.load_avg;
3604 cfs_rq->removed.runnable_sum += se->avg.load_sum;
3605 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3606}
3607
3608static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3609{
3610 return cfs_rq->avg.runnable_load_avg;
3611}
3612
3613static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3614{
3615 return cfs_rq->avg.load_avg;
3616}
3617
3618static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3619
3620static inline unsigned long task_util(struct task_struct *p)
3621{
3622 return READ_ONCE(p->se.avg.util_avg);
3623}
3624
3625static inline unsigned long _task_util_est(struct task_struct *p)
3626{
3627 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3628
3629 return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
3630}
3631
3632static inline unsigned long task_util_est(struct task_struct *p)
3633{
3634 return max(task_util(p), _task_util_est(p));
3635}
3636
3637static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3638 struct task_struct *p)
3639{
3640 unsigned int enqueued;
3641
3642 if (!sched_feat(UTIL_EST))
3643 return;
3644
3645
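 /* Update root cfs_rq's estimated utilization */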
3646 enqueued = cfs_rq->avg.util_est.enqueued;
3647 enqueued += _task_util_est(p);
3648 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3649}
3650
/*
 * Check if a (signed) value is within a specified (unsigned) margin,
 * based on the observation that:
 *
 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 *
 * NOTE: this only works when value + margin < INT_MAX.
 */
3659static inline bool within_margin(int value, int margin)
3660{
3661 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3662}
3663
3664static void
3665util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3666{
3667 long last_ewma_diff;
3668 struct util_est ue;
3669 int cpu;
3670
3671 if (!sched_feat(UTIL_EST))
3672 return;
3673
3674
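 /* Update root cfs_rq's estimated utilization */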
3675 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3676 ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
3677 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3678
3679
3680
3681
3682
3683 if (!task_sleep)
3684 return;
3685
3686
3687
3688
3689
3690 ue = p->se.avg.util_est;
3691 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3692 return;
3693
3694
3695
3696
3697
3698 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3699 last_ewma_diff = ue.enqueued - ue.ewma;
3700 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3701 return;
3702
3703
3704
3705
3706
3707 cpu = cpu_of(rq_of(cfs_rq));
3708 if (task_util(p) > capacity_orig_of(cpu))
3709 return;
3710
 /*
  * Update Task's estimated utilization
  *
  * When *p completes an activation we can consolidate another sample
  * about the task size. This is done by storing the current PELT value
  * as ue.enqueued and by using this value to update the Exponential
  * Weighted Moving Average (EWMA):
  *
  *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
  *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
  *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
  *          = w * (      last_ewma_diff            ) +     ewma(t-1)
  *          = w * (last_ewma_diff  +  ewma(t-1) / w)
  *
  * Where 'w' is the weight of new samples, which is configured to be
  * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
  */
3728 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3729 ue.ewma += last_ewma_diff;
3730 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3731 WRITE_ONCE(p->se.avg.util_est, ue);
3732}
3733
3734static inline int task_fits_capacity(struct task_struct *p, long capacity)
3735{
3736 return capacity * 1024 > task_util_est(p) * capacity_margin;
3737}
3738
3739static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
3740{
3741 if (!static_branch_unlikely(&sched_asym_cpucapacity))
3742 return;
3743
3744 if (!p) {
3745 rq->misfit_task_load = 0;
3746 return;
3747 }
3748
3749 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
3750 rq->misfit_task_load = 0;
3751 return;
3752 }
3753
3754 rq->misfit_task_load = task_h_load(p);
3755}
3756
3757#else
3758
3759#define UPDATE_TG 0x0
3760#define SKIP_AGE_LOAD 0x0
3761#define DO_ATTACH 0x0
3762
3763static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3764{
3765 cfs_rq_util_change(cfs_rq, 0);
3766}
3767
3768static inline void remove_entity_load_avg(struct sched_entity *se) {}
3769
3770static inline void
3771attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3772static inline void
3773detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3774
3775static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3776{
3777 return 0;
3778}
3779
3780static inline void
3781util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3782
3783static inline void
3784util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3785 bool task_sleep) {}
3786static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
3787
3788#endif
3789
3790static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3791{
3792#ifdef CONFIG_SCHED_DEBUG
3793 s64 d = se->vruntime - cfs_rq->min_vruntime;
3794
3795 if (d < 0)
3796 d = -d;
3797
3798 if (d > 3*sysctl_sched_latency)
3799 schedstat_inc(cfs_rq->nr_spread_over);
3800#endif
3801}
3802
3803static void
3804place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3805{
3806 u64 vruntime = cfs_rq->min_vruntime;
3807
 /*
  * The 'current' period is already promised to the current tasks,
  * however the extra weight of the new task will slow them down a
  * little, place the new task so that it fits in the slot that
  * stays open at the end.
  */
3814 if (initial && sched_feat(START_DEBIT))
3815 vruntime += sched_vslice(cfs_rq, se);
3816
3817
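 /* sleeps up to a single latency don't count. */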
3818 if (!initial) {
3819 unsigned long thresh = sysctl_sched_latency;
3820
 /*
  * Halve their sleep time's effect, to allow
  * for a gentler effect of sleepers:
  */
3825 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3826 thresh >>= 1;
3827
3828 vruntime -= thresh;
3829 }
3830
3831
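 /* ensure we never gain time by being placed backwards. */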
3832 se->vruntime = max_vruntime(se->vruntime, vruntime);
3833}
3834
3835static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3836
3837static inline void check_schedstat_required(void)
3838{
3839#ifdef CONFIG_SCHEDSTATS
3840 if (schedstat_enabled())
3841 return;
3842
3843
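 /* Force schedstat enabled if a dependent tracepoint is active */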
3844 if (trace_sched_stat_wait_enabled() ||
3845 trace_sched_stat_sleep_enabled() ||
3846 trace_sched_stat_iowait_enabled() ||
3847 trace_sched_stat_blocked_enabled() ||
3848 trace_sched_stat_runtime_enabled()) {
3849 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3850 "stat_blocked and stat_runtime require the "
3851 "kernel parameter schedstats=enable or "
3852 "kernel.sched_schedstats=1\n");
3853 }
3854#endif
3855}
3856
/*
 * MIGRATION
 *
 *	dequeue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way the vruntime transition between RQs is done when both
 * min_vruntime are up-to-date.
 *
 * WAKEUP (remote)
 *
 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way we don't have the most up-to-date min_vruntime on the originating
 * CPU and an up-to-date min_vruntime on the destination CPU.
 */
3888static void
3889enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3890{
3891 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3892 bool curr = cfs_rq->curr == se;
3893
3894
3895
3896
3897
3898 if (renorm && curr)
3899 se->vruntime += cfs_rq->min_vruntime;
3900
3901 update_curr(cfs_rq);
3902
3903
3904
3905
3906
3907
3908
3909 if (renorm && !curr)
3910 se->vruntime += cfs_rq->min_vruntime;
3911
 /*
  * When enqueuing a sched_entity, we must:
  *   - Update loads to have both entity and cfs_rq synced with now.
  *   - Add its load to cfs_rq->runnable_avg.
  *   - For group_entity, update its weight to reflect the new share of
  *     its group cfs_rq.
  *   - Add its new weight to cfs_rq->load.weight.
  */
3920 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
3921 update_cfs_group(se);
3922 enqueue_runnable_load_avg(cfs_rq, se);
3923 account_entity_enqueue(cfs_rq, se);
3924
3925 if (flags & ENQUEUE_WAKEUP)
3926 place_entity(cfs_rq, se, 0);
3927
3928 check_schedstat_required();
3929 update_stats_enqueue(cfs_rq, se, flags);
3930 check_spread(cfs_rq, se);
3931 if (!curr)
3932 __enqueue_entity(cfs_rq, se);
3933 se->on_rq = 1;
3934
3935 if (cfs_rq->nr_running == 1) {
3936 list_add_leaf_cfs_rq(cfs_rq);
3937 check_enqueue_throttle(cfs_rq);
3938 }
3939}
3940
3941static void __clear_buddies_last(struct sched_entity *se)
3942{
3943 for_each_sched_entity(se) {
3944 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3945 if (cfs_rq->last != se)
3946 break;
3947
3948 cfs_rq->last = NULL;
3949 }
3950}
3951
3952static void __clear_buddies_next(struct sched_entity *se)
3953{
3954 for_each_sched_entity(se) {
3955 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3956 if (cfs_rq->next != se)
3957 break;
3958
3959 cfs_rq->next = NULL;
3960 }
3961}
3962
3963static void __clear_buddies_skip(struct sched_entity *se)
3964{
3965 for_each_sched_entity(se) {
3966 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3967 if (cfs_rq->skip != se)
3968 break;
3969
3970 cfs_rq->skip = NULL;
3971 }
3972}
3973
3974static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3975{
3976 if (cfs_rq->last == se)
3977 __clear_buddies_last(se);
3978
3979 if (cfs_rq->next == se)
3980 __clear_buddies_next(se);
3981
3982 if (cfs_rq->skip == se)
3983 __clear_buddies_skip(se);
3984}
3985
3986static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3987
3988static void
3989dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3990{
3991
3992
3993
3994 update_curr(cfs_rq);
3995
 /*
  * When dequeuing a sched_entity, we must:
  *   - Update loads to have both entity and cfs_rq synced with now.
  *   - Subtract its load from the cfs_rq->runnable_avg.
  *   - Subtract its previous weight from cfs_rq->load.weight.
  *   - For group entity, update its weight to reflect the new share
  *     of its group cfs_rq.
  */
4004 update_load_avg(cfs_rq, se, UPDATE_TG);
4005 dequeue_runnable_load_avg(cfs_rq, se);
4006
4007 update_stats_dequeue(cfs_rq, se, flags);
4008
4009 clear_buddies(cfs_rq, se);
4010
4011 if (se != cfs_rq->curr)
4012 __dequeue_entity(cfs_rq, se);
4013 se->on_rq = 0;
4014 account_entity_dequeue(cfs_rq, se);
4015
 /*
  * Normalize after update_curr(); which will also have moved
  * min_vruntime if @se is the one holding it back. But before doing
  * update_min_vruntime() again, which will discount @se's position and
  * can move min_vruntime forward still more.
  */
4022 if (!(flags & DEQUEUE_SLEEP))
4023 se->vruntime -= cfs_rq->min_vruntime;
4024
4025
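 /* return excess runtime on last dequeue */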
4026 return_cfs_rq_runtime(cfs_rq);
4027
4028 update_cfs_group(se);
4029
 /*
  * Now advance min_vruntime if @se was the entity holding it back,
  * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
  * put back on, and if we advance min_vruntime, we'll be placed back
  * further than we started -- ie. we'll be penalized.
  */
4036 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4037 update_min_vruntime(cfs_rq);
4038}
4039

/*
 * Preempt the current task with a newly woken task if needed:
 */
4043static void
4044check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4045{
4046 unsigned long ideal_runtime, delta_exec;
4047 struct sched_entity *se;
4048 s64 delta;
4049
4050 ideal_runtime = sched_slice(cfs_rq, curr);
4051 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4052 if (delta_exec > ideal_runtime) {
4053 resched_curr(rq_of(cfs_rq));
4054
4055
4056
4057
4058 clear_buddies(cfs_rq, curr);
4059 return;
4060 }
4061
4062
4063
4064
4065
4066
4067 if (delta_exec < sysctl_sched_min_granularity)
4068 return;
4069
4070 se = __pick_first_entity(cfs_rq);
4071 delta = curr->vruntime - se->vruntime;
4072
4073 if (delta < 0)
4074 return;
4075
4076 if (delta > ideal_runtime)
4077 resched_curr(rq_of(cfs_rq));
4078}
4079
4080static void
4081set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4082{
4083
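 /* 'current' is not kept within the tree. */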
4084 if (se->on_rq) {
4085
4086
4087
4088
4089
4090 update_stats_wait_end(cfs_rq, se);
4091 __dequeue_entity(cfs_rq, se);
4092 update_load_avg(cfs_rq, se, UPDATE_TG);
4093 }
4094
4095 update_stats_curr_start(cfs_rq, se);
4096 cfs_rq->curr = se;
4097
4098
4099
4100
4101
4102
4103 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4104 schedstat_set(se->statistics.slice_max,
4105 max((u64)schedstat_val(se->statistics.slice_max),
4106 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4107 }
4108
4109 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4110}
4111
4112static int
4113wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4114
/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
4122static struct sched_entity *
4123pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4124{
4125 struct sched_entity *left = __pick_first_entity(cfs_rq);
4126 struct sched_entity *se;
4127
4128
4129
4130
4131
4132 if (!left || (curr && entity_before(curr, left)))
4133 left = curr;
4134
4135 se = left;
4136
4137
4138
4139
4140
4141 if (cfs_rq->skip == se) {
4142 struct sched_entity *second;
4143
4144 if (se == curr) {
4145 second = __pick_first_entity(cfs_rq);
4146 } else {
4147 second = __pick_next_entity(se);
4148 if (!second || (curr && entity_before(curr, second)))
4149 second = curr;
4150 }
4151
4152 if (second && wakeup_preempt_entity(second, left) < 1)
4153 se = second;
4154 }
4155
4156
4157
4158
4159 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4160 se = cfs_rq->last;
4161
4162
4163
4164
4165 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4166 se = cfs_rq->next;
4167
4168 clear_buddies(cfs_rq, se);
4169
4170 return se;
4171}
4172
4173static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4174
4175static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4176{
4177
4178
4179
4180
4181 if (prev->on_rq)
4182 update_curr(cfs_rq);
4183
4184
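 /* throttle cfs_rqs exceeding runtime */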
4185 check_cfs_rq_runtime(cfs_rq);
4186
4187 check_spread(cfs_rq, prev);
4188
4189 if (prev->on_rq) {
4190 update_stats_wait_start(cfs_rq, prev);
4191
4192 __enqueue_entity(cfs_rq, prev);
4193
4194 update_load_avg(cfs_rq, prev, 0);
4195 }
4196 cfs_rq->curr = NULL;
4197}
4198
4199static void
4200entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4201{
4202
4203
4204
4205 update_curr(cfs_rq);
4206
4207
4208
4209
4210 update_load_avg(cfs_rq, curr, UPDATE_TG);
4211 update_cfs_group(curr);
4212
4213#ifdef CONFIG_SCHED_HRTICK
4214
4215
4216
4217
4218 if (queued) {
4219 resched_curr(rq_of(cfs_rq));
4220 return;
4221 }
4222
4223
4224
4225 if (!sched_feat(DOUBLE_TICK) &&
4226 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4227 return;
4228#endif
4229
4230 if (cfs_rq->nr_running > 1)
4231 check_preempt_tick(cfs_rq, curr);
4232}
4233

/**************************************************
 * CFS bandwidth control machinery
 */

4239#ifdef CONFIG_CFS_BANDWIDTH
4240
4241#ifdef CONFIG_JUMP_LABEL
4242static struct static_key __cfs_bandwidth_used;
4243
4244static inline bool cfs_bandwidth_used(void)
4245{
4246 return static_key_false(&__cfs_bandwidth_used);
4247}
4248
4249void cfs_bandwidth_usage_inc(void)
4250{
4251 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4252}
4253
4254void cfs_bandwidth_usage_dec(void)
4255{
4256 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4257}
4258#else
4259static bool cfs_bandwidth_used(void)
4260{
4261 return true;
4262}
4263
4264void cfs_bandwidth_usage_inc(void) {}
4265void cfs_bandwidth_usage_dec(void) {}
4266#endif
4267
/*
 * default period for cfs group bandwidth.
 * default: 0.1s, units: nanoseconds
 */
4272static inline u64 default_cfs_period(void)
4273{
4274 return 100000000ULL;
4275}
4276
4277static inline u64 sched_cfs_bandwidth_slice(void)
4278{
4279 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4280}
4281
/*
 * Replenish runtime according to assigned quota and update expiration time.
 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
 * additional synchronization around rq->lock.
 *
 * requires cfs_b->lock
 */
4289void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4290{
4291 u64 now;
4292
4293 if (cfs_b->quota == RUNTIME_INF)
4294 return;
4295
4296 now = sched_clock_cpu(smp_processor_id());
4297 cfs_b->runtime = cfs_b->quota;
4298 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4299 cfs_b->expires_seq++;
4300}
4301
4302static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4303{
4304 return &tg->cfs_bandwidth;
4305}
4306
4307
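/* rq->task_clock normalized against any time this cfs_rq has spent throttled */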
4308static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4309{
4310 if (unlikely(cfs_rq->throttle_count))
4311 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4312
4313 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4314}
4315
4316
4317static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4318{
4319 struct task_group *tg = cfs_rq->tg;
4320 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4321 u64 amount = 0, min_amount, expires;
4322 int expires_seq;
4323
4324
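 /* note: this is a positive sum as runtime_remaining <= 0 */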
4325 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4326
4327 raw_spin_lock(&cfs_b->lock);
4328 if (cfs_b->quota == RUNTIME_INF)
4329 amount = min_amount;
4330 else {
4331 start_cfs_bandwidth(cfs_b);
4332
4333 if (cfs_b->runtime > 0) {
4334 amount = min(cfs_b->runtime, min_amount);
4335 cfs_b->runtime -= amount;
4336 cfs_b->idle = 0;
4337 }
4338 }
4339 expires_seq = cfs_b->expires_seq;
4340 expires = cfs_b->runtime_expires;
4341 raw_spin_unlock(&cfs_b->lock);
4342
4343 cfs_rq->runtime_remaining += amount;
4344
4345
4346
4347
4348
4349 if (cfs_rq->expires_seq != expires_seq) {
4350 cfs_rq->expires_seq = expires_seq;
4351 cfs_rq->runtime_expires = expires;
4352 }
4353
4354 return cfs_rq->runtime_remaining > 0;
4355}
4356
/*
 * Note: This depends on the synchronization provided by sched_clock and the
 * fact that rq->clock snapshots this value.
 */
4361static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4362{
4363 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4364
4365
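 /* if the local deadline is ahead of our clock, nothing to do */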
4366 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4367 return;
4368
4369 if (cfs_rq->runtime_remaining < 0)
4370 return;
4371
 /*
  * If the local deadline has passed we have to consider the
  * possibility that our sched_clock is 'fast' and the global deadline
  * has not truly expired.
  *
  * Fortunately we can determine whether this is the case by checking
  * whether the global deadline (cfs_b->expires_seq) has advanced.
  */
4380 if (cfs_rq->expires_seq == cfs_b->expires_seq) {
4381
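 /* extend local deadline, drift is bounded above by 2 ticks */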
4382 cfs_rq->runtime_expires += TICK_NSEC;
4383 } else {
4384
4385 cfs_rq->runtime_remaining = 0;
4386 }
4387}
4388
4389static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4390{
4391
4392 cfs_rq->runtime_remaining -= delta_exec;
4393 expire_cfs_rq_runtime(cfs_rq);
4394
4395 if (likely(cfs_rq->runtime_remaining > 0))
4396 return;
4397
4398
4399
4400
4401
4402 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4403 resched_curr(rq_of(cfs_rq));
4404}
4405
4406static __always_inline
4407void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4408{
4409 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4410 return;
4411
4412 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4413}
4414
4415static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4416{
4417 return cfs_bandwidth_used() && cfs_rq->throttled;
4418}
4419
4420
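/* check whether cfs_rq, or any parent, is throttled */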
4421static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4422{
4423 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4424}
4425
/*
 * Ensure that neither of the group entities corresponding to src_cpu or
 * dest_cpu are members of a throttled hierarchy when performing group
 * load-balance operations.
 */
4431static inline int throttled_lb_pair(struct task_group *tg,
4432 int src_cpu, int dest_cpu)
4433{
4434 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4435
4436 src_cfs_rq = tg->cfs_rq[src_cpu];
4437 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4438
4439 return throttled_hierarchy(src_cfs_rq) ||
4440 throttled_hierarchy(dest_cfs_rq);
4441}
4442
4443static int tg_unthrottle_up(struct task_group *tg, void *data)
4444{
4445 struct rq *rq = data;
4446 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4447
4448 cfs_rq->throttle_count--;
4449 if (!cfs_rq->throttle_count) {
4450
4451 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4452 cfs_rq->throttled_clock_task;
4453
4454
4455 if (cfs_rq->nr_running >= 1)
4456 list_add_leaf_cfs_rq(cfs_rq);
4457 }
4458
4459 return 0;
4460}
4461
4462static int tg_throttle_down(struct task_group *tg, void *data)
4463{
4464 struct rq *rq = data;
4465 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4466
4467
4468 if (!cfs_rq->throttle_count) {
4469 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4470 list_del_leaf_cfs_rq(cfs_rq);
4471 }
4472 cfs_rq->throttle_count++;
4473
4474 return 0;
4475}
4476
4477static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4478{
4479 struct rq *rq = rq_of(cfs_rq);
4480 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4481 struct sched_entity *se;
4482 long task_delta, dequeue = 1;
4483 bool empty;
4484
4485 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4486
4487
4488 rcu_read_lock();
4489 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4490 rcu_read_unlock();
4491
4492 task_delta = cfs_rq->h_nr_running;
4493 for_each_sched_entity(se) {
4494 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4495
4496 if (!se->on_rq)
4497 break;
4498
4499 if (dequeue)
4500 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4501 qcfs_rq->h_nr_running -= task_delta;
4502
4503 if (qcfs_rq->load.weight)
4504 dequeue = 0;
4505 }
4506
4507 if (!se)
4508 sub_nr_running(rq, task_delta);
4509
4510 cfs_rq->throttled = 1;
4511 cfs_rq->throttled_clock = rq_clock(rq);
4512 raw_spin_lock(&cfs_b->lock);
4513 empty = list_empty(&cfs_b->throttled_cfs_rq);
4514
4515
4516
4517
4518
4519
4520 if (cfs_b->distribute_running)
4521 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4522 else
4523 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4524
4525
4526
4527
4528
4529 if (empty)
4530 start_cfs_bandwidth(cfs_b);
4531
4532 raw_spin_unlock(&cfs_b->lock);
4533}
4534
4535void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4536{
4537 struct rq *rq = rq_of(cfs_rq);
4538 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4539 struct sched_entity *se;
4540 int enqueue = 1;
4541 long task_delta;
4542
4543 se = cfs_rq->tg->se[cpu_of(rq)];
4544
4545 cfs_rq->throttled = 0;
4546
4547 update_rq_clock(rq);
4548
4549 raw_spin_lock(&cfs_b->lock);
4550 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4551 list_del_rcu(&cfs_rq->throttled_list);
4552 raw_spin_unlock(&cfs_b->lock);
4553
4554
4555 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4556
4557 if (!cfs_rq->load.weight)
4558 return;
4559
4560 task_delta = cfs_rq->h_nr_running;
4561 for_each_sched_entity(se) {
4562 if (se->on_rq)
4563 enqueue = 0;
4564
4565 cfs_rq = cfs_rq_of(se);
4566 if (enqueue)
4567 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4568 cfs_rq->h_nr_running += task_delta;
4569
4570 if (cfs_rq_throttled(cfs_rq))
4571 break;
4572 }
4573
4574 assert_list_leaf_cfs_rq(rq);
4575
4576 if (!se)
4577 add_nr_running(rq, task_delta);
4578
4579
4580 if (rq->curr == rq->idle && rq->cfs.nr_running)
4581 resched_curr(rq);
4582}
4583
4584static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4585 u64 remaining, u64 expires)
4586{
4587 struct cfs_rq *cfs_rq;
4588 u64 runtime;
4589 u64 starting_runtime = remaining;
4590
4591 rcu_read_lock();
4592 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4593 throttled_list) {
4594 struct rq *rq = rq_of(cfs_rq);
4595 struct rq_flags rf;
4596
4597 rq_lock_irqsave(rq, &rf);
4598 if (!cfs_rq_throttled(cfs_rq))
4599 goto next;
4600
4601 runtime = -cfs_rq->runtime_remaining + 1;
4602 if (runtime > remaining)
4603 runtime = remaining;
4604 remaining -= runtime;
4605
4606 cfs_rq->runtime_remaining += runtime;
4607 cfs_rq->runtime_expires = expires;
4608
4609
4610 if (cfs_rq->runtime_remaining > 0)
4611 unthrottle_cfs_rq(cfs_rq);
4612
4613next:
4614 rq_unlock_irqrestore(rq, &rf);
4615
4616 if (!remaining)
4617 break;
4618 }
4619 rcu_read_unlock();
4620
4621 return starting_runtime - remaining;
4622}
4623
/*
 * Responsible for refilling a task_group's bandwidth and unthrottling its
 * cfs_rqs as appropriate. If there has been no activity within the last
 * period the timer is deactivated.
 */
4630static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4631{
4632 u64 runtime, runtime_expires;
4633 int throttled;
4634
4635
4636 if (cfs_b->quota == RUNTIME_INF)
4637 goto out_deactivate;
4638
4639 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4640 cfs_b->nr_periods += overrun;
4641
4642
4643
4644
4645
4646 if (cfs_b->idle && !throttled)
4647 goto out_deactivate;
4648
4649 __refill_cfs_bandwidth_runtime(cfs_b);
4650
4651 if (!throttled) {
4652
4653 cfs_b->idle = 1;
4654 return 0;
4655 }
4656
4657
4658 cfs_b->nr_throttled += overrun;
4659
4660 runtime_expires = cfs_b->runtime_expires;
4661
4662
4663
4664
4665
4666
4667
4668
4669 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4670 runtime = cfs_b->runtime;
4671 cfs_b->distribute_running = 1;
4672 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4673
4674 runtime = distribute_cfs_runtime(cfs_b, runtime,
4675 runtime_expires);
4676 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4677
4678 cfs_b->distribute_running = 0;
4679 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4680
4681 lsub_positive(&cfs_b->runtime, runtime);
4682 }
4683
4684
4685
4686
4687
4688
4689
4690 cfs_b->idle = 0;
4691
4692 return 0;
4693
4694out_deactivate:
4695 return 1;
4696}
4697
4698
4699static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4700
4701static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4702
4703static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4704
/*
 * Are we near the end of the current quota period?
 *
 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
 * hrtimer base being cleared by hrtimer_start. In the case of
 * migrate_hrtimers, base is never cleared, so we are fine.
 */
4712static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4713{
4714 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4715 u64 remaining;
4716
4717
4718 if (hrtimer_callback_running(refresh_timer))
4719 return 1;
4720
4721
4722 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4723 if (remaining < min_expire)
4724 return 1;
4725
4726 return 0;
4727}
4728
4729static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4730{
4731 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4732
4733
4734 if (runtime_refresh_within(cfs_b, min_left))
4735 return;
4736
4737 hrtimer_start(&cfs_b->slack_timer,
4738 ns_to_ktime(cfs_bandwidth_slack_period),
4739 HRTIMER_MODE_REL);
4740}
4741
4742
4743static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4744{
4745 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4746 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4747
4748 if (slack_runtime <= 0)
4749 return;
4750
4751 raw_spin_lock(&cfs_b->lock);
4752 if (cfs_b->quota != RUNTIME_INF &&
4753 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4754 cfs_b->runtime += slack_runtime;
4755
4756
4757 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4758 !list_empty(&cfs_b->throttled_cfs_rq))
4759 start_cfs_slack_bandwidth(cfs_b);
4760 }
4761 raw_spin_unlock(&cfs_b->lock);
4762
4763
4764 cfs_rq->runtime_remaining -= slack_runtime;
4765}
4766
4767static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4768{
4769 if (!cfs_bandwidth_used())
4770 return;
4771
4772 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4773 return;
4774
4775 __return_cfs_rq_runtime(cfs_rq);
4776}
4777
/*
 * This is done with a timer (instead of inline with bandwidth return) since
 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
 */
4782static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4783{
4784 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4785 unsigned long flags;
4786 u64 expires;
4787
4788
4789 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4790 if (cfs_b->distribute_running) {
4791 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4792 return;
4793 }
4794
4795 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4796 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4797 return;
4798 }
4799
4800 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4801 runtime = cfs_b->runtime;
4802
4803 expires = cfs_b->runtime_expires;
4804 if (runtime)
4805 cfs_b->distribute_running = 1;
4806
4807 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4808
4809 if (!runtime)
4810 return;
4811
4812 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4813
4814 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4815 if (expires == cfs_b->runtime_expires)
4816 lsub_positive(&cfs_b->runtime, runtime);
4817 cfs_b->distribute_running = 0;
4818 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4819}
4820
/*
 * When a group wakes up we want to make sure that its quota is not already
 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime as update_curr() throttling can not trigger until it's on-rq.
 */
4826static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4827{
4828 if (!cfs_bandwidth_used())
4829 return;
4830
4831
4832 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4833 return;
4834
4835
4836 if (cfs_rq_throttled(cfs_rq))
4837 return;
4838
4839
4840 account_cfs_rq_runtime(cfs_rq, 0);
4841 if (cfs_rq->runtime_remaining <= 0)
4842 throttle_cfs_rq(cfs_rq);
4843}
4844
4845static void sync_throttle(struct task_group *tg, int cpu)
4846{
4847 struct cfs_rq *pcfs_rq, *cfs_rq;
4848
4849 if (!cfs_bandwidth_used())
4850 return;
4851
4852 if (!tg->parent)
4853 return;
4854
4855 cfs_rq = tg->cfs_rq[cpu];
4856 pcfs_rq = tg->parent->cfs_rq[cpu];
4857
4858 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4859 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4860}
4861
4862
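/* conditionally throttle active cfs_rq's from put_prev_entity() */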
4863static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4864{
4865 if (!cfs_bandwidth_used())
4866 return false;
4867
4868 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4869 return false;
4870
4871
4872
4873
4874
4875 if (cfs_rq_throttled(cfs_rq))
4876 return true;
4877
4878 throttle_cfs_rq(cfs_rq);
4879 return true;
4880}
4881
4882static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4883{
4884 struct cfs_bandwidth *cfs_b =
4885 container_of(timer, struct cfs_bandwidth, slack_timer);
4886
4887 do_sched_cfs_slack_timer(cfs_b);
4888
4889 return HRTIMER_NORESTART;
4890}
4891
4892extern const u64 max_cfs_quota_period;
4893
4894static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4895{
4896 struct cfs_bandwidth *cfs_b =
4897 container_of(timer, struct cfs_bandwidth, period_timer);
4898 unsigned long flags;
4899 int overrun;
4900 int idle = 0;
4901 int count = 0;
4902
4903 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4904 for (;;) {
4905 overrun = hrtimer_forward_now(timer, cfs_b->period);
4906 if (!overrun)
4907 break;
4908
4909 if (++count > 3) {
4910 u64 new, old = ktime_to_ns(cfs_b->period);
4911
4912 new = (old * 147) / 128;
4913 new = min(new, max_cfs_quota_period);
4914
4915 cfs_b->period = ns_to_ktime(new);
4916
4917
4918 cfs_b->quota *= new;
4919 cfs_b->quota = div64_u64(cfs_b->quota, old);
4920
4921 pr_warn_ratelimited(
4922 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
4923 smp_processor_id(),
4924 div_u64(new, NSEC_PER_USEC),
4925 div_u64(cfs_b->quota, NSEC_PER_USEC));
4926
4927
4928 count = 0;
4929 }
4930
4931 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
4932 }
4933 if (idle)
4934 cfs_b->period_active = 0;
4935 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4936
4937 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4938}
4939
4940void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4941{
4942 raw_spin_lock_init(&cfs_b->lock);
4943 cfs_b->runtime = 0;
4944 cfs_b->quota = RUNTIME_INF;
4945 cfs_b->period = ns_to_ktime(default_cfs_period());
4946
4947 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4948 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4949 cfs_b->period_timer.function = sched_cfs_period_timer;
4950 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4951 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4952 cfs_b->distribute_running = 0;
4953}
4954
4955static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4956{
4957 cfs_rq->runtime_enabled = 0;
4958 INIT_LIST_HEAD(&cfs_rq->throttled_list);
4959}
4960
4961void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4962{
4963 u64 overrun;
4964
4965 lockdep_assert_held(&cfs_b->lock);
4966
4967 if (cfs_b->period_active)
4968 return;
4969
4970 cfs_b->period_active = 1;
4971 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4972 cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
4973 cfs_b->expires_seq++;
4974 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4975}
4976
4977static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4978{
4979
4980 if (!cfs_b->throttled_cfs_rq.next)
4981 return;
4982
4983 hrtimer_cancel(&cfs_b->period_timer);
4984 hrtimer_cancel(&cfs_b->slack_timer);
4985}
4986
/*
 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
 *
 * The race is harmless, since modifying bandwidth settings of unhooked group
 * bits doesn't do much.
 */

/* cpu online callback */
4995static void __maybe_unused update_runtime_enabled(struct rq *rq)
4996{
4997 struct task_group *tg;
4998
4999 lockdep_assert_held(&rq->lock);
5000
5001 rcu_read_lock();
5002 list_for_each_entry_rcu(tg, &task_groups, list) {
5003 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5004 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5005
5006 raw_spin_lock(&cfs_b->lock);
5007 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5008 raw_spin_unlock(&cfs_b->lock);
5009 }
5010 rcu_read_unlock();
5011}
5012
5013
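/* cpu offline callback */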
5014static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5015{
5016 struct task_group *tg;
5017
5018 lockdep_assert_held(&rq->lock);
5019
5020 rcu_read_lock();
5021 list_for_each_entry_rcu(tg, &task_groups, list) {
5022 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5023
5024 if (!cfs_rq->runtime_enabled)
5025 continue;
5026
5027
5028
5029
5030
5031 cfs_rq->runtime_remaining = 1;
5032
5033
5034
5035
5036 cfs_rq->runtime_enabled = 0;
5037
5038 if (cfs_rq_throttled(cfs_rq))
5039 unthrottle_cfs_rq(cfs_rq);
5040 }
5041 rcu_read_unlock();
5042}
5043
5044#else
5045
5046static inline bool cfs_bandwidth_used(void)
5047{
5048 return false;
5049}
5050
5051static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5052{
5053 return rq_clock_task(rq_of(cfs_rq));
5054}
5055
5056static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5057static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5058static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5059static inline void sync_throttle(struct task_group *tg, int cpu) {}
5060static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5061
5062static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5063{
5064 return 0;
5065}
5066
5067static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5068{
5069 return 0;
5070}
5071
5072static inline int throttled_lb_pair(struct task_group *tg,
5073 int src_cpu, int dest_cpu)
5074{
5075 return 0;
5076}
5077
5078void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5079
5080#ifdef CONFIG_FAIR_GROUP_SCHED
5081static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5082#endif
5083
5084static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5085{
5086 return NULL;
5087}
5088static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5089static inline void update_runtime_enabled(struct rq *rq) {}
5090static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5091
5092#endif

/**************************************************
 * CFS operations on tasks:
 */

5098#ifdef CONFIG_SCHED_HRTICK
5099static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5100{
5101 struct sched_entity *se = &p->se;
5102 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5103
5104 SCHED_WARN_ON(task_rq(p) != rq);
5105
5106 if (rq->cfs.h_nr_running > 1) {
5107 u64 slice = sched_slice(cfs_rq, se);
5108 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5109 s64 delta = slice - ran;
5110
5111 if (delta < 0) {
5112 if (rq->curr == p)
5113 resched_curr(rq);
5114 return;
5115 }
5116 hrtick_start(rq, delta);
5117 }
5118}
5119

/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
5125static void hrtick_update(struct rq *rq)
5126{
5127 struct task_struct *curr = rq->curr;
5128
5129 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5130 return;
5131
5132 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5133 hrtick_start_fair(rq, curr);
5134}
5135#else
5136static inline void
5137hrtick_start_fair(struct rq *rq, struct task_struct *p)
5138{
5139}
5140
5141static inline void hrtick_update(struct rq *rq)
5142{
5143}
5144#endif
5145
5146#ifdef CONFIG_SMP
5147static inline unsigned long cpu_util(int cpu);
5148
5149static inline bool cpu_overutilized(int cpu)
5150{
5151 return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5152}
5153
5154static inline void update_overutilized_status(struct rq *rq)
5155{
5156 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
5157 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5158}
5159#else
5160static inline void update_overutilized_status(struct rq *rq) { }
5161#endif
5162
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
5168static void
5169enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5170{
5171 struct cfs_rq *cfs_rq;
5172 struct sched_entity *se = &p->se;
5173
5174
5175
5176
5177
5178
5179
5180 util_est_enqueue(&rq->cfs, p);
5181
5182
5183
5184
5185
5186
5187 if (p->in_iowait)
5188 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5189
5190 for_each_sched_entity(se) {
5191 if (se->on_rq)
5192 break;
5193 cfs_rq = cfs_rq_of(se);
5194 enqueue_entity(cfs_rq, se, flags);
5195
5196
5197
5198
5199
5200
5201
5202 if (cfs_rq_throttled(cfs_rq))
5203 break;
5204 cfs_rq->h_nr_running++;
5205
5206 flags = ENQUEUE_WAKEUP;
5207 }
5208
5209 for_each_sched_entity(se) {
5210 cfs_rq = cfs_rq_of(se);
5211 cfs_rq->h_nr_running++;
5212
5213 if (cfs_rq_throttled(cfs_rq))
5214 break;
5215
5216 update_load_avg(cfs_rq, se, UPDATE_TG);
5217 update_cfs_group(se);
5218 }
5219
5220 if (!se) {
5221 add_nr_running(rq, 1);

 /*
  * Since new tasks are assigned an initial util_avg equal to
  * half of the spare capacity of their CPU, tiny tasks have the
  * ability to cross the overutilized threshold, which will
  * result in the load balancer ruining all the task placement
  * done by EAS. As a way to mitigate that effect, do not account
  * for the first enqueue operation of new tasks during the
  * overutilized flag detection.
  *
  * A better way of solving this problem would be to wait for
  * the PELT signals of tasks to converge before taking them
  * into account, but that is not straightforward to implement,
  * and the following generally works well enough in practice.
  */
5236 if (flags & ENQUEUE_WAKEUP)
5237 update_overutilized_status(rq);
5238
5239 }
5240
5241 if (cfs_bandwidth_used()) {
5242
5243
5244
5245
5246
5247
5248 for_each_sched_entity(se) {
5249 cfs_rq = cfs_rq_of(se);
5250
5251 if (list_add_leaf_cfs_rq(cfs_rq))
5252 break;
5253 }
5254 }
5255
5256 assert_list_leaf_cfs_rq(rq);
5257
5258 hrtick_update(rq);
5259}
5260
5261static void set_next_buddy(struct sched_entity *se);
5262
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
5268static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5269{
5270 struct cfs_rq *cfs_rq;
5271 struct sched_entity *se = &p->se;
5272 int task_sleep = flags & DEQUEUE_SLEEP;
5273
5274 for_each_sched_entity(se) {
5275 cfs_rq = cfs_rq_of(se);
5276 dequeue_entity(cfs_rq, se, flags);
5277
5278
5279
5280
5281
5282
5283
5284 if (cfs_rq_throttled(cfs_rq))
5285 break;
5286 cfs_rq->h_nr_running--;
5287
5288
5289 if (cfs_rq->load.weight) {
5290
5291 se = parent_entity(se);
 /*
  * Bias pick_next to pick a task from this cfs_rq, as
  * p is sleeping when it is within its sched_slice.
  */
5296 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5297 set_next_buddy(se);
5298 break;
5299 }
5300 flags |= DEQUEUE_SLEEP;
5301 }
5302
5303 for_each_sched_entity(se) {
5304 cfs_rq = cfs_rq_of(se);
5305 cfs_rq->h_nr_running--;
5306
5307 if (cfs_rq_throttled(cfs_rq))
5308 break;
5309
5310 update_load_avg(cfs_rq, se, UPDATE_TG);
5311 update_cfs_group(se);
5312 }
5313
5314 if (!se)
5315 sub_nr_running(rq, 1);
5316
5317 util_est_dequeue(&rq->cfs, p, task_sleep);
5318 hrtick_update(rq);
5319}
5320
5321#ifdef CONFIG_SMP
5322
5323
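/* Working cpumask for: load_balance, load_balance_newidle. */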
5324DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5325DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5326
5327#ifdef CONFIG_NO_HZ_COMMON
/*
 * per rq 'load' array crap; XXX kill this.
 */

/*
 * The exact cpuload calculated at every tick would be:
 *
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
 *
 * If a CPU misses updates for n ticks (as it was idle), and update gets
 * called on the n+1-th tick when the CPU may be busy, then we have:
 *
 *   load_n   = (1 - 1/2^i)^n * load_0
 *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
 *
 * decay_load_missed() below does an efficient calculation of
 *
 *   load' = (1 - 1/2^i)^n * load
 *
 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
 * This allows us to precompute the above in said factors, thereby allowing the
 * reduction of an arbitrary n in O(log_2 n) steps. (See also fixed_power_int())
 *
 * The calculation is approximated on a 128 point scale; degrade_zero_ticks is
 * the number of ticks after which load at any given index is approximated to
 * be zero, and degrade_factor holds the precomputed per-index factors.
 */
5354#define DEGRADE_SHIFT 7
5355
5356static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5357static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5358 { 0, 0, 0, 0, 0, 0, 0, 0 },
5359 { 64, 32, 8, 0, 0, 0, 0, 0 },
5360 { 96, 72, 40, 12, 1, 0, 0, 0 },
5361 { 112, 98, 75, 43, 15, 1, 0, 0 },
5362 { 120, 112, 98, 76, 45, 16, 2, 0 }
5363};
5364
5365
5366
5367
5368
5369
5370static unsigned long
5371decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5372{
5373 int j = 0;
5374
5375 if (!missed_updates)
5376 return load;
5377
5378 if (missed_updates >= degrade_zero_ticks[idx])
5379 return 0;
5380
5381 if (idx == 1)
5382 return load >> missed_updates;
5383
5384 while (missed_updates) {
5385 if (missed_updates % 2)
5386 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5387
5388 missed_updates >>= 1;
5389 j++;
5390 }
5391 return load;
5392}
5393
5394static struct {
5395 cpumask_var_t idle_cpus_mask;
5396 atomic_t nr_cpus;
5397 int has_blocked;
5398 unsigned long next_balance;
5399 unsigned long next_blocked;
5400} nohz ____cacheline_aligned;
5401
5402#endif
5403
/**
 * cpu_load_update - update the rq->cpu_load[] statistics
 * @this_rq: The rq to update statistics for
 * @this_load: The current load
 * @pending_updates: The number of missed updates
 *
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 *
 * This function computes a decaying average:
 *
 *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
 *
 * Because of NOHZ it might not get called on every tick which gives need for
 * the @pending_updates argument.
 *
 *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
 *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
 *             = A * (A * load[i]_n-2 + B) + B
 *             = A * (A * (A * load[i]_n-3 + B) + B) + B
 *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
 *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
 *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
 *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
 *
 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
 * any change in load would have resulted in the tick being turned back on.
 *
 * For regular NOHZ, this reduces to:
 *
 *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
 *
 * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
 * term.
 */
5439static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5440 unsigned long pending_updates)
5441{
5442 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5443 int i, scale;
5444
5445 this_rq->nr_load_updates++;
5446
5447
5448 this_rq->cpu_load[0] = this_load;
5449 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5450 unsigned long old_load, new_load;
5451
5452
5453
5454 old_load = this_rq->cpu_load[i];
5455#ifdef CONFIG_NO_HZ_COMMON
5456 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5457 if (tickless_load) {
5458 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5459
5460
5461
5462
5463
5464 old_load += tickless_load;
5465 }
5466#endif
5467 new_load = this_load;
5468
5469
5470
5471
5472
5473 if (new_load > old_load)
5474 new_load += scale - 1;
5475
5476 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5477 }
5478}
5479
5480
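/* Used instead of source_load when we know the type == 0 */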
5481static unsigned long weighted_cpuload(struct rq *rq)
5482{
5483 return cfs_rq_runnable_load_avg(&rq->cfs);
5484}
5485
5486#ifdef CONFIG_NO_HZ_COMMON
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we need to avoid the delta approach from the regular tick when
 * possible since that would seriously skew the load calculation. This is why
 * we use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely
 * on jiffies deltas for updates happening while in nohz mode (idle ticks, idle
 * loop exit, nohz_idle_balance, nohz full exit...)
 *
 * This means we might still be one tick off for nohz periods.
 */
5500
5501static void cpu_load_update_nohz(struct rq *this_rq,
5502 unsigned long curr_jiffies,
5503 unsigned long load)
5504{
5505 unsigned long pending_updates;
5506
5507 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5508 if (pending_updates) {
5509 this_rq->last_load_update_tick = curr_jiffies;
5510
5511
5512
5513
5514
5515 cpu_load_update(this_rq, load, pending_updates);
5516 }
5517}
5518
5519
5520
5521
5522
5523static void cpu_load_update_idle(struct rq *this_rq)
5524{
5525
5526
5527
5528 if (weighted_cpuload(this_rq))
5529 return;
5530
5531 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5532}
5533

/*
 * Record CPU load on nohz entry so we know the tickless load to account
 * on nohz exit. cpu_load[0] happens then to be updated more frequently
 * than other cpu_load[idx] entries, but that should be fine as cpu_load
 * readers shouldn't rely on synchronized cpu_load[*] updates.
 */
5540void cpu_load_update_nohz_start(void)
5541{
5542 struct rq *this_rq = this_rq();
5543
5544
5545
5546
5547
5548
5549 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5550}
5551
5552
5553
5554
5555void cpu_load_update_nohz_stop(void)
5556{
5557 unsigned long curr_jiffies = READ_ONCE(jiffies);
5558 struct rq *this_rq = this_rq();
5559 unsigned long load;
5560 struct rq_flags rf;
5561
5562 if (curr_jiffies == this_rq->last_load_update_tick)
5563 return;
5564
5565 load = weighted_cpuload(this_rq);
5566 rq_lock(this_rq, &rf);
5567 update_rq_clock(this_rq);
5568 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5569 rq_unlock(this_rq, &rf);
5570}
5571#else
5572static inline void cpu_load_update_nohz(struct rq *this_rq,
5573 unsigned long curr_jiffies,
5574 unsigned long load) { }
5575#endif
5576
5577static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5578{
5579#ifdef CONFIG_NO_HZ_COMMON
5580
5581 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5582#endif
5583 cpu_load_update(this_rq, load, 1);
5584}
5585
/*
 * Called from scheduler_tick().
 */
5589void cpu_load_update_active(struct rq *this_rq)
5590{
5591 unsigned long load = weighted_cpuload(this_rq);
5592
5593 if (tick_nohz_tick_stopped())
5594 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5595 else
5596 cpu_load_update_periodic(this_rq, load);
5597}
5598
/*
 * Return a low guess at the load of a migration-source CPU weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
5606static unsigned long source_load(int cpu, int type)
5607{
5608 struct rq *rq = cpu_rq(cpu);
5609 unsigned long total = weighted_cpuload(rq);
5610
5611 if (type == 0 || !sched_feat(LB_BIAS))
5612 return total;
5613
5614 return min(rq->cpu_load[type-1], total);
5615}
5616
/*
 * Return a high guess at the load of a migration-target CPU weighted
 * according to the scheduling class and "nice" value.
 */
5621static unsigned long target_load(int cpu, int type)
5622{
5623 struct rq *rq = cpu_rq(cpu);
5624 unsigned long total = weighted_cpuload(rq);
5625
5626 if (type == 0 || !sched_feat(LB_BIAS))
5627 return total;
5628
5629 return max(rq->cpu_load[type-1], total);
5630}
5631
5632static unsigned long capacity_of(int cpu)
5633{
5634 return cpu_rq(cpu)->cpu_capacity;
5635}
5636
5637static unsigned long cpu_avg_load_per_task(int cpu)
5638{
5639 struct rq *rq = cpu_rq(cpu);
5640 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5641 unsigned long load_avg = weighted_cpuload(rq);
5642
5643 if (nr_running)
5644 return load_avg / nr_running;
5645
5646 return 0;
5647}
5648
5649static void record_wakee(struct task_struct *p)
5650{
5651
5652
5653
5654
5655 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5656 current->wakee_flips >>= 1;
5657 current->wakee_flip_decay_ts = jiffies;
5658 }
5659
5660 if (current->last_wakee != p) {
5661 current->last_wakee = p;
5662 current->wakee_flips++;
5663 }
5664}
5665
/*
 * Detect M:N waker/wakee relationships via a switching-frequency
 * heuristic: record_wakee() above counts how often a task switches the
 * partner it wakes (wakee_flips, decayed once per second).
 *
 * A waker of many (high flips) waking a wakee that itself has many
 * partners is better spread beyond the waker's LLC: pulling the whole
 * set behind one cache domain would serialize them. The llc_size
 * factor scales the threshold so that larger cache domains tolerate
 * more flips before the wakeup goes "wide".
 */
5683static int wake_wide(struct task_struct *p)
5684{
5685 unsigned int master = current->wakee_flips;
5686 unsigned int slave = p->wakee_flips;
5687 int factor = this_cpu_read(sd_llc_size);
5688
5689 if (master < slave)
5690 swap(master, slave);
5691 if (slave < factor || master < slave * factor)
5692 return 0;
5693 return 1;
5694}
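
/*
 * Illustrative numbers: with an LLC of 4 CPUs (factor = 4), a waker
 * with wakee_flips = 20 and a wakee with wakee_flips = 5 gives
 * slave >= factor (5 >= 4) and master >= slave * factor (20 >= 20),
 * so the wakeup is "wide" and the affine fast path is skipped.
 */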
5695
/*
 * The purpose of wake_affine() is to quickly determine on which CPU we
 * can run soonest. For the purpose of speed we only consider the waking
 * and previous CPU.
 *
 * wake_affine_idle() - only considers 'now', it checks if the waking CPU
 *			is cache-affine and is (or will be) idle.
 *
 * wake_affine_weight() - considers the weight to reflect the average
 *			  scheduling latency of the CPUs. This seems to
 *			  work for the overloaded case.
 */
5708static int
5709wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5710{
/*
 * If this_cpu is idle, it implies the wakeup is from interrupt
 * context. Only allow the move if cache is shared. Otherwise an
 * interrupt-intensive workload could force all tasks onto one
 * node depending on the IO topology or IRQ affinity settings.
 *
 * If the prev_cpu is idle and cache affine then avoid a migration.
 * There is no guarantee that the cache hot data from an interrupt
 * is more important than cache hot data on the prev_cpu, and from
 * a cpufreq perspective it's better to have higher utilisation
 * on one CPU.
 */
5723 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5724 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5725
5726 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5727 return this_cpu;
5728
5729 return nr_cpumask_bits;
5730}
5731
5732static int
5733wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5734 int this_cpu, int prev_cpu, int sync)
5735{
5736 s64 this_eff_load, prev_eff_load;
5737 unsigned long task_load;
5738
5739 this_eff_load = target_load(this_cpu, sd->wake_idx);
5740
5741 if (sync) {
5742 unsigned long current_load = task_h_load(current);
5743
5744 if (current_load > this_eff_load)
5745 return this_cpu;
5746
5747 this_eff_load -= current_load;
5748 }
5749
5750 task_load = task_h_load(p);
5751
5752 this_eff_load += task_load;
5753 if (sched_feat(WA_BIAS))
5754 this_eff_load *= 100;
5755 this_eff_load *= capacity_of(prev_cpu);
5756
5757 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5758 prev_eff_load -= task_load;
5759 if (sched_feat(WA_BIAS))
5760 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5761 prev_eff_load *= capacity_of(this_cpu);
5762
/*
 * If sync, adjust the weight of prev_eff_load such that if
 * prev_eff == this_eff, select_idle_sibling() will consider
 * stacking the wakee on top of the waker if no other CPU is
 * idle.
 */
5769 if (sync)
5770 prev_eff_load += 1;
5771
5772 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5773}
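
/*
 * Illustrative numbers for the cross-multiplication above, assuming
 * WA_BIAS, imbalance_pct = 117 and both CPUs at capacity 1024: with
 * task_h_load(p) = 100, this_eff_load = 400 and prev_eff_load = 300,
 * this side scores (400 + 100) * 100 * 1024 while the prev side
 * scores (300 - 100) * 108 * 1024. The former is larger, so
 * nr_cpumask_bits is returned and wake_affine() falls back to
 * prev_cpu: the waking CPU must win by more than half the domain's
 * imbalance_pct margin before the task is pulled.
 */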
5774
5775static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5776 int this_cpu, int prev_cpu, int sync)
5777{
5778 int target = nr_cpumask_bits;
5779
5780 if (sched_feat(WA_IDLE))
5781 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5782
5783 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5784 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5785
5786 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5787 if (target == nr_cpumask_bits)
5788 return prev_cpu;
5789
5790 schedstat_inc(sd->ttwu_move_affine);
5791 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5792 return target;
5793}
5794
5795static unsigned long cpu_util_without(int cpu, struct task_struct *p);
5796
5797static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
5798{
5799 return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
5800}
5801
/*
 * find_idlest_group() finds and returns the least busy CPU group within
 * the domain.
 *
 * Assumes p is allowed on at least one CPU in sd.
 */
5808static struct sched_group *
5809find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5810 int this_cpu, int sd_flag)
5811{
5812 struct sched_group *idlest = NULL, *group = sd->groups;
5813 struct sched_group *most_spare_sg = NULL;
5814 unsigned long min_runnable_load = ULONG_MAX;
5815 unsigned long this_runnable_load = ULONG_MAX;
5816 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5817 unsigned long most_spare = 0, this_spare = 0;
5818 int load_idx = sd->forkexec_idx;
5819 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5820 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5821 (sd->imbalance_pct-100) / 100;
5822
5823 if (sd_flag & SD_BALANCE_WAKE)
5824 load_idx = sd->wake_idx;
5825
5826 do {
5827 unsigned long load, avg_load, runnable_load;
5828 unsigned long spare_cap, max_spare_cap;
5829 int local_group;
5830 int i;
5831
5832
5833 if (!cpumask_intersects(sched_group_span(group),
5834 &p->cpus_allowed))
5835 continue;
5836
5837 local_group = cpumask_test_cpu(this_cpu,
5838 sched_group_span(group));
5839
/*
 * Tally up the load of all CPUs in the group and find
 * the group containing the CPU with most spare capacity.
 */
5844 avg_load = 0;
5845 runnable_load = 0;
5846 max_spare_cap = 0;
5847
5848 for_each_cpu(i, sched_group_span(group)) {
5849
5850 if (local_group)
5851 load = source_load(i, load_idx);
5852 else
5853 load = target_load(i, load_idx);
5854
5855 runnable_load += load;
5856
5857 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5858
5859 spare_cap = capacity_spare_without(i, p);
5860
5861 if (spare_cap > max_spare_cap)
5862 max_spare_cap = spare_cap;
5863 }
5864
5865
5866 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5867 group->sgc->capacity;
5868 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5869 group->sgc->capacity;
5870
5871 if (local_group) {
5872 this_runnable_load = runnable_load;
5873 this_avg_load = avg_load;
5874 this_spare = max_spare_cap;
5875 } else {
5876 if (min_runnable_load > (runnable_load + imbalance)) {
5877
5878
5879
5880
5881 min_runnable_load = runnable_load;
5882 min_avg_load = avg_load;
5883 idlest = group;
5884 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5885 (100*min_avg_load > imbalance_scale*avg_load)) {
5886
5887
5888
5889
5890 min_avg_load = avg_load;
5891 idlest = group;
5892 }
5893
5894 if (most_spare < max_spare_cap) {
5895 most_spare = max_spare_cap;
5896 most_spare_sg = group;
5897 }
5898 }
5899 } while (group = group->next, group != sd->groups);
5900
/*
 * The cross-over point between using spare capacity or least load
 * is too conservative for high utilization tasks on partially
 * utilized systems if we require spare_capacity > task_util(p),
 * so we allow for some task stuffing by using
 * spare_capacity > task_util(p)/2.
 *
 * Spare capacity can't be used for fork because the utilization has
 * not been set yet: we must first select a rq to compute the initial
 * utilization.
 */
5912 if (sd_flag & SD_BALANCE_FORK)
5913 goto skip_spare;
5914
5915 if (this_spare > task_util(p) / 2 &&
5916 imbalance_scale*this_spare > 100*most_spare)
5917 return NULL;
5918
5919 if (most_spare > task_util(p) / 2)
5920 return most_spare_sg;
5921
5922skip_spare:
5923 if (!idlest)
5924 return NULL;
5925
/*
 * When comparing groups across NUMA domains, it's possible for the
 * local domain to be very lightly loaded relative to the remote
 * domains but "imbalance" skews the comparison making remote CPUs
 * look much more favourable. When considering cross-domain, add
 * imbalance to the runnable load.
 */
5934 if ((sd->flags & SD_NUMA) &&
5935 min_runnable_load + imbalance >= this_runnable_load)
5936 return NULL;
5937
5938 if (min_runnable_load > (this_runnable_load + imbalance))
5939 return NULL;
5940
5941 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5942 (100*this_avg_load < imbalance_scale*min_avg_load))
5943 return NULL;
5944
5945 return idlest;
5946}
5947
/*
 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
 */
5951static int
5952find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5953{
5954 unsigned long load, min_load = ULONG_MAX;
5955 unsigned int min_exit_latency = UINT_MAX;
5956 u64 latest_idle_timestamp = 0;
5957 int least_loaded_cpu = this_cpu;
5958 int shallowest_idle_cpu = -1;
5959 int i;
5960
5961
5962 if (group->group_weight == 1)
5963 return cpumask_first(sched_group_span(group));
5964
5965
5966 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
5967 if (available_idle_cpu(i)) {
5968 struct rq *rq = cpu_rq(i);
5969 struct cpuidle_state *idle = idle_get_state(rq);
5970 if (idle && idle->exit_latency < min_exit_latency) {
5971
5972
5973
5974
5975
5976 min_exit_latency = idle->exit_latency;
5977 latest_idle_timestamp = rq->idle_stamp;
5978 shallowest_idle_cpu = i;
5979 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5980 rq->idle_stamp > latest_idle_timestamp) {
5981
5982
5983
5984
5985
5986 latest_idle_timestamp = rq->idle_stamp;
5987 shallowest_idle_cpu = i;
5988 }
5989 } else if (shallowest_idle_cpu == -1) {
5990 load = weighted_cpuload(cpu_rq(i));
5991 if (load < min_load) {
5992 min_load = load;
5993 least_loaded_cpu = i;
5994 }
5995 }
5996 }
5997
5998 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5999}
6000
6001static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6002 int cpu, int prev_cpu, int sd_flag)
6003{
6004 int new_cpu = cpu;
6005
6006 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6007 return prev_cpu;
6008
6009
6010
6011
6012
6013 if (!(sd_flag & SD_BALANCE_FORK))
6014 sync_entity_load_avg(&p->se);
6015
6016 while (sd) {
6017 struct sched_group *group;
6018 struct sched_domain *tmp;
6019 int weight;
6020
6021 if (!(sd->flags & sd_flag)) {
6022 sd = sd->child;
6023 continue;
6024 }
6025
6026 group = find_idlest_group(sd, p, cpu, sd_flag);
6027 if (!group) {
6028 sd = sd->child;
6029 continue;
6030 }
6031
6032 new_cpu = find_idlest_group_cpu(group, p, cpu);
6033 if (new_cpu == cpu) {
6034
6035 sd = sd->child;
6036 continue;
6037 }
6038
6039
6040 cpu = new_cpu;
6041 weight = sd->span_weight;
6042 sd = NULL;
6043 for_each_domain(cpu, tmp) {
6044 if (weight <= tmp->span_weight)
6045 break;
6046 if (tmp->flags & sd_flag)
6047 sd = tmp;
6048 }
6049 }
6050
6051 return new_cpu;
6052}
6053
6054#ifdef CONFIG_SCHED_SMT
6055DEFINE_STATIC_KEY_FALSE(sched_smt_present);
6056EXPORT_SYMBOL_GPL(sched_smt_present);
6057
6058static inline void set_idle_cores(int cpu, int val)
6059{
6060 struct sched_domain_shared *sds;
6061
6062 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6063 if (sds)
6064 WRITE_ONCE(sds->has_idle_cores, val);
6065}
6066
6067static inline bool test_idle_cores(int cpu, bool def)
6068{
6069 struct sched_domain_shared *sds;
6070
6071 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6072 if (sds)
6073 return READ_ONCE(sds->has_idle_cores);
6074
6075 return def;
6076}
6077
/*
 * Scans the local SMT mask to see if the entire core is idle, and records
 * this information in sd_llc_shared->has_idle_cores.
 *
 * Since SMT siblings share all cache levels, inspecting this limited
 * remote state should be fairly cheap.
 */
6085void __update_idle_core(struct rq *rq)
6086{
6087 int core = cpu_of(rq);
6088 int cpu;
6089
6090 rcu_read_lock();
6091 if (test_idle_cores(core, true))
6092 goto unlock;
6093
6094 for_each_cpu(cpu, cpu_smt_mask(core)) {
6095 if (cpu == core)
6096 continue;
6097
6098 if (!available_idle_cpu(cpu))
6099 goto unlock;
6100 }
6101
6102 set_idle_cores(core, 1);
6103unlock:
6104 rcu_read_unlock();
6105}
6106
/*
 * Scan the entire LLC domain for idle cores; this dynamically switches
 * off if there are no idle cores left in the system; tracked through
 * sd_llc->shared->has_idle_cores and enabled by __update_idle_core()
 * above.
 */
6112static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6113{
6114 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6115 int core, cpu;
6116
6117 if (!static_branch_likely(&sched_smt_present))
6118 return -1;
6119
6120 if (!test_idle_cores(target, false))
6121 return -1;
6122
6123 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6124
6125 for_each_cpu_wrap(core, cpus, target) {
6126 bool idle = true;
6127
6128 for_each_cpu(cpu, cpu_smt_mask(core)) {
6129 __cpumask_clear_cpu(cpu, cpus);
6130 if (!available_idle_cpu(cpu))
6131 idle = false;
6132 }
6133
6134 if (idle)
6135 return core;
6136 }
6137
6138
6139
6140
6141 set_idle_cores(target, 0);
6142
6143 return -1;
6144}
6145
6146
6147
6148
6149static int select_idle_smt(struct task_struct *p, int target)
6150{
6151 int cpu;
6152
6153 if (!static_branch_likely(&sched_smt_present))
6154 return -1;
6155
6156 for_each_cpu(cpu, cpu_smt_mask(target)) {
6157 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6158 continue;
6159 if (available_idle_cpu(cpu))
6160 return cpu;
6161 }
6162
6163 return -1;
6164}
6165
6166#else
6167
6168static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6169{
6170 return -1;
6171}
6172
6173static inline int select_idle_smt(struct task_struct *p, int target)
6174{
6175 return -1;
6176}
6177
6178#endif
6179
/*
 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
 * comparing the average scan cost (tracked in sd->avg_scan_cost) against
 * the average idle time for this rq (as found in rq->avg_idle).
 */
6185static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6186{
6187 struct sched_domain *this_sd;
6188 u64 avg_cost, avg_idle;
6189 u64 time, cost;
6190 s64 delta;
6191 int cpu, nr = INT_MAX;
6192
6193 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6194 if (!this_sd)
6195 return -1;
6196
/*
 * Due to large variance we need a large fuzz factor; hackbench in
 * particular is sensitive here.
 */
6201 avg_idle = this_rq()->avg_idle / 512;
6202 avg_cost = this_sd->avg_scan_cost + 1;
6203
6204 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
6205 return -1;
6206
6207 if (sched_feat(SIS_PROP)) {
6208 u64 span_avg = sd->span_weight * avg_idle;
6209 if (span_avg > 4*avg_cost)
6210 nr = div_u64(span_avg, avg_cost);
6211 else
6212 nr = 4;
6213 }
6214
6215 time = local_clock();
6216
6217 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6218 if (!--nr)
6219 return -1;
6220 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6221 continue;
6222 if (available_idle_cpu(cpu))
6223 break;
6224 }
6225
6226 time = local_clock() - time;
6227 cost = this_sd->avg_scan_cost;
6228 delta = (s64)(time - cost) / 8;
6229 this_sd->avg_scan_cost += delta;
6230
6231 return cpu;
6232}
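
/*
 * Illustrative SIS_PROP sizing: with this_rq->avg_idle = 10000ns the
 * fuzzed average idle is 10000 / 512 = 19; with avg_scan_cost + 1 = 61
 * and sd->span_weight = 16, span_avg = 16 * 19 = 304 > 4 * 61, so the
 * scan visits at most 304 / 61 = 4 candidates before giving up,
 * keeping the scan cost proportional to the expected idle time.
 */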
6233
/*
 * Try and locate an idle core/thread in the LLC cache domain.
 */
6237static int select_idle_sibling(struct task_struct *p, int prev, int target)
6238{
6239 struct sched_domain *sd;
6240 int i, recent_used_cpu;
6241
6242 if (available_idle_cpu(target))
6243 return target;
6244
6245
6246
6247
6248 if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6249 return prev;
6250
6251
6252 recent_used_cpu = p->recent_used_cpu;
6253 if (recent_used_cpu != prev &&
6254 recent_used_cpu != target &&
6255 cpus_share_cache(recent_used_cpu, target) &&
6256 available_idle_cpu(recent_used_cpu) &&
6257 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6258
6259
6260
6261
6262 p->recent_used_cpu = prev;
6263 return recent_used_cpu;
6264 }
6265
6266 sd = rcu_dereference(per_cpu(sd_llc, target));
6267 if (!sd)
6268 return target;
6269
6270 i = select_idle_core(p, sd, target);
6271 if ((unsigned)i < nr_cpumask_bits)
6272 return i;
6273
6274 i = select_idle_cpu(p, sd, target);
6275 if ((unsigned)i < nr_cpumask_bits)
6276 return i;
6277
6278 i = select_idle_smt(p, target);
6279 if ((unsigned)i < nr_cpumask_bits)
6280 return i;
6281
6282 return target;
6283}
6284
/**
 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
 * @cpu: the CPU to get the utilization of
 *
 * The unit of the return value must be the one of capacity so we can
 * compare the utilization with the capacity of the CPU that is available
 * for CFS tasks (i.e. cpu_capacity).
 *
 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus
 * the recent utilization of currently non-runnable tasks on the CPU. It
 * represents the amount of CPU actually used in the range
 * [0..capacity_orig], where capacity_orig is the capacity of the CPU at
 * the highest frequency.
 *
 * util_avg can take a while to converge after wakeups and migrations, so
 * the estimated utilization (util_est, tracking recently enqueued tasks)
 * is folded in as well when the UTIL_EST feature is enabled.
 *
 * The result is clamped to capacity_orig, as the sum of utilization
 * contributions can transiently exceed it.
 */
6323static inline unsigned long cpu_util(int cpu)
6324{
6325 struct cfs_rq *cfs_rq;
6326 unsigned int util;
6327
6328 cfs_rq = &cpu_rq(cpu)->cfs;
6329 util = READ_ONCE(cfs_rq->avg.util_avg);
6330
6331 if (sched_feat(UTIL_EST))
6332 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6333
6334 return min_t(unsigned long, util, capacity_orig_of(cpu));
6335}
6336
/*
 * cpu_util_without: compute cpu utilization without any contributions
 * from *p
 * @cpu: the CPU which utilization is requested
 * @p: the task which utilization should be discounted
 *
 * The utilization of a CPU is defined by the utilization of tasks
 * currently enqueued on that CPU as well as tasks which are currently
 * sleeping after an execution on that CPU.
 *
 * This method returns the utilization of the specified CPU by discounting
 * the utilization of the specified task, whenever the task is currently
 * contributing to the CPU utilization.
 */
6350static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6351{
6352 struct cfs_rq *cfs_rq;
6353 unsigned int util;
6354
6355
6356 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6357 return cpu_util(cpu);
6358
6359 cfs_rq = &cpu_rq(cpu)->cfs;
6360 util = READ_ONCE(cfs_rq->avg.util_avg);
6361
6362
6363 lsub_positive(&util, task_util(p));
6364
/*
 * Covered cases:
 *
 * a) if *p is the only task sleeping on this CPU, then:
 *      cpu_util (== task_util) > util_est (== 0)
 *    and thus we return:
 *      cpu_util_without = (cpu_util - task_util) = 0
 *
 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 *    IDLE, then:
 *      cpu_util >= task_util
 *      cpu_util > util_est (== 0)
 *    and thus we discount *p's blocked utilization to return:
 *      cpu_util_without = (cpu_util - task_util) >= 0
 *
 * c) if other tasks are RUNNABLE on that CPU and
 *      util_est > cpu_util
 *    then we use util_est since it returns a more restrictive
 *    estimation of the spare capacity on that CPU, by just
 *    considering the expected utilization of tasks already
 *    runnable on that CPU.
 */
6391 if (sched_feat(UTIL_EST)) {
6392 unsigned int estimated =
6393 READ_ONCE(cfs_rq->avg.util_est.enqueued);
6394
/*
 * Despite the checks below there is still a small window for a
 * possible race, when an exec's select_task_rq_fair() races with
 * load-balance's detach_task(): p->on_rq is flipped to
 * TASK_ON_RQ_MIGRATING before util_est is dequeued, so a stale
 * combination can be observed. The additional check on
 * "current == p" narrows, but does not fully close, that window.
 */
6412 if (unlikely(task_on_rq_queued(p) || current == p))
6413 lsub_positive(&estimated, _task_util_est(p));
6414
6415 util = max(util, estimated);
6416 }
6417
/*
 * Utilization (estimated) can exceed the CPU capacity, thus let's
 * clamp to the maximum CPU capacity to ensure consistency with
 * other schedulers and migration tools.
 */
6423 return min_t(unsigned long, util, capacity_orig_of(cpu));
6424}
6425
/*
 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
 *
 * In that case WAKE_AFFINE doesn't make sense and we'll let
 * BALANCE_WAKE sort things out.
 */
6433static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6434{
6435 long min_cap, max_cap;
6436
6437 if (!static_branch_unlikely(&sched_asym_cpucapacity))
6438 return 0;
6439
6440 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6441 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6442
6443
6444 if (max_cap - min_cap < max_cap >> 3)
6445 return 0;
6446
6447
6448 sync_entity_load_avg(&p->se);
6449
6450 return !task_fits_capacity(p, min_cap);
6451}
6452
/*
 * Predicts what cpu_util(@cpu) would return if @p was migrated (and
 * enqueued) to @dst_cpu.
 */
6457static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6458{
6459 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6460 unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6461
/*
 * If @p migrates from @cpu to another, remove its contribution. Or,
 * if @p migrates from another CPU to @cpu, add its contribution. In
 * the other cases, @cpu is not impacted by the migration, so the
 * util_avg should already be correct.
 */
6468 if (task_cpu(p) == cpu && dst_cpu != cpu)
6469 sub_positive(&util, task_util(p));
6470 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6471 util += task_util(p);
6472
6473 if (sched_feat(UTIL_EST)) {
6474 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6475
6476
6477
6478
6479
6480
6481
6482 if (dst_cpu == cpu)
6483 util_est += _task_util_est(p);
6484
6485 util = max(util, util_est);
6486 }
6487
6488 return min(util, capacity_orig_of(cpu));
6489}
6490
/*
 * compute_energy(): Estimates the energy that would be consumed if @p was
 * migrated to @dst_cpu. It predicts the utilization landscape of the CPUs
 * after the task migration, and uses the Energy Model to compute what the
 * energy would be if we decided to actually migrate the task.
 */
6498static long
6499compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6500{
6501 long util, max_util, sum_util, energy = 0;
6502 int cpu;
6503
6504 for (; pd; pd = pd->next) {
6505 max_util = sum_util = 0;
/*
 * The capacity state of CPUs of the current rd can be driven by CPUs
 * of another rd if they belong to the same performance domain. So,
 * account for the utilization of these CPUs too by masking pd with
 * cpu_online_mask instead of the rd span.
 *
 * If an entire performance domain is outside of the current rd, it
 * will not appear in its pd list and will not be accounted by
 * compute_energy().
 */
6516 for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
6517 util = cpu_util_next(cpu, p, dst_cpu);
6518 util = schedutil_energy_util(cpu, util);
6519 max_util = max(util, max_util);
6520 sum_util += util;
6521 }
6522
6523 energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6524 }
6525
6526 return energy;
6527}
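
/*
 * Sketch of the estimate above, assuming a two-CPU performance domain:
 * if cpu_util_next() predicts utilizations {300, 200} with @p placed
 * on @dst_cpu, the domain's frequency is dictated by max_util = 300
 * while its busy time scales with sum_util = 500; em_pd_energy()
 * combines both with the domain's power table, and the per-domain
 * results are summed over the pd->next chain.
 */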
6528
/*
 * find_energy_efficient_cpu(): Find the most energy-efficient target CPU
 * for the waking task. It looks for the CPU with maximum spare capacity
 * in each performance domain and uses it as a potential candidate to
 * execute the task. Then, it uses the Energy Model to figure out which
 * of the candidates is the most energy-efficient.
 *
 * The rationale is as follows. In a performance domain, the most
 * energy-efficient CPU candidates are those for which we'll request a
 * low frequency. When several CPUs would get the same frequency request,
 * the energy calculation doesn't differentiate them, and picking the one
 * with the maximum spare capacity also helps wake-up latency.
 *
 * Forkees are not accepted here because they have no useful utilization
 * data yet, so their impact on energy cannot be forecast; they are placed
 * by find_idlest_cpu() instead. The energy-aware path also bails (and
 * falls back to the regular wake-up paths) when the root domain is
 * overutilized, since utilization no longer reflects demand then.
 */
6569static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6570{
6571 unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6572 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6573 int cpu, best_energy_cpu = prev_cpu;
6574 struct perf_domain *head, *pd;
6575 unsigned long cpu_cap, util;
6576 struct sched_domain *sd;
6577
6578 rcu_read_lock();
6579 pd = rcu_dereference(rd->pd);
6580 if (!pd || READ_ONCE(rd->overutilized))
6581 goto fail;
6582 head = pd;
6583
/*
 * Energy-aware wake-up happens on the lowest sched_domain starting
 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
 */
6588 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6589 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6590 sd = sd->parent;
6591 if (!sd)
6592 goto fail;
6593
6594 sync_entity_load_avg(&p->se);
6595 if (!task_util_est(p))
6596 goto unlock;
6597
6598 for (; pd; pd = pd->next) {
6599 unsigned long cur_energy, spare_cap, max_spare_cap = 0;
6600 int max_spare_cap_cpu = -1;
6601
6602 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6603 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6604 continue;
6605
6606
6607 util = cpu_util_next(cpu, p, cpu);
6608 cpu_cap = capacity_of(cpu);
6609 if (cpu_cap * 1024 < util * capacity_margin)
6610 continue;
6611
6612
6613 if (cpu == prev_cpu) {
6614 prev_energy = compute_energy(p, prev_cpu, head);
6615 best_energy = min(best_energy, prev_energy);
6616 continue;
6617 }
6618
6619
6620
6621
6622
6623 spare_cap = cpu_cap - util;
6624 if (spare_cap > max_spare_cap) {
6625 max_spare_cap = spare_cap;
6626 max_spare_cap_cpu = cpu;
6627 }
6628 }
6629
6630
6631 if (max_spare_cap_cpu >= 0) {
6632 cur_energy = compute_energy(p, max_spare_cap_cpu, head);
6633 if (cur_energy < best_energy) {
6634 best_energy = cur_energy;
6635 best_energy_cpu = max_spare_cap_cpu;
6636 }
6637 }
6638 }
6639unlock:
6640 rcu_read_unlock();
6641
/*
 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
 * least 6% of the energy used by prev_cpu.
 */
6646 if (prev_energy == ULONG_MAX)
6647 return best_energy_cpu;
6648
6649 if ((prev_energy - best_energy) > (prev_energy >> 4))
6650 return best_energy_cpu;
6651
6652 return prev_cpu;
6653
6654fail:
6655 rcu_read_unlock();
6656
6657 return -1;
6658}
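
/*
 * Example of the final filter above: with prev_energy = 1600 units
 * and best_energy = 1480, the saving of 120 exceeds
 * prev_energy >> 4 = 100, so the task moves to best_energy_cpu; a
 * saving of only 80 would keep it on prev_cpu, avoiding migrations
 * for noise-level (< ~6%) gains.
 */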
6659
/*
 * select_task_rq_fair: Select the target runqueue for the waking task in
 * domains that have the relevant SD flag set. In practice, this is
 * SD_BALANCE_WAKE, SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or,
 * under certain conditions, an idle sibling CPU if the domain has
 * SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.
 *
 * preempt must be disabled.
 */
6672static int
6673select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6674{
6675 struct sched_domain *tmp, *sd = NULL;
6676 int cpu = smp_processor_id();
6677 int new_cpu = prev_cpu;
6678 int want_affine = 0;
6679 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6680
6681 if (sd_flag & SD_BALANCE_WAKE) {
6682 record_wakee(p);
6683
6684 if (sched_energy_enabled()) {
6685 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6686 if (new_cpu >= 0)
6687 return new_cpu;
6688 new_cpu = prev_cpu;
6689 }
6690
6691 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6692 cpumask_test_cpu(cpu, &p->cpus_allowed);
6693 }
6694
6695 rcu_read_lock();
6696 for_each_domain(cpu, tmp) {
6697 if (!(tmp->flags & SD_LOAD_BALANCE))
6698 break;
6699
/*
 * If both 'cpu' and 'prev_cpu' are part of this domain,
 * cpu is a valid SD_WAKE_AFFINE target.
 */
6704 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6705 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6706 if (cpu != prev_cpu)
6707 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6708
6709 sd = NULL;
6710 break;
6711 }
6712
6713 if (tmp->flags & sd_flag)
6714 sd = tmp;
6715 else if (!want_affine)
6716 break;
6717 }
6718
6719 if (unlikely(sd)) {
6720
6721 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6722 } else if (sd_flag & SD_BALANCE_WAKE) {
6723
6724
6725 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6726
6727 if (want_affine)
6728 current->recent_used_cpu = cpu;
6729 }
6730 rcu_read_unlock();
6731
6732 return new_cpu;
6733}
6734
6735static void detach_entity_cfs_rq(struct sched_entity *se);
6736
/*
 * Called immediately before a task is migrated to a new CPU; task_cpu(p)
 * and cfs_rq_of(p) references at time of call are still valid and
 * identify the previous CPU. The caller guarantees p->pi_lock or
 * rq->lock is held.
 */
6742static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6743{
/*
 * As blocked tasks retain absolute vruntime the migration needs to
 * deal with this by subtracting the old and adding the new
 * min_vruntime -- the latter is done by enqueue_entity() when placing
 * the task on the new runqueue.
 */
6750 if (p->state == TASK_WAKING) {
6751 struct sched_entity *se = &p->se;
6752 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6753 u64 min_vruntime;
6754
6755#ifndef CONFIG_64BIT
6756 u64 min_vruntime_copy;
6757
6758 do {
6759 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6760 smp_rmb();
6761 min_vruntime = cfs_rq->min_vruntime;
6762 } while (min_vruntime != min_vruntime_copy);
6763#else
6764 min_vruntime = cfs_rq->min_vruntime;
6765#endif
6766
6767 se->vruntime -= min_vruntime;
6768 }
6769
6770 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6771
6772
6773
6774
6775 lockdep_assert_held(&task_rq(p)->lock);
6776 detach_entity_cfs_rq(&p->se);
6777
6778 } else {
/*
 * We are supposed to update the task to "current" time, then its
 * PELT signal is up to date and ready to go to the new CPU/cfs_rq.
 * But we have difficulty getting what the current time is on the
 * old rq, so simply throw away the out-of-date time. The wakee
 * task ends up less decayed, but giving the wakee a bit more load
 * sounds not bad.
 */
6787 remove_entity_load_avg(&p->se);
6788 }
6789
6790
6791 p->se.avg.last_update_time = 0;
6792
6793
6794 p->se.exec_start = 0;
6795
6796 update_scan_period(p, new_cpu);
6797}
6798
6799static void task_dead_fair(struct task_struct *p)
6800{
6801 remove_entity_load_avg(&p->se);
6802}
6803#endif
6804
6805static unsigned long wakeup_gran(struct sched_entity *se)
6806{
6807 unsigned long gran = sysctl_sched_wakeup_granularity;
/*
 * Since it's curr that is running now, convert the gran from
 * real-time to virtual-time in the wakee's units.
 *
 * By using 'se' instead of 'curr' we penalize light tasks, so
 * they get preempted easier. That is, if 'se' < 'curr' then
 * the resulting gran will be larger, therefore penalizing the
 * lighter task; if OTOH 'se' > 'curr' then the resulting gran
 * will be smaller, again penalizing the lighter task.
 *
 * This is especially important for buddies when the leftmost
 * task is higher priority than the buddy.
 */
6822 return calc_delta_fair(gran, se);
6823}
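
/*
 * Example, assuming the standard nice-level weight table: with the
 * default sysctl_sched_wakeup_granularity of 1ms, a nice-0 wakee
 * (weight 1024) keeps a virtual granularity of 1ms, a nice -5 wakee
 * (weight 3121) shrinks it to ~0.33ms, and a nice 5 wakee (weight
 * 335) stretches it to ~3ms: heavier wakees preempt more easily.
 */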
6824
/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 */
6839static int
6840wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6841{
6842 s64 gran, vdiff = curr->vruntime - se->vruntime;
6843
6844 if (vdiff <= 0)
6845 return -1;
6846
6847 gran = wakeup_gran(se);
6848 if (vdiff > gran)
6849 return 1;
6850
6851 return 0;
6852}
6853
6854static void set_last_buddy(struct sched_entity *se)
6855{
6856 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6857 return;
6858
6859 for_each_sched_entity(se) {
6860 if (SCHED_WARN_ON(!se->on_rq))
6861 return;
6862 cfs_rq_of(se)->last = se;
6863 }
6864}
6865
6866static void set_next_buddy(struct sched_entity *se)
6867{
6868 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6869 return;
6870
6871 for_each_sched_entity(se) {
6872 if (SCHED_WARN_ON(!se->on_rq))
6873 return;
6874 cfs_rq_of(se)->next = se;
6875 }
6876}
6877
6878static void set_skip_buddy(struct sched_entity *se)
6879{
6880 for_each_sched_entity(se)
6881 cfs_rq_of(se)->skip = se;
6882}
6883
/*
 * Preempt the current task with a newly woken task if needed:
 */
6887static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6888{
6889 struct task_struct *curr = rq->curr;
6890 struct sched_entity *se = &curr->se, *pse = &p->se;
6891 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6892 int scale = cfs_rq->nr_running >= sched_nr_latency;
6893 int next_buddy_marked = 0;
6894
6895 if (unlikely(se == pse))
6896 return;
6897
6898
6899
6900
6901
6902
6903
6904 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6905 return;
6906
6907 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6908 set_next_buddy(pse);
6909 next_buddy_marked = 1;
6910 }
6911
/*
 * We can come here with TIF_NEED_RESCHED already set from a new task
 * wake-up path.
 *
 * Note: this also catches the edge-case of curr being in a throttled
 * group (e.g. via set_curr_task), since update_curr() (in the
 * enqueue of curr) will have resulted in resched being set. This
 * prevents us from potentially nominating it as a false LAST_BUDDY
 * below.
 */
6922 if (test_tsk_need_resched(curr))
6923 return;
6924
6925
6926 if (unlikely(task_has_idle_policy(curr)) &&
6927 likely(!task_has_idle_policy(p)))
6928 goto preempt;
6929
/*
 * Batch and idle tasks do not preempt non-idle tasks (their preemption
 * is driven by the tick):
 */
6934 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6935 return;
6936
6937 find_matching_se(&se, &pse);
6938 update_curr(cfs_rq_of(se));
6939 BUG_ON(!pse);
6940 if (wakeup_preempt_entity(se, pse) == 1) {
6941
6942
6943
6944
6945 if (!next_buddy_marked)
6946 set_next_buddy(pse);
6947 goto preempt;
6948 }
6949
6950 return;
6951
6952preempt:
6953 resched_curr(rq);
6954
/*
 * Only set the backward buddy when the current task is still
 * on the rq. This can happen when a wakeup gets interleaved
 * with schedule on the ->pre_schedule() or idle_balance()
 * point, either of which can drop the rq lock.
 *
 * Also, during early boot the idle thread is in the fair class;
 * for obvious reasons it's a bad idea to schedule back to it.
 */
6963 if (unlikely(!se->on_rq || curr == rq->idle))
6964 return;
6965
6966 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6967 set_last_buddy(se);
6968}
6969
6970static struct task_struct *
6971pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6972{
6973 struct cfs_rq *cfs_rq = &rq->cfs;
6974 struct sched_entity *se;
6975 struct task_struct *p;
6976 int new_tasks;
6977
6978again:
6979 if (!cfs_rq->nr_running)
6980 goto idle;
6981
6982#ifdef CONFIG_FAIR_GROUP_SCHED
6983 if (prev->sched_class != &fair_sched_class)
6984 goto simple;
6985
/*
 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
 * likely that the next task is from the same cgroup as the current.
 *
 * Therefore attempt to avoid putting and setting the entire cgroup
 * hierarchy, only change the part that actually changes.
 */
6994 do {
6995 struct sched_entity *curr = cfs_rq->curr;
6996
/*
 * Since we got here without doing put_prev_entity() we also
 * have to consider cfs_rq->curr. If it is still a runnable
 * entity, update_curr() will update its vruntime, otherwise
 * forget we've ever seen it.
 */
7003 if (curr) {
7004 if (curr->on_rq)
7005 update_curr(cfs_rq);
7006 else
7007 curr = NULL;
7008
7009
7010
7011
7012
7013
7014
7015 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
7016 cfs_rq = &rq->cfs;
7017
7018 if (!cfs_rq->nr_running)
7019 goto idle;
7020
7021 goto simple;
7022 }
7023 }
7024
7025 se = pick_next_entity(cfs_rq, curr);
7026 cfs_rq = group_cfs_rq(se);
7027 } while (cfs_rq);
7028
7029 p = task_of(se);
7030
7031
7032
7033
7034
7035
7036 if (prev != p) {
7037 struct sched_entity *pse = &prev->se;
7038
7039 while (!(cfs_rq = is_same_group(se, pse))) {
7040 int se_depth = se->depth;
7041 int pse_depth = pse->depth;
7042
7043 if (se_depth <= pse_depth) {
7044 put_prev_entity(cfs_rq_of(pse), pse);
7045 pse = parent_entity(pse);
7046 }
7047 if (se_depth >= pse_depth) {
7048 set_next_entity(cfs_rq_of(se), se);
7049 se = parent_entity(se);
7050 }
7051 }
7052
7053 put_prev_entity(cfs_rq, pse);
7054 set_next_entity(cfs_rq, se);
7055 }
7056
7057 goto done;
7058simple:
7059#endif
7060
7061 put_prev_task(rq, prev);
7062
7063 do {
7064 se = pick_next_entity(cfs_rq, NULL);
7065 set_next_entity(cfs_rq, se);
7066 cfs_rq = group_cfs_rq(se);
7067 } while (cfs_rq);
7068
7069 p = task_of(se);
7070
7071done: __maybe_unused;
7072#ifdef CONFIG_SMP
/*
 * Move the next running task to the front of
 * the list, so our cfs_tasks list becomes an MRU
 * one.
 */
7078 list_move(&p->se.group_node, &rq->cfs_tasks);
7079#endif
7080
7081 if (hrtick_enabled(rq))
7082 hrtick_start_fair(rq, p);
7083
7084 update_misfit_status(p, rq);
7085
7086 return p;
7087
7088idle:
7089 update_misfit_status(NULL, rq);
7090 new_tasks = idle_balance(rq, rf);
7091
/*
 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 * possible for any higher priority task to appear. In that case we
 * must re-start the pick_next_entity() loop.
 */
7097 if (new_tasks < 0)
7098 return RETRY_TASK;
7099
7100 if (new_tasks > 0)
7101 goto again;
7102
7103
7104
7105
7106
7107 update_idle_rq_clock_pelt(rq);
7108
7109 return NULL;
7110}
7111
7112
7113
7114
7115static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7116{
7117 struct sched_entity *se = &prev->se;
7118 struct cfs_rq *cfs_rq;
7119
7120 for_each_sched_entity(se) {
7121 cfs_rq = cfs_rq_of(se);
7122 put_prev_entity(cfs_rq, se);
7123 }
7124}
7125
/*
 * sched_yield() is very simple.
 *
 * The magic of dealing with the ->skip buddy is in pick_next_entity().
 */
7131static void yield_task_fair(struct rq *rq)
7132{
7133 struct task_struct *curr = rq->curr;
7134 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7135 struct sched_entity *se = &curr->se;
7136
7137
7138
7139
7140 if (unlikely(rq->nr_running == 1))
7141 return;
7142
7143 clear_buddies(cfs_rq, se);
7144
7145 if (curr->policy != SCHED_BATCH) {
7146 update_rq_clock(rq);
7147
7148
7149
7150 update_curr(cfs_rq);
7151
7152
7153
7154
7155
7156 rq_clock_skip_update(rq);
7157 }
7158
7159 set_skip_buddy(se);
7160}
7161
7162static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7163{
7164 struct sched_entity *se = &p->se;
7165
7166
7167 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7168 return false;
7169
7170
7171 set_next_buddy(se);
7172
7173 yield_task_fair(rq);
7174
7175 return true;
7176}
7177
7178#ifdef CONFIG_SMP
7179
/**************************************************
 * Fair scheduling class load-balancing methods.
 *
 * BASICS
 *
 * The purpose of load-balancing is to achieve the same basic fairness
 * the per-CPU scheduler provides, namely provide a proportional amount
 * of compute time to each task:
 *
 *   W_i,n/P_i == W_j,n/P_j for all i, j                             (1)
 *
 * where W_i,n is the n-th weight average for CPU i and P_i its compute
 * capacity. Since runqueues cannot be balanced globally on every event,
 * balancing works on the sched_domain hierarchy: each domain spans the
 * CPUs sharing a hardware resource (SMT siblings, an LLC, a package, a
 * NUMA node), and we balance within the smaller domains often and
 * cheaply, across the larger domains less frequently.
 *
 * Balancing is attempted from the periodic tick, when a CPU becomes
 * newly idle, and on behalf of tickless CPUs via the NOHZ idle-balance
 * kick. The busiest group within a domain is found from the statistics
 * gathered below, an imbalance is computed, and tasks are detached from
 * the busiest runqueue and attached to the local one.
 */
7297static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7298
7299enum fbq_type { regular, remote, all };
7300
7301enum group_type {
7302 group_other = 0,
7303 group_misfit_task,
7304 group_imbalanced,
7305 group_overloaded,
7306};
7307
7308#define LBF_ALL_PINNED 0x01
7309#define LBF_NEED_BREAK 0x02
7310#define LBF_DST_PINNED 0x04
7311#define LBF_SOME_PINNED 0x08
7312#define LBF_NOHZ_STATS 0x10
7313#define LBF_NOHZ_AGAIN 0x20
7314
7315struct lb_env {
7316 struct sched_domain *sd;
7317
7318 struct rq *src_rq;
7319 int src_cpu;
7320
7321 int dst_cpu;
7322 struct rq *dst_rq;
7323
7324 struct cpumask *dst_grpmask;
7325 int new_dst_cpu;
7326 enum cpu_idle_type idle;
7327 long imbalance;
7328
7329 struct cpumask *cpus;
7330
7331 unsigned int flags;
7332
7333 unsigned int loop;
7334 unsigned int loop_break;
7335 unsigned int loop_max;
7336
7337 enum fbq_type fbq_type;
7338 enum group_type src_grp_type;
7339 struct list_head tasks;
7340};
7341
7342
7343
7344
7345static int task_hot(struct task_struct *p, struct lb_env *env)
7346{
7347 s64 delta;
7348
7349 lockdep_assert_held(&env->src_rq->lock);
7350
7351 if (p->sched_class != &fair_sched_class)
7352 return 0;
7353
7354 if (unlikely(task_has_idle_policy(p)))
7355 return 0;
7356
7357
7358
7359
7360 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7361 (&p->se == cfs_rq_of(&p->se)->next ||
7362 &p->se == cfs_rq_of(&p->se)->last))
7363 return 1;
7364
7365 if (sysctl_sched_migration_cost == -1)
7366 return 1;
7367 if (sysctl_sched_migration_cost == 0)
7368 return 0;
7369
7370 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7371
7372 return delta < (s64)sysctl_sched_migration_cost;
7373}
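
/*
 * Example: with sysctl_sched_migration_cost at its default 500000ns,
 * a task that last started running 200000ns ago (per rq_clock_task())
 * is still considered cache hot, while one that last ran 700000ns ago
 * is not.
 */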
7374
7375#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns 1, if task migration degrades locality.
 * Returns 0, if task migration improves locality, i.e. migration is
 * preferred.
 * Returns -1, if task migration is not affected by locality.
 */
7381static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7382{
7383 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7384 unsigned long src_weight, dst_weight;
7385 int src_nid, dst_nid, dist;
7386
7387 if (!static_branch_likely(&sched_numa_balancing))
7388 return -1;
7389
7390 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7391 return -1;
7392
7393 src_nid = cpu_to_node(env->src_cpu);
7394 dst_nid = cpu_to_node(env->dst_cpu);
7395
7396 if (src_nid == dst_nid)
7397 return -1;
7398
7399
7400 if (src_nid == p->numa_preferred_nid) {
7401 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7402 return 1;
7403 else
7404 return -1;
7405 }
7406
7407
7408 if (dst_nid == p->numa_preferred_nid)
7409 return 0;
7410
7411
7412 if (env->idle == CPU_IDLE)
7413 return -1;
7414
7415 dist = node_distance(src_nid, dst_nid);
7416 if (numa_group) {
7417 src_weight = group_weight(p, src_nid, dist);
7418 dst_weight = group_weight(p, dst_nid, dist);
7419 } else {
7420 src_weight = task_weight(p, src_nid, dist);
7421 dst_weight = task_weight(p, dst_nid, dist);
7422 }
7423
7424 return dst_weight < src_weight;
7425}
7426
7427#else
7428static inline int migrate_degrades_locality(struct task_struct *p,
7429 struct lb_env *env)
7430{
7431 return -1;
7432}
7433#endif
7434
7435
7436
7437
7438static
7439int can_migrate_task(struct task_struct *p, struct lb_env *env)
7440{
7441 int tsk_cache_hot;
7442
7443 lockdep_assert_held(&env->src_rq->lock);
7444
/*
 * We do not migrate tasks that are:
 * 1) throttled_lb_pair, or
 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 * 3) running (obviously), or
 * 4) are cache-hot on their current CPU.
 */
7452 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7453 return 0;
7454
7455 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
7456 int cpu;
7457
7458 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7459
7460 env->flags |= LBF_SOME_PINNED;
7461
/*
 * Remember if this task can be migrated to any other CPU in
 * our sched_group. We may want to revisit it if we couldn't
 * meet load balance goals by pulling other tasks on src_cpu.
 *
 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
 * already computed one in the current iteration.
 */
7470 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7471 return 0;
7472
7473
7474 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7475 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7476 env->flags |= LBF_DST_PINNED;
7477 env->new_dst_cpu = cpu;
7478 break;
7479 }
7480 }
7481
7482 return 0;
7483 }
7484
7485
7486 env->flags &= ~LBF_ALL_PINNED;
7487
7488 if (task_running(env->src_rq, p)) {
7489 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7490 return 0;
7491 }
7492
/*
 * Aggressive migration if:
 * 1) the destination NUMA node is preferred,
 * 2) the task is cache cold, or
 * 3) too many balance attempts have failed.
 */
7499 tsk_cache_hot = migrate_degrades_locality(p, env);
7500 if (tsk_cache_hot == -1)
7501 tsk_cache_hot = task_hot(p, env);
7502
7503 if (tsk_cache_hot <= 0 ||
7504 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7505 if (tsk_cache_hot == 1) {
7506 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7507 schedstat_inc(p->se.statistics.nr_forced_migrations);
7508 }
7509 return 1;
7510 }
7511
7512 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7513 return 0;
7514}
7515
/*
 * detach_task() -- detach the task for the migration specified in env.
 */
7519static void detach_task(struct task_struct *p, struct lb_env *env)
7520{
7521 lockdep_assert_held(&env->src_rq->lock);
7522
7523 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7524 set_task_cpu(p, env->dst_cpu);
7525}
7526
/*
 * detach_one_task() -- tries to dequeue exactly one task from
 * env->src_rq, as part of active balancing operations within "domain".
 *
 * Returns a task if successful and NULL otherwise.
 */
7533static struct task_struct *detach_one_task(struct lb_env *env)
7534{
7535 struct task_struct *p;
7536
7537 lockdep_assert_held(&env->src_rq->lock);
7538
7539 list_for_each_entry_reverse(p,
7540 &env->src_rq->cfs_tasks, se.group_node) {
7541 if (!can_migrate_task(p, env))
7542 continue;
7543
7544 detach_task(p, env);
7545
7546
7547
7548
7549
7550
7551
7552 schedstat_inc(env->sd->lb_gained[env->idle]);
7553 return p;
7554 }
7555 return NULL;
7556}
7557
7558static const unsigned int sched_nr_migrate_break = 32;
7559
/*
 * detach_tasks() -- tries to detach up to imbalance weighted load from
 * busiest_rq, as part of a balancing operation within domain "sd".
 *
 * Returns number of detached tasks if successful and 0 otherwise.
 */
7566static int detach_tasks(struct lb_env *env)
7567{
7568 struct list_head *tasks = &env->src_rq->cfs_tasks;
7569 struct task_struct *p;
7570 unsigned long load;
7571 int detached = 0;
7572
7573 lockdep_assert_held(&env->src_rq->lock);
7574
7575 if (env->imbalance <= 0)
7576 return 0;
7577
7578 while (!list_empty(tasks)) {
7579
7580
7581
7582
7583 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7584 break;
7585
7586 p = list_last_entry(tasks, struct task_struct, se.group_node);
7587
7588 env->loop++;
7589
7590 if (env->loop > env->loop_max)
7591 break;
7592
7593
7594 if (env->loop > env->loop_break) {
7595 env->loop_break += sched_nr_migrate_break;
7596 env->flags |= LBF_NEED_BREAK;
7597 break;
7598 }
7599
7600 if (!can_migrate_task(p, env))
7601 goto next;
7602
7603 load = task_h_load(p);
7604
7605 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7606 goto next;
7607
7608 if ((load / 2) > env->imbalance)
7609 goto next;
7610
7611 detach_task(p, env);
7612 list_add(&p->se.group_node, &env->tasks);
7613
7614 detached++;
7615 env->imbalance -= load;
7616
7617#ifdef CONFIG_PREEMPT
/*
 * With preemption enabled, NEWLY_IDLE balancing stops after detaching
 * a single task so the busiest rq->lock is not held for too long.
 */
7623 if (env->idle == CPU_NEWLY_IDLE)
7624 break;
7625#endif
7626
7627
7628
7629
7630
7631 if (env->imbalance <= 0)
7632 break;
7633
7634 continue;
7635next:
7636 list_move(&p->se.group_node, tasks);
7637 }
7638
7639
7640
7641
7642
7643
7644 schedstat_add(env->sd->lb_gained[env->idle], detached);
7645
7646 return detached;
7647}
7648
7649
7650
7651
7652static void attach_task(struct rq *rq, struct task_struct *p)
7653{
7654 lockdep_assert_held(&rq->lock);
7655
7656 BUG_ON(task_rq(p) != rq);
7657 activate_task(rq, p, ENQUEUE_NOCLOCK);
7658 check_preempt_curr(rq, p, 0);
7659}
7660
7661
7662
7663
7664
7665static void attach_one_task(struct rq *rq, struct task_struct *p)
7666{
7667 struct rq_flags rf;
7668
7669 rq_lock(rq, &rf);
7670 update_rq_clock(rq);
7671 attach_task(rq, p);
7672 rq_unlock(rq, &rf);
7673}
7674
7675
7676
7677
7678
7679static void attach_tasks(struct lb_env *env)
7680{
7681 struct list_head *tasks = &env->tasks;
7682 struct task_struct *p;
7683 struct rq_flags rf;
7684
7685 rq_lock(env->dst_rq, &rf);
7686 update_rq_clock(env->dst_rq);
7687
7688 while (!list_empty(tasks)) {
7689 p = list_first_entry(tasks, struct task_struct, se.group_node);
7690 list_del_init(&p->se.group_node);
7691
7692 attach_task(env->dst_rq, p);
7693 }
7694
7695 rq_unlock(env->dst_rq, &rf);
7696}
7697
7698static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7699{
7700 if (cfs_rq->avg.load_avg)
7701 return true;
7702
7703 if (cfs_rq->avg.util_avg)
7704 return true;
7705
7706 return false;
7707}
7708
7709static inline bool others_have_blocked(struct rq *rq)
7710{
7711 if (READ_ONCE(rq->avg_rt.util_avg))
7712 return true;
7713
7714 if (READ_ONCE(rq->avg_dl.util_avg))
7715 return true;
7716
7717#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
7718 if (READ_ONCE(rq->avg_irq.util_avg))
7719 return true;
7720#endif
7721
7722 return false;
7723}
7724
7725#ifdef CONFIG_FAIR_GROUP_SCHED
7726
7727static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7728{
7729 if (cfs_rq->load.weight)
7730 return false;
7731
7732 if (cfs_rq->avg.load_sum)
7733 return false;
7734
7735 if (cfs_rq->avg.util_sum)
7736 return false;
7737
7738 if (cfs_rq->avg.runnable_load_sum)
7739 return false;
7740
7741 return true;
7742}
7743
7744static void update_blocked_averages(int cpu)
7745{
7746 struct rq *rq = cpu_rq(cpu);
7747 struct cfs_rq *cfs_rq, *pos;
7748 const struct sched_class *curr_class;
7749 struct rq_flags rf;
7750 bool done = true;
7751
7752 rq_lock_irqsave(rq, &rf);
7753 update_rq_clock(rq);
/*
 * Iterates the task_group tree in a bottom-up fashion, see
 * list_add_leaf_cfs_rq() for details.
 */
7759 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7760 struct sched_entity *se;
7761
7762 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
7763 update_tg_load_avg(cfs_rq, 0);
7764
7765
7766 se = cfs_rq->tg->se[cpu];
7767 if (se && !skip_blocked_update(se))
7768 update_load_avg(cfs_rq_of(se), se, 0);
7769
/*
 * There can be a lot of idle CPU cgroups. Don't let fully
 * decayed cfs_rqs linger on the list.
 */
7774 if (cfs_rq_is_decayed(cfs_rq))
7775 list_del_leaf_cfs_rq(cfs_rq);
7776
7777
7778 if (cfs_rq_has_blocked(cfs_rq))
7779 done = false;
7780 }
7781
7782 curr_class = rq->curr->sched_class;
7783 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7784 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7785 update_irq_load_avg(rq, 0);
7786
7787 if (others_have_blocked(rq))
7788 done = false;
7789
7790#ifdef CONFIG_NO_HZ_COMMON
7791 rq->last_blocked_load_update_tick = jiffies;
7792 if (done)
7793 rq->has_blocked_load = 0;
7794#endif
7795 rq_unlock_irqrestore(rq, &rf);
7796}
7797
/*
 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parent's load.
 */
7803static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7804{
7805 struct rq *rq = rq_of(cfs_rq);
7806 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7807 unsigned long now = jiffies;
7808 unsigned long load;
7809
7810 if (cfs_rq->last_h_load_update == now)
7811 return;
7812
7813 WRITE_ONCE(cfs_rq->h_load_next, NULL);
7814 for_each_sched_entity(se) {
7815 cfs_rq = cfs_rq_of(se);
7816 WRITE_ONCE(cfs_rq->h_load_next, se);
7817 if (cfs_rq->last_h_load_update == now)
7818 break;
7819 }
7820
7821 if (!se) {
7822 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7823 cfs_rq->last_h_load_update = now;
7824 }
7825
7826 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7827 load = cfs_rq->h_load;
7828 load = div64_ul(load * se->avg.load_avg,
7829 cfs_rq_load_avg(cfs_rq) + 1);
7830 cfs_rq = group_cfs_rq(se);
7831 cfs_rq->h_load = load;
7832 cfs_rq->last_h_load_update = now;
7833 }
7834}
7835
7836static unsigned long task_h_load(struct task_struct *p)
7837{
7838 struct cfs_rq *cfs_rq = task_cfs_rq(p);
7839
7840 update_cfs_rq_h_load(cfs_rq);
7841 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7842 cfs_rq_load_avg(cfs_rq) + 1);
7843}
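
/*
 * Worked h_load example: a task with se.avg.load_avg = 512 on a
 * cfs_rq with h_load = 2048 and cfs_rq_load_avg() = 4096 gets
 * task_h_load = 512 * 2048 / 4097 ~= 255, i.e. the task receives the
 * share of the group's hierarchical load that matches its share of
 * the group's own load.
 */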
7844#else
7845static inline void update_blocked_averages(int cpu)
7846{
7847 struct rq *rq = cpu_rq(cpu);
7848 struct cfs_rq *cfs_rq = &rq->cfs;
7849 const struct sched_class *curr_class;
7850 struct rq_flags rf;
7851
7852 rq_lock_irqsave(rq, &rf);
7853 update_rq_clock(rq);
7854 update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
7855
7856 curr_class = rq->curr->sched_class;
7857 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7858 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7859 update_irq_load_avg(rq, 0);
7860#ifdef CONFIG_NO_HZ_COMMON
7861 rq->last_blocked_load_update_tick = jiffies;
7862 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7863 rq->has_blocked_load = 0;
7864#endif
7865 rq_unlock_irqrestore(rq, &rf);
7866}
7867
7868static unsigned long task_h_load(struct task_struct *p)
7869{
7870 return p->se.avg.load_avg;
7871}
7872#endif
7873
/********** Helpers for find_busiest_group ************************/

/*
 * sg_lb_stats - stats of a sched_group required for load balancing.
 */
7879struct sg_lb_stats {
7880 unsigned long avg_load;
7881 unsigned long group_load;
7882 unsigned long sum_weighted_load;
7883 unsigned long load_per_task;
7884 unsigned long group_capacity;
7885 unsigned long group_util;
7886 unsigned int sum_nr_running;
7887 unsigned int idle_cpus;
7888 unsigned int group_weight;
7889 enum group_type group_type;
7890 int group_no_capacity;
7891 unsigned long group_misfit_task_load;
7892#ifdef CONFIG_NUMA_BALANCING
7893 unsigned int nr_numa_running;
7894 unsigned int nr_preferred_running;
7895#endif
7896};
7897
/*
 * sd_lb_stats - stats of a sched_domain required for load balancing.
 */
7902struct sd_lb_stats {
7903 struct sched_group *busiest;
7904 struct sched_group *local;
7905 unsigned long total_running;
7906 unsigned long total_load;
7907 unsigned long total_capacity;
7908 unsigned long avg_load;
7909
7910 struct sg_lb_stats busiest_stat;
7911 struct sg_lb_stats local_stat;
7912};
7913
7914static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7915{
7916
7917
7918
7919
7920
7921
7922 *sds = (struct sd_lb_stats){
7923 .busiest = NULL,
7924 .local = NULL,
7925 .total_running = 0UL,
7926 .total_load = 0UL,
7927 .total_capacity = 0UL,
7928 .busiest_stat = {
7929 .avg_load = 0UL,
7930 .sum_nr_running = 0,
7931 .group_type = group_other,
7932 },
7933 };
7934}
7935
/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 *
 * Return: The load index.
 */
7943static inline int get_sd_load_idx(struct sched_domain *sd,
7944 enum cpu_idle_type idle)
7945{
7946 int load_idx;
7947
7948 switch (idle) {
7949 case CPU_NOT_IDLE:
7950 load_idx = sd->busy_idx;
7951 break;
7952
7953 case CPU_NEWLY_IDLE:
7954 load_idx = sd->newidle_idx;
7955 break;
7956 default:
7957 load_idx = sd->idle_idx;
7958 break;
7959 }
7960
7961 return load_idx;
7962}
7963
7964static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7965{
7966 struct rq *rq = cpu_rq(cpu);
7967 unsigned long max = arch_scale_cpu_capacity(sd, cpu);
7968 unsigned long used, free;
7969 unsigned long irq;
7970
7971 irq = cpu_util_irq(rq);
7972
7973 if (unlikely(irq >= max))
7974 return 1;
7975
7976 used = READ_ONCE(rq->avg_rt.util_avg);
7977 used += READ_ONCE(rq->avg_dl.util_avg);
7978
7979 if (unlikely(used >= max))
7980 return 1;
7981
7982 free = max - used;
7983
7984 return scale_irq_capacity(free, irq, max);
7985}
7986
7987static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7988{
7989 unsigned long capacity = scale_rt_capacity(sd, cpu);
7990 struct sched_group *sdg = sd->groups;
7991
7992 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
7993
7994 if (!capacity)
7995 capacity = 1;
7996
7997 cpu_rq(cpu)->cpu_capacity = capacity;
7998 sdg->sgc->capacity = capacity;
7999 sdg->sgc->min_capacity = capacity;
8000 sdg->sgc->max_capacity = capacity;
8001}
8002
8003void update_group_capacity(struct sched_domain *sd, int cpu)
8004{
8005 struct sched_domain *child = sd->child;
8006 struct sched_group *group, *sdg = sd->groups;
8007 unsigned long capacity, min_capacity, max_capacity;
8008 unsigned long interval;
8009
8010 interval = msecs_to_jiffies(sd->balance_interval);
8011 interval = clamp(interval, 1UL, max_load_balance_interval);
8012 sdg->sgc->next_update = jiffies + interval;
8013
8014 if (!child) {
8015 update_cpu_capacity(sd, cpu);
8016 return;
8017 }
8018
8019 capacity = 0;
8020 min_capacity = ULONG_MAX;
8021 max_capacity = 0;
8022
8023 if (child->flags & SD_OVERLAP) {
8024
8025
8026
8027
8028
8029 for_each_cpu(cpu, sched_group_span(sdg)) {
8030 struct sched_group_capacity *sgc;
8031 struct rq *rq = cpu_rq(cpu);
8032
/*
 * build_sched_domains() -> init_sched_groups_capacity()
 * gets here before we've attached the domains to the
 * runqueues.
 *
 * Use capacity_of(), which is set irrespective of domains
 * in update_cpu_capacity().
 *
 * This avoids capacity from being 0 and
 * causing divide-by-zero issues on boot.
 */
8044 if (unlikely(!rq->sd)) {
8045 capacity += capacity_of(cpu);
8046 } else {
8047 sgc = rq->sd->groups->sgc;
8048 capacity += sgc->capacity;
8049 }
8050
8051 min_capacity = min(capacity, min_capacity);
8052 max_capacity = max(capacity, max_capacity);
8053 }
8054 } else {
8055
8056
8057
8058
8059
8060 group = child->groups;
8061 do {
8062 struct sched_group_capacity *sgc = group->sgc;
8063
8064 capacity += sgc->capacity;
8065 min_capacity = min(sgc->min_capacity, min_capacity);
8066 max_capacity = max(sgc->max_capacity, max_capacity);
8067 group = group->next;
8068 } while (group != child->groups);
8069 }
8070
8071 sdg->sgc->capacity = capacity;
8072 sdg->sgc->min_capacity = min_capacity;
8073 sdg->sgc->max_capacity = max_capacity;
8074}
8075
/*
 * Check whether the capacity of the rq has been noticeably reduced by
 * side activity. The imbalance_pct is used for the threshold.
 * Return true if the capacity is reduced.
 */
8081static inline int
8082check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8083{
8084 return ((rq->cpu_capacity * sd->imbalance_pct) <
8085 (rq->cpu_capacity_orig * 100));
8086}
8087
/*
 * Check whether a rq has a misfit task and if it looks like we can
 * actually help that task: we can migrate the task to a CPU of higher
 * capacity, or the task's current CPU is heavily pressured.
 */
8093static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8094{
8095 return rq->misfit_task_load &&
8096 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8097 check_cpu_capacity(rq, sd));
8098}
8099
/*
 * Group imbalance indicates (and tries to solve) the problem where
 * balancing groups is inadequate due to ->cpus_allowed constraints.
 *
 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each
 * with a cpumask covering 1 CPU of the first group and 3 CPUs of the
 * second group. Something like:
 *
 *	{ 0 1 2 3 } { 4 5 6 7 }
 *	        *     * * *
 *
 * If we were to balance group-wise we'd place two tasks in the first
 * group and two tasks in the second group. Clearly this is undesired
 * as it will overload cpu 3 and leave one of the CPUs in the second
 * group unused.
 *
 * The current solution to this issue is detecting the skew in the first
 * group by noticing the lower domain failed to reach balance and had
 * difficulty moving tasks due to affinity constraints, and propagating
 * that up via sgc->imbalance.
 */
8129static inline int sg_imbalanced(struct sched_group *group)
8130{
8131 return group->sgc->imbalance;
8132}
8133
/*
 * group_has_capacity returns true if the group has spare capacity that
 * could be used by some tasks. We consider that a group has spare
 * capacity if the number of tasks is smaller than the number of CPUs or
 * if the utilization is lower than the available capacity for CFS tasks.
 * For the latter, we use a threshold to stabilize the state, to take
 * into account the variance of the tasks' load and to return true only
 * if the available capacity is meaningful for the load balancer: an
 * available capacity of 1% can appear, but it is of no benefit to the
 * load balance.
 */
8146static inline bool
8147group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8148{
8149 if (sgs->sum_nr_running < sgs->group_weight)
8150 return true;
8151
8152 if ((sgs->group_capacity * 100) >
8153 (sgs->group_util * env->sd->imbalance_pct))
8154 return true;
8155
8156 return false;
8157}
8158
/*
 * group_is_overloaded returns true if the group has more tasks than it
 * can handle. group_is_overloaded is not equal to !group_has_capacity
 * because a group with exactly the right number of tasks has no spare
 * capacity left but is not overloaded, so both helpers return false.
 */
8167static inline bool
8168group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8169{
8170 if (sgs->sum_nr_running <= sgs->group_weight)
8171 return false;
8172
8173 if ((sgs->group_capacity * 100) <
8174 (sgs->group_util * env->sd->imbalance_pct))
8175 return true;
8176
8177 return false;
8178}
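
/*
 * Example, assuming imbalance_pct = 117 (the usual LLC-level value):
 * a group with group_capacity = 1024 counts as overloaded once
 * group_util * 117 exceeds 1024 * 100, i.e. utilization above ~875.
 * The ~17% margin keeps the state from flapping on small load
 * variations.
 */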
8179
8180
8181
8182
8183
8184static inline bool
8185group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8186{
8187 return sg->sgc->min_capacity * capacity_margin <
8188 ref->sgc->min_capacity * 1024;
8189}
8190
8191
8192
8193
8194
8195static inline bool
8196group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8197{
8198 return sg->sgc->max_capacity * capacity_margin <
8199 ref->sgc->max_capacity * 1024;
8200}
8201
8202static inline enum
8203group_type group_classify(struct sched_group *group,
8204 struct sg_lb_stats *sgs)
8205{
8206 if (sgs->group_no_capacity)
8207 return group_overloaded;
8208
8209 if (sg_imbalanced(group))
8210 return group_imbalanced;
8211
8212 if (sgs->group_misfit_task_load)
8213 return group_misfit_task;
8214
8215 return group_other;
8216}
8217
8218static bool update_nohz_stats(struct rq *rq, bool force)
8219{
8220#ifdef CONFIG_NO_HZ_COMMON
8221 unsigned int cpu = rq->cpu;
8222
8223 if (!rq->has_blocked_load)
8224 return false;
8225
8226 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8227 return false;
8228
8229 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8230 return true;
8231
8232 update_blocked_averages(cpu);
8233
8234 return rq->has_blocked_load;
8235#else
8236 return false;
8237#endif
8238}
8239
/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @sg_status: holds flags indicating the status of the sched_group.
 */
8247static inline void update_sg_lb_stats(struct lb_env *env,
8248 struct sched_group *group,
8249 struct sg_lb_stats *sgs,
8250 int *sg_status)
8251{
8252 int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8253 int load_idx = get_sd_load_idx(env->sd, env->idle);
8254 unsigned long load;
8255 int i, nr_running;
8256
8257 memset(sgs, 0, sizeof(*sgs));
8258
8259 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8260 struct rq *rq = cpu_rq(i);
8261
8262 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8263 env->flags |= LBF_NOHZ_AGAIN;
8264
8265
8266 if (local_group)
8267 load = target_load(i, load_idx);
8268 else
8269 load = source_load(i, load_idx);
8270
8271 sgs->group_load += load;
8272 sgs->group_util += cpu_util(i);
8273 sgs->sum_nr_running += rq->cfs.h_nr_running;
8274
8275 nr_running = rq->nr_running;
8276 if (nr_running > 1)
8277 *sg_status |= SG_OVERLOAD;
8278
8279 if (cpu_overutilized(i))
8280 *sg_status |= SG_OVERUTILIZED;
8281
8282#ifdef CONFIG_NUMA_BALANCING
8283 sgs->nr_numa_running += rq->nr_numa_running;
8284 sgs->nr_preferred_running += rq->nr_preferred_running;
8285#endif
8286 sgs->sum_weighted_load += weighted_cpuload(rq);
8287
8288
8289
8290 if (!nr_running && idle_cpu(i))
8291 sgs->idle_cpus++;
8292
8293 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8294 sgs->group_misfit_task_load < rq->misfit_task_load) {
8295 sgs->group_misfit_task_load = rq->misfit_task_load;
8296 *sg_status |= SG_OVERLOAD;
8297 }
8298 }
8299
8300
8301 sgs->group_capacity = group->sgc->capacity;
8302 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8303
8304 if (sgs->sum_nr_running)
8305 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8306
8307 sgs->group_weight = group->group_weight;
8308
8309 sgs->group_no_capacity = group_is_overloaded(env, sgs);
8310 sgs->group_type = group_classify(group, sgs);
8311}
8312
/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected
 * busiest group. %false otherwise.
 */
8326static bool update_sd_pick_busiest(struct lb_env *env,
8327 struct sd_lb_stats *sds,
8328 struct sched_group *sg,
8329 struct sg_lb_stats *sgs)
8330{
8331 struct sg_lb_stats *busiest = &sds->busiest_stat;
8332
8333
8334
8335
8336
8337
8338
8339 if (sgs->group_type == group_misfit_task &&
8340 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
8341 !group_has_capacity(env, &sds->local_stat)))
8342 return false;
8343
8344 if (sgs->group_type > busiest->group_type)
8345 return true;
8346
8347 if (sgs->group_type < busiest->group_type)
8348 return false;
8349
8350 if (sgs->avg_load <= busiest->avg_load)
8351 return false;
8352
8353 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
8354 goto asym_packing;
8355
8356
8357
8358
8359
8360
8361
8362 if (sgs->sum_nr_running <= sgs->group_weight &&
8363 group_smaller_min_cpu_capacity(sds->local, sg))
8364 return false;
8365
8366
8367
8368
8369 if (sgs->group_type == group_misfit_task &&
8370 sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8371 return false;
8372
8373asym_packing:
8374
8375 if (!(env->sd->flags & SD_ASYM_PACKING))
8376 return true;
8377
8378
8379 if (env->idle == CPU_NOT_IDLE)
8380 return true;
8381
8382
8383
8384
8385
8386 if (sgs->sum_nr_running &&
8387 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
8388 if (!sds->busiest)
8389 return true;
8390
8391
8392 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
8393 sg->asym_prefer_cpu))
8394 return true;
8395 }
8396
8397 return false;
8398}
8399
8400#ifdef CONFIG_NUMA_BALANCING
8401static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8402{
8403 if (sgs->sum_nr_running > sgs->nr_numa_running)
8404 return regular;
8405 if (sgs->sum_nr_running > sgs->nr_preferred_running)
8406 return remote;
8407 return all;
8408}
8409
8410static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8411{
8412 if (rq->nr_running > rq->nr_numa_running)
8413 return regular;
8414 if (rq->nr_running > rq->nr_preferred_running)
8415 return remote;
8416 return all;
8417}
8418#else
8419static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8420{
8421 return all;
8422}
8423
8424static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8425{
8426 return regular;
8427}
8428#endif
8429
/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @sds: variable to hold the statistics for this sched_domain.
 */
8435static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8436{
8437 struct sched_domain *child = env->sd->child;
8438 struct sched_group *sg = env->sd->groups;
8439 struct sg_lb_stats *local = &sds->local_stat;
8440 struct sg_lb_stats tmp_sgs;
8441 bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
8442 int sg_status = 0;
8443
8444#ifdef CONFIG_NO_HZ_COMMON
8445 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8446 env->flags |= LBF_NOHZ_STATS;
8447#endif
8448
8449 do {
8450 struct sg_lb_stats *sgs = &tmp_sgs;
8451 int local_group;
8452
8453 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
8454 if (local_group) {
8455 sds->local = sg;
8456 sgs = local;
8457
8458 if (env->idle != CPU_NEWLY_IDLE ||
8459 time_after_eq(jiffies, sg->sgc->next_update))
8460 update_group_capacity(env->sd, env->dst_cpu);
8461 }
8462
8463 update_sg_lb_stats(env, sg, sgs, &sg_status);
8464
8465 if (local_group)
8466 goto next_group;
8467
/*
 * In case the child domain prefers tasks to go to siblings first,
 * lower the sg capacity so that we'll try and move all the excess
 * tasks away. We lower the capacity of a group only if the local
 * group has the capacity to fit these excess tasks. The extra check
 * prevents the case where you always pull from the heaviest group
 * when it is already under-utilized (possible with a large weight
 * task outweighing the tasks on the system).
 */
8478 if (prefer_sibling && sds->local &&
8479 group_has_capacity(env, local) &&
8480 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
8481 sgs->group_no_capacity = 1;
8482 sgs->group_type = group_classify(sg, sgs);
8483 }
8484
8485 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8486 sds->busiest = sg;
8487 sds->busiest_stat = *sgs;
8488 }
8489
8490next_group:
8491
8492 sds->total_running += sgs->sum_nr_running;
8493 sds->total_load += sgs->group_load;
8494 sds->total_capacity += sgs->group_capacity;
8495
8496 sg = sg->next;
8497 } while (sg != env->sd->groups);
8498
8499#ifdef CONFIG_NO_HZ_COMMON
8500 if ((env->flags & LBF_NOHZ_AGAIN) &&
8501 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8502
8503 WRITE_ONCE(nohz.next_blocked,
8504 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8505 }
8506#endif
8507
8508 if (env->sd->flags & SD_NUMA)
8509 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8510
8511 if (!env->sd->parent) {
8512 struct root_domain *rd = env->dst_rq->rd;
8513
8514
8515 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
8516
8517
8518 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8519 } else if (sg_status & SG_OVERUTILIZED) {
8520 WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
8521 }
8522}
8523
/**
 * check_asym_packing - Check to see if the group is packed into the
 *			sched domain.
 *
 * This is primarily intended to be used at the sibling level. Some
 * cores, like POWER7, prefer to use lower numbered SMT threads: they
 * can move to lower SMT modes only when higher threads are idle, and
 * in lower SMT modes the threads perform better since they share fewer
 * core resources. Hence when we have idle threads, we want them to be
 * the higher ones.
 *
 * Return: 1 when packing is required and a task should be moved to
 * this CPU. The amount of the imbalance is returned in env->imbalance.
 */
8547static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8548{
8549 int busiest_cpu;
8550
8551 if (!(env->sd->flags & SD_ASYM_PACKING))
8552 return 0;
8553
8554 if (env->idle == CPU_NOT_IDLE)
8555 return 0;
8556
8557 if (!sds->busiest)
8558 return 0;
8559
8560 busiest_cpu = sds->busiest->asym_prefer_cpu;
8561 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
8562 return 0;
8563
8564 env->imbalance = sds->busiest_stat.group_load;
8565
8566 return 1;
8567}
8568
8569/**
8570 * fix_small_imbalance - Calculate the minor imbalance that exists
8571 *			amongst the groups of a sched_domain, during
8572 *			load balancing.
8573 * @env: The load balancing environment.
8574 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
8575 */
8576static inline
8577void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8578{
8579 unsigned long tmp, capa_now = 0, capa_move = 0;
8580 unsigned int imbn = 2;
8581 unsigned long scaled_busy_load_per_task;
8582 struct sg_lb_stats *local, *busiest;
8583
8584 local = &sds->local_stat;
8585 busiest = &sds->busiest_stat;
8586
8587 if (!local->sum_nr_running)
8588 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8589 else if (busiest->load_per_task > local->load_per_task)
8590 imbn = 1;
8591
8592 scaled_busy_load_per_task =
8593 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8594 busiest->group_capacity;
8595
8596 if (busiest->avg_load + scaled_busy_load_per_task >=
8597 local->avg_load + (scaled_busy_load_per_task * imbn)) {
8598 env->imbalance = busiest->load_per_task;
8599 return;
8600 }
8601
8602 /*
8603 * OK, we don't have enough imbalance to justify moving tasks,
8604 * however we may be able to increase the total CPU capacity used by
8605 * moving them.
8606 */
8607
8608 capa_now += busiest->group_capacity *
8609 min(busiest->load_per_task, busiest->avg_load);
8610 capa_now += local->group_capacity *
8611 min(local->load_per_task, local->avg_load);
8612 capa_now /= SCHED_CAPACITY_SCALE;
8613
8614 /* Amount of load we'd subtract */
8615 if (busiest->avg_load > scaled_busy_load_per_task) {
8616 capa_move += busiest->group_capacity *
8617 min(busiest->load_per_task,
8618 busiest->avg_load - scaled_busy_load_per_task);
8619 }
8620
8621 /* Amount of load we'd add */
8622 if (busiest->avg_load * busiest->group_capacity <
8623 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8624 tmp = (busiest->avg_load * busiest->group_capacity) /
8625 local->group_capacity;
8626 } else {
8627 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8628 local->group_capacity;
8629 }
8630 capa_move += local->group_capacity *
8631 min(local->load_per_task, local->avg_load + tmp);
8632 capa_move /= SCHED_CAPACITY_SCALE;
8633
8634 /* Move if we gain throughput */
8635 if (capa_move > capa_now)
8636 env->imbalance = busiest->load_per_task;
8637}
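/*
 * Illustrative numbers for the early return in fix_small_imbalance()
 * (made-up values, SCHED_CAPACITY_SCALE == 1024): busiest has
 * load_per_task == 512 and group_capacity == 1024, so
 * scaled_busy_load_per_task == 512; with busiest->avg_load == 1024,
 * an idle local group (avg_load == 0) and imbn == 2:
 *
 *	1024 + 512 >= 0 + 2 * 512
 *
 * holds, so moving one whole task cannot overshoot the local group and
 * env->imbalance is simply one task's load, 512. Only when that check
 * fails do we fall through to the capa_now/capa_move throughput
 * estimate.
 */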
8638
8639/**
8640 * calculate_imbalance - Calculate the amount of imbalance present within the
8641 *			 groups of a given sched_domain during load balance.
8642 * @env: load balancing environment
8643 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
8644 */
8645static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8646{
8647 unsigned long max_pull, load_above_capacity = ~0UL;
8648 struct sg_lb_stats *local, *busiest;
8649
8650 local = &sds->local_stat;
8651 busiest = &sds->busiest_stat;
8652
8653 if (busiest->group_type == group_imbalanced) {
8654 /*
8655 * In the group_imbalanced case we cannot rely on group-wide averages
8656 * to ensure CPU-load equilibrium; look at wider averages.
8657 */
8658 busiest->load_per_task =
8659 min(busiest->load_per_task, sds->avg_load);
8660 }
8661
8662 /*
8663 * Avg load of the busiest sg can be less and avg load of the local sg
8664 * can be greater than the avg load across all sgs of sd because avg
8665 * load factors in sg capacity, and sgs with a smaller group_type are
8666 * skipped when updating the busiest sg:
8667 */
8668 if (busiest->group_type != group_misfit_task &&
8669 (busiest->avg_load <= sds->avg_load ||
8670 local->avg_load >= sds->avg_load)) {
8671 env->imbalance = 0;
8672 return fix_small_imbalance(env, sds);
8673 }
8674
8675 /*
8676 * If there aren't any idle CPUs, avoid creating some.
8677 */
8678 if (busiest->group_type == group_overloaded &&
8679 local->group_type == group_overloaded) {
8680 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
8681 if (load_above_capacity > busiest->group_capacity) {
8682 load_above_capacity -= busiest->group_capacity;
8683 load_above_capacity *= scale_load_down(NICE_0_LOAD);
8684 load_above_capacity /= busiest->group_capacity;
8685 } else
8686 load_above_capacity = ~0UL;
8687 }
8688
8689 /*
8690 * We're trying to get all the CPUs to the average_load, so we don't
8691 * want to push ourselves above the average load, nor do we wish to
8692 * reduce the max loaded CPU below the average load. At the same time,
8693 * we also don't want to reduce the group load below the group
8694 * capacity. Thus we look for the minimum possible imbalance.
8695 */
8696 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8697
8698 /* How much load to actually move to equalise the imbalance */
8699 env->imbalance = min(
8700 max_pull * busiest->group_capacity,
8701 (sds->avg_load - local->avg_load) * local->group_capacity
8702 ) / SCHED_CAPACITY_SCALE;
8703
8704 /* Boost the imbalance to allow a misfit task to be balanced. */
8705 if (busiest->group_type == group_misfit_task) {
8706 env->imbalance = max_t(long, env->imbalance,
8707 busiest->group_misfit_task_load);
8708 }
8709
8710 /*
8711 * If *imbalance is less than the average load per runnable task
8712 * there is no guarantee that any tasks will be moved, so we'll have
8713 * a think about bumping its value to force at least one task to be
8714 * moved.
8715 */
8716 if (env->imbalance < busiest->load_per_task)
8717 return fix_small_imbalance(env, sds);
8718}
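/*
 * Worked example for the max_pull computation above (illustrative
 * values, SCHED_CAPACITY_SCALE == 1024): busiest->avg_load == 1400,
 * local->avg_load == 600, sds->avg_load == 1000, both group capacities
 * at 1024 and load_above_capacity left at ~0UL:
 *
 *	max_pull  = min(1400 - 1000, ~0UL)                      = 400
 *	imbalance = min(400 * 1024, (1000 - 600) * 1024) / 1024 = 400
 *
 * i.e. we pull just enough to bring both sides to the domain average,
 * never past it.
 */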
8719
8720/******* find_busiest_group() helpers end here *********************/
8721
8722/**
8723 * find_busiest_group - Returns the busiest group within the sched_domain
8724 * if there is an imbalance.
8725 *
8726 * Also calculates the amount of runnable load which should be moved
8727 * to restore balance.
8728 *
8729 * @env: The load balancing environment.
8730 *
8731 * Return: the busiest group if an imbalance exists, NULL otherwise.
8732 */
8733static struct sched_group *find_busiest_group(struct lb_env *env)
8734{
8735 struct sg_lb_stats *local, *busiest;
8736 struct sd_lb_stats sds;
8737
8738 init_sd_lb_stats(&sds);
8739
8740 /*
8741 * Compute the various statistics relevant for load balancing at
8742 * this level.
8743 */
8744 update_sd_lb_stats(env, &sds);
8745
8746 if (sched_energy_enabled()) {
8747 struct root_domain *rd = env->dst_rq->rd;
8748
8749 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
8750 goto out_balanced;
8751 }
8752
8753 local = &sds.local_stat;
8754 busiest = &sds.busiest_stat;
8755
8756 /* ASYM feature bypasses the nice load balance check */
8757 if (check_asym_packing(env, &sds))
8758 return sds.busiest;
8759
8760 /* There is no busy sibling group to pull tasks from */
8761 if (!sds.busiest || busiest->sum_nr_running == 0)
8762 goto out_balanced;
8763
8764 /* XXX: broken for overlapping NUMA groups */
8765 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8766 / sds.total_capacity;
8767
8768 /*
8769 * If the busiest group is imbalanced the below checks don't
8770 * work because they assume all things are equal, which typically
8771 * isn't true due to cpus_allowed constraints and the like.
8772 */
8773 if (busiest->group_type == group_imbalanced)
8774 goto force_balance;
8775
8776 /*
8777 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
8778 * capacities from resulting in underutilization due to avg_load.
8779 */
8780 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8781 busiest->group_no_capacity)
8782 goto force_balance;
8783
8784 /* Misfit tasks should be dealt with regardless of the avg load */
8785 if (busiest->group_type == group_misfit_task)
8786 goto force_balance;
8787
8788 /*
8789 * If the local group is busier than the selected busiest group
8790 * don't try and pull any tasks.
8791 */
8792 if (local->avg_load >= busiest->avg_load)
8793 goto out_balanced;
8794
8795 /*
8796 * Don't pull any tasks if this group is already above the domain
8797 * average load.
8798 */
8799 if (local->avg_load >= sds.avg_load)
8800 goto out_balanced;
8801
8802 if (env->idle == CPU_IDLE) {
8803 /*
8804 * This CPU is idle. If the busiest group is not overloaded
8805 * and there is no imbalance between this and the busiest group
8806 * wrt idle CPUs, it is balanced. The imbalance becomes
8807 * significant if the diff is greater than 1, otherwise we
8808 * might end up just moving the imbalance to another group.
8809 */
8810 if ((busiest->group_type != group_overloaded) &&
8811 (local->idle_cpus <= (busiest->idle_cpus + 1)))
8812 goto out_balanced;
8813 } else {
8814 /*
8815 * In the CPU_NEWLY_IDLE and CPU_NOT_IDLE cases, use
8816 * imbalance_pct to be conservative.
8817 */
8818 if (100 * busiest->avg_load <=
8819 env->sd->imbalance_pct * local->avg_load)
8820 goto out_balanced;
8821 }
8822
8823force_balance:
8824 /* Looks like there is an imbalance. Compute it: */
8825 env->src_grp_type = busiest->group_type;
8826 calculate_imbalance(env, &sds);
8827 return env->imbalance ? sds.busiest : NULL;
8828
8829out_balanced:
8830 env->imbalance = 0;
8831 return NULL;
8832}
8833
8834/*
8835 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8836 */
8837static struct rq *find_busiest_queue(struct lb_env *env,
8838 struct sched_group *group)
8839{
8840 struct rq *busiest = NULL, *rq;
8841 unsigned long busiest_load = 0, busiest_capacity = 1;
8842 int i;
8843
8844 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8845 unsigned long capacity, wl;
8846 enum fbq_type rt;
8847
8848 rq = cpu_rq(i);
8849 rt = fbq_classify_rq(rq);
8850
8851 /*
8852 * We classify groups/runqueues into three groups:
8853 *  - regular: there are !numa tasks
8854 *  - remote:  there are numa tasks that run on the 'wrong' node
8855 *  - all:     there is no distinction
8856 *
8857 * In order to avoid migrating ideally placed numa tasks,
8858 * ignore those when there's better options.
8859 *
8860 * If we ignore the actual busiest queue to migrate another
8861 * task, the next balance pass can still reduce the busiest
8862 * queue by moving tasks around inside the node.
8863 *
8864 * If we cannot move enough load due to this classification
8865 * the next pass will adjust the group classification and
8866 * allow migration of more tasks.
8867 *
8868 * Both cases only affect the total convergence complexity.
8869 */
8870 if (rt > env->fbq_type)
8871 continue;
8872
8873 /*
8874 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
8875 * seek the "biggest" misfit task.
8876 */
8877 if (env->src_grp_type == group_misfit_task) {
8878 if (rq->misfit_task_load > busiest_load) {
8879 busiest_load = rq->misfit_task_load;
8880 busiest = rq;
8881 }
8882
8883 continue;
8884 }
8885
8886 capacity = capacity_of(i);
8887
8888 /*
8889 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
8890 * eventually lead to active_balancing high->low capacity.
8891 * Higher per-CPU capacity is considered better than balancing
8892 * average load.
8893 */
8894 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8895 capacity_of(env->dst_cpu) < capacity &&
8896 rq->nr_running == 1)
8897 continue;
8898
8899 wl = weighted_cpuload(rq);
8900
8901 /*
8902 * When comparing with imbalance, use weighted_cpuload(),
8903 * which is not scaled with the CPU capacity.
8904 */
8905
8906 if (rq->nr_running == 1 && wl > env->imbalance &&
8907 !check_cpu_capacity(rq, env->sd))
8908 continue;
8909
8910 /*
8911 * For the load comparisons with the other CPUs, consider
8912 * the weighted_cpuload() scaled with the CPU capacity, so
8913 * that the load can be moved away from the CPU that is
8914 * potentially running at a lower capacity.
8915 *
8916 * Thus we're looking for the max (wl_i / capacity_i); a
8917 * crosswise multiplication to rid ourselves of the division
8918 * works out to: wl_i * capacity_j > wl_j * capacity_i,
8919 * where j is our previous maximum.
8920 */
8921 if (wl * busiest_capacity > busiest_load * capacity) {
8922 busiest_load = wl;
8923 busiest_capacity = capacity;
8924 busiest = rq;
8925 }
8926 }
8927
8928 return busiest;
8929}
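/*
 * Example of the cross-multiplied comparison in find_busiest_queue()
 * (made-up numbers): candidate A with wl == 800 on capacity 1024
 * versus candidate B with wl == 600 on capacity 512. If A is the
 * current maximum, B still wins despite its lower raw load because
 *
 *	600 * 1024 > 800 * 512
 *
 * i.e. per unit of capacity B is the more loaded runqueue.
 */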
8930
8931/*
8932 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
8933 * so long as it is large enough.
8934 */
8935#define MAX_PINNED_INTERVAL 512
8936
8937static inline bool
8938asym_active_balance(struct lb_env *env)
8939{
8940 /*
8941 * ASYM_PACKING needs to force migrate tasks from busy but
8942 * lower priority CPUs in order to pack all tasks in the
8943 * highest priority CPUs.
8944 */
8945 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
8946 sched_asym_prefer(env->dst_cpu, env->src_cpu);
8947}
8948
8949static inline bool
8950voluntary_active_balance(struct lb_env *env)
8951{
8952 struct sched_domain *sd = env->sd;
8953
8954 if (asym_active_balance(env))
8955 return 1;
8956
8957 /*
8958 * The dst_cpu is idle and the src CPU has only 1 CFS task.
8959 * It's worth migrating the task if the src_cpu's capacity is reduced
8960 * because of other sched_class tasks or IRQs, while more capacity
8961 * stays available on dst_cpu.
8962 */
8963 if ((env->idle != CPU_NOT_IDLE) &&
8964 (env->src_rq->cfs.h_nr_running == 1)) {
8965 if ((check_cpu_capacity(env->src_rq, sd)) &&
8966 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8967 return 1;
8968 }
8969
8970 if (env->src_grp_type == group_misfit_task)
8971 return 1;
8972
8973 return 0;
8974}
8975
8976static int need_active_balance(struct lb_env *env)
8977{
8978 struct sched_domain *sd = env->sd;
8979
8980 if (voluntary_active_balance(env))
8981 return 1;
8982
8983 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8984}
8985
8986static int active_load_balance_cpu_stop(void *data);
8987
8988static int should_we_balance(struct lb_env *env)
8989{
8990 struct sched_group *sg = env->sd->groups;
8991 int cpu, balance_cpu = -1;
8992
8993 /*
8994 * Ensure the balancing environment is consistent; can happen
8995 * when the softirq triggers 'during' hotplug.
8996 */
8997 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8998 return 0;
8999
9000 /*
9001 * In the newly idle case, we will allow all the CPUs
9002 * to do the newly idle load balance.
9003 */
9004 if (env->idle == CPU_NEWLY_IDLE)
9005 return 1;
9006
9007 /* Try to find the first idle CPU */
9008 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
9009 if (!idle_cpu(cpu))
9010 continue;
9011
9012 balance_cpu = cpu;
9013 break;
9014 }
9015
9016 if (balance_cpu == -1)
9017 balance_cpu = group_balance_cpu(sg);
9018
9019 /*
9020 * The first idle CPU, or the first CPU (busiest) in this sched group,
9021 * is eligible for doing load balancing at this and above domains.
9022 */
9023 return balance_cpu == env->dst_cpu;
9024}
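/*
 * Example (hypothetical 4-CPU group {0,1,2,3} with CPUs 1 and 3 idle):
 * every CPU running the periodic balance for this group computes
 * balance_cpu == 1, the first idle CPU in the balance mask, so only
 * CPU 1 proceeds and the others bail out early -- one CPU balances on
 * behalf of the whole group at this domain level.
 */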
9025
9026/*
9027 * Check this_cpu to ensure it is balanced within its domain. Attempt to move
9028 * tasks if there is an imbalance.
9029 */
9030static int load_balance(int this_cpu, struct rq *this_rq,
9031 struct sched_domain *sd, enum cpu_idle_type idle,
9032 int *continue_balancing)
9033{
9034 int ld_moved, cur_ld_moved, active_balance = 0;
9035 struct sched_domain *sd_parent = sd->parent;
9036 struct sched_group *group;
9037 struct rq *busiest;
9038 struct rq_flags rf;
9039 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
9040
9041 struct lb_env env = {
9042 .sd = sd,
9043 .dst_cpu = this_cpu,
9044 .dst_rq = this_rq,
9045 .dst_grpmask = sched_group_span(sd->groups),
9046 .idle = idle,
9047 .loop_break = sched_nr_migrate_break,
9048 .cpus = cpus,
9049 .fbq_type = all,
9050 .tasks = LIST_HEAD_INIT(env.tasks),
9051 };
9052
9053 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
9054
9055 schedstat_inc(sd->lb_count[idle]);
9056
9057redo:
9058 if (!should_we_balance(&env)) {
9059 *continue_balancing = 0;
9060 goto out_balanced;
9061 }
9062
9063 group = find_busiest_group(&env);
9064 if (!group) {
9065 schedstat_inc(sd->lb_nobusyg[idle]);
9066 goto out_balanced;
9067 }
9068
9069 busiest = find_busiest_queue(&env, group);
9070 if (!busiest) {
9071 schedstat_inc(sd->lb_nobusyq[idle]);
9072 goto out_balanced;
9073 }
9074
9075 BUG_ON(busiest == env.dst_rq);
9076
9077 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
9078
9079 env.src_cpu = busiest->cpu;
9080 env.src_rq = busiest;
9081
9082 ld_moved = 0;
9083 if (busiest->nr_running > 1) {
9084 /*
9085 * Attempt to move tasks. If find_busiest_group has found
9086 * an imbalance but busiest->nr_running <= 1, the group is
9087 * still unbalanced. ld_moved simply stays zero, so it is
9088 * correctly treated as an imbalance.
9089 */
9090 env.flags |= LBF_ALL_PINNED;
9091 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
9092
9093more_balance:
9094 rq_lock_irqsave(busiest, &rf);
9095 update_rq_clock(busiest);
9096
9097 /*
9098 * cur_ld_moved - load moved in the current iteration
9099 * ld_moved     - cumulative load moved across iterations
9100 */
9101 cur_ld_moved = detach_tasks(&env);
9102
9103 /*
9104 * We've detached some tasks from busiest_rq. Every
9105 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
9106 * unlock busiest->lock, and we are able to be sure
9107 * that nobody can manipulate the tasks in parallel.
9108 * See the task_rq_lock() family for the details.
9109 */
9110
9111 rq_unlock(busiest, &rf);
9112
9113 if (cur_ld_moved) {
9114 attach_tasks(&env);
9115 ld_moved += cur_ld_moved;
9116 }
9117
9118 local_irq_restore(rf.flags);
9119
9120 if (env.flags & LBF_NEED_BREAK) {
9121 env.flags &= ~LBF_NEED_BREAK;
9122 goto more_balance;
9123 }
9124
9125 /*
9126 * Revisit (affine) tasks on src_cpu that couldn't be moved to
9127 * us and move them to an alternate dst_cpu in our sched_group
9128 * where they can run. The upper limit on how many times we
9129 * iterate on the same src_cpu depends on the number of CPUs in
9130 * our sched_group.
9131 *
9132 * This changes load balance semantics a bit on who can move
9133 * load to a given_cpu. In addition to the given_cpu itself
9134 * (or an ilb_cpu acting on its behalf where given_cpu is
9135 * nohz-idle), we now have balance_cpu in a position to move
9136 * load to given_cpu. In rare situations, this may cause
9137 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
9138 * independently and at the same time to move some load to
9139 * given_cpu), causing excess load to be moved to given_cpu.
9140 * This however should not happen so much in practice and
9141 * moreover subsequent load balance cycles should correct the
9142 * excess load moved.
9143 */
9144 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9145
9146 /* Prevent re-selecting dst_cpu via env's CPUs */
9147 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
9148
9149 env.dst_rq = cpu_rq(env.new_dst_cpu);
9150 env.dst_cpu = env.new_dst_cpu;
9151 env.flags &= ~LBF_DST_PINNED;
9152 env.loop = 0;
9153 env.loop_break = sched_nr_migrate_break;
9154
9155 /*
9156 * Go back to "more_balance" rather than "redo" since we
9157 * need to continue with the same src_cpu.
9158 */
9159 goto more_balance;
9160 }
9161
9162 /*
9163 * We failed to reach balance because of affinity.
9164 */
9165 if (sd_parent) {
9166 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9167
9168 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
9169 *group_imbalance = 1;
9170 }
9171
9172 /* All tasks on this runqueue were pinned by CPU affinity */
9173 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9174 __cpumask_clear_cpu(cpu_of(busiest), cpus);
9175
9176 /*
9177 * Attempting to continue load balancing at the current
9178 * sched_domain level only makes sense as long as CPUs
9179 * outside our own group remain in env.cpus: retry with
9180 * the busiest CPU masked out, otherwise every candidate
9181 * task was pinned and we give up.
9182 */
9183 if (!cpumask_subset(cpus, env.dst_grpmask)) {
9184 env.loop = 0;
9185 env.loop_break = sched_nr_migrate_break;
9186 goto redo;
9187 }
9188 goto out_all_pinned;
9189 }
9190 }
9191
9192 if (!ld_moved) {
9193 schedstat_inc(sd->lb_failed[idle]);
9194 /*
9195 * Increment the failure counter only on periodic balance.
9196 * We do not want newidle balance, which can be very
9197 * frequent, to pollute the failure counter, causing
9198 * excessive cache_hot migrations and active balances.
9199 */
9200 if (idle != CPU_NEWLY_IDLE)
9201 sd->nr_balance_failed++;
9202
9203 if (need_active_balance(&env)) {
9204 unsigned long flags;
9205
9206 raw_spin_lock_irqsave(&busiest->lock, flags);
9207
9208 /*
9209 * Don't kick the active_load_balance_cpu_stop
9210 * if the curr task on the busiest CPU can't be
9211 * moved to this_cpu:
9212 */
9213 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
9214 raw_spin_unlock_irqrestore(&busiest->lock,
9215 flags);
9216 env.flags |= LBF_ALL_PINNED;
9217 goto out_one_pinned;
9218 }
9219
9220 /*
9221 * ->active_balance synchronizes accesses to
9222 * ->active_balance_work.  Once set, it's cleared
9223 * only after active load balance is finished.
9224 */
9225 if (!busiest->active_balance) {
9226 busiest->active_balance = 1;
9227 busiest->push_cpu = this_cpu;
9228 active_balance = 1;
9229 }
9230 raw_spin_unlock_irqrestore(&busiest->lock, flags);
9231
9232 if (active_balance) {
9233 stop_one_cpu_nowait(cpu_of(busiest),
9234 active_load_balance_cpu_stop, busiest,
9235 &busiest->active_balance_work);
9236 }
9237
9238 /* We've kicked active balancing, force task migration. */
9239 sd->nr_balance_failed = sd->cache_nice_tries+1;
9240 }
9241 } else
9242 sd->nr_balance_failed = 0;
9243
9244 if (likely(!active_balance) || voluntary_active_balance(&env)) {
9245 /* We were unbalanced, so reset the balancing interval */
9246 sd->balance_interval = sd->min_interval;
9247 } else {
9248 /*
9249 * If we've begun active balancing, start to back off. This
9250 * case may not be covered by the all_pinned logic if there
9251 * is only 1 task on the busy runqueue (because we don't call
9252 * detach_tasks).
9253 */
9254 if (sd->balance_interval < sd->max_interval)
9255 sd->balance_interval *= 2;
9256 }
9257
9258 goto out;
9259
9260out_balanced:
9261 /*
9262 * We reached balance although we may have faced some affinity
9263 * constraints; clear the group imbalance flag if it was set.
9264 */
9265 if (sd_parent) {
9266 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9267
9268 if (*group_imbalance)
9269 *group_imbalance = 0;
9270 }
9271
9272out_all_pinned:
9273 /*
9274 * We reach balance because all tasks are pinned at this level, so
9275 * we can't migrate them. Leave the imbalance flag set so the parent
9276 * level can try to migrate them.
9277 */
9278 schedstat_inc(sd->lb_balanced[idle]);
9279
9280 sd->nr_balance_failed = 0;
9281
9282out_one_pinned:
9283 ld_moved = 0;
9284
9285
9286 /*
9287 * idle_balance() disregards balance intervals, so we could
9288 * repeatedly reach this code, which would lead to balance_interval
9289 * skyrocketing in a short amount of time. Prevent it from growing too big.
9290 */
9291 if (env.idle == CPU_NEWLY_IDLE)
9292 goto out;
9293
9294 /* tune up the balancing interval */
9295 if ((env.flags & LBF_ALL_PINNED &&
9296 sd->balance_interval < MAX_PINNED_INTERVAL) ||
9297 sd->balance_interval < sd->max_interval)
9298 sd->balance_interval *= 2;
9299out:
9300 return ld_moved;
9301}
9302
9303static inline unsigned long
9304get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9305{
9306 unsigned long interval = sd->balance_interval;
9307
9308 if (cpu_busy)
9309 interval *= sd->busy_factor;
9310
9311 /* scale ms to jiffies */
9312 interval = msecs_to_jiffies(interval);
9313 interval = clamp(interval, 1UL, max_load_balance_interval);
9314
9315 return interval;
9316}
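/*
 * Illustrative numbers: with sd->balance_interval == 8 (ms) and the
 * common busy_factor of 32, a busy CPU rebalances roughly every 256ms
 * while an idle one keeps the 8ms interval; both are then converted to
 * jiffies and clamped to [1, max_load_balance_interval].
 */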
9317
9318static inline void
9319update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
9320{
9321 unsigned long interval, next;
9322
9323 /* used by idle balance, so cpu_busy = 0 */
9324 interval = get_sd_balance_interval(sd, 0);
9325 next = sd->last_balance + interval;
9326
9327 if (time_after(*next_balance, next))
9328 *next_balance = next;
9329}
9330
9331/*
9332 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
9333 * running tasks off the busiest CPU onto idle CPUs. It requires at
9334 * least 1 task to be running on each physical CPU where possible, and
9335 * avoids physical / logical imbalances.
9336 */
9337static int active_load_balance_cpu_stop(void *data)
9338{
9339 struct rq *busiest_rq = data;
9340 int busiest_cpu = cpu_of(busiest_rq);
9341 int target_cpu = busiest_rq->push_cpu;
9342 struct rq *target_rq = cpu_rq(target_cpu);
9343 struct sched_domain *sd;
9344 struct task_struct *p = NULL;
9345 struct rq_flags rf;
9346
9347 rq_lock_irq(busiest_rq, &rf);
9348 /*
9349 * Between queueing the stop-work and running it is a hole in which
9350 * CPUs can become inactive. We should not move tasks from or to
9351 * inactive CPUs.
9352 */
9353 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
9354 goto out_unlock;
9355
9356 /* Make sure the requested CPU hasn't gone down in the meantime: */
9357 if (unlikely(busiest_cpu != smp_processor_id() ||
9358 !busiest_rq->active_balance))
9359 goto out_unlock;
9360
9361 /* Is there any task to move? */
9362 if (busiest_rq->nr_running <= 1)
9363 goto out_unlock;
9364
9365 /*
9366 * This condition is "impossible"; if it occurs
9367 * we need to fix it. Originally reported by
9368 * Bjorn Helgaas on a 128-CPU setup.
9369 */
9370 BUG_ON(busiest_rq == target_rq);
9371
9372 /* Search for an sd spanning us and the target CPU. */
9373 rcu_read_lock();
9374 for_each_domain(target_cpu, sd) {
9375 if ((sd->flags & SD_LOAD_BALANCE) &&
9376 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9377 break;
9378 }
9379
9380 if (likely(sd)) {
9381 struct lb_env env = {
9382 .sd = sd,
9383 .dst_cpu = target_cpu,
9384 .dst_rq = target_rq,
9385 .src_cpu = busiest_rq->cpu,
9386 .src_rq = busiest_rq,
9387 .idle = CPU_IDLE,
9388 /*
9389 * can_migrate_task() doesn't need to compute new_dst_cpu
9390 * for active balancing. Since we have CPU_IDLE, but no
9391 * @dst_grpmask, we need to make that test go away by lying
9392 * about DST_PINNED.
9393 */
9394 .flags = LBF_DST_PINNED,
9395 };
9396
9397 schedstat_inc(sd->alb_count);
9398 update_rq_clock(busiest_rq);
9399
9400 p = detach_one_task(&env);
9401 if (p) {
9402 schedstat_inc(sd->alb_pushed);
9403 /* Active balancing done, reset the failure counter. */
9404 sd->nr_balance_failed = 0;
9405 } else {
9406 schedstat_inc(sd->alb_failed);
9407 }
9408 }
9409 rcu_read_unlock();
9410out_unlock:
9411 busiest_rq->active_balance = 0;
9412 rq_unlock(busiest_rq, &rf);
9413
9414 if (p)
9415 attach_one_task(target_rq, p);
9416
9417 local_irq_enable();
9418
9419 return 0;
9420}
9421
9422static DEFINE_SPINLOCK(balancing);
9423
9424/*
9425 * Scale the max load_balance interval with the number of CPUs in the system.
9426 * This trades load-balance latency on larger machines for less cross talk.
9427 */
9428void update_max_interval(void)
9429{
9430 max_load_balance_interval = HZ*num_online_cpus()/10;
9431}
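/*
 * Example (illustrative): with HZ == 250 and 8 CPUs online this yields
 * 250 * 8 / 10 == 200 jiffies, i.e. a ceiling of 800ms between
 * successive balances of any one domain; see the clamp in
 * get_sd_balance_interval().
 */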
9432
9433/*
9434 * It checks each scheduling domain to see if it is due to be balanced,
9435 * and initiates a balancing operation if so.
9436 *
9437 * Balancing parameters are set up in init_sched_domains.
9438 */
9439static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9440{
9441 int continue_balancing = 1;
9442 int cpu = rq->cpu;
9443 unsigned long interval;
9444 struct sched_domain *sd;
9445 /* Earliest time when we have to do rebalance again */
9446 unsigned long next_balance = jiffies + 60*HZ;
9447 int update_next_balance = 0;
9448 int need_serialize, need_decay = 0;
9449 u64 max_cost = 0;
9450
9451 rcu_read_lock();
9452 for_each_domain(cpu, sd) {
9453 /*
9454 * Decay the newidle max times here because this is a regular
9455 * visit to all the domains. Decay ~1% per second.
9456 */
9457 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9458 sd->max_newidle_lb_cost =
9459 (sd->max_newidle_lb_cost * 253) / 256;
9460 sd->next_decay_max_lb_cost = jiffies + HZ;
9461 need_decay = 1;
9462 }
9463 max_cost += sd->max_newidle_lb_cost;
9464
9465 if (!(sd->flags & SD_LOAD_BALANCE))
9466 continue;
9467
9468 /*
9469 * Stop the load balance at this level. There is another
9470 * CPU in our sched group which is doing load balancing more
9471 * actively.
9472 */
9473 if (!continue_balancing) {
9474 if (need_decay)
9475 continue;
9476 break;
9477 }
9478
9479 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9480
9481 need_serialize = sd->flags & SD_SERIALIZE;
9482 if (need_serialize) {
9483 if (!spin_trylock(&balancing))
9484 goto out;
9485 }
9486
9487 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9488 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9489 /*
9490 * The LBF_DST_PINNED logic could have changed
9491 * env->dst_cpu, so we can't know our idle
9492 * state even if we migrated tasks. Update it.
9493 */
9494 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9495 }
9496 sd->last_balance = jiffies;
9497 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9498 }
9499 if (need_serialize)
9500 spin_unlock(&balancing);
9501out:
9502 if (time_after(next_balance, sd->last_balance + interval)) {
9503 next_balance = sd->last_balance + interval;
9504 update_next_balance = 1;
9505 }
9506 }
9507 if (need_decay) {
9508 /*
9509 * Ensure the rq-wide value also decays but keep it at a
9510 * reasonable floor to avoid funnies with rq->avg_idle.
9511 */
9512 rq->max_idle_balance_cost =
9513 max((u64)sysctl_sched_migration_cost, max_cost);
9514 }
9515 rcu_read_unlock();
9516
9517 /*
9518 * next_balance will be updated only when there is a need.
9519 * When the CPU is attached to a null domain, for example,
9520 * it will not be updated.
9521 */
9522 if (likely(update_next_balance)) {
9523 rq->next_balance = next_balance;
9524
9525#ifdef CONFIG_NO_HZ_COMMON
9526 /*
9527 * If this CPU has been elected to perform the nohz idle
9528 * balance, the other idle CPUs have already rebalanced with
9529 * nohz_idle_balance() and nohz.next_balance has been
9530 * updated accordingly. This CPU is now running the idle load
9531 * balance for itself and we need to update
9532 * nohz.next_balance accordingly.
9533 */
9534 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9535 nohz.next_balance = rq->next_balance;
9536#endif
9537 }
9538}
9539
9540static inline int on_null_domain(struct rq *rq)
9541{
9542 return unlikely(!rcu_dereference_sched(rq->sd));
9543}
9544
9545#ifdef CONFIG_NO_HZ_COMMON
9546/*
9547 * idle load balancing details
9548 * - When one of the busy CPUs notices that there may be an idle rebalancing
9549 *   needed, it will kick the idle load balancer, which then does idle
9550 *   load balancing for all the idle CPUs.
9551 * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED is not
9552 *   set anywhere yet.
9553 */
9554
9555static inline int find_new_ilb(void)
9556{
9557 int ilb;
9558
9559 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
9560 housekeeping_cpumask(HK_FLAG_MISC)) {
9561 if (idle_cpu(ilb))
9562 return ilb;
9563 }
9564
9565 return nr_cpu_ids;
9566}
9567
9568/*
9569 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
9570 * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
9571 */
9572static void kick_ilb(unsigned int flags)
9573{
9574 int ilb_cpu;
9575
9576 nohz.next_balance++;
9577
9578 ilb_cpu = find_new_ilb();
9579
9580 if (ilb_cpu >= nr_cpu_ids)
9581 return;
9582
9583 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9584 if (flags & NOHZ_KICK_MASK)
9585 return;
9586
9587 /*
9588 * Use smp_send_reschedule() instead of resched_cpu().
9589 * This way we generate a sched IPI on the target CPU which
9590 * is idle, and the softirq performing the nohz idle load balance
9591 * will be run before returning from the IPI.
9592 */
9593 smp_send_reschedule(ilb_cpu);
9594}
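/*
 * Note that the atomic_fetch_or() above also serializes concurrent
 * kickers: only the caller that observes no NOHZ_KICK_MASK bit
 * previously set sends the IPI, so two CPUs racing to kick the same
 * ILB CPU generate a single reschedule IPI between them.
 */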
9595
9596/*
9597 * Current decision point for kicking the idle load balancer in the presence
9598 * of idle CPUs in the system; see the individual checks below.
9599 */
9600static void nohz_balancer_kick(struct rq *rq)
9601{
9602 unsigned long now = jiffies;
9603 struct sched_domain_shared *sds;
9604 struct sched_domain *sd;
9605 int nr_busy, i, cpu = rq->cpu;
9606 unsigned int flags = 0;
9607
9608 if (unlikely(rq->idle_balance))
9609 return;
9610
9611 /*
9612 * We may have been in ticked or tickless idle mode recently. At the first
9613 * busy tick after returning from idle, we will update the busy stats.
9614 */
9615 nohz_balance_exit_idle(rq);
9616
9617 /*
9618 * None are in tickless mode and hence there is no need for NOHZ
9619 * idle load balancing.
9620 */
9621 if (likely(!atomic_read(&nohz.nr_cpus)))
9622 return;
9623
9624 if (READ_ONCE(nohz.has_blocked) &&
9625 time_after(now, READ_ONCE(nohz.next_blocked)))
9626 flags = NOHZ_STATS_KICK;
9627
9628 if (time_before(now, nohz.next_balance))
9629 goto out;
9630
9631 if (rq->nr_running >= 2) {
9632 flags = NOHZ_KICK_MASK;
9633 goto out;
9634 }
9635
9636 rcu_read_lock();
9637
9638 sd = rcu_dereference(rq->sd);
9639 if (sd) {
9640 /*
9641 * If there's a CFS task and the current CPU has reduced
9642 * capacity, kick the ILB to see if there's a better CPU to
9643 * run on.
9644 */
9645 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
9646 flags = NOHZ_KICK_MASK;
9647 goto unlock;
9648 }
9649 }
9650
9651 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
9652 if (sd) {
9653 /*
9654 * When ASYM_PACKING, see if there's a more preferred CPU
9655 * currently idle; in which case, kick the ILB to move tasks
9656 * around.
9657 */
9658 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
9659 if (sched_asym_prefer(i, cpu)) {
9660 flags = NOHZ_KICK_MASK;
9661 goto unlock;
9662 }
9663 }
9664 }
9665
9666 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
9667 if (sd) {
9668 /*
9669 * When ASYM_CPUCAPACITY, see if there's a higher capacity
9670 * CPU to run the misfit task on.
9671 */
9672 if (check_misfit_status(rq, sd)) {
9673 flags = NOHZ_KICK_MASK;
9674 goto unlock;
9675 }
9676
9677 /*
9678 * For asymmetric systems, we do not want to nicely balance
9679 * cache use, instead we want to embrace asymmetry and only
9680 * ensure tasks have enough CPU capacity.
9681 *
9682 * Skip the LLC logic because it's not relevant in that case.
9683 */
9684 goto unlock;
9685 }
9686
9687 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9688 if (sds) {
9689 /*
9690 * If there is an imbalance between LLC domains (IOW we could
9691 * increase the overall cache use), we need some less-loaded
9692 * LLC domain to pull some load. Likewise, we may need to
9693 * spread idle load throughout the system.
9694 *
9695 * Kick the ILB whenever more than one CPU in this LLC domain
9696 * is busy.
9697 */
9698 nr_busy = atomic_read(&sds->nr_busy_cpus);
9699 if (nr_busy > 1) {
9700 flags = NOHZ_KICK_MASK;
9701 goto unlock;
9702 }
9703 }
9704unlock:
9705 rcu_read_unlock();
9706out:
9707 if (flags)
9708 kick_ilb(flags);
9709}
9710
9711static void set_cpu_sd_state_busy(int cpu)
9712{
9713 struct sched_domain *sd;
9714
9715 rcu_read_lock();
9716 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9717
9718 if (!sd || !sd->nohz_idle)
9719 goto unlock;
9720 sd->nohz_idle = 0;
9721
9722 atomic_inc(&sd->shared->nr_busy_cpus);
9723unlock:
9724 rcu_read_unlock();
9725}
9726
9727void nohz_balance_exit_idle(struct rq *rq)
9728{
9729 SCHED_WARN_ON(rq != this_rq());
9730
9731 if (likely(!rq->nohz_tick_stopped))
9732 return;
9733
9734 rq->nohz_tick_stopped = 0;
9735 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9736 atomic_dec(&nohz.nr_cpus);
9737
9738 set_cpu_sd_state_busy(rq->cpu);
9739}
9740
9741static void set_cpu_sd_state_idle(int cpu)
9742{
9743 struct sched_domain *sd;
9744
9745 rcu_read_lock();
9746 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9747
9748 if (!sd || sd->nohz_idle)
9749 goto unlock;
9750 sd->nohz_idle = 1;
9751
9752 atomic_dec(&sd->shared->nr_busy_cpus);
9753unlock:
9754 rcu_read_unlock();
9755}
9756
9757/*
9758 * This routine will record that the CPU is going idle with the tick stopped.
9759 * This info will be used in performing idle load balancing in the future.
9760 */
9761void nohz_balance_enter_idle(int cpu)
9762{
9763 struct rq *rq = cpu_rq(cpu);
9764
9765 SCHED_WARN_ON(cpu != smp_processor_id());
9766
9767 /* If this CPU is going down, then nothing needs to be done: */
9768 if (!cpu_active(cpu))
9769 return;
9770
9771 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9772 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9773 return;
9774
9775 /*
9776 * Can be set safely without rq->lock held.
9777 * If a clear happens, it will have evaluated the last additions because
9778 * rq->lock is held during the check and the clear.
9779 */
9780 rq->has_blocked_load = 1;
9781
9782
9783 /*
9784 * The tick is still stopped but load could have been added in the
9785 * meantime. We set the nohz.has_blocked flag to trigger a check of the
9786 * *_avg; the clear of the flag can only happen after the new load is checked.
9787 */
9788 if (rq->nohz_tick_stopped)
9789 goto out;
9790
9791 /* If we're a completely isolated CPU, we don't play: */
9792 if (on_null_domain(rq))
9793 return;
9794
9795 rq->nohz_tick_stopped = 1;
9796
9797 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9798 atomic_inc(&nohz.nr_cpus);
9799
9800 /*
9801 * Ensures that if nohz_idle_balance() fails to observe our
9802 * @idle_cpus_mask store, it must observe the @has_blocked
9803 * store.
9804 */
9805 smp_mb__after_atomic();
9806
9807 set_cpu_sd_state_idle(cpu);
9808
9809out:
9810 /*
9811 * Each time a CPU enters idle, we assume that it has blocked load
9812 * and enable the periodic update of the load of idle CPUs.
9813 */
9814 WRITE_ONCE(nohz.has_blocked, 1);
9815}
9816
9817/*
9818 * Internal function that runs load balance for all idle CPUs. The load
9819 * balance can be a simple update of blocked load or a complete load balance
9820 * with tasks movement, depending on the flags.
9821 * The function returns false if the loop has stopped before running
9822 * through all idle CPUs.
9823 */
9824static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9825 enum cpu_idle_type idle)
9826{
9827 /* Earliest time when we have to do rebalance again */
9828 unsigned long now = jiffies;
9829 unsigned long next_balance = now + 60*HZ;
9830 bool has_blocked_load = false;
9831 int update_next_balance = 0;
9832 int this_cpu = this_rq->cpu;
9833 int balance_cpu;
9834 int ret = false;
9835 struct rq *rq;
9836
9837 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9838
9839 /*
9840 * We assume there will be no idle load after this update and clear
9841 * the has_blocked flag. If a CPU enters idle in the meantime, it will
9842 * set the has_blocked flag and trigger another update of idle load.
9843 * Because a CPU that becomes idle is added to idle_cpus_mask before
9844 * setting the flag, we are sure to not clear the state and not
9845 * check the load of an idle CPU.
9846 */
9847 WRITE_ONCE(nohz.has_blocked, 0);
9848
9849 /*
9850 * Ensures that if we miss the CPU, we must see the has_blocked
9851 * store from nohz_balance_enter_idle().
9852 */
9853 smp_mb();
9854
9855 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9856 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9857 continue;
9858
9859 /*
9860 * If this CPU gets work to do, stop the load balancing
9861 * work being done for other CPUs. The next load
9862 * balancing owner will pick it up.
9863 */
9864 if (need_resched()) {
9865 has_blocked_load = true;
9866 goto abort;
9867 }
9868
9869 rq = cpu_rq(balance_cpu);
9870
9871 has_blocked_load |= update_nohz_stats(rq, true);
9872
9873 /*
9874 * If the time for the next balance is due,
9875 * do the balance.
9876 */
9877 if (time_after_eq(jiffies, rq->next_balance)) {
9878 struct rq_flags rf;
9879
9880 rq_lock_irqsave(rq, &rf);
9881 update_rq_clock(rq);
9882 cpu_load_update_idle(rq);
9883 rq_unlock_irqrestore(rq, &rf);
9884
9885 if (flags & NOHZ_BALANCE_KICK)
9886 rebalance_domains(rq, CPU_IDLE);
9887 }
9888
9889 if (time_after(next_balance, rq->next_balance)) {
9890 next_balance = rq->next_balance;
9891 update_next_balance = 1;
9892 }
9893 }
9894
9895 /* A newly idle CPU doesn't need an update */
9896 if (idle != CPU_NEWLY_IDLE) {
9897 update_blocked_averages(this_cpu);
9898 has_blocked_load |= this_rq->has_blocked_load;
9899 }
9900
9901 if (flags & NOHZ_BALANCE_KICK)
9902 rebalance_domains(this_rq, CPU_IDLE);
9903
9904 WRITE_ONCE(nohz.next_blocked,
9905 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9906
9907 /* The full idle balance loop has been done */
9908 ret = true;
9909
9910abort:
9911 /* There is still blocked load, enable the periodic update */
9912 if (has_blocked_load)
9913 WRITE_ONCE(nohz.has_blocked, 1);
9914
9915 /*
9916 * next_balance will be updated only when there is a need.
9917 * When the CPU is attached to a null domain, for example,
9918 * it will not be updated.
9919 */
9920 if (likely(update_next_balance))
9921 nohz.next_balance = next_balance;
9922
9923 return ret;
9924}
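/*
 * Note on @flags: with NOHZ_STATS_KICK alone the loop above is a pure
 * blocked-load refresh (update_nohz_stats() only), while
 * NOHZ_BALANCE_KICK additionally runs rebalance_domains() for every
 * idle CPU whose rq->next_balance is due, and finally for this CPU.
 */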
9925
9926/*
9927 * In the CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9928 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
9929 */
9930static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9931{
9932 int this_cpu = this_rq->cpu;
9933 unsigned int flags;
9934
9935 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9936 return false;
9937
9938 if (idle != CPU_IDLE) {
9939 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9940 return false;
9941 }
9942
9943 /* Clear the pending kick flags before balancing on their behalf. */
9944 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9945 if (!(flags & NOHZ_KICK_MASK))
9946 return false;
9947
9948 _nohz_idle_balance(this_rq, flags, idle);
9949
9950 return true;
9951}
9952
9953static void nohz_newidle_balance(struct rq *this_rq)
9954{
9955 int this_cpu = this_rq->cpu;
9956
9957 /*
9958 * This CPU doesn't want to be disturbed by scheduler
9959 * housekeeping.
9960 */
9961 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9962 return;
9963
9964 /* Will wake up very soon. No time for doing anything else. */
9965 if (this_rq->avg_idle < sysctl_sched_migration_cost)
9966 return;
9967
9968 /* Don't need to update the blocked load of idle CPUs. */
9969 if (!READ_ONCE(nohz.has_blocked) ||
9970 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9971 return;
9972
9973 raw_spin_unlock(&this_rq->lock);
9974
9975 /*
9976 * Run the ilb locally: this CPU is about to go idle and is a good
9977 * candidate to update the blocked load of idle CPUs itself.
9978 * Kick a normal ilb if the local update fails.
9979 */
9980 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9981 kick_ilb(NOHZ_STATS_KICK);
9982 raw_spin_lock(&this_rq->lock);
9983}
9984
9985#else /* !CONFIG_NO_HZ_COMMON */
9986static inline void nohz_balancer_kick(struct rq *rq) { }
9987
9988static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9989{
9990 return false;
9991}
9992
9993static inline void nohz_newidle_balance(struct rq *this_rq) { }
9994#endif /* CONFIG_NO_HZ_COMMON */
9995
9996/*
9997 * idle_balance() is called by schedule() if this_cpu is about to become
9998 * idle. It attempts to pull tasks from other CPUs.
9999 */
10000static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
10001{
10002 unsigned long next_balance = jiffies + HZ;
10003 int this_cpu = this_rq->cpu;
10004 struct sched_domain *sd;
10005 int pulled_task = 0;
10006 u64 curr_cost = 0;
10007
10008 /*
10009 * We must set idle_stamp _before_ calling idle_balance(), such that we
10010 * measure the duration of idle_balance() as idle time.
10011 */
10012 this_rq->idle_stamp = rq_clock(this_rq);
10013
10014 /*
10015 * Do not pull tasks towards !active CPUs...
10016 */
10017 if (!cpu_active(this_cpu))
10018 return 0;
10019
10020 /*
10021 * This is OK, because current is on_cpu, which avoids it being picked
10022 * for load-balance, and preemption/IRQs are still disabled, avoiding
10023 * further scheduler activity on it; and we're being very careful to
10024 * re-start the picking loop.
10025 */
10026 rq_unpin_lock(this_rq, rf);
10027
10028 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
10029 !READ_ONCE(this_rq->rd->overload)) {
10030
10031 rcu_read_lock();
10032 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10033 if (sd)
10034 update_next_balance(sd, &next_balance);
10035 rcu_read_unlock();
10036
10037 nohz_newidle_balance(this_rq);
10038
10039 goto out;
10040 }
10041
10042 raw_spin_unlock(&this_rq->lock);
10043
10044 update_blocked_averages(this_cpu);
10045 rcu_read_lock();
10046 for_each_domain(this_cpu, sd) {
10047 int continue_balancing = 1;
10048 u64 t0, domain_cost;
10049
10050 if (!(sd->flags & SD_LOAD_BALANCE))
10051 continue;
10052
10053 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10054 update_next_balance(sd, &next_balance);
10055 break;
10056 }
10057
10058 if (sd->flags & SD_BALANCE_NEWIDLE) {
10059 t0 = sched_clock_cpu(this_cpu);
10060
10061 pulled_task = load_balance(this_cpu, this_rq,
10062 sd, CPU_NEWLY_IDLE,
10063 &continue_balancing);
10064
10065 domain_cost = sched_clock_cpu(this_cpu) - t0;
10066 if (domain_cost > sd->max_newidle_lb_cost)
10067 sd->max_newidle_lb_cost = domain_cost;
10068
10069 curr_cost += domain_cost;
10070 }
10071
10072 update_next_balance(sd, &next_balance);
10073
10074 /*
10075 * Stop searching for tasks to pull if there are
10076 * now runnable tasks on this rq.
10077 */
10078 if (pulled_task || this_rq->nr_running > 0)
10079 break;
10080 }
10081 rcu_read_unlock();
10082
10083 raw_spin_lock(&this_rq->lock);
10084
10085 if (curr_cost > this_rq->max_idle_balance_cost)
10086 this_rq->max_idle_balance_cost = curr_cost;
10087
10088out:
10089 /*
10090 * While browsing the domains, we released the rq lock, so a task could
10091 * have been enqueued in the meantime. Since we're not going idle,
10092 * pretend we pulled a task.
10093 */
10094 if (this_rq->cfs.h_nr_running && !pulled_task)
10095 pulled_task = 1;
10096
10097 /* Move the next balance forward */
10098 if (time_after(this_rq->next_balance, next_balance))
10099 this_rq->next_balance = next_balance;
10100
10101 /* Is there a task of a higher priority class? */
10102 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10103 pulled_task = -1;
10104
10105 if (pulled_task)
10106 this_rq->idle_stamp = 0;
10107
10108 rq_repin_lock(this_rq, rf);
10109
10110 return pulled_task;
10111}
10112
10113/*
10114 * run_rebalance_domains is triggered when needed from the scheduler tick.
10115 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
10116 */
10117static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
10118{
10119 struct rq *this_rq = this_rq();
10120 enum cpu_idle_type idle = this_rq->idle_balance ?
10121 CPU_IDLE : CPU_NOT_IDLE;
10122
10123 /*
10124 * If this CPU has a pending nohz_balance_kick, then do the
10125 * balancing on behalf of the other idle CPUs whose ticks are
10126 * stopped. Do nohz_idle_balance *before* rebalance_domains to
10127 * give the idle CPUs a chance to load balance. Else we may
10128 * load balance only within the local sched_domain hierarchy
10129 * and abort nohz_idle_balance altogether if we pull some load.
10130 */
10131 if (nohz_idle_balance(this_rq, idle))
10132 return;
10133
10134 /* normal load balance */
10135 update_blocked_averages(this_rq->cpu);
10136 rebalance_domains(this_rq, idle);
10137}
10138
10139/*
10140 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
10141 */
10142void trigger_load_balance(struct rq *rq)
10143{
10144 /* Don't need to rebalance while attached to a NULL domain */
10145 if (unlikely(on_null_domain(rq)))
10146 return;
10147
10148 if (time_after_eq(jiffies, rq->next_balance))
10149 raise_softirq(SCHED_SOFTIRQ);
10150
10151 nohz_balancer_kick(rq);
10152}
10153
10154static void rq_online_fair(struct rq *rq)
10155{
10156 update_sysctl();
10157
10158 update_runtime_enabled(rq);
10159}
10160
10161static void rq_offline_fair(struct rq *rq)
10162{
10163 update_sysctl();
10164
10165 /* Ensure any throttled groups are reachable by pick_next_task */
10166 unthrottle_offline_cfs_rqs(rq);
10167}
10168
10169#endif /* CONFIG_SMP */
10170
10171/*
10172 * scheduler tick hitting a task of our scheduling class.
10173 *
10174 * NOTE: This function can be called remotely by the tick offload that
10175 * goes along with full dynticks. Therefore no local assumption can be made
10176 * and everything must be accessed through the @rq and @curr passed in
10177 * parameters.
10178 */
10179static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
10180{
10181 struct cfs_rq *cfs_rq;
10182 struct sched_entity *se = &curr->se;
10183
10184 for_each_sched_entity(se) {
10185 cfs_rq = cfs_rq_of(se);
10186 entity_tick(cfs_rq, se, queued);
10187 }
10188
10189 if (static_branch_unlikely(&sched_numa_balancing))
10190 task_tick_numa(rq, curr);
10191
10192 update_misfit_status(curr, rq);
10193 update_overutilized_status(task_rq(curr));
10194}
10195
10196/*
10197 * called on fork with the child task as argument from the parent's context
10198 *  - child not yet on the tasklist
10199 *  - preemption disabled
10200 */
10201static void task_fork_fair(struct task_struct *p)
10202{
10203 struct cfs_rq *cfs_rq;
10204 struct sched_entity *se = &p->se, *curr;
10205 struct rq *rq = this_rq();
10206 struct rq_flags rf;
10207
10208 rq_lock(rq, &rf);
10209 update_rq_clock(rq);
10210
10211 cfs_rq = task_cfs_rq(current);
10212 curr = cfs_rq->curr;
10213 if (curr) {
10214 update_curr(cfs_rq);
10215 se->vruntime = curr->vruntime;
10216 }
10217 place_entity(cfs_rq, se, 1);
10218
10219 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
10220 /*
10221 * Upon rescheduling, sched_class::put_prev_task() will place
10222 * 'current' within the tree based on its new key value.
10223 */
10224 swap(curr->vruntime, se->vruntime);
10225 resched_curr(rq);
10226 }
10227
10228 se->vruntime -= cfs_rq->min_vruntime;
10229 rq_unlock(rq, &rf);
10230}
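/*
 * The final 'se->vruntime -= cfs_rq->min_vruntime' above stores the
 * child's vruntime relative to its cfs_rq. Illustrative numbers: with
 * min_vruntime == 1000000 and a placed vruntime of 1003000 the child
 * carries +3000; enqueueing then adds the min_vruntime of whichever
 * cfs_rq the child is finally woken on, so its lag survives a
 * cross-CPU wakeup.
 */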
10231
10232/*
10233 * Priority of the task has changed. Check to see if we preempt
10234 * the current task.
10235 */
10236static void
10237prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10238{
10239 if (!task_on_rq_queued(p))
10240 return;
10241
10242 /*
10243 * Reschedule if we are currently running on this runqueue and
10244 * our priority decreased, or if we are not currently running on
10245 * this runqueue and our priority is higher than the current's.
10246 */
10247 if (rq->curr == p) {
10248 if (p->prio > oldprio)
10249 resched_curr(rq);
10250 } else
10251 check_preempt_curr(rq, p, 0);
10252}
10253
10254static inline bool vruntime_normalized(struct task_struct *p)
10255{
10256 struct sched_entity *se = &p->se;
10257
10258 /*
10259 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
10260 * the dequeue_entity(.flags=0) will already have normalized the
10261 * vruntime.
10262 */
10263 if (p->on_rq)
10264 return true;
10265
10266 /*
10267 * When !on_rq, the vruntime of the task has usually NOT been normalized.
10268 * But there are some cases where it has already been normalized:
10269 *
10270 * - A forked child which is waiting for being woken up by
10271 *   wake_up_new_task().
10272 * - A task which has been woken up by try_to_wake_up() and is
10273 *   waiting for actually being woken up by sched_ttwu_pending().
10274 */
10275 if (!se->sum_exec_runtime ||
10276 (p->state == TASK_WAKING && p->sched_remote_wakeup))
10277 return true;
10278
10279 return false;
10280}
10281
10282#ifdef CONFIG_FAIR_GROUP_SCHED
10283/*
10284 * Propagate the changes of the sched_entity across the tg tree to make it
10285 * visible to the root.
10286 */
10287static void propagate_entity_cfs_rq(struct sched_entity *se)
10288{
10289 struct cfs_rq *cfs_rq;
10290
10291 /* Start to propagate at the parent */
10292 se = se->parent;
10293
10294 for_each_sched_entity(se) {
10295 cfs_rq = cfs_rq_of(se);
10296
10297 if (cfs_rq_throttled(cfs_rq))
10298 break;
10299
10300 update_load_avg(cfs_rq, se, UPDATE_TG);
10301 }
10302}
10303#else
10304static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10305#endif
10306
10307static void detach_entity_cfs_rq(struct sched_entity *se)
10308{
10309 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10310
10311 /* Catch up with the cfs_rq and remove our load when we leave */
10312 update_load_avg(cfs_rq, se, 0);
10313 detach_entity_load_avg(cfs_rq, se);
10314 update_tg_load_avg(cfs_rq, false);
10315 propagate_entity_cfs_rq(se);
10316}
10317
10318static void attach_entity_cfs_rq(struct sched_entity *se)
10319{
10320 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10321
10322#ifdef CONFIG_FAIR_GROUP_SCHED
10323 /*
10324 * Since the real depth could have been changed (only the FAIR
10325 * class maintains a depth value), reset the depth properly.
10326 */
10327 se->depth = se->parent ? se->parent->depth + 1 : 0;
10328#endif
10329
10330 /* Synchronize the entity with its cfs_rq */
10331 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10332 attach_entity_load_avg(cfs_rq, se, 0);
10333 update_tg_load_avg(cfs_rq, false);
10334 propagate_entity_cfs_rq(se);
10335}
10336
10337static void detach_task_cfs_rq(struct task_struct *p)
10338{
10339 struct sched_entity *se = &p->se;
10340 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10341
10342 if (!vruntime_normalized(p)) {
10343 /*
10344 * Fix up our vruntime so that the current sleep doesn't
10345 * cause an 'unlimited' sleep bonus.
10346 */
10347 place_entity(cfs_rq, se, 0);
10348 se->vruntime -= cfs_rq->min_vruntime;
10349 }
10350
10351 detach_entity_cfs_rq(se);
10352}
10353
10354static void attach_task_cfs_rq(struct task_struct *p)
10355{
10356 struct sched_entity *se = &p->se;
10357 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10358
10359 attach_entity_cfs_rq(se);
10360
10361 if (!vruntime_normalized(p))
10362 se->vruntime += cfs_rq->min_vruntime;
10363}
10364
10365static void switched_from_fair(struct rq *rq, struct task_struct *p)
10366{
10367 detach_task_cfs_rq(p);
10368}
10369
10370static void switched_to_fair(struct rq *rq, struct task_struct *p)
10371{
10372 attach_task_cfs_rq(p);
10373
10374 if (task_on_rq_queued(p)) {
10375 /*
10376 * We were most likely switched from sched_rt, so
10377 * kick off the schedule if running, otherwise just see
10378 * if we can still preempt the current task.
10379 */
10380 if (rq->curr == p)
10381 resched_curr(rq);
10382 else
10383 check_preempt_curr(rq, p, 0);
10384 }
10385}
10386
10387/*
10388 * Account for a task changing its policy or group.
10389 * This routine is mostly called to set cfs_rq->curr field when a task
10390 * migrates between groups/classes.
10391 */
10392static void set_curr_task_fair(struct rq *rq)
10393{
10394 struct sched_entity *se = &rq->curr->se;
10395
10396 for_each_sched_entity(se) {
10397 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10398
10399 set_next_entity(cfs_rq, se);
10400 /* ensure bandwidth has been allocated on our new cfs_rq */
10401 account_cfs_rq_runtime(cfs_rq, 0);
10402 }
10403}
10404
10405void init_cfs_rq(struct cfs_rq *cfs_rq)
10406{
10407 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
10408 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10409#ifndef CONFIG_64BIT
10410 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10411#endif
10412#ifdef CONFIG_SMP
10413 raw_spin_lock_init(&cfs_rq->removed.lock);
10414#endif
10415}
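/*
 * Note: min_vruntime deliberately starts at -(1 << 20) -- roughly 1ms
 * of NICE_0 runtime short of u64 wrap-around -- rather than at 0, so
 * code that mishandles vruntime wrap trips almost immediately after
 * boot instead of after a very long uptime.
 */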
10416
10417#ifdef CONFIG_FAIR_GROUP_SCHED
10418static void task_set_group_fair(struct task_struct *p)
10419{
10420 struct sched_entity *se = &p->se;
10421
10422 set_task_rq(p, task_cpu(p));
10423 se->depth = se->parent ? se->parent->depth + 1 : 0;
10424}
10425
10426static void task_move_group_fair(struct task_struct *p)
10427{
10428 detach_task_cfs_rq(p);
10429 set_task_rq(p, task_cpu(p));
10430
10431#ifdef CONFIG_SMP
10432 /* Tell se its cfs_rq has been changed -- migrated */
10433 p->se.avg.last_update_time = 0;
10434#endif
10435 attach_task_cfs_rq(p);
10436}
10437
10438static void task_change_group_fair(struct task_struct *p, int type)
10439{
10440 switch (type) {
10441 case TASK_SET_GROUP:
10442 task_set_group_fair(p);
10443 break;
10444
10445 case TASK_MOVE_GROUP:
10446 task_move_group_fair(p);
10447 break;
10448 }
10449}
10450
10451void free_fair_sched_group(struct task_group *tg)
10452{
10453 int i;
10454
10455 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10456
10457 for_each_possible_cpu(i) {
10458 if (tg->cfs_rq)
10459 kfree(tg->cfs_rq[i]);
10460 if (tg->se)
10461 kfree(tg->se[i]);
10462 }
10463
10464 kfree(tg->cfs_rq);
10465 kfree(tg->se);
10466}
10467
10468int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10469{
10470 struct sched_entity *se;
10471 struct cfs_rq *cfs_rq;
10472 int i;
10473
10474 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
10475 if (!tg->cfs_rq)
10476 goto err;
10477 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
10478 if (!tg->se)
10479 goto err;
10480
10481 tg->shares = NICE_0_LOAD;
10482
10483 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10484
10485 for_each_possible_cpu(i) {
10486 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10487 GFP_KERNEL, cpu_to_node(i));
10488 if (!cfs_rq)
10489 goto err;
10490
10491 se = kzalloc_node(sizeof(struct sched_entity),
10492 GFP_KERNEL, cpu_to_node(i));
10493 if (!se)
10494 goto err_free_rq;
10495
10496 init_cfs_rq(cfs_rq);
10497 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10498 init_entity_runnable_average(se);
10499 }
10500
10501 return 1;
10502
10503err_free_rq:
10504 kfree(cfs_rq);
10505err:
10506 return 0;
10507}
10508
10509void online_fair_sched_group(struct task_group *tg)
10510{
10511 struct sched_entity *se;
10512 struct rq *rq;
10513 int i;
10514
10515 for_each_possible_cpu(i) {
10516 rq = cpu_rq(i);
10517 se = tg->se[i];
10518
10519 raw_spin_lock_irq(&rq->lock);
10520 update_rq_clock(rq);
10521 attach_entity_cfs_rq(se);
10522 sync_throttle(tg, i);
10523 raw_spin_unlock_irq(&rq->lock);
10524 }
10525}
10526
10527void unregister_fair_sched_group(struct task_group *tg)
10528{
10529 unsigned long flags;
10530 struct rq *rq;
10531 int cpu;
10532
10533 for_each_possible_cpu(cpu) {
10534 if (tg->se[cpu])
10535 remove_entity_load_avg(tg->se[cpu]);
10536
10537 /*
10538 * Only empty task groups can be destroyed; so we can speculatively
10539 * check on_list without danger of it being re-added.
10540 */
10541 if (!tg->cfs_rq[cpu]->on_list)
10542 continue;
10543
10544 rq = cpu_rq(cpu);
10545
10546 raw_spin_lock_irqsave(&rq->lock, flags);
10547 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10548 raw_spin_unlock_irqrestore(&rq->lock, flags);
10549 }
10550}
10551
10552void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10553 struct sched_entity *se, int cpu,
10554 struct sched_entity *parent)
10555{
10556 struct rq *rq = cpu_rq(cpu);
10557
10558 cfs_rq->tg = tg;
10559 cfs_rq->rq = rq;
10560 init_cfs_rq_runtime(cfs_rq);
10561
10562 tg->cfs_rq[cpu] = cfs_rq;
10563 tg->se[cpu] = se;
10564
10565 /* se could be NULL for root_task_group */
10566 if (!se)
10567 return;
10568
10569 if (!parent) {
10570 se->cfs_rq = &rq->cfs;
10571 se->depth = 0;
10572 } else {
10573 se->cfs_rq = parent->my_q;
10574 se->depth = parent->depth + 1;
10575 }
10576
10577 se->my_q = cfs_rq;
10578
10579 update_load_set(&se->load, NICE_0_LOAD);
10580 se->parent = parent;
10581}
10582
10583static DEFINE_MUTEX(shares_mutex);
10584
10585int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10586{
10587 int i;
10588
10589 /*
10590 * We can't change the weight of the root cgroup.
10591 */
10592 if (!tg->se[0])
10593 return -EINVAL;
10594
10595 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10596
10597 mutex_lock(&shares_mutex);
10598 if (tg->shares == shares)
10599 goto done;
10600
10601 tg->shares = shares;
10602 for_each_possible_cpu(i) {
10603 struct rq *rq = cpu_rq(i);
10604 struct sched_entity *se = tg->se[i];
10605 struct rq_flags rf;
10606
10607 /* Propagate the contribution to the hierarchy */
10608 rq_lock_irqsave(rq, &rf);
10609 update_rq_clock(rq);
10610 for_each_sched_entity(se) {
10611 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
10612 update_cfs_group(se);
10613 }
10614 rq_unlock_irqrestore(rq, &rf);
10615 }
10616
10617done:
10618 mutex_unlock(&shares_mutex);
10619 return 0;
10620}
10621#else /* CONFIG_FAIR_GROUP_SCHED */
10622
10623void free_fair_sched_group(struct task_group *tg) { }
10624
10625int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10626{
10627 return 1;
10628}
10629
10630void online_fair_sched_group(struct task_group *tg) { }
10631
10632void unregister_fair_sched_group(struct task_group *tg) { }
10633
10634#endif /* CONFIG_FAIR_GROUP_SCHED */
10635
10636
10637static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10638{
10639 struct sched_entity *se = &task->se;
10640 unsigned int rr_interval = 0;
10641
10642 /*
10643 * The time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10644 * idle runqueue:
10645 */
10646 if (rq->cfs.load.weight)
10647 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10648
10649 return rr_interval;
10650}
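/*
 * Illustrative numbers: with a 6ms sched_latency and two equally
 * weighted runnable tasks, sched_slice() is 3ms, so
 * sched_rr_get_interval() reports NS_TO_JIFFIES(3000000) -- e.g. 3
 * jiffies at HZ == 1000.
 */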
10651
10652/*
10653 * All the scheduling class methods:
10654 */
10655const struct sched_class fair_sched_class = {
10656 .next = &idle_sched_class,
10657 .enqueue_task = enqueue_task_fair,
10658 .dequeue_task = dequeue_task_fair,
10659 .yield_task = yield_task_fair,
10660 .yield_to_task = yield_to_task_fair,
10661
10662 .check_preempt_curr = check_preempt_wakeup,
10663
10664 .pick_next_task = pick_next_task_fair,
10665 .put_prev_task = put_prev_task_fair,
10666
10667#ifdef CONFIG_SMP
10668 .select_task_rq = select_task_rq_fair,
10669 .migrate_task_rq = migrate_task_rq_fair,
10670
10671 .rq_online = rq_online_fair,
10672 .rq_offline = rq_offline_fair,
10673
10674 .task_dead = task_dead_fair,
10675 .set_cpus_allowed = set_cpus_allowed_common,
10676#endif
10677
10678 .set_curr_task = set_curr_task_fair,
10679 .task_tick = task_tick_fair,
10680 .task_fork = task_fork_fair,
10681
10682 .prio_changed = prio_changed_fair,
10683 .switched_from = switched_from_fair,
10684 .switched_to = switched_to_fair,
10685
10686 .get_rr_interval = get_rr_interval_fair,
10687
10688 .update_curr = update_curr_fair,
10689
10690#ifdef CONFIG_FAIR_GROUP_SCHED
10691 .task_change_group = task_change_group_fair,
10692#endif
10693};
10694
10695#ifdef CONFIG_SCHED_DEBUG
10696void print_cfs_stats(struct seq_file *m, int cpu)
10697{
10698 struct cfs_rq *cfs_rq, *pos;
10699
10700 rcu_read_lock();
10701 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10702 print_cfs_rq(m, cpu, cfs_rq);
10703 rcu_read_unlock();
10704}
10705
10706#ifdef CONFIG_NUMA_BALANCING
10707void show_numa_stats(struct task_struct *p, struct seq_file *m)
10708{
10709 int node;
10710 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10711
10712 for_each_online_node(node) {
10713 if (p->numa_faults) {
10714 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10715 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10716 }
10717 if (p->numa_group) {
10718 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
10719 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10720 }
10721 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10722 }
10723}
10724#endif /* CONFIG_NUMA_BALANCING */
10725#endif /* CONFIG_SCHED_DEBUG */
10726
10727__init void init_sched_fair_class(void)
10728{
10729#ifdef CONFIG_SMP
10730 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10731
10732#ifdef CONFIG_NO_HZ_COMMON
10733 nohz.next_balance = jiffies;
10734 nohz.next_blocked = jiffies;
10735 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10736#endif
10737#endif /* CONFIG_SMP */
10738
10739}
10740