/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 */
#include <linux/sched/mm.h>
#include <linux/sched/topology.h>

#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
#include <linux/sched/isolation.h>

#include <trace/events/sched.h>

#include "sched.h"
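/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */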
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
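/*
 * The initial- and re-scaling of tunables is configurable.
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1 + ilog(ncpus))
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG, see get_update_sysctl_factor())
 */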
enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
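/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */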
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
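/*
 * Number of tasks beyond which the latency period is stretched;
 * kept at sysctl_sched_latency / sysctl_sched_min_granularity
 * (see sched_proc_update_handler()).
 */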
static unsigned int sched_nr_latency = 8;
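/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */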
unsigned int sysctl_sched_child_runs_first __read_mostly;
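/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */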
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
#ifdef CONFIG_SMP
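/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */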
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}
#endif

#ifdef CONFIG_CFS_BANDWIDTH
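/*
 * Amount of runtime to allocate from the global (tg) pool to the local
 * (per-CPU) pool each time a cfs_rq requests quota.
 *
 * (default: 5 msec, units: microseconds)
 */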
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
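/*
 * The margin used when comparing utilization with CPU capacity:
 *
 *   util * margin < capacity * 1024
 *
 * (default: ~20%)
 */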
unsigned int capacity_margin = 1280;
135
136static inline void update_load_add(struct load_weight *lw, unsigned long inc)
137{
138 lw->weight += inc;
139 lw->inv_weight = 0;
140}
141
142static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
143{
144 lw->weight -= dec;
145 lw->inv_weight = 0;
146}
147
148static inline void update_load_set(struct load_weight *lw, unsigned long w)
149{
150 lw->weight = w;
151 lw->inv_weight = 0;
152}
153
154
155
156
157
158
159
160
161
162
163static unsigned int get_update_sysctl_factor(void)
164{
165 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
166 unsigned int factor;
167
168 switch (sysctl_sched_tunable_scaling) {
169 case SCHED_TUNABLESCALING_NONE:
170 factor = 1;
171 break;
172 case SCHED_TUNABLESCALING_LINEAR:
173 factor = cpus;
174 break;
175 case SCHED_TUNABLESCALING_LOG:
176 default:
177 factor = 1 + ilog2(cpus);
178 break;
179 }
180
181 return factor;
182}
183
184static void update_sysctl(void)
185{
186 unsigned int factor = get_update_sysctl_factor();
187
188#define SET_SYSCTL(name) \
189 (sysctl_##name = (factor) * normalized_sysctl_##name)
190 SET_SYSCTL(sched_min_granularity);
191 SET_SYSCTL(sched_latency);
192 SET_SYSCTL(sched_wakeup_granularity);
193#undef SET_SYSCTL
194}
195
196void sched_init_granularity(void)
197{
198 update_sysctl();
199}
200
201#define WMULT_CONST (~0U)
202#define WMULT_SHIFT 32
203
204static void __update_inv_weight(struct load_weight *lw)
205{
206 unsigned long w;
207
208 if (likely(lw->inv_weight))
209 return;
210
211 w = scale_load_down(lw->weight);
212
213 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
214 lw->inv_weight = 1;
215 else if (unlikely(!w))
216 lw->inv_weight = WMULT_CONST;
217 else
218 lw->inv_weight = WMULT_CONST / w;
219}
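/*
 * __calc_delta() computes:
 *
 *   delta_exec * weight / lw->weight
 *
 * as (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT, shifting the
 * 32-bit factor down as needed so the intermediate product cannot overflow
 * 64 bits.
 */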
233static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
234{
235 u64 fact = scale_load_down(weight);
236 int shift = WMULT_SHIFT;
237
238 __update_inv_weight(lw);
239
240 if (unlikely(fact >> 32)) {
241 while (fact >> 32) {
242 fact >>= 1;
243 shift--;
244 }
245 }
246
247
248 fact = (u64)(u32)fact * lw->inv_weight;
249
250 while (fact >> 32) {
251 fact >>= 1;
252 shift--;
253 }
254
255 return mul_u64_u32_shr(delta_exec, fact, shift);
256}
257
258
259const struct sched_class fair_sched_class;
260
261
262
263
264
265#ifdef CONFIG_FAIR_GROUP_SCHED
266
267
268static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
269{
270 return cfs_rq->rq;
271}
272
273
274#define entity_is_task(se) (!se->my_q)
275
276static inline struct task_struct *task_of(struct sched_entity *se)
277{
278 SCHED_WARN_ON(!entity_is_task(se));
279 return container_of(se, struct task_struct, se);
280}
281
282
283#define for_each_sched_entity(se) \
284 for (; se; se = se->parent)
285
286static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
287{
288 return p->se.cfs_rq;
289}
290
291
292static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
293{
294 return se->cfs_rq;
295}
296
297
298static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
299{
300 return grp->my_q;
301}
302
303static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
304{
305 if (!cfs_rq->on_list) {
306 struct rq *rq = rq_of(cfs_rq);
307 int cpu = cpu_of(rq);
308
309
310
311
312
313
314
315
316
317 if (cfs_rq->tg->parent &&
318 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
319
320
321
322
323
324
325 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
326 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
327
328
329
330
331
332 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
333 } else if (!cfs_rq->tg->parent) {
334
335
336
337
338 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
339 &rq->leaf_cfs_rq_list);
340
341
342
343
344 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
345 } else {
346
347
348
349
350
351
352 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
353 rq->tmp_alone_branch);
354
355
356
357
358 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
359 }
360
361 cfs_rq->on_list = 1;
362 }
363}
364
365static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
366{
367 if (cfs_rq->on_list) {
368 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
369 cfs_rq->on_list = 0;
370 }
371}
372
373
374#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
375 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
376 leaf_cfs_rq_list)
377
378
379static inline struct cfs_rq *
380is_same_group(struct sched_entity *se, struct sched_entity *pse)
381{
382 if (se->cfs_rq == pse->cfs_rq)
383 return se->cfs_rq;
384
385 return NULL;
386}
387
388static inline struct sched_entity *parent_entity(struct sched_entity *se)
389{
390 return se->parent;
391}
392
393static void
394find_matching_se(struct sched_entity **se, struct sched_entity **pse)
395{
396 int se_depth, pse_depth;
397
398
399
400
401
402
403
404
405
406 se_depth = (*se)->depth;
407 pse_depth = (*pse)->depth;
408
409 while (se_depth > pse_depth) {
410 se_depth--;
411 *se = parent_entity(*se);
412 }
413
414 while (pse_depth > se_depth) {
415 pse_depth--;
416 *pse = parent_entity(*pse);
417 }
418
419 while (!is_same_group(*se, *pse)) {
420 *se = parent_entity(*se);
421 *pse = parent_entity(*pse);
422 }
423}
424
425#else
426
427static inline struct task_struct *task_of(struct sched_entity *se)
428{
429 return container_of(se, struct task_struct, se);
430}
431
432static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
433{
434 return container_of(cfs_rq, struct rq, cfs);
435}
436
437#define entity_is_task(se) 1
438
439#define for_each_sched_entity(se) \
440 for (; se; se = NULL)
441
442static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
443{
444 return &task_rq(p)->cfs;
445}
446
447static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
448{
449 struct task_struct *p = task_of(se);
450 struct rq *rq = task_rq(p);
451
452 return &rq->cfs;
453}
454
455
456static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457{
458 return NULL;
459}
460
461static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
462{
463}
464
465static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
466{
467}
468
469#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
470 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
471
472static inline struct sched_entity *parent_entity(struct sched_entity *se)
473{
474 return NULL;
475}
476
477static inline void
478find_matching_se(struct sched_entity **se, struct sched_entity **pse)
479{
480}
481
482#endif
483
484static __always_inline
485void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
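/*
 * Overflow-safe vruntime comparisons: u64 vruntime values are compared via
 * a signed delta so that wrap-around is handled correctly.
 */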
491static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
492{
493 s64 delta = (s64)(vruntime - max_vruntime);
494 if (delta > 0)
495 max_vruntime = vruntime;
496
497 return max_vruntime;
498}
499
500static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
501{
502 s64 delta = (s64)(vruntime - min_vruntime);
503 if (delta < 0)
504 min_vruntime = vruntime;
505
506 return min_vruntime;
507}
508
509static inline int entity_before(struct sched_entity *a,
510 struct sched_entity *b)
511{
512 return (s64)(a->vruntime - b->vruntime) < 0;
513}
514
515static void update_min_vruntime(struct cfs_rq *cfs_rq)
516{
517 struct sched_entity *curr = cfs_rq->curr;
518 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
519
520 u64 vruntime = cfs_rq->min_vruntime;
521
522 if (curr) {
523 if (curr->on_rq)
524 vruntime = curr->vruntime;
525 else
526 curr = NULL;
527 }
528
529 if (leftmost) {
530 struct sched_entity *se;
531 se = rb_entry(leftmost, struct sched_entity, run_node);
532
533 if (!curr)
534 vruntime = se->vruntime;
535 else
536 vruntime = min_vruntime(vruntime, se->vruntime);
537 }
538
539
540 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
541#ifndef CONFIG_64BIT
542 smp_wmb();
543 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
544#endif
545}
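/*
 * Enqueue an entity into the rb-tree, keyed by vruntime; the leftmost
 * (smallest vruntime) node is kept cached.
 */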
550static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
551{
552 struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
553 struct rb_node *parent = NULL;
554 struct sched_entity *entry;
555 bool leftmost = true;
556
557
558
559
560 while (*link) {
561 parent = *link;
562 entry = rb_entry(parent, struct sched_entity, run_node);
563
564
565
566
567 if (entity_before(se, entry)) {
568 link = &parent->rb_left;
569 } else {
570 link = &parent->rb_right;
571 leftmost = false;
572 }
573 }
574
575 rb_link_node(&se->run_node, parent, link);
576 rb_insert_color_cached(&se->run_node,
577 &cfs_rq->tasks_timeline, leftmost);
578}
579
580static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
581{
582 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
583}
584
585struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
586{
587 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
588
589 if (!left)
590 return NULL;
591
592 return rb_entry(left, struct sched_entity, run_node);
593}
594
595static struct sched_entity *__pick_next_entity(struct sched_entity *se)
596{
597 struct rb_node *next = rb_next(&se->run_node);
598
599 if (!next)
600 return NULL;
601
602 return rb_entry(next, struct sched_entity, run_node);
603}
604
605#ifdef CONFIG_SCHED_DEBUG
606struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
607{
608 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
609
610 if (!last)
611 return NULL;
612
613 return rb_entry(last, struct sched_entity, run_node);
614}
615
616
617
618
619
620int sched_proc_update_handler(struct ctl_table *table, int write,
621 void __user *buffer, size_t *lenp,
622 loff_t *ppos)
623{
624 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
625 unsigned int factor = get_update_sysctl_factor();
626
627 if (ret || !write)
628 return ret;
629
630 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
631 sysctl_sched_min_granularity);
632
633#define WRT_SYSCTL(name) \
634 (normalized_sysctl_##name = sysctl_##name / (factor))
635 WRT_SYSCTL(sched_min_granularity);
636 WRT_SYSCTL(sched_latency);
637 WRT_SYSCTL(sched_wakeup_granularity);
638#undef WRT_SYSCTL
639
640 return 0;
641}
642#endif
643
644
645
646
647static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
648{
649 if (unlikely(se->load.weight != NICE_0_LOAD))
650 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
651
652 return delta;
653}
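/*
 * The period over which every runnable task should get a turn: the targeted
 * latency, stretched to nr_running * sysctl_sched_min_granularity once
 * nr_running exceeds sched_nr_latency, so each task still gets at least the
 * minimum granularity.
 */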
663static u64 __sched_period(unsigned long nr_running)
664{
665 if (unlikely(nr_running > sched_nr_latency))
666 return nr_running * sysctl_sched_min_granularity;
667 else
668 return sysctl_sched_latency;
669}
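/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight:
 *
 *   s = p*P[w/rw]
 */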
677static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
678{
679 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
680
681 for_each_sched_entity(se) {
682 struct load_weight *load;
683 struct load_weight lw;
684
685 cfs_rq = cfs_rq_of(se);
686 load = &cfs_rq->load;
687
688 if (unlikely(!se->on_rq)) {
689 lw = cfs_rq->load;
690
691 update_load_add(&lw, se->load.weight);
692 load = &lw;
693 }
694 slice = __calc_delta(slice, se->load.weight, load);
695 }
696 return slice;
697}
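/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 *   vs = s/w
 */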
704static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
705{
706 return calc_delta_fair(sched_slice(cfs_rq, se), se);
707}
708
709#ifdef CONFIG_SMP
710
711#include "sched-pelt.h"
712
713static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
714static unsigned long task_h_load(struct task_struct *p);
715
716
717void init_entity_runnable_average(struct sched_entity *se)
718{
719 struct sched_avg *sa = &se->avg;
720
721 memset(sa, 0, sizeof(*sa));
722
723
724
725
726
727
728
729 if (entity_is_task(se))
730 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
731
732 se->runnable_weight = se->load.weight;
733
734
735}
736
737static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
738static void attach_entity_cfs_rq(struct sched_entity *se);
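/*
 * Give a new task a sane initial utilization: extrapolate from the cfs_rq's
 * current util_avg, scaled by the task's weight, and cap the result at half
 * of the cfs_rq's spare capacity.
 */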
765void post_init_entity_util_avg(struct sched_entity *se)
766{
767 struct cfs_rq *cfs_rq = cfs_rq_of(se);
768 struct sched_avg *sa = &se->avg;
769 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
770
771 if (cap > 0) {
772 if (cfs_rq->avg.util_avg != 0) {
773 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
774 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
775
776 if (sa->util_avg > cap)
777 sa->util_avg = cap;
778 } else {
779 sa->util_avg = cap;
780 }
781 }
782
783 if (entity_is_task(se)) {
784 struct task_struct *p = task_of(se);
785 if (p->sched_class != &fair_sched_class) {
786
787
788
789
790
791
792
793
794
795
796 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
797 return;
798 }
799 }
800
801 attach_entity_cfs_rq(se);
802}
803
804#else
805void init_entity_runnable_average(struct sched_entity *se)
806{
807}
808void post_init_entity_util_avg(struct sched_entity *se)
809{
810}
811static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
812{
813}
814#endif
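/*
 * Update the current task's runtime statistics.
 */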
819static void update_curr(struct cfs_rq *cfs_rq)
820{
821 struct sched_entity *curr = cfs_rq->curr;
822 u64 now = rq_clock_task(rq_of(cfs_rq));
823 u64 delta_exec;
824
825 if (unlikely(!curr))
826 return;
827
828 delta_exec = now - curr->exec_start;
829 if (unlikely((s64)delta_exec <= 0))
830 return;
831
832 curr->exec_start = now;
833
834 schedstat_set(curr->statistics.exec_max,
835 max(delta_exec, curr->statistics.exec_max));
836
837 curr->sum_exec_runtime += delta_exec;
838 schedstat_add(cfs_rq->exec_clock, delta_exec);
839
840 curr->vruntime += calc_delta_fair(delta_exec, curr);
841 update_min_vruntime(cfs_rq);
842
843 if (entity_is_task(curr)) {
844 struct task_struct *curtask = task_of(curr);
845
846 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
847 cgroup_account_cputime(curtask, delta_exec);
848 account_group_exec_runtime(curtask, delta_exec);
849 }
850
851 account_cfs_rq_runtime(cfs_rq, delta_exec);
852}
853
854static void update_curr_fair(struct rq *rq)
855{
856 update_curr(cfs_rq_of(&rq->curr->se));
857}
858
859static inline void
860update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
861{
862 u64 wait_start, prev_wait_start;
863
864 if (!schedstat_enabled())
865 return;
866
867 wait_start = rq_clock(rq_of(cfs_rq));
868 prev_wait_start = schedstat_val(se->statistics.wait_start);
869
870 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
871 likely(wait_start > prev_wait_start))
872 wait_start -= prev_wait_start;
873
874 __schedstat_set(se->statistics.wait_start, wait_start);
875}
876
877static inline void
878update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
879{
880 struct task_struct *p;
881 u64 delta;
882
883 if (!schedstat_enabled())
884 return;
885
886 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
887
888 if (entity_is_task(se)) {
889 p = task_of(se);
890 if (task_on_rq_migrating(p)) {
891
892
893
894
895
896 __schedstat_set(se->statistics.wait_start, delta);
897 return;
898 }
899 trace_sched_stat_wait(p, delta);
900 }
901
902 __schedstat_set(se->statistics.wait_max,
903 max(schedstat_val(se->statistics.wait_max), delta));
904 __schedstat_inc(se->statistics.wait_count);
905 __schedstat_add(se->statistics.wait_sum, delta);
906 __schedstat_set(se->statistics.wait_start, 0);
907}
908
909static inline void
910update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
911{
912 struct task_struct *tsk = NULL;
913 u64 sleep_start, block_start;
914
915 if (!schedstat_enabled())
916 return;
917
918 sleep_start = schedstat_val(se->statistics.sleep_start);
919 block_start = schedstat_val(se->statistics.block_start);
920
921 if (entity_is_task(se))
922 tsk = task_of(se);
923
924 if (sleep_start) {
925 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
926
927 if ((s64)delta < 0)
928 delta = 0;
929
930 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
931 __schedstat_set(se->statistics.sleep_max, delta);
932
933 __schedstat_set(se->statistics.sleep_start, 0);
934 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
935
936 if (tsk) {
937 account_scheduler_latency(tsk, delta >> 10, 1);
938 trace_sched_stat_sleep(tsk, delta);
939 }
940 }
941 if (block_start) {
942 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
943
944 if ((s64)delta < 0)
945 delta = 0;
946
947 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
948 __schedstat_set(se->statistics.block_max, delta);
949
950 __schedstat_set(se->statistics.block_start, 0);
951 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
952
953 if (tsk) {
954 if (tsk->in_iowait) {
955 __schedstat_add(se->statistics.iowait_sum, delta);
956 __schedstat_inc(se->statistics.iowait_count);
957 trace_sched_stat_iowait(tsk, delta);
958 }
959
960 trace_sched_stat_blocked(tsk, delta);
961
962
963
964
965
966
967 if (unlikely(prof_on == SLEEP_PROFILING)) {
968 profile_hits(SLEEP_PROFILING,
969 (void *)get_wchan(tsk),
970 delta >> 20);
971 }
972 account_scheduler_latency(tsk, delta >> 10, 0);
973 }
974 }
975}
976
977
978
979
980static inline void
981update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
982{
983 if (!schedstat_enabled())
984 return;
985
986
987
988
989
990 if (se != cfs_rq->curr)
991 update_stats_wait_start(cfs_rq, se);
992
993 if (flags & ENQUEUE_WAKEUP)
994 update_stats_enqueue_sleeper(cfs_rq, se);
995}
996
997static inline void
998update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
999{
1000
1001 if (!schedstat_enabled())
1002 return;
1003
1004
1005
1006
1007
1008 if (se != cfs_rq->curr)
1009 update_stats_wait_end(cfs_rq, se);
1010
1011 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1012 struct task_struct *tsk = task_of(se);
1013
1014 if (tsk->state & TASK_INTERRUPTIBLE)
1015 __schedstat_set(se->statistics.sleep_start,
1016 rq_clock(rq_of(cfs_rq)));
1017 if (tsk->state & TASK_UNINTERRUPTIBLE)
1018 __schedstat_set(se->statistics.block_start,
1019 rq_clock(rq_of(cfs_rq)));
1020 }
1021}
1022
1023
1024
1025
1026static inline void
1027update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028{
1029
1030
1031
1032 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1033}
1034
1035
1036
1037
1038
1039#ifdef CONFIG_NUMA_BALANCING
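/*
 * Bounds on the NUMA hinting-fault scan period, in ms. The effective scan
 * period is derived from the task's virtual memory size and
 * sysctl_numa_balancing_scan_size (see task_nr_scan_windows()).
 */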
1045unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1046unsigned int sysctl_numa_balancing_scan_period_max = 60000;
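/* Portion of address space to scan in MB */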
1049unsigned int sysctl_numa_balancing_scan_size = 256;
1050
1051
1052unsigned int sysctl_numa_balancing_scan_delay = 1000;
1053
1054struct numa_group {
1055 atomic_t refcount;
1056
1057 spinlock_t lock;
1058 int nr_tasks;
1059 pid_t gid;
1060 int active_nodes;
1061
1062 struct rcu_head rcu;
1063 unsigned long total_faults;
1064 unsigned long max_faults_cpu;
1065
1066
1067
1068
1069
1070 unsigned long *faults_cpu;
1071 unsigned long faults[0];
1072};
1073
1074static inline unsigned long group_faults_priv(struct numa_group *ng);
1075static inline unsigned long group_faults_shared(struct numa_group *ng);
1076
1077static unsigned int task_nr_scan_windows(struct task_struct *p)
1078{
1079 unsigned long rss = 0;
1080 unsigned long nr_scan_pages;
1081
1082
1083
1084
1085
1086
1087 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1088 rss = get_mm_rss(p->mm);
1089 if (!rss)
1090 rss = nr_scan_pages;
1091
1092 rss = round_up(rss, nr_scan_pages);
1093 return rss / nr_scan_pages;
1094}
1095
1096
1097#define MAX_SCAN_WINDOW 2560
1098
1099static unsigned int task_scan_min(struct task_struct *p)
1100{
1101 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1102 unsigned int scan, floor;
1103 unsigned int windows = 1;
1104
1105 if (scan_size < MAX_SCAN_WINDOW)
1106 windows = MAX_SCAN_WINDOW / scan_size;
1107 floor = 1000 / windows;
1108
1109 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1110 return max_t(unsigned int, floor, scan);
1111}
1112
1113static unsigned int task_scan_start(struct task_struct *p)
1114{
1115 unsigned long smin = task_scan_min(p);
1116 unsigned long period = smin;
1117
1118
1119 if (p->numa_group) {
1120 struct numa_group *ng = p->numa_group;
1121 unsigned long shared = group_faults_shared(ng);
1122 unsigned long private = group_faults_priv(ng);
1123
1124 period *= atomic_read(&ng->refcount);
1125 period *= shared + 1;
1126 period /= private + shared + 1;
1127 }
1128
1129 return max(smin, period);
1130}
1131
1132static unsigned int task_scan_max(struct task_struct *p)
1133{
1134 unsigned long smin = task_scan_min(p);
1135 unsigned long smax;
1136
1137
1138 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1139
1140
1141 if (p->numa_group) {
1142 struct numa_group *ng = p->numa_group;
1143 unsigned long shared = group_faults_shared(ng);
1144 unsigned long private = group_faults_priv(ng);
1145 unsigned long period = smax;
1146
1147 period *= atomic_read(&ng->refcount);
1148 period *= shared + 1;
1149 period /= private + shared + 1;
1150
1151 smax = max(smax, period);
1152 }
1153
1154 return max(smin, smax);
1155}
1156
1157static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1158{
1159 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1160 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1161}
1162
1163static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1164{
1165 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1166 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1167}
1168
1169
1170#define NR_NUMA_HINT_FAULT_TYPES 2
1171
1172
1173#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1174
1175
1176#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1177
1178pid_t task_numa_group_id(struct task_struct *p)
1179{
1180 return p->numa_group ? p->numa_group->gid : 0;
1181}
1182
1183
1184
1185
1186
1187
1188
1189static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1190{
1191 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1192}
1193
1194static inline unsigned long task_faults(struct task_struct *p, int nid)
1195{
1196 if (!p->numa_faults)
1197 return 0;
1198
1199 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1200 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1201}
1202
1203static inline unsigned long group_faults(struct task_struct *p, int nid)
1204{
1205 if (!p->numa_group)
1206 return 0;
1207
1208 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1209 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1210}
1211
1212static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1213{
1214 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1215 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1216}
1217
1218static inline unsigned long group_faults_priv(struct numa_group *ng)
1219{
1220 unsigned long faults = 0;
1221 int node;
1222
1223 for_each_online_node(node) {
1224 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1225 }
1226
1227 return faults;
1228}
1229
1230static inline unsigned long group_faults_shared(struct numa_group *ng)
1231{
1232 unsigned long faults = 0;
1233 int node;
1234
1235 for_each_online_node(node) {
1236 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1237 }
1238
1239 return faults;
1240}
1241
1242
1243
1244
1245
1246
1247#define ACTIVE_NODE_FRACTION 3
1248
1249static bool numa_is_active_node(int nid, struct numa_group *ng)
1250{
1251 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1252}
1253
1254
1255static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1256 int maxdist, bool task)
1257{
1258 unsigned long score = 0;
1259 int node;
1260
1261
1262
1263
1264
1265 if (sched_numa_topology_type == NUMA_DIRECT)
1266 return 0;
1267
1268
1269
1270
1271
1272 for_each_online_node(node) {
1273 unsigned long faults;
1274 int dist = node_distance(nid, node);
1275
1276
1277
1278
1279
1280 if (dist == sched_max_numa_distance || node == nid)
1281 continue;
1282
1283
1284
1285
1286
1287
1288
1289
1290 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1291 dist > maxdist)
1292 continue;
1293
1294
1295 if (task)
1296 faults = task_faults(p, node);
1297 else
1298 faults = group_faults(p, node);
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1309 faults *= (sched_max_numa_distance - dist);
1310 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1311 }
1312
1313 score += faults;
1314 }
1315
1316 return score;
1317}
1318
1319
1320
1321
1322
1323
1324
1325static inline unsigned long task_weight(struct task_struct *p, int nid,
1326 int dist)
1327{
1328 unsigned long faults, total_faults;
1329
1330 if (!p->numa_faults)
1331 return 0;
1332
1333 total_faults = p->total_numa_faults;
1334
1335 if (!total_faults)
1336 return 0;
1337
1338 faults = task_faults(p, nid);
1339 faults += score_nearby_nodes(p, nid, dist, true);
1340
1341 return 1000 * faults / total_faults;
1342}
1343
1344static inline unsigned long group_weight(struct task_struct *p, int nid,
1345 int dist)
1346{
1347 unsigned long faults, total_faults;
1348
1349 if (!p->numa_group)
1350 return 0;
1351
1352 total_faults = p->numa_group->total_faults;
1353
1354 if (!total_faults)
1355 return 0;
1356
1357 faults = group_faults(p, nid);
1358 faults += score_nearby_nodes(p, nid, dist, false);
1359
1360 return 1000 * faults / total_faults;
1361}
1362
1363bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1364 int src_nid, int dst_cpu)
1365{
1366 struct numa_group *ng = p->numa_group;
1367 int dst_nid = cpu_to_node(dst_cpu);
1368 int last_cpupid, this_cpupid;
1369
1370 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1390 if (!cpupid_pid_unset(last_cpupid) &&
1391 cpupid_to_nid(last_cpupid) != dst_nid)
1392 return false;
1393
1394
1395 if (cpupid_match_pid(p, last_cpupid))
1396 return true;
1397
1398
1399 if (!ng)
1400 return true;
1401
1402
1403
1404
1405
1406 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1407 ACTIVE_NODE_FRACTION)
1408 return true;
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1419 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1420}
1421
1422static unsigned long weighted_cpuload(struct rq *rq);
1423static unsigned long source_load(int cpu, int type);
1424static unsigned long target_load(int cpu, int type);
1425static unsigned long capacity_of(int cpu);
1426
1427
1428struct numa_stats {
1429 unsigned long nr_running;
1430 unsigned long load;
1431
1432
1433 unsigned long compute_capacity;
1434
1435
1436 unsigned long task_capacity;
1437 int has_free_capacity;
1438};
1439
1440
1441
1442
1443static void update_numa_stats(struct numa_stats *ns, int nid)
1444{
1445 int smt, cpu, cpus = 0;
1446 unsigned long capacity;
1447
1448 memset(ns, 0, sizeof(*ns));
1449 for_each_cpu(cpu, cpumask_of_node(nid)) {
1450 struct rq *rq = cpu_rq(cpu);
1451
1452 ns->nr_running += rq->nr_running;
1453 ns->load += weighted_cpuload(rq);
1454 ns->compute_capacity += capacity_of(cpu);
1455
1456 cpus++;
1457 }
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467 if (!cpus)
1468 return;
1469
1470
1471 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1472 capacity = cpus / smt;
1473
1474 ns->task_capacity = min_t(unsigned, capacity,
1475 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1476 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1477}
1478
1479struct task_numa_env {
1480 struct task_struct *p;
1481
1482 int src_cpu, src_nid;
1483 int dst_cpu, dst_nid;
1484
1485 struct numa_stats src_stats, dst_stats;
1486
1487 int imbalance_pct;
1488 int dist;
1489
1490 struct task_struct *best_task;
1491 long best_imp;
1492 int best_cpu;
1493};
1494
1495static void task_numa_assign(struct task_numa_env *env,
1496 struct task_struct *p, long imp)
1497{
1498 if (env->best_task)
1499 put_task_struct(env->best_task);
1500 if (p)
1501 get_task_struct(p);
1502
1503 env->best_task = p;
1504 env->best_imp = imp;
1505 env->best_cpu = env->dst_cpu;
1506}
1507
1508static bool load_too_imbalanced(long src_load, long dst_load,
1509 struct task_numa_env *env)
1510{
1511 long imb, old_imb;
1512 long orig_src_load, orig_dst_load;
1513 long src_capacity, dst_capacity;
1514
1515
1516
1517
1518
1519
1520
1521
1522 src_capacity = env->src_stats.compute_capacity;
1523 dst_capacity = env->dst_stats.compute_capacity;
1524
1525
1526 if (dst_load < src_load)
1527 swap(dst_load, src_load);
1528
1529
1530 imb = dst_load * src_capacity * 100 -
1531 src_load * dst_capacity * env->imbalance_pct;
1532 if (imb <= 0)
1533 return false;
1534
1535
1536
1537
1538
1539 orig_src_load = env->src_stats.load;
1540 orig_dst_load = env->dst_stats.load;
1541
1542 if (orig_dst_load < orig_src_load)
1543 swap(orig_dst_load, orig_src_load);
1544
1545 old_imb = orig_dst_load * src_capacity * 100 -
1546 orig_src_load * dst_capacity * env->imbalance_pct;
1547
1548
1549 return (imb > old_imb);
1550}
1551
1552
1553
1554
1555
1556
1557
1558static void task_numa_compare(struct task_numa_env *env,
1559 long taskimp, long groupimp)
1560{
1561 struct rq *src_rq = cpu_rq(env->src_cpu);
1562 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1563 struct task_struct *cur;
1564 long src_load, dst_load;
1565 long load;
1566 long imp = env->p->numa_group ? groupimp : taskimp;
1567 long moveimp = imp;
1568 int dist = env->dist;
1569
1570 rcu_read_lock();
1571 cur = task_rcu_dereference(&dst_rq->curr);
1572 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1573 cur = NULL;
1574
1575
1576
1577
1578
1579 if (cur == env->p)
1580 goto unlock;
1581
1582
1583
1584
1585
1586
1587
1588
1589 if (cur) {
1590
1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1592 goto unlock;
1593
1594
1595
1596
1597
1598 if (cur->numa_group == env->p->numa_group) {
1599 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1600 task_weight(cur, env->dst_nid, dist);
1601
1602
1603
1604
1605 if (cur->numa_group)
1606 imp -= imp/16;
1607 } else {
1608
1609
1610
1611
1612
1613 if (cur->numa_group)
1614 imp += group_weight(cur, env->src_nid, dist) -
1615 group_weight(cur, env->dst_nid, dist);
1616 else
1617 imp += task_weight(cur, env->src_nid, dist) -
1618 task_weight(cur, env->dst_nid, dist);
1619 }
1620 }
1621
1622 if (imp <= env->best_imp && moveimp <= env->best_imp)
1623 goto unlock;
1624
1625 if (!cur) {
1626
1627 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1628 !env->dst_stats.has_free_capacity)
1629 goto unlock;
1630
1631 goto balance;
1632 }
1633
1634
1635 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1)
1637 goto assign;
1638
1639
1640
1641
1642balance:
1643 load = task_h_load(env->p);
1644 dst_load = env->dst_stats.load + load;
1645 src_load = env->src_stats.load - load;
1646
1647 if (moveimp > imp && moveimp > env->best_imp) {
1648
1649
1650
1651
1652
1653
1654 if (!load_too_imbalanced(src_load, dst_load, env)) {
1655 imp = moveimp - 1;
1656 cur = NULL;
1657 goto assign;
1658 }
1659 }
1660
1661 if (imp <= env->best_imp)
1662 goto unlock;
1663
1664 if (cur) {
1665 load = task_h_load(cur);
1666 dst_load -= load;
1667 src_load += load;
1668 }
1669
1670 if (load_too_imbalanced(src_load, dst_load, env))
1671 goto unlock;
1672
1673
1674
1675
1676
1677 if (!cur) {
1678
1679
1680
1681
1682 local_irq_disable();
1683 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1684 env->dst_cpu);
1685 local_irq_enable();
1686 }
1687
1688assign:
1689 task_numa_assign(env, cur, imp);
1690unlock:
1691 rcu_read_unlock();
1692}
1693
1694static void task_numa_find_cpu(struct task_numa_env *env,
1695 long taskimp, long groupimp)
1696{
1697 int cpu;
1698
1699 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1700
1701 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
1702 continue;
1703
1704 env->dst_cpu = cpu;
1705 task_numa_compare(env, taskimp, groupimp);
1706 }
1707}
1708
1709
1710static bool numa_has_capacity(struct task_numa_env *env)
1711{
1712 struct numa_stats *src = &env->src_stats;
1713 struct numa_stats *dst = &env->dst_stats;
1714
1715 if (src->has_free_capacity && !dst->has_free_capacity)
1716 return false;
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726 if (src->load * dst->compute_capacity * env->imbalance_pct >
1727
1728 dst->load * src->compute_capacity * 100)
1729 return true;
1730
1731 return false;
1732}
1733
1734static int task_numa_migrate(struct task_struct *p)
1735{
1736 struct task_numa_env env = {
1737 .p = p,
1738
1739 .src_cpu = task_cpu(p),
1740 .src_nid = task_node(p),
1741
1742 .imbalance_pct = 112,
1743
1744 .best_task = NULL,
1745 .best_imp = 0,
1746 .best_cpu = -1,
1747 };
1748 struct sched_domain *sd;
1749 unsigned long taskweight, groupweight;
1750 int nid, ret, dist;
1751 long taskimp, groupimp;
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761 rcu_read_lock();
1762 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1763 if (sd)
1764 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1765 rcu_read_unlock();
1766
1767
1768
1769
1770
1771
1772
1773 if (unlikely(!sd)) {
1774 p->numa_preferred_nid = task_node(p);
1775 return -EINVAL;
1776 }
1777
1778 env.dst_nid = p->numa_preferred_nid;
1779 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1780 taskweight = task_weight(p, env.src_nid, dist);
1781 groupweight = group_weight(p, env.src_nid, dist);
1782 update_numa_stats(&env.src_stats, env.src_nid);
1783 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1784 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1785 update_numa_stats(&env.dst_stats, env.dst_nid);
1786
1787
1788 if (numa_has_capacity(&env))
1789 task_numa_find_cpu(&env, taskimp, groupimp);
1790
1791
1792
1793
1794
1795
1796
1797
1798 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1799 for_each_online_node(nid) {
1800 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1801 continue;
1802
1803 dist = node_distance(env.src_nid, env.dst_nid);
1804 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1805 dist != env.dist) {
1806 taskweight = task_weight(p, env.src_nid, dist);
1807 groupweight = group_weight(p, env.src_nid, dist);
1808 }
1809
1810
1811 taskimp = task_weight(p, nid, dist) - taskweight;
1812 groupimp = group_weight(p, nid, dist) - groupweight;
1813 if (taskimp < 0 && groupimp < 0)
1814 continue;
1815
1816 env.dist = dist;
1817 env.dst_nid = nid;
1818 update_numa_stats(&env.dst_stats, env.dst_nid);
1819 if (numa_has_capacity(&env))
1820 task_numa_find_cpu(&env, taskimp, groupimp);
1821 }
1822 }
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832 if (p->numa_group) {
1833 struct numa_group *ng = p->numa_group;
1834
1835 if (env.best_cpu == -1)
1836 nid = env.src_nid;
1837 else
1838 nid = env.dst_nid;
1839
1840 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1841 sched_setnuma(p, env.dst_nid);
1842 }
1843
1844
1845 if (env.best_cpu == -1)
1846 return -EAGAIN;
1847
1848
1849
1850
1851
1852 p->numa_scan_period = task_scan_start(p);
1853
1854 if (env.best_task == NULL) {
1855 ret = migrate_task_to(p, env.best_cpu);
1856 if (ret != 0)
1857 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1858 return ret;
1859 }
1860
1861 ret = migrate_swap(p, env.best_task);
1862 if (ret != 0)
1863 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1864 put_task_struct(env.best_task);
1865 return ret;
1866}
1867
1868
1869static void numa_migrate_preferred(struct task_struct *p)
1870{
1871 unsigned long interval = HZ;
1872
1873
1874 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1875 return;
1876
1877
1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879 p->numa_migrate_retry = jiffies + interval;
1880
1881
1882 if (task_node(p) == p->numa_preferred_nid)
1883 return;
1884
1885
1886 task_numa_migrate(p);
1887}
1888
1889
1890
1891
1892
1893
1894
1895static void numa_group_count_active_nodes(struct numa_group *numa_group)
1896{
1897 unsigned long faults, max_faults = 0;
1898 int nid, active_nodes = 0;
1899
1900 for_each_online_node(nid) {
1901 faults = group_faults_cpu(numa_group, nid);
1902 if (faults > max_faults)
1903 max_faults = faults;
1904 }
1905
1906 for_each_online_node(nid) {
1907 faults = group_faults_cpu(numa_group, nid);
1908 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1909 active_nodes++;
1910 }
1911
1912 numa_group->max_faults_cpu = max_faults;
1913 numa_group->active_nodes = active_nodes;
1914}
1915
1916
1917
1918
1919
1920
1921
1922
1923#define NUMA_PERIOD_SLOTS 10
1924#define NUMA_PERIOD_THRESHOLD 7
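/*
 * Adapt the hinting-fault scan period: when most faults are private or most
 * faults are local, increase the period (scan less often); otherwise
 * decrease it (scan more often).
 */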
1932static void update_task_scan_period(struct task_struct *p,
1933 unsigned long shared, unsigned long private)
1934{
1935 unsigned int period_slot;
1936 int lr_ratio, ps_ratio;
1937 int diff;
1938
1939 unsigned long remote = p->numa_faults_locality[0];
1940 unsigned long local = p->numa_faults_locality[1];
1941
1942
1943
1944
1945
1946
1947
1948
1949 if (local + shared == 0 || p->numa_faults_locality[2]) {
1950 p->numa_scan_period = min(p->numa_scan_period_max,
1951 p->numa_scan_period << 1);
1952
1953 p->mm->numa_next_scan = jiffies +
1954 msecs_to_jiffies(p->numa_scan_period);
1955
1956 return;
1957 }
1958
1959
1960
1961
1962
1963
1964
1965 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1966 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1967 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1968
1969 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1970
1971
1972
1973
1974 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1975 if (!slot)
1976 slot = 1;
1977 diff = slot * period_slot;
1978 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1979
1980
1981
1982
1983
1984 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
1985 if (!slot)
1986 slot = 1;
1987 diff = slot * period_slot;
1988 } else {
1989
1990
1991
1992
1993
1994 int ratio = max(lr_ratio, ps_ratio);
1995 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1996 }
1997
1998 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1999 task_scan_min(p), task_scan_max(p));
2000 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2001}
2002
2003
2004
2005
2006
2007
2008
2009
2010static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2011{
2012 u64 runtime, delta, now;
2013
2014 now = p->se.exec_start;
2015 runtime = p->se.sum_exec_runtime;
2016
2017 if (p->last_task_numa_placement) {
2018 delta = runtime - p->last_sum_exec_runtime;
2019 *period = now - p->last_task_numa_placement;
2020 } else {
2021 delta = p->se.avg.load_sum;
2022 *period = LOAD_AVG_MAX;
2023 }
2024
2025 p->last_sum_exec_runtime = runtime;
2026 p->last_task_numa_placement = now;
2027
2028 return delta;
2029}
2030
2031
2032
2033
2034
2035
2036static int preferred_group_nid(struct task_struct *p, int nid)
2037{
2038 nodemask_t nodes;
2039 int dist;
2040
2041
2042 if (sched_numa_topology_type == NUMA_DIRECT)
2043 return nid;
2044
2045
2046
2047
2048
2049
2050 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2051 unsigned long score, max_score = 0;
2052 int node, max_node = nid;
2053
2054 dist = sched_max_numa_distance;
2055
2056 for_each_online_node(node) {
2057 score = group_weight(p, node, dist);
2058 if (score > max_score) {
2059 max_score = score;
2060 max_node = node;
2061 }
2062 }
2063 return max_node;
2064 }
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075 nodes = node_online_map;
2076 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2077 unsigned long max_faults = 0;
2078 nodemask_t max_group = NODE_MASK_NONE;
2079 int a, b;
2080
2081
2082 if (!find_numa_distance(dist))
2083 continue;
2084
2085 for_each_node_mask(a, nodes) {
2086 unsigned long faults = 0;
2087 nodemask_t this_group;
2088 nodes_clear(this_group);
2089
2090
2091 for_each_node_mask(b, nodes) {
2092 if (node_distance(a, b) < dist) {
2093 faults += group_faults(p, b);
2094 node_set(b, this_group);
2095 node_clear(b, nodes);
2096 }
2097 }
2098
2099
2100 if (faults > max_faults) {
2101 max_faults = faults;
2102 max_group = this_group;
2103
2104
2105
2106
2107
2108 nid = a;
2109 }
2110 }
2111
2112 if (!max_faults)
2113 break;
2114 nodes = max_group;
2115 }
2116 return nid;
2117}
2118
2119static void task_numa_placement(struct task_struct *p)
2120{
2121 int seq, nid, max_nid = -1, max_group_nid = -1;
2122 unsigned long max_faults = 0, max_group_faults = 0;
2123 unsigned long fault_types[2] = { 0, 0 };
2124 unsigned long total_faults;
2125 u64 runtime, period;
2126 spinlock_t *group_lock = NULL;
2127
2128
2129
2130
2131
2132
2133 seq = READ_ONCE(p->mm->numa_scan_seq);
2134 if (p->numa_scan_seq == seq)
2135 return;
2136 p->numa_scan_seq = seq;
2137 p->numa_scan_period_max = task_scan_max(p);
2138
2139 total_faults = p->numa_faults_locality[0] +
2140 p->numa_faults_locality[1];
2141 runtime = numa_get_avg_runtime(p, &period);
2142
2143
2144 if (p->numa_group) {
2145 group_lock = &p->numa_group->lock;
2146 spin_lock_irq(group_lock);
2147 }
2148
2149
2150 for_each_online_node(nid) {
2151
2152 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2153 unsigned long faults = 0, group_faults = 0;
2154 int priv;
2155
2156 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2157 long diff, f_diff, f_weight;
2158
2159 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2160 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2161 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2162 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2163
2164
2165 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2166 fault_types[priv] += p->numa_faults[membuf_idx];
2167 p->numa_faults[membuf_idx] = 0;
2168
2169
2170
2171
2172
2173
2174
2175
2176 f_weight = div64_u64(runtime << 16, period + 1);
2177 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2178 (total_faults + 1);
2179 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2180 p->numa_faults[cpubuf_idx] = 0;
2181
2182 p->numa_faults[mem_idx] += diff;
2183 p->numa_faults[cpu_idx] += f_diff;
2184 faults += p->numa_faults[mem_idx];
2185 p->total_numa_faults += diff;
2186 if (p->numa_group) {
2187
2188
2189
2190
2191
2192
2193
2194 p->numa_group->faults[mem_idx] += diff;
2195 p->numa_group->faults_cpu[mem_idx] += f_diff;
2196 p->numa_group->total_faults += diff;
2197 group_faults += p->numa_group->faults[mem_idx];
2198 }
2199 }
2200
2201 if (faults > max_faults) {
2202 max_faults = faults;
2203 max_nid = nid;
2204 }
2205
2206 if (group_faults > max_group_faults) {
2207 max_group_faults = group_faults;
2208 max_group_nid = nid;
2209 }
2210 }
2211
2212 update_task_scan_period(p, fault_types[0], fault_types[1]);
2213
2214 if (p->numa_group) {
2215 numa_group_count_active_nodes(p->numa_group);
2216 spin_unlock_irq(group_lock);
2217 max_nid = preferred_group_nid(p, max_group_nid);
2218 }
2219
2220 if (max_faults) {
2221
2222 if (max_nid != p->numa_preferred_nid)
2223 sched_setnuma(p, max_nid);
2224
2225 if (task_node(p) != p->numa_preferred_nid)
2226 numa_migrate_preferred(p);
2227 }
2228}
2229
2230static inline int get_numa_group(struct numa_group *grp)
2231{
2232 return atomic_inc_not_zero(&grp->refcount);
2233}
2234
2235static inline void put_numa_group(struct numa_group *grp)
2236{
2237 if (atomic_dec_and_test(&grp->refcount))
2238 kfree_rcu(grp, rcu);
2239}
2240
2241static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2242 int *priv)
2243{
2244 struct numa_group *grp, *my_grp;
2245 struct task_struct *tsk;
2246 bool join = false;
2247 int cpu = cpupid_to_cpu(cpupid);
2248 int i;
2249
2250 if (unlikely(!p->numa_group)) {
2251 unsigned int size = sizeof(struct numa_group) +
2252 4*nr_node_ids*sizeof(unsigned long);
2253
2254 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2255 if (!grp)
2256 return;
2257
2258 atomic_set(&grp->refcount, 1);
2259 grp->active_nodes = 1;
2260 grp->max_faults_cpu = 0;
2261 spin_lock_init(&grp->lock);
2262 grp->gid = p->pid;
2263
2264 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2265 nr_node_ids;
2266
2267 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2268 grp->faults[i] = p->numa_faults[i];
2269
2270 grp->total_faults = p->total_numa_faults;
2271
2272 grp->nr_tasks++;
2273 rcu_assign_pointer(p->numa_group, grp);
2274 }
2275
2276 rcu_read_lock();
2277 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2278
2279 if (!cpupid_match_pid(tsk, cpupid))
2280 goto no_join;
2281
2282 grp = rcu_dereference(tsk->numa_group);
2283 if (!grp)
2284 goto no_join;
2285
2286 my_grp = p->numa_group;
2287 if (grp == my_grp)
2288 goto no_join;
2289
2290
2291
2292
2293
2294 if (my_grp->nr_tasks > grp->nr_tasks)
2295 goto no_join;
2296
2297
2298
2299
2300 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2301 goto no_join;
2302
2303
2304 if (tsk->mm == current->mm)
2305 join = true;
2306
2307
2308 if (flags & TNF_SHARED)
2309 join = true;
2310
2311
2312 *priv = !join;
2313
2314 if (join && !get_numa_group(grp))
2315 goto no_join;
2316
2317 rcu_read_unlock();
2318
2319 if (!join)
2320 return;
2321
2322 BUG_ON(irqs_disabled());
2323 double_lock_irq(&my_grp->lock, &grp->lock);
2324
2325 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2326 my_grp->faults[i] -= p->numa_faults[i];
2327 grp->faults[i] += p->numa_faults[i];
2328 }
2329 my_grp->total_faults -= p->total_numa_faults;
2330 grp->total_faults += p->total_numa_faults;
2331
2332 my_grp->nr_tasks--;
2333 grp->nr_tasks++;
2334
2335 spin_unlock(&my_grp->lock);
2336 spin_unlock_irq(&grp->lock);
2337
2338 rcu_assign_pointer(p->numa_group, grp);
2339
2340 put_numa_group(my_grp);
2341 return;
2342
2343no_join:
2344 rcu_read_unlock();
2345 return;
2346}
2347
2348void task_numa_free(struct task_struct *p)
2349{
2350 struct numa_group *grp = p->numa_group;
2351 void *numa_faults = p->numa_faults;
2352 unsigned long flags;
2353 int i;
2354
2355 if (grp) {
2356 spin_lock_irqsave(&grp->lock, flags);
2357 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2358 grp->faults[i] -= p->numa_faults[i];
2359 grp->total_faults -= p->total_numa_faults;
2360
2361 grp->nr_tasks--;
2362 spin_unlock_irqrestore(&grp->lock, flags);
2363 RCU_INIT_POINTER(p->numa_group, NULL);
2364 put_numa_group(grp);
2365 }
2366
2367 p->numa_faults = NULL;
2368 kfree(numa_faults);
2369}
2370
2371
2372
2373
2374void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2375{
2376 struct task_struct *p = current;
2377 bool migrated = flags & TNF_MIGRATED;
2378 int cpu_node = task_node(current);
2379 int local = !!(flags & TNF_FAULT_LOCAL);
2380 struct numa_group *ng;
2381 int priv;
2382
2383 if (!static_branch_likely(&sched_numa_balancing))
2384 return;
2385
2386
2387 if (!p->mm)
2388 return;
2389
2390
2391 if (unlikely(!p->numa_faults)) {
2392 int size = sizeof(*p->numa_faults) *
2393 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2394
2395 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2396 if (!p->numa_faults)
2397 return;
2398
2399 p->total_numa_faults = 0;
2400 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2401 }
2402
2403
2404
2405
2406
2407 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2408 priv = 1;
2409 } else {
2410 priv = cpupid_match_pid(p, last_cpupid);
2411 if (!priv && !(flags & TNF_NO_GROUP))
2412 task_numa_group(p, last_cpupid, flags, &priv);
2413 }
2414
2415
2416
2417
2418
2419
2420
2421 ng = p->numa_group;
2422 if (!priv && !local && ng && ng->active_nodes > 1 &&
2423 numa_is_active_node(cpu_node, ng) &&
2424 numa_is_active_node(mem_node, ng))
2425 local = 1;
2426
2427 task_numa_placement(p);
2428
2429
2430
2431
2432
2433 if (time_after(jiffies, p->numa_migrate_retry))
2434 numa_migrate_preferred(p);
2435
2436 if (migrated)
2437 p->numa_pages_migrated += pages;
2438 if (flags & TNF_MIGRATE_FAIL)
2439 p->numa_faults_locality[2] += pages;
2440
2441 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2442 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2443 p->numa_faults_locality[local] += pages;
2444}
2445
2446static void reset_ptenuma_scan(struct task_struct *p)
2447{
2448
2449
2450
2451
2452
2453
2454
2455
2456 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2457 p->mm->numa_scan_offset = 0;
2458}
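/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */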
2464void task_numa_work(struct callback_head *work)
2465{
2466 unsigned long migrate, next_scan, now = jiffies;
2467 struct task_struct *p = current;
2468 struct mm_struct *mm = p->mm;
2469 u64 runtime = p->se.sum_exec_runtime;
2470 struct vm_area_struct *vma;
2471 unsigned long start, end;
2472 unsigned long nr_pte_updates = 0;
2473 long pages, virtpages;
2474
2475 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2476
2477 work->next = work;
2478
2479
2480
2481
2482
2483
2484
2485
2486 if (p->flags & PF_EXITING)
2487 return;
2488
2489 if (!mm->numa_next_scan) {
2490 mm->numa_next_scan = now +
2491 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2492 }
2493
2494
2495
2496
2497 migrate = mm->numa_next_scan;
2498 if (time_before(now, migrate))
2499 return;
2500
2501 if (p->numa_scan_period == 0) {
2502 p->numa_scan_period_max = task_scan_max(p);
2503 p->numa_scan_period = task_scan_start(p);
2504 }
2505
2506 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2507 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2508 return;
2509
2510
2511
2512
2513
2514 p->node_stamp += 2 * TICK_NSEC;
2515
2516 start = mm->numa_scan_offset;
2517 pages = sysctl_numa_balancing_scan_size;
2518 pages <<= 20 - PAGE_SHIFT;
2519 virtpages = pages * 8;
2520 if (!pages)
2521 return;
2522
2523
2524 if (!down_read_trylock(&mm->mmap_sem))
2525 return;
2526 vma = find_vma(mm, start);
2527 if (!vma) {
2528 reset_ptenuma_scan(p);
2529 start = 0;
2530 vma = mm->mmap;
2531 }
2532 for (; vma; vma = vma->vm_next) {
2533 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2534 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2535 continue;
2536 }
2537
2538
2539
2540
2541
2542
2543
2544 if (!vma->vm_mm ||
2545 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2546 continue;
2547
2548
2549
2550
2551
2552 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2553 continue;
2554
2555 do {
2556 start = max(start, vma->vm_start);
2557 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2558 end = min(end, vma->vm_end);
2559 nr_pte_updates = change_prot_numa(vma, start, end);
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569 if (nr_pte_updates)
2570 pages -= (end - start) >> PAGE_SHIFT;
2571 virtpages -= (end - start) >> PAGE_SHIFT;
2572
2573 start = end;
2574 if (pages <= 0 || virtpages <= 0)
2575 goto out;
2576
2577 cond_resched();
2578 } while (end != vma->vm_end);
2579 }
2580
2581out:
2582
2583
2584
2585
2586
2587
2588 if (vma)
2589 mm->numa_scan_offset = start;
2590 else
2591 reset_ptenuma_scan(p);
2592 up_read(&mm->mmap_sem);
2593
2594
2595
2596
2597
2598
2599
2600 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2601 u64 diff = p->se.sum_exec_runtime - runtime;
2602 p->node_stamp += 32 * diff;
2603 }
2604}
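/*
 * Drive the periodic NUMA-hinting faults: queue task_numa_work() from the
 * scheduler tick once the current task has run long enough.
 */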
2609void task_tick_numa(struct rq *rq, struct task_struct *curr)
2610{
2611 struct callback_head *work = &curr->numa_work;
2612 u64 period, now;
2613
2614
2615
2616
2617 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2618 return;
2619
2620
2621
2622
2623
2624
2625
2626 now = curr->se.sum_exec_runtime;
2627 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2628
2629 if (now > curr->node_stamp + period) {
2630 if (!curr->node_stamp)
2631 curr->numa_scan_period = task_scan_start(curr);
2632 curr->node_stamp += period;
2633
2634 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2635 init_task_work(work, task_numa_work);
2636 task_work_add(curr, work, true);
2637 }
2638 }
2639}
2640
2641#else
2642static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2643{
2644}
2645
2646static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2647{
2648}
2649
2650static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2651{
2652}
2653
2654#endif
2655
2656static void
2657account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2658{
2659 update_load_add(&cfs_rq->load, se->load.weight);
2660 if (!parent_entity(se))
2661 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2662#ifdef CONFIG_SMP
2663 if (entity_is_task(se)) {
2664 struct rq *rq = rq_of(cfs_rq);
2665
2666 account_numa_enqueue(rq, task_of(se));
2667 list_add(&se->group_node, &rq->cfs_tasks);
2668 }
2669#endif
2670 cfs_rq->nr_running++;
2671}
2672
2673static void
2674account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2675{
2676 update_load_sub(&cfs_rq->load, se->load.weight);
2677 if (!parent_entity(se))
2678 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2679#ifdef CONFIG_SMP
2680 if (entity_is_task(se)) {
2681 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2682 list_del_init(&se->group_node);
2683 }
2684#endif
2685 cfs_rq->nr_running--;
2686}
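/*
 * Signed add and clamp on underflow: if adding a negative value would take
 * the result below zero, clamp it to 0. READ_ONCE()/WRITE_ONCE() keep the
 * intermediate value out of memory so lockless readers never observe a
 * negative value.
 */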
2695#define add_positive(_ptr, _val) do { \
2696 typeof(_ptr) ptr = (_ptr); \
2697 typeof(_val) val = (_val); \
2698 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2699 \
2700 res = var + val; \
2701 \
2702 if (val < 0 && res > var) \
2703 res = 0; \
2704 \
2705 WRITE_ONCE(*ptr, res); \
2706} while (0)
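/*
 * Unsigned subtract and clamp on underflow.
 *
 * Explicitly do a load-store so the intermediate value never hits memory;
 * lockless readers can never observe a negative value.
 */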
2715#define sub_positive(_ptr, _val) do { \
2716 typeof(_ptr) ptr = (_ptr); \
2717 typeof(*ptr) val = (_val); \
2718 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2719 res = var - val; \
2720 if (res > var) \
2721 res = 0; \
2722 WRITE_ONCE(*ptr, res); \
2723} while (0)
2724
2725#ifdef CONFIG_SMP
2726
2727
2728
2729static inline long se_weight(struct sched_entity *se)
2730{
2731 return scale_load_down(se->load.weight);
2732}
2733
2734static inline long se_runnable(struct sched_entity *se)
2735{
2736 return scale_load_down(se->runnable_weight);
2737}
2738
2739static inline void
2740enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2741{
2742 cfs_rq->runnable_weight += se->runnable_weight;
2743
2744 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2745 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2746}
2747
2748static inline void
2749dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2750{
2751 cfs_rq->runnable_weight -= se->runnable_weight;
2752
2753 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2754 sub_positive(&cfs_rq->avg.runnable_load_sum,
2755 se_runnable(se) * se->avg.runnable_load_sum);
2756}
2757
2758static inline void
2759enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2760{
2761 cfs_rq->avg.load_avg += se->avg.load_avg;
2762 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2763}
2764
2765static inline void
2766dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2767{
2768 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2769 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2770}
2771#else
2772static inline void
2773enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2774static inline void
2775dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2776static inline void
2777enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2778static inline void
2779dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2780#endif
2781
2782static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2783 unsigned long weight, unsigned long runnable)
2784{
2785 if (se->on_rq) {
2786		/* commit outstanding execution time */
2787 if (cfs_rq->curr == se)
2788 update_curr(cfs_rq);
2789 account_entity_dequeue(cfs_rq, se);
2790 dequeue_runnable_load_avg(cfs_rq, se);
2791 }
2792 dequeue_load_avg(cfs_rq, se);
2793
2794 se->runnable_weight = runnable;
2795 update_load_set(&se->load, weight);
2796
2797#ifdef CONFIG_SMP
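	/*
	 * The PELT sums are independent of the weight, so once the new weight
	 * is in place the averages can simply be rebuilt from the sums using
	 * the standard PELT divider.
	 */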
2798 do {
2799 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2800
2801 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2802 se->avg.runnable_load_avg =
2803 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2804 } while (0);
2805#endif
2806
2807 enqueue_load_avg(cfs_rq, se);
2808 if (se->on_rq) {
2809 account_entity_enqueue(cfs_rq, se);
2810 enqueue_runnable_load_avg(cfs_rq, se);
2811 }
2812}
2813
2814void reweight_task(struct task_struct *p, int prio)
2815{
2816 struct sched_entity *se = &p->se;
2817 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2818 struct load_weight *load = &se->load;
2819 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2820
2821 reweight_entity(cfs_rq, se, weight, weight);
2822 load->inv_weight = sched_prio_to_wmult[prio];
2823}
2824
2825#ifdef CONFIG_FAIR_GROUP_SCHED
2826# ifdef CONFIG_SMP
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
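/*
 * Descriptive sketch of the calculation below: calc_group_shares()
 * approximates the hierarchical proportion
 *
 *   ge->load.weight = tg->shares * grq->load / \Sum grq->load
 *
 * The sum of group runqueue loads across all CPUs is too expensive to track
 * exactly, so the task group's aggregated load_avg stands in for it, with
 * this CPU's stale contribution (tg_load_avg_contrib) replaced by the local
 * group runqueue's current load so the result reacts immediately to local
 * enqueues and dequeues.  The result is clamped to [MIN_SHARES, tg->shares].
 */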
2900static long calc_group_shares(struct cfs_rq *cfs_rq)
2901{
2902 long tg_weight, tg_shares, load, shares;
2903 struct task_group *tg = cfs_rq->tg;
2904
2905 tg_shares = READ_ONCE(tg->shares);
2906
2907 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
2908
2909 tg_weight = atomic_long_read(&tg->load_avg);
2910
2911	/* swap our stale contribution for the current local load */
2912 tg_weight -= cfs_rq->tg_load_avg_contrib;
2913 tg_weight += load;
2914
2915 shares = (tg_shares * load);
2916 if (tg_weight)
2917 shares /= tg_weight;
2918
2919
2920	/*
2921	 * Clamp to [MIN_SHARES, tg_shares].  MIN_SHARES is used unscaled here
2922	 * on purpose: a group with a tiny tg->shares that is spread over many
2923	 * CPUs must be allowed per-CPU weights below the scaled minimum, while
2924	 * still never dropping to zero.
2925	 */
2931 return clamp_t(long, shares, MIN_SHARES, tg_shares);
2932}
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
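/*
 * calc_group_runnable() sizes the group entity's runnable weight as the
 * runnable fraction of the shares passed in:
 *
 *   runnable = shares * max(grq->runnable_load_avg, grq->runnable_weight)
 *                     / max(grq->load_avg, grq->load.weight)
 *
 * clamped to [MIN_SHARES, shares].
 */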
2961static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2962{
2963 long runnable, load_avg;
2964
2965 load_avg = max(cfs_rq->avg.load_avg,
2966 scale_load_down(cfs_rq->load.weight));
2967
2968 runnable = max(cfs_rq->avg.runnable_load_avg,
2969 scale_load_down(cfs_rq->runnable_weight));
2970
2971 runnable *= shares;
2972 if (load_avg)
2973 runnable /= load_avg;
2974
2975 return clamp_t(long, runnable, MIN_SHARES, shares);
2976}
2977# endif
2978
2979static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2980
2981
2982
2983
2984
2985static void update_cfs_group(struct sched_entity *se)
2986{
2987 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
2988 long shares, runnable;
2989
2990 if (!gcfs_rq)
2991 return;
2992
2993 if (throttled_hierarchy(gcfs_rq))
2994 return;
2995
2996#ifndef CONFIG_SMP
2997 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
2998
2999 if (likely(se->load.weight == shares))
3000 return;
3001#else
3002 shares = calc_group_shares(gcfs_rq);
3003 runnable = calc_group_runnable(gcfs_rq, shares);
3004#endif
3005
3006 reweight_entity(cfs_rq_of(se), se, shares, runnable);
3007}
3008
3009#else
3010static inline void update_cfs_group(struct sched_entity *se)
3011{
3012}
3013#endif
3014
3015static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
3016{
3017 struct rq *rq = rq_of(cfs_rq);
3018
3019 if (&rq->cfs == cfs_rq) {
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034 cpufreq_update_util(rq, 0);
3035 }
3036}
3037
3038#ifdef CONFIG_SMP
3039
3040
3041
3042
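/*
 * decay_load() approximates val * y^n, where y is chosen such that
 * y^LOAD_AVG_PERIOD == 0.5: whole half-life periods become cheap shifts and
 * the remaining partial period uses the precomputed runnable_avg_yN_inv[]
 * table via a 32-bit fixed-point multiply.
 */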
3043static u64 decay_load(u64 val, u64 n)
3044{
3045 unsigned int local_n;
3046
3047 if (unlikely(n > LOAD_AVG_PERIOD * 63))
3048 return 0;
3049
3050
3051 local_n = n;
3052
3053
3054
3055
3056
3057
3058
3059
3060 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
3061 val >>= local_n / LOAD_AVG_PERIOD;
3062 local_n %= LOAD_AVG_PERIOD;
3063 }
3064
3065 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
3066 return val;
3067}
3068
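/*
 * Sum the three segments of a PELT update: c1 is the decayed remainder of
 * the previously incomplete 1024us period (d1), c2 the contribution of the
 * full periods in between (evaluated in closed form from LOAD_AVG_MAX), and
 * c3 the freshly started partial period (d3).
 */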
3069static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
3070{
3071 u32 c1, c2, c3 = d3;
3072
3073
3074
3075
3076 c1 = decay_load((u64)d1, periods);
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
3088
3089 return c1 + c2 + c3;
3090}
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
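/*
 * accumulate_sum() folds @delta (in ~1us units) into the load, runnable and
 * utilization sums: complete 1024us periods first decay the existing sums,
 * then the new contribution is scaled by the current frequency capacity
 * (and, for utilization, by the CPU's compute capacity) before being added.
 */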
3113static __always_inline u32
3114accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
3115 unsigned long load, unsigned long runnable, int running)
3116{
3117 unsigned long scale_freq, scale_cpu;
3118 u32 contrib = (u32)delta;
3119 u64 periods;
3120
3121 scale_freq = arch_scale_freq_capacity(cpu);
3122 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3123
3124 delta += sa->period_contrib;
3125 periods = delta / 1024;
3126
3127
3128
3129
3130 if (periods) {
3131 sa->load_sum = decay_load(sa->load_sum, periods);
3132 sa->runnable_load_sum =
3133 decay_load(sa->runnable_load_sum, periods);
3134 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
3135
3136
3137
3138
3139 delta %= 1024;
3140 contrib = __accumulate_pelt_segments(periods,
3141 1024 - sa->period_contrib, delta);
3142 }
3143 sa->period_contrib = delta;
3144
3145 contrib = cap_scale(contrib, scale_freq);
3146 if (load)
3147 sa->load_sum += load * contrib;
3148 if (runnable)
3149 sa->runnable_load_sum += runnable * contrib;
3150 if (running)
3151 sa->util_sum += contrib * scale_cpu;
3152
3153 return periods;
3154}
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
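/*
 * ___update_load_sum() advances the PELT geometric series: each elapsed
 * 1024us period contributes at most 1024 * y^age to the running sums, with
 * y^32 =~ 0.5, so the sums converge towards LOAD_AVG_MAX and recent activity
 * dominates history.  Returns whether a full period boundary was crossed,
 * i.e. whether the averages need to be recomputed.
 */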
3184static __always_inline int
3185___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
3186 unsigned long load, unsigned long runnable, int running)
3187{
3188 u64 delta;
3189
3190 delta = now - sa->last_update_time;
3191
3192
3193
3194
3195 if ((s64)delta < 0) {
3196 sa->last_update_time = now;
3197 return 0;
3198 }
3199
3200
3201
3202
3203
3204 delta >>= 10;
3205 if (!delta)
3206 return 0;
3207
3208 sa->last_update_time += delta << 10;
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219 if (!load)
3220 runnable = running = 0;
3221
3222
3223
3224
3225
3226
3227
3228
3229 if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
3230 return 0;
3231
3232 return 1;
3233}
3234
3235static __always_inline void
3236___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
3237{
3238 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3239
3240
3241
3242
3243 sa->load_avg = div_u64(load * sa->load_sum, divider);
3244 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
3245 sa->util_avg = sa->util_sum / divider;
3246}
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275static int
3276__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
3277{
3278 if (entity_is_task(se))
3279 se->runnable_weight = se->load.weight;
3280
3281 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
3282 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3283 return 1;
3284 }
3285
3286 return 0;
3287}
3288
3289static int
3290__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
3291{
3292 if (entity_is_task(se))
3293 se->runnable_weight = se->load.weight;
3294
3295 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
3296 cfs_rq->curr == se)) {
3297
3298 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3299 return 1;
3300 }
3301
3302 return 0;
3303}
3304
3305static int
3306__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
3307{
3308 if (___update_load_sum(now, cpu, &cfs_rq->avg,
3309 scale_load_down(cfs_rq->load.weight),
3310 scale_load_down(cfs_rq->runnable_weight),
3311 cfs_rq->curr != NULL)) {
3312
3313 ___update_load_avg(&cfs_rq->avg, 1, 1);
3314 return 1;
3315 }
3316
3317 return 0;
3318}
3319
3320#ifdef CONFIG_FAIR_GROUP_SCHED
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
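/*
 * update_tg_load_avg() publishes this cfs_rq's load_avg into the task
 * group's global tg->load_avg.  To limit cacheline contention the update is
 * skipped unless the value has moved by more than 1/64th of the last
 * published contribution (or @force is set); the root group needs no global
 * sum and is skipped entirely.
 */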
3336static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3337{
3338 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3339
3340
3341
3342
3343 if (cfs_rq->tg == &root_task_group)
3344 return;
3345
3346 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3347 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3348 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3349 }
3350}
3351
3352
3353
3354
3355
3356
3357void set_task_rq_fair(struct sched_entity *se,
3358 struct cfs_rq *prev, struct cfs_rq *next)
3359{
3360 u64 p_last_update_time;
3361 u64 n_last_update_time;
3362
3363 if (!sched_feat(ATTACH_AGE_LOAD))
3364 return;
3365
3366
3367
3368
3369
3370
3371
3372
3373 if (!(se->avg.last_update_time && prev))
3374 return;
3375
3376#ifndef CONFIG_64BIT
3377 {
3378 u64 p_last_update_time_copy;
3379 u64 n_last_update_time_copy;
3380
3381 do {
3382 p_last_update_time_copy = prev->load_last_update_time_copy;
3383 n_last_update_time_copy = next->load_last_update_time_copy;
3384
3385 smp_rmb();
3386
3387 p_last_update_time = prev->avg.last_update_time;
3388 n_last_update_time = next->avg.last_update_time;
3389
3390 } while (p_last_update_time != p_last_update_time_copy ||
3391 n_last_update_time != n_last_update_time_copy);
3392 }
3393#else
3394 p_last_update_time = prev->avg.last_update_time;
3395 n_last_update_time = next->avg.last_update_time;
3396#endif
3397 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3398 se->avg.last_update_time = n_last_update_time;
3399}
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
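/*
 * The helpers below propagate a child group runqueue's PELT signals into
 * its parent when entities are attached, detached or moved: utilization is
 * mirrored straight up, while the load and runnable-load sums are
 * reconstructed from the accumulated prop_runnable_sum.
 */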
3470static inline void
3471update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3472{
3473 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3474
3475
3476 if (!delta)
3477 return;
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488 se->avg.util_avg = gcfs_rq->avg.util_avg;
3489 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3490
3491
3492 add_positive(&cfs_rq->avg.util_avg, delta);
3493 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3494}
3495
3496static inline void
3497update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3498{
3499 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3500 unsigned long runnable_load_avg, load_avg;
3501 u64 runnable_load_sum, load_sum = 0;
3502 s64 delta_sum;
3503
3504 if (!runnable_sum)
3505 return;
3506
3507 gcfs_rq->prop_runnable_sum = 0;
3508
3509 if (runnable_sum >= 0) {
3510
3511
3512
3513
3514 runnable_sum += se->avg.load_sum;
3515 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3516 } else {
3517
3518
3519
3520
3521 if (scale_load_down(gcfs_rq->load.weight)) {
3522 load_sum = div_s64(gcfs_rq->avg.load_sum,
3523 scale_load_down(gcfs_rq->load.weight));
3524 }
3525
3526
3527 runnable_sum = min(se->avg.load_sum, load_sum);
3528 }
3529
3530
3531
3532
3533
3534
3535 running_sum = se->avg.util_sum /
3536 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3537 runnable_sum = max(runnable_sum, running_sum);
3538
3539 load_sum = (s64)se_weight(se) * runnable_sum;
3540 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3541
3542 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3543 delta_avg = load_avg - se->avg.load_avg;
3544
3545 se->avg.load_sum = runnable_sum;
3546 se->avg.load_avg = load_avg;
3547 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3548 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3549
3550 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3551 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3552 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3553 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3554
3555 se->avg.runnable_load_sum = runnable_sum;
3556 se->avg.runnable_load_avg = runnable_load_avg;
3557
3558 if (se->on_rq) {
3559 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3560 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3561 }
3562}
3563
3564static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3565{
3566 cfs_rq->propagate = 1;
3567 cfs_rq->prop_runnable_sum += runnable_sum;
3568}
3569
3570
3571static inline int propagate_entity_load_avg(struct sched_entity *se)
3572{
3573 struct cfs_rq *cfs_rq, *gcfs_rq;
3574
3575 if (entity_is_task(se))
3576 return 0;
3577
3578 gcfs_rq = group_cfs_rq(se);
3579 if (!gcfs_rq->propagate)
3580 return 0;
3581
3582 gcfs_rq->propagate = 0;
3583
3584 cfs_rq = cfs_rq_of(se);
3585
3586 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3587
3588 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3589 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3590
3591 return 1;
3592}
3593
3594
3595
3596
3597
3598static inline bool skip_blocked_update(struct sched_entity *se)
3599{
3600 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3601
3602
3603
3604
3605
3606 if (se->avg.load_avg || se->avg.util_avg)
3607 return false;
3608
3609
3610
3611
3612
3613 if (gcfs_rq->propagate)
3614 return false;
3615
3616
3617
3618
3619
3620
3621 return true;
3622}
3623
3624#else
3625
3626static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3627
3628static inline int propagate_entity_load_avg(struct sched_entity *se)
3629{
3630 return 0;
3631}
3632
3633static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3634
3635#endif
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
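/*
 * update_cfs_rq_load_avg() first folds in the load and utilization of
 * entities that were removed remotely (cfs_rq->removed is filled locklessly
 * from other CPUs), then runs the regular PELT update for the runqueue up
 * to @now.  The return value tells the caller whether anything changed and
 * therefore whether cpufreq/task-group state may need refreshing.
 */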
3653static inline int
3654update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3655{
3656 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3657 struct sched_avg *sa = &cfs_rq->avg;
3658 int decayed = 0;
3659
3660 if (cfs_rq->removed.nr) {
3661 unsigned long r;
3662 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3663
3664 raw_spin_lock(&cfs_rq->removed.lock);
3665 swap(cfs_rq->removed.util_avg, removed_util);
3666 swap(cfs_rq->removed.load_avg, removed_load);
3667 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3668 cfs_rq->removed.nr = 0;
3669 raw_spin_unlock(&cfs_rq->removed.lock);
3670
3671 r = removed_load;
3672 sub_positive(&sa->load_avg, r);
3673 sub_positive(&sa->load_sum, r * divider);
3674
3675 r = removed_util;
3676 sub_positive(&sa->util_avg, r);
3677 sub_positive(&sa->util_sum, r * divider);
3678
3679 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3680
3681 decayed = 1;
3682 }
3683
3684 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
3685
3686#ifndef CONFIG_64BIT
3687 smp_wmb();
3688 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3689#endif
3690
3691 if (decayed)
3692 cfs_rq_util_change(cfs_rq);
3693
3694 return decayed;
3695}
3696
3697
3698
3699
3700
3701
3702
3703
3704
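/*
 * attach_entity_load_avg() splices a new or migrated entity's averages into
 * this runqueue: the entity adopts the runqueue's PELT window
 * (last_update_time and period_contrib), its sums are rebuilt to match its
 * averages, and the resulting deltas are added to the runqueue and queued
 * for propagation up the task-group hierarchy.
 */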
3705static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3706{
3707 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3708
3709
3710
3711
3712
3713
3714
3715
3716 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3717 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3718
3719
3720
3721
3722
3723
3724
3725 se->avg.util_sum = se->avg.util_avg * divider;
3726
3727 se->avg.load_sum = divider;
3728 if (se_weight(se)) {
3729 se->avg.load_sum =
3730 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3731 }
3732
3733 se->avg.runnable_load_sum = se->avg.load_sum;
3734
3735 enqueue_load_avg(cfs_rq, se);
3736 cfs_rq->avg.util_avg += se->avg.util_avg;
3737 cfs_rq->avg.util_sum += se->avg.util_sum;
3738
3739 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3740
3741 cfs_rq_util_change(cfs_rq);
3742}
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3753{
3754 dequeue_load_avg(cfs_rq, se);
3755 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3756 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3757
3758 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3759
3760 cfs_rq_util_change(cfs_rq);
3761}
3762
3763
3764
3765
3766#define UPDATE_TG 0x1
3767#define SKIP_AGE_LOAD 0x2
3768#define DO_ATTACH 0x4
3769
3770
3771static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3772{
3773 u64 now = cfs_rq_clock_task(cfs_rq);
3774 struct rq *rq = rq_of(cfs_rq);
3775 int cpu = cpu_of(rq);
3776 int decayed;
3777
3778
3779
3780	/* age the entity's own average; migrating tasks carry this value with them */
3781
3782 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3783 __update_load_avg_se(now, cpu, cfs_rq, se);
3784
3785 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3786 decayed |= propagate_entity_load_avg(se);
3787
3788 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3789
3790 attach_entity_load_avg(cfs_rq, se);
3791 update_tg_load_avg(cfs_rq, 0);
3792
3793 } else if (decayed && (flags & UPDATE_TG))
3794 update_tg_load_avg(cfs_rq, 0);
3795}
3796
3797#ifndef CONFIG_64BIT
3798static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3799{
3800 u64 last_update_time_copy;
3801 u64 last_update_time;
3802
3803 do {
3804 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3805 smp_rmb();
3806 last_update_time = cfs_rq->avg.last_update_time;
3807 } while (last_update_time != last_update_time_copy);
3808
3809 return last_update_time;
3810}
3811#else
3812static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3813{
3814 return cfs_rq->avg.last_update_time;
3815}
3816#endif
3817
3818
3819
3820
3821
3822void sync_entity_load_avg(struct sched_entity *se)
3823{
3824 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3825 u64 last_update_time;
3826
3827 last_update_time = cfs_rq_last_update_time(cfs_rq);
3828 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
3829}
3830
3831
3832
3833
3834
3835void remove_entity_load_avg(struct sched_entity *se)
3836{
3837 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3838 unsigned long flags;
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850 sync_entity_load_avg(se);
3851
3852 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3853 ++cfs_rq->removed.nr;
3854 cfs_rq->removed.util_avg += se->avg.util_avg;
3855 cfs_rq->removed.load_avg += se->avg.load_avg;
3856 cfs_rq->removed.runnable_sum += se->avg.load_sum;
3857 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3858}
3859
3860static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3861{
3862 return cfs_rq->avg.runnable_load_avg;
3863}
3864
3865static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3866{
3867 return cfs_rq->avg.load_avg;
3868}
3869
3870static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3871
3872#else
3873
3874static inline int
3875update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3876{
3877 return 0;
3878}
3879
3880#define UPDATE_TG 0x0
3881#define SKIP_AGE_LOAD 0x0
3882#define DO_ATTACH 0x0
3883
3884static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3885{
3886 cfs_rq_util_change(cfs_rq);
3887}
3888
3889static inline void remove_entity_load_avg(struct sched_entity *se) {}
3890
3891static inline void
3892attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3893static inline void
3894detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3895
3896static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3897{
3898 return 0;
3899}
3900
3901#endif
3902
3903static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3904{
3905#ifdef CONFIG_SCHED_DEBUG
3906 s64 d = se->vruntime - cfs_rq->min_vruntime;
3907
3908 if (d < 0)
3909 d = -d;
3910
3911 if (d > 3*sysctl_sched_latency)
3912 schedstat_inc(cfs_rq->nr_spread_over);
3913#endif
3914}
3915
3916static void
3917place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3918{
3919 u64 vruntime = cfs_rq->min_vruntime;
3920
3921
3922
3923
3924
3925
3926
3927 if (initial && sched_feat(START_DEBIT))
3928 vruntime += sched_vslice(cfs_rq, se);
3929
3930
3931 if (!initial) {
3932 unsigned long thresh = sysctl_sched_latency;
3933
3934
3935
3936
3937
3938 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3939 thresh >>= 1;
3940
3941 vruntime -= thresh;
3942 }
3943
3944
3945 se->vruntime = max_vruntime(se->vruntime, vruntime);
3946}
3947
3948static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3949
3950static inline void check_schedstat_required(void)
3951{
3952#ifdef CONFIG_SCHEDSTATS
3953 if (schedstat_enabled())
3954 return;
3955
3956
3957 if (trace_sched_stat_wait_enabled() ||
3958 trace_sched_stat_sleep_enabled() ||
3959 trace_sched_stat_iowait_enabled() ||
3960 trace_sched_stat_blocked_enabled() ||
3961 trace_sched_stat_runtime_enabled()) {
3962 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3963 "stat_blocked and stat_runtime require the "
3964 "kernel parameter schedstats=enable or "
3965 "kernel.sched_schedstats=1\n");
3966 }
3967#endif
3968}
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
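/*
 * On enqueue, entities whose vruntime was made min_vruntime-relative on
 * dequeue (plain enqueues and migrations, as opposed to wakeups) must be
 * renormalised against this runqueue's min_vruntime: the running entity is
 * renormalised before update_curr(), everyone else after it, so they are
 * placed at the current point in time rather than some stale moment in the
 * past.
 */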
4001static void
4002enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4003{
4004 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4005 bool curr = cfs_rq->curr == se;
4006
4007
4008
4009
4010
4011 if (renorm && curr)
4012 se->vruntime += cfs_rq->min_vruntime;
4013
4014 update_curr(cfs_rq);
4015
4016
4017
4018
4019
4020
4021
4022 if (renorm && !curr)
4023 se->vruntime += cfs_rq->min_vruntime;
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4034 update_cfs_group(se);
4035 enqueue_runnable_load_avg(cfs_rq, se);
4036 account_entity_enqueue(cfs_rq, se);
4037
4038 if (flags & ENQUEUE_WAKEUP)
4039 place_entity(cfs_rq, se, 0);
4040
4041 check_schedstat_required();
4042 update_stats_enqueue(cfs_rq, se, flags);
4043 check_spread(cfs_rq, se);
4044 if (!curr)
4045 __enqueue_entity(cfs_rq, se);
4046 se->on_rq = 1;
4047
4048 if (cfs_rq->nr_running == 1) {
4049 list_add_leaf_cfs_rq(cfs_rq);
4050 check_enqueue_throttle(cfs_rq);
4051 }
4052}
4053
4054static void __clear_buddies_last(struct sched_entity *se)
4055{
4056 for_each_sched_entity(se) {
4057 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4058 if (cfs_rq->last != se)
4059 break;
4060
4061 cfs_rq->last = NULL;
4062 }
4063}
4064
4065static void __clear_buddies_next(struct sched_entity *se)
4066{
4067 for_each_sched_entity(se) {
4068 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4069 if (cfs_rq->next != se)
4070 break;
4071
4072 cfs_rq->next = NULL;
4073 }
4074}
4075
4076static void __clear_buddies_skip(struct sched_entity *se)
4077{
4078 for_each_sched_entity(se) {
4079 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4080 if (cfs_rq->skip != se)
4081 break;
4082
4083 cfs_rq->skip = NULL;
4084 }
4085}
4086
4087static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4088{
4089 if (cfs_rq->last == se)
4090 __clear_buddies_last(se);
4091
4092 if (cfs_rq->next == se)
4093 __clear_buddies_next(se);
4094
4095 if (cfs_rq->skip == se)
4096 __clear_buddies_skip(se);
4097}
4098
4099static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4100
4101static void
4102dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4103{
4104
4105
4106
4107 update_curr(cfs_rq);
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117 update_load_avg(cfs_rq, se, UPDATE_TG);
4118 dequeue_runnable_load_avg(cfs_rq, se);
4119
4120 update_stats_dequeue(cfs_rq, se, flags);
4121
4122 clear_buddies(cfs_rq, se);
4123
4124 if (se != cfs_rq->curr)
4125 __dequeue_entity(cfs_rq, se);
4126 se->on_rq = 0;
4127 account_entity_dequeue(cfs_rq, se);
4128
4129
4130
4131
4132
4133
4134
4135 if (!(flags & DEQUEUE_SLEEP))
4136 se->vruntime -= cfs_rq->min_vruntime;
4137
4138
4139 return_cfs_rq_runtime(cfs_rq);
4140
4141 update_cfs_group(se);
4142
4143
4144
4145
4146
4147
4148
4149 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
4150 update_min_vruntime(cfs_rq);
4151}
4152
4153
4154
4155
4156static void
4157check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4158{
4159 unsigned long ideal_runtime, delta_exec;
4160 struct sched_entity *se;
4161 s64 delta;
4162
4163 ideal_runtime = sched_slice(cfs_rq, curr);
4164 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4165 if (delta_exec > ideal_runtime) {
4166 resched_curr(rq_of(cfs_rq));
4167
4168
4169
4170
4171 clear_buddies(cfs_rq, curr);
4172 return;
4173 }
4174
4175
4176
4177
4178
4179
4180 if (delta_exec < sysctl_sched_min_granularity)
4181 return;
4182
4183 se = __pick_first_entity(cfs_rq);
4184 delta = curr->vruntime - se->vruntime;
4185
4186 if (delta < 0)
4187 return;
4188
4189 if (delta > ideal_runtime)
4190 resched_curr(rq_of(cfs_rq));
4191}
4192
4193static void
4194set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4195{
4196
4197 if (se->on_rq) {
4198
4199
4200
4201
4202
4203 update_stats_wait_end(cfs_rq, se);
4204 __dequeue_entity(cfs_rq, se);
4205 update_load_avg(cfs_rq, se, UPDATE_TG);
4206 }
4207
4208 update_stats_curr_start(cfs_rq, se);
4209 cfs_rq->curr = se;
4210
4211
4212
4213
4214
4215
4216 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4217 schedstat_set(se->statistics.slice_max,
4218 max((u64)schedstat_val(se->statistics.slice_max),
4219 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4220 }
4221
4222 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4223}
4224
4225static int
4226wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4227
4228
4229
4230
4231
4232
4233
4234
4235static struct sched_entity *
4236pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4237{
4238 struct sched_entity *left = __pick_first_entity(cfs_rq);
4239 struct sched_entity *se;
4240
4241
4242
4243
4244
4245 if (!left || (curr && entity_before(curr, left)))
4246 left = curr;
4247
4248 se = left;
4249
4250
4251
4252
4253
4254 if (cfs_rq->skip == se) {
4255 struct sched_entity *second;
4256
4257 if (se == curr) {
4258 second = __pick_first_entity(cfs_rq);
4259 } else {
4260 second = __pick_next_entity(se);
4261 if (!second || (curr && entity_before(curr, second)))
4262 second = curr;
4263 }
4264
4265 if (second && wakeup_preempt_entity(second, left) < 1)
4266 se = second;
4267 }
4268
4269
4270
4271
4272 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4273 se = cfs_rq->last;
4274
4275
4276
4277
4278 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4279 se = cfs_rq->next;
4280
4281 clear_buddies(cfs_rq, se);
4282
4283 return se;
4284}
4285
4286static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4287
4288static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4289{
4290
4291
4292
4293
4294 if (prev->on_rq)
4295 update_curr(cfs_rq);
4296
4297
4298 check_cfs_rq_runtime(cfs_rq);
4299
4300 check_spread(cfs_rq, prev);
4301
4302 if (prev->on_rq) {
4303 update_stats_wait_start(cfs_rq, prev);
4304
4305 __enqueue_entity(cfs_rq, prev);
4306
4307 update_load_avg(cfs_rq, prev, 0);
4308 }
4309 cfs_rq->curr = NULL;
4310}
4311
4312static void
4313entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4314{
4315
4316
4317
4318 update_curr(cfs_rq);
4319
4320
4321
4322
4323 update_load_avg(cfs_rq, curr, UPDATE_TG);
4324 update_cfs_group(curr);
4325
4326#ifdef CONFIG_SCHED_HRTICK
4327
4328
4329
4330
4331 if (queued) {
4332 resched_curr(rq_of(cfs_rq));
4333 return;
4334 }
4335
4336
4337
4338 if (!sched_feat(DOUBLE_TICK) &&
4339 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4340 return;
4341#endif
4342
4343 if (cfs_rq->nr_running > 1)
4344 check_preempt_tick(cfs_rq, curr);
4345}
4346
4347
4348
4349
4350
4351
4352#ifdef CONFIG_CFS_BANDWIDTH
4353
4354#ifdef HAVE_JUMP_LABEL
4355static struct static_key __cfs_bandwidth_used;
4356
4357static inline bool cfs_bandwidth_used(void)
4358{
4359 return static_key_false(&__cfs_bandwidth_used);
4360}
4361
4362void cfs_bandwidth_usage_inc(void)
4363{
4364 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4365}
4366
4367void cfs_bandwidth_usage_dec(void)
4368{
4369 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4370}
4371#else
4372static bool cfs_bandwidth_used(void)
4373{
4374 return true;
4375}
4376
4377void cfs_bandwidth_usage_inc(void) {}
4378void cfs_bandwidth_usage_dec(void) {}
4379#endif
4380
4381
4382
4383
4384
4385static inline u64 default_cfs_period(void)
4386{
4387 return 100000000ULL;
4388}
4389
4390static inline u64 sched_cfs_bandwidth_slice(void)
4391{
4392 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4393}
4394
4395
4396
4397
4398
4399
4400
4401
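/*
 * Refill the global runtime pool to the full quota for the new period and
 * push the expiration time one period into the future.
 */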
4402void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4403{
4404 u64 now;
4405
4406 if (cfs_b->quota == RUNTIME_INF)
4407 return;
4408
4409 now = sched_clock_cpu(smp_processor_id());
4410 cfs_b->runtime = cfs_b->quota;
4411 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4412}
4413
4414static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4415{
4416 return &tg->cfs_bandwidth;
4417}
4418
4419
4420static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4421{
4422 if (unlikely(cfs_rq->throttle_count))
4423 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4424
4425 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4426}
4427
4428
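/*
 * Pull enough runtime from the global pool to bring this cfs_rq back up to
 * one bandwidth slice; returns whether any local runtime remains.
 */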
4429static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4430{
4431 struct task_group *tg = cfs_rq->tg;
4432 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4433 u64 amount = 0, min_amount, expires;
4434
4435	/* note: this is a positive amount, as runtime_remaining <= 0 here */
4436 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4437
4438 raw_spin_lock(&cfs_b->lock);
4439 if (cfs_b->quota == RUNTIME_INF)
4440 amount = min_amount;
4441 else {
4442 start_cfs_bandwidth(cfs_b);
4443
4444 if (cfs_b->runtime > 0) {
4445 amount = min(cfs_b->runtime, min_amount);
4446 cfs_b->runtime -= amount;
4447 cfs_b->idle = 0;
4448 }
4449 }
4450 expires = cfs_b->runtime_expires;
4451 raw_spin_unlock(&cfs_b->lock);
4452
4453 cfs_rq->runtime_remaining += amount;
4454
4455
4456
4457
4458
4459 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
4460 cfs_rq->runtime_expires = expires;
4461
4462 return cfs_rq->runtime_remaining > 0;
4463}
4464
4465
4466
4467
4468
4469static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4470{
4471 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4472
4473
4474 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4475 return;
4476
4477 if (cfs_rq->runtime_remaining < 0)
4478 return;
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
4492
4493 cfs_rq->runtime_expires += TICK_NSEC;
4494 } else {
4495
4496 cfs_rq->runtime_remaining = 0;
4497 }
4498}
4499
4500static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4501{
4502
4503 cfs_rq->runtime_remaining -= delta_exec;
4504 expire_cfs_rq_runtime(cfs_rq);
4505
4506 if (likely(cfs_rq->runtime_remaining > 0))
4507 return;
4508
4509
4510
4511
4512
4513 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4514 resched_curr(rq_of(cfs_rq));
4515}
4516
4517static __always_inline
4518void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4519{
4520 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4521 return;
4522
4523 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4524}
4525
4526static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4527{
4528 return cfs_bandwidth_used() && cfs_rq->throttled;
4529}
4530
4531
4532static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4533{
4534 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4535}
4536
4537
4538
4539
4540
4541
4542static inline int throttled_lb_pair(struct task_group *tg,
4543 int src_cpu, int dest_cpu)
4544{
4545 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4546
4547 src_cfs_rq = tg->cfs_rq[src_cpu];
4548 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4549
4550 return throttled_hierarchy(src_cfs_rq) ||
4551 throttled_hierarchy(dest_cfs_rq);
4552}
4553
4554
4555static int tg_unthrottle_up(struct task_group *tg, void *data)
4556{
4557 struct rq *rq = data;
4558 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4559
4560 cfs_rq->throttle_count--;
4561 if (!cfs_rq->throttle_count) {
4562
4563 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4564 cfs_rq->throttled_clock_task;
4565 }
4566
4567 return 0;
4568}
4569
4570static int tg_throttle_down(struct task_group *tg, void *data)
4571{
4572 struct rq *rq = data;
4573 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4574
4575
4576 if (!cfs_rq->throttle_count)
4577 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4578 cfs_rq->throttle_count++;
4579
4580 return 0;
4581}
4582
4583static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4584{
4585 struct rq *rq = rq_of(cfs_rq);
4586 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4587 struct sched_entity *se;
4588 long task_delta, dequeue = 1;
4589 bool empty;
4590
4591 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4592
4593
4594 rcu_read_lock();
4595 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4596 rcu_read_unlock();
4597
4598 task_delta = cfs_rq->h_nr_running;
4599 for_each_sched_entity(se) {
4600 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4601
4602 if (!se->on_rq)
4603 break;
4604
4605 if (dequeue)
4606 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4607 qcfs_rq->h_nr_running -= task_delta;
4608
4609 if (qcfs_rq->load.weight)
4610 dequeue = 0;
4611 }
4612
4613 if (!se)
4614 sub_nr_running(rq, task_delta);
4615
4616 cfs_rq->throttled = 1;
4617 cfs_rq->throttled_clock = rq_clock(rq);
4618 raw_spin_lock(&cfs_b->lock);
4619 empty = list_empty(&cfs_b->throttled_cfs_rq);
4620
4621
4622
4623
4624
4625 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4626
4627
4628
4629
4630
4631 if (empty)
4632 start_cfs_bandwidth(cfs_b);
4633
4634 raw_spin_unlock(&cfs_b->lock);
4635}
4636
4637void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4638{
4639 struct rq *rq = rq_of(cfs_rq);
4640 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4641 struct sched_entity *se;
4642 int enqueue = 1;
4643 long task_delta;
4644
4645 se = cfs_rq->tg->se[cpu_of(rq)];
4646
4647 cfs_rq->throttled = 0;
4648
4649 update_rq_clock(rq);
4650
4651 raw_spin_lock(&cfs_b->lock);
4652 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4653 list_del_rcu(&cfs_rq->throttled_list);
4654 raw_spin_unlock(&cfs_b->lock);
4655
4656
4657 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4658
4659 if (!cfs_rq->load.weight)
4660 return;
4661
4662 task_delta = cfs_rq->h_nr_running;
4663 for_each_sched_entity(se) {
4664 if (se->on_rq)
4665 enqueue = 0;
4666
4667 cfs_rq = cfs_rq_of(se);
4668 if (enqueue)
4669 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4670 cfs_rq->h_nr_running += task_delta;
4671
4672 if (cfs_rq_throttled(cfs_rq))
4673 break;
4674 }
4675
4676 if (!se)
4677 add_nr_running(rq, task_delta);
4678
4679
4680 if (rq->curr == rq->idle && rq->cfs.nr_running)
4681 resched_curr(rq);
4682}
4683
4684static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4685 u64 remaining, u64 expires)
4686{
4687 struct cfs_rq *cfs_rq;
4688 u64 runtime;
4689 u64 starting_runtime = remaining;
4690
4691 rcu_read_lock();
4692 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4693 throttled_list) {
4694 struct rq *rq = rq_of(cfs_rq);
4695 struct rq_flags rf;
4696
4697 rq_lock(rq, &rf);
4698 if (!cfs_rq_throttled(cfs_rq))
4699 goto next;
4700
4701 runtime = -cfs_rq->runtime_remaining + 1;
4702 if (runtime > remaining)
4703 runtime = remaining;
4704 remaining -= runtime;
4705
4706 cfs_rq->runtime_remaining += runtime;
4707 cfs_rq->runtime_expires = expires;
4708
4709
4710 if (cfs_rq->runtime_remaining > 0)
4711 unthrottle_cfs_rq(cfs_rq);
4712
4713next:
4714 rq_unlock(rq, &rf);
4715
4716 if (!remaining)
4717 break;
4718 }
4719 rcu_read_unlock();
4720
4721 return starting_runtime - remaining;
4722}
4723
4724
4725
4726
4727
4728
4729
4730static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4731{
4732 u64 runtime, runtime_expires;
4733 int throttled;
4734
4735
4736 if (cfs_b->quota == RUNTIME_INF)
4737 goto out_deactivate;
4738
4739 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4740 cfs_b->nr_periods += overrun;
4741
4742
4743
4744
4745
4746 if (cfs_b->idle && !throttled)
4747 goto out_deactivate;
4748
4749 __refill_cfs_bandwidth_runtime(cfs_b);
4750
4751 if (!throttled) {
4752		/* mark as potentially idle so the next timer expiry can short-circuit */
4753 cfs_b->idle = 1;
4754 return 0;
4755 }
4756
4757
4758 cfs_b->nr_throttled += overrun;
4759
4760 runtime_expires = cfs_b->runtime_expires;
4761
4762
4763
4764
4765
4766
4767
4768
4769 while (throttled && cfs_b->runtime > 0) {
4770 runtime = cfs_b->runtime;
4771 raw_spin_unlock(&cfs_b->lock);
4772
4773 runtime = distribute_cfs_runtime(cfs_b, runtime,
4774 runtime_expires);
4775 raw_spin_lock(&cfs_b->lock);
4776
4777 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4778
4779 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4780 }
4781
4782
4783
4784
4785
4786
4787
4788 cfs_b->idle = 0;
4789
4790 return 0;
4791
4792out_deactivate:
4793 return 1;
4794}
4795
4796
4797static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4798
4799static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4800
4801static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4802
4803
4804
4805
4806
4807
4808
4809
4810static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4811{
4812 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4813 u64 remaining;
4814
4815
4816 if (hrtimer_callback_running(refresh_timer))
4817 return 1;
4818
4819
4820 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4821 if (remaining < min_expire)
4822 return 1;
4823
4824 return 0;
4825}
4826
4827static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4828{
4829 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4830
4831
4832 if (runtime_refresh_within(cfs_b, min_left))
4833 return;
4834
4835 hrtimer_start(&cfs_b->slack_timer,
4836 ns_to_ktime(cfs_bandwidth_slack_period),
4837 HRTIMER_MODE_REL);
4838}
4839
4840
4841static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4842{
4843 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4844 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4845
4846 if (slack_runtime <= 0)
4847 return;
4848
4849 raw_spin_lock(&cfs_b->lock);
4850 if (cfs_b->quota != RUNTIME_INF &&
4851 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4852 cfs_b->runtime += slack_runtime;
4853
4854
4855 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4856 !list_empty(&cfs_b->throttled_cfs_rq))
4857 start_cfs_slack_bandwidth(cfs_b);
4858 }
4859 raw_spin_unlock(&cfs_b->lock);
4860
4861
4862 cfs_rq->runtime_remaining -= slack_runtime;
4863}
4864
4865static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4866{
4867 if (!cfs_bandwidth_used())
4868 return;
4869
4870 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4871 return;
4872
4873 __return_cfs_rq_runtime(cfs_rq);
4874}
4875
4876
4877
4878
4879
4880static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4881{
4882 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4883 u64 expires;
4884
4885
4886 raw_spin_lock(&cfs_b->lock);
4887 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4888 raw_spin_unlock(&cfs_b->lock);
4889 return;
4890 }
4891
4892 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4893 runtime = cfs_b->runtime;
4894
4895 expires = cfs_b->runtime_expires;
4896 raw_spin_unlock(&cfs_b->lock);
4897
4898 if (!runtime)
4899 return;
4900
4901 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4902
4903 raw_spin_lock(&cfs_b->lock);
4904 if (expires == cfs_b->runtime_expires)
4905 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4906 raw_spin_unlock(&cfs_b->lock);
4907}
4908
4909
4910
4911
4912
4913
4914static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4915{
4916 if (!cfs_bandwidth_used())
4917 return;
4918
4919	/* an active group must be handled via the update_curr()->put() path */
4920	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4921		return;
4922
4923	/* nothing to do if the group is already throttled */
4924	if (cfs_rq_throttled(cfs_rq))
4925		return;
4926
4927	/* refresh the runtime allocation and throttle if nothing is left */
4928 account_cfs_rq_runtime(cfs_rq, 0);
4929 if (cfs_rq->runtime_remaining <= 0)
4930 throttle_cfs_rq(cfs_rq);
4931}
4932
4933static void sync_throttle(struct task_group *tg, int cpu)
4934{
4935 struct cfs_rq *pcfs_rq, *cfs_rq;
4936
4937 if (!cfs_bandwidth_used())
4938 return;
4939
4940 if (!tg->parent)
4941 return;
4942
4943 cfs_rq = tg->cfs_rq[cpu];
4944 pcfs_rq = tg->parent->cfs_rq[cpu];
4945
4946 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4947 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4948}
4949
4950
4951static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4952{
4953 if (!cfs_bandwidth_used())
4954 return false;
4955
4956 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4957 return false;
4958
4959
4960
4961
4962
4963 if (cfs_rq_throttled(cfs_rq))
4964 return true;
4965
4966 throttle_cfs_rq(cfs_rq);
4967 return true;
4968}
4969
4970static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4971{
4972 struct cfs_bandwidth *cfs_b =
4973 container_of(timer, struct cfs_bandwidth, slack_timer);
4974
4975 do_sched_cfs_slack_timer(cfs_b);
4976
4977 return HRTIMER_NORESTART;
4978}
4979
4980static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4981{
4982 struct cfs_bandwidth *cfs_b =
4983 container_of(timer, struct cfs_bandwidth, period_timer);
4984 int overrun;
4985 int idle = 0;
4986
4987 raw_spin_lock(&cfs_b->lock);
4988 for (;;) {
4989 overrun = hrtimer_forward_now(timer, cfs_b->period);
4990 if (!overrun)
4991 break;
4992
4993 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4994 }
4995 if (idle)
4996 cfs_b->period_active = 0;
4997 raw_spin_unlock(&cfs_b->lock);
4998
4999 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5000}
5001
5002void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5003{
5004 raw_spin_lock_init(&cfs_b->lock);
5005 cfs_b->runtime = 0;
5006 cfs_b->quota = RUNTIME_INF;
5007 cfs_b->period = ns_to_ktime(default_cfs_period());
5008
5009 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5010 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5011 cfs_b->period_timer.function = sched_cfs_period_timer;
5012 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5013 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5014}
5015
5016static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5017{
5018 cfs_rq->runtime_enabled = 0;
5019 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5020}
5021
5022void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5023{
5024 lockdep_assert_held(&cfs_b->lock);
5025
5026 if (!cfs_b->period_active) {
5027 cfs_b->period_active = 1;
5028 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5029 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5030 }
5031}
5032
5033static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5034{
5035
5036 if (!cfs_b->throttled_cfs_rq.next)
5037 return;
5038
5039 hrtimer_cancel(&cfs_b->period_timer);
5040 hrtimer_cancel(&cfs_b->slack_timer);
5041}
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051static void __maybe_unused update_runtime_enabled(struct rq *rq)
5052{
5053 struct task_group *tg;
5054
5055 lockdep_assert_held(&rq->lock);
5056
5057 rcu_read_lock();
5058 list_for_each_entry_rcu(tg, &task_groups, list) {
5059 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5060 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5061
5062 raw_spin_lock(&cfs_b->lock);
5063 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5064 raw_spin_unlock(&cfs_b->lock);
5065 }
5066 rcu_read_unlock();
5067}
5068
5069
5070static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5071{
5072 struct task_group *tg;
5073
5074 lockdep_assert_held(&rq->lock);
5075
5076 rcu_read_lock();
5077 list_for_each_entry_rcu(tg, &task_groups, list) {
5078 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5079
5080 if (!cfs_rq->runtime_enabled)
5081 continue;
5082
5083
5084
5085
5086
5087 cfs_rq->runtime_remaining = 1;
5088
5089
5090
5091
5092 cfs_rq->runtime_enabled = 0;
5093
5094 if (cfs_rq_throttled(cfs_rq))
5095 unthrottle_cfs_rq(cfs_rq);
5096 }
5097 rcu_read_unlock();
5098}
5099
5100#else
5101static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5102{
5103 return rq_clock_task(rq_of(cfs_rq));
5104}
5105
5106static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5107static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5108static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5109static inline void sync_throttle(struct task_group *tg, int cpu) {}
5110static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5111
5112static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5113{
5114 return 0;
5115}
5116
5117static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5118{
5119 return 0;
5120}
5121
5122static inline int throttled_lb_pair(struct task_group *tg,
5123 int src_cpu, int dest_cpu)
5124{
5125 return 0;
5126}
5127
5128void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5129
5130#ifdef CONFIG_FAIR_GROUP_SCHED
5131static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5132#endif
5133
5134static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5135{
5136 return NULL;
5137}
5138static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5139static inline void update_runtime_enabled(struct rq *rq) {}
5140static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5141
5142#endif
5143
5144
5145
5146
5147
5148#ifdef CONFIG_SCHED_HRTICK
5149static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5150{
5151 struct sched_entity *se = &p->se;
5152 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5153
5154 SCHED_WARN_ON(task_rq(p) != rq);
5155
5156 if (rq->cfs.h_nr_running > 1) {
5157 u64 slice = sched_slice(cfs_rq, se);
5158 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5159 s64 delta = slice - ran;
5160
5161 if (delta < 0) {
5162 if (rq->curr == p)
5163 resched_curr(rq);
5164 return;
5165 }
5166 hrtick_start(rq, delta);
5167 }
5168}
5169
5170
5171
5172
5173
5174
5175static void hrtick_update(struct rq *rq)
5176{
5177 struct task_struct *curr = rq->curr;
5178
5179 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5180 return;
5181
5182 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5183 hrtick_start_fair(rq, curr);
5184}
5185#else
5186static inline void
5187hrtick_start_fair(struct rq *rq, struct task_struct *p)
5188{
5189}
5190
5191static inline void hrtick_update(struct rq *rq)
5192{
5193}
5194#endif
5195
5196
5197
5198
5199
5200
5201static void
5202enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5203{
5204 struct cfs_rq *cfs_rq;
5205 struct sched_entity *se = &p->se;
5206
5207
5208
5209
5210
5211
5212 if (p->in_iowait)
5213 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5214
5215 for_each_sched_entity(se) {
5216 if (se->on_rq)
5217 break;
5218 cfs_rq = cfs_rq_of(se);
5219 enqueue_entity(cfs_rq, se, flags);
5220
5221
5222
5223
5224
5225
5226
5227 if (cfs_rq_throttled(cfs_rq))
5228 break;
5229 cfs_rq->h_nr_running++;
5230
5231 flags = ENQUEUE_WAKEUP;
5232 }
5233
5234 for_each_sched_entity(se) {
5235 cfs_rq = cfs_rq_of(se);
5236 cfs_rq->h_nr_running++;
5237
5238 if (cfs_rq_throttled(cfs_rq))
5239 break;
5240
5241 update_load_avg(cfs_rq, se, UPDATE_TG);
5242 update_cfs_group(se);
5243 }
5244
5245 if (!se)
5246 add_nr_running(rq, 1);
5247
5248 hrtick_update(rq);
5249}
5250
5251static void set_next_buddy(struct sched_entity *se);
5252
5253
5254
5255
5256
5257
5258static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5259{
5260 struct cfs_rq *cfs_rq;
5261 struct sched_entity *se = &p->se;
5262 int task_sleep = flags & DEQUEUE_SLEEP;
5263
5264 for_each_sched_entity(se) {
5265 cfs_rq = cfs_rq_of(se);
5266 dequeue_entity(cfs_rq, se, flags);
5267
5268
5269
5270
5271
5272
5273
5274 if (cfs_rq_throttled(cfs_rq))
5275 break;
5276 cfs_rq->h_nr_running--;
5277
5278
5279 if (cfs_rq->load.weight) {
5280
5281 se = parent_entity(se);
5282
5283
5284
5285
5286 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5287 set_next_buddy(se);
5288 break;
5289 }
5290 flags |= DEQUEUE_SLEEP;
5291 }
5292
5293 for_each_sched_entity(se) {
5294 cfs_rq = cfs_rq_of(se);
5295 cfs_rq->h_nr_running--;
5296
5297 if (cfs_rq_throttled(cfs_rq))
5298 break;
5299
5300 update_load_avg(cfs_rq, se, UPDATE_TG);
5301 update_cfs_group(se);
5302 }
5303
5304 if (!se)
5305 sub_nr_running(rq, 1);
5306
5307 hrtick_update(rq);
5308}
5309
5310#ifdef CONFIG_SMP
5311
5312
5313DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5314DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5315
5316#ifdef CONFIG_NO_HZ_COMMON
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
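/*
 * With NO_HZ a CPU can miss many cpu_load[] updates while idle.  The tables
 * below let the missed ticks be caught up in O(log n): each load index has
 * its own decay factor, and after degrade_zero_ticks[idx] missed updates
 * the old value is treated as having decayed to zero.
 */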
5343#define DEGRADE_SHIFT 7
5344
5345static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5346static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5347 { 0, 0, 0, 0, 0, 0, 0, 0 },
5348 { 64, 32, 8, 0, 0, 0, 0, 0 },
5349 { 96, 72, 40, 12, 1, 0, 0, 0 },
5350 { 112, 98, 75, 43, 15, 1, 0, 0 },
5351 { 120, 112, 98, 76, 45, 16, 2, 0 }
5352};
5353
5354
5355
5356
5357
5358
5359static unsigned long
5360decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5361{
5362 int j = 0;
5363
5364 if (!missed_updates)
5365 return load;
5366
5367 if (missed_updates >= degrade_zero_ticks[idx])
5368 return 0;
5369
5370 if (idx == 1)
5371 return load >> missed_updates;
5372
5373 while (missed_updates) {
5374 if (missed_updates % 2)
5375 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5376
5377 missed_updates >>= 1;
5378 j++;
5379 }
5380 return load;
5381}
5382#endif
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
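/*
 * cpu_load_update() refreshes the per-index load history: cpu_load[0] is
 * the instantaneous load, and each higher index tracks a progressively
 * slower exponential average in which the new sample carries a weight of
 * 1/2^i, with missed NO_HZ ticks folded in via decay_load_missed().
 */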
5419static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5420 unsigned long pending_updates)
5421{
5422 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5423 int i, scale;
5424
5425 this_rq->nr_load_updates++;
5426
5427
5428 this_rq->cpu_load[0] = this_load;
5429 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5430 unsigned long old_load, new_load;
5431
5432
5433
5434 old_load = this_rq->cpu_load[i];
5435#ifdef CONFIG_NO_HZ_COMMON
5436 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5437 if (tickless_load) {
5438 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5439
5440
5441
5442
5443
5444 old_load += tickless_load;
5445 }
5446#endif
5447 new_load = this_load;
5448
5449
5450
5451
5452
5453 if (new_load > old_load)
5454 new_load += scale - 1;
5455
5456 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5457 }
5458
5459 sched_avg_update(this_rq);
5460}
5461
5462
5463static unsigned long weighted_cpuload(struct rq *rq)
5464{
5465 return cfs_rq_runnable_load_avg(&rq->cfs);
5466}
5467
5468#ifdef CONFIG_NO_HZ_COMMON
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483static void cpu_load_update_nohz(struct rq *this_rq,
5484 unsigned long curr_jiffies,
5485 unsigned long load)
5486{
5487 unsigned long pending_updates;
5488
5489 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5490 if (pending_updates) {
5491 this_rq->last_load_update_tick = curr_jiffies;
5492
5493
5494
5495
5496
5497 cpu_load_update(this_rq, load, pending_updates);
5498 }
5499}
5500
5501
5502
5503
5504
5505static void cpu_load_update_idle(struct rq *this_rq)
5506{
5507
5508
5509
5510 if (weighted_cpuload(this_rq))
5511 return;
5512
5513 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5514}
5515
5516
5517
5518
5519
5520
5521
5522void cpu_load_update_nohz_start(void)
5523{
5524 struct rq *this_rq = this_rq();
5525
5526
5527
5528
5529
5530
5531 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5532}
5533
5534
5535
5536
5537void cpu_load_update_nohz_stop(void)
5538{
5539 unsigned long curr_jiffies = READ_ONCE(jiffies);
5540 struct rq *this_rq = this_rq();
5541 unsigned long load;
5542 struct rq_flags rf;
5543
5544 if (curr_jiffies == this_rq->last_load_update_tick)
5545 return;
5546
5547 load = weighted_cpuload(this_rq);
5548 rq_lock(this_rq, &rf);
5549 update_rq_clock(this_rq);
5550 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5551 rq_unlock(this_rq, &rf);
5552}
5553#else
5554static inline void cpu_load_update_nohz(struct rq *this_rq,
5555 unsigned long curr_jiffies,
5556 unsigned long load) { }
5557#endif
5558
5559static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5560{
5561#ifdef CONFIG_NO_HZ_COMMON
5562
5563 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5564#endif
5565 cpu_load_update(this_rq, load, 1);
5566}
5567
5568
5569
5570
5571void cpu_load_update_active(struct rq *this_rq)
5572{
5573 unsigned long load = weighted_cpuload(this_rq);
5574
5575 if (tick_nohz_tick_stopped())
5576 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5577 else
5578 cpu_load_update_periodic(this_rq, load);
5579}
5580
5581
5582
5583
5584
5585
5586
5587
5588static unsigned long source_load(int cpu, int type)
5589{
5590 struct rq *rq = cpu_rq(cpu);
5591 unsigned long total = weighted_cpuload(rq);
5592
5593 if (type == 0 || !sched_feat(LB_BIAS))
5594 return total;
5595
5596 return min(rq->cpu_load[type-1], total);
5597}
5598
5599
5600
5601
5602
5603static unsigned long target_load(int cpu, int type)
5604{
5605 struct rq *rq = cpu_rq(cpu);
5606 unsigned long total = weighted_cpuload(rq);
5607
5608 if (type == 0 || !sched_feat(LB_BIAS))
5609 return total;
5610
5611 return max(rq->cpu_load[type-1], total);
5612}
5613
5614static unsigned long capacity_of(int cpu)
5615{
5616 return cpu_rq(cpu)->cpu_capacity;
5617}
5618
5619static unsigned long capacity_orig_of(int cpu)
5620{
5621 return cpu_rq(cpu)->cpu_capacity_orig;
5622}
5623
5624static unsigned long cpu_avg_load_per_task(int cpu)
5625{
5626 struct rq *rq = cpu_rq(cpu);
5627 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5628 unsigned long load_avg = weighted_cpuload(rq);
5629
5630 if (nr_running)
5631 return load_avg / nr_running;
5632
5633 return 0;
5634}
5635
5636static void record_wakee(struct task_struct *p)
5637{
5638
5639
5640
5641
5642 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5643 current->wakee_flips >>= 1;
5644 current->wakee_flip_decay_ts = jiffies;
5645 }
5646
5647 if (current->last_wakee != p) {
5648 current->last_wakee = p;
5649 current->wakee_flips++;
5650 }
5651}
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
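/*
 * wake_wide() detects one-to-many waker/wakee relationships by comparing
 * the decaying wakee_flips counters of waker and wakee against the LLC
 * size: only when both flip often (the smaller at least llc_size times, the
 * larger at least llc_size times more than that) is the wakeup considered
 * too "wide" for waker-affine placement.
 */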
5670static int wake_wide(struct task_struct *p)
5671{
5672 unsigned int master = current->wakee_flips;
5673 unsigned int slave = p->wakee_flips;
5674 int factor = this_cpu_read(sd_llc_size);
5675
5676 if (master < slave)
5677 swap(master, slave);
5678 if (slave < factor || master < slave * factor)
5679 return 0;
5680 return 1;
5681}
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
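/*
 * wake_affine_idle() - prefer an idle, cache-affine CPU for the wakeup.
 * Returns a candidate CPU, or nr_cpumask_bits when it cannot decide and
 * the weight-based comparison should be used instead.
 */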
5695static int
5696wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5697{
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710 if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5711 return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5712
5713 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5714 return this_cpu;
5715
5716 return nr_cpumask_bits;
5717}
5718
5719static int
5720wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5721 int this_cpu, int prev_cpu, int sync)
5722{
5723 s64 this_eff_load, prev_eff_load;
5724 unsigned long task_load;
5725
5726 this_eff_load = target_load(this_cpu, sd->wake_idx);
5727 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5728
5729 if (sync) {
5730 unsigned long current_load = task_h_load(current);
5731
5732 if (current_load > this_eff_load)
5733 return this_cpu;
5734
5735 this_eff_load -= current_load;
5736 }
5737
5738 task_load = task_h_load(p);
5739
5740 this_eff_load += task_load;
5741 if (sched_feat(WA_BIAS))
5742 this_eff_load *= 100;
5743 this_eff_load *= capacity_of(prev_cpu);
5744
5745 prev_eff_load -= task_load;
5746 if (sched_feat(WA_BIAS))
5747 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5748 prev_eff_load *= capacity_of(this_cpu);
5749
5750 return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
5751}
5752
5753static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5754 int prev_cpu, int sync)
5755{
5756 int this_cpu = smp_processor_id();
5757 int target = nr_cpumask_bits;
5758
5759 if (sched_feat(WA_IDLE))
5760 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5761
5762 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5763 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5764
5765 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5766 if (target == nr_cpumask_bits)
5767 return prev_cpu;
5768
5769 schedstat_inc(sd->ttwu_move_affine);
5770 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5771 return target;
5772}
5773
5774static inline unsigned long task_util(struct task_struct *p);
5775static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
5776
5777static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
5778{
5779 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
5780}
5781
5782
5783
5784
5785
5786
5787
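/*
 * find_idlest_group() - walk the groups of @sd and return the one that
 * looks least busy for @p, comparing runnable load, average load and
 * spare capacity; NULL means the local group is already the best choice.
 */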
5788static struct sched_group *
5789find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5790 int this_cpu, int sd_flag)
5791{
5792 struct sched_group *idlest = NULL, *group = sd->groups;
5793 struct sched_group *most_spare_sg = NULL;
5794 unsigned long min_runnable_load = ULONG_MAX;
5795 unsigned long this_runnable_load = ULONG_MAX;
5796 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5797 unsigned long most_spare = 0, this_spare = 0;
5798 int load_idx = sd->forkexec_idx;
5799 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5800 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5801 (sd->imbalance_pct-100) / 100;
5802
5803 if (sd_flag & SD_BALANCE_WAKE)
5804 load_idx = sd->wake_idx;
5805
5806 do {
5807 unsigned long load, avg_load, runnable_load;
5808 unsigned long spare_cap, max_spare_cap;
5809 int local_group;
5810 int i;
5811
5812
5813 if (!cpumask_intersects(sched_group_span(group),
5814 &p->cpus_allowed))
5815 continue;
5816
5817 local_group = cpumask_test_cpu(this_cpu,
5818 sched_group_span(group));
5819
5820
5821
5822
5823
5824 avg_load = 0;
5825 runnable_load = 0;
5826 max_spare_cap = 0;
5827
5828 for_each_cpu(i, sched_group_span(group)) {
5829
5830 if (local_group)
5831 load = source_load(i, load_idx);
5832 else
5833 load = target_load(i, load_idx);
5834
5835 runnable_load += load;
5836
5837 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5838
5839 spare_cap = capacity_spare_wake(i, p);
5840
5841 if (spare_cap > max_spare_cap)
5842 max_spare_cap = spare_cap;
5843 }
5844
5845
5846 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5847 group->sgc->capacity;
5848 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5849 group->sgc->capacity;
5850
5851 if (local_group) {
5852 this_runnable_load = runnable_load;
5853 this_avg_load = avg_load;
5854 this_spare = max_spare_cap;
5855 } else {
5856 if (min_runnable_load > (runnable_load + imbalance)) {
5857
5858
5859
5860
5861 min_runnable_load = runnable_load;
5862 min_avg_load = avg_load;
5863 idlest = group;
5864 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5865 (100*min_avg_load > imbalance_scale*avg_load)) {
5866
5867
5868
5869
5870 min_avg_load = avg_load;
5871 idlest = group;
5872 }
5873
5874 if (most_spare < max_spare_cap) {
5875 most_spare = max_spare_cap;
5876 most_spare_sg = group;
5877 }
5878 }
5879 } while (group = group->next, group != sd->groups);
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892 if (sd_flag & SD_BALANCE_FORK)
5893 goto skip_spare;
5894
5895 if (this_spare > task_util(p) / 2 &&
5896 imbalance_scale*this_spare > 100*most_spare)
5897 return NULL;
5898
5899 if (most_spare > task_util(p) / 2)
5900 return most_spare_sg;
5901
5902skip_spare:
5903 if (!idlest)
5904 return NULL;
5905
5906 if (min_runnable_load > (this_runnable_load + imbalance))
5907 return NULL;
5908
5909 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5910 (100*this_avg_load < imbalance_scale*min_avg_load))
5911 return NULL;
5912
5913 return idlest;
5914}
5915
5916
5917
5918
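/*
 * find_idlest_group_cpu() - within @group, pick the idle CPU with the
 * shallowest C-state (breaking ties by most recent idle timestamp), or
 * the least loaded CPU if none of the allowed CPUs are idle.
 */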
5919static int
5920find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5921{
5922 unsigned long load, min_load = ULONG_MAX;
5923 unsigned int min_exit_latency = UINT_MAX;
5924 u64 latest_idle_timestamp = 0;
5925 int least_loaded_cpu = this_cpu;
5926 int shallowest_idle_cpu = -1;
5927 int i;
5928
5929
5930 if (group->group_weight == 1)
5931 return cpumask_first(sched_group_span(group));
5932
5933
5934 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
5935 if (idle_cpu(i)) {
5936 struct rq *rq = cpu_rq(i);
5937 struct cpuidle_state *idle = idle_get_state(rq);
5938 if (idle && idle->exit_latency < min_exit_latency) {
5939
5940
5941
5942
5943
5944 min_exit_latency = idle->exit_latency;
5945 latest_idle_timestamp = rq->idle_stamp;
5946 shallowest_idle_cpu = i;
5947 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5948 rq->idle_stamp > latest_idle_timestamp) {
5949
5950
5951
5952
5953
5954 latest_idle_timestamp = rq->idle_stamp;
5955 shallowest_idle_cpu = i;
5956 }
5957 } else if (shallowest_idle_cpu == -1) {
5958 load = weighted_cpuload(cpu_rq(i));
5959 if (load < min_load) {
5960 min_load = load;
5961 least_loaded_cpu = i;
5962 }
5963 }
5964 }
5965
5966 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5967}
5968
5969static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5970 int cpu, int prev_cpu, int sd_flag)
5971{
5972 int new_cpu = cpu;
5973
5974 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
5975 return prev_cpu;
5976
5977 while (sd) {
5978 struct sched_group *group;
5979 struct sched_domain *tmp;
5980 int weight;
5981
5982 if (!(sd->flags & sd_flag)) {
5983 sd = sd->child;
5984 continue;
5985 }
5986
5987 group = find_idlest_group(sd, p, cpu, sd_flag);
5988 if (!group) {
5989 sd = sd->child;
5990 continue;
5991 }
5992
5993 new_cpu = find_idlest_group_cpu(group, p, cpu);
5994 if (new_cpu == cpu) {
5995
5996 sd = sd->child;
5997 continue;
5998 }
5999
6000
6001 cpu = new_cpu;
6002 weight = sd->span_weight;
6003 sd = NULL;
6004 for_each_domain(cpu, tmp) {
6005 if (weight <= tmp->span_weight)
6006 break;
6007 if (tmp->flags & sd_flag)
6008 sd = tmp;
6009 }
6010
6011 }
6012
6013 return new_cpu;
6014}
6015
6016#ifdef CONFIG_SCHED_SMT
6017
6018static inline void set_idle_cores(int cpu, int val)
6019{
6020 struct sched_domain_shared *sds;
6021
6022 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6023 if (sds)
6024 WRITE_ONCE(sds->has_idle_cores, val);
6025}
6026
6027static inline bool test_idle_cores(int cpu, bool def)
6028{
6029 struct sched_domain_shared *sds;
6030
6031 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6032 if (sds)
6033 return READ_ONCE(sds->has_idle_cores);
6034
6035 return def;
6036}
6037
6038
6039
6040
6041
6042
6043
6044
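/*
 * A CPU just went idle: if every SMT sibling of its core is now idle as
 * well, publish has_idle_cores for the LLC so select_idle_core() starts
 * scanning for whole idle cores again.
 */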
6045void __update_idle_core(struct rq *rq)
6046{
6047 int core = cpu_of(rq);
6048 int cpu;
6049
6050 rcu_read_lock();
6051 if (test_idle_cores(core, true))
6052 goto unlock;
6053
6054 for_each_cpu(cpu, cpu_smt_mask(core)) {
6055 if (cpu == core)
6056 continue;
6057
6058 if (!idle_cpu(cpu))
6059 goto unlock;
6060 }
6061
6062 set_idle_cores(core, 1);
6063unlock:
6064 rcu_read_unlock();
6065}
6066
6067
6068
6069
6070
6071
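/*
 * Scan the LLC for a fully idle core. The scan only runs while the
 * has_idle_cores hint is set, and clears the hint when it comes up empty,
 * keeping the common case cheap.
 */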
6072static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6073{
6074 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6075 int core, cpu;
6076
6077 if (!static_branch_likely(&sched_smt_present))
6078 return -1;
6079
6080 if (!test_idle_cores(target, false))
6081 return -1;
6082
6083 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6084
6085 for_each_cpu_wrap(core, cpus, target) {
6086 bool idle = true;
6087
6088 for_each_cpu(cpu, cpu_smt_mask(core)) {
6089 cpumask_clear_cpu(cpu, cpus);
6090 if (!idle_cpu(cpu))
6091 idle = false;
6092 }
6093
6094 if (idle)
6095 return core;
6096 }
6097
6098
6099
6100
6101 set_idle_cores(target, 0);
6102
6103 return -1;
6104}
6105
6106
6107
6108
6109static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6110{
6111 int cpu;
6112
6113 if (!static_branch_likely(&sched_smt_present))
6114 return -1;
6115
6116 for_each_cpu(cpu, cpu_smt_mask(target)) {
6117 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6118 continue;
6119 if (idle_cpu(cpu))
6120 return cpu;
6121 }
6122
6123 return -1;
6124}
6125
6126#else
6127
6128static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6129{
6130 return -1;
6131}
6132
6133static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6134{
6135 return -1;
6136}
6137
6138#endif
6139
6140
6141
6142
6143
6144
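/*
 * Scan the LLC domain for an idle CPU, starting at @target. The cost is
 * self-regulating: SIS_AVG_CPU skips the scan entirely when the average
 * scan cost exceeds the (scaled-down) average idle time, and SIS_PROP
 * bounds the number of CPUs inspected per scan.
 */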
6145static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6146{
6147 struct sched_domain *this_sd;
6148 u64 avg_cost, avg_idle;
6149 u64 time, cost;
6150 s64 delta;
6151 int cpu, nr = INT_MAX;
6152
6153 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6154 if (!this_sd)
6155 return -1;
6156
6157
6158
6159
6160
6161 avg_idle = this_rq()->avg_idle / 512;
6162 avg_cost = this_sd->avg_scan_cost + 1;
6163
6164 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
6165 return -1;
6166
6167 if (sched_feat(SIS_PROP)) {
6168 u64 span_avg = sd->span_weight * avg_idle;
6169 if (span_avg > 4*avg_cost)
6170 nr = div_u64(span_avg, avg_cost);
6171 else
6172 nr = 4;
6173 }
6174
6175 time = local_clock();
6176
6177 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6178 if (!--nr)
6179 return -1;
6180 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6181 continue;
6182 if (idle_cpu(cpu))
6183 break;
6184 }
6185
6186 time = local_clock() - time;
6187 cost = this_sd->avg_scan_cost;
6188 delta = (s64)(time - cost) / 8;
6189 this_sd->avg_scan_cost += delta;
6190
6191 return cpu;
6192}
6193
6194
6195
6196
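/*
 * Try to place the wakeup on an idle CPU close to @target, in order of
 * preference: @target itself, the previous CPU, the recently used CPU,
 * a fully idle core, any idle CPU in the LLC, then an idle SMT sibling.
 */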
6197static int select_idle_sibling(struct task_struct *p, int prev, int target)
6198{
6199 struct sched_domain *sd;
6200 int i, recent_used_cpu;
6201
6202 if (idle_cpu(target))
6203 return target;
6204
6205
6206
6207
6208 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6209 return prev;
6210
6211
6212 recent_used_cpu = p->recent_used_cpu;
6213 if (recent_used_cpu != prev &&
6214 recent_used_cpu != target &&
6215 cpus_share_cache(recent_used_cpu, target) &&
6216 idle_cpu(recent_used_cpu) &&
6217 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6218
6219
6220
6221
6222 p->recent_used_cpu = prev;
6223 return recent_used_cpu;
6224 }
6225
6226 sd = rcu_dereference(per_cpu(sd_llc, target));
6227 if (!sd)
6228 return target;
6229
6230 i = select_idle_core(p, sd, target);
6231 if ((unsigned)i < nr_cpumask_bits)
6232 return i;
6233
6234 i = select_idle_cpu(p, sd, target);
6235 if ((unsigned)i < nr_cpumask_bits)
6236 return i;
6237
6238 i = select_idle_smt(p, sd, target);
6239 if ((unsigned)i < nr_cpumask_bits)
6240 return i;
6241
6242 return target;
6243}
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
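/*
 * cpu_util() - the PELT utilization of the CPU's root cfs_rq, clamped to
 * the CPU's original capacity so transient over-estimates never report
 * more than 100% of the CPU.
 */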
6271static unsigned long cpu_util(int cpu)
6272{
6273 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
6274 unsigned long capacity = capacity_orig_of(cpu);
6275
6276 return (util >= capacity) ? capacity : util;
6277}
6278
6279static inline unsigned long task_util(struct task_struct *p)
6280{
6281 return p->se.avg.util_avg;
6282}
6283
6284
6285
6286
6287
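/*
 * cpu_util_wake() - cpu_util() with the waking task @p's own utilization
 * removed, if @p is still accounted to @cpu.
 */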
6288static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
6289{
6290 unsigned long util, capacity;
6291
6292
6293 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
6294 return cpu_util(cpu);
6295
6296 capacity = capacity_orig_of(cpu);
6297 util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
6298
6299 return (util >= capacity) ? capacity : util;
6300}
6301
6302
6303
6304
6305
6306
6307
6308
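/*
 * wake_cap() - on asymmetric-capacity systems, report whether the waking
 * task is too big for the capacities of @cpu/@prev_cpu; a non-zero result
 * makes the caller skip the affine fast path and look for a bigger CPU.
 */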
6309static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6310{
6311 long min_cap, max_cap;
6312
6313 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6314 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6315
6316
6317 if (max_cap - min_cap < max_cap >> 3)
6318 return 0;
6319
6320
6321 sync_entity_load_avg(&p->se);
6322
6323 return min_cap * 1024 < task_util(p) * capacity_margin;
6324}
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
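/*
 * select_task_rq_fair() - choose a runqueue for a waking (or forking or
 * exec'ing) task. Affine wakeups stay near the waker via wake_affine()
 * and select_idle_sibling(); otherwise the domain hierarchy is searched
 * with find_idlest_cpu() for the balance flag given in @sd_flag.
 */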
6338static int
6339select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6340{
6341 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
6342 int cpu = smp_processor_id();
6343 int new_cpu = prev_cpu;
6344 int want_affine = 0;
6345 int sync = wake_flags & WF_SYNC;
6346
6347 if (sd_flag & SD_BALANCE_WAKE) {
6348 record_wakee(p);
6349 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
6350 && cpumask_test_cpu(cpu, &p->cpus_allowed);
6351 }
6352
6353 rcu_read_lock();
6354 for_each_domain(cpu, tmp) {
6355 if (!(tmp->flags & SD_LOAD_BALANCE))
6356 break;
6357
6358
6359
6360
6361
6362 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6363 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6364 affine_sd = tmp;
6365 break;
6366 }
6367
6368 if (tmp->flags & sd_flag)
6369 sd = tmp;
6370 else if (!want_affine)
6371 break;
6372 }
6373
6374 if (affine_sd) {
6375 sd = NULL;
6376 if (cpu == prev_cpu)
6377 goto pick_cpu;
6378
6379 new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
6380 }
6381
6382 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
6383
6384
6385
6386
6387
6388 sync_entity_load_avg(&p->se);
6389 }
6390
6391 if (!sd) {
6392pick_cpu:
6393 if (sd_flag & SD_BALANCE_WAKE) {
6394 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6395
6396 if (want_affine)
6397 current->recent_used_cpu = cpu;
6398 }
6399 } else {
6400 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6401 }
6402 rcu_read_unlock();
6403
6404 return new_cpu;
6405}
6406
6407static void detach_entity_cfs_rq(struct sched_entity *se);
6408
6409
6410
6411
6412
6413
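/*
 * Called when a CFS task is moved to another CPU. A still-waking task has
 * the old cfs_rq's min_vruntime subtracted (with a retry loop on 32-bit,
 * where the read is not atomic), its load-average contribution is either
 * detached or queued for removal, and last_update_time is cleared so the
 * PELT signal re-attaches cleanly on the destination CPU.
 */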
6414static void migrate_task_rq_fair(struct task_struct *p)
6415{
6416
6417
6418
6419
6420
6421
6422 if (p->state == TASK_WAKING) {
6423 struct sched_entity *se = &p->se;
6424 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6425 u64 min_vruntime;
6426
6427#ifndef CONFIG_64BIT
6428 u64 min_vruntime_copy;
6429
6430 do {
6431 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6432 smp_rmb();
6433 min_vruntime = cfs_rq->min_vruntime;
6434 } while (min_vruntime != min_vruntime_copy);
6435#else
6436 min_vruntime = cfs_rq->min_vruntime;
6437#endif
6438
6439 se->vruntime -= min_vruntime;
6440 }
6441
6442 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6443
6444
6445
6446
6447 lockdep_assert_held(&task_rq(p)->lock);
6448 detach_entity_cfs_rq(&p->se);
6449
6450 } else {
6451
6452
6453
6454
6455
6456
6457
6458
6459 remove_entity_load_avg(&p->se);
6460 }
6461
6462
6463 p->se.avg.last_update_time = 0;
6464
6465
6466 p->se.exec_start = 0;
6467}
6468
6469static void task_dead_fair(struct task_struct *p)
6470{
6471 remove_entity_load_avg(&p->se);
6472}
6473#endif
6474
6475static unsigned long wakeup_gran(struct sched_entity *se)
6476{
6477 unsigned long gran = sysctl_sched_wakeup_granularity;
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492 return calc_delta_fair(gran, se);
6493}
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
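/*
 * Should the waking entity @se preempt @curr? Returns 1 when @curr leads
 * @se in virtual time by more than the weight-scaled wakeup granularity,
 * 0 when the lead is within the granularity, and -1 when @curr is behind.
 */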
6509static int
6510wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6511{
6512 s64 gran, vdiff = curr->vruntime - se->vruntime;
6513
6514 if (vdiff <= 0)
6515 return -1;
6516
6517 gran = wakeup_gran(se);
6518 if (vdiff > gran)
6519 return 1;
6520
6521 return 0;
6522}
6523
6524static void set_last_buddy(struct sched_entity *se)
6525{
6526 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6527 return;
6528
6529 for_each_sched_entity(se) {
6530 if (SCHED_WARN_ON(!se->on_rq))
6531 return;
6532 cfs_rq_of(se)->last = se;
6533 }
6534}
6535
6536static void set_next_buddy(struct sched_entity *se)
6537{
6538 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6539 return;
6540
6541 for_each_sched_entity(se) {
6542 if (SCHED_WARN_ON(!se->on_rq))
6543 return;
6544 cfs_rq_of(se)->next = se;
6545 }
6546}
6547
6548static void set_skip_buddy(struct sched_entity *se)
6549{
6550 for_each_sched_entity(se)
6551 cfs_rq_of(se)->skip = se;
6552}
6553
6554
6555
6556
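/*
 * Preempt the current task with a newly woken task if needed.
 */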
6557static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6558{
6559 struct task_struct *curr = rq->curr;
6560 struct sched_entity *se = &curr->se, *pse = &p->se;
6561 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6562 int scale = cfs_rq->nr_running >= sched_nr_latency;
6563 int next_buddy_marked = 0;
6564
6565 if (unlikely(se == pse))
6566 return;
6567
6568
6569
6570
6571
6572
6573
6574 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6575 return;
6576
6577 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6578 set_next_buddy(pse);
6579 next_buddy_marked = 1;
6580 }
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592 if (test_tsk_need_resched(curr))
6593 return;
6594
6595
6596 if (unlikely(curr->policy == SCHED_IDLE) &&
6597 likely(p->policy != SCHED_IDLE))
6598 goto preempt;
6599
6600
6601
6602
6603
6604 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6605 return;
6606
6607 find_matching_se(&se, &pse);
6608 update_curr(cfs_rq_of(se));
6609 BUG_ON(!pse);
6610 if (wakeup_preempt_entity(se, pse) == 1) {
6611
6612
6613
6614
6615 if (!next_buddy_marked)
6616 set_next_buddy(pse);
6617 goto preempt;
6618 }
6619
6620 return;
6621
6622preempt:
6623 resched_curr(rq);
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633 if (unlikely(!se->on_rq || curr == rq->idle))
6634 return;
6635
6636 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6637 set_last_buddy(se);
6638}
6639
6640static struct task_struct *
6641pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6642{
6643 struct cfs_rq *cfs_rq = &rq->cfs;
6644 struct sched_entity *se;
6645 struct task_struct *p;
6646 int new_tasks;
6647
6648again:
6649 if (!cfs_rq->nr_running)
6650 goto idle;
6651
6652#ifdef CONFIG_FAIR_GROUP_SCHED
6653 if (prev->sched_class != &fair_sched_class)
6654 goto simple;
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664 do {
6665 struct sched_entity *curr = cfs_rq->curr;
6666
6667
6668
6669
6670
6671
6672
6673 if (curr) {
6674 if (curr->on_rq)
6675 update_curr(cfs_rq);
6676 else
6677 curr = NULL;
6678
6679
6680
6681
6682
6683
6684
6685 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6686 cfs_rq = &rq->cfs;
6687
6688 if (!cfs_rq->nr_running)
6689 goto idle;
6690
6691 goto simple;
6692 }
6693 }
6694
6695 se = pick_next_entity(cfs_rq, curr);
6696 cfs_rq = group_cfs_rq(se);
6697 } while (cfs_rq);
6698
6699 p = task_of(se);
6700
6701
6702
6703
6704
6705
6706 if (prev != p) {
6707 struct sched_entity *pse = &prev->se;
6708
6709 while (!(cfs_rq = is_same_group(se, pse))) {
6710 int se_depth = se->depth;
6711 int pse_depth = pse->depth;
6712
6713 if (se_depth <= pse_depth) {
6714 put_prev_entity(cfs_rq_of(pse), pse);
6715 pse = parent_entity(pse);
6716 }
6717 if (se_depth >= pse_depth) {
6718 set_next_entity(cfs_rq_of(se), se);
6719 se = parent_entity(se);
6720 }
6721 }
6722
6723 put_prev_entity(cfs_rq, pse);
6724 set_next_entity(cfs_rq, se);
6725 }
6726
6727 goto done;
6728simple:
6729#endif
6730
6731 put_prev_task(rq, prev);
6732
6733 do {
6734 se = pick_next_entity(cfs_rq, NULL);
6735 set_next_entity(cfs_rq, se);
6736 cfs_rq = group_cfs_rq(se);
6737 } while (cfs_rq);
6738
6739 p = task_of(se);
6740
6741done: __maybe_unused
6742#ifdef CONFIG_SMP
6743
6744
6745
6746
6747
6748 list_move(&p->se.group_node, &rq->cfs_tasks);
6749#endif
6750
6751 if (hrtick_enabled(rq))
6752 hrtick_start_fair(rq, p);
6753
6754 return p;
6755
6756idle:
6757 new_tasks = idle_balance(rq, rf);
6758
6759
6760
6761
6762
6763
6764 if (new_tasks < 0)
6765 return RETRY_TASK;
6766
6767 if (new_tasks > 0)
6768 goto again;
6769
6770 return NULL;
6771}
6772
6773
6774
6775
6776static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
6777{
6778 struct sched_entity *se = &prev->se;
6779 struct cfs_rq *cfs_rq;
6780
6781 for_each_sched_entity(se) {
6782 cfs_rq = cfs_rq_of(se);
6783 put_prev_entity(cfs_rq, se);
6784 }
6785}
6786
6787
6788
6789
6790
6791
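/*
 * sched_yield() support: charge the runtime consumed so far and mark the
 * yielding entity as the "skip" buddy so pick_next_entity() tends to
 * choose somebody else, without actually dequeueing the task.
 */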
6792static void yield_task_fair(struct rq *rq)
6793{
6794 struct task_struct *curr = rq->curr;
6795 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6796 struct sched_entity *se = &curr->se;
6797
6798
6799
6800
6801 if (unlikely(rq->nr_running == 1))
6802 return;
6803
6804 clear_buddies(cfs_rq, se);
6805
6806 if (curr->policy != SCHED_BATCH) {
6807 update_rq_clock(rq);
6808
6809
6810
6811 update_curr(cfs_rq);
6812
6813
6814
6815
6816
6817 rq_clock_skip_update(rq, true);
6818 }
6819
6820 set_skip_buddy(se);
6821}
6822
6823static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6824{
6825 struct sched_entity *se = &p->se;
6826
6827
6828 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
6829 return false;
6830
6831
6832 set_next_buddy(se);
6833
6834 yield_task_fair(rq);
6835
6836 return true;
6837}
6838
6839#ifdef CONFIG_SMP
6840
6957
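/*
 * Load-balancing proper. The periodic, newly-idle and NOHZ balancers
 * below gather per-group statistics, pick the busiest group and runqueue
 * in a sched_domain, and detach/attach tasks to even out weighted load,
 * subject to the LBF_* constraints tracked in struct lb_env.
 */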
6958static unsigned long __read_mostly max_load_balance_interval = HZ/10;
6959
6960enum fbq_type { regular, remote, all };
6961
6962#define LBF_ALL_PINNED 0x01
6963#define LBF_NEED_BREAK 0x02
6964#define LBF_DST_PINNED 0x04
6965#define LBF_SOME_PINNED 0x08
6966
6967struct lb_env {
6968 struct sched_domain *sd;
6969
6970 struct rq *src_rq;
6971 int src_cpu;
6972
6973 int dst_cpu;
6974 struct rq *dst_rq;
6975
6976 struct cpumask *dst_grpmask;
6977 int new_dst_cpu;
6978 enum cpu_idle_type idle;
6979 long imbalance;
6980
6981 struct cpumask *cpus;
6982
6983 unsigned int flags;
6984
6985 unsigned int loop;
6986 unsigned int loop_break;
6987 unsigned int loop_max;
6988
6989 enum fbq_type fbq_type;
6990 struct list_head tasks;
6991};
6992
6993
6994
6995
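/*
 * Is @p likely cache-hot on its source CPU? Buddy tasks count as hot when
 * CACHE_HOT_BUDDY is enabled and the destination is busy; otherwise the
 * time since @p last started running is compared against
 * sysctl_sched_migration_cost.
 */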
6996static int task_hot(struct task_struct *p, struct lb_env *env)
6997{
6998 s64 delta;
6999
7000 lockdep_assert_held(&env->src_rq->lock);
7001
7002 if (p->sched_class != &fair_sched_class)
7003 return 0;
7004
7005 if (unlikely(p->policy == SCHED_IDLE))
7006 return 0;
7007
7008
7009
7010
7011 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7012 (&p->se == cfs_rq_of(&p->se)->next ||
7013 &p->se == cfs_rq_of(&p->se)->last))
7014 return 1;
7015
7016 if (sysctl_sched_migration_cost == -1)
7017 return 1;
7018 if (sysctl_sched_migration_cost == 0)
7019 return 0;
7020
7021 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7022
7023 return delta < (s64)sysctl_sched_migration_cost;
7024}
7025
7026#ifdef CONFIG_NUMA_BALANCING
7027
7028
7029
7030
7031
7032static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7033{
7034 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7035 unsigned long src_faults, dst_faults;
7036 int src_nid, dst_nid;
7037
7038 if (!static_branch_likely(&sched_numa_balancing))
7039 return -1;
7040
7041 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7042 return -1;
7043
7044 src_nid = cpu_to_node(env->src_cpu);
7045 dst_nid = cpu_to_node(env->dst_cpu);
7046
7047 if (src_nid == dst_nid)
7048 return -1;
7049
7050
7051 if (src_nid == p->numa_preferred_nid) {
7052 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7053 return 1;
7054 else
7055 return -1;
7056 }
7057
7058
7059 if (dst_nid == p->numa_preferred_nid)
7060 return 0;
7061
7062
7063 if (env->idle != CPU_NOT_IDLE)
7064 return -1;
7065
7066 if (numa_group) {
7067 src_faults = group_faults(p, src_nid);
7068 dst_faults = group_faults(p, dst_nid);
7069 } else {
7070 src_faults = task_faults(p, src_nid);
7071 dst_faults = task_faults(p, dst_nid);
7072 }
7073
7074 return dst_faults < src_faults;
7075}
7076
7077#else
7078static inline int migrate_degrades_locality(struct task_struct *p,
7079 struct lb_env *env)
7080{
7081 return -1;
7082}
7083#endif
7084
7085
7086
7087
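/*
 * can_migrate_task() - may @p be moved to env->dst_cpu? Throttled
 * src/dst pairs, affinity violations and currently running tasks are
 * rejected; cache-hot tasks are only moved once the domain has failed to
 * balance more than cache_nice_tries times.
 */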
7088static
7089int can_migrate_task(struct task_struct *p, struct lb_env *env)
7090{
7091 int tsk_cache_hot;
7092
7093 lockdep_assert_held(&env->src_rq->lock);
7094
7095
7096
7097
7098
7099
7100
7101
7102 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7103 return 0;
7104
7105 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
7106 int cpu;
7107
7108 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7109
7110 env->flags |= LBF_SOME_PINNED;
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7121 return 0;
7122
7123
7124 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7125 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7126 env->flags |= LBF_DST_PINNED;
7127 env->new_dst_cpu = cpu;
7128 break;
7129 }
7130 }
7131
7132 return 0;
7133 }
7134
7135
7136 env->flags &= ~LBF_ALL_PINNED;
7137
7138 if (task_running(env->src_rq, p)) {
7139 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7140 return 0;
7141 }
7142
7143
7144
7145
7146
7147
7148
7149 tsk_cache_hot = migrate_degrades_locality(p, env);
7150 if (tsk_cache_hot == -1)
7151 tsk_cache_hot = task_hot(p, env);
7152
7153 if (tsk_cache_hot <= 0 ||
7154 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7155 if (tsk_cache_hot == 1) {
7156 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7157 schedstat_inc(p->se.statistics.nr_forced_migrations);
7158 }
7159 return 1;
7160 }
7161
7162 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7163 return 0;
7164}
7165
7166
7167
7168
7169static void detach_task(struct task_struct *p, struct lb_env *env)
7170{
7171 lockdep_assert_held(&env->src_rq->lock);
7172
7173 p->on_rq = TASK_ON_RQ_MIGRATING;
7174 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7175 set_task_cpu(p, env->dst_cpu);
7176}
7177
7178
7179
7180
7181
7182
7183
7184static struct task_struct *detach_one_task(struct lb_env *env)
7185{
7186 struct task_struct *p;
7187
7188 lockdep_assert_held(&env->src_rq->lock);
7189
7190 list_for_each_entry_reverse(p,
7191 &env->src_rq->cfs_tasks, se.group_node) {
7192 if (!can_migrate_task(p, env))
7193 continue;
7194
7195 detach_task(p, env);
7196
7197
7198
7199
7200
7201
7202
7203 schedstat_inc(env->sd->lb_gained[env->idle]);
7204 return p;
7205 }
7206 return NULL;
7207}
7208
7209static const unsigned int sched_nr_migrate_break = 32;
7210
7211
7212
7213
7214
7215
7216
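/*
 * detach_tasks() - detach up to env->imbalance weighted load worth of
 * tasks from the busiest runqueue onto env->tasks, to be re-attached on
 * the destination by attach_tasks(). Returns the number of tasks detached.
 */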
7217static int detach_tasks(struct lb_env *env)
7218{
7219 struct list_head *tasks = &env->src_rq->cfs_tasks;
7220 struct task_struct *p;
7221 unsigned long load;
7222 int detached = 0;
7223
7224 lockdep_assert_held(&env->src_rq->lock);
7225
7226 if (env->imbalance <= 0)
7227 return 0;
7228
7229 while (!list_empty(tasks)) {
7230
7231
7232
7233
7234 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7235 break;
7236
7237 p = list_last_entry(tasks, struct task_struct, se.group_node);
7238
7239 env->loop++;
7240
7241 if (env->loop > env->loop_max)
7242 break;
7243
7244
7245 if (env->loop > env->loop_break) {
7246 env->loop_break += sched_nr_migrate_break;
7247 env->flags |= LBF_NEED_BREAK;
7248 break;
7249 }
7250
7251 if (!can_migrate_task(p, env))
7252 goto next;
7253
7254 load = task_h_load(p);
7255
7256 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7257 goto next;
7258
7259 if ((load / 2) > env->imbalance)
7260 goto next;
7261
7262 detach_task(p, env);
7263 list_add(&p->se.group_node, &env->tasks);
7264
7265 detached++;
7266 env->imbalance -= load;
7267
7268#ifdef CONFIG_PREEMPT
7269
7270
7271
7272
7273
7274 if (env->idle == CPU_NEWLY_IDLE)
7275 break;
7276#endif
7277
7278
7279
7280
7281
7282 if (env->imbalance <= 0)
7283 break;
7284
7285 continue;
7286next:
7287 list_move(&p->se.group_node, tasks);
7288 }
7289
7290
7291
7292
7293
7294
7295 schedstat_add(env->sd->lb_gained[env->idle], detached);
7296
7297 return detached;
7298}
7299
7300
7301
7302
7303static void attach_task(struct rq *rq, struct task_struct *p)
7304{
7305 lockdep_assert_held(&rq->lock);
7306
7307 BUG_ON(task_rq(p) != rq);
7308 activate_task(rq, p, ENQUEUE_NOCLOCK);
7309 p->on_rq = TASK_ON_RQ_QUEUED;
7310 check_preempt_curr(rq, p, 0);
7311}
7312
7313
7314
7315
7316
7317static void attach_one_task(struct rq *rq, struct task_struct *p)
7318{
7319 struct rq_flags rf;
7320
7321 rq_lock(rq, &rf);
7322 update_rq_clock(rq);
7323 attach_task(rq, p);
7324 rq_unlock(rq, &rf);
7325}
7326
7327
7328
7329
7330
7331static void attach_tasks(struct lb_env *env)
7332{
7333 struct list_head *tasks = &env->tasks;
7334 struct task_struct *p;
7335 struct rq_flags rf;
7336
7337 rq_lock(env->dst_rq, &rf);
7338 update_rq_clock(env->dst_rq);
7339
7340 while (!list_empty(tasks)) {
7341 p = list_first_entry(tasks, struct task_struct, se.group_node);
7342 list_del_init(&p->se.group_node);
7343
7344 attach_task(env->dst_rq, p);
7345 }
7346
7347 rq_unlock(env->dst_rq, &rf);
7348}
7349
7350#ifdef CONFIG_FAIR_GROUP_SCHED
7351
7352static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7353{
7354 if (cfs_rq->load.weight)
7355 return false;
7356
7357 if (cfs_rq->avg.load_sum)
7358 return false;
7359
7360 if (cfs_rq->avg.util_sum)
7361 return false;
7362
7363 if (cfs_rq->avg.runnable_load_sum)
7364 return false;
7365
7366 return true;
7367}
7368
7369static void update_blocked_averages(int cpu)
7370{
7371 struct rq *rq = cpu_rq(cpu);
7372 struct cfs_rq *cfs_rq, *pos;
7373 struct rq_flags rf;
7374
7375 rq_lock_irqsave(rq, &rf);
7376 update_rq_clock(rq);
7377
7378
7379
7380
7381
7382 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7383 struct sched_entity *se;
7384
7385
7386 if (throttled_hierarchy(cfs_rq))
7387 continue;
7388
7389 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
7390 update_tg_load_avg(cfs_rq, 0);
7391
7392
7393 se = cfs_rq->tg->se[cpu];
7394 if (se && !skip_blocked_update(se))
7395 update_load_avg(cfs_rq_of(se), se, 0);
7396
7397
7398
7399
7400
7401 if (cfs_rq_is_decayed(cfs_rq))
7402 list_del_leaf_cfs_rq(cfs_rq);
7403 }
7404 rq_unlock_irqrestore(rq, &rf);
7405}
7406
7407
7408
7409
7410
7411
7412static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7413{
7414 struct rq *rq = rq_of(cfs_rq);
7415 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7416 unsigned long now = jiffies;
7417 unsigned long load;
7418
7419 if (cfs_rq->last_h_load_update == now)
7420 return;
7421
7422 cfs_rq->h_load_next = NULL;
7423 for_each_sched_entity(se) {
7424 cfs_rq = cfs_rq_of(se);
7425 cfs_rq->h_load_next = se;
7426 if (cfs_rq->last_h_load_update == now)
7427 break;
7428 }
7429
7430 if (!se) {
7431 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7432 cfs_rq->last_h_load_update = now;
7433 }
7434
7435 while ((se = cfs_rq->h_load_next) != NULL) {
7436 load = cfs_rq->h_load;
7437 load = div64_ul(load * se->avg.load_avg,
7438 cfs_rq_load_avg(cfs_rq) + 1);
7439 cfs_rq = group_cfs_rq(se);
7440 cfs_rq->h_load = load;
7441 cfs_rq->last_h_load_update = now;
7442 }
7443}
7444
7445static unsigned long task_h_load(struct task_struct *p)
7446{
7447 struct cfs_rq *cfs_rq = task_cfs_rq(p);
7448
7449 update_cfs_rq_h_load(cfs_rq);
7450 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7451 cfs_rq_load_avg(cfs_rq) + 1);
7452}
7453#else
7454static inline void update_blocked_averages(int cpu)
7455{
7456 struct rq *rq = cpu_rq(cpu);
7457 struct cfs_rq *cfs_rq = &rq->cfs;
7458 struct rq_flags rf;
7459
7460 rq_lock_irqsave(rq, &rf);
7461 update_rq_clock(rq);
7462 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7463 rq_unlock_irqrestore(rq, &rf);
7464}
7465
7466static unsigned long task_h_load(struct task_struct *p)
7467{
7468 return p->se.avg.load_avg;
7469}
7470#endif
7471
7472
7473
7474enum group_type {
7475 group_other = 0,
7476 group_imbalanced,
7477 group_overloaded,
7478};
7479
7480
7481
7482
7483struct sg_lb_stats {
7484 unsigned long avg_load;
7485 unsigned long group_load;
7486 unsigned long sum_weighted_load;
7487 unsigned long load_per_task;
7488 unsigned long group_capacity;
7489 unsigned long group_util;
7490 unsigned int sum_nr_running;
7491 unsigned int idle_cpus;
7492 unsigned int group_weight;
7493 enum group_type group_type;
7494 int group_no_capacity;
7495#ifdef CONFIG_NUMA_BALANCING
7496 unsigned int nr_numa_running;
7497 unsigned int nr_preferred_running;
7498#endif
7499};
7500
7501
7502
7503
7504
7505struct sd_lb_stats {
7506 struct sched_group *busiest;
7507 struct sched_group *local;
7508 unsigned long total_running;
7509 unsigned long total_load;
7510 unsigned long total_capacity;
7511 unsigned long avg_load;
7512
7513 struct sg_lb_stats busiest_stat;
7514 struct sg_lb_stats local_stat;
7515};
7516
7517static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7518{
7519
7520
7521
7522
7523
7524
7525 *sds = (struct sd_lb_stats){
7526 .busiest = NULL,
7527 .local = NULL,
7528 .total_running = 0UL,
7529 .total_load = 0UL,
7530 .total_capacity = 0UL,
7531 .busiest_stat = {
7532 .avg_load = 0UL,
7533 .sum_nr_running = 0,
7534 .group_type = group_other,
7535 },
7536 };
7537}
7538
7539
7540
7541
7542
7543
7544
7545
7546static inline int get_sd_load_idx(struct sched_domain *sd,
7547 enum cpu_idle_type idle)
7548{
7549 int load_idx;
7550
7551 switch (idle) {
7552 case CPU_NOT_IDLE:
7553 load_idx = sd->busy_idx;
7554 break;
7555
7556 case CPU_NEWLY_IDLE:
7557 load_idx = sd->newidle_idx;
7558 break;
7559 default:
7560 load_idx = sd->idle_idx;
7561 break;
7562 }
7563
7564 return load_idx;
7565}
7566
7567static unsigned long scale_rt_capacity(int cpu)
7568{
7569 struct rq *rq = cpu_rq(cpu);
7570 u64 total, used, age_stamp, avg;
7571 s64 delta;
7572
7573
7574
7575
7576
7577 age_stamp = READ_ONCE(rq->age_stamp);
7578 avg = READ_ONCE(rq->rt_avg);
7579 delta = __rq_clock_broken(rq) - age_stamp;
7580
7581 if (unlikely(delta < 0))
7582 delta = 0;
7583
7584 total = sched_avg_period() + delta;
7585
7586 used = div_u64(avg, total);
7587
7588 if (likely(used < SCHED_CAPACITY_SCALE))
7589 return SCHED_CAPACITY_SCALE - used;
7590
7591 return 1;
7592}
7593
7594static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7595{
7596 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
7597 struct sched_group *sdg = sd->groups;
7598
7599 cpu_rq(cpu)->cpu_capacity_orig = capacity;
7600
7601 capacity *= scale_rt_capacity(cpu);
7602 capacity >>= SCHED_CAPACITY_SHIFT;
7603
7604 if (!capacity)
7605 capacity = 1;
7606
7607 cpu_rq(cpu)->cpu_capacity = capacity;
7608 sdg->sgc->capacity = capacity;
7609 sdg->sgc->min_capacity = capacity;
7610}
7611
7612void update_group_capacity(struct sched_domain *sd, int cpu)
7613{
7614 struct sched_domain *child = sd->child;
7615 struct sched_group *group, *sdg = sd->groups;
7616 unsigned long capacity, min_capacity;
7617 unsigned long interval;
7618
7619 interval = msecs_to_jiffies(sd->balance_interval);
7620 interval = clamp(interval, 1UL, max_load_balance_interval);
7621 sdg->sgc->next_update = jiffies + interval;
7622
7623 if (!child) {
7624 update_cpu_capacity(sd, cpu);
7625 return;
7626 }
7627
7628 capacity = 0;
7629 min_capacity = ULONG_MAX;
7630
7631 if (child->flags & SD_OVERLAP) {
7632
7633
7634
7635
7636
7637 for_each_cpu(cpu, sched_group_span(sdg)) {
7638 struct sched_group_capacity *sgc;
7639 struct rq *rq = cpu_rq(cpu);
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652 if (unlikely(!rq->sd)) {
7653 capacity += capacity_of(cpu);
7654 } else {
7655 sgc = rq->sd->groups->sgc;
7656 capacity += sgc->capacity;
7657 }
7658
7659 min_capacity = min(capacity, min_capacity);
7660 }
7661 } else {
7662
7663
7664
7665
7666
7667 group = child->groups;
7668 do {
7669 struct sched_group_capacity *sgc = group->sgc;
7670
7671 capacity += sgc->capacity;
7672 min_capacity = min(sgc->min_capacity, min_capacity);
7673 group = group->next;
7674 } while (group != child->groups);
7675 }
7676
7677 sdg->sgc->capacity = capacity;
7678 sdg->sgc->min_capacity = min_capacity;
7679}
7680
7681
7682
7683
7684
7685
7686static inline int
7687check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7688{
7689 return ((rq->cpu_capacity * sd->imbalance_pct) <
7690 (rq->cpu_capacity_orig * 100));
7691}
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
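/*
 * sg_imbalanced() reports the group-imbalance flag, which load_balance()
 * sets on the parent's groups (via LBF_SOME_PINNED) when task affinity
 * prevented it from fully balancing a child domain.
 */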
7722static inline int sg_imbalanced(struct sched_group *group)
7723{
7724 return group->sgc->imbalance;
7725}
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739static inline bool
7740group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
7741{
7742 if (sgs->sum_nr_running < sgs->group_weight)
7743 return true;
7744
7745 if ((sgs->group_capacity * 100) >
7746 (sgs->group_util * env->sd->imbalance_pct))
7747 return true;
7748
7749 return false;
7750}
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760static inline bool
7761group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7762{
7763 if (sgs->sum_nr_running <= sgs->group_weight)
7764 return false;
7765
7766 if ((sgs->group_capacity * 100) <
7767 (sgs->group_util * env->sd->imbalance_pct))
7768 return true;
7769
7770 return false;
7771}
7772
7773
7774
7775
7776
7777static inline bool
7778group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7779{
7780 return sg->sgc->min_capacity * capacity_margin <
7781 ref->sgc->min_capacity * 1024;
7782}
7783
7784static inline enum
7785group_type group_classify(struct sched_group *group,
7786 struct sg_lb_stats *sgs)
7787{
7788 if (sgs->group_no_capacity)
7789 return group_overloaded;
7790
7791 if (sg_imbalanced(group))
7792 return group_imbalanced;
7793
7794 return group_other;
7795}
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
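/*
 * update_sg_lb_stats() - fill @sgs with the group's load, utilization,
 * running-task and idle-CPU counts, then classify the group; *overload is
 * set when any CPU in the group runs more than one task.
 */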
7806static inline void update_sg_lb_stats(struct lb_env *env,
7807 struct sched_group *group, int load_idx,
7808 int local_group, struct sg_lb_stats *sgs,
7809 bool *overload)
7810{
7811 unsigned long load;
7812 int i, nr_running;
7813
7814 memset(sgs, 0, sizeof(*sgs));
7815
7816 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7817 struct rq *rq = cpu_rq(i);
7818
7819
7820 if (local_group)
7821 load = target_load(i, load_idx);
7822 else
7823 load = source_load(i, load_idx);
7824
7825 sgs->group_load += load;
7826 sgs->group_util += cpu_util(i);
7827 sgs->sum_nr_running += rq->cfs.h_nr_running;
7828
7829 nr_running = rq->nr_running;
7830 if (nr_running > 1)
7831 *overload = true;
7832
7833#ifdef CONFIG_NUMA_BALANCING
7834 sgs->nr_numa_running += rq->nr_numa_running;
7835 sgs->nr_preferred_running += rq->nr_preferred_running;
7836#endif
7837 sgs->sum_weighted_load += weighted_cpuload(rq);
7838
7839
7840
7841 if (!nr_running && idle_cpu(i))
7842 sgs->idle_cpus++;
7843 }
7844
7845
7846 sgs->group_capacity = group->sgc->capacity;
7847 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
7848
7849 if (sgs->sum_nr_running)
7850 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
7851
7852 sgs->group_weight = group->group_weight;
7853
7854 sgs->group_no_capacity = group_is_overloaded(env, sgs);
7855 sgs->group_type = group_classify(group, sgs);
7856}
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
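/*
 * update_sd_pick_busiest() - should @sg replace the busiest group found
 * so far? Decided by group type first, then average load, with special
 * casing for asymmetric CPU capacities and SD_ASYM_PACKING priorities.
 */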
7871static bool update_sd_pick_busiest(struct lb_env *env,
7872 struct sd_lb_stats *sds,
7873 struct sched_group *sg,
7874 struct sg_lb_stats *sgs)
7875{
7876 struct sg_lb_stats *busiest = &sds->busiest_stat;
7877
7878 if (sgs->group_type > busiest->group_type)
7879 return true;
7880
7881 if (sgs->group_type < busiest->group_type)
7882 return false;
7883
7884 if (sgs->avg_load <= busiest->avg_load)
7885 return false;
7886
7887 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
7888 goto asym_packing;
7889
7890
7891
7892
7893
7894
7895
7896 if (sgs->sum_nr_running <= sgs->group_weight &&
7897 group_smaller_cpu_capacity(sds->local, sg))
7898 return false;
7899
7900asym_packing:
7901
7902 if (!(env->sd->flags & SD_ASYM_PACKING))
7903 return true;
7904
7905
7906 if (env->idle == CPU_NOT_IDLE)
7907 return true;
7908
7909
7910
7911
7912
7913 if (sgs->sum_nr_running &&
7914 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
7915 if (!sds->busiest)
7916 return true;
7917
7918
7919 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7920 sg->asym_prefer_cpu))
7921 return true;
7922 }
7923
7924 return false;
7925}
7926
7927#ifdef CONFIG_NUMA_BALANCING
7928static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7929{
7930 if (sgs->sum_nr_running > sgs->nr_numa_running)
7931 return regular;
7932 if (sgs->sum_nr_running > sgs->nr_preferred_running)
7933 return remote;
7934 return all;
7935}
7936
7937static inline enum fbq_type fbq_classify_rq(struct rq *rq)
7938{
7939 if (rq->nr_running > rq->nr_numa_running)
7940 return regular;
7941 if (rq->nr_running > rq->nr_preferred_running)
7942 return remote;
7943 return all;
7944}
7945#else
7946static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7947{
7948 return all;
7949}
7950
7951static inline enum fbq_type fbq_classify_rq(struct rq *rq)
7952{
7953 return regular;
7954}
7955#endif
7956
7957
7958
7959
7960
7961
7962static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
7963{
7964 struct sched_domain *child = env->sd->child;
7965 struct sched_group *sg = env->sd->groups;
7966 struct sg_lb_stats *local = &sds->local_stat;
7967 struct sg_lb_stats tmp_sgs;
7968 int load_idx, prefer_sibling = 0;
7969 bool overload = false;
7970
7971 if (child && child->flags & SD_PREFER_SIBLING)
7972 prefer_sibling = 1;
7973
7974 load_idx = get_sd_load_idx(env->sd, env->idle);
7975
7976 do {
7977 struct sg_lb_stats *sgs = &tmp_sgs;
7978 int local_group;
7979
7980 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
7981 if (local_group) {
7982 sds->local = sg;
7983 sgs = local;
7984
7985 if (env->idle != CPU_NEWLY_IDLE ||
7986 time_after_eq(jiffies, sg->sgc->next_update))
7987 update_group_capacity(env->sd, env->dst_cpu);
7988 }
7989
7990 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
7991 &overload);
7992
7993 if (local_group)
7994 goto next_group;
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006 if (prefer_sibling && sds->local &&
8007 group_has_capacity(env, local) &&
8008 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
8009 sgs->group_no_capacity = 1;
8010 sgs->group_type = group_classify(sg, sgs);
8011 }
8012
8013 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8014 sds->busiest = sg;
8015 sds->busiest_stat = *sgs;
8016 }
8017
8018next_group:
8019
8020 sds->total_running += sgs->sum_nr_running;
8021 sds->total_load += sgs->group_load;
8022 sds->total_capacity += sgs->group_capacity;
8023
8024 sg = sg->next;
8025 } while (sg != env->sd->groups);
8026
8027 if (env->sd->flags & SD_NUMA)
8028 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8029
8030 if (!env->sd->parent) {
8031
8032 if (env->dst_rq->rd->overload != overload)
8033 env->dst_rq->rd->overload = overload;
8034 }
8035}
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8061{
8062 int busiest_cpu;
8063
8064 if (!(env->sd->flags & SD_ASYM_PACKING))
8065 return 0;
8066
8067 if (env->idle == CPU_NOT_IDLE)
8068 return 0;
8069
8070 if (!sds->busiest)
8071 return 0;
8072
8073 busiest_cpu = sds->busiest->asym_prefer_cpu;
8074 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
8075 return 0;
8076
8077 env->imbalance = DIV_ROUND_CLOSEST(
8078 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
8079 SCHED_CAPACITY_SCALE);
8080
8081 return 1;
8082}
8083
8084
8085
8086
8087
8088
8089
8090
8091static inline
8092void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8093{
8094 unsigned long tmp, capa_now = 0, capa_move = 0;
8095 unsigned int imbn = 2;
8096 unsigned long scaled_busy_load_per_task;
8097 struct sg_lb_stats *local, *busiest;
8098
8099 local = &sds->local_stat;
8100 busiest = &sds->busiest_stat;
8101
8102 if (!local->sum_nr_running)
8103 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8104 else if (busiest->load_per_task > local->load_per_task)
8105 imbn = 1;
8106
8107 scaled_busy_load_per_task =
8108 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8109 busiest->group_capacity;
8110
8111 if (busiest->avg_load + scaled_busy_load_per_task >=
8112 local->avg_load + (scaled_busy_load_per_task * imbn)) {
8113 env->imbalance = busiest->load_per_task;
8114 return;
8115 }
8116
8117
8118
8119
8120
8121
8122
8123 capa_now += busiest->group_capacity *
8124 min(busiest->load_per_task, busiest->avg_load);
8125 capa_now += local->group_capacity *
8126 min(local->load_per_task, local->avg_load);
8127 capa_now /= SCHED_CAPACITY_SCALE;
8128
8129
8130 if (busiest->avg_load > scaled_busy_load_per_task) {
8131 capa_move += busiest->group_capacity *
8132 min(busiest->load_per_task,
8133 busiest->avg_load - scaled_busy_load_per_task);
8134 }
8135
8136
8137 if (busiest->avg_load * busiest->group_capacity <
8138 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8139 tmp = (busiest->avg_load * busiest->group_capacity) /
8140 local->group_capacity;
8141 } else {
8142 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8143 local->group_capacity;
8144 }
8145 capa_move += local->group_capacity *
8146 min(local->load_per_task, local->avg_load + tmp);
8147 capa_move /= SCHED_CAPACITY_SCALE;
8148
8149
8150 if (capa_move > capa_now)
8151 env->imbalance = busiest->load_per_task;
8152}
8153
8154
8155
8156
8157
8158
8159
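/*
 * calculate_imbalance() - how much weighted load should be pulled from
 * the busiest group to bring both it and the local group towards the
 * domain average, capped by the load exceeding the busiest group's
 * capacity; small results are handed to fix_small_imbalance().
 */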
8160static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8161{
8162 unsigned long max_pull, load_above_capacity = ~0UL;
8163 struct sg_lb_stats *local, *busiest;
8164
8165 local = &sds->local_stat;
8166 busiest = &sds->busiest_stat;
8167
8168 if (busiest->group_type == group_imbalanced) {
8169
8170
8171
8172
8173 busiest->load_per_task =
8174 min(busiest->load_per_task, sds->avg_load);
8175 }
8176
8177
8178
8179
8180
8181
8182
8183 if (busiest->avg_load <= sds->avg_load ||
8184 local->avg_load >= sds->avg_load) {
8185 env->imbalance = 0;
8186 return fix_small_imbalance(env, sds);
8187 }
8188
8189
8190
8191
8192 if (busiest->group_type == group_overloaded &&
8193 local->group_type == group_overloaded) {
8194 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
8195 if (load_above_capacity > busiest->group_capacity) {
8196 load_above_capacity -= busiest->group_capacity;
8197 load_above_capacity *= scale_load_down(NICE_0_LOAD);
8198 load_above_capacity /= busiest->group_capacity;
8199 } else
8200 load_above_capacity = ~0UL;
8201 }
8202
8203
8204
8205
8206
8207
8208
8209
8210 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8211
8212
8213 env->imbalance = min(
8214 max_pull * busiest->group_capacity,
8215 (sds->avg_load - local->avg_load) * local->group_capacity
8216 ) / SCHED_CAPACITY_SCALE;
8217
8218
8219
8220
8221
8222
8223
8224 if (env->imbalance < busiest->load_per_task)
8225 return fix_small_imbalance(env, sds);
8226}
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
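/*
 * find_busiest_group() - compute domain-wide statistics, handle the
 * "already balanced" and forced-balance special cases, and otherwise
 * return the busiest group with env->imbalance set to the load that
 * should be moved.
 */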
8241static struct sched_group *find_busiest_group(struct lb_env *env)
8242{
8243 struct sg_lb_stats *local, *busiest;
8244 struct sd_lb_stats sds;
8245
8246 init_sd_lb_stats(&sds);
8247
8248
8249
8250
8251
8252 update_sd_lb_stats(env, &sds);
8253 local = &sds.local_stat;
8254 busiest = &sds.busiest_stat;
8255
8256
8257 if (check_asym_packing(env, &sds))
8258 return sds.busiest;
8259
8260
8261 if (!sds.busiest || busiest->sum_nr_running == 0)
8262 goto out_balanced;
8263
8264
8265 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8266 / sds.total_capacity;
8267
8268
8269
8270
8271
8272
8273 if (busiest->group_type == group_imbalanced)
8274 goto force_balance;
8275
8276
8277
8278
8279
8280 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8281 busiest->group_no_capacity)
8282 goto force_balance;
8283
8284
8285
8286
8287
8288 if (local->avg_load >= busiest->avg_load)
8289 goto out_balanced;
8290
8291
8292
8293
8294
8295 if (local->avg_load >= sds.avg_load)
8296 goto out_balanced;
8297
8298 if (env->idle == CPU_IDLE) {
8299
8300
8301
8302
8303
8304
8305
8306 if ((busiest->group_type != group_overloaded) &&
8307 (local->idle_cpus <= (busiest->idle_cpus + 1)))
8308 goto out_balanced;
8309 } else {
8310
8311
8312
8313
8314 if (100 * busiest->avg_load <=
8315 env->sd->imbalance_pct * local->avg_load)
8316 goto out_balanced;
8317 }
8318
8319force_balance:
8320
8321 calculate_imbalance(env, &sds);
8322 return sds.busiest;
8323
8324out_balanced:
8325 env->imbalance = 0;
8326 return NULL;
8327}
8328
8329
8330
8331
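/*
 * find_busiest_queue() - pick the busiest runqueue within @group, ranking
 * CPUs by weighted load relative to their capacity and filtering by
 * env->fbq_type for NUMA balancing.
 */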
8332static struct rq *find_busiest_queue(struct lb_env *env,
8333 struct sched_group *group)
8334{
8335 struct rq *busiest = NULL, *rq;
8336 unsigned long busiest_load = 0, busiest_capacity = 1;
8337 int i;
8338
8339 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8340 unsigned long capacity, wl;
8341 enum fbq_type rt;
8342
8343 rq = cpu_rq(i);
8344 rt = fbq_classify_rq(rq);
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365 if (rt > env->fbq_type)
8366 continue;
8367
8368 capacity = capacity_of(i);
8369
8370 wl = weighted_cpuload(rq);
8371
8372
8373
8374
8375
8376
8377 if (rq->nr_running == 1 && wl > env->imbalance &&
8378 !check_cpu_capacity(rq, env->sd))
8379 continue;
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392 if (wl * busiest_capacity > busiest_load * capacity) {
8393 busiest_load = wl;
8394 busiest_capacity = capacity;
8395 busiest = rq;
8396 }
8397 }
8398
8399 return busiest;
8400}
8401
8402
8403
8404
8405
8406#define MAX_PINNED_INTERVAL 512
8407
8408static int need_active_balance(struct lb_env *env)
8409{
8410 struct sched_domain *sd = env->sd;
8411
8412 if (env->idle == CPU_NEWLY_IDLE) {
8413
8414
8415
8416
8417
8418
8419 if ((sd->flags & SD_ASYM_PACKING) &&
8420 sched_asym_prefer(env->dst_cpu, env->src_cpu))
8421 return 1;
8422 }
8423
8424
8425
8426
8427
8428
8429
8430 if ((env->idle != CPU_NOT_IDLE) &&
8431 (env->src_rq->cfs.h_nr_running == 1)) {
8432 if ((check_cpu_capacity(env->src_rq, sd)) &&
8433 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8434 return 1;
8435 }
8436
8437 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8438}
8439
8440static int active_load_balance_cpu_stop(void *data);
8441
8442static int should_we_balance(struct lb_env *env)
8443{
8444 struct sched_group *sg = env->sd->groups;
8445 int cpu, balance_cpu = -1;
8446
8447
8448
8449
8450
8451 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8452 return 0;
8453
8454
8455
8456
8457
8458 if (env->idle == CPU_NEWLY_IDLE)
8459 return 1;
8460
8461
8462 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8463 if (!idle_cpu(cpu))
8464 continue;
8465
8466 balance_cpu = cpu;
8467 break;
8468 }
8469
8470 if (balance_cpu == -1)
8471 balance_cpu = group_balance_cpu(sg);
8472
8473
8474
8475
8476
8477 return balance_cpu == env->dst_cpu;
8478}
8479
8480
8481
8482
8483
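/*
 * load_balance() - check whether this CPU should balance @sd and, if so,
 * try to pull tasks from the busiest runqueue of the busiest group,
 * retrying around pinned tasks and falling back to an active push
 * migration via the stopper when nothing could be pulled.
 */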
8484static int load_balance(int this_cpu, struct rq *this_rq,
8485 struct sched_domain *sd, enum cpu_idle_type idle,
8486 int *continue_balancing)
8487{
8488 int ld_moved, cur_ld_moved, active_balance = 0;
8489 struct sched_domain *sd_parent = sd->parent;
8490 struct sched_group *group;
8491 struct rq *busiest;
8492 struct rq_flags rf;
8493 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8494
8495 struct lb_env env = {
8496 .sd = sd,
8497 .dst_cpu = this_cpu,
8498 .dst_rq = this_rq,
8499 .dst_grpmask = sched_group_span(sd->groups),
8500 .idle = idle,
8501 .loop_break = sched_nr_migrate_break,
8502 .cpus = cpus,
8503 .fbq_type = all,
8504 .tasks = LIST_HEAD_INIT(env.tasks),
8505 };
8506
8507 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
8508
8509 schedstat_inc(sd->lb_count[idle]);
8510
8511redo:
8512 if (!should_we_balance(&env)) {
8513 *continue_balancing = 0;
8514 goto out_balanced;
8515 }
8516
8517 group = find_busiest_group(&env);
8518 if (!group) {
8519 schedstat_inc(sd->lb_nobusyg[idle]);
8520 goto out_balanced;
8521 }
8522
8523 busiest = find_busiest_queue(&env, group);
8524 if (!busiest) {
8525 schedstat_inc(sd->lb_nobusyq[idle]);
8526 goto out_balanced;
8527 }
8528
8529 BUG_ON(busiest == env.dst_rq);
8530
8531 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
8532
8533 env.src_cpu = busiest->cpu;
8534 env.src_rq = busiest;
8535
8536 ld_moved = 0;
8537 if (busiest->nr_running > 1) {
8538
8539
8540
8541
8542
8543
8544 env.flags |= LBF_ALL_PINNED;
8545 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8546
8547more_balance:
8548 rq_lock_irqsave(busiest, &rf);
8549 update_rq_clock(busiest);
8550
8551
8552
8553
8554
8555 cur_ld_moved = detach_tasks(&env);
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565 rq_unlock(busiest, &rf);
8566
8567 if (cur_ld_moved) {
8568 attach_tasks(&env);
8569 ld_moved += cur_ld_moved;
8570 }
8571
8572 local_irq_restore(rf.flags);
8573
8574 if (env.flags & LBF_NEED_BREAK) {
8575 env.flags &= ~LBF_NEED_BREAK;
8576 goto more_balance;
8577 }
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8599
8600
8601 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8602
8603 env.dst_rq = cpu_rq(env.new_dst_cpu);
8604 env.dst_cpu = env.new_dst_cpu;
8605 env.flags &= ~LBF_DST_PINNED;
8606 env.loop = 0;
8607 env.loop_break = sched_nr_migrate_break;
8608
8609
8610
8611
8612
8613 goto more_balance;
8614 }
8615
8616
8617
8618
8619 if (sd_parent) {
8620 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8621
8622 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
8623 *group_imbalance = 1;
8624 }
8625
8626
8627 if (unlikely(env.flags & LBF_ALL_PINNED)) {
8628 cpumask_clear_cpu(cpu_of(busiest), cpus);
8629
8630
8631
8632
8633
8634
8635
8636
8637 if (!cpumask_subset(cpus, env.dst_grpmask)) {
8638 env.loop = 0;
8639 env.loop_break = sched_nr_migrate_break;
8640 goto redo;
8641 }
8642 goto out_all_pinned;
8643 }
8644 }
8645
8646 if (!ld_moved) {
8647 schedstat_inc(sd->lb_failed[idle]);
8648
8649
8650
8651
8652
8653
8654 if (idle != CPU_NEWLY_IDLE)
8655 sd->nr_balance_failed++;
8656
8657 if (need_active_balance(&env)) {
8658 unsigned long flags;
8659
8660 raw_spin_lock_irqsave(&busiest->lock, flags);
8661
8662
8663
8664
8665
8666 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8667 raw_spin_unlock_irqrestore(&busiest->lock,
8668 flags);
8669 env.flags |= LBF_ALL_PINNED;
8670 goto out_one_pinned;
8671 }
8672
8673
8674
8675
8676
8677
8678 if (!busiest->active_balance) {
8679 busiest->active_balance = 1;
8680 busiest->push_cpu = this_cpu;
8681 active_balance = 1;
8682 }
8683 raw_spin_unlock_irqrestore(&busiest->lock, flags);
8684
8685 if (active_balance) {
8686 stop_one_cpu_nowait(cpu_of(busiest),
8687 active_load_balance_cpu_stop, busiest,
8688 &busiest->active_balance_work);
8689 }

			/* We've kicked active balancing, force task migration. */
8692 sd->nr_balance_failed = sd->cache_nice_tries+1;
8693 }
8694 } else
8695 sd->nr_balance_failed = 0;
8696
8697 if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
8699 sd->balance_interval = sd->min_interval;
8700 } else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only one task left on busiest queue.
		 */
8707 if (sd->balance_interval < sd->max_interval)
8708 sd->balance_interval *= 2;
8709 }
8710
8711 goto out;
8712
8713out_balanced:
	/*
	 * We reach balance although we may have faced some affinity
	 * constraints. Clear the imbalance flag only if other tasks got
	 * a chance to move and fix the imbalance.
	 */
8718 if (sd_parent) {
8719 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8720
8721 if (*group_imbalance)
8722 *group_imbalance = 0;
8723 }
8724
8725out_all_pinned:
	/*
	 * We reach balance because all tasks are pinned at this level so
	 * we can't migrate them. Let the imbalance flag set so parent level
	 * can try to migrate them.
	 */
8731 schedstat_inc(sd->lb_balanced[idle]);
8732
8733 sd->nr_balance_failed = 0;
8734
8735out_one_pinned:
	/* tune up the balancing interval */
8737 if (((env.flags & LBF_ALL_PINNED) &&
8738 sd->balance_interval < MAX_PINNED_INTERVAL) ||
8739 (sd->balance_interval < sd->max_interval))
8740 sd->balance_interval *= 2;
8741
8742 ld_moved = 0;
8743out:
8744 return ld_moved;
8745}
8746
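/*
 * Compute the effective balance interval for @sd: scale the base interval
 * by busy_factor when the CPU is busy, convert to jiffies and clamp to
 * [1, max_load_balance_interval].
 */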
8747static inline unsigned long
8748get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
8749{
8750 unsigned long interval = sd->balance_interval;
8751
8752 if (cpu_busy)
8753 interval *= sd->busy_factor;
8754
	/* scale ms to jiffies */
8756 interval = msecs_to_jiffies(interval);
8757 interval = clamp(interval, 1UL, max_load_balance_interval);
8758
8759 return interval;
8760}
8761
8762static inline void
8763update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
8764{
8765 unsigned long interval, next;
8766
	/* used by idle balance, so cpu_busy = 0 */
8768 interval = get_sd_balance_interval(sd, 0);
8769 next = sd->last_balance + interval;
8770
8771 if (time_after(*next_balance, next))
8772 *next_balance = next;
8773}
8774
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
8779static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
8780{
8781 unsigned long next_balance = jiffies + HZ;
8782 int this_cpu = this_rq->cpu;
8783 struct sched_domain *sd;
8784 int pulled_task = 0;
8785 u64 curr_cost = 0;
8786
	/*
	 * We must set idle_stamp _before_ calling idle_balance(), such that we
	 * measure the duration of idle_balance() as idle time.
	 */
8791 this_rq->idle_stamp = rq_clock(this_rq);
8792
	/*
	 * Do not pull tasks towards !active CPUs...
	 */
8796 if (!cpu_active(this_cpu))
8797 return 0;
8798
	/*
	 * This is OK, because current is on_cpu, which avoids it being picked
	 * for load-balance and preemption/IRQs are still disabled avoiding
	 * further scheduler activity on it and we're being very careful to
	 * re-start the picking loop.
	 */
8805 rq_unpin_lock(this_rq, rf);
8806
8807 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
8808 !this_rq->rd->overload) {
8809 rcu_read_lock();
8810 sd = rcu_dereference_check_sched_domain(this_rq->sd);
8811 if (sd)
8812 update_next_balance(sd, &next_balance);
8813 rcu_read_unlock();
8814
8815 goto out;
8816 }
8817
8818 raw_spin_unlock(&this_rq->lock);
8819
8820 update_blocked_averages(this_cpu);
8821 rcu_read_lock();
8822 for_each_domain(this_cpu, sd) {
8823 int continue_balancing = 1;
8824 u64 t0, domain_cost;
8825
8826 if (!(sd->flags & SD_LOAD_BALANCE))
8827 continue;
8828
8829 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
8830 update_next_balance(sd, &next_balance);
8831 break;
8832 }
8833
8834 if (sd->flags & SD_BALANCE_NEWIDLE) {
8835 t0 = sched_clock_cpu(this_cpu);
8836
8837 pulled_task = load_balance(this_cpu, this_rq,
8838 sd, CPU_NEWLY_IDLE,
8839 &continue_balancing);
8840
8841 domain_cost = sched_clock_cpu(this_cpu) - t0;
8842 if (domain_cost > sd->max_newidle_lb_cost)
8843 sd->max_newidle_lb_cost = domain_cost;
8844
8845 curr_cost += domain_cost;
8846 }
8847
8848 update_next_balance(sd, &next_balance);
8849
		/*
		 * Stop searching for tasks to pull if there are
		 * now runnable tasks on this rq.
		 */
8854 if (pulled_task || this_rq->nr_running > 0)
8855 break;
8856 }
8857 rcu_read_unlock();
8858
8859 raw_spin_lock(&this_rq->lock);
8860
8861 if (curr_cost > this_rq->max_idle_balance_cost)
8862 this_rq->max_idle_balance_cost = curr_cost;
8863
	/*
	 * While browsing the domains, we released the rq lock, a task could
	 * have been enqueued in the meantime. Since we're not going idle,
	 * pretend we pulled a task.
	 */
8869 if (this_rq->cfs.h_nr_running && !pulled_task)
8870 pulled_task = 1;
8871
8872out:
	/* Move the next balance forward */
8874 if (time_after(this_rq->next_balance, next_balance))
8875 this_rq->next_balance = next_balance;
8876
	/* Is there a task of a high priority class? */
8878 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
8879 pulled_task = -1;
8880
8881 if (pulled_task)
8882 this_rq->idle_stamp = 0;
8883
8884 rq_repin_lock(this_rq, rf);
8885
8886 return pulled_task;
8887}
8888
/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */
8895static int active_load_balance_cpu_stop(void *data)
8896{
8897 struct rq *busiest_rq = data;
8898 int busiest_cpu = cpu_of(busiest_rq);
8899 int target_cpu = busiest_rq->push_cpu;
8900 struct rq *target_rq = cpu_rq(target_cpu);
8901 struct sched_domain *sd;
8902 struct task_struct *p = NULL;
8903 struct rq_flags rf;
8904
8905 rq_lock_irq(busiest_rq, &rf);

	/*
	 * Between queueing the stop-work and running it is a hole in which
	 * CPUs can become inactive. We should not move tasks from or to
	 * inactive CPUs.
	 */
8911 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8912 goto out_unlock;
8913
	/* Make sure the requested CPU hasn't gone down in the meantime: */
8915 if (unlikely(busiest_cpu != smp_processor_id() ||
8916 !busiest_rq->active_balance))
8917 goto out_unlock;
8918
	/* Is there any task to move? */
8920 if (busiest_rq->nr_running <= 1)
8921 goto out_unlock;
8922
	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-CPU setup.
	 */
8928 BUG_ON(busiest_rq == target_rq);
8929
	/* Search for an sd spanning us and the target CPU. */
8931 rcu_read_lock();
8932 for_each_domain(target_cpu, sd) {
8933 if ((sd->flags & SD_LOAD_BALANCE) &&
8934 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
8935 break;
8936 }
8937
8938 if (likely(sd)) {
8939 struct lb_env env = {
8940 .sd = sd,
8941 .dst_cpu = target_cpu,
8942 .dst_rq = target_rq,
8943 .src_cpu = busiest_rq->cpu,
8944 .src_rq = busiest_rq,
8945 .idle = CPU_IDLE,
			/*
			 * can_migrate_task() doesn't need to compute new_dst_cpu
			 * for active balancing. Since we have CPU_IDLE, but no
			 * @dst_grpmask we need to make that test go away with
			 * lying about DST_PINNED.
			 */
8952 .flags = LBF_DST_PINNED,
8953 };
8954
8955 schedstat_inc(sd->alb_count);
8956 update_rq_clock(busiest_rq);
8957
8958 p = detach_one_task(&env);
8959 if (p) {
8960 schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
8962 sd->nr_balance_failed = 0;
8963 } else {
8964 schedstat_inc(sd->alb_failed);
8965 }
8966 }
8967 rcu_read_unlock();
8968out_unlock:
8969 busiest_rq->active_balance = 0;
8970 rq_unlock(busiest_rq, &rf);
8971
8972 if (p)
8973 attach_one_task(target_rq, p);
8974
8975 local_irq_enable();
8976
8977 return 0;
8978}
8979
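/* True if this runqueue is not attached to any sched_domain. */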
8980static inline int on_null_domain(struct rq *rq)
8981{
8982 return unlikely(!rcu_dereference_sched(rq->sd));
8983}
8984
8985#ifdef CONFIG_NO_HZ_COMMON
/*
 * idle load balancing details
 * - When one of the busy CPUs notices that there may be an idle rebalancing
 *   needed, they will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
8992static struct {
8993 cpumask_var_t idle_cpus_mask;
8994 atomic_t nr_cpus;
8995 unsigned long next_balance;
8996} nohz ____cacheline_aligned;
8997
8998static inline int find_new_ilb(void)
8999{
9000 int ilb = cpumask_first(nohz.idle_cpus_mask);
9001
9002 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9003 return ilb;
9004
9005 return nr_cpu_ids;
9006}
9007
/*
 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 * first idle CPU in nohz.idle_cpus_mask and send it an IPI; the softirq
 * on that CPU then performs the nohz idle balance on behalf of everyone.
 */
9013static void nohz_balancer_kick(void)
9014{
9015 int ilb_cpu;
9016
9017 nohz.next_balance++;
9018
9019 ilb_cpu = find_new_ilb();
9020
9021 if (ilb_cpu >= nr_cpu_ids)
9022 return;
9023
9024 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
9025 return;
9026
	/*
	 * Use smp_send_reschedule() instead of resched_cpu().
	 * This way we generate a sched IPI on the target CPU which
	 * is idle. And the softirq performing nohz idle load balance
	 * will be run before returning from the IPI.
	 */
9032 smp_send_reschedule(ilb_cpu);
9033 return;
9034}
9035
9036void nohz_balance_exit_idle(unsigned int cpu)
9037{
9038 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
		/*
		 * Completely isolated CPUs never enter the idle mask,
		 * so test before clearing.
		 */
9042 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9043 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9044 atomic_dec(&nohz.nr_cpus);
9045 }
9046 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9047 }
9048}
9049
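/*
 * Mark this CPU's LLC domain busy again after idle: clear nohz_idle and
 * bump the shared nr_busy_cpus count. set_cpu_sd_state_idle() below is
 * the counterpart used when the CPU goes tickless idle.
 */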
9050static inline void set_cpu_sd_state_busy(void)
9051{
9052 struct sched_domain *sd;
9053 int cpu = smp_processor_id();
9054
9055 rcu_read_lock();
9056 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9057
9058 if (!sd || !sd->nohz_idle)
9059 goto unlock;
9060 sd->nohz_idle = 0;
9061
9062 atomic_inc(&sd->shared->nr_busy_cpus);
9063unlock:
9064 rcu_read_unlock();
9065}
9066
9067void set_cpu_sd_state_idle(void)
9068{
9069 struct sched_domain *sd;
9070 int cpu = smp_processor_id();
9071
9072 rcu_read_lock();
9073 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9074
9075 if (!sd || sd->nohz_idle)
9076 goto unlock;
9077 sd->nohz_idle = 1;
9078
9079 atomic_dec(&sd->shared->nr_busy_cpus);
9080unlock:
9081 rcu_read_unlock();
9082}
9083
/*
 * This routine will record that the CPU is going idle with tick stopped.
 * This info will be used in performing idle load balancing in the future.
 */
9088void nohz_balance_enter_idle(int cpu)
9089{
	/*
	 * If this CPU is going down, then nothing needs to be done:
	 */
9093 if (!cpu_active(cpu))
9094 return;
9095
	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
9097 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9098 return;
9099
9100 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9101 return;

	/*
	 * If we're a completely isolated CPU, we don't play:
	 */
9106 if (on_null_domain(cpu_rq(cpu)))
9107 return;
9108
9109 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9110 atomic_inc(&nohz.nr_cpus);
9111 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9112}
9113#endif
9114
9115static DEFINE_SPINLOCK(balancing);
9116
/*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross talk.
 */
9121void update_max_interval(void)
9122{
9123 max_load_balance_interval = HZ*num_online_cpus()/10;
9124}
9125
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 */
9132static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9133{
9134 int continue_balancing = 1;
9135 int cpu = rq->cpu;
9136 unsigned long interval;
9137 struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
9139 unsigned long next_balance = jiffies + 60*HZ;
9140 int update_next_balance = 0;
9141 int need_serialize, need_decay = 0;
9142 u64 max_cost = 0;
9143
9144 update_blocked_averages(cpu);
9145
9146 rcu_read_lock();
9147 for_each_domain(cpu, sd) {
		/*
		 * Decay the newidle max times here because this is a
		 * regular visit to all the domains. Decay ~1% per second.
		 */
9152 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9153 sd->max_newidle_lb_cost =
9154 (sd->max_newidle_lb_cost * 253) / 256;
9155 sd->next_decay_max_lb_cost = jiffies + HZ;
9156 need_decay = 1;
9157 }
9158 max_cost += sd->max_newidle_lb_cost;
9159
9160 if (!(sd->flags & SD_LOAD_BALANCE))
9161 continue;
9162
		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing
		 * more actively.
		 */
9168 if (!continue_balancing) {
9169 if (need_decay)
9170 continue;
9171 break;
9172 }
9173
9174 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9175
9176 need_serialize = sd->flags & SD_SERIALIZE;
9177 if (need_serialize) {
9178 if (!spin_trylock(&balancing))
9179 goto out;
9180 }
9181
9182 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9183 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				/*
				 * The LBF_DST_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
9189 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9190 }
9191 sd->last_balance = jiffies;
9192 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9193 }
9194 if (need_serialize)
9195 spin_unlock(&balancing);
9196out:
9197 if (time_after(next_balance, sd->last_balance + interval)) {
9198 next_balance = sd->last_balance + interval;
9199 update_next_balance = 1;
9200 }
9201 }
9202 if (need_decay) {
		/*
		 * Ensure the rq-wide value also decays but keep it at a
		 * reasonable floor to avoid funnies with rq->avg_idle.
		 */
9207 rq->max_idle_balance_cost =
9208 max((u64)sysctl_sched_migration_cost, max_cost);
9209 }
9210 rcu_read_unlock();
9211
	/*
	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to null domain for ex, it will not be
	 * updated.
	 */
9217 if (likely(update_next_balance)) {
9218 rq->next_balance = next_balance;
9219
9220#ifdef CONFIG_NO_HZ_COMMON
		/*
		 * If this CPU is idle, also pull nohz.next_balance forward
		 * so the next nohz idle balance is not scheduled later than
		 * this CPU's own next balance point.
		 */
9229 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9230 nohz.next_balance = rq->next_balance;
9231#endif
9232 }
9233}
9234
9235#ifdef CONFIG_NO_HZ_COMMON
/*
 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
 */
9240static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9241{
9242 int this_cpu = this_rq->cpu;
9243 struct rq *rq;
9244 int balance_cpu;
	/* Earliest time when we have to do rebalance again */
9246 unsigned long next_balance = jiffies + 60*HZ;
9247 int update_next_balance = 0;
9248
9249 if (idle != CPU_IDLE ||
9250 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
9251 goto end;
9252
9253 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9254 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9255 continue;

		/*
		 * If this CPU gets work to do, stop the load balancing
		 * work being done for other CPUs. Next load
		 * balancing owner will pick it up.
		 */
9262 if (need_resched())
9263 break;
9264
9265 rq = cpu_rq(balance_cpu);

		/*
		 * If time for next balance is due,
		 * do the balance.
		 */
9271 if (time_after_eq(jiffies, rq->next_balance)) {
9272 struct rq_flags rf;
9273
9274 rq_lock_irq(rq, &rf);
9275 update_rq_clock(rq);
9276 cpu_load_update_idle(rq);
9277 rq_unlock_irq(rq, &rf);
9278
9279 rebalance_domains(rq, CPU_IDLE);
9280 }
9281
9282 if (time_after(next_balance, rq->next_balance)) {
9283 next_balance = rq->next_balance;
9284 update_next_balance = 1;
9285 }
9286 }
9287
	/*
	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to null domain for ex, it will not be
	 * updated.
	 */
9293 if (likely(update_next_balance))
9294 nohz.next_balance = next_balance;
9295end:
9296 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
9297}
9298
/*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle CPU in the system.
 *   - This rq has more than one task.
 *   - This rq has at least one CFS task and the capacity of the CPU is
 *     significantly reduced because of RT tasks or IRQs.
 *   - At the parent of the LLC scheduler domain level, this CPU's scheduler
 *     group has multiple busy CPUs.
 *   - For SD_ASYM_PACKING, if the lower numbered CPUs in the scheduler
 *     domain span are idle.
 */
9310static inline bool nohz_kick_needed(struct rq *rq)
9311{
9312 unsigned long now = jiffies;
9313 struct sched_domain_shared *sds;
9314 struct sched_domain *sd;
9315 int nr_busy, i, cpu = rq->cpu;
9316 bool kick = false;
9317
9318 if (unlikely(rq->idle_balance))
9319 return false;
9320
	/*
	 * We may be recently in ticked or tickless idle mode. At the first
	 * busy tick after returning from idle, we will update the busy stats.
	 */
9325 set_cpu_sd_state_busy();
9326 nohz_balance_exit_idle(cpu);
9327
	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
9332 if (likely(!atomic_read(&nohz.nr_cpus)))
9333 return false;
9334
9335 if (time_before(now, nohz.next_balance))
9336 return false;
9337
9338 if (rq->nr_running >= 2)
9339 return true;
9340
9341 rcu_read_lock();
9342 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9343 if (sds) {
		/*
		 * If more than one CPU sharing this LLC is busy, kick the
		 * idle load balancer so load can be spread out of the
		 * LLC domain.
		 */
9348 nr_busy = atomic_read(&sds->nr_busy_cpus);
9349 if (nr_busy > 1) {
9350 kick = true;
9351 goto unlock;
9352 }
9353
9354 }
9355
9356 sd = rcu_dereference(rq->sd);
9357 if (sd) {
9358 if ((rq->cfs.h_nr_running >= 1) &&
9359 check_cpu_capacity(rq, sd)) {
9360 kick = true;
9361 goto unlock;
9362 }
9363 }
9364
9365 sd = rcu_dereference(per_cpu(sd_asym, cpu));
9366 if (sd) {
9367 for_each_cpu(i, sched_domain_span(sd)) {
9368 if (i == cpu ||
9369 !cpumask_test_cpu(i, nohz.idle_cpus_mask))
9370 continue;
9371
9372 if (sched_asym_prefer(i, cpu)) {
9373 kick = true;
9374 goto unlock;
9375 }
9376 }
9377 }
9378unlock:
9379 rcu_read_unlock();
9380 return kick;
9381}
9382#else
9383static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
9384#endif
9385
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
 */
9390static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9391{
9392 struct rq *this_rq = this_rq();
9393 enum cpu_idle_type idle = this_rq->idle_balance ?
9394 CPU_IDLE : CPU_NOT_IDLE;
9395
	/*
	 * If this CPU has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle CPUs whose ticks are
	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
	 * give the idle CPUs a chance to load balance. Else we may end
	 * up balancing only within our own sched_domain hierarchy and
	 * leave the other idle CPUs with stale load.
	 */
9404 nohz_idle_balance(this_rq, idle);
9405 rebalance_domains(this_rq, idle);
9406}
9407
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
9411void trigger_load_balance(struct rq *rq)
9412{
	/* Don't need to rebalance while attached to NULL domain */
9414 if (unlikely(on_null_domain(rq)))
9415 return;
9416
9417 if (time_after_eq(jiffies, rq->next_balance))
9418 raise_softirq(SCHED_SOFTIRQ);
9419#ifdef CONFIG_NO_HZ_COMMON
9420 if (nohz_kick_needed(rq))
9421 nohz_balancer_kick();
9422#endif
9423}
9424
9425static void rq_online_fair(struct rq *rq)
9426{
9427 update_sysctl();
9428
9429 update_runtime_enabled(rq);
9430}
9431
9432static void rq_offline_fair(struct rq *rq)
9433{
9434 update_sysctl();

	/* Ensure any throttled groups are reachable: */
9437 unthrottle_offline_cfs_rqs(rq);
9438}
9439
9440#endif
9441
/*
 * scheduler tick hitting a task of our scheduling class:
 */
9445static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9446{
9447 struct cfs_rq *cfs_rq;
9448 struct sched_entity *se = &curr->se;
9449
9450 for_each_sched_entity(se) {
9451 cfs_rq = cfs_rq_of(se);
9452 entity_tick(cfs_rq, se, queued);
9453 }
9454
9455 if (static_branch_unlikely(&sched_numa_balancing))
9456 task_tick_numa(rq, curr);
9457}
9458
/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
9464static void task_fork_fair(struct task_struct *p)
9465{
9466 struct cfs_rq *cfs_rq;
9467 struct sched_entity *se = &p->se, *curr;
9468 struct rq *rq = this_rq();
9469 struct rq_flags rf;
9470
9471 rq_lock(rq, &rf);
9472 update_rq_clock(rq);
9473
9474 cfs_rq = task_cfs_rq(current);
9475 curr = cfs_rq->curr;
9476 if (curr) {
9477 update_curr(cfs_rq);
9478 se->vruntime = curr->vruntime;
9479 }
9480 place_entity(cfs_rq, se, 1);
9481
9482 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
9487 swap(curr->vruntime, se->vruntime);
9488 resched_curr(rq);
9489 }
9490
9491 se->vruntime -= cfs_rq->min_vruntime;
9492 rq_unlock(rq, &rf);
9493}
9494
/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
9499static void
9500prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
9501{
9502 if (!task_on_rq_queued(p))
9503 return;
9504
	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's.
	 */
9510 if (rq->curr == p) {
9511 if (p->prio > oldprio)
9512 resched_curr(rq);
9513 } else
9514 check_preempt_curr(rq, p, 0);
9515}
9516
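/*
 * Check whether p->se.vruntime is already normalized (i.e. not offset by
 * cfs_rq->min_vruntime), so detach/attach know whether to adjust it.
 */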
9517static inline bool vruntime_normalized(struct task_struct *p)
9518{
9519 struct sched_entity *se = &p->se;
9520
	/*
	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
	 * the dequeue_entity(.flags=0) will already have normalized the
	 * vruntime.
	 */
9526 if (p->on_rq)
9527 return true;
9528
	/*
	 * When !on_rq, vruntime of the task has usually NOT been normalized.
	 * But there are some cases where it has already been normalized:
	 *
	 * - A forked child which is waiting for being woken up by
	 *   wake_up_new_task().
	 * - A task which has been woken up by try_to_wake_up() and
	 *   waiting for actually being woken up by sched_ttwu_pending().
	 */
9538 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
9539 return true;
9540
9541 return false;
9542}
9543
9544#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Propagate the changes of the sched_entity across the tg tree to make it
 * visible to the root
 */
9549static void propagate_entity_cfs_rq(struct sched_entity *se)
9550{
9551 struct cfs_rq *cfs_rq;

	/* Start to propagate at parent */
9554 se = se->parent;
9555
9556 for_each_sched_entity(se) {
9557 cfs_rq = cfs_rq_of(se);
9558
9559 if (cfs_rq_throttled(cfs_rq))
9560 break;
9561
9562 update_load_avg(cfs_rq, se, UPDATE_TG);
9563 }
9564}
9565#else
9566static void propagate_entity_cfs_rq(struct sched_entity *se) { }
9567#endif
9568
9569static void detach_entity_cfs_rq(struct sched_entity *se)
9570{
9571 struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/* Catch up with the cfs_rq and remove our load when we leave */
9574 update_load_avg(cfs_rq, se, 0);
9575 detach_entity_load_avg(cfs_rq, se);
9576 update_tg_load_avg(cfs_rq, false);
9577 propagate_entity_cfs_rq(se);
9578}
9579
9580static void attach_entity_cfs_rq(struct sched_entity *se)
9581{
9582 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9583
9584#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * Since the real-depth could have been changed (only FAIR
	 * class maintain depth value), reset depth properly.
	 */
9589 se->depth = se->parent ? se->parent->depth + 1 : 0;
9590#endif

	/* Synchronize entity with its cfs_rq */
9593 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
9594 attach_entity_load_avg(cfs_rq, se);
9595 update_tg_load_avg(cfs_rq, false);
9596 propagate_entity_cfs_rq(se);
9597}
9598
9599static void detach_task_cfs_rq(struct task_struct *p)
9600{
9601 struct sched_entity *se = &p->se;
9602 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9603
9604 if (!vruntime_normalized(p)) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */
9609 place_entity(cfs_rq, se, 0);
9610 se->vruntime -= cfs_rq->min_vruntime;
9611 }
9612
9613 detach_entity_cfs_rq(se);
9614}
9615
9616static void attach_task_cfs_rq(struct task_struct *p)
9617{
9618 struct sched_entity *se = &p->se;
9619 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9620
9621 attach_entity_cfs_rq(se);
9622
9623 if (!vruntime_normalized(p))
9624 se->vruntime += cfs_rq->min_vruntime;
9625}
9626
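/*
 * Class-change callbacks: detach the task's load from its cfs_rq when it
 * leaves the fair class and re-attach it when it comes back.
 */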
9627static void switched_from_fair(struct rq *rq, struct task_struct *p)
9628{
9629 detach_task_cfs_rq(p);
9630}
9631
9632static void switched_to_fair(struct rq *rq, struct task_struct *p)
9633{
9634 attach_task_cfs_rq(p);
9635
9636 if (task_on_rq_queued(p)) {
		/*
		 * We were most likely switched from sched_rt, so
		 * kick off the schedule if running, otherwise just see
		 * if we can still preempt the current task.
		 */
9642 if (rq->curr == p)
9643 resched_curr(rq);
9644 else
9645 check_preempt_curr(rq, p, 0);
9646 }
9647}
9648
/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
9654static void set_curr_task_fair(struct rq *rq)
9655{
9656 struct sched_entity *se = &rq->curr->se;
9657
9658 for_each_sched_entity(se) {
9659 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9660
9661 set_next_entity(cfs_rq, se);
		/* ensure bandwidth has been allocated on our new cfs_rq */
9663 account_cfs_rq_runtime(cfs_rq, 0);
9664 }
9665}
9666
9667void init_cfs_rq(struct cfs_rq *cfs_rq)
9668{
9669 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
9670 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9671#ifndef CONFIG_64BIT
9672 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9673#endif
9674#ifdef CONFIG_SMP
9675 raw_spin_lock_init(&cfs_rq->removed.lock);
9676#endif
9677}
9678
9679#ifdef CONFIG_FAIR_GROUP_SCHED
9680static void task_set_group_fair(struct task_struct *p)
9681{
9682 struct sched_entity *se = &p->se;
9683
9684 set_task_rq(p, task_cpu(p));
9685 se->depth = se->parent ? se->parent->depth + 1 : 0;
9686}
9687
9688static void task_move_group_fair(struct task_struct *p)
9689{
9690 detach_task_cfs_rq(p);
9691 set_task_rq(p, task_cpu(p));
9692
9693#ifdef CONFIG_SMP
	/* Tell se's cfs_rq has been changed -- migrated */
9695 p->se.avg.last_update_time = 0;
9696#endif
9697 attach_task_cfs_rq(p);
9698}
9699
9700static void task_change_group_fair(struct task_struct *p, int type)
9701{
9702 switch (type) {
9703 case TASK_SET_GROUP:
9704 task_set_group_fair(p);
9705 break;
9706
9707 case TASK_MOVE_GROUP:
9708 task_move_group_fair(p);
9709 break;
9710 }
9711}
9712
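/*
 * Free the per-CPU cfs_rq and sched_entity structures of a task group,
 * along with its bandwidth control state.
 */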
9713void free_fair_sched_group(struct task_group *tg)
9714{
9715 int i;
9716
9717 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
9718
9719 for_each_possible_cpu(i) {
9720 if (tg->cfs_rq)
9721 kfree(tg->cfs_rq[i]);
9722 if (tg->se)
9723 kfree(tg->se[i]);
9724 }
9725
9726 kfree(tg->cfs_rq);
9727 kfree(tg->se);
9728}
9729
9730int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9731{
9732 struct sched_entity *se;
9733 struct cfs_rq *cfs_rq;
9734 int i;
9735
9736 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9737 if (!tg->cfs_rq)
9738 goto err;
9739 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9740 if (!tg->se)
9741 goto err;
9742
9743 tg->shares = NICE_0_LOAD;
9744
9745 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
9746
9747 for_each_possible_cpu(i) {
9748 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9749 GFP_KERNEL, cpu_to_node(i));
9750 if (!cfs_rq)
9751 goto err;
9752
9753 se = kzalloc_node(sizeof(struct sched_entity),
9754 GFP_KERNEL, cpu_to_node(i));
9755 if (!se)
9756 goto err_free_rq;
9757
9758 init_cfs_rq(cfs_rq);
9759 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
9760 init_entity_runnable_average(se);
9761 }
9762
9763 return 1;
9764
9765err_free_rq:
9766 kfree(cfs_rq);
9767err:
9768 return 0;
9769}
9770
9771void online_fair_sched_group(struct task_group *tg)
9772{
9773 struct sched_entity *se;
9774 struct rq *rq;
9775 int i;
9776
9777 for_each_possible_cpu(i) {
9778 rq = cpu_rq(i);
9779 se = tg->se[i];
9780
9781 raw_spin_lock_irq(&rq->lock);
9782 update_rq_clock(rq);
9783 attach_entity_cfs_rq(se);
9784 sync_throttle(tg, i);
9785 raw_spin_unlock_irq(&rq->lock);
9786 }
9787}
9788
9789void unregister_fair_sched_group(struct task_group *tg)
9790{
9791 unsigned long flags;
9792 struct rq *rq;
9793 int cpu;
9794
9795 for_each_possible_cpu(cpu) {
9796 if (tg->se[cpu])
9797 remove_entity_load_avg(tg->se[cpu]);

		/*
		 * Only empty task groups can be destroyed; so we can
		 * speculatively check on_list without danger of it being
		 * re-added.
		 */
9803 if (!tg->cfs_rq[cpu]->on_list)
9804 continue;
9805
9806 rq = cpu_rq(cpu);
9807
9808 raw_spin_lock_irqsave(&rq->lock, flags);
9809 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
9810 raw_spin_unlock_irqrestore(&rq->lock, flags);
9811 }
9812}
9813
9814void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9815 struct sched_entity *se, int cpu,
9816 struct sched_entity *parent)
9817{
9818 struct rq *rq = cpu_rq(cpu);
9819
9820 cfs_rq->tg = tg;
9821 cfs_rq->rq = rq;
9822 init_cfs_rq_runtime(cfs_rq);
9823
9824 tg->cfs_rq[cpu] = cfs_rq;
9825 tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
9828 if (!se)
9829 return;
9830
9831 if (!parent) {
9832 se->cfs_rq = &rq->cfs;
9833 se->depth = 0;
9834 } else {
9835 se->cfs_rq = parent->my_q;
9836 se->depth = parent->depth + 1;
9837 }
9838
9839 se->my_q = cfs_rq;
	/* guarantee group entities always have weight */
9841 update_load_set(&se->load, NICE_0_LOAD);
9842 se->parent = parent;
9843}
9844
9845static DEFINE_MUTEX(shares_mutex);
9846
9847int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9848{
9849 int i;

	/*
	 * We can't change the weight of the root cgroup.
	 */
9854 if (!tg->se[0])
9855 return -EINVAL;
9856
9857 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
9858
9859 mutex_lock(&shares_mutex);
9860 if (tg->shares == shares)
9861 goto done;
9862
9863 tg->shares = shares;
9864 for_each_possible_cpu(i) {
9865 struct rq *rq = cpu_rq(i);
9866 struct sched_entity *se = tg->se[i];
9867 struct rq_flags rf;

		/* Propagate contribution to hierarchy */
9870 rq_lock_irqsave(rq, &rf);
9871 update_rq_clock(rq);
9872 for_each_sched_entity(se) {
9873 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9874 update_cfs_group(se);
9875 }
9876 rq_unlock_irqrestore(rq, &rf);
9877 }
9878
9879done:
9880 mutex_unlock(&shares_mutex);
9881 return 0;
9882}
9883#else
9884
9885void free_fair_sched_group(struct task_group *tg) { }
9886
9887int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9888{
9889 return 1;
9890}
9891
9892void online_fair_sched_group(struct task_group *tg) { }
9893
9894void unregister_fair_sched_group(struct task_group *tg) { }
9895
9896#endif
9897
9898
9899static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
9900{
9901 struct sched_entity *se = &task->se;
9902 unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
9908 if (rq->cfs.load.weight)
9909 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
9910
9911 return rr_interval;
9912}

/*
 * All the scheduling class methods:
 */
9917const struct sched_class fair_sched_class = {
9918 .next = &idle_sched_class,
9919 .enqueue_task = enqueue_task_fair,
9920 .dequeue_task = dequeue_task_fair,
9921 .yield_task = yield_task_fair,
9922 .yield_to_task = yield_to_task_fair,
9923
9924 .check_preempt_curr = check_preempt_wakeup,
9925
9926 .pick_next_task = pick_next_task_fair,
9927 .put_prev_task = put_prev_task_fair,
9928
9929#ifdef CONFIG_SMP
9930 .select_task_rq = select_task_rq_fair,
9931 .migrate_task_rq = migrate_task_rq_fair,
9932
9933 .rq_online = rq_online_fair,
9934 .rq_offline = rq_offline_fair,
9935
9936 .task_dead = task_dead_fair,
9937 .set_cpus_allowed = set_cpus_allowed_common,
9938#endif
9939
9940 .set_curr_task = set_curr_task_fair,
9941 .task_tick = task_tick_fair,
9942 .task_fork = task_fork_fair,
9943
9944 .prio_changed = prio_changed_fair,
9945 .switched_from = switched_from_fair,
9946 .switched_to = switched_to_fair,
9947
9948 .get_rr_interval = get_rr_interval_fair,
9949
9950 .update_curr = update_curr_fair,
9951
9952#ifdef CONFIG_FAIR_GROUP_SCHED
9953 .task_change_group = task_change_group_fair,
9954#endif
9955};
9956
9957#ifdef CONFIG_SCHED_DEBUG
9958void print_cfs_stats(struct seq_file *m, int cpu)
9959{
9960 struct cfs_rq *cfs_rq, *pos;
9961
9962 rcu_read_lock();
9963 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
9964 print_cfs_rq(m, cpu, cfs_rq);
9965 rcu_read_unlock();
9966}
9967
9968#ifdef CONFIG_NUMA_BALANCING
9969void show_numa_stats(struct task_struct *p, struct seq_file *m)
9970{
9971 int node;
9972 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
9973
9974 for_each_online_node(node) {
9975 if (p->numa_faults) {
9976 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
9977 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
9978 }
9979 if (p->numa_group) {
9980 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
9981 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
9982 }
9983 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
9984 }
9985}
9986#endif
9987#endif
9988
9989__init void init_sched_fair_class(void)
9990{
9991#ifdef CONFIG_SMP
9992 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
9993
9994#ifdef CONFIG_NO_HZ_COMMON
9995 nohz.next_balance = jiffies;
9996 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
9997#endif
9998#endif
9999
10000}
10001