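/*
 * Completely Fair Scheduling (CFS) class
 * (SCHED_NORMAL/SCHED_BATCH): schedulable entities are ordered by
 * virtual runtime (vruntime) in a per-cfs_rq rbtree.
 */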
#include <linux/sched.h>
#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"
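
/*
 * Targeted preemption latency for CPU-bound tasks
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds).
 */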
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
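
/*
 * The initial- and re-scaling of the tunables is configurable
 * (default: SCHED_TUNABLESCALING_LOG):
 *
 *  SCHED_TUNABLESCALING_NONE   - unscaled, always *1
 *  SCHED_TUNABLESCALING_LOG    - scaled logarithmically, *(1 + ilog(ncpus))
 *  SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */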
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;
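
/*
 * Minimal preemption granularity for CPU-bound tasks
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds).
 */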
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
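
/* Kept equal to sysctl_sched_latency / sysctl_sched_min_granularity. */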
static unsigned int sched_nr_latency = 8;
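
/*
 * After fork, child runs first. If set to 0 (default) then
 * the parent will (try to) run first.
 */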
unsigned int sysctl_sched_child_runs_first __read_mostly;
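
/*
 * SCHED_OTHER wake-up granularity
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds).
 *
 * This delays preemption of the current task by a newly woken task and so
 * reduces over-scheduling of loosely coupled workloads.
 */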
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
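
/*
 * Window over which cfs group shares/load are averaged
 * (default: 10 msec, units: nanoseconds).
 */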
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
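/*
 * Amount of runtime allocated from the global (tg) bandwidth pool to a
 * per-CPU local pool in one slice (default: 5 msec, units: microseconds).
 */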
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
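
/*
 * Helpers for adjusting a load_weight: any change to ->weight invalidates
 * the cached fixed-point inverse, which __update_inv_weight() recomputes
 * lazily.
 */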
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}
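
/*
 * Scaling factor applied to the tunables above, derived from the number of
 * online CPUs (capped at 8) according to sysctl_sched_tunable_scaling.
 */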
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}
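
/*
 * __calc_delta() computes, in fixed point:
 *
 *	delta_exec * weight / lw->weight
 *	  ==
 *	(delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * The (weight * inv_weight) product is renormalized to fit in 32 bits
 * before the final mul_u64_u32_shr().
 */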
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
238
239
240const struct sched_class fair_sched_class;
241
242
243
244
245
246#ifdef CONFIG_FAIR_GROUP_SCHED
247
248
249static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
250{
251 return cfs_rq->rq;
252}
253
254
255#define entity_is_task(se) (!se->my_q)
256
257static inline struct task_struct *task_of(struct sched_entity *se)
258{
259#ifdef CONFIG_SCHED_DEBUG
260 WARN_ON_ONCE(!entity_is_task(se));
261#endif
262 return container_of(se, struct task_struct, se);
263}
264
265
266#define for_each_sched_entity(se) \
267 for (; se; se = se->parent)
268
269static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
270{
271 return p->se.cfs_rq;
272}
273
274
275static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
276{
277 return se->cfs_rq;
278}
279
280
281static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
282{
283 return grp->my_q;
284}
285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288 if (!cfs_rq->on_list) {
289
290
291
292
293
294
295 if (cfs_rq->tg->parent &&
296 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
297 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
298 &rq_of(cfs_rq)->leaf_cfs_rq_list);
299 } else {
300 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
301 &rq_of(cfs_rq)->leaf_cfs_rq_list);
302 }
303
304 cfs_rq->on_list = 1;
305 }
306}
307
308static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
309{
310 if (cfs_rq->on_list) {
311 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
312 cfs_rq->on_list = 0;
313 }
314}
315
316
317#define for_each_leaf_cfs_rq(rq, cfs_rq) \
318 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
319
320
321static inline struct cfs_rq *
322is_same_group(struct sched_entity *se, struct sched_entity *pse)
323{
324 if (se->cfs_rq == pse->cfs_rq)
325 return se->cfs_rq;
326
327 return NULL;
328}
329
330static inline struct sched_entity *parent_entity(struct sched_entity *se)
331{
332 return se->parent;
333}
334
335static void
336find_matching_se(struct sched_entity **se, struct sched_entity **pse)
337{
338 int se_depth, pse_depth;
339
340
341
342
343
344
345
346
347
348 se_depth = (*se)->depth;
349 pse_depth = (*pse)->depth;
350
351 while (se_depth > pse_depth) {
352 se_depth--;
353 *se = parent_entity(*se);
354 }
355
356 while (pse_depth > se_depth) {
357 pse_depth--;
358 *pse = parent_entity(*pse);
359 }
360
361 while (!is_same_group(*se, *pse)) {
362 *se = parent_entity(*se);
363 *pse = parent_entity(*pse);
364 }
365}
366
367#else
368
369static inline struct task_struct *task_of(struct sched_entity *se)
370{
371 return container_of(se, struct task_struct, se);
372}
373
374static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
375{
376 return container_of(cfs_rq, struct rq, cfs);
377}
378
379#define entity_is_task(se) 1
380
381#define for_each_sched_entity(se) \
382 for (; se; se = NULL)
383
384static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
385{
386 return &task_rq(p)->cfs;
387}
388
389static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
390{
391 struct task_struct *p = task_of(se);
392 struct rq *rq = task_rq(p);
393
394 return &rq->cfs;
395}
396
397
398static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
399{
400 return NULL;
401}
402
403static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
404{
405}
406
407static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
408{
409}
410
411#define for_each_leaf_cfs_rq(rq, cfs_rq) \
412 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
413
414static inline struct sched_entity *parent_entity(struct sched_entity *se)
415{
416 return NULL;
417}
418
419static inline void
420find_matching_se(struct sched_entity **se, struct sched_entity **pse)
421{
422}
423
424#endif
425
426static __always_inline
427void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
428
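/*
 * vruntimes are compared via signed subtraction so that the comparisons
 * remain correct when the u64 values wrap.
 */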
433static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
434{
435 s64 delta = (s64)(vruntime - max_vruntime);
436 if (delta > 0)
437 max_vruntime = vruntime;
438
439 return max_vruntime;
440}
441
442static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
443{
444 s64 delta = (s64)(vruntime - min_vruntime);
445 if (delta < 0)
446 min_vruntime = vruntime;
447
448 return min_vruntime;
449}
450
451static inline int entity_before(struct sched_entity *a,
452 struct sched_entity *b)
453{
454 return (s64)(a->vruntime - b->vruntime) < 0;
455}
456
457static void update_min_vruntime(struct cfs_rq *cfs_rq)
458{
459 u64 vruntime = cfs_rq->min_vruntime;
460
461 if (cfs_rq->curr)
462 vruntime = cfs_rq->curr->vruntime;
463
464 if (cfs_rq->rb_leftmost) {
465 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 struct sched_entity,
467 run_node);
468
469 if (!cfs_rq->curr)
470 vruntime = se->vruntime;
471 else
472 vruntime = min_vruntime(vruntime, se->vruntime);
473 }
474
475
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT
478 smp_wmb();
479 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480#endif
481}
482
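/*
 * Enqueue an entity into the rb-tree ordered by vruntime, caching the
 * leftmost (smallest-vruntime) node for O(1) picking.
 */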
486static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487{
488 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 struct rb_node *parent = NULL;
490 struct sched_entity *entry;
491 int leftmost = 1;
492
493
494
495
496 while (*link) {
497 parent = *link;
498 entry = rb_entry(parent, struct sched_entity, run_node);
499
500
501
502
503 if (entity_before(se, entry)) {
504 link = &parent->rb_left;
505 } else {
506 link = &parent->rb_right;
507 leftmost = 0;
508 }
509 }
510
511
512
513
514
515 if (leftmost)
516 cfs_rq->rb_leftmost = &se->run_node;
517
518 rb_link_node(&se->run_node, parent, link);
519 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520}
521
522static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523{
524 if (cfs_rq->rb_leftmost == &se->run_node) {
525 struct rb_node *next_node;
526
527 next_node = rb_next(&se->run_node);
528 cfs_rq->rb_leftmost = next_node;
529 }
530
531 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532}
533
534struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535{
536 struct rb_node *left = cfs_rq->rb_leftmost;
537
538 if (!left)
539 return NULL;
540
541 return rb_entry(left, struct sched_entity, run_node);
542}
543
544static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545{
546 struct rb_node *next = rb_next(&se->run_node);
547
548 if (!next)
549 return NULL;
550
551 return rb_entry(next, struct sched_entity, run_node);
552}
553
554#ifdef CONFIG_SCHED_DEBUG
555struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556{
557 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558
559 if (!last)
560 return NULL;
561
562 return rb_entry(last, struct sched_entity, run_node);
563}
564
565
566
567
568
569int sched_proc_update_handler(struct ctl_table *table, int write,
570 void __user *buffer, size_t *lenp,
571 loff_t *ppos)
572{
573 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 unsigned int factor = get_update_sysctl_factor();
575
576 if (ret || !write)
577 return ret;
578
579 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 sysctl_sched_min_granularity);
581
582#define WRT_SYSCTL(name) \
583 (normalized_sysctl_##name = sysctl_##name / (factor))
584 WRT_SYSCTL(sched_min_granularity);
585 WRT_SYSCTL(sched_latency);
586 WRT_SYSCTL(sched_wakeup_granularity);
587#undef WRT_SYSCTL
588
589 return 0;
590}
591#endif
592
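/*
 * delta /= w: scale delta_exec by NICE_0_LOAD relative to the entity's
 * weight, turning wall-clock runtime into vruntime.
 */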
596static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
597{
598 if (unlikely(se->load.weight != NICE_0_LOAD))
599 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
600
601 return delta;
602}
603
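/*
 * The idea is to set a period in which each runnable task runs once.
 * When there are more than sched_nr_latency tasks the period is stretched
 * so that slices do not get smaller than sysctl_sched_min_granularity:
 *
 *	p = (nr <= nl) ? latency : min_gran * nr
 */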
612static u64 __sched_period(unsigned long nr_running)
613{
614 if (unlikely(nr_running > sched_nr_latency))
615 return nr_running * sysctl_sched_min_granularity;
616 else
617 return sysctl_sched_latency;
618}
619
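/*
 * The wall-time slice is the part of the period proportional to this
 * entity's weight within its runqueue:
 *
 *	s = p * P[w/rw]
 */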
626static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
627{
628 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
629
630 for_each_sched_entity(se) {
631 struct load_weight *load;
632 struct load_weight lw;
633
634 cfs_rq = cfs_rq_of(se);
635 load = &cfs_rq->load;
636
637 if (unlikely(!se->on_rq)) {
638 lw = cfs_rq->load;
639
640 update_load_add(&lw, se->load.weight);
641 load = &lw;
642 }
643 slice = __calc_delta(slice, se->load.weight, load);
644 }
645 return slice;
646}
647
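/*
 * The vruntime slice of a to-be-inserted task:
 *
 *	vs = s/w
 */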
653static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{
655 return calc_delta_fair(sched_slice(cfs_rq, se), se);
656}
657
658#ifdef CONFIG_SMP
659static int select_idle_sibling(struct task_struct *p, int cpu);
660static unsigned long task_h_load(struct task_struct *p);
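/*
 * Per-entity load tracking (PELT) constants: history decays geometrically
 * with a half-life of LOAD_AVG_PERIOD periods; LOAD_AVG_MAX is the maximum
 * possible load_sum, reached after LOAD_AVG_MAX_N periods.
 */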
667#define LOAD_AVG_PERIOD 32
668#define LOAD_AVG_MAX 47742
669#define LOAD_AVG_MAX_N 345
670
671
672void init_entity_runnable_average(struct sched_entity *se)
673{
674 struct sched_avg *sa = &se->avg;
675
676 sa->last_update_time = 0;
677
678
679
680
681
682 sa->period_contrib = 1023;
683 sa->load_avg = scale_load_down(se->load.weight);
684 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685
686
687
688 sa->util_avg = 0;
689 sa->util_sum = 0;
690
691}
692
693static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
694static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
695static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
696static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
697
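/*
 * With new tasks being created, their initial util_avgs are extrapolated
 * from the cfs_rq's current util_avg:
 *
 *	util_avg = cfs_rq->avg.util_avg * se->load.weight
 *			/ (cfs_rq->avg.load_avg + 1)
 *
 * and capped at half of the cfs_rq's spare capacity.
 */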
723void post_init_entity_util_avg(struct sched_entity *se)
724{
725 struct cfs_rq *cfs_rq = cfs_rq_of(se);
726 struct sched_avg *sa = &se->avg;
727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
730
731 if (cap > 0) {
732 if (cfs_rq->avg.util_avg != 0) {
733 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
734 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
735
736 if (sa->util_avg > cap)
737 sa->util_avg = cap;
738 } else {
739 sa->util_avg = cap;
740 }
741 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
742 }
743
744 if (entity_is_task(se)) {
745 struct task_struct *p = task_of(se);
746 if (p->sched_class != &fair_sched_class) {
747
748
749
750
751
752
753
754
755
756
757 se->avg.last_update_time = now;
758 return;
759 }
760 }
761
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update)
765 update_tg_load_avg(cfs_rq, false);
766}
767
768#else
769void init_entity_runnable_average(struct sched_entity *se)
770{
771}
772void post_init_entity_util_avg(struct sched_entity *se)
773{
774}
775static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
776{
777}
778#endif
779
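/*
 * Update the current entity's runtime statistics and advance its vruntime.
 */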
783static void update_curr(struct cfs_rq *cfs_rq)
784{
785 struct sched_entity *curr = cfs_rq->curr;
786 u64 now = rq_clock_task(rq_of(cfs_rq));
787 u64 delta_exec;
788
789 if (unlikely(!curr))
790 return;
791
792 delta_exec = now - curr->exec_start;
793 if (unlikely((s64)delta_exec <= 0))
794 return;
795
796 curr->exec_start = now;
797
798 schedstat_set(curr->statistics.exec_max,
799 max(delta_exec, curr->statistics.exec_max));
800
801 curr->sum_exec_runtime += delta_exec;
802 schedstat_add(cfs_rq, exec_clock, delta_exec);
803
804 curr->vruntime += calc_delta_fair(delta_exec, curr);
805 update_min_vruntime(cfs_rq);
806
807 if (entity_is_task(curr)) {
808 struct task_struct *curtask = task_of(curr);
809
810 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
811 cpuacct_charge(curtask, delta_exec);
812 account_group_exec_runtime(curtask, delta_exec);
813 }
814
815 account_cfs_rq_runtime(cfs_rq, delta_exec);
816}
817
818static void update_curr_fair(struct rq *rq)
819{
820 update_curr(cfs_rq_of(&rq->curr->se));
821}
822
823#ifdef CONFIG_SCHEDSTATS
824static inline void
825update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
826{
827 u64 wait_start = rq_clock(rq_of(cfs_rq));
828
829 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
830 likely(wait_start > se->statistics.wait_start))
831 wait_start -= se->statistics.wait_start;
832
833 se->statistics.wait_start = wait_start;
834}
835
836static void
837update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
838{
839 struct task_struct *p;
840 u64 delta;
841
842 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
843
844 if (entity_is_task(se)) {
845 p = task_of(se);
846 if (task_on_rq_migrating(p)) {
847
848
849
850
851
852 se->statistics.wait_start = delta;
853 return;
854 }
855 trace_sched_stat_wait(p, delta);
856 }
857
858 se->statistics.wait_max = max(se->statistics.wait_max, delta);
859 se->statistics.wait_count++;
860 se->statistics.wait_sum += delta;
861 se->statistics.wait_start = 0;
862}
863
864
865
866
867static inline void
868update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
869{
870
871
872
873
874 if (se != cfs_rq->curr)
875 update_stats_wait_start(cfs_rq, se);
876}
877
878static inline void
879update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
880{
881
882
883
884
885 if (se != cfs_rq->curr)
886 update_stats_wait_end(cfs_rq, se);
887
888 if (flags & DEQUEUE_SLEEP) {
889 if (entity_is_task(se)) {
890 struct task_struct *tsk = task_of(se);
891
892 if (tsk->state & TASK_INTERRUPTIBLE)
893 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
894 if (tsk->state & TASK_UNINTERRUPTIBLE)
895 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
896 }
897 }
898
899}
900#else
901static inline void
902update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
903{
904}
905
906static inline void
907update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
908{
909}
910
911static inline void
912update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
913{
914}
915
916static inline void
917update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
918{
919}
920#endif
921
922
923
924
925static inline void
926update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
927{
928
929
930
931 se->exec_start = rq_clock_task(rq_of(cfs_rq));
932}
933
934
935
936
937
938#ifdef CONFIG_NUMA_BALANCING
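/* NUMA-balancing scan-rate tunables: periods/delay in ms, scan size in MB. */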
944unsigned int sysctl_numa_balancing_scan_period_min = 1000;
945unsigned int sysctl_numa_balancing_scan_period_max = 60000;
946
947
948unsigned int sysctl_numa_balancing_scan_size = 256;
949
950
951unsigned int sysctl_numa_balancing_scan_delay = 1000;
952
953static unsigned int task_nr_scan_windows(struct task_struct *p)
954{
955 unsigned long rss = 0;
956 unsigned long nr_scan_pages;
957
958
959
960
961
962
963 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
964 rss = get_mm_rss(p->mm);
965 if (!rss)
966 rss = nr_scan_pages;
967
968 rss = round_up(rss, nr_scan_pages);
969 return rss / nr_scan_pages;
970}
971
972
973#define MAX_SCAN_WINDOW 2560
974
975static unsigned int task_scan_min(struct task_struct *p)
976{
977 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
978 unsigned int scan, floor;
979 unsigned int windows = 1;
980
981 if (scan_size < MAX_SCAN_WINDOW)
982 windows = MAX_SCAN_WINDOW / scan_size;
983 floor = 1000 / windows;
984
985 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
986 return max_t(unsigned int, floor, scan);
987}
988
989static unsigned int task_scan_max(struct task_struct *p)
990{
991 unsigned int smin = task_scan_min(p);
992 unsigned int smax;
993
994
995 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
996 return max(smin, smax);
997}
998
999static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1000{
1001 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1002 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1003}
1004
1005static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1006{
1007 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1008 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1009}
1010
1011struct numa_group {
1012 atomic_t refcount;
1013
1014 spinlock_t lock;
1015 int nr_tasks;
1016 pid_t gid;
1017 int active_nodes;
1018
1019 struct rcu_head rcu;
1020 unsigned long total_faults;
1021 unsigned long max_faults_cpu;
1022
1023
1024
1025
1026
1027 unsigned long *faults_cpu;
1028 unsigned long faults[0];
1029};
1030
1031
1032#define NR_NUMA_HINT_FAULT_TYPES 2
1033
1034
1035#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1036
1037
1038#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1039
1040pid_t task_numa_group_id(struct task_struct *p)
1041{
1042 return p->numa_group ? p->numa_group->gid : 0;
1043}
1044
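/*
 * Per-node fault statistics are kept for memory and CPU placement
 * (NUMA_MEM/NUMA_CPU plus their _BUF staging buffers), each split into
 * private and shared faults; task_faults_idx() maps (stat, nid, priv)
 * onto the flat numa_faults[] array.
 */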
1051static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1052{
1053 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1054}
1055
1056static inline unsigned long task_faults(struct task_struct *p, int nid)
1057{
1058 if (!p->numa_faults)
1059 return 0;
1060
1061 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1062 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1063}
1064
1065static inline unsigned long group_faults(struct task_struct *p, int nid)
1066{
1067 if (!p->numa_group)
1068 return 0;
1069
1070 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1071 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1072}
1073
1074static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1075{
1076 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1077 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1078}
1079
1080
1081
1082
1083
1084
1085#define ACTIVE_NODE_FRACTION 3
1086
1087static bool numa_is_active_node(int nid, struct numa_group *ng)
1088{
1089 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1090}
1091
1092
1093static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1094 int maxdist, bool task)
1095{
1096 unsigned long score = 0;
1097 int node;
1098
1099
1100
1101
1102
1103 if (sched_numa_topology_type == NUMA_DIRECT)
1104 return 0;
1105
1106
1107
1108
1109
1110 for_each_online_node(node) {
1111 unsigned long faults;
1112 int dist = node_distance(nid, node);
1113
1114
1115
1116
1117
1118 if (dist == sched_max_numa_distance || node == nid)
1119 continue;
1120
1121
1122
1123
1124
1125
1126
1127
1128 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1129 dist > maxdist)
1130 continue;
1131
1132
1133 if (task)
1134 faults = task_faults(p, node);
1135 else
1136 faults = group_faults(p, node);
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1147 faults *= (sched_max_numa_distance - dist);
1148 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1149 }
1150
1151 score += faults;
1152 }
1153
1154 return score;
1155}
1156
1157
1158
1159
1160
1161
1162
1163static inline unsigned long task_weight(struct task_struct *p, int nid,
1164 int dist)
1165{
1166 unsigned long faults, total_faults;
1167
1168 if (!p->numa_faults)
1169 return 0;
1170
1171 total_faults = p->total_numa_faults;
1172
1173 if (!total_faults)
1174 return 0;
1175
1176 faults = task_faults(p, nid);
1177 faults += score_nearby_nodes(p, nid, dist, true);
1178
1179 return 1000 * faults / total_faults;
1180}
1181
1182static inline unsigned long group_weight(struct task_struct *p, int nid,
1183 int dist)
1184{
1185 unsigned long faults, total_faults;
1186
1187 if (!p->numa_group)
1188 return 0;
1189
1190 total_faults = p->numa_group->total_faults;
1191
1192 if (!total_faults)
1193 return 0;
1194
1195 faults = group_faults(p, nid);
1196 faults += score_nearby_nodes(p, nid, dist, false);
1197
1198 return 1000 * faults / total_faults;
1199}
1200
1201bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1202 int src_nid, int dst_cpu)
1203{
1204 struct numa_group *ng = p->numa_group;
1205 int dst_nid = cpu_to_node(dst_cpu);
1206 int last_cpupid, this_cpupid;
1207
1208 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1209
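	/*
	 * Two-stage filter: the previously recorded fault owner (last_cpupid)
	 * is swapped for this one, and a page is not migrated towards a node
	 * it was not recently accessed from.
	 */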
1227 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1228 if (!cpupid_pid_unset(last_cpupid) &&
1229 cpupid_to_nid(last_cpupid) != dst_nid)
1230 return false;
1231
1232
1233 if (cpupid_match_pid(p, last_cpupid))
1234 return true;
1235
1236
1237 if (!ng)
1238 return true;
1239
1240
1241
1242
1243
1244 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1245 ACTIVE_NODE_FRACTION)
1246 return true;
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1257 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1258}
1259
1260static unsigned long weighted_cpuload(const int cpu);
1261static unsigned long source_load(int cpu, int type);
1262static unsigned long target_load(int cpu, int type);
1263static unsigned long capacity_of(int cpu);
1264static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1265
1266
1267struct numa_stats {
1268 unsigned long nr_running;
1269 unsigned long load;
1270
1271
1272 unsigned long compute_capacity;
1273
1274
1275 unsigned long task_capacity;
1276 int has_free_capacity;
1277};
1278
1279
1280
1281
1282static void update_numa_stats(struct numa_stats *ns, int nid)
1283{
1284 int smt, cpu, cpus = 0;
1285 unsigned long capacity;
1286
1287 memset(ns, 0, sizeof(*ns));
1288 for_each_cpu(cpu, cpumask_of_node(nid)) {
1289 struct rq *rq = cpu_rq(cpu);
1290
1291 ns->nr_running += rq->nr_running;
1292 ns->load += weighted_cpuload(cpu);
1293 ns->compute_capacity += capacity_of(cpu);
1294
1295 cpus++;
1296 }
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306 if (!cpus)
1307 return;
1308
1309
1310 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1311 capacity = cpus / smt;
1312
1313 ns->task_capacity = min_t(unsigned, capacity,
1314 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1315 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1316}
1317
1318struct task_numa_env {
1319 struct task_struct *p;
1320
1321 int src_cpu, src_nid;
1322 int dst_cpu, dst_nid;
1323
1324 struct numa_stats src_stats, dst_stats;
1325
1326 int imbalance_pct;
1327 int dist;
1328
1329 struct task_struct *best_task;
1330 long best_imp;
1331 int best_cpu;
1332};
1333
1334static void task_numa_assign(struct task_numa_env *env,
1335 struct task_struct *p, long imp)
1336{
1337 if (env->best_task)
1338 put_task_struct(env->best_task);
1339 if (p)
1340 get_task_struct(p);
1341
1342 env->best_task = p;
1343 env->best_imp = imp;
1344 env->best_cpu = env->dst_cpu;
1345}
1346
1347static bool load_too_imbalanced(long src_load, long dst_load,
1348 struct task_numa_env *env)
1349{
1350 long imb, old_imb;
1351 long orig_src_load, orig_dst_load;
1352 long src_capacity, dst_capacity;
1353
1354
1355
1356
1357
1358
1359
1360
1361 src_capacity = env->src_stats.compute_capacity;
1362 dst_capacity = env->dst_stats.compute_capacity;
1363
1364
1365 if (dst_load < src_load)
1366 swap(dst_load, src_load);
1367
1368
1369 imb = dst_load * src_capacity * 100 -
1370 src_load * dst_capacity * env->imbalance_pct;
1371 if (imb <= 0)
1372 return false;
1373
1374
1375
1376
1377
1378 orig_src_load = env->src_stats.load;
1379 orig_dst_load = env->dst_stats.load;
1380
1381 if (orig_dst_load < orig_src_load)
1382 swap(orig_dst_load, orig_src_load);
1383
1384 old_imb = orig_dst_load * src_capacity * 100 -
1385 orig_src_load * dst_capacity * env->imbalance_pct;
1386
1387
1388 return (imb > old_imb);
1389}
1390
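/*
 * Check whether moving env->p to env->dst_cpu (possibly swapping it with
 * the task currently running there) would improve NUMA locality without
 * creating too large a load imbalance.
 */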
1397static void task_numa_compare(struct task_numa_env *env,
1398 long taskimp, long groupimp)
1399{
1400 struct rq *src_rq = cpu_rq(env->src_cpu);
1401 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1402 struct task_struct *cur;
1403 long src_load, dst_load;
1404 long load;
1405 long imp = env->p->numa_group ? groupimp : taskimp;
1406 long moveimp = imp;
1407 int dist = env->dist;
1408
1409 rcu_read_lock();
1410 cur = task_rcu_dereference(&dst_rq->curr);
1411 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1412 cur = NULL;
1413
1414
1415
1416
1417
1418 if (cur == env->p)
1419 goto unlock;
1420
1421
1422
1423
1424
1425
1426
1427
1428 if (cur) {
1429
1430 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1431 goto unlock;
1432
1433
1434
1435
1436
1437 if (cur->numa_group == env->p->numa_group) {
1438 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1439 task_weight(cur, env->dst_nid, dist);
1440
1441
1442
1443
1444 if (cur->numa_group)
1445 imp -= imp/16;
1446 } else {
1447
1448
1449
1450
1451
1452 if (cur->numa_group)
1453 imp += group_weight(cur, env->src_nid, dist) -
1454 group_weight(cur, env->dst_nid, dist);
1455 else
1456 imp += task_weight(cur, env->src_nid, dist) -
1457 task_weight(cur, env->dst_nid, dist);
1458 }
1459 }
1460
1461 if (imp <= env->best_imp && moveimp <= env->best_imp)
1462 goto unlock;
1463
1464 if (!cur) {
1465
1466 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1467 !env->dst_stats.has_free_capacity)
1468 goto unlock;
1469
1470 goto balance;
1471 }
1472
1473
1474 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1475 dst_rq->nr_running == 1)
1476 goto assign;
1477
1478
1479
1480
1481balance:
1482 load = task_h_load(env->p);
1483 dst_load = env->dst_stats.load + load;
1484 src_load = env->src_stats.load - load;
1485
1486 if (moveimp > imp && moveimp > env->best_imp) {
1487
1488
1489
1490
1491
1492
1493 if (!load_too_imbalanced(src_load, dst_load, env)) {
1494 imp = moveimp - 1;
1495 cur = NULL;
1496 goto assign;
1497 }
1498 }
1499
1500 if (imp <= env->best_imp)
1501 goto unlock;
1502
1503 if (cur) {
1504 load = task_h_load(cur);
1505 dst_load -= load;
1506 src_load += load;
1507 }
1508
1509 if (load_too_imbalanced(src_load, dst_load, env))
1510 goto unlock;
1511
1512
1513
1514
1515
1516 if (!cur)
1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1518
1519assign:
1520 task_numa_assign(env, cur, imp);
1521unlock:
1522 rcu_read_unlock();
1523}
1524
1525static void task_numa_find_cpu(struct task_numa_env *env,
1526 long taskimp, long groupimp)
1527{
1528 int cpu;
1529
1530 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1531
1532 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1533 continue;
1534
1535 env->dst_cpu = cpu;
1536 task_numa_compare(env, taskimp, groupimp);
1537 }
1538}
1539
1540
1541static bool numa_has_capacity(struct task_numa_env *env)
1542{
1543 struct numa_stats *src = &env->src_stats;
1544 struct numa_stats *dst = &env->dst_stats;
1545
1546 if (src->has_free_capacity && !dst->has_free_capacity)
1547 return false;
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557 if (src->load * dst->compute_capacity * env->imbalance_pct >
1558
1559 dst->load * src->compute_capacity * 100)
1560 return true;
1561
1562 return false;
1563}
1564
1565static int task_numa_migrate(struct task_struct *p)
1566{
1567 struct task_numa_env env = {
1568 .p = p,
1569
1570 .src_cpu = task_cpu(p),
1571 .src_nid = task_node(p),
1572
1573 .imbalance_pct = 112,
1574
1575 .best_task = NULL,
1576 .best_imp = 0,
1577 .best_cpu = -1,
1578 };
1579 struct sched_domain *sd;
1580 unsigned long taskweight, groupweight;
1581 int nid, ret, dist;
1582 long taskimp, groupimp;
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592 rcu_read_lock();
1593 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1594 if (sd)
1595 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1596 rcu_read_unlock();
1597
1598
1599
1600
1601
1602
1603
1604 if (unlikely(!sd)) {
1605 p->numa_preferred_nid = task_node(p);
1606 return -EINVAL;
1607 }
1608
1609 env.dst_nid = p->numa_preferred_nid;
1610 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1611 taskweight = task_weight(p, env.src_nid, dist);
1612 groupweight = group_weight(p, env.src_nid, dist);
1613 update_numa_stats(&env.src_stats, env.src_nid);
1614 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1615 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1616 update_numa_stats(&env.dst_stats, env.dst_nid);
1617
1618
1619 if (numa_has_capacity(&env))
1620 task_numa_find_cpu(&env, taskimp, groupimp);
1621
1622
1623
1624
1625
1626
1627
1628
1629 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1630 for_each_online_node(nid) {
1631 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1632 continue;
1633
1634 dist = node_distance(env.src_nid, env.dst_nid);
1635 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1636 dist != env.dist) {
1637 taskweight = task_weight(p, env.src_nid, dist);
1638 groupweight = group_weight(p, env.src_nid, dist);
1639 }
1640
1641
1642 taskimp = task_weight(p, nid, dist) - taskweight;
1643 groupimp = group_weight(p, nid, dist) - groupweight;
1644 if (taskimp < 0 && groupimp < 0)
1645 continue;
1646
1647 env.dist = dist;
1648 env.dst_nid = nid;
1649 update_numa_stats(&env.dst_stats, env.dst_nid);
1650 if (numa_has_capacity(&env))
1651 task_numa_find_cpu(&env, taskimp, groupimp);
1652 }
1653 }
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663 if (p->numa_group) {
1664 struct numa_group *ng = p->numa_group;
1665
1666 if (env.best_cpu == -1)
1667 nid = env.src_nid;
1668 else
1669 nid = env.dst_nid;
1670
1671 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1672 sched_setnuma(p, env.dst_nid);
1673 }
1674
1675
1676 if (env.best_cpu == -1)
1677 return -EAGAIN;
1678
1679
1680
1681
1682
1683 p->numa_scan_period = task_scan_min(p);
1684
1685 if (env.best_task == NULL) {
1686 ret = migrate_task_to(p, env.best_cpu);
1687 if (ret != 0)
1688 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1689 return ret;
1690 }
1691
1692 ret = migrate_swap(p, env.best_task);
1693 if (ret != 0)
1694 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1695 put_task_struct(env.best_task);
1696 return ret;
1697}
1698
1699
1700static void numa_migrate_preferred(struct task_struct *p)
1701{
1702 unsigned long interval = HZ;
1703
1704
1705 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1706 return;
1707
1708
1709 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1710 p->numa_migrate_retry = jiffies + interval;
1711
1712
1713 if (task_node(p) == p->numa_preferred_nid)
1714 return;
1715
1716
1717 task_numa_migrate(p);
1718}
1719
1720
1721
1722
1723
1724
1725
1726static void numa_group_count_active_nodes(struct numa_group *numa_group)
1727{
1728 unsigned long faults, max_faults = 0;
1729 int nid, active_nodes = 0;
1730
1731 for_each_online_node(nid) {
1732 faults = group_faults_cpu(numa_group, nid);
1733 if (faults > max_faults)
1734 max_faults = faults;
1735 }
1736
1737 for_each_online_node(nid) {
1738 faults = group_faults_cpu(numa_group, nid);
1739 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1740 active_nodes++;
1741 }
1742
1743 numa_group->max_faults_cpu = max_faults;
1744 numa_group->active_nodes = active_nodes;
1745}
1746
1747
1748
1749
1750
1751
1752
1753
1754#define NUMA_PERIOD_SLOTS 10
1755#define NUMA_PERIOD_THRESHOLD 7
1756
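/*
 * Adapt the scan period to recent fault locality: mostly-local faults slow
 * scanning down (raise the period), mostly-remote faults speed it up, with
 * the speed-up damped when most faults are shared rather than private.
 */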
1763static void update_task_scan_period(struct task_struct *p,
1764 unsigned long shared, unsigned long private)
1765{
1766 unsigned int period_slot;
1767 int ratio;
1768 int diff;
1769
1770 unsigned long remote = p->numa_faults_locality[0];
1771 unsigned long local = p->numa_faults_locality[1];
1772
1773
1774
1775
1776
1777
1778
1779
1780 if (local + shared == 0 || p->numa_faults_locality[2]) {
1781 p->numa_scan_period = min(p->numa_scan_period_max,
1782 p->numa_scan_period << 1);
1783
1784 p->mm->numa_next_scan = jiffies +
1785 msecs_to_jiffies(p->numa_scan_period);
1786
1787 return;
1788 }
1789
1790
1791
1792
1793
1794
1795
1796 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1797 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1798 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1799 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1800 if (!slot)
1801 slot = 1;
1802 diff = slot * period_slot;
1803 } else {
1804 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1815 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1816 }
1817
1818 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1819 task_scan_min(p), task_scan_max(p));
1820 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1821}
1822
1823
1824
1825
1826
1827
1828
1829
1830static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1831{
1832 u64 runtime, delta, now;
1833
1834 now = p->se.exec_start;
1835 runtime = p->se.sum_exec_runtime;
1836
1837 if (p->last_task_numa_placement) {
1838 delta = runtime - p->last_sum_exec_runtime;
1839 *period = now - p->last_task_numa_placement;
1840 } else {
1841 delta = p->se.avg.load_sum / p->se.load.weight;
1842 *period = LOAD_AVG_MAX;
1843 }
1844
1845 p->last_sum_exec_runtime = runtime;
1846 p->last_task_numa_placement = now;
1847
1848 return delta;
1849}
1850
1851
1852
1853
1854
1855
1856static int preferred_group_nid(struct task_struct *p, int nid)
1857{
1858 nodemask_t nodes;
1859 int dist;
1860
1861
1862 if (sched_numa_topology_type == NUMA_DIRECT)
1863 return nid;
1864
1865
1866
1867
1868
1869
1870 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1871 unsigned long score, max_score = 0;
1872 int node, max_node = nid;
1873
1874 dist = sched_max_numa_distance;
1875
1876 for_each_online_node(node) {
1877 score = group_weight(p, node, dist);
1878 if (score > max_score) {
1879 max_score = score;
1880 max_node = node;
1881 }
1882 }
1883 return max_node;
1884 }
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895 nodes = node_online_map;
1896 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1897 unsigned long max_faults = 0;
1898 nodemask_t max_group = NODE_MASK_NONE;
1899 int a, b;
1900
1901
1902 if (!find_numa_distance(dist))
1903 continue;
1904
1905 for_each_node_mask(a, nodes) {
1906 unsigned long faults = 0;
1907 nodemask_t this_group;
1908 nodes_clear(this_group);
1909
1910
1911 for_each_node_mask(b, nodes) {
1912 if (node_distance(a, b) < dist) {
1913 faults += group_faults(p, b);
1914 node_set(b, this_group);
1915 node_clear(b, nodes);
1916 }
1917 }
1918
1919
1920 if (faults > max_faults) {
1921 max_faults = faults;
1922 max_group = this_group;
1923
1924
1925
1926
1927
1928 nid = a;
1929 }
1930 }
1931
1932 if (!max_faults)
1933 break;
1934 nodes = max_group;
1935 }
1936 return nid;
1937}
1938
1939static void task_numa_placement(struct task_struct *p)
1940{
1941 int seq, nid, max_nid = -1, max_group_nid = -1;
1942 unsigned long max_faults = 0, max_group_faults = 0;
1943 unsigned long fault_types[2] = { 0, 0 };
1944 unsigned long total_faults;
1945 u64 runtime, period;
1946 spinlock_t *group_lock = NULL;
1947
1948
1949
1950
1951
1952
1953 seq = READ_ONCE(p->mm->numa_scan_seq);
1954 if (p->numa_scan_seq == seq)
1955 return;
1956 p->numa_scan_seq = seq;
1957 p->numa_scan_period_max = task_scan_max(p);
1958
1959 total_faults = p->numa_faults_locality[0] +
1960 p->numa_faults_locality[1];
1961 runtime = numa_get_avg_runtime(p, &period);
1962
1963
1964 if (p->numa_group) {
1965 group_lock = &p->numa_group->lock;
1966 spin_lock_irq(group_lock);
1967 }
1968
1969
1970 for_each_online_node(nid) {
1971
1972 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1973 unsigned long faults = 0, group_faults = 0;
1974 int priv;
1975
1976 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1977 long diff, f_diff, f_weight;
1978
1979 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1980 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1981 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1982 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1983
1984
1985 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1986 fault_types[priv] += p->numa_faults[membuf_idx];
1987 p->numa_faults[membuf_idx] = 0;
1988
1989
1990
1991
1992
1993
1994
1995
1996 f_weight = div64_u64(runtime << 16, period + 1);
1997 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1998 (total_faults + 1);
1999 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2000 p->numa_faults[cpubuf_idx] = 0;
2001
2002 p->numa_faults[mem_idx] += diff;
2003 p->numa_faults[cpu_idx] += f_diff;
2004 faults += p->numa_faults[mem_idx];
2005 p->total_numa_faults += diff;
2006 if (p->numa_group) {
2007
2008
2009
2010
2011
2012
2013
2014 p->numa_group->faults[mem_idx] += diff;
2015 p->numa_group->faults_cpu[mem_idx] += f_diff;
2016 p->numa_group->total_faults += diff;
2017 group_faults += p->numa_group->faults[mem_idx];
2018 }
2019 }
2020
2021 if (faults > max_faults) {
2022 max_faults = faults;
2023 max_nid = nid;
2024 }
2025
2026 if (group_faults > max_group_faults) {
2027 max_group_faults = group_faults;
2028 max_group_nid = nid;
2029 }
2030 }
2031
2032 update_task_scan_period(p, fault_types[0], fault_types[1]);
2033
2034 if (p->numa_group) {
2035 numa_group_count_active_nodes(p->numa_group);
2036 spin_unlock_irq(group_lock);
2037 max_nid = preferred_group_nid(p, max_group_nid);
2038 }
2039
2040 if (max_faults) {
2041
2042 if (max_nid != p->numa_preferred_nid)
2043 sched_setnuma(p, max_nid);
2044
2045 if (task_node(p) != p->numa_preferred_nid)
2046 numa_migrate_preferred(p);
2047 }
2048}
2049
2050static inline int get_numa_group(struct numa_group *grp)
2051{
2052 return atomic_inc_not_zero(&grp->refcount);
2053}
2054
2055static inline void put_numa_group(struct numa_group *grp)
2056{
2057 if (atomic_dec_and_test(&grp->refcount))
2058 kfree_rcu(grp, rcu);
2059}
2060
2061static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2062 int *priv)
2063{
2064 struct numa_group *grp, *my_grp;
2065 struct task_struct *tsk;
2066 bool join = false;
2067 int cpu = cpupid_to_cpu(cpupid);
2068 int i;
2069
2070 if (unlikely(!p->numa_group)) {
2071 unsigned int size = sizeof(struct numa_group) +
2072 4*nr_node_ids*sizeof(unsigned long);
2073
2074 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2075 if (!grp)
2076 return;
2077
2078 atomic_set(&grp->refcount, 1);
2079 grp->active_nodes = 1;
2080 grp->max_faults_cpu = 0;
2081 spin_lock_init(&grp->lock);
2082 grp->gid = p->pid;
2083
2084 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2085 nr_node_ids;
2086
2087 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2088 grp->faults[i] = p->numa_faults[i];
2089
2090 grp->total_faults = p->total_numa_faults;
2091
2092 grp->nr_tasks++;
2093 rcu_assign_pointer(p->numa_group, grp);
2094 }
2095
2096 rcu_read_lock();
2097 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2098
2099 if (!cpupid_match_pid(tsk, cpupid))
2100 goto no_join;
2101
2102 grp = rcu_dereference(tsk->numa_group);
2103 if (!grp)
2104 goto no_join;
2105
2106 my_grp = p->numa_group;
2107 if (grp == my_grp)
2108 goto no_join;
2109
2110
2111
2112
2113
2114 if (my_grp->nr_tasks > grp->nr_tasks)
2115 goto no_join;
2116
2117
2118
2119
2120 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2121 goto no_join;
2122
2123
2124 if (tsk->mm == current->mm)
2125 join = true;
2126
2127
2128 if (flags & TNF_SHARED)
2129 join = true;
2130
2131
2132 *priv = !join;
2133
2134 if (join && !get_numa_group(grp))
2135 goto no_join;
2136
2137 rcu_read_unlock();
2138
2139 if (!join)
2140 return;
2141
2142 BUG_ON(irqs_disabled());
2143 double_lock_irq(&my_grp->lock, &grp->lock);
2144
2145 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2146 my_grp->faults[i] -= p->numa_faults[i];
2147 grp->faults[i] += p->numa_faults[i];
2148 }
2149 my_grp->total_faults -= p->total_numa_faults;
2150 grp->total_faults += p->total_numa_faults;
2151
2152 my_grp->nr_tasks--;
2153 grp->nr_tasks++;
2154
2155 spin_unlock(&my_grp->lock);
2156 spin_unlock_irq(&grp->lock);
2157
2158 rcu_assign_pointer(p->numa_group, grp);
2159
2160 put_numa_group(my_grp);
2161 return;
2162
2163no_join:
2164 rcu_read_unlock();
2165 return;
2166}
2167
2168void task_numa_free(struct task_struct *p)
2169{
2170 struct numa_group *grp = p->numa_group;
2171 void *numa_faults = p->numa_faults;
2172 unsigned long flags;
2173 int i;
2174
2175 if (grp) {
2176 spin_lock_irqsave(&grp->lock, flags);
2177 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2178 grp->faults[i] -= p->numa_faults[i];
2179 grp->total_faults -= p->total_numa_faults;
2180
2181 grp->nr_tasks--;
2182 spin_unlock_irqrestore(&grp->lock, flags);
2183 RCU_INIT_POINTER(p->numa_group, NULL);
2184 put_numa_group(grp);
2185 }
2186
2187 p->numa_faults = NULL;
2188 kfree(numa_faults);
2189}
2190
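/*
 * Account a NUMA hinting fault for @pages pages on @mem_node and decide
 * whether to update this task's NUMA placement.
 */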
2194void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2195{
2196 struct task_struct *p = current;
2197 bool migrated = flags & TNF_MIGRATED;
2198 int cpu_node = task_node(current);
2199 int local = !!(flags & TNF_FAULT_LOCAL);
2200 struct numa_group *ng;
2201 int priv;
2202
2203 if (!static_branch_likely(&sched_numa_balancing))
2204 return;
2205
2206
2207 if (!p->mm)
2208 return;
2209
2210
2211 if (unlikely(!p->numa_faults)) {
2212 int size = sizeof(*p->numa_faults) *
2213 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2214
2215 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2216 if (!p->numa_faults)
2217 return;
2218
2219 p->total_numa_faults = 0;
2220 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2221 }
2222
2223
2224
2225
2226
2227 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2228 priv = 1;
2229 } else {
2230 priv = cpupid_match_pid(p, last_cpupid);
2231 if (!priv && !(flags & TNF_NO_GROUP))
2232 task_numa_group(p, last_cpupid, flags, &priv);
2233 }
2234
2235
2236
2237
2238
2239
2240
2241 ng = p->numa_group;
2242 if (!priv && !local && ng && ng->active_nodes > 1 &&
2243 numa_is_active_node(cpu_node, ng) &&
2244 numa_is_active_node(mem_node, ng))
2245 local = 1;
2246
2247 task_numa_placement(p);
2248
2249
2250
2251
2252
2253 if (time_after(jiffies, p->numa_migrate_retry))
2254 numa_migrate_preferred(p);
2255
2256 if (migrated)
2257 p->numa_pages_migrated += pages;
2258 if (flags & TNF_MIGRATE_FAIL)
2259 p->numa_faults_locality[2] += pages;
2260
2261 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2262 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2263 p->numa_faults_locality[local] += pages;
2264}
2265
2266static void reset_ptenuma_scan(struct task_struct *p)
2267{
2268
2269
2270
2271
2272
2273
2274
2275
2276 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2277 p->mm->numa_scan_offset = 0;
2278}
2279
2280
2281
2282
2283
2284void task_numa_work(struct callback_head *work)
2285{
2286 unsigned long migrate, next_scan, now = jiffies;
2287 struct task_struct *p = current;
2288 struct mm_struct *mm = p->mm;
2289 u64 runtime = p->se.sum_exec_runtime;
2290 struct vm_area_struct *vma;
2291 unsigned long start, end;
2292 unsigned long nr_pte_updates = 0;
2293 long pages, virtpages;
2294
2295 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2296
2297 work->next = work;
2298
2299
2300
2301
2302
2303
2304
2305
2306 if (p->flags & PF_EXITING)
2307 return;
2308
2309 if (!mm->numa_next_scan) {
2310 mm->numa_next_scan = now +
2311 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2312 }
2313
2314
2315
2316
2317 migrate = mm->numa_next_scan;
2318 if (time_before(now, migrate))
2319 return;
2320
2321 if (p->numa_scan_period == 0) {
2322 p->numa_scan_period_max = task_scan_max(p);
2323 p->numa_scan_period = task_scan_min(p);
2324 }
2325
2326 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2327 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2328 return;
2329
2330
2331
2332
2333
2334 p->node_stamp += 2 * TICK_NSEC;
2335
2336 start = mm->numa_scan_offset;
2337 pages = sysctl_numa_balancing_scan_size;
2338 pages <<= 20 - PAGE_SHIFT;
2339 virtpages = pages * 8;
2340 if (!pages)
2341 return;
2342
2343
2344 down_read(&mm->mmap_sem);
2345 vma = find_vma(mm, start);
2346 if (!vma) {
2347 reset_ptenuma_scan(p);
2348 start = 0;
2349 vma = mm->mmap;
2350 }
2351 for (; vma; vma = vma->vm_next) {
2352 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2353 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2354 continue;
2355 }
2356
2357
2358
2359
2360
2361
2362
2363 if (!vma->vm_mm ||
2364 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2365 continue;
2366
2367
2368
2369
2370
2371 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2372 continue;
2373
2374 do {
2375 start = max(start, vma->vm_start);
2376 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2377 end = min(end, vma->vm_end);
2378 nr_pte_updates = change_prot_numa(vma, start, end);
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388 if (nr_pte_updates)
2389 pages -= (end - start) >> PAGE_SHIFT;
2390 virtpages -= (end - start) >> PAGE_SHIFT;
2391
2392 start = end;
2393 if (pages <= 0 || virtpages <= 0)
2394 goto out;
2395
2396 cond_resched();
2397 } while (end != vma->vm_end);
2398 }
2399
2400out:
2401
2402
2403
2404
2405
2406
2407 if (vma)
2408 mm->numa_scan_offset = start;
2409 else
2410 reset_ptenuma_scan(p);
2411 up_read(&mm->mmap_sem);
2412
2413
2414
2415
2416
2417
2418
2419 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2420 u64 diff = p->se.sum_exec_runtime - runtime;
2421 p->node_stamp += 32 * diff;
2422 }
2423}
2424
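/*
 * Drive the periodic NUMA hinting faults: queue task_numa_work() when the
 * current task has run long enough since the last scan.
 */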
2428void task_tick_numa(struct rq *rq, struct task_struct *curr)
2429{
2430 struct callback_head *work = &curr->numa_work;
2431 u64 period, now;
2432
2433
2434
2435
2436 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2437 return;
2438
2439
2440
2441
2442
2443
2444
2445 now = curr->se.sum_exec_runtime;
2446 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2447
2448 if (now > curr->node_stamp + period) {
2449 if (!curr->node_stamp)
2450 curr->numa_scan_period = task_scan_min(curr);
2451 curr->node_stamp += period;
2452
2453 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2454 init_task_work(work, task_numa_work);
2455 task_work_add(curr, work, true);
2456 }
2457 }
2458}
2459#else
2460static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2461{
2462}
2463
2464static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2465{
2466}
2467
2468static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2469{
2470}
2471#endif
2472
2473static void
2474account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2475{
2476 update_load_add(&cfs_rq->load, se->load.weight);
2477 if (!parent_entity(se))
2478 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2479#ifdef CONFIG_SMP
2480 if (entity_is_task(se)) {
2481 struct rq *rq = rq_of(cfs_rq);
2482
2483 account_numa_enqueue(rq, task_of(se));
2484 list_add(&se->group_node, &rq->cfs_tasks);
2485 }
2486#endif
2487 cfs_rq->nr_running++;
2488}
2489
2490static void
2491account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2492{
2493 update_load_sub(&cfs_rq->load, se->load.weight);
2494 if (!parent_entity(se))
2495 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2496#ifdef CONFIG_SMP
2497 if (entity_is_task(se)) {
2498 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2499 list_del_init(&se->group_node);
2500 }
2501#endif
2502 cfs_rq->nr_running--;
2503}
2504
2505#ifdef CONFIG_FAIR_GROUP_SCHED
2506# ifdef CONFIG_SMP
2507static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2508{
2509 long tg_weight, load, shares;
2510
2511
2512
2513
2514
2515
2516 load = scale_load_down(cfs_rq->load.weight);
2517
2518 tg_weight = atomic_long_read(&tg->load_avg);
2519
2520
2521 tg_weight -= cfs_rq->tg_load_avg_contrib;
2522 tg_weight += load;
2523
2524 shares = (tg->shares * load);
2525 if (tg_weight)
2526 shares /= tg_weight;
2527
2528 if (shares < MIN_SHARES)
2529 shares = MIN_SHARES;
2530 if (shares > tg->shares)
2531 shares = tg->shares;
2532
2533 return shares;
2534}
2535# else
2536static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2537{
2538 return tg->shares;
2539}
2540# endif
2541
2542static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2543 unsigned long weight)
2544{
2545 if (se->on_rq) {
2546
2547 if (cfs_rq->curr == se)
2548 update_curr(cfs_rq);
2549 account_entity_dequeue(cfs_rq, se);
2550 }
2551
2552 update_load_set(&se->load, weight);
2553
2554 if (se->on_rq)
2555 account_entity_enqueue(cfs_rq, se);
2556}
2557
2558static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2559
2560static void update_cfs_shares(struct cfs_rq *cfs_rq)
2561{
2562 struct task_group *tg;
2563 struct sched_entity *se;
2564 long shares;
2565
2566 tg = cfs_rq->tg;
2567 se = tg->se[cpu_of(rq_of(cfs_rq))];
2568 if (!se || throttled_hierarchy(cfs_rq))
2569 return;
2570#ifndef CONFIG_SMP
2571 if (likely(se->load.weight == tg->shares))
2572 return;
2573#endif
2574 shares = calc_cfs_shares(cfs_rq, tg);
2575
2576 reweight_entity(cfs_rq_of(se), se, shares);
2577}
2578#else
2579static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2580{
2581}
2582#endif
2583
2584#ifdef CONFIG_SMP
2585
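/* Precomputed fixed-point multipliers for y^n, n < LOAD_AVG_PERIOD. */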
2586static const u32 runnable_avg_yN_inv[] = {
2587 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2588 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2589 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2590 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2591 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2592 0x85aac367, 0x82cd8698,
2593};
2594
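/* Precomputed \Sum 1024*y^k { 1 <= k <= n }, for n < LOAD_AVG_PERIOD. */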
2599static const u32 runnable_avg_yN_sum[] = {
2600 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2601 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2602 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2603};
2604
2605
2606
2607
2608
2609
2610static const u32 __accumulated_sum_N32[] = {
2611 0, 23371, 35056, 40899, 43820, 45281,
2612 46011, 46376, 46559, 46650, 46696, 46719,
2613};
2614
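/*
 * Approximate val * y^n, where y^LOAD_AVG_PERIOD == 0.5, by splitting n
 * into whole half-life periods (shifts) and a remainder looked up in
 * runnable_avg_yN_inv[].
 */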
2619static __always_inline u64 decay_load(u64 val, u64 n)
2620{
2621 unsigned int local_n;
2622
2623 if (!n)
2624 return val;
2625 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2626 return 0;
2627
2628
2629 local_n = n;
2630
2631
2632
2633
2634
2635
2636
2637
2638 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2639 val >>= local_n / LOAD_AVG_PERIOD;
2640 local_n %= LOAD_AVG_PERIOD;
2641 }
2642
2643 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2644 return val;
2645}
2646
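/*
 * Contribution of n fully elapsed periods to the running sum:
 *
 *	\Sum 1024*y^k  (k = 1..n)
 *
 * built from the precomputed partial sums above.
 */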
2654static u32 __compute_runnable_contrib(u64 n)
2655{
2656 u32 contrib = 0;
2657
2658 if (likely(n <= LOAD_AVG_PERIOD))
2659 return runnable_avg_yN_sum[n];
2660 else if (unlikely(n >= LOAD_AVG_MAX_N))
2661 return LOAD_AVG_MAX;
2662
2663
2664 contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
2665 n %= LOAD_AVG_PERIOD;
2666 contrib = decay_load(contrib, n);
2667 return contrib + runnable_avg_yN_sum[n];
2668}
2669
2670#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2671
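/*
 * Accumulate the load/utilization history of an entity (and optionally its
 * cfs_rq) as a geometric series: every elapsed 1024us period decays prior
 * history by y (y^32 == 1/2) and adds the current period's runnable/running
 * contribution, so load_avg and util_avg track the recent past.
 */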
2700static __always_inline int
2701__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2702 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2703{
2704 u64 delta, scaled_delta, periods;
2705 u32 contrib;
2706 unsigned int delta_w, scaled_delta_w, decayed = 0;
2707 unsigned long scale_freq, scale_cpu;
2708
2709 delta = now - sa->last_update_time;
2710
2711
2712
2713
2714 if ((s64)delta < 0) {
2715 sa->last_update_time = now;
2716 return 0;
2717 }
2718
2719
2720
2721
2722
2723 delta >>= 10;
2724 if (!delta)
2725 return 0;
2726 sa->last_update_time = now;
2727
2728 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2729 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2730
2731
2732 delta_w = sa->period_contrib;
2733 if (delta + delta_w >= 1024) {
2734 decayed = 1;
2735
2736
2737 sa->period_contrib = 0;
2738
2739
2740
2741
2742
2743
2744 delta_w = 1024 - delta_w;
2745 scaled_delta_w = cap_scale(delta_w, scale_freq);
2746 if (weight) {
2747 sa->load_sum += weight * scaled_delta_w;
2748 if (cfs_rq) {
2749 cfs_rq->runnable_load_sum +=
2750 weight * scaled_delta_w;
2751 }
2752 }
2753 if (running)
2754 sa->util_sum += scaled_delta_w * scale_cpu;
2755
2756 delta -= delta_w;
2757
2758
2759 periods = delta / 1024;
2760 delta %= 1024;
2761
2762 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2763 if (cfs_rq) {
2764 cfs_rq->runnable_load_sum =
2765 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2766 }
2767 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2768
2769
2770 contrib = __compute_runnable_contrib(periods);
2771 contrib = cap_scale(contrib, scale_freq);
2772 if (weight) {
2773 sa->load_sum += weight * contrib;
2774 if (cfs_rq)
2775 cfs_rq->runnable_load_sum += weight * contrib;
2776 }
2777 if (running)
2778 sa->util_sum += contrib * scale_cpu;
2779 }
2780
2781
2782 scaled_delta = cap_scale(delta, scale_freq);
2783 if (weight) {
2784 sa->load_sum += weight * scaled_delta;
2785 if (cfs_rq)
2786 cfs_rq->runnable_load_sum += weight * scaled_delta;
2787 }
2788 if (running)
2789 sa->util_sum += scaled_delta * scale_cpu;
2790
2791 sa->period_contrib += delta;
2792
2793 if (decayed) {
2794 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2795 if (cfs_rq) {
2796 cfs_rq->runnable_load_avg =
2797 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2798 }
2799 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2800 }
2801
2802 return decayed;
2803}
2804
2805#ifdef CONFIG_FAIR_GROUP_SCHED
2806
2807
2808
2809
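/*
 * Propagate this cfs_rq's load_avg into its task_group's global sum.  The
 * root group's value is never consumed, so skip it; otherwise only touch
 * the shared atomic when the delta exceeds ~1/64 of the last contribution
 * (or when forced).
 */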
2810static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2811{
2812 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2813
2814
2815
2816
2817 if (cfs_rq->tg == &root_task_group)
2818 return;
2819
2820 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2821 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2822 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2823 }
2824}
2825
2826
2827
2828
2829
2830
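/*
 * When an entity is moved to a different group cfs_rq, age its load up to
 * the previous runqueue's clock and then stamp it with the next one's, so
 * the load is neither decayed twice nor left un-decayed.  The copy/retry
 * loop guards against torn 64-bit reads on 32-bit kernels.
 */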
2831void set_task_rq_fair(struct sched_entity *se,
2832 struct cfs_rq *prev, struct cfs_rq *next)
2833{
2834 if (!sched_feat(ATTACH_AGE_LOAD))
2835 return;
2836
2837
2838
2839
2840
2841
2842
2843
2844 if (se->avg.last_update_time && prev) {
2845 u64 p_last_update_time;
2846 u64 n_last_update_time;
2847
2848#ifndef CONFIG_64BIT
2849 u64 p_last_update_time_copy;
2850 u64 n_last_update_time_copy;
2851
2852 do {
2853 p_last_update_time_copy = prev->load_last_update_time_copy;
2854 n_last_update_time_copy = next->load_last_update_time_copy;
2855
2856 smp_rmb();
2857
2858 p_last_update_time = prev->avg.last_update_time;
2859 n_last_update_time = next->avg.last_update_time;
2860
2861 } while (p_last_update_time != p_last_update_time_copy ||
2862 n_last_update_time != n_last_update_time_copy);
2863#else
2864 p_last_update_time = prev->avg.last_update_time;
2865 n_last_update_time = next->avg.last_update_time;
2866#endif
2867 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2868 &se->avg, 0, 0, NULL);
2869 se->avg.last_update_time = n_last_update_time;
2870 }
2871}
2872#else
2873static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2874#endif
2875
2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2877{
2878 struct rq *rq = rq_of(cfs_rq);
2879 int cpu = cpu_of(rq);
2880
2881 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2882 unsigned long max = rq->cpu_capacity_orig;
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
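		/*
		 * Only the local CPU's root cfs_rq utilization is forwarded
		 * to the cpufreq hook here, hence the check above.
		 */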
2900 cpufreq_update_util(rq_clock(rq),
2901 min(cfs_rq->avg.util_avg, max), max);
2902 }
2903}
2904
2905
2906
2907
2908
2909
2910
2911
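/*
 * Unsigned subtraction clamped at zero on underflow.  The explicit
 * READ_ONCE/WRITE_ONCE pair keeps the intermediate value out of memory so
 * lockless readers never observe a wrapped result.
 */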
2912#define sub_positive(_ptr, _val) do { \
2913 typeof(_ptr) ptr = (_ptr); \
2914 typeof(*ptr) val = (_val); \
2915 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2916 res = var - val; \
2917 if (res > var) \
2918 res = 0; \
2919 WRITE_ONCE(*ptr, res); \
2920} while (0)
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
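/*
 * Fold in any load/utilization queued for removal by departed tasks, then
 * advance the cfs_rq averages to @now.  Returns true when the load changed
 * enough that the task_group contribution should be updated too; cpufreq
 * is notified if @update_freq allows and the utilization moved.
 */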
2939static inline int
2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2941{
2942 struct sched_avg *sa = &cfs_rq->avg;
2943 int decayed, removed_load = 0, removed_util = 0;
2944
2945 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2946 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2947 sub_positive(&sa->load_avg, r);
2948 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
2949 removed_load = 1;
2950 }
2951
2952 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2953 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2954 sub_positive(&sa->util_avg, r);
2955 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
2956 removed_util = 1;
2957 }
2958
2959 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2960 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2961
2962#ifndef CONFIG_64BIT
2963 smp_wmb();
2964 cfs_rq->load_last_update_time_copy = sa->last_update_time;
2965#endif
2966
2967 if (update_freq && (decayed || removed_util))
2968 cfs_rq_util_change(cfs_rq);
2969
2970 return decayed || removed_load;
2971}
2972
2973
2974static inline void update_load_avg(struct sched_entity *se, int update_tg)
2975{
2976 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2977 u64 now = cfs_rq_clock_task(cfs_rq);
2978 struct rq *rq = rq_of(cfs_rq);
2979 int cpu = cpu_of(rq);
2980
2981
2982
2983
2984
2985 __update_load_avg(now, cpu, &se->avg,
2986 se->on_rq * scale_load_down(se->load.weight),
2987 cfs_rq->curr == se, NULL);
2988
2989 if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
2990 update_tg_load_avg(cfs_rq, 0);
2991}
2992
2993
2994
2995
2996
2997
2998
2999
3000
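/*
 * Attach an entity's load and utilization to the cfs_rq averages.  If the
 * entity already carries history (and ATTACH_AGE_LOAD is set), age it to
 * the cfs_rq clock first so stale load is not attached at full value.
 */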
3001static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3002{
3003 if (!sched_feat(ATTACH_AGE_LOAD))
3004 goto skip_aging;
3005
3006
3007
3008
3009
3010
3011
3012 if (se->avg.last_update_time) {
3013 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3014 &se->avg, 0, 0, NULL);
3015
3016
3017
3018
3019
3020 }
3021
3022skip_aging:
3023 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3024 cfs_rq->avg.load_avg += se->avg.load_avg;
3025 cfs_rq->avg.load_sum += se->avg.load_sum;
3026 cfs_rq->avg.util_avg += se->avg.util_avg;
3027 cfs_rq->avg.util_sum += se->avg.util_sum;
3028
3029 cfs_rq_util_change(cfs_rq);
3030}
3031
3032
3033
3034
3035
3036
3037
3038
3039
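/*
 * Inverse of attach_entity_load_avg(): bring the entity up to date with
 * the cfs_rq clock and subtract its contribution from the averages.
 */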
3040static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3041{
3042 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3043 &se->avg, se->on_rq * scale_load_down(se->load.weight),
3044 cfs_rq->curr == se, NULL);
3045
3046 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3047 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3048 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3049 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3050
3051 cfs_rq_util_change(cfs_rq);
3052}
3053
3054
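/*
 * Add se's load to the cfs_rq's runnable average; a freshly migrated
 * entity (no last_update_time) is attached rather than updated.
 */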
3055static inline void
3056enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3057{
3058 struct sched_avg *sa = &se->avg;
3059 u64 now = cfs_rq_clock_task(cfs_rq);
3060 int migrated, decayed;
3061
3062 migrated = !sa->last_update_time;
3063 if (!migrated) {
3064 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3065 se->on_rq * scale_load_down(se->load.weight),
3066 cfs_rq->curr == se, NULL);
3067 }
3068
3069 decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
3070
3071 cfs_rq->runnable_load_avg += sa->load_avg;
3072 cfs_rq->runnable_load_sum += sa->load_sum;
3073
3074 if (migrated)
3075 attach_entity_load_avg(cfs_rq, se);
3076
3077 if (decayed || migrated)
3078 update_tg_load_avg(cfs_rq, 0);
3079}
3080
3081
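/*
 * Remove se's contribution from the runnable average.  The load stays in
 * cfs_rq->avg as blocked load until it decays or is explicitly removed.
 */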
3082static inline void
3083dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3084{
3085 update_load_avg(se, 1);
3086
3087 cfs_rq->runnable_load_avg =
3088 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3089 cfs_rq->runnable_load_sum =
3090 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3091}
3092
3093#ifndef CONFIG_64BIT
3094static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3095{
3096 u64 last_update_time_copy;
3097 u64 last_update_time;
3098
3099 do {
3100 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3101 smp_rmb();
3102 last_update_time = cfs_rq->avg.last_update_time;
3103 } while (last_update_time != last_update_time_copy);
3104
3105 return last_update_time;
3106}
3107#else
3108static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3109{
3110 return cfs_rq->avg.last_update_time;
3111}
3112#endif
3113
3114
3115
3116
3117
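/*
 * Synchronize a departing entity with the cfs_rq clock and queue its load
 * and utilization for removal; the atomic counters are folded back in by
 * update_cfs_rq_load_avg() under the runqueue lock.
 */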
3118void remove_entity_load_avg(struct sched_entity *se)
3119{
3120 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3121 u64 last_update_time;
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133 last_update_time = cfs_rq_last_update_time(cfs_rq);
3134
3135 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3136 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3137 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3138}
3139
3140static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3141{
3142 return cfs_rq->runnable_load_avg;
3143}
3144
3145static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3146{
3147 return cfs_rq->avg.load_avg;
3148}
3149
3150static int idle_balance(struct rq *this_rq);
3151
3152#else
3153
3154static inline int
3155update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3156{
3157 return 0;
3158}
3159
3160static inline void update_load_avg(struct sched_entity *se, int not_used)
3161{
3162 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3163 struct rq *rq = rq_of(cfs_rq);
3164
3165 cpufreq_trigger_update(rq_clock(rq));
3166}
3167
3168static inline void
3169enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3170static inline void
3171dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3172static inline void remove_entity_load_avg(struct sched_entity *se) {}
3173
3174static inline void
3175attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3176static inline void
3177detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3178
3179static inline int idle_balance(struct rq *rq)
3180{
3181 return 0;
3182}
3183
3184#endif
3185
3186static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3187{
3188#ifdef CONFIG_SCHEDSTATS
3189 struct task_struct *tsk = NULL;
3190
3191 if (entity_is_task(se))
3192 tsk = task_of(se);
3193
3194 if (se->statistics.sleep_start) {
3195 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3196
3197 if ((s64)delta < 0)
3198 delta = 0;
3199
3200 if (unlikely(delta > se->statistics.sleep_max))
3201 se->statistics.sleep_max = delta;
3202
3203 se->statistics.sleep_start = 0;
3204 se->statistics.sum_sleep_runtime += delta;
3205
3206 if (tsk) {
3207 account_scheduler_latency(tsk, delta >> 10, 1);
3208 trace_sched_stat_sleep(tsk, delta);
3209 }
3210 }
3211 if (se->statistics.block_start) {
3212 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3213
3214 if ((s64)delta < 0)
3215 delta = 0;
3216
3217 if (unlikely(delta > se->statistics.block_max))
3218 se->statistics.block_max = delta;
3219
3220 se->statistics.block_start = 0;
3221 se->statistics.sum_sleep_runtime += delta;
3222
3223 if (tsk) {
3224 if (tsk->in_iowait) {
3225 se->statistics.iowait_sum += delta;
3226 se->statistics.iowait_count++;
3227 trace_sched_stat_iowait(tsk, delta);
3228 }
3229
3230 trace_sched_stat_blocked(tsk, delta);
3231
3232
3233
3234
3235
3236
3237 if (unlikely(prof_on == SLEEP_PROFILING)) {
3238 profile_hits(SLEEP_PROFILING,
3239 (void *)get_wchan(tsk),
3240 delta >> 20);
3241 }
3242 account_scheduler_latency(tsk, delta >> 10, 0);
3243 }
3244 }
3245#endif
3246}
3247
3248static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3249{
3250#ifdef CONFIG_SCHED_DEBUG
3251 s64 d = se->vruntime - cfs_rq->min_vruntime;
3252
3253 if (d < 0)
3254 d = -d;
3255
3256 if (d > 3*sysctl_sched_latency)
3257 schedstat_inc(cfs_rq, nr_spread_over);
3258#endif
3259}
3260
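/*
 * Place an entity on the virtual timeline: newly forked entities (with
 * START_DEBIT) start one vslice after min_vruntime, entities waking from
 * sleep get up to half a scheduling latency of credit (a full latency
 * without GENTLE_FAIR_SLEEPERS), and nobody is ever moved backwards from
 * the vruntime they already hold.
 */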
3261static void
3262place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3263{
3264 u64 vruntime = cfs_rq->min_vruntime;
3265
3266
3267
3268
3269
3270
3271
3272 if (initial && sched_feat(START_DEBIT))
3273 vruntime += sched_vslice(cfs_rq, se);
3274
3275
3276 if (!initial) {
3277 unsigned long thresh = sysctl_sched_latency;
3278
3279
3280
3281
3282
3283 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3284 thresh >>= 1;
3285
3286 vruntime -= thresh;
3287 }
3288
3289
3290 se->vruntime = max_vruntime(se->vruntime, vruntime);
3291}
3292
3293static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3294
3295static inline void check_schedstat_required(void)
3296{
3297#ifdef CONFIG_SCHEDSTATS
3298 if (schedstat_enabled())
3299 return;
3300
3301
3302 if (trace_sched_stat_wait_enabled() ||
3303 trace_sched_stat_sleep_enabled() ||
3304 trace_sched_stat_iowait_enabled() ||
3305 trace_sched_stat_blocked_enabled() ||
3306 trace_sched_stat_runtime_enabled()) {
3307 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3308 "stat_blocked and stat_runtime require the "
3309 "kernel parameter schedstats=enabled or "
3310 "kernel.sched_schedstats=1\n");
3311 }
3312#endif
3313}
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
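/*
 * Enqueue an entity.  Entities that were not dequeued for sleeping (or
 * that migrated in) carry a vruntime relative to their old runqueue, so
 * renormalize against this cfs_rq's min_vruntime: the currently running
 * entity before update_curr(), everyone else after it.  Wakeups are
 * additionally re-placed on the timeline by place_entity().
 */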
3346static void
3347enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3348{
3349 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3350 bool curr = cfs_rq->curr == se;
3351
3352
3353
3354
3355
3356 if (renorm && curr)
3357 se->vruntime += cfs_rq->min_vruntime;
3358
3359 update_curr(cfs_rq);
3360
3361
3362
3363
3364
3365
3366
3367 if (renorm && !curr)
3368 se->vruntime += cfs_rq->min_vruntime;
3369
3370 enqueue_entity_load_avg(cfs_rq, se);
3371 account_entity_enqueue(cfs_rq, se);
3372 update_cfs_shares(cfs_rq);
3373
3374 if (flags & ENQUEUE_WAKEUP) {
3375 place_entity(cfs_rq, se, 0);
3376 if (schedstat_enabled())
3377 enqueue_sleeper(cfs_rq, se);
3378 }
3379
3380 check_schedstat_required();
3381 if (schedstat_enabled()) {
3382 update_stats_enqueue(cfs_rq, se);
3383 check_spread(cfs_rq, se);
3384 }
3385 if (!curr)
3386 __enqueue_entity(cfs_rq, se);
3387 se->on_rq = 1;
3388
3389 if (cfs_rq->nr_running == 1) {
3390 list_add_leaf_cfs_rq(cfs_rq);
3391 check_enqueue_throttle(cfs_rq);
3392 }
3393}
3394
3395static void __clear_buddies_last(struct sched_entity *se)
3396{
3397 for_each_sched_entity(se) {
3398 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3399 if (cfs_rq->last != se)
3400 break;
3401
3402 cfs_rq->last = NULL;
3403 }
3404}
3405
3406static void __clear_buddies_next(struct sched_entity *se)
3407{
3408 for_each_sched_entity(se) {
3409 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3410 if (cfs_rq->next != se)
3411 break;
3412
3413 cfs_rq->next = NULL;
3414 }
3415}
3416
3417static void __clear_buddies_skip(struct sched_entity *se)
3418{
3419 for_each_sched_entity(se) {
3420 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3421 if (cfs_rq->skip != se)
3422 break;
3423
3424 cfs_rq->skip = NULL;
3425 }
3426}
3427
3428static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3429{
3430 if (cfs_rq->last == se)
3431 __clear_buddies_last(se);
3432
3433 if (cfs_rq->next == se)
3434 __clear_buddies_next(se);
3435
3436 if (cfs_rq->skip == se)
3437 __clear_buddies_skip(se);
3438}
3439
3440static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3441
3442static void
3443dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3444{
3445
3446
3447
3448 update_curr(cfs_rq);
3449 dequeue_entity_load_avg(cfs_rq, se);
3450
3451 if (schedstat_enabled())
3452 update_stats_dequeue(cfs_rq, se, flags);
3453
3454 clear_buddies(cfs_rq, se);
3455
3456 if (se != cfs_rq->curr)
3457 __dequeue_entity(cfs_rq, se);
3458 se->on_rq = 0;
3459 account_entity_dequeue(cfs_rq, se);
3460
3461
3462
3463
3464
3465
3466 if (!(flags & DEQUEUE_SLEEP))
3467 se->vruntime -= cfs_rq->min_vruntime;
3468
3469
3470 return_cfs_rq_runtime(cfs_rq);
3471
3472 update_min_vruntime(cfs_rq);
3473 update_cfs_shares(cfs_rq);
3474}
3475
3476
3477
3478
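/*
 * Tick-driven preemption: resched when current has used up its fair
 * slice, or (once past min_granularity) when the leftmost entity trails
 * current's vruntime by more than a slice.
 */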
3479static void
3480check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3481{
3482 unsigned long ideal_runtime, delta_exec;
3483 struct sched_entity *se;
3484 s64 delta;
3485
3486 ideal_runtime = sched_slice(cfs_rq, curr);
3487 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3488 if (delta_exec > ideal_runtime) {
3489 resched_curr(rq_of(cfs_rq));
3490
3491
3492
3493
3494 clear_buddies(cfs_rq, curr);
3495 return;
3496 }
3497
3498
3499
3500
3501
3502
3503 if (delta_exec < sysctl_sched_min_granularity)
3504 return;
3505
3506 se = __pick_first_entity(cfs_rq);
3507 delta = curr->vruntime - se->vruntime;
3508
3509 if (delta < 0)
3510 return;
3511
3512 if (delta > ideal_runtime)
3513 resched_curr(rq_of(cfs_rq));
3514}
3515
3516static void
3517set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3518{
3519
3520 if (se->on_rq) {
3521
3522
3523
3524
3525
3526 if (schedstat_enabled())
3527 update_stats_wait_end(cfs_rq, se);
3528 __dequeue_entity(cfs_rq, se);
3529 update_load_avg(se, 1);
3530 }
3531
3532 update_stats_curr_start(cfs_rq, se);
3533 cfs_rq->curr = se;
3534#ifdef CONFIG_SCHEDSTATS
3535
3536
3537
3538
3539
3540 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3541 se->statistics.slice_max = max(se->statistics.slice_max,
3542 se->sum_exec_runtime - se->prev_sum_exec_runtime);
3543 }
3544#endif
3545 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3546}
3547
3548static int
3549wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3550
3551
3552
3553
3554
3555
3556
3557
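/*
 * Pick the next entity to run, in order of preference: keep things fair
 * (leftmost entity), honour the "next" and "last" buddies when that does
 * not create too much unfairness, and avoid the "skip" entity if anything
 * else is runnable.
 */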
3558static struct sched_entity *
3559pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3560{
3561 struct sched_entity *left = __pick_first_entity(cfs_rq);
3562 struct sched_entity *se;
3563
3564
3565
3566
3567
3568 if (!left || (curr && entity_before(curr, left)))
3569 left = curr;
3570
3571 se = left;
3572
3573
3574
3575
3576
3577 if (cfs_rq->skip == se) {
3578 struct sched_entity *second;
3579
3580 if (se == curr) {
3581 second = __pick_first_entity(cfs_rq);
3582 } else {
3583 second = __pick_next_entity(se);
3584 if (!second || (curr && entity_before(curr, second)))
3585 second = curr;
3586 }
3587
3588 if (second && wakeup_preempt_entity(second, left) < 1)
3589 se = second;
3590 }
3591
3592
3593
3594
3595 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3596 se = cfs_rq->last;
3597
3598
3599
3600
3601 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3602 se = cfs_rq->next;
3603
3604 clear_buddies(cfs_rq, se);
3605
3606 return se;
3607}
3608
3609static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3610
3611static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3612{
3613
3614
3615
3616
3617 if (prev->on_rq)
3618 update_curr(cfs_rq);
3619
3620
3621 check_cfs_rq_runtime(cfs_rq);
3622
3623 if (schedstat_enabled()) {
3624 check_spread(cfs_rq, prev);
3625 if (prev->on_rq)
3626 update_stats_wait_start(cfs_rq, prev);
3627 }
3628
3629 if (prev->on_rq) {
3630
3631 __enqueue_entity(cfs_rq, prev);
3632
3633 update_load_avg(prev, 0);
3634 }
3635 cfs_rq->curr = NULL;
3636}
3637
3638static void
3639entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3640{
3641
3642
3643
3644 update_curr(cfs_rq);
3645
3646
3647
3648
3649 update_load_avg(curr, 1);
3650 update_cfs_shares(cfs_rq);
3651
3652#ifdef CONFIG_SCHED_HRTICK
3653
3654
3655
3656
3657 if (queued) {
3658 resched_curr(rq_of(cfs_rq));
3659 return;
3660 }
3661
3662
3663
3664 if (!sched_feat(DOUBLE_TICK) &&
3665 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3666 return;
3667#endif
3668
3669 if (cfs_rq->nr_running > 1)
3670 check_preempt_tick(cfs_rq, curr);
3671}
3672
3673
3674
3675
3676
3677
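/**************************************************
 * CFS bandwidth control machinery
 */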
3678#ifdef CONFIG_CFS_BANDWIDTH
3679
3680#ifdef HAVE_JUMP_LABEL
3681static struct static_key __cfs_bandwidth_used;
3682
3683static inline bool cfs_bandwidth_used(void)
3684{
3685 return static_key_false(&__cfs_bandwidth_used);
3686}
3687
3688void cfs_bandwidth_usage_inc(void)
3689{
3690 static_key_slow_inc(&__cfs_bandwidth_used);
3691}
3692
3693void cfs_bandwidth_usage_dec(void)
3694{
3695 static_key_slow_dec(&__cfs_bandwidth_used);
3696}
3697#else
3698static bool cfs_bandwidth_used(void)
3699{
3700 return true;
3701}
3702
3703void cfs_bandwidth_usage_inc(void) {}
3704void cfs_bandwidth_usage_dec(void) {}
3705#endif
3706
3707
3708
3709
3710
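/* Default bandwidth refill period for a task group: 100ms, in nanoseconds. */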
3711static inline u64 default_cfs_period(void)
3712{
3713 return 100000000ULL;
3714}
3715
3716static inline u64 sched_cfs_bandwidth_slice(void)
3717{
3718 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3719}
3720
3721
3722
3723
3724
3725
3726
3727
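/*
 * Replenish runtime to the group's full quota and push the expiry one
 * whole period into the future.  Expects cfs_b->lock to be held.
 */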
3728void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3729{
3730 u64 now;
3731
3732 if (cfs_b->quota == RUNTIME_INF)
3733 return;
3734
3735 now = sched_clock_cpu(smp_processor_id());
3736 cfs_b->runtime = cfs_b->quota;
3737 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3738}
3739
3740static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3741{
3742 return &tg->cfs_bandwidth;
3743}
3744
3745
3746static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3747{
3748 if (unlikely(cfs_rq->throttle_count))
3749 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
3750
3751 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3752}
3753
3754
3755static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3756{
3757 struct task_group *tg = cfs_rq->tg;
3758 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3759 u64 amount = 0, min_amount, expires;
3760
3761
3762 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3763
3764 raw_spin_lock(&cfs_b->lock);
3765 if (cfs_b->quota == RUNTIME_INF)
3766 amount = min_amount;
3767 else {
3768 start_cfs_bandwidth(cfs_b);
3769
3770 if (cfs_b->runtime > 0) {
3771 amount = min(cfs_b->runtime, min_amount);
3772 cfs_b->runtime -= amount;
3773 cfs_b->idle = 0;
3774 }
3775 }
3776 expires = cfs_b->runtime_expires;
3777 raw_spin_unlock(&cfs_b->lock);
3778
3779 cfs_rq->runtime_remaining += amount;
3780
3781
3782
3783
3784
3785 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3786 cfs_rq->runtime_expires = expires;
3787
3788 return cfs_rq->runtime_remaining > 0;
3789}
3790
3791
3792
3793
3794
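/*
 * Our cached expiry has passed.  If the global expiry has since moved on,
 * assume sched_clock drift and extend the local deadline by a tick;
 * otherwise the period really is over, so zero the remaining runtime and
 * force it to be re-acquired.
 */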
3795static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3796{
3797 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3798
3799
3800 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3801 return;
3802
3803 if (cfs_rq->runtime_remaining < 0)
3804 return;
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3818
3819 cfs_rq->runtime_expires += TICK_NSEC;
3820 } else {
3821
3822 cfs_rq->runtime_remaining = 0;
3823 }
3824}
3825
3826static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3827{
3828
3829 cfs_rq->runtime_remaining -= delta_exec;
3830 expire_cfs_rq_runtime(cfs_rq);
3831
3832 if (likely(cfs_rq->runtime_remaining > 0))
3833 return;
3834
3835
3836
3837
3838
3839 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3840 resched_curr(rq_of(cfs_rq));
3841}
3842
3843static __always_inline
3844void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3845{
3846 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3847 return;
3848
3849 __account_cfs_rq_runtime(cfs_rq, delta_exec);
3850}
3851
3852static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3853{
3854 return cfs_bandwidth_used() && cfs_rq->throttled;
3855}
3856
3857
3858static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3859{
3860 return cfs_bandwidth_used() && cfs_rq->throttle_count;
3861}
3862
3863
3864
3865
3866
3867
3868static inline int throttled_lb_pair(struct task_group *tg,
3869 int src_cpu, int dest_cpu)
3870{
3871 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3872
3873 src_cfs_rq = tg->cfs_rq[src_cpu];
3874 dest_cfs_rq = tg->cfs_rq[dest_cpu];
3875
3876 return throttled_hierarchy(src_cfs_rq) ||
3877 throttled_hierarchy(dest_cfs_rq);
3878}
3879
3880
3881static int tg_unthrottle_up(struct task_group *tg, void *data)
3882{
3883 struct rq *rq = data;
3884 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3885
3886 cfs_rq->throttle_count--;
3887 if (!cfs_rq->throttle_count) {
3888
3889 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3890 cfs_rq->throttled_clock_task;
3891 }
3892
3893 return 0;
3894}
3895
3896static int tg_throttle_down(struct task_group *tg, void *data)
3897{
3898 struct rq *rq = data;
3899 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3900
3901
3902 if (!cfs_rq->throttle_count)
3903 cfs_rq->throttled_clock_task = rq_clock_task(rq);
3904 cfs_rq->throttle_count++;
3905
3906 return 0;
3907}
3908
3909static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3910{
3911 struct rq *rq = rq_of(cfs_rq);
3912 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3913 struct sched_entity *se;
3914 long task_delta, dequeue = 1;
3915 bool empty;
3916
3917 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3918
3919
3920 rcu_read_lock();
3921 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3922 rcu_read_unlock();
3923
3924 task_delta = cfs_rq->h_nr_running;
3925 for_each_sched_entity(se) {
3926 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3927
3928 if (!se->on_rq)
3929 break;
3930
3931 if (dequeue)
3932 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3933 qcfs_rq->h_nr_running -= task_delta;
3934
3935 if (qcfs_rq->load.weight)
3936 dequeue = 0;
3937 }
3938
3939 if (!se)
3940 sub_nr_running(rq, task_delta);
3941
3942 cfs_rq->throttled = 1;
3943 cfs_rq->throttled_clock = rq_clock(rq);
3944 raw_spin_lock(&cfs_b->lock);
3945 empty = list_empty(&cfs_b->throttled_cfs_rq);
3946
3947
3948
3949
3950
3951 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3952
3953
3954
3955
3956
3957 if (empty)
3958 start_cfs_bandwidth(cfs_b);
3959
3960 raw_spin_unlock(&cfs_b->lock);
3961}
3962
3963void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3964{
3965 struct rq *rq = rq_of(cfs_rq);
3966 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3967 struct sched_entity *se;
3968 int enqueue = 1;
3969 long task_delta;
3970
3971 se = cfs_rq->tg->se[cpu_of(rq)];
3972
3973 cfs_rq->throttled = 0;
3974
3975 update_rq_clock(rq);
3976
3977 raw_spin_lock(&cfs_b->lock);
3978 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3979 list_del_rcu(&cfs_rq->throttled_list);
3980 raw_spin_unlock(&cfs_b->lock);
3981
3982
3983 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3984
3985 if (!cfs_rq->load.weight)
3986 return;
3987
3988 task_delta = cfs_rq->h_nr_running;
3989 for_each_sched_entity(se) {
3990 if (se->on_rq)
3991 enqueue = 0;
3992
3993 cfs_rq = cfs_rq_of(se);
3994 if (enqueue)
3995 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3996 cfs_rq->h_nr_running += task_delta;
3997
3998 if (cfs_rq_throttled(cfs_rq))
3999 break;
4000 }
4001
4002 if (!se)
4003 add_nr_running(rq, task_delta);
4004
4005
4006 if (rq->curr == rq->idle && rq->cfs.nr_running)
4007 resched_curr(rq);
4008}
4009
4010static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4011 u64 remaining, u64 expires)
4012{
4013 struct cfs_rq *cfs_rq;
4014 u64 runtime;
4015 u64 starting_runtime = remaining;
4016
4017 rcu_read_lock();
4018 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4019 throttled_list) {
4020 struct rq *rq = rq_of(cfs_rq);
4021
4022 raw_spin_lock(&rq->lock);
4023 if (!cfs_rq_throttled(cfs_rq))
4024 goto next;
4025
4026 runtime = -cfs_rq->runtime_remaining + 1;
4027 if (runtime > remaining)
4028 runtime = remaining;
4029 remaining -= runtime;
4030
4031 cfs_rq->runtime_remaining += runtime;
4032 cfs_rq->runtime_expires = expires;
4033
4034
4035 if (cfs_rq->runtime_remaining > 0)
4036 unthrottle_cfs_rq(cfs_rq);
4037
4038next:
4039 raw_spin_unlock(&rq->lock);
4040
4041 if (!remaining)
4042 break;
4043 }
4044 rcu_read_unlock();
4045
4046 return starting_runtime - remaining;
4047}
4048
4049
4050
4051
4052
4053
4054
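/*
 * Period timer work: refill the group's runtime and hand it out to any
 * throttled cfs_rqs.  Returns 1 (deactivate the timer) when bandwidth
 * limiting is off (quota == RUNTIME_INF) or there was no activity in the
 * previous period, tracked via cfs_b->idle.
 */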
4055static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4056{
4057 u64 runtime, runtime_expires;
4058 int throttled;
4059
4060
4061 if (cfs_b->quota == RUNTIME_INF)
4062 goto out_deactivate;
4063
4064 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4065 cfs_b->nr_periods += overrun;
4066
4067
4068
4069
4070
4071 if (cfs_b->idle && !throttled)
4072 goto out_deactivate;
4073
4074 __refill_cfs_bandwidth_runtime(cfs_b);
4075
4076 if (!throttled) {
4077
4078 cfs_b->idle = 1;
4079 return 0;
4080 }
4081
4082
4083 cfs_b->nr_throttled += overrun;
4084
4085 runtime_expires = cfs_b->runtime_expires;
4086
4087
4088
4089
4090
4091
4092
4093
4094 while (throttled && cfs_b->runtime > 0) {
4095 runtime = cfs_b->runtime;
4096 raw_spin_unlock(&cfs_b->lock);
4097
4098 runtime = distribute_cfs_runtime(cfs_b, runtime,
4099 runtime_expires);
4100 raw_spin_lock(&cfs_b->lock);
4101
4102 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4103
4104 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4105 }
4106
4107
4108
4109
4110
4111
4112
4113 cfs_b->idle = 0;
4114
4115 return 0;
4116
4117out_deactivate:
4118 return 1;
4119}
4120
4121
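/*
 * Slack handling: a cfs_rq keeps at least min_cfs_rq_runtime for itself,
 * slack is only worth redistributing while at least
 * min_bandwidth_expiration is left in the period, and returned quota is
 * batched for cfs_bandwidth_slack_period before the slack timer fires.
 */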
4122static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4123
4124static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4125
4126static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4127
4128
4129
4130
4131
4132
4133
4134
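/*
 * Is the period timer already running, or due to fire within @min_expire?
 * Used to skip slack distribution when a regular refresh is imminent.
 */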
4135static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4136{
4137 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4138 u64 remaining;
4139
4140
4141 if (hrtimer_callback_running(refresh_timer))
4142 return 1;
4143
4144
4145 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4146 if (remaining < min_expire)
4147 return 1;
4148
4149 return 0;
4150}
4151
4152static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4153{
4154 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4155
4156
4157 if (runtime_refresh_within(cfs_b, min_left))
4158 return;
4159
4160 hrtimer_start(&cfs_b->slack_timer,
4161 ns_to_ktime(cfs_bandwidth_slack_period),
4162 HRTIMER_MODE_REL);
4163}
4164
4165
4166static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4167{
4168 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4169 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4170
4171 if (slack_runtime <= 0)
4172 return;
4173
4174 raw_spin_lock(&cfs_b->lock);
4175 if (cfs_b->quota != RUNTIME_INF &&
4176 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4177 cfs_b->runtime += slack_runtime;
4178
4179
4180 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4181 !list_empty(&cfs_b->throttled_cfs_rq))
4182 start_cfs_slack_bandwidth(cfs_b);
4183 }
4184 raw_spin_unlock(&cfs_b->lock);
4185
4186
4187 cfs_rq->runtime_remaining -= slack_runtime;
4188}
4189
4190static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4191{
4192 if (!cfs_bandwidth_used())
4193 return;
4194
4195 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4196 return;
4197
4198 __return_cfs_rq_runtime(cfs_rq);
4199}
4200
4201
4202
4203
4204
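/*
 * Slack timer: redistribute quota that idle cfs_rqs handed back to any
 * still-throttled ones, unless the period timer is about to refresh
 * everything anyway.
 */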
4205static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4206{
4207 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4208 u64 expires;
4209
4210
4211 raw_spin_lock(&cfs_b->lock);
4212 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4213 raw_spin_unlock(&cfs_b->lock);
4214 return;
4215 }
4216
4217 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4218 runtime = cfs_b->runtime;
4219
4220 expires = cfs_b->runtime_expires;
4221 raw_spin_unlock(&cfs_b->lock);
4222
4223 if (!runtime)
4224 return;
4225
4226 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4227
4228 raw_spin_lock(&cfs_b->lock);
4229 if (expires == cfs_b->runtime_expires)
4230 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4231 raw_spin_unlock(&cfs_b->lock);
4232}
4233
4234
4235
4236
4237
4238
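/*
 * When a cfs_rq gets its first entity queued, re-check its runtime so a
 * group waking up with an exhausted quota is throttled immediately rather
 * than running until the next update_curr().
 */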
4239static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4240{
4241 if (!cfs_bandwidth_used())
4242 return;
4243
4244
4245 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4246 return;
4247
4248
4249 if (cfs_rq_throttled(cfs_rq))
4250 return;
4251
4252
4253 account_cfs_rq_runtime(cfs_rq, 0);
4254 if (cfs_rq->runtime_remaining <= 0)
4255 throttle_cfs_rq(cfs_rq);
4256}
4257
4258static void sync_throttle(struct task_group *tg, int cpu)
4259{
4260 struct cfs_rq *pcfs_rq, *cfs_rq;
4261
4262 if (!cfs_bandwidth_used())
4263 return;
4264
4265 if (!tg->parent)
4266 return;
4267
4268 cfs_rq = tg->cfs_rq[cpu];
4269 pcfs_rq = tg->parent->cfs_rq[cpu];
4270
4271 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4272 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4273}
4274
4275
4276static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4277{
4278 if (!cfs_bandwidth_used())
4279 return false;
4280
4281 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4282 return false;
4283
4284
4285
4286
4287
4288 if (cfs_rq_throttled(cfs_rq))
4289 return true;
4290
4291 throttle_cfs_rq(cfs_rq);
4292 return true;
4293}
4294
4295static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4296{
4297 struct cfs_bandwidth *cfs_b =
4298 container_of(timer, struct cfs_bandwidth, slack_timer);
4299
4300 do_sched_cfs_slack_timer(cfs_b);
4301
4302 return HRTIMER_NORESTART;
4303}
4304
4305static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4306{
4307 struct cfs_bandwidth *cfs_b =
4308 container_of(timer, struct cfs_bandwidth, period_timer);
4309 int overrun;
4310 int idle = 0;
4311
4312 raw_spin_lock(&cfs_b->lock);
4313 for (;;) {
4314 overrun = hrtimer_forward_now(timer, cfs_b->period);
4315 if (!overrun)
4316 break;
4317
4318 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4319 }
4320 if (idle)
4321 cfs_b->period_active = 0;
4322 raw_spin_unlock(&cfs_b->lock);
4323
4324 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4325}
4326
4327void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4328{
4329 raw_spin_lock_init(&cfs_b->lock);
4330 cfs_b->runtime = 0;
4331 cfs_b->quota = RUNTIME_INF;
4332 cfs_b->period = ns_to_ktime(default_cfs_period());
4333
4334 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4335 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4336 cfs_b->period_timer.function = sched_cfs_period_timer;
4337 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4338 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4339}
4340
4341static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4342{
4343 cfs_rq->runtime_enabled = 0;
4344 INIT_LIST_HEAD(&cfs_rq->throttled_list);
4345}
4346
4347void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4348{
4349 lockdep_assert_held(&cfs_b->lock);
4350
4351 if (!cfs_b->period_active) {
4352 cfs_b->period_active = 1;
4353 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4354 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4355 }
4356}
4357
4358static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4359{
4360
4361 if (!cfs_b->throttled_cfs_rq.next)
4362 return;
4363
4364 hrtimer_cancel(&cfs_b->period_timer);
4365 hrtimer_cancel(&cfs_b->slack_timer);
4366}
4367
4368static void __maybe_unused update_runtime_enabled(struct rq *rq)
4369{
4370 struct cfs_rq *cfs_rq;
4371
4372 for_each_leaf_cfs_rq(rq, cfs_rq) {
4373 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4374
4375 raw_spin_lock(&cfs_b->lock);
4376 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4377 raw_spin_unlock(&cfs_b->lock);
4378 }
4379}
4380
4381static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4382{
4383 struct cfs_rq *cfs_rq;
4384
4385 for_each_leaf_cfs_rq(rq, cfs_rq) {
4386 if (!cfs_rq->runtime_enabled)
4387 continue;
4388
4389
4390
4391
4392
4393 cfs_rq->runtime_remaining = 1;
4394
4395
4396
4397
4398 cfs_rq->runtime_enabled = 0;
4399
4400 if (cfs_rq_throttled(cfs_rq))
4401 unthrottle_cfs_rq(cfs_rq);
4402 }
4403}
4404
4405#else
4406static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4407{
4408 return rq_clock_task(rq_of(cfs_rq));
4409}
4410
4411static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4412static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4413static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4414static inline void sync_throttle(struct task_group *tg, int cpu) {}
4415static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4416
4417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4418{
4419 return 0;
4420}
4421
4422static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4423{
4424 return 0;
4425}
4426
4427static inline int throttled_lb_pair(struct task_group *tg,
4428 int src_cpu, int dest_cpu)
4429{
4430 return 0;
4431}
4432
4433void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4434
4435#ifdef CONFIG_FAIR_GROUP_SCHED
4436static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4437#endif
4438
4439static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4440{
4441 return NULL;
4442}
4443static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4444static inline void update_runtime_enabled(struct rq *rq) {}
4445static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4446
4447#endif
4448
4449
4450
4451
4452
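/**************************************************
 * CFS operations on tasks:
 */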
4453#ifdef CONFIG_SCHED_HRTICK
4454static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4455{
4456 struct sched_entity *se = &p->se;
4457 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4458
4459 WARN_ON(task_rq(p) != rq);
4460
4461 if (cfs_rq->nr_running > 1) {
4462 u64 slice = sched_slice(cfs_rq, se);
4463 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4464 s64 delta = slice - ran;
4465
4466 if (delta < 0) {
4467 if (rq->curr == p)
4468 resched_curr(rq);
4469 return;
4470 }
4471 hrtick_start(rq, delta);
4472 }
4473}
4474
4475
4476
4477
4478
4479
4480static void hrtick_update(struct rq *rq)
4481{
4482 struct task_struct *curr = rq->curr;
4483
4484 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4485 return;
4486
4487 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4488 hrtick_start_fair(rq, curr);
4489}
4490#else
4491static inline void
4492hrtick_start_fair(struct rq *rq, struct task_struct *p)
4493{
4494}
4495
4496static inline void hrtick_update(struct rq *rq)
4497{
4498}
4499#endif
4500
4501
4502
4503
4504
4505
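/*
 * The enqueue_task method is called before nr_running is increased.
 * Enqueue the task's entity (and any parents not already queued) and
 * update the hierarchy's load and shares.
 */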
4506static void
4507enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4508{
4509 struct cfs_rq *cfs_rq;
4510 struct sched_entity *se = &p->se;
4511
4512 for_each_sched_entity(se) {
4513 if (se->on_rq)
4514 break;
4515 cfs_rq = cfs_rq_of(se);
4516 enqueue_entity(cfs_rq, se, flags);
4517
4518
4519
4520
4521
4522
4523
4524 if (cfs_rq_throttled(cfs_rq))
4525 break;
4526 cfs_rq->h_nr_running++;
4527
4528 flags = ENQUEUE_WAKEUP;
4529 }
4530
4531 for_each_sched_entity(se) {
4532 cfs_rq = cfs_rq_of(se);
4533 cfs_rq->h_nr_running++;
4534
4535 if (cfs_rq_throttled(cfs_rq))
4536 break;
4537
4538 update_load_avg(se, 1);
4539 update_cfs_shares(cfs_rq);
4540 }
4541
4542 if (!se)
4543 add_nr_running(rq, 1);
4544
4545 hrtick_update(rq);
4546}
4547
4548static void set_next_buddy(struct sched_entity *se);
4549
4550
4551
4552
4553
4554
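/*
 * The dequeue_task method is called before nr_running is decreased.
 * Dequeue the entity and, where a parent still has other children queued,
 * stop there and just refresh the upper levels' load and shares.
 */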
4555static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4556{
4557 struct cfs_rq *cfs_rq;
4558 struct sched_entity *se = &p->se;
4559 int task_sleep = flags & DEQUEUE_SLEEP;
4560
4561 for_each_sched_entity(se) {
4562 cfs_rq = cfs_rq_of(se);
4563 dequeue_entity(cfs_rq, se, flags);
4564
4565
4566
4567
4568
4569
4570
4571 if (cfs_rq_throttled(cfs_rq))
4572 break;
4573 cfs_rq->h_nr_running--;
4574
4575
4576 if (cfs_rq->load.weight) {
4577
4578 se = parent_entity(se);
4579
4580
4581
4582
4583 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4584 set_next_buddy(se);
4585 break;
4586 }
4587 flags |= DEQUEUE_SLEEP;
4588 }
4589
4590 for_each_sched_entity(se) {
4591 cfs_rq = cfs_rq_of(se);
4592 cfs_rq->h_nr_running--;
4593
4594 if (cfs_rq_throttled(cfs_rq))
4595 break;
4596
4597 update_load_avg(se, 1);
4598 update_cfs_shares(cfs_rq);
4599 }
4600
4601 if (!se)
4602 sub_nr_running(rq, 1);
4603
4604 hrtick_update(rq);
4605}
4606
4607#ifdef CONFIG_SMP
4608#ifdef CONFIG_NO_HZ_COMMON
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
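/*
 * Catch-up decay for cpu_load[] after NO_HZ idle: index i of cpu_load[]
 * decays by (2^i - 1)/2^i per tick, so degrade_zero_ticks[i] is the point
 * past which the stale value is treated as zero, and degrade_factor[i][j]
 * holds ((2^i - 1)/2^i)^(2^j) scaled by 2^DEGRADE_SHIFT, letting
 * decay_load_missed() apply many missed ticks by square-and-multiply.
 */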
4635#define DEGRADE_SHIFT 7
4636
4637static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4638static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4639 { 0, 0, 0, 0, 0, 0, 0, 0 },
4640 { 64, 32, 8, 0, 0, 0, 0, 0 },
4641 { 96, 72, 40, 12, 1, 0, 0, 0 },
4642 { 112, 98, 75, 43, 15, 1, 0, 0 },
4643 { 120, 112, 98, 76, 45, 16, 2, 0 }
4644};
4645
4646
4647
4648
4649
4650
4651static unsigned long
4652decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4653{
4654 int j = 0;
4655
4656 if (!missed_updates)
4657 return load;
4658
4659 if (missed_updates >= degrade_zero_ticks[idx])
4660 return 0;
4661
4662 if (idx == 1)
4663 return load >> missed_updates;
4664
4665 while (missed_updates) {
4666 if (missed_updates % 2)
4667 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4668
4669 missed_updates >>= 1;
4670 j++;
4671 }
4672 return load;
4673}
4674#endif
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
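/*
 * Fold a new load sample into the cpu_load[] history.  Index 0 is the raw
 * sample; each higher index i follows
 *
 *   load[i] = (load[i] * (2^i - 1) + new) / 2^i
 *
 * i.e. a progressively longer-term average (rounded up when rising, so
 * fresh load shows through quickly).  Under NO_HZ, @pending_updates missed
 * ticks are first decayed out of the old values, re-adding the tickless
 * baseline so a steadily loaded CPU is not over-decayed.
 */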
4711static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
4712 unsigned long pending_updates)
4713{
4714 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
4715 int i, scale;
4716
4717 this_rq->nr_load_updates++;
4718
4719
4720 this_rq->cpu_load[0] = this_load;
4721 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4722 unsigned long old_load, new_load;
4723
4724
4725
4726 old_load = this_rq->cpu_load[i];
4727#ifdef CONFIG_NO_HZ_COMMON
4728 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4729 if (tickless_load) {
4730 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
4731
4732
4733
4734
4735
4736 old_load += tickless_load;
4737 }
4738#endif
4739 new_load = this_load;
4740
4741
4742
4743
4744
4745 if (new_load > old_load)
4746 new_load += scale - 1;
4747
4748 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4749 }
4750
4751 sched_avg_update(this_rq);
4752}
4753
4754
4755static unsigned long weighted_cpuload(const int cpu)
4756{
4757 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4758}
4759
4760#ifdef CONFIG_NO_HZ_COMMON
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775static void cpu_load_update_nohz(struct rq *this_rq,
4776 unsigned long curr_jiffies,
4777 unsigned long load)
4778{
4779 unsigned long pending_updates;
4780
4781 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4782 if (pending_updates) {
4783 this_rq->last_load_update_tick = curr_jiffies;
4784
4785
4786
4787
4788
4789 cpu_load_update(this_rq, load, pending_updates);
4790 }
4791}
4792
4793
4794
4795
4796
4797static void cpu_load_update_idle(struct rq *this_rq)
4798{
4799
4800
4801
4802 if (weighted_cpuload(cpu_of(this_rq)))
4803 return;
4804
4805 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
4806}
4807
4808
4809
4810
4811
4812
4813
4814void cpu_load_update_nohz_start(void)
4815{
4816 struct rq *this_rq = this_rq();
4817
4818
4819
4820
4821
4822
4823 this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
4824}
4825
4826
4827
4828
4829void cpu_load_update_nohz_stop(void)
4830{
4831 unsigned long curr_jiffies = READ_ONCE(jiffies);
4832 struct rq *this_rq = this_rq();
4833 unsigned long load;
4834
4835 if (curr_jiffies == this_rq->last_load_update_tick)
4836 return;
4837
4838 load = weighted_cpuload(cpu_of(this_rq));
4839 raw_spin_lock(&this_rq->lock);
4840 update_rq_clock(this_rq);
4841 cpu_load_update_nohz(this_rq, curr_jiffies, load);
4842 raw_spin_unlock(&this_rq->lock);
4843}
4844#else
4845static inline void cpu_load_update_nohz(struct rq *this_rq,
4846 unsigned long curr_jiffies,
4847 unsigned long load) { }
4848#endif
4849
4850static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
4851{
4852#ifdef CONFIG_NO_HZ_COMMON
4853
4854 this_rq->last_load_update_tick = READ_ONCE(jiffies);
4855#endif
4856 cpu_load_update(this_rq, load, 1);
4857}
4858
4859
4860
4861
4862void cpu_load_update_active(struct rq *this_rq)
4863{
4864 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4865
4866 if (tick_nohz_tick_stopped())
4867 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
4868 else
4869 cpu_load_update_periodic(this_rq, load);
4870}
4871
4872
4873
4874
4875
4876
4877
4878
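/*
 * Return a low estimate of the load on a migration-source cpu (the lesser
 * of the cpu_load[] history entry and the instantaneous load), so that we
 * balance conservatively.  target_load() below is the high-estimate
 * counterpart for migration targets.
 */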
4879static unsigned long source_load(int cpu, int type)
4880{
4881 struct rq *rq = cpu_rq(cpu);
4882 unsigned long total = weighted_cpuload(cpu);
4883
4884 if (type == 0 || !sched_feat(LB_BIAS))
4885 return total;
4886
4887 return min(rq->cpu_load[type-1], total);
4888}
4889
4890
4891
4892
4893
4894static unsigned long target_load(int cpu, int type)
4895{
4896 struct rq *rq = cpu_rq(cpu);
4897 unsigned long total = weighted_cpuload(cpu);
4898
4899 if (type == 0 || !sched_feat(LB_BIAS))
4900 return total;
4901
4902 return max(rq->cpu_load[type-1], total);
4903}
4904
4905static unsigned long capacity_of(int cpu)
4906{
4907 return cpu_rq(cpu)->cpu_capacity;
4908}
4909
4910static unsigned long capacity_orig_of(int cpu)
4911{
4912 return cpu_rq(cpu)->cpu_capacity_orig;
4913}
4914
4915static unsigned long cpu_avg_load_per_task(int cpu)
4916{
4917 struct rq *rq = cpu_rq(cpu);
4918 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4919 unsigned long load_avg = weighted_cpuload(cpu);
4920
4921 if (nr_running)
4922 return load_avg / nr_running;
4923
4924 return 0;
4925}
4926
4927#ifdef CONFIG_FAIR_GROUP_SCHED
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
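/*
 * effective_load() estimates the change in root-level load if @wl were
 * added on @cpu (and @wg to the group's total weight).  Walking up the
 * hierarchy, each level converts the child runqueue's weight change into
 * the change of its parent entity's share,
 *
 *   s_i = tg->shares * rw_i / \Sum_j rw_j
 *
 * and feeds that delta into the level above.  wake_affine() uses this to
 * compare placing the waking task on this_cpu versus prev_cpu.
 */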
4978static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4979{
4980 struct sched_entity *se = tg->se[cpu];
4981
4982 if (!tg->parent)
4983 return wl;
4984
4985 for_each_sched_entity(se) {
4986 struct cfs_rq *cfs_rq = se->my_q;
4987 long W, w = cfs_rq_load_avg(cfs_rq);
4988
4989 tg = cfs_rq->tg;
4990
4991
4992
4993
4994 W = wg + atomic_long_read(&tg->load_avg);
4995
4996
4997 W -= cfs_rq->tg_load_avg_contrib;
4998 W += w;
4999
5000
5001
5002
5003 w += wl;
5004
5005
5006
5007
5008 if (W > 0 && w < W)
5009 wl = (w * (long)tg->shares) / W;
5010 else
5011 wl = tg->shares;
5012
5013
5014
5015
5016
5017
5018 if (wl < MIN_SHARES)
5019 wl = MIN_SHARES;
5020
5021
5022
5023
5024 wl -= se->avg.load_avg;
5025
5026
5027
5028
5029
5030
5031
5032
5033 wg = 0;
5034 }
5035
5036 return wl;
5037}
5038#else
5039
5040static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5041{
5042 return wl;
5043}
5044
5045#endif
5046
5047static void record_wakee(struct task_struct *p)
5048{
5049
5050
5051
5052
5053 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5054 current->wakee_flips >>= 1;
5055 current->wakee_flip_decay_ts = jiffies;
5056 }
5057
5058 if (current->last_wakee != p) {
5059 current->last_wakee = p;
5060 current->wakee_flips++;
5061 }
5062}
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
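/*
 * Detect M:N waker/wakee relationships via wakee_flips (decayed in
 * record_wakee()): a high flip rate means the waker fans out to more
 * partners than one LLC (sd_llc_size) can usefully hold.  Returning 1
 * means "wake wide": skip the affine-wakeup path.
 */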
5081static int wake_wide(struct task_struct *p)
5082{
5083 unsigned int master = current->wakee_flips;
5084 unsigned int slave = p->wakee_flips;
5085 int factor = this_cpu_read(sd_llc_size);
5086
5087 if (master < slave)
5088 swap(master, slave);
5089 if (slave < factor || master < slave * factor)
5090 return 0;
5091 return 1;
5092}
5093
5094static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
5095{
5096 s64 this_load, load;
5097 s64 this_eff_load, prev_eff_load;
5098 int idx, this_cpu, prev_cpu;
5099 struct task_group *tg;
5100 unsigned long weight;
5101 int balanced;
5102
5103 idx = sd->wake_idx;
5104 this_cpu = smp_processor_id();
5105 prev_cpu = task_cpu(p);
5106 load = source_load(prev_cpu, idx);
5107 this_load = target_load(this_cpu, idx);
5108
5109
5110
5111
5112
5113
5114 if (sync) {
5115 tg = task_group(current);
5116 weight = current->se.avg.load_avg;
5117
5118 this_load += effective_load(tg, this_cpu, -weight, -weight);
5119 load += effective_load(tg, prev_cpu, 0, -weight);
5120 }
5121
5122 tg = task_group(p);
5123 weight = p->se.avg.load_avg;
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134 this_eff_load = 100;
5135 this_eff_load *= capacity_of(prev_cpu);
5136
5137 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5138 prev_eff_load *= capacity_of(this_cpu);
5139
5140 if (this_load > 0) {
5141 this_eff_load *= this_load +
5142 effective_load(tg, this_cpu, weight, weight);
5143
5144 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
5145 }
5146
5147 balanced = this_eff_load <= prev_eff_load;
5148
5149 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
5150
5151 if (!balanced)
5152 return 0;
5153
5154 schedstat_inc(sd, ttwu_move_affine);
5155 schedstat_inc(p, se.statistics.nr_wakeups_affine);
5156
5157 return 1;
5158}
5159
5160
5161
5162
5163
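/*
 * find_idlest_group finds and returns the least busy group of CPUs within
 * the sched_domain, preferring it over the local group only if it is
 * sufficiently less loaded (imbalance_pct).
 */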
5164static struct sched_group *
5165find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5166 int this_cpu, int sd_flag)
5167{
5168 struct sched_group *idlest = NULL, *group = sd->groups;
5169 unsigned long min_load = ULONG_MAX, this_load = 0;
5170 int load_idx = sd->forkexec_idx;
5171 int imbalance = 100 + (sd->imbalance_pct-100)/2;
5172
5173 if (sd_flag & SD_BALANCE_WAKE)
5174 load_idx = sd->wake_idx;
5175
5176 do {
5177 unsigned long load, avg_load;
5178 int local_group;
5179 int i;
5180
5181
5182 if (!cpumask_intersects(sched_group_cpus(group),
5183 tsk_cpus_allowed(p)))
5184 continue;
5185
5186 local_group = cpumask_test_cpu(this_cpu,
5187 sched_group_cpus(group));
5188
5189
5190 avg_load = 0;
5191
5192 for_each_cpu(i, sched_group_cpus(group)) {
5193
5194 if (local_group)
5195 load = source_load(i, load_idx);
5196 else
5197 load = target_load(i, load_idx);
5198
5199 avg_load += load;
5200 }
5201
5202
5203 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
5204
5205 if (local_group) {
5206 this_load = avg_load;
5207 } else if (avg_load < min_load) {
5208 min_load = avg_load;
5209 idlest = group;
5210 }
5211 } while (group = group->next, group != sd->groups);
5212
5213 if (!idlest || 100*this_load < imbalance*min_load)
5214 return NULL;
5215 return idlest;
5216}
5217
5218
5219
5220
5221static int
5222find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5223{
5224 unsigned long load, min_load = ULONG_MAX;
5225 unsigned int min_exit_latency = UINT_MAX;
5226 u64 latest_idle_timestamp = 0;
5227 int least_loaded_cpu = this_cpu;
5228 int shallowest_idle_cpu = -1;
5229 int i;
5230
5231
5232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5233 if (idle_cpu(i)) {
5234 struct rq *rq = cpu_rq(i);
5235 struct cpuidle_state *idle = idle_get_state(rq);
5236 if (idle && idle->exit_latency < min_exit_latency) {
5237
5238
5239
5240
5241
5242 min_exit_latency = idle->exit_latency;
5243 latest_idle_timestamp = rq->idle_stamp;
5244 shallowest_idle_cpu = i;
5245 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5246 rq->idle_stamp > latest_idle_timestamp) {
5247
5248
5249
5250
5251
5252 latest_idle_timestamp = rq->idle_stamp;
5253 shallowest_idle_cpu = i;
5254 }
5255 } else if (shallowest_idle_cpu == -1) {
5256 load = weighted_cpuload(i);
5257 if (load < min_load || (load == min_load && i == this_cpu)) {
5258 min_load = load;
5259 least_loaded_cpu = i;
5260 }
5261 }
5262 }
5263
5264 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5265}
5266
5267
5268
5269
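/*
 * Try to find an idle CPU that shares the last-level cache with @target,
 * preferring target itself, then the task's previous CPU, then a fully
 * idle group within the LLC domain.
 */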
5270static int select_idle_sibling(struct task_struct *p, int target)
5271{
5272 struct sched_domain *sd;
5273 struct sched_group *sg;
5274 int i = task_cpu(p);
5275
5276 if (idle_cpu(target))
5277 return target;
5278
5279
5280
5281
5282 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
5283 return i;
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300 sd = rcu_dereference(per_cpu(sd_llc, target));
5301 for_each_lower_domain(sd) {
5302 sg = sd->groups;
5303 do {
5304 if (!cpumask_intersects(sched_group_cpus(sg),
5305 tsk_cpus_allowed(p)))
5306 goto next;
5307
5308
5309 for_each_cpu(i, sched_group_cpus(sg)) {
5310 if (i == target || !idle_cpu(i))
5311 goto next;
5312 }
5313
5314
5315
5316
5317
5318 target = cpumask_first_and(sched_group_cpus(sg),
5319 tsk_cpus_allowed(p));
5320 goto done;
5321next:
5322 sg = sg->next;
5323 } while (sg != sd->groups);
5324 }
5325done:
5326 return target;
5327}
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
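/*
 * cpu_util() returns the utilization of a CPU's root cfs_rq, clamped to
 * capacity_orig: util_avg can transiently exceed the original capacity,
 * e.g. right after busy tasks migrate in.
 */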
5355static int cpu_util(int cpu)
5356{
5357 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
5358 unsigned long capacity = capacity_orig_of(cpu);
5359
5360 return (util >= capacity) ? capacity : util;
5361}
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
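/*
 * select_task_rq_fair: pick a target runqueue for a waking (or forking,
 * or execing) task, depending on the sd_flag passed in.  Either take the
 * affine fast path (an idle sibling near the waker) or search for the
 * idlest CPU in the idlest group of a matching domain.  Called with
 * preemption disabled; returns the chosen CPU number.
 */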
5375static int
5376select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5377{
5378 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5379 int cpu = smp_processor_id();
5380 int new_cpu = prev_cpu;
5381 int want_affine = 0;
5382 int sync = wake_flags & WF_SYNC;
5383
5384 if (sd_flag & SD_BALANCE_WAKE) {
5385 record_wakee(p);
5386 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5387 }
5388
5389 rcu_read_lock();
5390 for_each_domain(cpu, tmp) {
5391 if (!(tmp->flags & SD_LOAD_BALANCE))
5392 break;
5393
5394
5395
5396
5397
5398 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5399 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5400 affine_sd = tmp;
5401 break;
5402 }
5403
5404 if (tmp->flags & sd_flag)
5405 sd = tmp;
5406 else if (!want_affine)
5407 break;
5408 }
5409
5410 if (affine_sd) {
5411 sd = NULL;
5412 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5413 new_cpu = cpu;
5414 }
5415
5416 if (!sd) {
5417 if (sd_flag & SD_BALANCE_WAKE)
5418 new_cpu = select_idle_sibling(p, new_cpu);
5419
5420 } else while (sd) {
5421 struct sched_group *group;
5422 int weight;
5423
5424 if (!(sd->flags & sd_flag)) {
5425 sd = sd->child;
5426 continue;
5427 }
5428
5429 group = find_idlest_group(sd, p, cpu, sd_flag);
5430 if (!group) {
5431 sd = sd->child;
5432 continue;
5433 }
5434
5435 new_cpu = find_idlest_cpu(group, p, cpu);
5436 if (new_cpu == -1 || new_cpu == cpu) {
5437
5438 sd = sd->child;
5439 continue;
5440 }
5441
5442
5443 cpu = new_cpu;
5444 weight = sd->span_weight;
5445 sd = NULL;
5446 for_each_domain(cpu, tmp) {
5447 if (weight <= tmp->span_weight)
5448 break;
5449 if (tmp->flags & sd_flag)
5450 sd = tmp;
5451 }
5452
5453 }
5454 rcu_read_unlock();
5455
5456 return new_cpu;
5457}
5458
5459
5460
5461
5462
5463
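/*
 * Called when a task is migrated to a new CPU.  For wakeup migrations the
 * vruntime is made relative here (it is renormalized against the
 * destination runqueue on enqueue); the task's load is queued for removal
 * from the old cfs_rq, and last_update_time is cleared so the load gets
 * attached afresh on the new runqueue.
 */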
5464static void migrate_task_rq_fair(struct task_struct *p)
5465{
5466
5467
5468
5469
5470
5471
5472 if (p->state == TASK_WAKING) {
5473 struct sched_entity *se = &p->se;
5474 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5475 u64 min_vruntime;
5476
5477#ifndef CONFIG_64BIT
5478 u64 min_vruntime_copy;
5479
5480 do {
5481 min_vruntime_copy = cfs_rq->min_vruntime_copy;
5482 smp_rmb();
5483 min_vruntime = cfs_rq->min_vruntime;
5484 } while (min_vruntime != min_vruntime_copy);
5485#else
5486 min_vruntime = cfs_rq->min_vruntime;
5487#endif
5488
5489 se->vruntime -= min_vruntime;
5490 }
5491
5492
5493
5494
5495
5496
5497
5498
5499 remove_entity_load_avg(&p->se);
5500
5501
5502 p->se.avg.last_update_time = 0;
5503
5504
5505 p->se.exec_start = 0;
5506}
5507
5508static void task_dead_fair(struct task_struct *p)
5509{
5510 remove_entity_load_avg(&p->se);
5511}
5512#endif
5513
5514static unsigned long
5515wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5516{
5517 unsigned long gran = sysctl_sched_wakeup_granularity;
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532 return calc_delta_fair(gran, se);
5533}
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
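/*
 * Should 'se' preempt 'curr'?  Returns 1 when se's vruntime lags curr's
 * by more than the (weight-scaled) wakeup granularity, 0 when it lags by
 * less, and -1 when se is not behind at all.
 */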
5549static int
5550wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5551{
5552 s64 gran, vdiff = curr->vruntime - se->vruntime;
5553
5554 if (vdiff <= 0)
5555 return -1;
5556
5557 gran = wakeup_gran(curr, se);
5558 if (vdiff > gran)
5559 return 1;
5560
5561 return 0;
5562}
5563
5564static void set_last_buddy(struct sched_entity *se)
5565{
5566 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5567 return;
5568
5569 for_each_sched_entity(se)
5570 cfs_rq_of(se)->last = se;
5571}
5572
5573static void set_next_buddy(struct sched_entity *se)
5574{
5575 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5576 return;
5577
5578 for_each_sched_entity(se)
5579 cfs_rq_of(se)->next = se;
5580}
5581
5582static void set_skip_buddy(struct sched_entity *se)
5583{
5584 for_each_sched_entity(se)
5585 cfs_rq_of(se)->skip = se;
5586}
5587
5588
5589
5590
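/*
 * Preempt the current task with a newly woken task if needed.
 */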
5591static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5592{
5593 struct task_struct *curr = rq->curr;
5594 struct sched_entity *se = &curr->se, *pse = &p->se;
5595 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5596 int scale = cfs_rq->nr_running >= sched_nr_latency;
5597 int next_buddy_marked = 0;
5598
5599 if (unlikely(se == pse))
5600 return;
5601
5602
5603
5604
5605
5606
5607
5608 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5609 return;
5610
5611 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5612 set_next_buddy(pse);
5613 next_buddy_marked = 1;
5614 }
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626 if (test_tsk_need_resched(curr))
5627 return;
5628
5629
5630 if (unlikely(curr->policy == SCHED_IDLE) &&
5631 likely(p->policy != SCHED_IDLE))
5632 goto preempt;
5633
5634
5635
5636
5637
5638 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5639 return;
5640
5641 find_matching_se(&se, &pse);
5642 update_curr(cfs_rq_of(se));
5643 BUG_ON(!pse);
5644 if (wakeup_preempt_entity(se, pse) == 1) {
5645
5646
5647
5648
5649 if (!next_buddy_marked)
5650 set_next_buddy(pse);
5651 goto preempt;
5652 }
5653
5654 return;
5655
5656preempt:
5657 resched_curr(rq);
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667 if (unlikely(!se->on_rq || curr == rq->idle))
5668 return;
5669
5670 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5671 set_last_buddy(se);
5672}
5673
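/*
 * Pick the next task from this runqueue's CFS hierarchy: walk down the
 * cfs_rq tree selecting an entity at each level.  The "simple" path is
 * taken when the previous task was not a CFS task; when nothing is
 * runnable, try idle_balance() and return RETRY_TASK if that pulled in
 * work from a higher scheduling class.
 */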
5674static struct task_struct *
5675pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
5676{
5677 struct cfs_rq *cfs_rq = &rq->cfs;
5678 struct sched_entity *se;
5679 struct task_struct *p;
5680 int new_tasks;
5681
5682again:
5683#ifdef CONFIG_FAIR_GROUP_SCHED
5684 if (!cfs_rq->nr_running)
5685 goto idle;
5686
5687 if (prev->sched_class != &fair_sched_class)
5688 goto simple;
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698 do {
5699 struct sched_entity *curr = cfs_rq->curr;
5700
5701
5702
5703
5704
5705
5706
5707 if (curr) {
5708 if (curr->on_rq)
5709 update_curr(cfs_rq);
5710 else
5711 curr = NULL;
5712
5713
5714
5715
5716
5717
5718
5719 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5720 goto simple;
5721 }
5722
5723 se = pick_next_entity(cfs_rq, curr);
5724 cfs_rq = group_cfs_rq(se);
5725 } while (cfs_rq);
5726
5727 p = task_of(se);
5728
5729
5730
5731
5732
5733
5734 if (prev != p) {
5735 struct sched_entity *pse = &prev->se;
5736
5737 while (!(cfs_rq = is_same_group(se, pse))) {
5738 int se_depth = se->depth;
5739 int pse_depth = pse->depth;
5740
5741 if (se_depth <= pse_depth) {
5742 put_prev_entity(cfs_rq_of(pse), pse);
5743 pse = parent_entity(pse);
5744 }
5745 if (se_depth >= pse_depth) {
5746 set_next_entity(cfs_rq_of(se), se);
5747 se = parent_entity(se);
5748 }
5749 }
5750
5751 put_prev_entity(cfs_rq, pse);
5752 set_next_entity(cfs_rq, se);
5753 }
5754
5755 if (hrtick_enabled(rq))
5756 hrtick_start_fair(rq, p);
5757
5758 return p;
5759simple:
5760 cfs_rq = &rq->cfs;
5761#endif
5762
5763 if (!cfs_rq->nr_running)
5764 goto idle;
5765
5766 put_prev_task(rq, prev);
5767
5768 do {
5769 se = pick_next_entity(cfs_rq, NULL);
5770 set_next_entity(cfs_rq, se);
5771 cfs_rq = group_cfs_rq(se);
5772 } while (cfs_rq);
5773
5774 p = task_of(se);
5775
5776 if (hrtick_enabled(rq))
5777 hrtick_start_fair(rq, p);
5778
5779 return p;
5780
5781idle:
5782
5783
5784
5785
5786
5787
5788 lockdep_unpin_lock(&rq->lock, cookie);
5789 new_tasks = idle_balance(rq);
5790 lockdep_repin_lock(&rq->lock, cookie);
5791
5792
5793
5794
5795
5796 if (new_tasks < 0)
5797 return RETRY_TASK;
5798
5799 if (new_tasks > 0)
5800 goto again;
5801
5802 return NULL;
5803}
5804
5805
5806
5807
5808static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5809{
5810 struct sched_entity *se = &prev->se;
5811 struct cfs_rq *cfs_rq;
5812
5813 for_each_sched_entity(se) {
5814 cfs_rq = cfs_rq_of(se);
5815 put_prev_entity(cfs_rq, se);
5816 }
5817}
5818
5819
5820
5821
5822
5823
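/*
 * sched_yield() support: charge the yielding task for the time it has run
 * so far and mark it as the ->skip buddy so pick_next_entity() prefers
 * somebody else.
 */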
5824static void yield_task_fair(struct rq *rq)
5825{
5826 struct task_struct *curr = rq->curr;
5827 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5828 struct sched_entity *se = &curr->se;
5829
5830
5831
5832
5833 if (unlikely(rq->nr_running == 1))
5834 return;
5835
5836 clear_buddies(cfs_rq, se);
5837
5838 if (curr->policy != SCHED_BATCH) {
5839 update_rq_clock(rq);
5840
5841
5842
5843 update_curr(cfs_rq);
5844
5845
5846
5847
5848
5849 rq_clock_skip_update(rq, true);
5850 }
5851
5852 set_skip_buddy(se);
5853}
5854
5855static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5856{
5857 struct sched_entity *se = &p->se;
5858
5859
5860 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5861 return false;
5862
5863
5864 set_next_buddy(se);
5865
5866 yield_task_fair(rq);
5867
5868 return true;
5869}
5870
5871#ifdef CONFIG_SMP
5872
/*
 * Fair scheduling class load-balancing methods.
 */
5990static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5991
5992enum fbq_type { regular, remote, all };
5993
5994#define LBF_ALL_PINNED 0x01
5995#define LBF_NEED_BREAK 0x02
5996#define LBF_DST_PINNED 0x04
5997#define LBF_SOME_PINNED 0x08
5998
5999struct lb_env {
6000 struct sched_domain *sd;
6001
6002 struct rq *src_rq;
6003 int src_cpu;
6004
6005 int dst_cpu;
6006 struct rq *dst_rq;
6007
6008 struct cpumask *dst_grpmask;
6009 int new_dst_cpu;
6010 enum cpu_idle_type idle;
6011 long imbalance;
6012
6013 struct cpumask *cpus;
6014
6015 unsigned int flags;
6016
6017 unsigned int loop;
6018 unsigned int loop_break;
6019 unsigned int loop_max;
6020
6021 enum fbq_type fbq_type;
6022 struct list_head tasks;
6023};
6024
6025
6026
6027
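/*
 * Is this task likely cache-hot on its current CPU?
 */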
6028static int task_hot(struct task_struct *p, struct lb_env *env)
6029{
6030 s64 delta;
6031
6032 lockdep_assert_held(&env->src_rq->lock);
6033
6034 if (p->sched_class != &fair_sched_class)
6035 return 0;
6036
6037 if (unlikely(p->policy == SCHED_IDLE))
6038 return 0;
6039
6040
6041
6042
6043 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
6044 (&p->se == cfs_rq_of(&p->se)->next ||
6045 &p->se == cfs_rq_of(&p->se)->last))
6046 return 1;
6047
6048 if (sysctl_sched_migration_cost == -1)
6049 return 1;
6050 if (sysctl_sched_migration_cost == 0)
6051 return 0;
6052
6053 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
6054
6055 return delta < (s64)sysctl_sched_migration_cost;
6056}
6057
6058#ifdef CONFIG_NUMA_BALANCING
6059
6060
6061
6062
6063
6064static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
6065{
6066 struct numa_group *numa_group = rcu_dereference(p->numa_group);
6067 unsigned long src_faults, dst_faults;
6068 int src_nid, dst_nid;
6069
6070 if (!static_branch_likely(&sched_numa_balancing))
6071 return -1;
6072
6073 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
6074 return -1;
6075
6076 src_nid = cpu_to_node(env->src_cpu);
6077 dst_nid = cpu_to_node(env->dst_cpu);
6078
6079 if (src_nid == dst_nid)
6080 return -1;
6081
6082
6083 if (src_nid == p->numa_preferred_nid) {
6084 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
6085 return 1;
6086 else
6087 return -1;
6088 }
6089
6090
6091 if (dst_nid == p->numa_preferred_nid)
6092 return 0;
6093
6094 if (numa_group) {
6095 src_faults = group_faults(p, src_nid);
6096 dst_faults = group_faults(p, dst_nid);
6097 } else {
6098 src_faults = task_faults(p, src_nid);
6099 dst_faults = task_faults(p, dst_nid);
6100 }
6101
6102 return dst_faults < src_faults;
6103}
6104
6105#else
6106static inline int migrate_degrades_locality(struct task_struct *p,
6107 struct lb_env *env)
6108{
6109 return -1;
6110}
6111#endif
6112
6113
6114
6115
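/*
 * can_migrate_task - may task p from env->src_rq be migrated to
 * env->dst_cpu as part of the current balance pass?
 */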
6116static
6117int can_migrate_task(struct task_struct *p, struct lb_env *env)
6118{
6119 int tsk_cache_hot;
6120
6121 lockdep_assert_held(&env->src_rq->lock);
6122
6123
6124
6125
6126
6127
6128
6129
6130 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
6131 return 0;
6132
6133 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6134 int cpu;
6135
6136 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
6137
6138 env->flags |= LBF_SOME_PINNED;
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
6149 return 0;
6150
6151
6152 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
6153 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6154 env->flags |= LBF_DST_PINNED;
6155 env->new_dst_cpu = cpu;
6156 break;
6157 }
6158 }
6159
6160 return 0;
6161 }
6162
6163
6164 env->flags &= ~LBF_ALL_PINNED;
6165
6166 if (task_running(env->src_rq, p)) {
6167 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
6168 return 0;
6169 }
6170
6171
6172
6173
6174
6175
6176
6177 tsk_cache_hot = migrate_degrades_locality(p, env);
6178 if (tsk_cache_hot == -1)
6179 tsk_cache_hot = task_hot(p, env);
6180
6181 if (tsk_cache_hot <= 0 ||
6182 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6183 if (tsk_cache_hot == 1) {
6184 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
6185 schedstat_inc(p, se.statistics.nr_forced_migrations);
6186 }
6187 return 1;
6188 }
6189
6190 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
6191 return 0;
6192}
6193
6194
6195
6196
6197static void detach_task(struct task_struct *p, struct lb_env *env)
6198{
6199 lockdep_assert_held(&env->src_rq->lock);
6200
6201 p->on_rq = TASK_ON_RQ_MIGRATING;
6202 deactivate_task(env->src_rq, p, 0);
6203 set_task_cpu(p, env->dst_cpu);
6204}
6205
6206
6207
6208
6209
6210
6211
6212static struct task_struct *detach_one_task(struct lb_env *env)
6213{
6214 struct task_struct *p, *n;
6215
6216 lockdep_assert_held(&env->src_rq->lock);
6217
6218 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6219 if (!can_migrate_task(p, env))
6220 continue;
6221
6222 detach_task(p, env);
6223
6224
6225
6226
6227
6228
6229
6230 schedstat_inc(env->sd, lb_gained[env->idle]);
6231 return p;
6232 }
6233 return NULL;
6234}
6235
6236static const unsigned int sched_nr_migrate_break = 32;
6237
6238
6239
6240
6241
6242
6243
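/*
 * detach_tasks() -- try to detach up to env->imbalance weighted load from
 * env->src_rq, as part of a balance operation within domain env->sd.
 *
 * Returns the number of detached tasks, 0 on failure.
 */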
6244static int detach_tasks(struct lb_env *env)
6245{
6246 struct list_head *tasks = &env->src_rq->cfs_tasks;
6247 struct task_struct *p;
6248 unsigned long load;
6249 int detached = 0;
6250
6251 lockdep_assert_held(&env->src_rq->lock);
6252
6253 if (env->imbalance <= 0)
6254 return 0;
6255
6256 while (!list_empty(tasks)) {
6257
6258
6259
6260
6261 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
6262 break;
6263
6264 p = list_first_entry(tasks, struct task_struct, se.group_node);
6265
6266 env->loop++;
6267
6268 if (env->loop > env->loop_max)
6269 break;
6270
6271
6272 if (env->loop > env->loop_break) {
6273 env->loop_break += sched_nr_migrate_break;
6274 env->flags |= LBF_NEED_BREAK;
6275 break;
6276 }
6277
6278 if (!can_migrate_task(p, env))
6279 goto next;
6280
6281 load = task_h_load(p);
6282
6283 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
6284 goto next;
6285
6286 if ((load / 2) > env->imbalance)
6287 goto next;
6288
6289 detach_task(p, env);
6290 list_add(&p->se.group_node, &env->tasks);
6291
6292 detached++;
6293 env->imbalance -= load;
6294
6295#ifdef CONFIG_PREEMPT
6296
6297
6298
6299
6300
6301 if (env->idle == CPU_NEWLY_IDLE)
6302 break;
6303#endif
6304
6305
6306
6307
6308
6309 if (env->imbalance <= 0)
6310 break;
6311
6312 continue;
6313next:
6314 list_move_tail(&p->se.group_node, tasks);
6315 }
6316
6317
6318
6319
6320
6321
6322 schedstat_add(env->sd, lb_gained[env->idle], detached);
6323
6324 return detached;
6325}
6326
6327
6328
6329
6330static void attach_task(struct rq *rq, struct task_struct *p)
6331{
6332 lockdep_assert_held(&rq->lock);
6333
6334 BUG_ON(task_rq(p) != rq);
6335 activate_task(rq, p, 0);
6336 p->on_rq = TASK_ON_RQ_QUEUED;
6337 check_preempt_curr(rq, p, 0);
6338}
6339
6340
6341
6342
6343
6344static void attach_one_task(struct rq *rq, struct task_struct *p)
6345{
6346 raw_spin_lock(&rq->lock);
6347 attach_task(rq, p);
6348 raw_spin_unlock(&rq->lock);
6349}
6350
6351
6352
6353
6354
6355static void attach_tasks(struct lb_env *env)
6356{
6357 struct list_head *tasks = &env->tasks;
6358 struct task_struct *p;
6359
6360 raw_spin_lock(&env->dst_rq->lock);
6361
6362 while (!list_empty(tasks)) {
6363 p = list_first_entry(tasks, struct task_struct, se.group_node);
6364 list_del_init(&p->se.group_node);
6365
6366 attach_task(env->dst_rq, p);
6367 }
6368
6369 raw_spin_unlock(&env->dst_rq->lock);
6370}
6371
6372#ifdef CONFIG_FAIR_GROUP_SCHED
6373static void update_blocked_averages(int cpu)
6374{
6375 struct rq *rq = cpu_rq(cpu);
6376 struct cfs_rq *cfs_rq;
6377 unsigned long flags;
6378
6379 raw_spin_lock_irqsave(&rq->lock, flags);
6380 update_rq_clock(rq);
6381
6382
6383
6384
6385
6386 for_each_leaf_cfs_rq(rq, cfs_rq) {
6387
6388 if (throttled_hierarchy(cfs_rq))
6389 continue;
6390
6391 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
6392 update_tg_load_avg(cfs_rq, 0);
6393 }
6394 raw_spin_unlock_irqrestore(&rq->lock, flags);
6395}
6396
6397
6398
6399
6400
6401
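/*
 * Compute the hierarchical load (h_load) of cfs_rq and its ancestors.
 * Each group's contribution is a fraction of its parent's load, so the
 * result is built top-down; it is cached per jiffy via last_h_load_update.
 */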
6402static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6403{
6404 struct rq *rq = rq_of(cfs_rq);
6405 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6406 unsigned long now = jiffies;
6407 unsigned long load;
6408
6409 if (cfs_rq->last_h_load_update == now)
6410 return;
6411
6412 cfs_rq->h_load_next = NULL;
6413 for_each_sched_entity(se) {
6414 cfs_rq = cfs_rq_of(se);
6415 cfs_rq->h_load_next = se;
6416 if (cfs_rq->last_h_load_update == now)
6417 break;
6418 }
6419
6420 if (!se) {
6421 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6422 cfs_rq->last_h_load_update = now;
6423 }
6424
6425 while ((se = cfs_rq->h_load_next) != NULL) {
6426 load = cfs_rq->h_load;
6427 load = div64_ul(load * se->avg.load_avg,
6428 cfs_rq_load_avg(cfs_rq) + 1);
6429 cfs_rq = group_cfs_rq(se);
6430 cfs_rq->h_load = load;
6431 cfs_rq->last_h_load_update = now;
6432 }
6433}
6434
6435static unsigned long task_h_load(struct task_struct *p)
6436{
6437 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6438
6439 update_cfs_rq_h_load(cfs_rq);
6440 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6441 cfs_rq_load_avg(cfs_rq) + 1);
6442}
6443#else
6444static inline void update_blocked_averages(int cpu)
6445{
6446 struct rq *rq = cpu_rq(cpu);
6447 struct cfs_rq *cfs_rq = &rq->cfs;
6448 unsigned long flags;
6449
6450 raw_spin_lock_irqsave(&rq->lock, flags);
6451 update_rq_clock(rq);
6452 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
6453 raw_spin_unlock_irqrestore(&rq->lock, flags);
6454}
6455
6456static unsigned long task_h_load(struct task_struct *p)
6457{
6458 return p->se.avg.load_avg;
6459}
6460#endif
6461
6462
6463
6464enum group_type {
6465 group_other = 0,
6466 group_imbalanced,
6467 group_overloaded,
6468};
6469
6470
6471
6472
6473struct sg_lb_stats {
6474 unsigned long avg_load;
6475 unsigned long group_load;
6476 unsigned long sum_weighted_load;
6477 unsigned long load_per_task;
6478 unsigned long group_capacity;
6479 unsigned long group_util;
6480 unsigned int sum_nr_running;
6481 unsigned int idle_cpus;
6482 unsigned int group_weight;
6483 enum group_type group_type;
6484 int group_no_capacity;
6485#ifdef CONFIG_NUMA_BALANCING
6486 unsigned int nr_numa_running;
6487 unsigned int nr_preferred_running;
6488#endif
6489};
6490
6491
6492
6493
6494
6495struct sd_lb_stats {
6496 struct sched_group *busiest;
6497 struct sched_group *local;
6498 unsigned long total_load;
6499 unsigned long total_capacity;
6500 unsigned long avg_load;
6501
6502 struct sg_lb_stats busiest_stat;
6503 struct sg_lb_stats local_stat;
6504};
6505
6506static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6507{
6508
6509
6510
6511
6512
6513
6514 *sds = (struct sd_lb_stats){
6515 .busiest = NULL,
6516 .local = NULL,
6517 .total_load = 0UL,
6518 .total_capacity = 0UL,
6519 .busiest_stat = {
6520 .avg_load = 0UL,
6521 .sum_nr_running = 0,
6522 .group_type = group_other,
6523 },
6524 };
6525}
6526
6527
6528
6529
6530
6531
6532
6533
6534static inline int get_sd_load_idx(struct sched_domain *sd,
6535 enum cpu_idle_type idle)
6536{
6537 int load_idx;
6538
6539 switch (idle) {
6540 case CPU_NOT_IDLE:
6541 load_idx = sd->busy_idx;
6542 break;
6543
6544 case CPU_NEWLY_IDLE:
6545 load_idx = sd->newidle_idx;
6546 break;
6547 default:
6548 load_idx = sd->idle_idx;
6549 break;
6550 }
6551
6552 return load_idx;
6553}
6554
6555static unsigned long scale_rt_capacity(int cpu)
6556{
6557 struct rq *rq = cpu_rq(cpu);
6558 u64 total, used, age_stamp, avg;
6559 s64 delta;
6560
6561
6562
6563
6564
6565 age_stamp = READ_ONCE(rq->age_stamp);
6566 avg = READ_ONCE(rq->rt_avg);
6567 delta = __rq_clock_broken(rq) - age_stamp;
6568
6569 if (unlikely(delta < 0))
6570 delta = 0;
6571
6572 total = sched_avg_period() + delta;
6573
6574 used = div_u64(avg, total);
6575
6576 if (likely(used < SCHED_CAPACITY_SCALE))
6577 return SCHED_CAPACITY_SCALE - used;
6578
6579 return 1;
6580}
6581
6582static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6583{
6584 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6585 struct sched_group *sdg = sd->groups;
6586
6587 cpu_rq(cpu)->cpu_capacity_orig = capacity;
6588
6589 capacity *= scale_rt_capacity(cpu);
6590 capacity >>= SCHED_CAPACITY_SHIFT;
6591
6592 if (!capacity)
6593 capacity = 1;
6594
6595 cpu_rq(cpu)->cpu_capacity = capacity;
6596 sdg->sgc->capacity = capacity;
6597}
6598
6599void update_group_capacity(struct sched_domain *sd, int cpu)
6600{
6601 struct sched_domain *child = sd->child;
6602 struct sched_group *group, *sdg = sd->groups;
6603 unsigned long capacity;
6604 unsigned long interval;
6605
6606 interval = msecs_to_jiffies(sd->balance_interval);
6607 interval = clamp(interval, 1UL, max_load_balance_interval);
6608 sdg->sgc->next_update = jiffies + interval;
6609
6610 if (!child) {
6611 update_cpu_capacity(sd, cpu);
6612 return;
6613 }
6614
6615 capacity = 0;
6616
6617 if (child->flags & SD_OVERLAP) {
6618
6619
6620
6621
6622
6623 for_each_cpu(cpu, sched_group_cpus(sdg)) {
6624 struct sched_group_capacity *sgc;
6625 struct rq *rq = cpu_rq(cpu);
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638 if (unlikely(!rq->sd)) {
6639 capacity += capacity_of(cpu);
6640 continue;
6641 }
6642
6643 sgc = rq->sd->groups->sgc;
6644 capacity += sgc->capacity;
6645 }
6646 } else {
6647
6648
6649
6650
6651
6652 group = child->groups;
6653 do {
6654 capacity += group->sgc->capacity;
6655 group = group->next;
6656 } while (group != child->groups);
6657 }
6658
6659 sdg->sgc->capacity = capacity;
6660}
6661
6662
6663
6664
6665
6666
6667static inline int
6668check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6669{
6670 return ((rq->cpu_capacity * sd->imbalance_pct) <
6671 (rq->cpu_capacity_orig * 100));
6672}
6673
/*
 * Group imbalance: ->sgc->imbalance is set by load_balance() (see the
 * LBF_SOME_PINNED handling below) when task affinity kept it from moving
 * load out of a group; sg_imbalanced() reports that flag so a parent
 * domain can compensate.
 */
6703static inline int sg_imbalanced(struct sched_group *group)
6704{
6705 return group->sgc->imbalance;
6706}
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720static inline bool
6721group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6722{
6723 if (sgs->sum_nr_running < sgs->group_weight)
6724 return true;
6725
6726 if ((sgs->group_capacity * 100) >
6727 (sgs->group_util * env->sd->imbalance_pct))
6728 return true;
6729
6730 return false;
6731}
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741static inline bool
6742group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6743{
6744 if (sgs->sum_nr_running <= sgs->group_weight)
6745 return false;
6746
6747 if ((sgs->group_capacity * 100) <
6748 (sgs->group_util * env->sd->imbalance_pct))
6749 return true;
6750
6751 return false;
6752}
6753
6754static inline enum
6755group_type group_classify(struct sched_group *group,
6756 struct sg_lb_stats *sgs)
6757{
6758 if (sgs->group_no_capacity)
6759 return group_overloaded;
6760
6761 if (sg_imbalanced(group))
6762 return group_imbalanced;
6763
6764 return group_other;
6765}
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776static inline void update_sg_lb_stats(struct lb_env *env,
6777 struct sched_group *group, int load_idx,
6778 int local_group, struct sg_lb_stats *sgs,
6779 bool *overload)
6780{
6781 unsigned long load;
6782 int i, nr_running;
6783
6784 memset(sgs, 0, sizeof(*sgs));
6785
6786 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6787 struct rq *rq = cpu_rq(i);
6788
6789
6790 if (local_group)
6791 load = target_load(i, load_idx);
6792 else
6793 load = source_load(i, load_idx);
6794
6795 sgs->group_load += load;
6796 sgs->group_util += cpu_util(i);
6797 sgs->sum_nr_running += rq->cfs.h_nr_running;
6798
6799 nr_running = rq->nr_running;
6800 if (nr_running > 1)
6801 *overload = true;
6802
6803#ifdef CONFIG_NUMA_BALANCING
6804 sgs->nr_numa_running += rq->nr_numa_running;
6805 sgs->nr_preferred_running += rq->nr_preferred_running;
6806#endif
6807 sgs->sum_weighted_load += weighted_cpuload(i);
6808
6809
6810
6811 if (!nr_running && idle_cpu(i))
6812 sgs->idle_cpus++;
6813 }
6814
6815
6816 sgs->group_capacity = group->sgc->capacity;
6817 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6818
6819 if (sgs->sum_nr_running)
6820 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6821
6822 sgs->group_weight = group->group_weight;
6823
6824 sgs->group_no_capacity = group_is_overloaded(env, sgs);
6825 sgs->group_type = group_classify(group, sgs);
6826}
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
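/*
 * update_sd_pick_busiest - return true if @sg is busier than the
 * previously selected busiest group (@sds->busiest), comparing group type
 * first and average load second, with extra checks for SD_ASYM_PACKING
 * domains.
 */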
6841static bool update_sd_pick_busiest(struct lb_env *env,
6842 struct sd_lb_stats *sds,
6843 struct sched_group *sg,
6844 struct sg_lb_stats *sgs)
6845{
6846 struct sg_lb_stats *busiest = &sds->busiest_stat;
6847
6848 if (sgs->group_type > busiest->group_type)
6849 return true;
6850
6851 if (sgs->group_type < busiest->group_type)
6852 return false;
6853
6854 if (sgs->avg_load <= busiest->avg_load)
6855 return false;
6856
6857
6858 if (!(env->sd->flags & SD_ASYM_PACKING))
6859 return true;
6860
6861
6862 if (env->idle == CPU_NOT_IDLE)
6863 return true;
6864
6865
6866
6867
6868
6869 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6870 if (!sds->busiest)
6871 return true;
6872
6873
6874 if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
6875 return true;
6876 }
6877
6878 return false;
6879}
6880
6881#ifdef CONFIG_NUMA_BALANCING
6882static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6883{
6884 if (sgs->sum_nr_running > sgs->nr_numa_running)
6885 return regular;
6886 if (sgs->sum_nr_running > sgs->nr_preferred_running)
6887 return remote;
6888 return all;
6889}
6890
6891static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6892{
6893 if (rq->nr_running > rq->nr_numa_running)
6894 return regular;
6895 if (rq->nr_running > rq->nr_preferred_running)
6896 return remote;
6897 return all;
6898}
6899#else
6900static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6901{
6902 return all;
6903}
6904
6905static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6906{
6907 return regular;
6908}
6909#endif
6910
6911
6912
6913
6914
6915
6916static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6917{
6918 struct sched_domain *child = env->sd->child;
6919 struct sched_group *sg = env->sd->groups;
6920 struct sg_lb_stats tmp_sgs;
6921 int load_idx, prefer_sibling = 0;
6922 bool overload = false;
6923
6924 if (child && child->flags & SD_PREFER_SIBLING)
6925 prefer_sibling = 1;
6926
6927 load_idx = get_sd_load_idx(env->sd, env->idle);
6928
6929 do {
6930 struct sg_lb_stats *sgs = &tmp_sgs;
6931 int local_group;
6932
6933 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6934 if (local_group) {
6935 sds->local = sg;
6936 sgs = &sds->local_stat;
6937
6938 if (env->idle != CPU_NEWLY_IDLE ||
6939 time_after_eq(jiffies, sg->sgc->next_update))
6940 update_group_capacity(env->sd, env->dst_cpu);
6941 }
6942
6943 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6944 &overload);
6945
6946 if (local_group)
6947 goto next_group;
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959 if (prefer_sibling && sds->local &&
6960 group_has_capacity(env, &sds->local_stat) &&
6961 (sgs->sum_nr_running > 1)) {
6962 sgs->group_no_capacity = 1;
6963 sgs->group_type = group_classify(sg, sgs);
6964 }
6965
6966 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6967 sds->busiest = sg;
6968 sds->busiest_stat = *sgs;
6969 }
6970
6971next_group:
6972
6973 sds->total_load += sgs->group_load;
6974 sds->total_capacity += sgs->group_capacity;
6975
6976 sg = sg->next;
6977 } while (sg != env->sd->groups);
6978
6979 if (env->sd->flags & SD_NUMA)
6980 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6981
6982 if (!env->sd->parent) {
6983
6984 if (env->dst_rq->rd->overload != overload)
6985 env->dst_rq->rd->overload = overload;
6986 }
6987
6988}
6989
6990
/*
 * check_asym_packing - on an SD_ASYM_PACKING domain, if this (idle) CPU is
 * numbered no higher than the first CPU of the busiest group, set
 * env->imbalance so that group's load is pulled here.  Returns 1 when
 * packing should take place, 0 otherwise.
 */
7013static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
7014{
7015 int busiest_cpu;
7016
7017 if (!(env->sd->flags & SD_ASYM_PACKING))
7018 return 0;
7019
7020 if (env->idle == CPU_NOT_IDLE)
7021 return 0;
7022
7023 if (!sds->busiest)
7024 return 0;
7025
7026 busiest_cpu = group_first_cpu(sds->busiest);
7027 if (env->dst_cpu > busiest_cpu)
7028 return 0;
7029
7030 env->imbalance = DIV_ROUND_CLOSEST(
7031 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
7032 SCHED_CAPACITY_SCALE);
7033
7034 return 1;
7035}
7036
7037
7038
7039
7040
7041
7042
7043
7044static inline
7045void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7046{
7047 unsigned long tmp, capa_now = 0, capa_move = 0;
7048 unsigned int imbn = 2;
7049 unsigned long scaled_busy_load_per_task;
7050 struct sg_lb_stats *local, *busiest;
7051
7052 local = &sds->local_stat;
7053 busiest = &sds->busiest_stat;
7054
7055 if (!local->sum_nr_running)
7056 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
7057 else if (busiest->load_per_task > local->load_per_task)
7058 imbn = 1;
7059
7060 scaled_busy_load_per_task =
7061 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7062 busiest->group_capacity;
7063
7064 if (busiest->avg_load + scaled_busy_load_per_task >=
7065 local->avg_load + (scaled_busy_load_per_task * imbn)) {
7066 env->imbalance = busiest->load_per_task;
7067 return;
7068 }
7069
7070
7071
7072
7073
7074
7075
7076 capa_now += busiest->group_capacity *
7077 min(busiest->load_per_task, busiest->avg_load);
7078 capa_now += local->group_capacity *
7079 min(local->load_per_task, local->avg_load);
7080 capa_now /= SCHED_CAPACITY_SCALE;
7081
7082
7083 if (busiest->avg_load > scaled_busy_load_per_task) {
7084 capa_move += busiest->group_capacity *
7085 min(busiest->load_per_task,
7086 busiest->avg_load - scaled_busy_load_per_task);
7087 }
7088
7089
7090 if (busiest->avg_load * busiest->group_capacity <
7091 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
7092 tmp = (busiest->avg_load * busiest->group_capacity) /
7093 local->group_capacity;
7094 } else {
7095 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7096 local->group_capacity;
7097 }
7098 capa_move += local->group_capacity *
7099 min(local->load_per_task, local->avg_load + tmp);
7100 capa_move /= SCHED_CAPACITY_SCALE;
7101
7102
7103 if (capa_move > capa_now)
7104 env->imbalance = busiest->load_per_task;
7105}
7106
7107
7108
7109
7110
7111
7112
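/*
 * calculate_imbalance - compute how much weighted load should be moved
 * from the busiest group to the local group; falls back to
 * fix_small_imbalance() when the computed amount would be too small to be
 * useful.
 */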
7113static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7114{
7115 unsigned long max_pull, load_above_capacity = ~0UL;
7116 struct sg_lb_stats *local, *busiest;
7117
7118 local = &sds->local_stat;
7119 busiest = &sds->busiest_stat;
7120
7121 if (busiest->group_type == group_imbalanced) {
7122
7123
7124
7125
7126 busiest->load_per_task =
7127 min(busiest->load_per_task, sds->avg_load);
7128 }
7129
7130
7131
7132
7133
7134
7135
7136 if (busiest->avg_load <= sds->avg_load ||
7137 local->avg_load >= sds->avg_load) {
7138 env->imbalance = 0;
7139 return fix_small_imbalance(env, sds);
7140 }
7141
7142
7143
7144
7145 if (busiest->group_type == group_overloaded &&
7146 local->group_type == group_overloaded) {
7147 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7148 if (load_above_capacity > busiest->group_capacity) {
7149 load_above_capacity -= busiest->group_capacity;
7150 load_above_capacity *= NICE_0_LOAD;
7151 load_above_capacity /= busiest->group_capacity;
7152 } else
7153 load_above_capacity = ~0UL;
7154 }
7155
7156
7157
7158
7159
7160
7161
7162
7163 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
7164
7165
7166 env->imbalance = min(
7167 max_pull * busiest->group_capacity,
7168 (sds->avg_load - local->avg_load) * local->group_capacity
7169 ) / SCHED_CAPACITY_SCALE;
7170
7171
7172
7173
7174
7175
7176
7177 if (env->imbalance < busiest->load_per_task)
7178 return fix_small_imbalance(env, sds);
7179}
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
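/*
 * find_busiest_group - return the busiest group within the sched_domain
 * if there is an imbalance worth correcting, and set env->imbalance to the
 * amount of weighted load that should be moved; return NULL (and zero the
 * imbalance) otherwise.
 */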
7194static struct sched_group *find_busiest_group(struct lb_env *env)
7195{
7196 struct sg_lb_stats *local, *busiest;
7197 struct sd_lb_stats sds;
7198
7199 init_sd_lb_stats(&sds);
7200
7201
7202
7203
7204
7205 update_sd_lb_stats(env, &sds);
7206 local = &sds.local_stat;
7207 busiest = &sds.busiest_stat;
7208
7209
7210 if (check_asym_packing(env, &sds))
7211 return sds.busiest;
7212
7213
7214 if (!sds.busiest || busiest->sum_nr_running == 0)
7215 goto out_balanced;
7216
7217 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
7218 / sds.total_capacity;
7219
7220
7221
7222
7223
7224
7225 if (busiest->group_type == group_imbalanced)
7226 goto force_balance;
7227
7228
7229 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
7230 busiest->group_no_capacity)
7231 goto force_balance;
7232
7233
7234
7235
7236
7237 if (local->avg_load >= busiest->avg_load)
7238 goto out_balanced;
7239
7240
7241
7242
7243
7244 if (local->avg_load >= sds.avg_load)
7245 goto out_balanced;
7246
7247 if (env->idle == CPU_IDLE) {
7248
7249
7250
7251
7252
7253
7254
7255 if ((busiest->group_type != group_overloaded) &&
7256 (local->idle_cpus <= (busiest->idle_cpus + 1)))
7257 goto out_balanced;
7258 } else {
7259
7260
7261
7262
7263 if (100 * busiest->avg_load <=
7264 env->sd->imbalance_pct * local->avg_load)
7265 goto out_balanced;
7266 }
7267
7268force_balance:
7269
7270 calculate_imbalance(env, &sds);
7271 return sds.busiest;
7272
7273out_balanced:
7274 env->imbalance = 0;
7275 return NULL;
7276}
7277
7278
7279
7280
7281static struct rq *find_busiest_queue(struct lb_env *env,
7282 struct sched_group *group)
7283{
7284 struct rq *busiest = NULL, *rq;
7285 unsigned long busiest_load = 0, busiest_capacity = 1;
7286 int i;
7287
7288 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
7289 unsigned long capacity, wl;
7290 enum fbq_type rt;
7291
7292 rq = cpu_rq(i);
7293 rt = fbq_classify_rq(rq);
7294
		/*
		 * Skip runqueues whose NUMA classification (fbq_type) does
		 * not match the kind of tasks this balance pass is looking
		 * for; see fbq_classify_group()/fbq_classify_rq().
		 */
7314 if (rt > env->fbq_type)
7315 continue;
7316
7317 capacity = capacity_of(i);
7318
7319 wl = weighted_cpuload(i);
7320
7321
7322
7323
7324
7325
7326 if (rq->nr_running == 1 && wl > env->imbalance &&
7327 !check_cpu_capacity(rq, env->sd))
7328 continue;
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341 if (wl * busiest_capacity > busiest_load * capacity) {
7342 busiest_load = wl;
7343 busiest_capacity = capacity;
7344 busiest = rq;
7345 }
7346 }
7347
7348 return busiest;
7349}
7350
7351
7352
7353
7354
7355#define MAX_PINNED_INTERVAL 512
7356
7357
7358DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7359
7360static int need_active_balance(struct lb_env *env)
7361{
7362 struct sched_domain *sd = env->sd;
7363
7364 if (env->idle == CPU_NEWLY_IDLE) {
7365
7366
7367
7368
7369
7370
7371 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7372 return 1;
7373 }
7374
7375
7376
7377
7378
7379
7380
7381 if ((env->idle != CPU_NOT_IDLE) &&
7382 (env->src_rq->cfs.h_nr_running == 1)) {
7383 if ((check_cpu_capacity(env->src_rq, sd)) &&
7384 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
7385 return 1;
7386 }
7387
7388 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7389}
7390
7391static int active_load_balance_cpu_stop(void *data);
7392
7393static int should_we_balance(struct lb_env *env)
7394{
7395 struct sched_group *sg = env->sd->groups;
7396 struct cpumask *sg_cpus, *sg_mask;
7397 int cpu, balance_cpu = -1;
7398
7399
7400
7401
7402
7403 if (env->idle == CPU_NEWLY_IDLE)
7404 return 1;
7405
7406 sg_cpus = sched_group_cpus(sg);
7407 sg_mask = sched_group_mask(sg);
7408
7409 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
7410 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
7411 continue;
7412
7413 balance_cpu = cpu;
7414 break;
7415 }
7416
7417 if (balance_cpu == -1)
7418 balance_cpu = group_balance_cpu(sg);
7419
7420
7421
7422
7423
7424 return balance_cpu == env->dst_cpu;
7425}
7426
7427
7428
7429
7430
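/*
 * Check this_cpu to ensure it is balanced within domain.  Attempt to move
 * tasks if there is an imbalance; returns the number of tasks moved.
 */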
7431static int load_balance(int this_cpu, struct rq *this_rq,
7432 struct sched_domain *sd, enum cpu_idle_type idle,
7433 int *continue_balancing)
7434{
7435 int ld_moved, cur_ld_moved, active_balance = 0;
7436 struct sched_domain *sd_parent = sd->parent;
7437 struct sched_group *group;
7438 struct rq *busiest;
7439 unsigned long flags;
7440 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
7441
7442 struct lb_env env = {
7443 .sd = sd,
7444 .dst_cpu = this_cpu,
7445 .dst_rq = this_rq,
7446 .dst_grpmask = sched_group_cpus(sd->groups),
7447 .idle = idle,
7448 .loop_break = sched_nr_migrate_break,
7449 .cpus = cpus,
7450 .fbq_type = all,
7451 .tasks = LIST_HEAD_INIT(env.tasks),
7452 };
7453
7454
7455
7456
7457
7458 if (idle == CPU_NEWLY_IDLE)
7459 env.dst_grpmask = NULL;
7460
7461 cpumask_copy(cpus, cpu_active_mask);
7462
7463 schedstat_inc(sd, lb_count[idle]);
7464
7465redo:
7466 if (!should_we_balance(&env)) {
7467 *continue_balancing = 0;
7468 goto out_balanced;
7469 }
7470
7471 group = find_busiest_group(&env);
7472 if (!group) {
7473 schedstat_inc(sd, lb_nobusyg[idle]);
7474 goto out_balanced;
7475 }
7476
7477 busiest = find_busiest_queue(&env, group);
7478 if (!busiest) {
7479 schedstat_inc(sd, lb_nobusyq[idle]);
7480 goto out_balanced;
7481 }
7482
7483 BUG_ON(busiest == env.dst_rq);
7484
7485 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7486
7487 env.src_cpu = busiest->cpu;
7488 env.src_rq = busiest;
7489
7490 ld_moved = 0;
7491 if (busiest->nr_running > 1) {
7492
7493
7494
7495
7496
7497
7498 env.flags |= LBF_ALL_PINNED;
7499 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
7500
7501more_balance:
7502 raw_spin_lock_irqsave(&busiest->lock, flags);
7503
7504
7505
7506
7507
7508 cur_ld_moved = detach_tasks(&env);
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518 raw_spin_unlock(&busiest->lock);
7519
7520 if (cur_ld_moved) {
7521 attach_tasks(&env);
7522 ld_moved += cur_ld_moved;
7523 }
7524
7525 local_irq_restore(flags);
7526
7527 if (env.flags & LBF_NEED_BREAK) {
7528 env.flags &= ~LBF_NEED_BREAK;
7529 goto more_balance;
7530 }
7531
		/*
		 * Some tasks could not be moved to env.dst_cpu because of
		 * affinity but would fit another CPU in our sched_group
		 * (LBF_DST_PINNED set by can_migrate_task()); retry the
		 * balance pass targeting that CPU.
		 */
7551 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7552
7553
7554 cpumask_clear_cpu(env.dst_cpu, env.cpus);
7555
7556 env.dst_rq = cpu_rq(env.new_dst_cpu);
7557 env.dst_cpu = env.new_dst_cpu;
7558 env.flags &= ~LBF_DST_PINNED;
7559 env.loop = 0;
7560 env.loop_break = sched_nr_migrate_break;
7561
7562
7563
7564
7565
7566 goto more_balance;
7567 }
7568
7569
7570
7571
7572 if (sd_parent) {
7573 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7574
7575 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7576 *group_imbalance = 1;
7577 }
7578
7579
7580 if (unlikely(env.flags & LBF_ALL_PINNED)) {
7581 cpumask_clear_cpu(cpu_of(busiest), cpus);
7582 if (!cpumask_empty(cpus)) {
7583 env.loop = 0;
7584 env.loop_break = sched_nr_migrate_break;
7585 goto redo;
7586 }
7587 goto out_all_pinned;
7588 }
7589 }
7590
7591 if (!ld_moved) {
7592 schedstat_inc(sd, lb_failed[idle]);
7593
7594
7595
7596
7597
7598
7599 if (idle != CPU_NEWLY_IDLE)
7600 sd->nr_balance_failed++;
7601
7602 if (need_active_balance(&env)) {
7603 raw_spin_lock_irqsave(&busiest->lock, flags);
7604
7605
7606
7607
7608
7609 if (!cpumask_test_cpu(this_cpu,
7610 tsk_cpus_allowed(busiest->curr))) {
7611 raw_spin_unlock_irqrestore(&busiest->lock,
7612 flags);
7613 env.flags |= LBF_ALL_PINNED;
7614 goto out_one_pinned;
7615 }
7616
7617
7618
7619
7620
7621
7622 if (!busiest->active_balance) {
7623 busiest->active_balance = 1;
7624 busiest->push_cpu = this_cpu;
7625 active_balance = 1;
7626 }
7627 raw_spin_unlock_irqrestore(&busiest->lock, flags);
7628
7629 if (active_balance) {
7630 stop_one_cpu_nowait(cpu_of(busiest),
7631 active_load_balance_cpu_stop, busiest,
7632 &busiest->active_balance_work);
7633 }
7634
7635
7636 sd->nr_balance_failed = sd->cache_nice_tries+1;
7637 }
7638 } else
7639 sd->nr_balance_failed = 0;
7640
7641 if (likely(!active_balance)) {
7642
7643 sd->balance_interval = sd->min_interval;
7644 } else {
7645
7646
7647
7648
7649
7650
7651 if (sd->balance_interval < sd->max_interval)
7652 sd->balance_interval *= 2;
7653 }
7654
7655 goto out;
7656
7657out_balanced:
7658
7659
7660
7661
7662 if (sd_parent) {
7663 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7664
7665 if (*group_imbalance)
7666 *group_imbalance = 0;
7667 }
7668
7669out_all_pinned:
7670
7671
7672
7673
7674
7675 schedstat_inc(sd, lb_balanced[idle]);
7676
7677 sd->nr_balance_failed = 0;
7678
7679out_one_pinned:
7680
7681 if (((env.flags & LBF_ALL_PINNED) &&
7682 sd->balance_interval < MAX_PINNED_INTERVAL) ||
7683 (sd->balance_interval < sd->max_interval))
7684 sd->balance_interval *= 2;
7685
7686 ld_moved = 0;
7687out:
7688 return ld_moved;
7689}
7690
7691static inline unsigned long
7692get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7693{
7694 unsigned long interval = sd->balance_interval;
7695
7696 if (cpu_busy)
7697 interval *= sd->busy_factor;
7698
7699
7700 interval = msecs_to_jiffies(interval);
7701 interval = clamp(interval, 1UL, max_load_balance_interval);
7702
7703 return interval;
7704}
7705
7706static inline void
7707update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7708{
7709 unsigned long interval, next;
7710
7711 interval = get_sd_balance_interval(sd, cpu_busy);
7712 next = sd->last_balance + interval;
7713
7714 if (time_after(*next_balance, next))
7715 *next_balance = next;
7716}
7717
7718
7719
7720
7721
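/*
 * idle_balance() is called by schedule() when this CPU is about to become
 * idle; it attempts to pull tasks from other CPUs.  Returns the number of
 * tasks pulled, or -1 when a task from a higher scheduling class became
 * runnable in the meantime.
 */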
7722static int idle_balance(struct rq *this_rq)
7723{
7724 unsigned long next_balance = jiffies + HZ;
7725 int this_cpu = this_rq->cpu;
7726 struct sched_domain *sd;
7727 int pulled_task = 0;
7728 u64 curr_cost = 0;
7729
7730
7731
7732
7733
7734 this_rq->idle_stamp = rq_clock(this_rq);
7735
7736 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7737 !this_rq->rd->overload) {
7738 rcu_read_lock();
7739 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7740 if (sd)
7741 update_next_balance(sd, 0, &next_balance);
7742 rcu_read_unlock();
7743
7744 goto out;
7745 }
7746
7747 raw_spin_unlock(&this_rq->lock);
7748
7749 update_blocked_averages(this_cpu);
7750 rcu_read_lock();
7751 for_each_domain(this_cpu, sd) {
7752 int continue_balancing = 1;
7753 u64 t0, domain_cost;
7754
7755 if (!(sd->flags & SD_LOAD_BALANCE))
7756 continue;
7757
7758 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7759 update_next_balance(sd, 0, &next_balance);
7760 break;
7761 }
7762
7763 if (sd->flags & SD_BALANCE_NEWIDLE) {
7764 t0 = sched_clock_cpu(this_cpu);
7765
7766 pulled_task = load_balance(this_cpu, this_rq,
7767 sd, CPU_NEWLY_IDLE,
7768 &continue_balancing);
7769
7770 domain_cost = sched_clock_cpu(this_cpu) - t0;
7771 if (domain_cost > sd->max_newidle_lb_cost)
7772 sd->max_newidle_lb_cost = domain_cost;
7773
7774 curr_cost += domain_cost;
7775 }
7776
7777 update_next_balance(sd, 0, &next_balance);
7778
7779
7780
7781
7782
7783 if (pulled_task || this_rq->nr_running > 0)
7784 break;
7785 }
7786 rcu_read_unlock();
7787
7788 raw_spin_lock(&this_rq->lock);
7789
7790 if (curr_cost > this_rq->max_idle_balance_cost)
7791 this_rq->max_idle_balance_cost = curr_cost;
7792
7793
7794
7795
7796
7797
7798 if (this_rq->cfs.h_nr_running && !pulled_task)
7799 pulled_task = 1;
7800
7801out:
7802
7803 if (time_after(this_rq->next_balance, next_balance))
7804 this_rq->next_balance = next_balance;
7805
7806
7807 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7808 pulled_task = -1;
7809
7810 if (pulled_task)
7811 this_rq->idle_stamp = 0;
7812
7813 return pulled_task;
7814}
7815
7816
7817
7818
7819
7820
7821
7822static int active_load_balance_cpu_stop(void *data)
7823{
7824 struct rq *busiest_rq = data;
7825 int busiest_cpu = cpu_of(busiest_rq);
7826 int target_cpu = busiest_rq->push_cpu;
7827 struct rq *target_rq = cpu_rq(target_cpu);
7828 struct sched_domain *sd;
7829 struct task_struct *p = NULL;
7830
7831 raw_spin_lock_irq(&busiest_rq->lock);
7832
7833
7834 if (unlikely(busiest_cpu != smp_processor_id() ||
7835 !busiest_rq->active_balance))
7836 goto out_unlock;
7837
7838
7839 if (busiest_rq->nr_running <= 1)
7840 goto out_unlock;
7841
7842
7843
7844
7845
7846
7847 BUG_ON(busiest_rq == target_rq);
7848
7849
7850 rcu_read_lock();
7851 for_each_domain(target_cpu, sd) {
7852 if ((sd->flags & SD_LOAD_BALANCE) &&
7853 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7854 break;
7855 }
7856
7857 if (likely(sd)) {
7858 struct lb_env env = {
7859 .sd = sd,
7860 .dst_cpu = target_cpu,
7861 .dst_rq = target_rq,
7862 .src_cpu = busiest_rq->cpu,
7863 .src_rq = busiest_rq,
7864 .idle = CPU_IDLE,
7865 };
7866
7867 schedstat_inc(sd, alb_count);
7868
7869 p = detach_one_task(&env);
7870 if (p) {
7871 schedstat_inc(sd, alb_pushed);
7872
7873 sd->nr_balance_failed = 0;
7874 } else {
7875 schedstat_inc(sd, alb_failed);
7876 }
7877 }
7878 rcu_read_unlock();
7879out_unlock:
7880 busiest_rq->active_balance = 0;
7881 raw_spin_unlock(&busiest_rq->lock);
7882
7883 if (p)
7884 attach_one_task(target_rq, p);
7885
7886 local_irq_enable();
7887
7888 return 0;
7889}
7890
7891static inline int on_null_domain(struct rq *rq)
7892{
7893 return unlikely(!rcu_dereference_sched(rq->sd));
7894}
7895
7896#ifdef CONFIG_NO_HZ_COMMON
7897
7898
7899
7900
7901
7902
7903static struct {
7904 cpumask_var_t idle_cpus_mask;
7905 atomic_t nr_cpus;
7906 unsigned long next_balance;
7907} nohz ____cacheline_aligned;
7908
7909static inline int find_new_ilb(void)
7910{
7911 int ilb = cpumask_first(nohz.idle_cpus_mask);
7912
7913 if (ilb < nr_cpu_ids && idle_cpu(ilb))
7914 return ilb;
7915
7916 return nr_cpu_ids;
7917}
7918
7919
7920
7921
7922
7923
7924static void nohz_balancer_kick(void)
7925{
7926 int ilb_cpu;
7927
7928 nohz.next_balance++;
7929
7930 ilb_cpu = find_new_ilb();
7931
7932 if (ilb_cpu >= nr_cpu_ids)
7933 return;
7934
7935 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7936 return;
7937
7938
7939
7940
7941
7942
7943 smp_send_reschedule(ilb_cpu);
7944 return;
7945}
7946
7947void nohz_balance_exit_idle(unsigned int cpu)
7948{
7949 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7950
7951
7952
7953 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7954 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7955 atomic_dec(&nohz.nr_cpus);
7956 }
7957 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7958 }
7959}
7960
7961static inline void set_cpu_sd_state_busy(void)
7962{
7963 struct sched_domain *sd;
7964 int cpu = smp_processor_id();
7965
7966 rcu_read_lock();
7967 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7968
7969 if (!sd || !sd->nohz_idle)
7970 goto unlock;
7971 sd->nohz_idle = 0;
7972
7973 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7974unlock:
7975 rcu_read_unlock();
7976}
7977
7978void set_cpu_sd_state_idle(void)
7979{
7980 struct sched_domain *sd;
7981 int cpu = smp_processor_id();
7982
7983 rcu_read_lock();
7984 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7985
7986 if (!sd || sd->nohz_idle)
7987 goto unlock;
7988 sd->nohz_idle = 1;
7989
7990 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7991unlock:
7992 rcu_read_unlock();
7993}
7994
7995
7996
7997
7998
7999void nohz_balance_enter_idle(int cpu)
8000{
8001
8002
8003
8004 if (!cpu_active(cpu))
8005 return;
8006
8007 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8008 return;
8009
8010
8011
8012
8013 if (on_null_domain(cpu_rq(cpu)))
8014 return;
8015
8016 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
8017 atomic_inc(&nohz.nr_cpus);
8018 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8019}
8020#endif
8021
8022static DEFINE_SPINLOCK(balancing);
8023
8024
8025
8026
8027
8028void update_max_interval(void)
8029{
8030 max_load_balance_interval = HZ*num_online_cpus()/10;
8031}
8032
8033
8034
8035
8036
8037
8038
8039static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
8040{
8041 int continue_balancing = 1;
8042 int cpu = rq->cpu;
8043 unsigned long interval;
8044 struct sched_domain *sd;
8045
8046 unsigned long next_balance = jiffies + 60*HZ;
8047 int update_next_balance = 0;
8048 int need_serialize, need_decay = 0;
8049 u64 max_cost = 0;
8050
8051 update_blocked_averages(cpu);
8052
8053 rcu_read_lock();
8054 for_each_domain(cpu, sd) {
8055
8056
8057
8058
8059 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
8060 sd->max_newidle_lb_cost =
8061 (sd->max_newidle_lb_cost * 253) / 256;
8062 sd->next_decay_max_lb_cost = jiffies + HZ;
8063 need_decay = 1;
8064 }
8065 max_cost += sd->max_newidle_lb_cost;
8066
8067 if (!(sd->flags & SD_LOAD_BALANCE))
8068 continue;
8069
8070
8071
8072
8073
8074
8075 if (!continue_balancing) {
8076 if (need_decay)
8077 continue;
8078 break;
8079 }
8080
8081 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8082
8083 need_serialize = sd->flags & SD_SERIALIZE;
8084 if (need_serialize) {
8085 if (!spin_trylock(&balancing))
8086 goto out;
8087 }
8088
8089 if (time_after_eq(jiffies, sd->last_balance + interval)) {
8090 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
8091
8092
8093
8094
8095
8096 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
8097 }
8098 sd->last_balance = jiffies;
8099 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8100 }
8101 if (need_serialize)
8102 spin_unlock(&balancing);
8103out:
8104 if (time_after(next_balance, sd->last_balance + interval)) {
8105 next_balance = sd->last_balance + interval;
8106 update_next_balance = 1;
8107 }
8108 }
8109 if (need_decay) {
8110
8111
8112
8113
8114 rq->max_idle_balance_cost =
8115 max((u64)sysctl_sched_migration_cost, max_cost);
8116 }
8117 rcu_read_unlock();
8118
8119
8120
8121
8122
8123
8124 if (likely(update_next_balance)) {
8125 rq->next_balance = next_balance;
8126
8127#ifdef CONFIG_NO_HZ_COMMON
8128
8129
8130
8131
8132
8133
8134
8135
8136 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
8137 nohz.next_balance = rq->next_balance;
8138#endif
8139 }
8140}
8141
8142#ifdef CONFIG_NO_HZ_COMMON
8143
8144
8145
8146
8147static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
8148{
8149 int this_cpu = this_rq->cpu;
8150 struct rq *rq;
8151 int balance_cpu;
8152
8153 unsigned long next_balance = jiffies + 60*HZ;
8154 int update_next_balance = 0;
8155
8156 if (idle != CPU_IDLE ||
8157 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8158 goto end;
8159
8160 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8161 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8162 continue;
8163
8164
8165
8166
8167
8168
8169 if (need_resched())
8170 break;
8171
8172 rq = cpu_rq(balance_cpu);
8173
8174
8175
8176
8177
8178 if (time_after_eq(jiffies, rq->next_balance)) {
8179 raw_spin_lock_irq(&rq->lock);
8180 update_rq_clock(rq);
8181 cpu_load_update_idle(rq);
8182 raw_spin_unlock_irq(&rq->lock);
8183 rebalance_domains(rq, CPU_IDLE);
8184 }
8185
8186 if (time_after(next_balance, rq->next_balance)) {
8187 next_balance = rq->next_balance;
8188 update_next_balance = 1;
8189 }
8190 }
8191
8192
8193
8194
8195
8196
8197 if (likely(update_next_balance))
8198 nohz.next_balance = next_balance;
8199end:
8200 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8201}
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
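/*
 * Heuristic for kicking the idle load balancer from a busy, tick-running
 * CPU: kick when this rq has two or more tasks, when its capacity is
 * significantly reduced while a CFS task is running, when more than one
 * CPU in the LLC domain is busy, or when an asymmetric-packing domain has
 * an idle CPU numbered lower than this one.
 */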
8214static inline bool nohz_kick_needed(struct rq *rq)
8215{
8216 unsigned long now = jiffies;
8217 struct sched_domain *sd;
8218 struct sched_group_capacity *sgc;
8219 int nr_busy, cpu = rq->cpu;
8220 bool kick = false;
8221
8222 if (unlikely(rq->idle_balance))
8223 return false;
8224
8225
8226
8227
8228
8229 set_cpu_sd_state_busy();
8230 nohz_balance_exit_idle(cpu);
8231
8232
8233
8234
8235
8236 if (likely(!atomic_read(&nohz.nr_cpus)))
8237 return false;
8238
8239 if (time_before(now, nohz.next_balance))
8240 return false;
8241
8242 if (rq->nr_running >= 2)
8243 return true;
8244
8245 rcu_read_lock();
8246 sd = rcu_dereference(per_cpu(sd_busy, cpu));
8247 if (sd) {
8248 sgc = sd->groups->sgc;
8249 nr_busy = atomic_read(&sgc->nr_busy_cpus);
8250
8251 if (nr_busy > 1) {
8252 kick = true;
8253 goto unlock;
8254 }
8255
8256 }
8257
8258 sd = rcu_dereference(rq->sd);
8259 if (sd) {
8260 if ((rq->cfs.h_nr_running >= 1) &&
8261 check_cpu_capacity(rq, sd)) {
8262 kick = true;
8263 goto unlock;
8264 }
8265 }
8266
8267 sd = rcu_dereference(per_cpu(sd_asym, cpu));
8268 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
8269 sched_domain_span(sd)) < cpu)) {
8270 kick = true;
8271 goto unlock;
8272 }
8273
8274unlock:
8275 rcu_read_unlock();
8276 return kick;
8277}
8278#else
8279static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
8280#endif
8281
8282
8283
8284
8285
8286static void run_rebalance_domains(struct softirq_action *h)
8287{
8288 struct rq *this_rq = this_rq();
8289 enum cpu_idle_type idle = this_rq->idle_balance ?
8290 CPU_IDLE : CPU_NOT_IDLE;
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300 nohz_idle_balance(this_rq, idle);
8301 rebalance_domains(this_rq, idle);
8302}
8303
8304
8305
8306
8307void trigger_load_balance(struct rq *rq)
8308{
8309
8310 if (unlikely(on_null_domain(rq)))
8311 return;
8312
8313 if (time_after_eq(jiffies, rq->next_balance))
8314 raise_softirq(SCHED_SOFTIRQ);
8315#ifdef CONFIG_NO_HZ_COMMON
8316 if (nohz_kick_needed(rq))
8317 nohz_balancer_kick();
8318#endif
8319}
8320
8321static void rq_online_fair(struct rq *rq)
8322{
8323 update_sysctl();
8324
8325 update_runtime_enabled(rq);
8326}
8327
8328static void rq_offline_fair(struct rq *rq)
8329{
8330 update_sysctl();
8331
8332
8333 unthrottle_offline_cfs_rqs(rq);
8334}
8335
8336#endif
8337
8338
8339
8340
8341static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
8342{
8343 struct cfs_rq *cfs_rq;
8344 struct sched_entity *se = &curr->se;
8345
8346 for_each_sched_entity(se) {
8347 cfs_rq = cfs_rq_of(se);
8348 entity_tick(cfs_rq, se, queued);
8349 }
8350
8351 if (static_branch_unlikely(&sched_numa_balancing))
8352 task_tick_numa(rq, curr);
8353}
8354
8355
8356
8357
8358
8359
8360static void task_fork_fair(struct task_struct *p)
8361{
8362 struct cfs_rq *cfs_rq;
8363 struct sched_entity *se = &p->se, *curr;
8364 struct rq *rq = this_rq();
8365
8366 raw_spin_lock(&rq->lock);
8367 update_rq_clock(rq);
8368
8369 cfs_rq = task_cfs_rq(current);
8370 curr = cfs_rq->curr;
8371 if (curr) {
8372 update_curr(cfs_rq);
8373 se->vruntime = curr->vruntime;
8374 }
8375 place_entity(cfs_rq, se, 1);
8376
8377 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
8378
8379
8380
8381
8382 swap(curr->vruntime, se->vruntime);
8383 resched_curr(rq);
8384 }
8385
8386 se->vruntime -= cfs_rq->min_vruntime;
8387 raw_spin_unlock(&rq->lock);
8388}
8389
8390
8391
8392
8393
8394static void
8395prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
8396{
8397 if (!task_on_rq_queued(p))
8398 return;
8399
8400
8401
8402
8403
8404
8405 if (rq->curr == p) {
8406 if (p->prio > oldprio)
8407 resched_curr(rq);
8408 } else
8409 check_preempt_curr(rq, p, 0);
8410}
8411
8412static inline bool vruntime_normalized(struct task_struct *p)
8413{
8414 struct sched_entity *se = &p->se;
8415
8416
8417
8418
8419
8420
8421 if (p->on_rq)
8422 return true;
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
8434 return true;
8435
8436 return false;
8437}
8438
8439static void detach_task_cfs_rq(struct task_struct *p)
8440{
8441 struct sched_entity *se = &p->se;
8442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8445
8446 if (!vruntime_normalized(p)) {
8447
8448
8449
8450
8451 place_entity(cfs_rq, se, 0);
8452 se->vruntime -= cfs_rq->min_vruntime;
8453 }
8454
8455
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8457 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update)
8459 update_tg_load_avg(cfs_rq, false);
8460}
8461
8462static void attach_task_cfs_rq(struct task_struct *p)
8463{
8464 struct sched_entity *se = &p->se;
8465 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8468
8469#ifdef CONFIG_FAIR_GROUP_SCHED
8470
8471
8472
8473
8474 se->depth = se->parent ? se->parent->depth + 1 : 0;
8475#endif
8476
8477
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8479 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update)
8481 update_tg_load_avg(cfs_rq, false);
8482
8483 if (!vruntime_normalized(p))
8484 se->vruntime += cfs_rq->min_vruntime;
8485}
8486
8487static void switched_from_fair(struct rq *rq, struct task_struct *p)
8488{
8489 detach_task_cfs_rq(p);
8490}
8491
8492static void switched_to_fair(struct rq *rq, struct task_struct *p)
8493{
8494 attach_task_cfs_rq(p);
8495
8496 if (task_on_rq_queued(p)) {
8497
8498
8499
8500
8501
8502 if (rq->curr == p)
8503 resched_curr(rq);
8504 else
8505 check_preempt_curr(rq, p, 0);
8506 }
8507}
8508
8509
8510
8511
8512
8513
8514static void set_curr_task_fair(struct rq *rq)
8515{
8516 struct sched_entity *se = &rq->curr->se;
8517
8518 for_each_sched_entity(se) {
8519 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8520
8521 set_next_entity(cfs_rq, se);
8522
8523 account_cfs_rq_runtime(cfs_rq, 0);
8524 }
8525}
8526
8527void init_cfs_rq(struct cfs_rq *cfs_rq)
8528{
8529 cfs_rq->tasks_timeline = RB_ROOT;
8530 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8531#ifndef CONFIG_64BIT
8532 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8533#endif
8534#ifdef CONFIG_SMP
8535 atomic_long_set(&cfs_rq->removed_load_avg, 0);
8536 atomic_long_set(&cfs_rq->removed_util_avg, 0);
8537#endif
8538}
8539
8540#ifdef CONFIG_FAIR_GROUP_SCHED
8541static void task_set_group_fair(struct task_struct *p)
8542{
8543 struct sched_entity *se = &p->se;
8544
8545 set_task_rq(p, task_cpu(p));
8546 se->depth = se->parent ? se->parent->depth + 1 : 0;
8547}
8548
8549static void task_move_group_fair(struct task_struct *p)
8550{
8551 detach_task_cfs_rq(p);
8552 set_task_rq(p, task_cpu(p));
8553
8554#ifdef CONFIG_SMP
8555
8556 p->se.avg.last_update_time = 0;
8557#endif
8558 attach_task_cfs_rq(p);
8559}
8560
8561static void task_change_group_fair(struct task_struct *p, int type)
8562{
8563 switch (type) {
8564 case TASK_SET_GROUP:
8565 task_set_group_fair(p);
8566 break;
8567
8568 case TASK_MOVE_GROUP:
8569 task_move_group_fair(p);
8570 break;
8571 }
8572}
8573
8574void free_fair_sched_group(struct task_group *tg)
8575{
8576 int i;
8577
8578 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8579
8580 for_each_possible_cpu(i) {
8581 if (tg->cfs_rq)
8582 kfree(tg->cfs_rq[i]);
8583 if (tg->se)
8584 kfree(tg->se[i]);
8585 }
8586
8587 kfree(tg->cfs_rq);
8588 kfree(tg->se);
8589}

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct sched_entity *se;
	struct cfs_rq *cfs_rq;
	struct rq *rq;
	int i;

	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->se)
		goto err;

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		rq = cpu_rq(i);

		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;

		init_cfs_rq(cfs_rq);
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
		init_entity_runnable_average(se);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}
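
/*
 * On failure this returns 0 without unwinding the entries that were already
 * set up; the caller is expected to dispose of the half-built group via
 * free_fair_sched_group(), which tolerates a partially populated,
 * kzalloc()-zeroed tg->cfs_rq / tg->se (kfree(NULL) is a no-op).
 */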

void online_fair_sched_group(struct task_group *tg)
{
	struct sched_entity *se;
	struct rq *rq;
	int i;

	for_each_possible_cpu(i) {
		rq = cpu_rq(i);
		se = tg->se[i];

		raw_spin_lock_irq(&rq->lock);
		post_init_entity_util_avg(se);
		sync_throttle(tg, i);
		raw_spin_unlock_irq(&rq->lock);
	}
}

void unregister_fair_sched_group(struct task_group *tg)
{
	unsigned long flags;
	struct rq *rq;
	int cpu;

	for_each_possible_cpu(cpu) {
		if (tg->se[cpu])
			remove_entity_load_avg(tg->se[cpu]);

		/*
		 * Only empty task groups can be destroyed; so we can
		 * speculatively check on_list without danger of it being
		 * re-added.
		 */
		if (!tg->cfs_rq[cpu]->on_list)
			continue;

		rq = cpu_rq(cpu);

		raw_spin_lock_irqsave(&rq->lock, flags);
		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}
}

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent) {
		se->cfs_rq = &rq->cfs;
		se->depth = 0;
	} else {
		se->cfs_rq = parent->my_q;
		se->depth = parent->depth + 1;
	}

	se->my_q = cfs_rq;
	/* guarantee group entities always have weight */
	update_load_set(&se->load, NICE_0_LOAD);
	se->parent = parent;
}
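
/*
 * Rough picture of the linkage set up above for a non-root group entity:
 *
 *	parent cfs_rq (parent->my_q, or rq->cfs at the top level)
 *	      ^
 *	      |  se->cfs_rq	(queue this group entity is enqueued on)
 *	  group se
 *	      |  se->my_q	(queue owned by this group)
 *	      v
 *	   cfs_rq  ==  tg->cfs_rq[cpu]
 */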

static DEFINE_MUTEX(shares_mutex);

int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se;

		se = tg->se[i];
		/* Propagate contribution to hierarchy */
		raw_spin_lock_irqsave(&rq->lock, flags);

		/* Possible calls to update_curr() need rq clock */
		update_rq_clock(rq);
		for_each_sched_entity(se)
			update_cfs_shares(group_cfs_rq(se));
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
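
/*
 * Worked example, assuming the MIN_SHARES/MAX_SHARES and scale_load()
 * definitions in sched.h (2, 1 << 18, and a SCHED_FIXEDPOINT_SHIFT left
 * shift on 64-bit): a cpu.shares value of 1024 written through the cgroup
 * interface arrives here already scale_load()'d, equals NICE_0_LOAD, and
 * passes the clamp unchanged; a write of 0 is raised to scale_load(2) and
 * a write of 10000000 is capped at scale_load(262144).
 */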
#else /* CONFIG_FAIR_GROUP_SCHED */

void free_fair_sched_group(struct task_group *tg) { }

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

void online_fair_sched_group(struct task_group *tg) { }

void unregister_fair_sched_group(struct task_group *tg) { }

#endif /* CONFIG_FAIR_GROUP_SCHED */

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

	return rr_interval;
}
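
/*
 * Back-of-the-envelope check, assuming the default 6 ms sched_latency and a
 * lone nice-0 task: sched_slice() returns the full 6,000,000 ns period, and
 * NS_TO_JIFFIES() turns that into 6 jiffies at HZ=1000 (or 1 jiffy at
 * HZ=250), which is what sched_rr_get_interval() ends up reporting for a
 * SCHED_OTHER task.
 */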

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_dead		= task_dead_fair,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_change_group	= task_change_group_fair,
#endif
};
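
/*
 * The .next pointer places fair_sched_class just above idle_sched_class in
 * the priority-ordered list of classes the core scheduler walks (stop, dl,
 * rt, fair, idle at the time of this code), so task selection only falls
 * through to the idle class when no CFS task is runnable.
 */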

#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}

#ifdef CONFIG_NUMA_BALANCING
void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
	int node;
	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;

	for_each_online_node(node) {
		if (p->numa_faults) {
			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		if (p->numa_group) {
			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
		}
		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
	}
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
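
/*
 * run_rebalance_domains() is the SCHED_SOFTIRQ handler registered below; the
 * softirq is raised from the scheduler tick (trigger_load_balance()) when a
 * CPU is due for load balancing, and the nohz next_balance/idle_cpus_mask
 * state set up here tracks idle CPUs so one of them can be kicked to balance
 * on behalf of the others under NO_HZ.
 */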
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */

}