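/*
 * Completely Fair Scheduling (CFS) class:
 * implements the SCHED_NORMAL and SCHED_BATCH scheduling policies.
 */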
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"
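
/*
 * Targeted preemption latency for CPU-bound tasks, in nanoseconds
 * (default: 6ms, scaled by the factor computed in update_sysctl()).
 */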
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
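
/*
 * How the latency/granularity tunables scale with the number of CPUs
 * (see get_update_sysctl_factor()):
 *
 * SCHED_TUNABLESCALING_NONE   - not scaled
 * SCHED_TUNABLESCALING_LOG    - scaled by 1 + ilog2(ncpus) (default)
 * SCHED_TUNABLESCALING_LINEAR - scaled by ncpus
 */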
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;
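
/*
 * Minimal preemption granularity for CPU-bound tasks, in nanoseconds
 * (default: 0.75ms, scaled like sysctl_sched_latency).
 */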
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
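
/*
 * sysctl_sched_latency / sysctl_sched_min_granularity,
 * kept in sync by sched_proc_update_handler().
 */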
static unsigned int sched_nr_latency = 8;
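
/*
 * After fork, does the child run before its parent?
 * (default: 0, i.e. the parent may run first)
 */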
unsigned int sysctl_sched_child_runs_first __read_mostly;
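
/*
 * Wake-up preemption granularity, in nanoseconds
 * (default: 1ms, scaled like sysctl_sched_latency).
 */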
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
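
/*
 * Window over which load is averaged for shares distribution,
 * in nanoseconds (default: 10ms).
 */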
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
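
/*
 * Amount of runtime a group pulls from the global CFS bandwidth pool
 * at a time, in microseconds (default: 5ms).
 */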
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}
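
/*
 * Scale factor applied to the latency/granularity defaults, derived from
 * the number of online CPUs (capped at 8) according to
 * sysctl_sched_tunable_scaling.
 */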
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}
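
/*
 * delta_exec * weight / lw->weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * lw->inv_weight caches ~2^32 / lw->weight; 'fact' and the shift are
 * trimmed so the intermediate product stays within 64 bits.
 */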
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

const struct sched_class fair_sched_class;
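
/*
 * CFS operations on generic schedulable entities
 * (task groups and their per-cpu entities, or plain tasks):
 */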
#ifdef CONFIG_FAIR_GROUP_SCHED

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

#define entity_is_task(se) (!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se) 1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
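
/*
 * Scheduling class tree data structure manipulation methods.
 *
 * The vruntime helpers below compare via a signed difference, so they
 * still order correctly if the u64 vruntime clock wraps.
 */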
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
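
/*
 * Enqueue an entity into the rb-tree, ordered by vruntime
 * (entity_before()), caching the leftmost node.
 */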
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);

		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}
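
/*
 * Re-derive sched_nr_latency and the normalized tunables after a
 * sysctl write; the inverse of update_sysctl().
 */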
int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif
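
/*
 * delta /= w: scale delta by NICE_0_LOAD / se->load.weight.
 */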
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}
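
/*
 * The scheduling period: the targeted latency while nr_running is at or
 * below sched_nr_latency, stretched to
 * nr_running * sysctl_sched_min_granularity beyond that.
 */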
static u64 __sched_period(unsigned long nr_running)
{
	if (unlikely(nr_running > sched_nr_latency))
		return nr_running * sysctl_sched_min_granularity;
	else
		return sysctl_sched_latency;
}
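
/*
 * The wall-time slice an entity gets within the current period,
 * proportional to its weight at each level of the hierarchy:
 *
 *   s = p * se->load.weight / cfs_rq->load.weight
 */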
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}
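
/*
 * The same slice expressed in vruntime units, i.e. weighted relative to
 * NICE_0_LOAD:
 *
 *   vs = s / w
 */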
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
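
/*
 * Per-entity load-tracking constants: 1024us periods with a decay such
 * that y^LOAD_AVG_PERIOD = 0.5; LOAD_AVG_MAX is the maximum attainable
 * load_sum/util_sum, reached after LOAD_AVG_MAX_N periods.
 */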
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742
#define LOAD_AVG_MAX_N 345
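
/* Give new tasks a sane initial load/utilization average. */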
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	sa->last_update_time = 0;

	sa->period_contrib = 1023;
	sa->load_avg = scale_load_down(se->load.weight);
	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
}
#endif
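
/*
 * Update the current task's runtime statistics and advance its vruntime.
 */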
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
735
736static void update_curr_fair(struct rq *rq)
737{
738 update_curr(cfs_rq_of(&rq->curr->se));
739}
740
741static inline void
742update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
743{
744 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
745}
746
747
748
749
750static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
751{
752
753
754
755
756 if (se != cfs_rq->curr)
757 update_stats_wait_start(cfs_rq, se);
758}
759
760static void
761update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
762{
763 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
764 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
765 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
766 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
767 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
768#ifdef CONFIG_SCHEDSTATS
769 if (entity_is_task(se)) {
770 trace_sched_stat_wait(task_of(se),
771 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
772 }
773#endif
774 schedstat_set(se->statistics.wait_start, 0);
775}
776
777static inline void
778update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
779{
780
781
782
783
784 if (se != cfs_rq->curr)
785 update_stats_wait_end(cfs_rq, se);
786}
787
788
789
790
791static inline void
792update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
793{
794
795
796
797 se->exec_start = rq_clock_task(rq_of(cfs_rq));
798}
799
800
801
802
803
804#ifdef CONFIG_NUMA_BALANCING
805
806
807
808
809
810unsigned int sysctl_numa_balancing_scan_period_min = 1000;
811unsigned int sysctl_numa_balancing_scan_period_max = 60000;
812
813
814unsigned int sysctl_numa_balancing_scan_size = 256;
815
816
817unsigned int sysctl_numa_balancing_scan_delay = 1000;
818
819static unsigned int task_nr_scan_windows(struct task_struct *p)
820{
821 unsigned long rss = 0;
822 unsigned long nr_scan_pages;
823
824
825
826
827
828
829 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
830 rss = get_mm_rss(p->mm);
831 if (!rss)
832 rss = nr_scan_pages;
833
834 rss = round_up(rss, nr_scan_pages);
835 return rss / nr_scan_pages;
836}
837
838
839#define MAX_SCAN_WINDOW 2560
840
841static unsigned int task_scan_min(struct task_struct *p)
842{
843 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
844 unsigned int scan, floor;
845 unsigned int windows = 1;
846
847 if (scan_size < MAX_SCAN_WINDOW)
848 windows = MAX_SCAN_WINDOW / scan_size;
849 floor = 1000 / windows;
850
851 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
852 return max_t(unsigned int, floor, scan);
853}
854
855static unsigned int task_scan_max(struct task_struct *p)
856{
857 unsigned int smin = task_scan_min(p);
858 unsigned int smax;
859
860
861 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
862 return max(smin, smax);
863}
864
865static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
866{
867 rq->nr_numa_running += (p->numa_preferred_nid != -1);
868 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
869}
870
871static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
872{
873 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
874 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
875}
876
877struct numa_group {
878 atomic_t refcount;
879
880 spinlock_t lock;
881 int nr_tasks;
882 pid_t gid;
883
884 struct rcu_head rcu;
885 nodemask_t active_nodes;
886 unsigned long total_faults;
887
888
889
890
891
892 unsigned long *faults_cpu;
893 unsigned long faults[0];
894};
895
896
897#define NR_NUMA_HINT_FAULT_TYPES 2
898
899
900#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
901
902
903#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
904
905pid_t task_numa_group_id(struct task_struct *p)
906{
907 return p->numa_group ? p->numa_group->gid : 0;
908}
909
910
911
912
913
914
915
916static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
917{
918 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
919}
920
921static inline unsigned long task_faults(struct task_struct *p, int nid)
922{
923 if (!p->numa_faults)
924 return 0;
925
926 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
927 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
928}
929
930static inline unsigned long group_faults(struct task_struct *p, int nid)
931{
932 if (!p->numa_group)
933 return 0;
934
935 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
936 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
937}
938
939static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
940{
941 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
942 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
943}
944
945
946static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
947 int maxdist, bool task)
948{
949 unsigned long score = 0;
950 int node;
951
952
953
954
955
956 if (sched_numa_topology_type == NUMA_DIRECT)
957 return 0;
958
959
960
961
962
963 for_each_online_node(node) {
964 unsigned long faults;
965 int dist = node_distance(nid, node);
966
967
968
969
970
971 if (dist == sched_max_numa_distance || node == nid)
972 continue;
973
974
975
976
977
978
979
980
981 if (sched_numa_topology_type == NUMA_BACKPLANE &&
982 dist > maxdist)
983 continue;
984
985
986 if (task)
987 faults = task_faults(p, node);
988 else
989 faults = group_faults(p, node);
990
991
992
993
994
995
996
997
998
999 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1000 faults *= (sched_max_numa_distance - dist);
1001 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1002 }
1003
1004 score += faults;
1005 }
1006
1007 return score;
1008}
1009
1010
1011
1012
1013
1014
1015
1016static inline unsigned long task_weight(struct task_struct *p, int nid,
1017 int dist)
1018{
1019 unsigned long faults, total_faults;
1020
1021 if (!p->numa_faults)
1022 return 0;
1023
1024 total_faults = p->total_numa_faults;
1025
1026 if (!total_faults)
1027 return 0;
1028
1029 faults = task_faults(p, nid);
1030 faults += score_nearby_nodes(p, nid, dist, true);
1031
1032 return 1000 * faults / total_faults;
1033}
1034
1035static inline unsigned long group_weight(struct task_struct *p, int nid,
1036 int dist)
1037{
1038 unsigned long faults, total_faults;
1039
1040 if (!p->numa_group)
1041 return 0;
1042
1043 total_faults = p->numa_group->total_faults;
1044
1045 if (!total_faults)
1046 return 0;
1047
1048 faults = group_faults(p, nid);
1049 faults += score_nearby_nodes(p, nid, dist, false);
1050
1051 return 1000 * faults / total_faults;
1052}
1053
1054bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1055 int src_nid, int dst_cpu)
1056{
1057 struct numa_group *ng = p->numa_group;
1058 int dst_nid = cpu_to_node(dst_cpu);
1059 int last_cpupid, this_cpupid;
1060
1061 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1081 if (!cpupid_pid_unset(last_cpupid) &&
1082 cpupid_to_nid(last_cpupid) != dst_nid)
1083 return false;
1084
1085
1086 if (cpupid_match_pid(p, last_cpupid))
1087 return true;
1088
1089
1090 if (!ng)
1091 return true;
1092
1093
1094
1095
1096
1097 if (!node_isset(dst_nid, ng->active_nodes))
1098 return false;
1099
1100
1101
1102
1103
1104 if (!node_isset(src_nid, ng->active_nodes))
1105 return true;
1106
1107
1108
1109
1110
1111
1112
1113
1114 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1115}
1116
1117static unsigned long weighted_cpuload(const int cpu);
1118static unsigned long source_load(int cpu, int type);
1119static unsigned long target_load(int cpu, int type);
1120static unsigned long capacity_of(int cpu);
1121static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1122
1123
1124struct numa_stats {
1125 unsigned long nr_running;
1126 unsigned long load;
1127
1128
1129 unsigned long compute_capacity;
1130
1131
1132 unsigned long task_capacity;
1133 int has_free_capacity;
1134};
1135
1136
1137
1138
1139static void update_numa_stats(struct numa_stats *ns, int nid)
1140{
1141 int smt, cpu, cpus = 0;
1142 unsigned long capacity;
1143
1144 memset(ns, 0, sizeof(*ns));
1145 for_each_cpu(cpu, cpumask_of_node(nid)) {
1146 struct rq *rq = cpu_rq(cpu);
1147
1148 ns->nr_running += rq->nr_running;
1149 ns->load += weighted_cpuload(cpu);
1150 ns->compute_capacity += capacity_of(cpu);
1151
1152 cpus++;
1153 }
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163 if (!cpus)
1164 return;
1165
1166
1167 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1168 capacity = cpus / smt;
1169
1170 ns->task_capacity = min_t(unsigned, capacity,
1171 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1172 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1173}
1174
1175struct task_numa_env {
1176 struct task_struct *p;
1177
1178 int src_cpu, src_nid;
1179 int dst_cpu, dst_nid;
1180
1181 struct numa_stats src_stats, dst_stats;
1182
1183 int imbalance_pct;
1184 int dist;
1185
1186 struct task_struct *best_task;
1187 long best_imp;
1188 int best_cpu;
1189};
1190
1191static void task_numa_assign(struct task_numa_env *env,
1192 struct task_struct *p, long imp)
1193{
1194 if (env->best_task)
1195 put_task_struct(env->best_task);
1196 if (p)
1197 get_task_struct(p);
1198
1199 env->best_task = p;
1200 env->best_imp = imp;
1201 env->best_cpu = env->dst_cpu;
1202}
1203
1204static bool load_too_imbalanced(long src_load, long dst_load,
1205 struct task_numa_env *env)
1206{
1207 long imb, old_imb;
1208 long orig_src_load, orig_dst_load;
1209 long src_capacity, dst_capacity;
1210
1211
1212
1213
1214
1215
1216
1217
1218 src_capacity = env->src_stats.compute_capacity;
1219 dst_capacity = env->dst_stats.compute_capacity;
1220
1221
1222 if (dst_load < src_load)
1223 swap(dst_load, src_load);
1224
1225
1226 imb = dst_load * src_capacity * 100 -
1227 src_load * dst_capacity * env->imbalance_pct;
1228 if (imb <= 0)
1229 return false;
1230
1231
1232
1233
1234
1235 orig_src_load = env->src_stats.load;
1236 orig_dst_load = env->dst_stats.load;
1237
1238 if (orig_dst_load < orig_src_load)
1239 swap(orig_dst_load, orig_src_load);
1240
1241 old_imb = orig_dst_load * src_capacity * 100 -
1242 orig_src_load * dst_capacity * env->imbalance_pct;
1243
1244
1245 return (imb > old_imb);
1246}
1247
1248
1249
1250
1251
1252
1253
1254static void task_numa_compare(struct task_numa_env *env,
1255 long taskimp, long groupimp)
1256{
1257 struct rq *src_rq = cpu_rq(env->src_cpu);
1258 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1259 struct task_struct *cur;
1260 long src_load, dst_load;
1261 long load;
1262 long imp = env->p->numa_group ? groupimp : taskimp;
1263 long moveimp = imp;
1264 int dist = env->dist;
1265
1266 rcu_read_lock();
1267
1268 raw_spin_lock_irq(&dst_rq->lock);
1269 cur = dst_rq->curr;
1270
1271
1272
1273
1274
1275
1276
1277 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1278 cur = NULL;
1279 raw_spin_unlock_irq(&dst_rq->lock);
1280
1281
1282
1283
1284
1285 if (cur == env->p)
1286 goto unlock;
1287
1288
1289
1290
1291
1292
1293
1294
1295 if (cur) {
1296
1297 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1298 goto unlock;
1299
1300
1301
1302
1303
1304 if (cur->numa_group == env->p->numa_group) {
1305 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1306 task_weight(cur, env->dst_nid, dist);
1307
1308
1309
1310
1311 if (cur->numa_group)
1312 imp -= imp/16;
1313 } else {
1314
1315
1316
1317
1318
1319 if (cur->numa_group)
1320 imp += group_weight(cur, env->src_nid, dist) -
1321 group_weight(cur, env->dst_nid, dist);
1322 else
1323 imp += task_weight(cur, env->src_nid, dist) -
1324 task_weight(cur, env->dst_nid, dist);
1325 }
1326 }
1327
1328 if (imp <= env->best_imp && moveimp <= env->best_imp)
1329 goto unlock;
1330
1331 if (!cur) {
1332
1333 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1334 !env->dst_stats.has_free_capacity)
1335 goto unlock;
1336
1337 goto balance;
1338 }
1339
1340
1341 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1342 dst_rq->nr_running == 1)
1343 goto assign;
1344
1345
1346
1347
1348balance:
1349 load = task_h_load(env->p);
1350 dst_load = env->dst_stats.load + load;
1351 src_load = env->src_stats.load - load;
1352
1353 if (moveimp > imp && moveimp > env->best_imp) {
1354
1355
1356
1357
1358
1359
1360 if (!load_too_imbalanced(src_load, dst_load, env)) {
1361 imp = moveimp - 1;
1362 cur = NULL;
1363 goto assign;
1364 }
1365 }
1366
1367 if (imp <= env->best_imp)
1368 goto unlock;
1369
1370 if (cur) {
1371 load = task_h_load(cur);
1372 dst_load -= load;
1373 src_load += load;
1374 }
1375
1376 if (load_too_imbalanced(src_load, dst_load, env))
1377 goto unlock;
1378
1379
1380
1381
1382
1383 if (!cur)
1384 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1385
1386assign:
1387 task_numa_assign(env, cur, imp);
1388unlock:
1389 rcu_read_unlock();
1390}
1391
1392static void task_numa_find_cpu(struct task_numa_env *env,
1393 long taskimp, long groupimp)
1394{
1395 int cpu;
1396
1397 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1398
1399 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1400 continue;
1401
1402 env->dst_cpu = cpu;
1403 task_numa_compare(env, taskimp, groupimp);
1404 }
1405}
1406
1407
1408static bool numa_has_capacity(struct task_numa_env *env)
1409{
1410 struct numa_stats *src = &env->src_stats;
1411 struct numa_stats *dst = &env->dst_stats;
1412
1413 if (src->has_free_capacity && !dst->has_free_capacity)
1414 return false;
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424 if (src->load * dst->compute_capacity * env->imbalance_pct >
1425
1426 dst->load * src->compute_capacity * 100)
1427 return true;
1428
1429 return false;
1430}
1431
1432static int task_numa_migrate(struct task_struct *p)
1433{
1434 struct task_numa_env env = {
1435 .p = p,
1436
1437 .src_cpu = task_cpu(p),
1438 .src_nid = task_node(p),
1439
1440 .imbalance_pct = 112,
1441
1442 .best_task = NULL,
1443 .best_imp = 0,
1444 .best_cpu = -1
1445 };
1446 struct sched_domain *sd;
1447 unsigned long taskweight, groupweight;
1448 int nid, ret, dist;
1449 long taskimp, groupimp;
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459 rcu_read_lock();
1460 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1461 if (sd)
1462 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1463 rcu_read_unlock();
1464
1465
1466
1467
1468
1469
1470
1471 if (unlikely(!sd)) {
1472 p->numa_preferred_nid = task_node(p);
1473 return -EINVAL;
1474 }
1475
1476 env.dst_nid = p->numa_preferred_nid;
1477 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1478 taskweight = task_weight(p, env.src_nid, dist);
1479 groupweight = group_weight(p, env.src_nid, dist);
1480 update_numa_stats(&env.src_stats, env.src_nid);
1481 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1482 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1483 update_numa_stats(&env.dst_stats, env.dst_nid);
1484
1485
1486 if (numa_has_capacity(&env))
1487 task_numa_find_cpu(&env, taskimp, groupimp);
1488
1489
1490
1491
1492
1493
1494
1495
1496 if (env.best_cpu == -1 || (p->numa_group &&
1497 nodes_weight(p->numa_group->active_nodes) > 1)) {
1498 for_each_online_node(nid) {
1499 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1500 continue;
1501
1502 dist = node_distance(env.src_nid, env.dst_nid);
1503 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1504 dist != env.dist) {
1505 taskweight = task_weight(p, env.src_nid, dist);
1506 groupweight = group_weight(p, env.src_nid, dist);
1507 }
1508
1509
1510 taskimp = task_weight(p, nid, dist) - taskweight;
1511 groupimp = group_weight(p, nid, dist) - groupweight;
1512 if (taskimp < 0 && groupimp < 0)
1513 continue;
1514
1515 env.dist = dist;
1516 env.dst_nid = nid;
1517 update_numa_stats(&env.dst_stats, env.dst_nid);
1518 if (numa_has_capacity(&env))
1519 task_numa_find_cpu(&env, taskimp, groupimp);
1520 }
1521 }
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531 if (p->numa_group) {
1532 if (env.best_cpu == -1)
1533 nid = env.src_nid;
1534 else
1535 nid = env.dst_nid;
1536
1537 if (node_isset(nid, p->numa_group->active_nodes))
1538 sched_setnuma(p, env.dst_nid);
1539 }
1540
1541
1542 if (env.best_cpu == -1)
1543 return -EAGAIN;
1544
1545
1546
1547
1548
1549 p->numa_scan_period = task_scan_min(p);
1550
1551 if (env.best_task == NULL) {
1552 ret = migrate_task_to(p, env.best_cpu);
1553 if (ret != 0)
1554 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1555 return ret;
1556 }
1557
1558 ret = migrate_swap(p, env.best_task);
1559 if (ret != 0)
1560 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1561 put_task_struct(env.best_task);
1562 return ret;
1563}
1564
1565
1566static void numa_migrate_preferred(struct task_struct *p)
1567{
1568 unsigned long interval = HZ;
1569
1570
1571 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1572 return;
1573
1574
1575 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1576 p->numa_migrate_retry = jiffies + interval;
1577
1578
1579 if (task_node(p) == p->numa_preferred_nid)
1580 return;
1581
1582
1583 task_numa_migrate(p);
1584}
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597static void update_numa_active_node_mask(struct numa_group *numa_group)
1598{
1599 unsigned long faults, max_faults = 0;
1600 int nid;
1601
1602 for_each_online_node(nid) {
1603 faults = group_faults_cpu(numa_group, nid);
1604 if (faults > max_faults)
1605 max_faults = faults;
1606 }
1607
1608 for_each_online_node(nid) {
1609 faults = group_faults_cpu(numa_group, nid);
1610 if (!node_isset(nid, numa_group->active_nodes)) {
1611 if (faults > max_faults * 6 / 16)
1612 node_set(nid, numa_group->active_nodes);
1613 } else if (faults < max_faults * 3 / 16)
1614 node_clear(nid, numa_group->active_nodes);
1615 }
1616}
1617
1618
1619
1620
1621
1622
1623
1624
1625#define NUMA_PERIOD_SLOTS 10
1626#define NUMA_PERIOD_THRESHOLD 7
1627
1628
1629
1630
1631
1632
1633
1634static void update_task_scan_period(struct task_struct *p,
1635 unsigned long shared, unsigned long private)
1636{
1637 unsigned int period_slot;
1638 int ratio;
1639 int diff;
1640
1641 unsigned long remote = p->numa_faults_locality[0];
1642 unsigned long local = p->numa_faults_locality[1];
1643
1644
1645
1646
1647
1648
1649
1650
1651 if (local + shared == 0 || p->numa_faults_locality[2]) {
1652 p->numa_scan_period = min(p->numa_scan_period_max,
1653 p->numa_scan_period << 1);
1654
1655 p->mm->numa_next_scan = jiffies +
1656 msecs_to_jiffies(p->numa_scan_period);
1657
1658 return;
1659 }
1660
1661
1662
1663
1664
1665
1666
1667 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1668 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1669 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1670 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1671 if (!slot)
1672 slot = 1;
1673 diff = slot * period_slot;
1674 } else {
1675 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1686 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1687 }
1688
1689 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1690 task_scan_min(p), task_scan_max(p));
1691 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1692}
1693
1694
1695
1696
1697
1698
1699
1700
1701static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1702{
1703 u64 runtime, delta, now;
1704
1705 now = p->se.exec_start;
1706 runtime = p->se.sum_exec_runtime;
1707
1708 if (p->last_task_numa_placement) {
1709 delta = runtime - p->last_sum_exec_runtime;
1710 *period = now - p->last_task_numa_placement;
1711 } else {
1712 delta = p->se.avg.load_sum / p->se.load.weight;
1713 *period = LOAD_AVG_MAX;
1714 }
1715
1716 p->last_sum_exec_runtime = runtime;
1717 p->last_task_numa_placement = now;
1718
1719 return delta;
1720}
1721
1722
1723
1724
1725
1726
1727static int preferred_group_nid(struct task_struct *p, int nid)
1728{
1729 nodemask_t nodes;
1730 int dist;
1731
1732
1733 if (sched_numa_topology_type == NUMA_DIRECT)
1734 return nid;
1735
1736
1737
1738
1739
1740
1741 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1742 unsigned long score, max_score = 0;
1743 int node, max_node = nid;
1744
1745 dist = sched_max_numa_distance;
1746
1747 for_each_online_node(node) {
1748 score = group_weight(p, node, dist);
1749 if (score > max_score) {
1750 max_score = score;
1751 max_node = node;
1752 }
1753 }
1754 return max_node;
1755 }
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766 nodes = node_online_map;
1767 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1768 unsigned long max_faults = 0;
1769 nodemask_t max_group = NODE_MASK_NONE;
1770 int a, b;
1771
1772
1773 if (!find_numa_distance(dist))
1774 continue;
1775
1776 for_each_node_mask(a, nodes) {
1777 unsigned long faults = 0;
1778 nodemask_t this_group;
1779 nodes_clear(this_group);
1780
1781
1782 for_each_node_mask(b, nodes) {
1783 if (node_distance(a, b) < dist) {
1784 faults += group_faults(p, b);
1785 node_set(b, this_group);
1786 node_clear(b, nodes);
1787 }
1788 }
1789
1790
1791 if (faults > max_faults) {
1792 max_faults = faults;
1793 max_group = this_group;
1794
1795
1796
1797
1798
1799 nid = a;
1800 }
1801 }
1802
1803 if (!max_faults)
1804 break;
1805 nodes = max_group;
1806 }
1807 return nid;
1808}
1809
1810static void task_numa_placement(struct task_struct *p)
1811{
1812 int seq, nid, max_nid = -1, max_group_nid = -1;
1813 unsigned long max_faults = 0, max_group_faults = 0;
1814 unsigned long fault_types[2] = { 0, 0 };
1815 unsigned long total_faults;
1816 u64 runtime, period;
1817 spinlock_t *group_lock = NULL;
1818
1819
1820
1821
1822
1823
1824 seq = READ_ONCE(p->mm->numa_scan_seq);
1825 if (p->numa_scan_seq == seq)
1826 return;
1827 p->numa_scan_seq = seq;
1828 p->numa_scan_period_max = task_scan_max(p);
1829
1830 total_faults = p->numa_faults_locality[0] +
1831 p->numa_faults_locality[1];
1832 runtime = numa_get_avg_runtime(p, &period);
1833
1834
1835 if (p->numa_group) {
1836 group_lock = &p->numa_group->lock;
1837 spin_lock_irq(group_lock);
1838 }
1839
1840
1841 for_each_online_node(nid) {
1842
1843 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1844 unsigned long faults = 0, group_faults = 0;
1845 int priv;
1846
1847 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1848 long diff, f_diff, f_weight;
1849
1850 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1851 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1852 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1853 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1854
1855
1856 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1857 fault_types[priv] += p->numa_faults[membuf_idx];
1858 p->numa_faults[membuf_idx] = 0;
1859
1860
1861
1862
1863
1864
1865
1866
1867 f_weight = div64_u64(runtime << 16, period + 1);
1868 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1869 (total_faults + 1);
1870 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1871 p->numa_faults[cpubuf_idx] = 0;
1872
1873 p->numa_faults[mem_idx] += diff;
1874 p->numa_faults[cpu_idx] += f_diff;
1875 faults += p->numa_faults[mem_idx];
1876 p->total_numa_faults += diff;
1877 if (p->numa_group) {
1878
1879
1880
1881
1882
1883
1884
1885 p->numa_group->faults[mem_idx] += diff;
1886 p->numa_group->faults_cpu[mem_idx] += f_diff;
1887 p->numa_group->total_faults += diff;
1888 group_faults += p->numa_group->faults[mem_idx];
1889 }
1890 }
1891
1892 if (faults > max_faults) {
1893 max_faults = faults;
1894 max_nid = nid;
1895 }
1896
1897 if (group_faults > max_group_faults) {
1898 max_group_faults = group_faults;
1899 max_group_nid = nid;
1900 }
1901 }
1902
1903 update_task_scan_period(p, fault_types[0], fault_types[1]);
1904
1905 if (p->numa_group) {
1906 update_numa_active_node_mask(p->numa_group);
1907 spin_unlock_irq(group_lock);
1908 max_nid = preferred_group_nid(p, max_group_nid);
1909 }
1910
1911 if (max_faults) {
1912
1913 if (max_nid != p->numa_preferred_nid)
1914 sched_setnuma(p, max_nid);
1915
1916 if (task_node(p) != p->numa_preferred_nid)
1917 numa_migrate_preferred(p);
1918 }
1919}
1920
1921static inline int get_numa_group(struct numa_group *grp)
1922{
1923 return atomic_inc_not_zero(&grp->refcount);
1924}
1925
1926static inline void put_numa_group(struct numa_group *grp)
1927{
1928 if (atomic_dec_and_test(&grp->refcount))
1929 kfree_rcu(grp, rcu);
1930}
1931
1932static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1933 int *priv)
1934{
1935 struct numa_group *grp, *my_grp;
1936 struct task_struct *tsk;
1937 bool join = false;
1938 int cpu = cpupid_to_cpu(cpupid);
1939 int i;
1940
1941 if (unlikely(!p->numa_group)) {
1942 unsigned int size = sizeof(struct numa_group) +
1943 4*nr_node_ids*sizeof(unsigned long);
1944
1945 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1946 if (!grp)
1947 return;
1948
1949 atomic_set(&grp->refcount, 1);
1950 spin_lock_init(&grp->lock);
1951 grp->gid = p->pid;
1952
1953 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1954 nr_node_ids;
1955
1956 node_set(task_node(current), grp->active_nodes);
1957
1958 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1959 grp->faults[i] = p->numa_faults[i];
1960
1961 grp->total_faults = p->total_numa_faults;
1962
1963 grp->nr_tasks++;
1964 rcu_assign_pointer(p->numa_group, grp);
1965 }
1966
1967 rcu_read_lock();
1968 tsk = READ_ONCE(cpu_rq(cpu)->curr);
1969
1970 if (!cpupid_match_pid(tsk, cpupid))
1971 goto no_join;
1972
1973 grp = rcu_dereference(tsk->numa_group);
1974 if (!grp)
1975 goto no_join;
1976
1977 my_grp = p->numa_group;
1978 if (grp == my_grp)
1979 goto no_join;
1980
1981
1982
1983
1984
1985 if (my_grp->nr_tasks > grp->nr_tasks)
1986 goto no_join;
1987
1988
1989
1990
1991 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1992 goto no_join;
1993
1994
1995 if (tsk->mm == current->mm)
1996 join = true;
1997
1998
1999 if (flags & TNF_SHARED)
2000 join = true;
2001
2002
2003 *priv = !join;
2004
2005 if (join && !get_numa_group(grp))
2006 goto no_join;
2007
2008 rcu_read_unlock();
2009
2010 if (!join)
2011 return;
2012
2013 BUG_ON(irqs_disabled());
2014 double_lock_irq(&my_grp->lock, &grp->lock);
2015
2016 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2017 my_grp->faults[i] -= p->numa_faults[i];
2018 grp->faults[i] += p->numa_faults[i];
2019 }
2020 my_grp->total_faults -= p->total_numa_faults;
2021 grp->total_faults += p->total_numa_faults;
2022
2023 my_grp->nr_tasks--;
2024 grp->nr_tasks++;
2025
2026 spin_unlock(&my_grp->lock);
2027 spin_unlock_irq(&grp->lock);
2028
2029 rcu_assign_pointer(p->numa_group, grp);
2030
2031 put_numa_group(my_grp);
2032 return;
2033
2034no_join:
2035 rcu_read_unlock();
2036 return;
2037}
2038
2039void task_numa_free(struct task_struct *p)
2040{
2041 struct numa_group *grp = p->numa_group;
2042 void *numa_faults = p->numa_faults;
2043 unsigned long flags;
2044 int i;
2045
2046 if (grp) {
2047 spin_lock_irqsave(&grp->lock, flags);
2048 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2049 grp->faults[i] -= p->numa_faults[i];
2050 grp->total_faults -= p->total_numa_faults;
2051
2052 grp->nr_tasks--;
2053 spin_unlock_irqrestore(&grp->lock, flags);
2054 RCU_INIT_POINTER(p->numa_group, NULL);
2055 put_numa_group(grp);
2056 }
2057
2058 p->numa_faults = NULL;
2059 kfree(numa_faults);
2060}
2061
2062
2063
2064
2065void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2066{
2067 struct task_struct *p = current;
2068 bool migrated = flags & TNF_MIGRATED;
2069 int cpu_node = task_node(current);
2070 int local = !!(flags & TNF_FAULT_LOCAL);
2071 int priv;
2072
2073 if (!static_branch_likely(&sched_numa_balancing))
2074 return;
2075
2076
2077 if (!p->mm)
2078 return;
2079
2080
2081 if (unlikely(!p->numa_faults)) {
2082 int size = sizeof(*p->numa_faults) *
2083 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2084
2085 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2086 if (!p->numa_faults)
2087 return;
2088
2089 p->total_numa_faults = 0;
2090 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2091 }
2092
2093
2094
2095
2096
2097 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2098 priv = 1;
2099 } else {
2100 priv = cpupid_match_pid(p, last_cpupid);
2101 if (!priv && !(flags & TNF_NO_GROUP))
2102 task_numa_group(p, last_cpupid, flags, &priv);
2103 }
2104
2105
2106
2107
2108
2109
2110
2111 if (!priv && !local && p->numa_group &&
2112 node_isset(cpu_node, p->numa_group->active_nodes) &&
2113 node_isset(mem_node, p->numa_group->active_nodes))
2114 local = 1;
2115
2116 task_numa_placement(p);
2117
2118
2119
2120
2121
2122 if (time_after(jiffies, p->numa_migrate_retry))
2123 numa_migrate_preferred(p);
2124
2125 if (migrated)
2126 p->numa_pages_migrated += pages;
2127 if (flags & TNF_MIGRATE_FAIL)
2128 p->numa_faults_locality[2] += pages;
2129
2130 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2131 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2132 p->numa_faults_locality[local] += pages;
2133}
2134
2135static void reset_ptenuma_scan(struct task_struct *p)
2136{
2137
2138
2139
2140
2141
2142
2143
2144
2145 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2146 p->mm->numa_scan_offset = 0;
2147}
2148
2149
2150
2151
2152
2153void task_numa_work(struct callback_head *work)
2154{
2155 unsigned long migrate, next_scan, now = jiffies;
2156 struct task_struct *p = current;
2157 struct mm_struct *mm = p->mm;
2158 struct vm_area_struct *vma;
2159 unsigned long start, end;
2160 unsigned long nr_pte_updates = 0;
2161 long pages, virtpages;
2162
2163 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2164
2165 work->next = work;
2166
2167
2168
2169
2170
2171
2172
2173
2174 if (p->flags & PF_EXITING)
2175 return;
2176
2177 if (!mm->numa_next_scan) {
2178 mm->numa_next_scan = now +
2179 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2180 }
2181
2182
2183
2184
2185 migrate = mm->numa_next_scan;
2186 if (time_before(now, migrate))
2187 return;
2188
2189 if (p->numa_scan_period == 0) {
2190 p->numa_scan_period_max = task_scan_max(p);
2191 p->numa_scan_period = task_scan_min(p);
2192 }
2193
2194 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2195 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2196 return;
2197
2198
2199
2200
2201
2202 p->node_stamp += 2 * TICK_NSEC;
2203
2204 start = mm->numa_scan_offset;
2205 pages = sysctl_numa_balancing_scan_size;
2206 pages <<= 20 - PAGE_SHIFT;
2207 virtpages = pages * 8;
2208 if (!pages)
2209 return;
2210
2211
2212 down_read(&mm->mmap_sem);
2213 vma = find_vma(mm, start);
2214 if (!vma) {
2215 reset_ptenuma_scan(p);
2216 start = 0;
2217 vma = mm->mmap;
2218 }
2219 for (; vma; vma = vma->vm_next) {
2220 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2221 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2222 continue;
2223 }
2224
2225
2226
2227
2228
2229
2230
2231 if (!vma->vm_mm ||
2232 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2233 continue;
2234
2235
2236
2237
2238
2239 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2240 continue;
2241
2242 do {
2243 start = max(start, vma->vm_start);
2244 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2245 end = min(end, vma->vm_end);
2246 nr_pte_updates = change_prot_numa(vma, start, end);
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256 if (nr_pte_updates)
2257 pages -= (end - start) >> PAGE_SHIFT;
2258 virtpages -= (end - start) >> PAGE_SHIFT;
2259
2260 start = end;
2261 if (pages <= 0 || virtpages <= 0)
2262 goto out;
2263
2264 cond_resched();
2265 } while (end != vma->vm_end);
2266 }
2267
2268out:
2269
2270
2271
2272
2273
2274
2275 if (vma)
2276 mm->numa_scan_offset = start;
2277 else
2278 reset_ptenuma_scan(p);
2279 up_read(&mm->mmap_sem);
2280}
2281
2282
2283
2284
2285void task_tick_numa(struct rq *rq, struct task_struct *curr)
2286{
2287 struct callback_head *work = &curr->numa_work;
2288 u64 period, now;
2289
2290
2291
2292
2293 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2294 return;
2295
2296
2297
2298
2299
2300
2301
2302 now = curr->se.sum_exec_runtime;
2303 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2304
2305 if (now > curr->node_stamp + period) {
2306 if (!curr->node_stamp)
2307 curr->numa_scan_period = task_scan_min(curr);
2308 curr->node_stamp += period;
2309
2310 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2311 init_task_work(work, task_numa_work);
2312 task_work_add(curr, work, true);
2313 }
2314 }
2315}
2316#else
2317static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2318{
2319}
2320
2321static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2322{
2323}
2324
2325static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2326{
2327}
2328#endif
2329
2330static void
2331account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2332{
2333 update_load_add(&cfs_rq->load, se->load.weight);
2334 if (!parent_entity(se))
2335 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2336#ifdef CONFIG_SMP
2337 if (entity_is_task(se)) {
2338 struct rq *rq = rq_of(cfs_rq);
2339
2340 account_numa_enqueue(rq, task_of(se));
2341 list_add(&se->group_node, &rq->cfs_tasks);
2342 }
2343#endif
2344 cfs_rq->nr_running++;
2345}
2346
2347static void
2348account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2349{
2350 update_load_sub(&cfs_rq->load, se->load.weight);
2351 if (!parent_entity(se))
2352 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2353 if (entity_is_task(se)) {
2354 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2355 list_del_init(&se->group_node);
2356 }
2357 cfs_rq->nr_running--;
2358}
2359
2360#ifdef CONFIG_FAIR_GROUP_SCHED
2361# ifdef CONFIG_SMP
2362static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2363{
2364 long tg_weight;
2365
2366
2367
2368
2369
2370
2371 tg_weight = atomic_long_read(&tg->load_avg);
2372 tg_weight -= cfs_rq->tg_load_avg_contrib;
2373 tg_weight += cfs_rq->load.weight;
2374
2375 return tg_weight;
2376}
2377
2378static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2379{
2380 long tg_weight, load, shares;
2381
2382 tg_weight = calc_tg_weight(tg, cfs_rq);
2383 load = cfs_rq->load.weight;
2384
2385 shares = (tg->shares * load);
2386 if (tg_weight)
2387 shares /= tg_weight;
2388
2389 if (shares < MIN_SHARES)
2390 shares = MIN_SHARES;
2391 if (shares > tg->shares)
2392 shares = tg->shares;
2393
2394 return shares;
2395}
2396# else
2397static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2398{
2399 return tg->shares;
2400}
2401# endif
2402static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2403 unsigned long weight)
2404{
2405 if (se->on_rq) {
2406
2407 if (cfs_rq->curr == se)
2408 update_curr(cfs_rq);
2409 account_entity_dequeue(cfs_rq, se);
2410 }
2411
2412 update_load_set(&se->load, weight);
2413
2414 if (se->on_rq)
2415 account_entity_enqueue(cfs_rq, se);
2416}
2417
2418static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2419
2420static void update_cfs_shares(struct cfs_rq *cfs_rq)
2421{
2422 struct task_group *tg;
2423 struct sched_entity *se;
2424 long shares;
2425
2426 tg = cfs_rq->tg;
2427 se = tg->se[cpu_of(rq_of(cfs_rq))];
2428 if (!se || throttled_hierarchy(cfs_rq))
2429 return;
2430#ifndef CONFIG_SMP
2431 if (likely(se->load.weight == tg->shares))
2432 return;
2433#endif
2434 shares = calc_cfs_shares(cfs_rq, tg);
2435
2436 reweight_entity(cfs_rq_of(se), se, shares);
2437}
2438#else
2439static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2440{
2441}
2442#endif
2443
2444#ifdef CONFIG_SMP
2445
2446static const u32 runnable_avg_yN_inv[] = {
2447 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2448 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2449 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2450 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2451 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2452 0x85aac367, 0x82cd8698,
2453};
2454
2455
2456
2457
2458
2459static const u32 runnable_avg_yN_sum[] = {
2460 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2461 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2462 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2463};
2464
2465
2466
2467
2468
2469static __always_inline u64 decay_load(u64 val, u64 n)
2470{
2471 unsigned int local_n;
2472
2473 if (!n)
2474 return val;
2475 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2476 return 0;
2477
2478
2479 local_n = n;
2480
2481
2482
2483
2484
2485
2486
2487
2488 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2489 val >>= local_n / LOAD_AVG_PERIOD;
2490 local_n %= LOAD_AVG_PERIOD;
2491 }
2492
2493 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2494 return val;
2495}
2496
2497
2498
2499
2500
2501
2502
2503
2504static u32 __compute_runnable_contrib(u64 n)
2505{
2506 u32 contrib = 0;
2507
2508 if (likely(n <= LOAD_AVG_PERIOD))
2509 return runnable_avg_yN_sum[n];
2510 else if (unlikely(n >= LOAD_AVG_MAX_N))
2511 return LOAD_AVG_MAX;
2512
2513
2514 do {
2515 contrib /= 2;
2516 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2517
2518 n -= LOAD_AVG_PERIOD;
2519 } while (n > LOAD_AVG_PERIOD);
2520
2521 contrib = decay_load(contrib, n);
2522 return contrib + runnable_avg_yN_sum[n];
2523}
2524
2525#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2526#error "load tracking assumes 2^10 as unit"
2527#endif
2528
2529#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559static __always_inline int
2560__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2561 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2562{
2563 u64 delta, scaled_delta, periods;
2564 u32 contrib;
2565 unsigned int delta_w, scaled_delta_w, decayed = 0;
2566 unsigned long scale_freq, scale_cpu;
2567
2568 delta = now - sa->last_update_time;
2569
2570
2571
2572
2573 if ((s64)delta < 0) {
2574 sa->last_update_time = now;
2575 return 0;
2576 }
2577
2578
2579
2580
2581
2582 delta >>= 10;
2583 if (!delta)
2584 return 0;
2585 sa->last_update_time = now;
2586
2587 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2588 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2589
2590
2591 delta_w = sa->period_contrib;
2592 if (delta + delta_w >= 1024) {
2593 decayed = 1;
2594
2595
2596 sa->period_contrib = 0;
2597
2598
2599
2600
2601
2602
2603 delta_w = 1024 - delta_w;
2604 scaled_delta_w = cap_scale(delta_w, scale_freq);
2605 if (weight) {
2606 sa->load_sum += weight * scaled_delta_w;
2607 if (cfs_rq) {
2608 cfs_rq->runnable_load_sum +=
2609 weight * scaled_delta_w;
2610 }
2611 }
2612 if (running)
2613 sa->util_sum += scaled_delta_w * scale_cpu;
2614
2615 delta -= delta_w;
2616
2617
2618 periods = delta / 1024;
2619 delta %= 1024;
2620
2621 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2622 if (cfs_rq) {
2623 cfs_rq->runnable_load_sum =
2624 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2625 }
2626 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2627
2628
2629 contrib = __compute_runnable_contrib(periods);
2630 contrib = cap_scale(contrib, scale_freq);
2631 if (weight) {
2632 sa->load_sum += weight * contrib;
2633 if (cfs_rq)
2634 cfs_rq->runnable_load_sum += weight * contrib;
2635 }
2636 if (running)
2637 sa->util_sum += contrib * scale_cpu;
2638 }
2639
2640
2641 scaled_delta = cap_scale(delta, scale_freq);
2642 if (weight) {
2643 sa->load_sum += weight * scaled_delta;
2644 if (cfs_rq)
2645 cfs_rq->runnable_load_sum += weight * scaled_delta;
2646 }
2647 if (running)
2648 sa->util_sum += scaled_delta * scale_cpu;
2649
2650 sa->period_contrib += delta;
2651
2652 if (decayed) {
2653 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2654 if (cfs_rq) {
2655 cfs_rq->runnable_load_avg =
2656 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2657 }
2658 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2659 }
2660
2661 return decayed;
2662}
2663
2664#ifdef CONFIG_FAIR_GROUP_SCHED
2665
2666
2667
2668
2669static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2670{
2671 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2672
2673 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2674 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2675 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2676 }
2677}
2678
2679#else
2680static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2681#endif
2682
2683static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2684
2685
2686static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2687{
2688 struct sched_avg *sa = &cfs_rq->avg;
2689 int decayed, removed = 0;
2690
2691 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2692 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2693 sa->load_avg = max_t(long, sa->load_avg - r, 0);
2694 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2695 removed = 1;
2696 }
2697
2698 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2699 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2700 sa->util_avg = max_t(long, sa->util_avg - r, 0);
2701 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2702 }
2703
2704 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2705 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2706
2707#ifndef CONFIG_64BIT
2708 smp_wmb();
2709 cfs_rq->load_last_update_time_copy = sa->last_update_time;
2710#endif
2711
2712 return decayed || removed;
2713}
2714
2715
2716static inline void update_load_avg(struct sched_entity *se, int update_tg)
2717{
2718 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2719 u64 now = cfs_rq_clock_task(cfs_rq);
2720 int cpu = cpu_of(rq_of(cfs_rq));
2721
2722
2723
2724
2725
2726 __update_load_avg(now, cpu, &se->avg,
2727 se->on_rq * scale_load_down(se->load.weight),
2728 cfs_rq->curr == se, NULL);
2729
2730 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2731 update_tg_load_avg(cfs_rq, 0);
2732}
2733
2734static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2735{
2736 if (!sched_feat(ATTACH_AGE_LOAD))
2737 goto skip_aging;
2738
2739
2740
2741
2742
2743 if (se->avg.last_update_time) {
2744 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2745 &se->avg, 0, 0, NULL);
2746
2747
2748
2749
2750
2751 }
2752
2753skip_aging:
2754 se->avg.last_update_time = cfs_rq->avg.last_update_time;
2755 cfs_rq->avg.load_avg += se->avg.load_avg;
2756 cfs_rq->avg.load_sum += se->avg.load_sum;
2757 cfs_rq->avg.util_avg += se->avg.util_avg;
2758 cfs_rq->avg.util_sum += se->avg.util_sum;
2759}
2760
2761static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
2763 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2764 &se->avg, se->on_rq * scale_load_down(se->load.weight),
2765 cfs_rq->curr == se, NULL);
2766
2767 cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2768 cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2769 cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
2770 cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2771}
2772
2773
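/* Add the load generated by se into cfs_rq's load average */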
2774static inline void
2775enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2776{
2777 struct sched_avg *sa = &se->avg;
2778 u64 now = cfs_rq_clock_task(cfs_rq);
2779 int migrated, decayed;
2780
2781 migrated = !sa->last_update_time;
2782 if (!migrated) {
2783 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2784 se->on_rq * scale_load_down(se->load.weight),
2785 cfs_rq->curr == se, NULL);
2786 }
2787
2788 decayed = update_cfs_rq_load_avg(now, cfs_rq);
2789
2790 cfs_rq->runnable_load_avg += sa->load_avg;
2791 cfs_rq->runnable_load_sum += sa->load_sum;
2792
2793 if (migrated)
2794 attach_entity_load_avg(cfs_rq, se);
2795
2796 if (decayed || migrated)
2797 update_tg_load_avg(cfs_rq, 0);
2798}
2799
2800
2801static inline void
2802dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2803{
2804 update_load_avg(se, 1);
2805
2806 cfs_rq->runnable_load_avg =
2807 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2808 cfs_rq->runnable_load_sum =
2809 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2810}
2811
2812
2813
2814
2815
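/*
 * Task is done with this cfs_rq: age its averages one last time and park
 * them in the removed_* counters, so update_cfs_rq_load_avg() can subtract
 * them later without needing the rq lock here.
 */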
2816void remove_entity_load_avg(struct sched_entity *se)
2817{
2818 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2819 u64 last_update_time;
2820
2821#ifndef CONFIG_64BIT
2822 u64 last_update_time_copy;
2823
2824 do {
2825 last_update_time_copy = cfs_rq->load_last_update_time_copy;
2826 smp_rmb();
2827 last_update_time = cfs_rq->avg.last_update_time;
2828 } while (last_update_time != last_update_time_copy);
2829#else
2830 last_update_time = cfs_rq->avg.last_update_time;
2831#endif
2832
2833 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2834 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2835 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2836}
2837
2838
2839
2840
2841
2842
2843void idle_enter_fair(struct rq *this_rq)
2844{
2845}
2846
2847
2848
2849
2850
2851
2852void idle_exit_fair(struct rq *this_rq)
2853{
2854}
2855
2856static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
2857{
2858 return cfs_rq->runnable_load_avg;
2859}
2860
2861static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
2862{
2863 return cfs_rq->avg.load_avg;
2864}
2865
2866static int idle_balance(struct rq *this_rq);
2867
2868#else
2869
2870static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
2871static inline void
2872enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2873static inline void
2874dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2875static inline void remove_entity_load_avg(struct sched_entity *se) {}
2876
2877static inline void
2878attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2879static inline void
2880detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2881
2882static inline int idle_balance(struct rq *rq)
2883{
2884 return 0;
2885}
2886
2887#endif
2888
2889static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2890{
2891#ifdef CONFIG_SCHEDSTATS
2892 struct task_struct *tsk = NULL;
2893
2894 if (entity_is_task(se))
2895 tsk = task_of(se);
2896
2897 if (se->statistics.sleep_start) {
2898 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2899
2900 if ((s64)delta < 0)
2901 delta = 0;
2902
2903 if (unlikely(delta > se->statistics.sleep_max))
2904 se->statistics.sleep_max = delta;
2905
2906 se->statistics.sleep_start = 0;
2907 se->statistics.sum_sleep_runtime += delta;
2908
2909 if (tsk) {
2910 account_scheduler_latency(tsk, delta >> 10, 1);
2911 trace_sched_stat_sleep(tsk, delta);
2912 }
2913 }
2914 if (se->statistics.block_start) {
2915 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2916
2917 if ((s64)delta < 0)
2918 delta = 0;
2919
2920 if (unlikely(delta > se->statistics.block_max))
2921 se->statistics.block_max = delta;
2922
2923 se->statistics.block_start = 0;
2924 se->statistics.sum_sleep_runtime += delta;
2925
2926 if (tsk) {
2927 if (tsk->in_iowait) {
2928 se->statistics.iowait_sum += delta;
2929 se->statistics.iowait_count++;
2930 trace_sched_stat_iowait(tsk, delta);
2931 }
2932
2933 trace_sched_stat_blocked(tsk, delta);
2934
2935
2936
2937
2938
2939
2940 if (unlikely(prof_on == SLEEP_PROFILING)) {
2941 profile_hits(SLEEP_PROFILING,
2942 (void *)get_wchan(tsk),
2943 delta >> 20);
2944 }
2945 account_scheduler_latency(tsk, delta >> 10, 0);
2946 }
2947 }
2948#endif
2949}
2950
2951static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2952{
2953#ifdef CONFIG_SCHED_DEBUG
2954 s64 d = se->vruntime - cfs_rq->min_vruntime;
2955
2956 if (d < 0)
2957 d = -d;
2958
2959 if (d > 3*sysctl_sched_latency)
2960 schedstat_inc(cfs_rq, nr_spread_over);
2961#endif
2962}
2963
2964static void
2965place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2966{
2967 u64 vruntime = cfs_rq->min_vruntime;
2968
2969
2970
2971
2972
2973
2974
2975 if (initial && sched_feat(START_DEBIT))
2976 vruntime += sched_vslice(cfs_rq, se);
2977
2978
2979 if (!initial) {
2980 unsigned long thresh = sysctl_sched_latency;
2981
2982
2983
2984
2985
2986 if (sched_feat(GENTLE_FAIR_SLEEPERS))
2987 thresh >>= 1;
2988
2989 vruntime -= thresh;
2990 }
2991
2992
2993 se->vruntime = max_vruntime(se->vruntime, vruntime);
2994}
2995
2996static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2997
2998static void
2999enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3000{
3001
3002
3003
3004
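	/*
	 * An entity arrives here with a vruntime that was normalized
	 * (min_vruntime subtracted) on dequeue or in task_waking_fair();
	 * re-base it against this runqueue's min_vruntime.
	 */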
3005 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3006 se->vruntime += cfs_rq->min_vruntime;
3007
3008
3009
3010
3011 update_curr(cfs_rq);
3012 enqueue_entity_load_avg(cfs_rq, se);
3013 account_entity_enqueue(cfs_rq, se);
3014 update_cfs_shares(cfs_rq);
3015
3016 if (flags & ENQUEUE_WAKEUP) {
3017 place_entity(cfs_rq, se, 0);
3018 enqueue_sleeper(cfs_rq, se);
3019 }
3020
3021 update_stats_enqueue(cfs_rq, se);
3022 check_spread(cfs_rq, se);
3023 if (se != cfs_rq->curr)
3024 __enqueue_entity(cfs_rq, se);
3025 se->on_rq = 1;
3026
3027 if (cfs_rq->nr_running == 1) {
3028 list_add_leaf_cfs_rq(cfs_rq);
3029 check_enqueue_throttle(cfs_rq);
3030 }
3031}
3032
3033static void __clear_buddies_last(struct sched_entity *se)
3034{
3035 for_each_sched_entity(se) {
3036 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3037 if (cfs_rq->last != se)
3038 break;
3039
3040 cfs_rq->last = NULL;
3041 }
3042}
3043
3044static void __clear_buddies_next(struct sched_entity *se)
3045{
3046 for_each_sched_entity(se) {
3047 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3048 if (cfs_rq->next != se)
3049 break;
3050
3051 cfs_rq->next = NULL;
3052 }
3053}
3054
3055static void __clear_buddies_skip(struct sched_entity *se)
3056{
3057 for_each_sched_entity(se) {
3058 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3059 if (cfs_rq->skip != se)
3060 break;
3061
3062 cfs_rq->skip = NULL;
3063 }
3064}
3065
3066static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3067{
3068 if (cfs_rq->last == se)
3069 __clear_buddies_last(se);
3070
3071 if (cfs_rq->next == se)
3072 __clear_buddies_next(se);
3073
3074 if (cfs_rq->skip == se)
3075 __clear_buddies_skip(se);
3076}
3077
3078static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3079
3080static void
3081dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3082{
3083
3084
3085
3086 update_curr(cfs_rq);
3087 dequeue_entity_load_avg(cfs_rq, se);
3088
3089 update_stats_dequeue(cfs_rq, se);
3090 if (flags & DEQUEUE_SLEEP) {
3091#ifdef CONFIG_SCHEDSTATS
3092 if (entity_is_task(se)) {
3093 struct task_struct *tsk = task_of(se);
3094
3095 if (tsk->state & TASK_INTERRUPTIBLE)
3096 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3097 if (tsk->state & TASK_UNINTERRUPTIBLE)
3098 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3099 }
3100#endif
3101 }
3102
3103 clear_buddies(cfs_rq, se);
3104
3105 if (se != cfs_rq->curr)
3106 __dequeue_entity(cfs_rq, se);
3107 se->on_rq = 0;
3108 account_entity_dequeue(cfs_rq, se);
3109
3110
3111
3112
3113
3114
3115 if (!(flags & DEQUEUE_SLEEP))
3116 se->vruntime -= cfs_rq->min_vruntime;
3117
3118
3119 return_cfs_rq_runtime(cfs_rq);
3120
3121 update_min_vruntime(cfs_rq);
3122 update_cfs_shares(cfs_rq);
3123}
3124
3125
3126
3127
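/*
 * Called from the tick: resched the current entity once it has consumed its
 * ideal slice, or once it has drifted far enough ahead of the leftmost
 * entity in the tree.
 */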
3128static void
3129check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3130{
3131 unsigned long ideal_runtime, delta_exec;
3132 struct sched_entity *se;
3133 s64 delta;
3134
3135 ideal_runtime = sched_slice(cfs_rq, curr);
3136 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3137 if (delta_exec > ideal_runtime) {
3138 resched_curr(rq_of(cfs_rq));
3139
3140
3141
3142
3143 clear_buddies(cfs_rq, curr);
3144 return;
3145 }
3146
3147
3148
3149
3150
3151
3152 if (delta_exec < sysctl_sched_min_granularity)
3153 return;
3154
3155 se = __pick_first_entity(cfs_rq);
3156 delta = curr->vruntime - se->vruntime;
3157
3158 if (delta < 0)
3159 return;
3160
3161 if (delta > ideal_runtime)
3162 resched_curr(rq_of(cfs_rq));
3163}
3164
3165static void
3166set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3167{
3168
3169 if (se->on_rq) {
3170
3171
3172
3173
3174
3175 update_stats_wait_end(cfs_rq, se);
3176 __dequeue_entity(cfs_rq, se);
3177 update_load_avg(se, 1);
3178 }
3179
3180 update_stats_curr_start(cfs_rq, se);
3181 cfs_rq->curr = se;
3182#ifdef CONFIG_SCHEDSTATS
3183
3184
3185
3186
3187
3188 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3189 se->statistics.slice_max = max(se->statistics.slice_max,
3190 se->sum_exec_runtime - se->prev_sum_exec_runtime);
3191 }
3192#endif
3193 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3194}
3195
3196static int
3197wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3198
3199
3200
3201
3202
3203
3204
3205
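/*
 * Pick the next entity to run: start from the leftmost (smallest vruntime)
 * entity, avoid the "skip" buddy, and prefer the "next"/"last" buddies when
 * doing so does not hurt fairness too much (wakeup_preempt_entity() < 1).
 */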
3206static struct sched_entity *
3207pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3208{
3209 struct sched_entity *left = __pick_first_entity(cfs_rq);
3210 struct sched_entity *se;
3211
3212
3213
3214
3215
3216 if (!left || (curr && entity_before(curr, left)))
3217 left = curr;
3218
3219 se = left;
3220
3221
3222
3223
3224
3225 if (cfs_rq->skip == se) {
3226 struct sched_entity *second;
3227
3228 if (se == curr) {
3229 second = __pick_first_entity(cfs_rq);
3230 } else {
3231 second = __pick_next_entity(se);
3232 if (!second || (curr && entity_before(curr, second)))
3233 second = curr;
3234 }
3235
3236 if (second && wakeup_preempt_entity(second, left) < 1)
3237 se = second;
3238 }
3239
3240
3241
3242
3243 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3244 se = cfs_rq->last;
3245
3246
3247
3248
3249 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3250 se = cfs_rq->next;
3251
3252 clear_buddies(cfs_rq, se);
3253
3254 return se;
3255}
3256
3257static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3258
3259static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3260{
3261
3262
3263
3264
3265 if (prev->on_rq)
3266 update_curr(cfs_rq);
3267
3268
3269 check_cfs_rq_runtime(cfs_rq);
3270
3271 check_spread(cfs_rq, prev);
3272 if (prev->on_rq) {
3273 update_stats_wait_start(cfs_rq, prev);
3274
3275 __enqueue_entity(cfs_rq, prev);
3276
3277 update_load_avg(prev, 0);
3278 }
3279 cfs_rq->curr = NULL;
3280}
3281
3282static void
3283entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3284{
3285
3286
3287
3288 update_curr(cfs_rq);
3289
3290
3291
3292
3293 update_load_avg(curr, 1);
3294 update_cfs_shares(cfs_rq);
3295
3296#ifdef CONFIG_SCHED_HRTICK
3297
3298
3299
3300
3301 if (queued) {
3302 resched_curr(rq_of(cfs_rq));
3303 return;
3304 }
3305
3306
3307
3308 if (!sched_feat(DOUBLE_TICK) &&
3309 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3310 return;
3311#endif
3312
3313 if (cfs_rq->nr_running > 1)
3314 check_preempt_tick(cfs_rq, curr);
3315}
3316
3317
3318
3319
3320
3321
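/**************************************************
 * CFS bandwidth control machinery
 */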
3322#ifdef CONFIG_CFS_BANDWIDTH
3323
3324#ifdef HAVE_JUMP_LABEL
3325static struct static_key __cfs_bandwidth_used;
3326
3327static inline bool cfs_bandwidth_used(void)
3328{
3329 return static_key_false(&__cfs_bandwidth_used);
3330}
3331
3332void cfs_bandwidth_usage_inc(void)
3333{
3334 static_key_slow_inc(&__cfs_bandwidth_used);
3335}
3336
3337void cfs_bandwidth_usage_dec(void)
3338{
3339 static_key_slow_dec(&__cfs_bandwidth_used);
3340}
3341#else
3342static bool cfs_bandwidth_used(void)
3343{
3344 return true;
3345}
3346
3347void cfs_bandwidth_usage_inc(void) {}
3348void cfs_bandwidth_usage_dec(void) {}
3349#endif
3350
3351
3352
3353
3354
3355static inline u64 default_cfs_period(void)
3356{
3357 return 100000000ULL;
3358}
3359
3360static inline u64 sched_cfs_bandwidth_slice(void)
3361{
3362 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3363}
3364
3365
3366
3367
3368
3369
3370
3371
3372void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3373{
3374 u64 now;
3375
3376 if (cfs_b->quota == RUNTIME_INF)
3377 return;
3378
3379 now = sched_clock_cpu(smp_processor_id());
3380 cfs_b->runtime = cfs_b->quota;
3381 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3382}
3383
3384static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3385{
3386 return &tg->cfs_bandwidth;
3387}
3388
3389
3390static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3391{
3392 if (unlikely(cfs_rq->throttle_count))
3393 return cfs_rq->throttled_clock_task;
3394
3395 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3396}
3397
3398
3399static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3400{
3401 struct task_group *tg = cfs_rq->tg;
3402 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3403 u64 amount = 0, min_amount, expires;
3404
3405
3406 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3407
3408 raw_spin_lock(&cfs_b->lock);
3409 if (cfs_b->quota == RUNTIME_INF)
3410 amount = min_amount;
3411 else {
3412 start_cfs_bandwidth(cfs_b);
3413
3414 if (cfs_b->runtime > 0) {
3415 amount = min(cfs_b->runtime, min_amount);
3416 cfs_b->runtime -= amount;
3417 cfs_b->idle = 0;
3418 }
3419 }
3420 expires = cfs_b->runtime_expires;
3421 raw_spin_unlock(&cfs_b->lock);
3422
3423 cfs_rq->runtime_remaining += amount;
3424
3425
3426
3427
3428
3429 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3430 cfs_rq->runtime_expires = expires;
3431
3432 return cfs_rq->runtime_remaining > 0;
3433}
3434
3435
3436
3437
3438
3439static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3440{
3441 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3442
3443
3444 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3445 return;
3446
3447 if (cfs_rq->runtime_remaining < 0)
3448 return;
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3462
3463 cfs_rq->runtime_expires += TICK_NSEC;
3464 } else {
3465
3466 cfs_rq->runtime_remaining = 0;
3467 }
3468}
3469
3470static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3471{
3472
3473 cfs_rq->runtime_remaining -= delta_exec;
3474 expire_cfs_rq_runtime(cfs_rq);
3475
3476 if (likely(cfs_rq->runtime_remaining > 0))
3477 return;
3478
3479
3480
3481
3482
3483 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3484 resched_curr(rq_of(cfs_rq));
3485}
3486
3487static __always_inline
3488void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3489{
3490 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3491 return;
3492
3493 __account_cfs_rq_runtime(cfs_rq, delta_exec);
3494}
3495
3496static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3497{
3498 return cfs_bandwidth_used() && cfs_rq->throttled;
3499}
3500
3501
3502static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3503{
3504 return cfs_bandwidth_used() && cfs_rq->throttle_count;
3505}
3506
3507
3508
3509
3510
3511
3512static inline int throttled_lb_pair(struct task_group *tg,
3513 int src_cpu, int dest_cpu)
3514{
3515 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3516
3517 src_cfs_rq = tg->cfs_rq[src_cpu];
3518 dest_cfs_rq = tg->cfs_rq[dest_cpu];
3519
3520 return throttled_hierarchy(src_cfs_rq) ||
3521 throttled_hierarchy(dest_cfs_rq);
3522}
3523
3524
3525static int tg_unthrottle_up(struct task_group *tg, void *data)
3526{
3527 struct rq *rq = data;
3528 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3529
3530 cfs_rq->throttle_count--;
3531#ifdef CONFIG_SMP
3532 if (!cfs_rq->throttle_count) {
3533
3534 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3535 cfs_rq->throttled_clock_task;
3536 }
3537#endif
3538
3539 return 0;
3540}
3541
3542static int tg_throttle_down(struct task_group *tg, void *data)
3543{
3544 struct rq *rq = data;
3545 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3546
3547
3548 if (!cfs_rq->throttle_count)
3549 cfs_rq->throttled_clock_task = rq_clock_task(rq);
3550 cfs_rq->throttle_count++;
3551
3552 return 0;
3553}
3554
3555static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3556{
3557 struct rq *rq = rq_of(cfs_rq);
3558 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3559 struct sched_entity *se;
3560 long task_delta, dequeue = 1;
3561 bool empty;
3562
3563 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3564
3565
3566 rcu_read_lock();
3567 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3568 rcu_read_unlock();
3569
3570 task_delta = cfs_rq->h_nr_running;
3571 for_each_sched_entity(se) {
3572 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3573
3574 if (!se->on_rq)
3575 break;
3576
3577 if (dequeue)
3578 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3579 qcfs_rq->h_nr_running -= task_delta;
3580
3581 if (qcfs_rq->load.weight)
3582 dequeue = 0;
3583 }
3584
3585 if (!se)
3586 sub_nr_running(rq, task_delta);
3587
3588 cfs_rq->throttled = 1;
3589 cfs_rq->throttled_clock = rq_clock(rq);
3590 raw_spin_lock(&cfs_b->lock);
3591 empty = list_empty(&cfs_b->throttled_cfs_rq);
3592
3593
3594
3595
3596
3597 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3598
3599
3600
3601
3602
3603 if (empty)
3604 start_cfs_bandwidth(cfs_b);
3605
3606 raw_spin_unlock(&cfs_b->lock);
3607}
3608
3609void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3610{
3611 struct rq *rq = rq_of(cfs_rq);
3612 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3613 struct sched_entity *se;
3614 int enqueue = 1;
3615 long task_delta;
3616
3617 se = cfs_rq->tg->se[cpu_of(rq)];
3618
3619 cfs_rq->throttled = 0;
3620
3621 update_rq_clock(rq);
3622
3623 raw_spin_lock(&cfs_b->lock);
3624 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3625 list_del_rcu(&cfs_rq->throttled_list);
3626 raw_spin_unlock(&cfs_b->lock);
3627
3628
3629 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3630
3631 if (!cfs_rq->load.weight)
3632 return;
3633
3634 task_delta = cfs_rq->h_nr_running;
3635 for_each_sched_entity(se) {
3636 if (se->on_rq)
3637 enqueue = 0;
3638
3639 cfs_rq = cfs_rq_of(se);
3640 if (enqueue)
3641 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3642 cfs_rq->h_nr_running += task_delta;
3643
3644 if (cfs_rq_throttled(cfs_rq))
3645 break;
3646 }
3647
3648 if (!se)
3649 add_nr_running(rq, task_delta);
3650
3651
3652 if (rq->curr == rq->idle && rq->cfs.nr_running)
3653 resched_curr(rq);
3654}
3655
3656static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3657 u64 remaining, u64 expires)
3658{
3659 struct cfs_rq *cfs_rq;
3660 u64 runtime;
3661 u64 starting_runtime = remaining;
3662
3663 rcu_read_lock();
3664 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3665 throttled_list) {
3666 struct rq *rq = rq_of(cfs_rq);
3667
3668 raw_spin_lock(&rq->lock);
3669 if (!cfs_rq_throttled(cfs_rq))
3670 goto next;
3671
3672 runtime = -cfs_rq->runtime_remaining + 1;
3673 if (runtime > remaining)
3674 runtime = remaining;
3675 remaining -= runtime;
3676
3677 cfs_rq->runtime_remaining += runtime;
3678 cfs_rq->runtime_expires = expires;
3679
3680
3681 if (cfs_rq->runtime_remaining > 0)
3682 unthrottle_cfs_rq(cfs_rq);
3683
3684next:
3685 raw_spin_unlock(&rq->lock);
3686
3687 if (!remaining)
3688 break;
3689 }
3690 rcu_read_unlock();
3691
3692 return starting_runtime - remaining;
3693}
3694
3695
3696
3697
3698
3699
3700
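/*
 * Period-timer work: refill the group's quota and hand runtime back out to
 * throttled cfs_rqs.  Returns 1 when the bandwidth has been idle for a whole
 * period, telling the caller to let the timer lapse.
 */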
3701static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3702{
3703 u64 runtime, runtime_expires;
3704 int throttled;
3705
3706
3707 if (cfs_b->quota == RUNTIME_INF)
3708 goto out_deactivate;
3709
3710 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3711 cfs_b->nr_periods += overrun;
3712
3713
3714
3715
3716
3717 if (cfs_b->idle && !throttled)
3718 goto out_deactivate;
3719
3720 __refill_cfs_bandwidth_runtime(cfs_b);
3721
3722 if (!throttled) {
3723
3724 cfs_b->idle = 1;
3725 return 0;
3726 }
3727
3728
3729 cfs_b->nr_throttled += overrun;
3730
3731 runtime_expires = cfs_b->runtime_expires;
3732
3733
3734
3735
3736
3737
3738
3739
3740 while (throttled && cfs_b->runtime > 0) {
3741 runtime = cfs_b->runtime;
3742 raw_spin_unlock(&cfs_b->lock);
3743
3744 runtime = distribute_cfs_runtime(cfs_b, runtime,
3745 runtime_expires);
3746 raw_spin_lock(&cfs_b->lock);
3747
3748 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3749
3750 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3751 }
3752
3753
3754
3755
3756
3757
3758
3759 cfs_b->idle = 0;
3760
3761 return 0;
3762
3763out_deactivate:
3764 return 1;
3765}
3766
3767
3768static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3769
3770static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3771
3772static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3773
3774
3775
3776
3777
3778
3779
3780
3781static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3782{
3783 struct hrtimer *refresh_timer = &cfs_b->period_timer;
3784 u64 remaining;
3785
3786
3787 if (hrtimer_callback_running(refresh_timer))
3788 return 1;
3789
3790
3791 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3792 if (remaining < min_expire)
3793 return 1;
3794
3795 return 0;
3796}
3797
3798static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3799{
3800 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3801
3802
3803 if (runtime_refresh_within(cfs_b, min_left))
3804 return;
3805
3806 hrtimer_start(&cfs_b->slack_timer,
3807 ns_to_ktime(cfs_bandwidth_slack_period),
3808 HRTIMER_MODE_REL);
3809}
3810
3811
3812static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3813{
3814 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3815 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3816
3817 if (slack_runtime <= 0)
3818 return;
3819
3820 raw_spin_lock(&cfs_b->lock);
3821 if (cfs_b->quota != RUNTIME_INF &&
3822 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3823 cfs_b->runtime += slack_runtime;
3824
3825
3826 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3827 !list_empty(&cfs_b->throttled_cfs_rq))
3828 start_cfs_slack_bandwidth(cfs_b);
3829 }
3830 raw_spin_unlock(&cfs_b->lock);
3831
3832
3833 cfs_rq->runtime_remaining -= slack_runtime;
3834}
3835
3836static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3837{
3838 if (!cfs_bandwidth_used())
3839 return;
3840
3841 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3842 return;
3843
3844 __return_cfs_rq_runtime(cfs_rq);
3845}
3846
3847
3848
3849
3850
3851static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3852{
3853 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3854 u64 expires;
3855
3856
3857 raw_spin_lock(&cfs_b->lock);
3858 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3859 raw_spin_unlock(&cfs_b->lock);
3860 return;
3861 }
3862
3863 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3864 runtime = cfs_b->runtime;
3865
3866 expires = cfs_b->runtime_expires;
3867 raw_spin_unlock(&cfs_b->lock);
3868
3869 if (!runtime)
3870 return;
3871
3872 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3873
3874 raw_spin_lock(&cfs_b->lock);
3875 if (expires == cfs_b->runtime_expires)
3876 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3877 raw_spin_unlock(&cfs_b->lock);
3878}
3879
3880
3881
3882
3883
3884
3885static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3886{
3887 if (!cfs_bandwidth_used())
3888 return;
3889
3890
3891 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3892 return;
3893
3894
3895 if (cfs_rq_throttled(cfs_rq))
3896 return;
3897
3898
3899 account_cfs_rq_runtime(cfs_rq, 0);
3900 if (cfs_rq->runtime_remaining <= 0)
3901 throttle_cfs_rq(cfs_rq);
3902}
3903
3904
3905static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3906{
3907 if (!cfs_bandwidth_used())
3908 return false;
3909
3910 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3911 return false;
3912
3913
3914
3915
3916
3917 if (cfs_rq_throttled(cfs_rq))
3918 return true;
3919
3920 throttle_cfs_rq(cfs_rq);
3921 return true;
3922}
3923
3924static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3925{
3926 struct cfs_bandwidth *cfs_b =
3927 container_of(timer, struct cfs_bandwidth, slack_timer);
3928
3929 do_sched_cfs_slack_timer(cfs_b);
3930
3931 return HRTIMER_NORESTART;
3932}
3933
3934static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3935{
3936 struct cfs_bandwidth *cfs_b =
3937 container_of(timer, struct cfs_bandwidth, period_timer);
3938 int overrun;
3939 int idle = 0;
3940
3941 raw_spin_lock(&cfs_b->lock);
3942 for (;;) {
3943 overrun = hrtimer_forward_now(timer, cfs_b->period);
3944 if (!overrun)
3945 break;
3946
3947 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3948 }
3949 if (idle)
3950 cfs_b->period_active = 0;
3951 raw_spin_unlock(&cfs_b->lock);
3952
3953 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3954}
3955
3956void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3957{
3958 raw_spin_lock_init(&cfs_b->lock);
3959 cfs_b->runtime = 0;
3960 cfs_b->quota = RUNTIME_INF;
3961 cfs_b->period = ns_to_ktime(default_cfs_period());
3962
3963 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3964 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
3965 cfs_b->period_timer.function = sched_cfs_period_timer;
3966 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3967 cfs_b->slack_timer.function = sched_cfs_slack_timer;
3968}
3969
3970static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3971{
3972 cfs_rq->runtime_enabled = 0;
3973 INIT_LIST_HEAD(&cfs_rq->throttled_list);
3974}
3975
3976void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3977{
3978 lockdep_assert_held(&cfs_b->lock);
3979
3980 if (!cfs_b->period_active) {
3981 cfs_b->period_active = 1;
3982 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
3983 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
3984 }
3985}
3986
3987static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3988{
3989
3990 if (!cfs_b->throttled_cfs_rq.next)
3991 return;
3992
3993 hrtimer_cancel(&cfs_b->period_timer);
3994 hrtimer_cancel(&cfs_b->slack_timer);
3995}
3996
3997static void __maybe_unused update_runtime_enabled(struct rq *rq)
3998{
3999 struct cfs_rq *cfs_rq;
4000
4001 for_each_leaf_cfs_rq(rq, cfs_rq) {
4002 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4003
4004 raw_spin_lock(&cfs_b->lock);
4005 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4006 raw_spin_unlock(&cfs_b->lock);
4007 }
4008}
4009
4010static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4011{
4012 struct cfs_rq *cfs_rq;
4013
4014 for_each_leaf_cfs_rq(rq, cfs_rq) {
4015 if (!cfs_rq->runtime_enabled)
4016 continue;
4017
4018
4019
4020
4021
4022 cfs_rq->runtime_remaining = 1;
4023
4024
4025
4026
4027 cfs_rq->runtime_enabled = 0;
4028
4029 if (cfs_rq_throttled(cfs_rq))
4030 unthrottle_cfs_rq(cfs_rq);
4031 }
4032}
4033
4034#else
4035static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4036{
4037 return rq_clock_task(rq_of(cfs_rq));
4038}
4039
4040static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4041static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4042static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4043static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4044
4045static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4046{
4047 return 0;
4048}
4049
4050static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4051{
4052 return 0;
4053}
4054
4055static inline int throttled_lb_pair(struct task_group *tg,
4056 int src_cpu, int dest_cpu)
4057{
4058 return 0;
4059}
4060
4061void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4062
4063#ifdef CONFIG_FAIR_GROUP_SCHED
4064static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4065#endif
4066
4067static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4068{
4069 return NULL;
4070}
4071static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4072static inline void update_runtime_enabled(struct rq *rq) {}
4073static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4074
4075#endif
4076
4077
4078
4079
4080
4081#ifdef CONFIG_SCHED_HRTICK
4082static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4083{
4084 struct sched_entity *se = &p->se;
4085 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4086
4087 WARN_ON(task_rq(p) != rq);
4088
4089 if (cfs_rq->nr_running > 1) {
4090 u64 slice = sched_slice(cfs_rq, se);
4091 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4092 s64 delta = slice - ran;
4093
4094 if (delta < 0) {
4095 if (rq->curr == p)
4096 resched_curr(rq);
4097 return;
4098 }
4099 hrtick_start(rq, delta);
4100 }
4101}
4102
4103
4104
4105
4106
4107
4108static void hrtick_update(struct rq *rq)
4109{
4110 struct task_struct *curr = rq->curr;
4111
4112 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4113 return;
4114
4115 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4116 hrtick_start_fair(rq, curr);
4117}
4118#else
4119static inline void
4120hrtick_start_fair(struct rq *rq, struct task_struct *p)
4121{
4122}
4123
4124static inline void hrtick_update(struct rq *rq)
4125{
4126}
4127#endif
4128
4129
4130
4131
4132
4133
4134static void
4135enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4136{
4137 struct cfs_rq *cfs_rq;
4138 struct sched_entity *se = &p->se;
4139
4140 for_each_sched_entity(se) {
4141 if (se->on_rq)
4142 break;
4143 cfs_rq = cfs_rq_of(se);
4144 enqueue_entity(cfs_rq, se, flags);
4145
4146
4147
4148
4149
4150
4151
4152 if (cfs_rq_throttled(cfs_rq))
4153 break;
4154 cfs_rq->h_nr_running++;
4155
4156 flags = ENQUEUE_WAKEUP;
4157 }
4158
4159 for_each_sched_entity(se) {
4160 cfs_rq = cfs_rq_of(se);
4161 cfs_rq->h_nr_running++;
4162
4163 if (cfs_rq_throttled(cfs_rq))
4164 break;
4165
4166 update_load_avg(se, 1);
4167 update_cfs_shares(cfs_rq);
4168 }
4169
4170 if (!se)
4171 add_nr_running(rq, 1);
4172
4173 hrtick_update(rq);
4174}
4175
4176static void set_next_buddy(struct sched_entity *se);
4177
4178
4179
4180
4181
4182
4183static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4184{
4185 struct cfs_rq *cfs_rq;
4186 struct sched_entity *se = &p->se;
4187 int task_sleep = flags & DEQUEUE_SLEEP;
4188
4189 for_each_sched_entity(se) {
4190 cfs_rq = cfs_rq_of(se);
4191 dequeue_entity(cfs_rq, se, flags);
4192
4193
4194
4195
4196
4197
4198
4199 if (cfs_rq_throttled(cfs_rq))
4200 break;
4201 cfs_rq->h_nr_running--;
4202
4203
4204 if (cfs_rq->load.weight) {
4205
4206
4207
4208
4209 if (task_sleep && parent_entity(se))
4210 set_next_buddy(parent_entity(se));
4211
4212
4213 se = parent_entity(se);
4214 break;
4215 }
4216 flags |= DEQUEUE_SLEEP;
4217 }
4218
4219 for_each_sched_entity(se) {
4220 cfs_rq = cfs_rq_of(se);
4221 cfs_rq->h_nr_running--;
4222
4223 if (cfs_rq_throttled(cfs_rq))
4224 break;
4225
4226 update_load_avg(se, 1);
4227 update_cfs_shares(cfs_rq);
4228 }
4229
4230 if (!se)
4231 sub_nr_running(rq, 1);
4232
4233 hrtick_update(rq);
4234}
4235
4236#ifdef CONFIG_SMP
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
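/*
 * cpu_load[] decays a little further towards the instantaneous load on every
 * tick.  A CPU that has been tickless-idle misses many of those updates; the
 * tables below let decay_load_missed() apply n missed decay steps in
 * O(log n) multiplies instead of replaying each tick: degrade_zero_ticks[i]
 * is the point past which cpu_load[i] has decayed to zero, and
 * degrade_factor[i][j] is that index's decay factor for 2^j ticks, scaled by
 * 2^DEGRADE_SHIFT.
 */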
4269#define DEGRADE_SHIFT 7
4270static const unsigned char
4271 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4272static const unsigned char
4273 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4274 {0, 0, 0, 0, 0, 0, 0, 0},
4275 {64, 32, 8, 0, 0, 0, 0, 0},
4276 {96, 72, 40, 12, 1, 0, 0},
4277 {112, 98, 75, 43, 15, 1, 0},
4278 {120, 112, 98, 76, 45, 16, 2} };
4279
4280
4281
4282
4283
4284
4285static unsigned long
4286decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4287{
4288 int j = 0;
4289
4290 if (!missed_updates)
4291 return load;
4292
4293 if (missed_updates >= degrade_zero_ticks[idx])
4294 return 0;
4295
4296 if (idx == 1)
4297 return load >> missed_updates;
4298
4299 while (missed_updates) {
4300 if (missed_updates % 2)
4301 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4302
4303 missed_updates >>= 1;
4304 j++;
4305 }
4306 return load;
4307}
4308
4309
4310
4311
4312
4313
4314static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4315 unsigned long pending_updates)
4316{
4317 int i, scale;
4318
4319 this_rq->nr_load_updates++;
4320
4321
4322 this_rq->cpu_load[0] = this_load;
4323 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4324 unsigned long old_load, new_load;
4325
4326
4327
4328 old_load = this_rq->cpu_load[i];
4329 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4330 new_load = this_load;
4331
4332
4333
4334
4335
4336 if (new_load > old_load)
4337 new_load += scale - 1;
4338
4339 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4340 }
4341
4342 sched_avg_update(this_rq);
4343}
4344
4345
4346static unsigned long weighted_cpuload(const int cpu)
4347{
4348 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4349}
4350
4351#ifdef CONFIG_NO_HZ_COMMON
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369static void update_idle_cpu_load(struct rq *this_rq)
4370{
4371 unsigned long curr_jiffies = READ_ONCE(jiffies);
4372 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4373 unsigned long pending_updates;
4374
4375
4376
4377
4378 if (load || curr_jiffies == this_rq->last_load_update_tick)
4379 return;
4380
4381 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4382 this_rq->last_load_update_tick = curr_jiffies;
4383
4384 __update_cpu_load(this_rq, load, pending_updates);
4385}
4386
4387
4388
4389
4390void update_cpu_load_nohz(void)
4391{
4392 struct rq *this_rq = this_rq();
4393 unsigned long curr_jiffies = READ_ONCE(jiffies);
4394 unsigned long pending_updates;
4395
4396 if (curr_jiffies == this_rq->last_load_update_tick)
4397 return;
4398
4399 raw_spin_lock(&this_rq->lock);
4400 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4401 if (pending_updates) {
4402 this_rq->last_load_update_tick = curr_jiffies;
4403
4404
4405
4406
4407 __update_cpu_load(this_rq, 0, pending_updates);
4408 }
4409 raw_spin_unlock(&this_rq->lock);
4410}
4411#endif
4412
4413
4414
4415
4416void update_cpu_load_active(struct rq *this_rq)
4417{
4418 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4419
4420
4421
4422 this_rq->last_load_update_tick = jiffies;
4423 __update_cpu_load(this_rq, load, 1);
4424}
4425
4426
4427
4428
4429
4430
4431
4432
4433static unsigned long source_load(int cpu, int type)
4434{
4435 struct rq *rq = cpu_rq(cpu);
4436 unsigned long total = weighted_cpuload(cpu);
4437
4438 if (type == 0 || !sched_feat(LB_BIAS))
4439 return total;
4440
4441 return min(rq->cpu_load[type-1], total);
4442}
4443
4444
4445
4446
4447
4448static unsigned long target_load(int cpu, int type)
4449{
4450 struct rq *rq = cpu_rq(cpu);
4451 unsigned long total = weighted_cpuload(cpu);
4452
4453 if (type == 0 || !sched_feat(LB_BIAS))
4454 return total;
4455
4456 return max(rq->cpu_load[type-1], total);
4457}
4458
4459static unsigned long capacity_of(int cpu)
4460{
4461 return cpu_rq(cpu)->cpu_capacity;
4462}
4463
4464static unsigned long capacity_orig_of(int cpu)
4465{
4466 return cpu_rq(cpu)->cpu_capacity_orig;
4467}
4468
4469static unsigned long cpu_avg_load_per_task(int cpu)
4470{
4471 struct rq *rq = cpu_rq(cpu);
4472 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4473 unsigned long load_avg = weighted_cpuload(cpu);
4474
4475 if (nr_running)
4476 return load_avg / nr_running;
4477
4478 return 0;
4479}
4480
4481static void record_wakee(struct task_struct *p)
4482{
4483
4484
4485
4486
4487
4488 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4489 current->wakee_flips >>= 1;
4490 current->wakee_flip_decay_ts = jiffies;
4491 }
4492
4493 if (current->last_wakee != p) {
4494 current->last_wakee = p;
4495 current->wakee_flips++;
4496 }
4497}
4498
4499static void task_waking_fair(struct task_struct *p)
4500{
4501 struct sched_entity *se = &p->se;
4502 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4503 u64 min_vruntime;
4504
4505#ifndef CONFIG_64BIT
4506 u64 min_vruntime_copy;
4507
4508 do {
4509 min_vruntime_copy = cfs_rq->min_vruntime_copy;
4510 smp_rmb();
4511 min_vruntime = cfs_rq->min_vruntime;
4512 } while (min_vruntime != min_vruntime_copy);
4513#else
4514 min_vruntime = cfs_rq->min_vruntime;
4515#endif
4516
4517 se->vruntime -= min_vruntime;
4518 record_wakee(p);
4519}
4520
4521#ifdef CONFIG_FAIR_GROUP_SCHED
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
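/*
 * effective_load() estimates, for group scheduling, how much the root-level
 * load on @cpu would change if @wl were added at the bottom of @tg's
 * hierarchy on that CPU (with @wg added to the group's weight).  It walks up
 * the hierarchy, recomputing at each level what the group entity's weight
 * would become and propagating the resulting delta.
 */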
4572static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4573{
4574 struct sched_entity *se = tg->se[cpu];
4575
4576 if (!tg->parent)
4577 return wl;
4578
4579 for_each_sched_entity(se) {
4580 long w, W;
4581
4582 tg = se->my_q->tg;
4583
4584
4585
4586
4587 W = wg + calc_tg_weight(tg, se->my_q);
4588
4589
4590
4591
4592 w = cfs_rq_load_avg(se->my_q) + wl;
4593
4594
4595
4596
4597 if (W > 0 && w < W)
4598 wl = (w * (long)tg->shares) / W;
4599 else
4600 wl = tg->shares;
4601
4602
4603
4604
4605
4606
4607 if (wl < MIN_SHARES)
4608 wl = MIN_SHARES;
4609
4610
4611
4612
4613 wl -= se->avg.load_avg;
4614
4615
4616
4617
4618
4619
4620
4621
4622 wg = 0;
4623 }
4624
4625 return wl;
4626}
4627#else
4628
4629static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4630{
4631 return wl;
4632}
4633
4634#endif
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
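/*
 * wake_wide() guesses whether the waker/wakee pair is part of a wide M:N
 * wakeup pattern (e.g. one dispatcher waking many workers).  It compares the
 * wakeup "flip" rates of both tasks against the LLC domain size; when the
 * pattern looks wider than the shared cache, spreading is preferred over a
 * cache-affine placement.
 */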
4648static int wake_wide(struct task_struct *p)
4649{
4650 unsigned int master = current->wakee_flips;
4651 unsigned int slave = p->wakee_flips;
4652 int factor = this_cpu_read(sd_llc_size);
4653
4654 if (master < slave)
4655 swap(master, slave);
4656 if (slave < factor || master < slave * factor)
4657 return 0;
4658 return 1;
4659}
4660
4661static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4662{
4663 s64 this_load, load;
4664 s64 this_eff_load, prev_eff_load;
4665 int idx, this_cpu, prev_cpu;
4666 struct task_group *tg;
4667 unsigned long weight;
4668 int balanced;
4669
4670 idx = sd->wake_idx;
4671 this_cpu = smp_processor_id();
4672 prev_cpu = task_cpu(p);
4673 load = source_load(prev_cpu, idx);
4674 this_load = target_load(this_cpu, idx);
4675
4676
4677
4678
4679
4680
4681 if (sync) {
4682 tg = task_group(current);
4683 weight = current->se.avg.load_avg;
4684
4685 this_load += effective_load(tg, this_cpu, -weight, -weight);
4686 load += effective_load(tg, prev_cpu, 0, -weight);
4687 }
4688
4689 tg = task_group(p);
4690 weight = p->se.avg.load_avg;
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701 this_eff_load = 100;
4702 this_eff_load *= capacity_of(prev_cpu);
4703
4704 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4705 prev_eff_load *= capacity_of(this_cpu);
4706
4707 if (this_load > 0) {
4708 this_eff_load *= this_load +
4709 effective_load(tg, this_cpu, weight, weight);
4710
4711 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4712 }
4713
4714 balanced = this_eff_load <= prev_eff_load;
4715
4716 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4717
4718 if (!balanced)
4719 return 0;
4720
4721 schedstat_inc(sd, ttwu_move_affine);
4722 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4723
4724 return 1;
4725}
4726
4727
4728
4729
4730
4731static struct sched_group *
4732find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4733 int this_cpu, int sd_flag)
4734{
4735 struct sched_group *idlest = NULL, *group = sd->groups;
4736 unsigned long min_load = ULONG_MAX, this_load = 0;
4737 int load_idx = sd->forkexec_idx;
4738 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4739
4740 if (sd_flag & SD_BALANCE_WAKE)
4741 load_idx = sd->wake_idx;
4742
4743 do {
4744 unsigned long load, avg_load;
4745 int local_group;
4746 int i;
4747
4748
4749 if (!cpumask_intersects(sched_group_cpus(group),
4750 tsk_cpus_allowed(p)))
4751 continue;
4752
4753 local_group = cpumask_test_cpu(this_cpu,
4754 sched_group_cpus(group));
4755
4756
4757 avg_load = 0;
4758
4759 for_each_cpu(i, sched_group_cpus(group)) {
4760
4761 if (local_group)
4762 load = source_load(i, load_idx);
4763 else
4764 load = target_load(i, load_idx);
4765
4766 avg_load += load;
4767 }
4768
4769
4770 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4771
4772 if (local_group) {
4773 this_load = avg_load;
4774 } else if (avg_load < min_load) {
4775 min_load = avg_load;
4776 idlest = group;
4777 }
4778 } while (group = group->next, group != sd->groups);
4779
4780 if (!idlest || 100*this_load < imbalance*min_load)
4781 return NULL;
4782 return idlest;
4783}
4784
4785
4786
4787
4788static int
4789find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4790{
4791 unsigned long load, min_load = ULONG_MAX;
4792 unsigned int min_exit_latency = UINT_MAX;
4793 u64 latest_idle_timestamp = 0;
4794 int least_loaded_cpu = this_cpu;
4795 int shallowest_idle_cpu = -1;
4796 int i;
4797
4798
4799 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4800 if (idle_cpu(i)) {
4801 struct rq *rq = cpu_rq(i);
4802 struct cpuidle_state *idle = idle_get_state(rq);
4803 if (idle && idle->exit_latency < min_exit_latency) {
4804
4805
4806
4807
4808
4809 min_exit_latency = idle->exit_latency;
4810 latest_idle_timestamp = rq->idle_stamp;
4811 shallowest_idle_cpu = i;
4812 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4813 rq->idle_stamp > latest_idle_timestamp) {
4814
4815
4816
4817
4818
4819 latest_idle_timestamp = rq->idle_stamp;
4820 shallowest_idle_cpu = i;
4821 }
4822 } else if (shallowest_idle_cpu == -1) {
4823 load = weighted_cpuload(i);
4824 if (load < min_load || (load == min_load && i == this_cpu)) {
4825 min_load = load;
4826 least_loaded_cpu = i;
4827 }
4828 }
4829 }
4830
4831 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4832}
4833
4834
4835
4836
4837static int select_idle_sibling(struct task_struct *p, int target)
4838{
4839 struct sched_domain *sd;
4840 struct sched_group *sg;
4841 int i = task_cpu(p);
4842
4843 if (idle_cpu(target))
4844 return target;
4845
4846
4847
4848
4849 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4850 return i;
4851
4852
4853
4854
4855 sd = rcu_dereference(per_cpu(sd_llc, target));
4856 for_each_lower_domain(sd) {
4857 sg = sd->groups;
4858 do {
4859 if (!cpumask_intersects(sched_group_cpus(sg),
4860 tsk_cpus_allowed(p)))
4861 goto next;
4862
4863 for_each_cpu(i, sched_group_cpus(sg)) {
4864 if (i == target || !idle_cpu(i))
4865 goto next;
4866 }
4867
4868 target = cpumask_first_and(sched_group_cpus(sg),
4869 tsk_cpus_allowed(p));
4870 goto done;
4871next:
4872 sg = sg->next;
4873 } while (sg != sd->groups);
4874 }
4875done:
4876 return target;
4877}
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
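/*
 * cpu_util() returns the utilization of @cpu's CFS runqueue in capacity
 * units, clamped to the CPU's original capacity so that transient
 * over-estimates of the running average never report more than 100%.
 */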
4905static int cpu_util(int cpu)
4906{
4907 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
4908 unsigned long capacity = capacity_orig_of(cpu);
4909
4910 return (util >= capacity) ? capacity : util;
4911}
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925static int
4926select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4927{
4928 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4929 int cpu = smp_processor_id();
4930 int new_cpu = prev_cpu;
4931 int want_affine = 0;
4932 int sync = wake_flags & WF_SYNC;
4933
4934 if (sd_flag & SD_BALANCE_WAKE)
4935 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4936
4937 rcu_read_lock();
4938 for_each_domain(cpu, tmp) {
4939 if (!(tmp->flags & SD_LOAD_BALANCE))
4940 break;
4941
4942
4943
4944
4945
4946 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4947 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4948 affine_sd = tmp;
4949 break;
4950 }
4951
4952 if (tmp->flags & sd_flag)
4953 sd = tmp;
4954 else if (!want_affine)
4955 break;
4956 }
4957
4958 if (affine_sd) {
4959 sd = NULL;
4960 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4961 new_cpu = cpu;
4962 }
4963
4964 if (!sd) {
4965 if (sd_flag & SD_BALANCE_WAKE)
4966 new_cpu = select_idle_sibling(p, new_cpu);
4967
4968 } else while (sd) {
4969 struct sched_group *group;
4970 int weight;
4971
4972 if (!(sd->flags & sd_flag)) {
4973 sd = sd->child;
4974 continue;
4975 }
4976
4977 group = find_idlest_group(sd, p, cpu, sd_flag);
4978 if (!group) {
4979 sd = sd->child;
4980 continue;
4981 }
4982
4983 new_cpu = find_idlest_cpu(group, p, cpu);
4984 if (new_cpu == -1 || new_cpu == cpu) {
4985
4986 sd = sd->child;
4987 continue;
4988 }
4989
4990
4991 cpu = new_cpu;
4992 weight = sd->span_weight;
4993 sd = NULL;
4994 for_each_domain(cpu, tmp) {
4995 if (weight <= tmp->span_weight)
4996 break;
4997 if (tmp->flags & sd_flag)
4998 sd = tmp;
4999 }
5000
5001 }
5002 rcu_read_unlock();
5003
5004 return new_cpu;
5005}
5006
5007
5008
5009
5010
5011
5012
5013static void migrate_task_rq_fair(struct task_struct *p)
5014{
5015
5016
5017
5018
5019
5020
5021
5022 remove_entity_load_avg(&p->se);
5023
5024
5025 p->se.avg.last_update_time = 0;
5026
5027
5028 p->se.exec_start = 0;
5029}
5030
5031static void task_dead_fair(struct task_struct *p)
5032{
5033 remove_entity_load_avg(&p->se);
5034}
5035#endif
5036
5037static unsigned long
5038wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5039{
5040 unsigned long gran = sysctl_sched_wakeup_granularity;
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055 return calc_delta_fair(gran, se);
5056}
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
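/*
 * Should @se preempt @curr?  Returns 1 when se's vruntime is smaller than
 * curr's by more than the (load-scaled) wakeup granularity, 0 when the two
 * are within the granularity, and -1 when se is not ahead at all.
 */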
5072static int
5073wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5074{
5075 s64 gran, vdiff = curr->vruntime - se->vruntime;
5076
5077 if (vdiff <= 0)
5078 return -1;
5079
5080 gran = wakeup_gran(curr, se);
5081 if (vdiff > gran)
5082 return 1;
5083
5084 return 0;
5085}
5086
5087static void set_last_buddy(struct sched_entity *se)
5088{
5089 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5090 return;
5091
5092 for_each_sched_entity(se)
5093 cfs_rq_of(se)->last = se;
5094}
5095
5096static void set_next_buddy(struct sched_entity *se)
5097{
5098 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5099 return;
5100
5101 for_each_sched_entity(se)
5102 cfs_rq_of(se)->next = se;
5103}
5104
5105static void set_skip_buddy(struct sched_entity *se)
5106{
5107 for_each_sched_entity(se)
5108 cfs_rq_of(se)->skip = se;
5109}
5110
5111
5112
5113
5114static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5115{
5116 struct task_struct *curr = rq->curr;
5117 struct sched_entity *se = &curr->se, *pse = &p->se;
5118 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5119 int scale = cfs_rq->nr_running >= sched_nr_latency;
5120 int next_buddy_marked = 0;
5121
5122 if (unlikely(se == pse))
5123 return;
5124
5125
5126
5127
5128
5129
5130
5131 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5132 return;
5133
5134 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5135 set_next_buddy(pse);
5136 next_buddy_marked = 1;
5137 }
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149 if (test_tsk_need_resched(curr))
5150 return;
5151
5152
5153 if (unlikely(curr->policy == SCHED_IDLE) &&
5154 likely(p->policy != SCHED_IDLE))
5155 goto preempt;
5156
5157
5158
5159
5160
5161 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5162 return;
5163
5164 find_matching_se(&se, &pse);
5165 update_curr(cfs_rq_of(se));
5166 BUG_ON(!pse);
5167 if (wakeup_preempt_entity(se, pse) == 1) {
5168
5169
5170
5171
5172 if (!next_buddy_marked)
5173 set_next_buddy(pse);
5174 goto preempt;
5175 }
5176
5177 return;
5178
5179preempt:
5180 resched_curr(rq);
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190 if (unlikely(!se->on_rq || curr == rq->idle))
5191 return;
5192
5193 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5194 set_last_buddy(se);
5195}
5196
5197static struct task_struct *
5198pick_next_task_fair(struct rq *rq, struct task_struct *prev)
5199{
5200 struct cfs_rq *cfs_rq = &rq->cfs;
5201 struct sched_entity *se;
5202 struct task_struct *p;
5203 int new_tasks;
5204
5205again:
5206#ifdef CONFIG_FAIR_GROUP_SCHED
5207 if (!cfs_rq->nr_running)
5208 goto idle;
5209
5210 if (prev->sched_class != &fair_sched_class)
5211 goto simple;
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221 do {
5222 struct sched_entity *curr = cfs_rq->curr;
5223
5224
5225
5226
5227
5228
5229
5230 if (curr) {
5231 if (curr->on_rq)
5232 update_curr(cfs_rq);
5233 else
5234 curr = NULL;
5235
5236
5237
5238
5239
5240
5241
5242 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5243 goto simple;
5244 }
5245
5246 se = pick_next_entity(cfs_rq, curr);
5247 cfs_rq = group_cfs_rq(se);
5248 } while (cfs_rq);
5249
5250 p = task_of(se);
5251
5252
5253
5254
5255
5256
5257 if (prev != p) {
5258 struct sched_entity *pse = &prev->se;
5259
5260 while (!(cfs_rq = is_same_group(se, pse))) {
5261 int se_depth = se->depth;
5262 int pse_depth = pse->depth;
5263
5264 if (se_depth <= pse_depth) {
5265 put_prev_entity(cfs_rq_of(pse), pse);
5266 pse = parent_entity(pse);
5267 }
5268 if (se_depth >= pse_depth) {
5269 set_next_entity(cfs_rq_of(se), se);
5270 se = parent_entity(se);
5271 }
5272 }
5273
5274 put_prev_entity(cfs_rq, pse);
5275 set_next_entity(cfs_rq, se);
5276 }
5277
5278 if (hrtick_enabled(rq))
5279 hrtick_start_fair(rq, p);
5280
5281 return p;
5282simple:
5283 cfs_rq = &rq->cfs;
5284#endif
5285
5286 if (!cfs_rq->nr_running)
5287 goto idle;
5288
5289 put_prev_task(rq, prev);
5290
5291 do {
5292 se = pick_next_entity(cfs_rq, NULL);
5293 set_next_entity(cfs_rq, se);
5294 cfs_rq = group_cfs_rq(se);
5295 } while (cfs_rq);
5296
5297 p = task_of(se);
5298
5299 if (hrtick_enabled(rq))
5300 hrtick_start_fair(rq, p);
5301
5302 return p;
5303
5304idle:
5305
5306
5307
5308
5309
5310
5311 lockdep_unpin_lock(&rq->lock);
5312 new_tasks = idle_balance(rq);
5313 lockdep_pin_lock(&rq->lock);
5314
5315
5316
5317
5318
5319 if (new_tasks < 0)
5320 return RETRY_TASK;
5321
5322 if (new_tasks > 0)
5323 goto again;
5324
5325 return NULL;
5326}
5327
5328
5329
5330
5331static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5332{
5333 struct sched_entity *se = &prev->se;
5334 struct cfs_rq *cfs_rq;
5335
5336 for_each_sched_entity(se) {
5337 cfs_rq = cfs_rq_of(se);
5338 put_prev_entity(cfs_rq, se);
5339 }
5340}
5341
5342
5343
5344
5345
5346
5347static void yield_task_fair(struct rq *rq)
5348{
5349 struct task_struct *curr = rq->curr;
5350 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5351 struct sched_entity *se = &curr->se;
5352
5353
5354
5355
5356 if (unlikely(rq->nr_running == 1))
5357 return;
5358
5359 clear_buddies(cfs_rq, se);
5360
5361 if (curr->policy != SCHED_BATCH) {
5362 update_rq_clock(rq);
5363
5364
5365
5366 update_curr(cfs_rq);
5367
5368
5369
5370
5371
5372 rq_clock_skip_update(rq, true);
5373 }
5374
5375 set_skip_buddy(se);
5376}
5377
5378static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5379{
5380 struct sched_entity *se = &p->se;
5381
5382
5383 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5384 return false;
5385
5386
5387 set_next_buddy(se);
5388
5389 yield_task_fair(rq);
5390
5391 return true;
5392}
5393
5394#ifdef CONFIG_SMP
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5514
5515enum fbq_type { regular, remote, all };
5516
5517#define LBF_ALL_PINNED 0x01
5518#define LBF_NEED_BREAK 0x02
5519#define LBF_DST_PINNED 0x04
5520#define LBF_SOME_PINNED 0x08
5521
5522struct lb_env {
5523 struct sched_domain *sd;
5524
5525 struct rq *src_rq;
5526 int src_cpu;
5527
5528 int dst_cpu;
5529 struct rq *dst_rq;
5530
5531 struct cpumask *dst_grpmask;
5532 int new_dst_cpu;
5533 enum cpu_idle_type idle;
5534 long imbalance;
5535
5536 struct cpumask *cpus;
5537
5538 unsigned int flags;
5539
5540 unsigned int loop;
5541 unsigned int loop_break;
5542 unsigned int loop_max;
5543
5544 enum fbq_type fbq_type;
5545 struct list_head tasks;
5546};
5547
5548
5549
5550
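/*
 * Is this task likely cache-hot on its current CPU?
 */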
5551static int task_hot(struct task_struct *p, struct lb_env *env)
5552{
5553 s64 delta;
5554
5555 lockdep_assert_held(&env->src_rq->lock);
5556
5557 if (p->sched_class != &fair_sched_class)
5558 return 0;
5559
5560 if (unlikely(p->policy == SCHED_IDLE))
5561 return 0;
5562
5563
5564
5565
5566 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5567 (&p->se == cfs_rq_of(&p->se)->next ||
5568 &p->se == cfs_rq_of(&p->se)->last))
5569 return 1;
5570
5571 if (sysctl_sched_migration_cost == -1)
5572 return 1;
5573 if (sysctl_sched_migration_cost == 0)
5574 return 0;
5575
5576 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5577
5578 return delta < (s64)sysctl_sched_migration_cost;
5579}
5580
5581#ifdef CONFIG_NUMA_BALANCING
5582
5583
5584
5585
5586
5587static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5588{
5589 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5590 unsigned long src_faults, dst_faults;
5591 int src_nid, dst_nid;
5592
5593 if (!static_branch_likely(&sched_numa_balancing))
5594 return -1;
5595
5596 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5597 return -1;
5598
5599 src_nid = cpu_to_node(env->src_cpu);
5600 dst_nid = cpu_to_node(env->dst_cpu);
5601
5602 if (src_nid == dst_nid)
5603 return -1;
5604
5605
5606 if (src_nid == p->numa_preferred_nid) {
5607 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
5608 return 1;
5609 else
5610 return -1;
5611 }
5612
5613
5614 if (dst_nid == p->numa_preferred_nid)
5615 return 0;
5616
5617 if (numa_group) {
5618 src_faults = group_faults(p, src_nid);
5619 dst_faults = group_faults(p, dst_nid);
5620 } else {
5621 src_faults = task_faults(p, src_nid);
5622 dst_faults = task_faults(p, dst_nid);
5623 }
5624
5625 return dst_faults < src_faults;
5626}
5627
5628#else
5629static inline int migrate_degrades_locality(struct task_struct *p,
5630 struct lb_env *env)
5631{
5632 return -1;
5633}
5634#endif
5635
5636
5637
5638
5639static
5640int can_migrate_task(struct task_struct *p, struct lb_env *env)
5641{
5642 int tsk_cache_hot;
5643
5644 lockdep_assert_held(&env->src_rq->lock);
5645
5646
5647
5648
5649
5650
5651
5652
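	/*
	 * We do not migrate tasks that are:
	 * 1) throttled together with their destination group,
	 * 2) not allowed to run on the destination CPU,
	 * 3) currently running, or
	 * 4) cache-hot on their current CPU, unless balancing has
	 *    repeatedly failed.
	 */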
5653 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5654 return 0;
5655
5656 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5657 int cpu;
5658
5659 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5660
5661 env->flags |= LBF_SOME_PINNED;
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5672 return 0;
5673
5674
5675 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5676 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5677 env->flags |= LBF_DST_PINNED;
5678 env->new_dst_cpu = cpu;
5679 break;
5680 }
5681 }
5682
5683 return 0;
5684 }
5685
5686
5687 env->flags &= ~LBF_ALL_PINNED;
5688
5689 if (task_running(env->src_rq, p)) {
5690 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5691 return 0;
5692 }
5693
5694
5695
5696
5697
5698
5699
5700 tsk_cache_hot = migrate_degrades_locality(p, env);
5701 if (tsk_cache_hot == -1)
5702 tsk_cache_hot = task_hot(p, env);
5703
5704 if (tsk_cache_hot <= 0 ||
5705 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5706 if (tsk_cache_hot == 1) {
5707 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5708 schedstat_inc(p, se.statistics.nr_forced_migrations);
5709 }
5710 return 1;
5711 }
5712
5713 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5714 return 0;
5715}
5716
5717
5718
5719
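/*
 * detach_task() - dequeue @p from env->src_rq and point it at
 * env->dst_cpu, with src_rq->lock held.
 */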
5720static void detach_task(struct task_struct *p, struct lb_env *env)
5721{
5722 lockdep_assert_held(&env->src_rq->lock);
5723
5724 deactivate_task(env->src_rq, p, 0);
5725 p->on_rq = TASK_ON_RQ_MIGRATING;
5726 set_task_cpu(p, env->dst_cpu);
5727}
5728
5729
5730
5731
5732
5733
5734
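/*
 * detach_one_task() - try to detach exactly one migratable task from
 * env->src_rq. Returns the detached task, or NULL if none qualifies.
 */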
5735static struct task_struct *detach_one_task(struct lb_env *env)
5736{
5737 struct task_struct *p, *n;
5738
5739 lockdep_assert_held(&env->src_rq->lock);
5740
5741 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5742 if (!can_migrate_task(p, env))
5743 continue;
5744
5745 detach_task(p, env);
5746
5747
5748
5749
5750
5751
5752
5753 schedstat_inc(env->sd, lb_gained[env->idle]);
5754 return p;
5755 }
5756 return NULL;
5757}
5758
5759static const unsigned int sched_nr_migrate_break = 32;
5760
5761
5762
5763
5764
5765
5766
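/*
 * detach_tasks() - detach up to env->imbalance weighted load worth of
 * tasks from env->src_rq and collect them on env->tasks.
 * Returns the number of tasks detached.
 */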
5767static int detach_tasks(struct lb_env *env)
5768{
5769 struct list_head *tasks = &env->src_rq->cfs_tasks;
5770 struct task_struct *p;
5771 unsigned long load;
5772 int detached = 0;
5773
5774 lockdep_assert_held(&env->src_rq->lock);
5775
5776 if (env->imbalance <= 0)
5777 return 0;
5778
5779 while (!list_empty(tasks)) {
5780
5781
5782
5783
5784 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
5785 break;
5786
5787 p = list_first_entry(tasks, struct task_struct, se.group_node);
5788
5789 env->loop++;
5790
5791 if (env->loop > env->loop_max)
5792 break;
5793
5794
5795 if (env->loop > env->loop_break) {
5796 env->loop_break += sched_nr_migrate_break;
5797 env->flags |= LBF_NEED_BREAK;
5798 break;
5799 }
5800
5801 if (!can_migrate_task(p, env))
5802 goto next;
5803
5804 load = task_h_load(p);
5805
5806 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5807 goto next;
5808
5809 if ((load / 2) > env->imbalance)
5810 goto next;
5811
5812 detach_task(p, env);
5813 list_add(&p->se.group_node, &env->tasks);
5814
5815 detached++;
5816 env->imbalance -= load;
5817
5818#ifdef CONFIG_PREEMPT
5819
5820
5821
5822
5823
5824 if (env->idle == CPU_NEWLY_IDLE)
5825 break;
5826#endif
5827
5828
5829
5830
5831
5832 if (env->imbalance <= 0)
5833 break;
5834
5835 continue;
5836next:
5837 list_move_tail(&p->se.group_node, tasks);
5838 }
5839
5840
5841
5842
5843
5844
5845 schedstat_add(env->sd, lb_gained[env->idle], detached);
5846
5847 return detached;
5848}
5849
5850
5851
5852
5853static void attach_task(struct rq *rq, struct task_struct *p)
5854{
5855 lockdep_assert_held(&rq->lock);
5856
5857 BUG_ON(task_rq(p) != rq);
5858 p->on_rq = TASK_ON_RQ_QUEUED;
5859 activate_task(rq, p, 0);
5860 check_preempt_curr(rq, p, 0);
5861}
5862
5863
5864
5865
5866
5867static void attach_one_task(struct rq *rq, struct task_struct *p)
5868{
5869 raw_spin_lock(&rq->lock);
5870 attach_task(rq, p);
5871 raw_spin_unlock(&rq->lock);
5872}
5873
5874
5875
5876
5877
5878static void attach_tasks(struct lb_env *env)
5879{
5880 struct list_head *tasks = &env->tasks;
5881 struct task_struct *p;
5882
5883 raw_spin_lock(&env->dst_rq->lock);
5884
5885 while (!list_empty(tasks)) {
5886 p = list_first_entry(tasks, struct task_struct, se.group_node);
5887 list_del_init(&p->se.group_node);
5888
5889 attach_task(env->dst_rq, p);
5890 }
5891
5892 raw_spin_unlock(&env->dst_rq->lock);
5893}
5894
5895#ifdef CONFIG_FAIR_GROUP_SCHED
5896static void update_blocked_averages(int cpu)
5897{
5898 struct rq *rq = cpu_rq(cpu);
5899 struct cfs_rq *cfs_rq;
5900 unsigned long flags;
5901
5902 raw_spin_lock_irqsave(&rq->lock, flags);
5903 update_rq_clock(rq);
5904
5905
5906
5907
5908
5909 for_each_leaf_cfs_rq(rq, cfs_rq) {
5910
5911 if (throttled_hierarchy(cfs_rq))
5912 continue;
5913
5914 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
5915 update_tg_load_avg(cfs_rq, 0);
5916 }
5917 raw_spin_unlock_irqrestore(&rq->lock, flags);
5918}
5919
5920
5921
5922
5923
5924
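/*
 * update_cfs_rq_h_load() - compute the hierarchical load of @cfs_rq by
 * walking up towards the root and then scaling the load back down
 * through each level, at most once per jiffy.
 */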
5925static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5926{
5927 struct rq *rq = rq_of(cfs_rq);
5928 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5929 unsigned long now = jiffies;
5930 unsigned long load;
5931
5932 if (cfs_rq->last_h_load_update == now)
5933 return;
5934
5935 cfs_rq->h_load_next = NULL;
5936 for_each_sched_entity(se) {
5937 cfs_rq = cfs_rq_of(se);
5938 cfs_rq->h_load_next = se;
5939 if (cfs_rq->last_h_load_update == now)
5940 break;
5941 }
5942
5943 if (!se) {
5944 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
5945 cfs_rq->last_h_load_update = now;
5946 }
5947
5948 while ((se = cfs_rq->h_load_next) != NULL) {
5949 load = cfs_rq->h_load;
5950 load = div64_ul(load * se->avg.load_avg,
5951 cfs_rq_load_avg(cfs_rq) + 1);
5952 cfs_rq = group_cfs_rq(se);
5953 cfs_rq->h_load = load;
5954 cfs_rq->last_h_load_update = now;
5955 }
5956}
5957
5958static unsigned long task_h_load(struct task_struct *p)
5959{
5960 struct cfs_rq *cfs_rq = task_cfs_rq(p);
5961
5962 update_cfs_rq_h_load(cfs_rq);
5963 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
5964 cfs_rq_load_avg(cfs_rq) + 1);
5965}
5966#else
5967static inline void update_blocked_averages(int cpu)
5968{
5969 struct rq *rq = cpu_rq(cpu);
5970 struct cfs_rq *cfs_rq = &rq->cfs;
5971 unsigned long flags;
5972
5973 raw_spin_lock_irqsave(&rq->lock, flags);
5974 update_rq_clock(rq);
5975 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
5976 raw_spin_unlock_irqrestore(&rq->lock, flags);
5977}
5978
5979static unsigned long task_h_load(struct task_struct *p)
5980{
5981 return p->se.avg.load_avg;
5982}
5983#endif
5984
5985
5986
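/*
 * Classification of a sched_group for load balancing, in increasing
 * order of badness: update_sd_pick_busiest() prefers groups with a
 * higher group_type.
 */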
5987enum group_type {
5988 group_other = 0,
5989 group_imbalanced,
5990 group_overloaded,
5991};
5992
5993
5994
5995
5996struct sg_lb_stats {
5997 unsigned long avg_load;
5998 unsigned long group_load;
5999 unsigned long sum_weighted_load;
6000 unsigned long load_per_task;
6001 unsigned long group_capacity;
6002 unsigned long group_util;
6003 unsigned int sum_nr_running;
6004 unsigned int idle_cpus;
6005 unsigned int group_weight;
6006 enum group_type group_type;
6007 int group_no_capacity;
6008#ifdef CONFIG_NUMA_BALANCING
6009 unsigned int nr_numa_running;
6010 unsigned int nr_preferred_running;
6011#endif
6012};
6013
6014
6015
6016
6017
6018struct sd_lb_stats {
6019 struct sched_group *busiest;
6020 struct sched_group *local;
6021 unsigned long total_load;
6022 unsigned long total_capacity;
6023 unsigned long avg_load;
6024
6025 struct sg_lb_stats busiest_stat;
6026 struct sg_lb_stats local_stat;
6027};
6028
6029static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6030{
6031
6032
6033
6034
6035
6036
6037 *sds = (struct sd_lb_stats){
6038 .busiest = NULL,
6039 .local = NULL,
6040 .total_load = 0UL,
6041 .total_capacity = 0UL,
6042 .busiest_stat = {
6043 .avg_load = 0UL,
6044 .sum_nr_running = 0,
6045 .group_type = group_other,
6046 },
6047 };
6048}
6049
6050
6051
6052
6053
6054
6055
6056
6057static inline int get_sd_load_idx(struct sched_domain *sd,
6058 enum cpu_idle_type idle)
6059{
6060 int load_idx;
6061
6062 switch (idle) {
6063 case CPU_NOT_IDLE:
6064 load_idx = sd->busy_idx;
6065 break;
6066
6067 case CPU_NEWLY_IDLE:
6068 load_idx = sd->newidle_idx;
6069 break;
6070 default:
6071 load_idx = sd->idle_idx;
6072 break;
6073 }
6074
6075 return load_idx;
6076}
6077
6078static unsigned long scale_rt_capacity(int cpu)
6079{
6080 struct rq *rq = cpu_rq(cpu);
6081 u64 total, used, age_stamp, avg;
6082 s64 delta;
6083
6084
6085
6086
6087
6088 age_stamp = READ_ONCE(rq->age_stamp);
6089 avg = READ_ONCE(rq->rt_avg);
6090 delta = __rq_clock_broken(rq) - age_stamp;
6091
6092 if (unlikely(delta < 0))
6093 delta = 0;
6094
6095 total = sched_avg_period() + delta;
6096
6097 used = div_u64(avg, total);
6098
6099 if (likely(used < SCHED_CAPACITY_SCALE))
6100 return SCHED_CAPACITY_SCALE - used;
6101
6102 return 1;
6103}
6104
6105static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6106{
6107 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6108 struct sched_group *sdg = sd->groups;
6109
6110 cpu_rq(cpu)->cpu_capacity_orig = capacity;
6111
6112 capacity *= scale_rt_capacity(cpu);
6113 capacity >>= SCHED_CAPACITY_SHIFT;
6114
6115 if (!capacity)
6116 capacity = 1;
6117
6118 cpu_rq(cpu)->cpu_capacity = capacity;
6119 sdg->sgc->capacity = capacity;
6120}
6121
6122void update_group_capacity(struct sched_domain *sd, int cpu)
6123{
6124 struct sched_domain *child = sd->child;
6125 struct sched_group *group, *sdg = sd->groups;
6126 unsigned long capacity;
6127 unsigned long interval;
6128
6129 interval = msecs_to_jiffies(sd->balance_interval);
6130 interval = clamp(interval, 1UL, max_load_balance_interval);
6131 sdg->sgc->next_update = jiffies + interval;
6132
6133 if (!child) {
6134 update_cpu_capacity(sd, cpu);
6135 return;
6136 }
6137
6138 capacity = 0;
6139
6140 if (child->flags & SD_OVERLAP) {
6141
6142
6143
6144
6145
6146 for_each_cpu(cpu, sched_group_cpus(sdg)) {
6147 struct sched_group_capacity *sgc;
6148 struct rq *rq = cpu_rq(cpu);
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161 if (unlikely(!rq->sd)) {
6162 capacity += capacity_of(cpu);
6163 continue;
6164 }
6165
6166 sgc = rq->sd->groups->sgc;
6167 capacity += sgc->capacity;
6168 }
6169 } else {
6170
6171
6172
6173
6174
6175 group = child->groups;
6176 do {
6177 capacity += group->sgc->capacity;
6178 group = group->next;
6179 } while (group != child->groups);
6180 }
6181
6182 sdg->sgc->capacity = capacity;
6183}
6184
6185
6186
6187
6188
6189
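/*
 * check_cpu_capacity() - true if the rq's remaining capacity is
 * noticeably below its original capacity (e.g. due to time consumed
 * by RT tasks), using sd->imbalance_pct as the margin. For example,
 * with imbalance_pct == 125 a CPU whose original capacity is 1024 is
 * flagged once cpu_capacity falls below roughly 819, i.e. once more
 * than about 20% of it is unavailable to CFS.
 */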
6190static inline int
6191check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6192{
6193 return ((rq->cpu_capacity * sd->imbalance_pct) <
6194 (rq->cpu_capacity_orig * 100));
6195}
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226static inline int sg_imbalanced(struct sched_group *group)
6227{
6228 return group->sgc->imbalance;
6229}
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
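/*
 * group_has_capacity() - true if the group has spare capacity: fewer
 * running tasks than CPUs, or group_capacity * 100 still exceeding
 * group_util * imbalance_pct.
 */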
6243static inline bool
6244group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6245{
6246 if (sgs->sum_nr_running < sgs->group_weight)
6247 return true;
6248
6249 if ((sgs->group_capacity * 100) >
6250 (sgs->group_util * env->sd->imbalance_pct))
6251 return true;
6252
6253 return false;
6254}
6255
6256
6257
6258
6259
6260
6261
6262
6263
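/*
 * group_is_overloaded() - true if the group has more tasks than CPUs
 * and group_util * imbalance_pct exceeds 100 * group_capacity.
 */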
6264static inline bool
6265group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6266{
6267 if (sgs->sum_nr_running <= sgs->group_weight)
6268 return false;
6269
6270 if ((sgs->group_capacity * 100) <
6271 (sgs->group_util * env->sd->imbalance_pct))
6272 return true;
6273
6274 return false;
6275}
6276
6277static inline enum
6278group_type group_classify(struct sched_group *group,
6279 struct sg_lb_stats *sgs)
6280{
6281 if (sgs->group_no_capacity)
6282 return group_overloaded;
6283
6284 if (sg_imbalanced(group))
6285 return group_imbalanced;
6286
6287 return group_other;
6288}
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
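/*
 * update_sg_lb_stats() - fill in @sgs with load, utilization, task
 * counts and capacity for @group, and set *overload when any CPU in
 * the group runs more than one task.
 */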
6299static inline void update_sg_lb_stats(struct lb_env *env,
6300 struct sched_group *group, int load_idx,
6301 int local_group, struct sg_lb_stats *sgs,
6302 bool *overload)
6303{
6304 unsigned long load;
6305 int i;
6306
6307 memset(sgs, 0, sizeof(*sgs));
6308
6309 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6310 struct rq *rq = cpu_rq(i);
6311
6312
6313 if (local_group)
6314 load = target_load(i, load_idx);
6315 else
6316 load = source_load(i, load_idx);
6317
6318 sgs->group_load += load;
6319 sgs->group_util += cpu_util(i);
6320 sgs->sum_nr_running += rq->cfs.h_nr_running;
6321
6322 if (rq->nr_running > 1)
6323 *overload = true;
6324
6325#ifdef CONFIG_NUMA_BALANCING
6326 sgs->nr_numa_running += rq->nr_numa_running;
6327 sgs->nr_preferred_running += rq->nr_preferred_running;
6328#endif
6329 sgs->sum_weighted_load += weighted_cpuload(i);
6330 if (idle_cpu(i))
6331 sgs->idle_cpus++;
6332 }
6333
6334
6335 sgs->group_capacity = group->sgc->capacity;
6336 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6337
6338 if (sgs->sum_nr_running)
6339 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6340
6341 sgs->group_weight = group->group_weight;
6342
6343 sgs->group_no_capacity = group_is_overloaded(env, sgs);
6344 sgs->group_type = group_classify(group, sgs);
6345}
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
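/*
 * update_sd_pick_busiest() - return true if @sg should replace the
 * currently recorded busiest group: a worse group_type always wins;
 * otherwise a higher avg_load does, with extra rules on
 * SD_ASYM_PACKING domains that prefer packing onto lower-numbered
 * CPUs.
 */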
6360static bool update_sd_pick_busiest(struct lb_env *env,
6361 struct sd_lb_stats *sds,
6362 struct sched_group *sg,
6363 struct sg_lb_stats *sgs)
6364{
6365 struct sg_lb_stats *busiest = &sds->busiest_stat;
6366
6367 if (sgs->group_type > busiest->group_type)
6368 return true;
6369
6370 if (sgs->group_type < busiest->group_type)
6371 return false;
6372
6373 if (sgs->avg_load <= busiest->avg_load)
6374 return false;
6375
6376
6377 if (!(env->sd->flags & SD_ASYM_PACKING))
6378 return true;
6379
6380
6381
6382
6383
6384
6385 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6386 if (!sds->busiest)
6387 return true;
6388
6389 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
6390 return true;
6391 }
6392
6393 return false;
6394}
6395
6396#ifdef CONFIG_NUMA_BALANCING
6397static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6398{
6399 if (sgs->sum_nr_running > sgs->nr_numa_running)
6400 return regular;
6401 if (sgs->sum_nr_running > sgs->nr_preferred_running)
6402 return remote;
6403 return all;
6404}
6405
6406static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6407{
6408 if (rq->nr_running > rq->nr_numa_running)
6409 return regular;
6410 if (rq->nr_running > rq->nr_preferred_running)
6411 return remote;
6412 return all;
6413}
6414#else
6415static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6416{
6417 return all;
6418}
6419
6420static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6421{
6422 return regular;
6423}
6424#endif
6425
6426
6427
6428
6429
6430
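/*
 * update_sd_lb_stats() - walk every group in env->sd, collect
 * per-group statistics, remember the busiest group, and accumulate
 * domain-wide load and capacity totals in @sds.
 */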
6431static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6432{
6433 struct sched_domain *child = env->sd->child;
6434 struct sched_group *sg = env->sd->groups;
6435 struct sg_lb_stats tmp_sgs;
6436 int load_idx, prefer_sibling = 0;
6437 bool overload = false;
6438
6439 if (child && child->flags & SD_PREFER_SIBLING)
6440 prefer_sibling = 1;
6441
6442 load_idx = get_sd_load_idx(env->sd, env->idle);
6443
6444 do {
6445 struct sg_lb_stats *sgs = &tmp_sgs;
6446 int local_group;
6447
6448 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6449 if (local_group) {
6450 sds->local = sg;
6451 sgs = &sds->local_stat;
6452
6453 if (env->idle != CPU_NEWLY_IDLE ||
6454 time_after_eq(jiffies, sg->sgc->next_update))
6455 update_group_capacity(env->sd, env->dst_cpu);
6456 }
6457
6458 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6459 &overload);
6460
6461 if (local_group)
6462 goto next_group;
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474 if (prefer_sibling && sds->local &&
6475 group_has_capacity(env, &sds->local_stat) &&
6476 (sgs->sum_nr_running > 1)) {
6477 sgs->group_no_capacity = 1;
6478 sgs->group_type = group_classify(sg, sgs);
6479 }
6480
6481 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6482 sds->busiest = sg;
6483 sds->busiest_stat = *sgs;
6484 }
6485
6486next_group:
6487
6488 sds->total_load += sgs->group_load;
6489 sds->total_capacity += sgs->group_capacity;
6490
6491 sg = sg->next;
6492 } while (sg != env->sd->groups);
6493
6494 if (env->sd->flags & SD_NUMA)
6495 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6496
6497 if (!env->sd->parent) {
6498
6499 if (env->dst_rq->rd->overload != overload)
6500 env->dst_rq->rd->overload = overload;
6501 }
6502
6503}
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
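/*
 * check_asym_packing() - on SD_ASYM_PACKING domains, if the busiest
 * group's first CPU is not below env->dst_cpu, force an imbalance of
 * roughly that group's load so it gets packed towards the
 * lower-numbered CPU. Returns 1 when such an imbalance was set.
 */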
6528static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6529{
6530 int busiest_cpu;
6531
6532 if (!(env->sd->flags & SD_ASYM_PACKING))
6533 return 0;
6534
6535 if (!sds->busiest)
6536 return 0;
6537
6538 busiest_cpu = group_first_cpu(sds->busiest);
6539 if (env->dst_cpu > busiest_cpu)
6540 return 0;
6541
6542 env->imbalance = DIV_ROUND_CLOSEST(
6543 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6544 SCHED_CAPACITY_SCALE);
6545
6546 return 1;
6547}
6548
6549
6550
6551
6552
6553
6554
6555
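/*
 * fix_small_imbalance() - when the computed imbalance is smaller than
 * one task's load, decide whether moving a single task would still
 * increase the total capacity put to use; if so, set the imbalance to
 * one load_per_task.
 */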
6556static inline
6557void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6558{
6559 unsigned long tmp, capa_now = 0, capa_move = 0;
6560 unsigned int imbn = 2;
6561 unsigned long scaled_busy_load_per_task;
6562 struct sg_lb_stats *local, *busiest;
6563
6564 local = &sds->local_stat;
6565 busiest = &sds->busiest_stat;
6566
6567 if (!local->sum_nr_running)
6568 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6569 else if (busiest->load_per_task > local->load_per_task)
6570 imbn = 1;
6571
6572 scaled_busy_load_per_task =
6573 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6574 busiest->group_capacity;
6575
6576 if (busiest->avg_load + scaled_busy_load_per_task >=
6577 local->avg_load + (scaled_busy_load_per_task * imbn)) {
6578 env->imbalance = busiest->load_per_task;
6579 return;
6580 }
6581
6582
6583
6584
6585
6586
6587
6588 capa_now += busiest->group_capacity *
6589 min(busiest->load_per_task, busiest->avg_load);
6590 capa_now += local->group_capacity *
6591 min(local->load_per_task, local->avg_load);
6592 capa_now /= SCHED_CAPACITY_SCALE;
6593
6594
6595 if (busiest->avg_load > scaled_busy_load_per_task) {
6596 capa_move += busiest->group_capacity *
6597 min(busiest->load_per_task,
6598 busiest->avg_load - scaled_busy_load_per_task);
6599 }
6600
6601
6602 if (busiest->avg_load * busiest->group_capacity <
6603 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6604 tmp = (busiest->avg_load * busiest->group_capacity) /
6605 local->group_capacity;
6606 } else {
6607 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6608 local->group_capacity;
6609 }
6610 capa_move += local->group_capacity *
6611 min(local->load_per_task, local->avg_load + tmp);
6612 capa_move /= SCHED_CAPACITY_SCALE;
6613
6614
6615 if (capa_move > capa_now)
6616 env->imbalance = busiest->load_per_task;
6617}
6618
6619
6620
6621
6622
6623
6624
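/*
 * calculate_imbalance() - compute how much weighted load should move
 * from the busiest group to the local group, capped both by how far
 * the busiest group is above the domain average and by how far the
 * local group is below it; fall back to fix_small_imbalance() when
 * the result would be smaller than one task's load.
 */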
6625static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6626{
6627 unsigned long max_pull, load_above_capacity = ~0UL;
6628 struct sg_lb_stats *local, *busiest;
6629
6630 local = &sds->local_stat;
6631 busiest = &sds->busiest_stat;
6632
6633 if (busiest->group_type == group_imbalanced) {
6634
6635
6636
6637
6638 busiest->load_per_task =
6639 min(busiest->load_per_task, sds->avg_load);
6640 }
6641
6642
6643
6644
6645
6646
6647 if (busiest->avg_load <= sds->avg_load ||
6648 local->avg_load >= sds->avg_load) {
6649 env->imbalance = 0;
6650 return fix_small_imbalance(env, sds);
6651 }
6652
6653
6654
6655
6656 if (busiest->group_type == group_overloaded &&
6657 local->group_type == group_overloaded) {
6658 load_above_capacity = busiest->sum_nr_running *
6659 SCHED_LOAD_SCALE;
6660 if (load_above_capacity > busiest->group_capacity)
6661 load_above_capacity -= busiest->group_capacity;
6662 else
6663 load_above_capacity = ~0UL;
6664 }
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6675
6676
6677 env->imbalance = min(
6678 max_pull * busiest->group_capacity,
6679 (sds->avg_load - local->avg_load) * local->group_capacity
6680 ) / SCHED_CAPACITY_SCALE;
6681
6682
6683
6684
6685
6686
6687
6688 if (env->imbalance < busiest->load_per_task)
6689 return fix_small_imbalance(env, sds);
6690}
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
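/*
 * find_busiest_group() - if the domain is sufficiently imbalanced,
 * return the busiest group and leave the amount of weighted load to
 * move in env->imbalance; return NULL (and a zero imbalance) when the
 * domain is already balanced.
 */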
6711static struct sched_group *find_busiest_group(struct lb_env *env)
6712{
6713 struct sg_lb_stats *local, *busiest;
6714 struct sd_lb_stats sds;
6715
6716 init_sd_lb_stats(&sds);
6717
6718
6719
6720
6721
6722 update_sd_lb_stats(env, &sds);
6723 local = &sds.local_stat;
6724 busiest = &sds.busiest_stat;
6725
6726
6727 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6728 check_asym_packing(env, &sds))
6729 return sds.busiest;
6730
6731
6732 if (!sds.busiest || busiest->sum_nr_running == 0)
6733 goto out_balanced;
6734
6735 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6736 / sds.total_capacity;
6737
6738
6739
6740
6741
6742
6743 if (busiest->group_type == group_imbalanced)
6744 goto force_balance;
6745
6746
6747 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
6748 busiest->group_no_capacity)
6749 goto force_balance;
6750
6751
6752
6753
6754
6755 if (local->avg_load >= busiest->avg_load)
6756 goto out_balanced;
6757
6758
6759
6760
6761
6762 if (local->avg_load >= sds.avg_load)
6763 goto out_balanced;
6764
6765 if (env->idle == CPU_IDLE) {
6766
6767
6768
6769
6770
6771
6772
6773 if ((busiest->group_type != group_overloaded) &&
6774 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6775 goto out_balanced;
6776 } else {
6777
6778
6779
6780
6781 if (100 * busiest->avg_load <=
6782 env->sd->imbalance_pct * local->avg_load)
6783 goto out_balanced;
6784 }
6785
6786force_balance:
6787
6788 calculate_imbalance(env, &sds);
6789 return sds.busiest;
6790
6791out_balanced:
6792 env->imbalance = 0;
6793 return NULL;
6794}
6795
6796
6797
6798
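/*
 * find_busiest_queue() - find the busiest runqueue among the CPUs in
 * @group, comparing load relative to capacity (wl/capacity) via a
 * cross-multiplication to avoid divisions.
 */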
6799static struct rq *find_busiest_queue(struct lb_env *env,
6800 struct sched_group *group)
6801{
6802 struct rq *busiest = NULL, *rq;
6803 unsigned long busiest_load = 0, busiest_capacity = 1;
6804 int i;
6805
6806 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6807 unsigned long capacity, wl;
6808 enum fbq_type rt;
6809
6810 rq = cpu_rq(i);
6811 rt = fbq_classify_rq(rq);
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832 if (rt > env->fbq_type)
6833 continue;
6834
6835 capacity = capacity_of(i);
6836
6837 wl = weighted_cpuload(i);
6838
6839
6840
6841
6842
6843
6844 if (rq->nr_running == 1 && wl > env->imbalance &&
6845 !check_cpu_capacity(rq, env->sd))
6846 continue;
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859 if (wl * busiest_capacity > busiest_load * capacity) {
6860 busiest_load = wl;
6861 busiest_capacity = capacity;
6862 busiest = rq;
6863 }
6864 }
6865
6866 return busiest;
6867}
6868
6869
6870
6871
6872
6873#define MAX_PINNED_INTERVAL 512
6874
6875
6876DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6877
6878static int need_active_balance(struct lb_env *env)
6879{
6880 struct sched_domain *sd = env->sd;
6881
6882 if (env->idle == CPU_NEWLY_IDLE) {
6883
6884
6885
6886
6887
6888
6889 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6890 return 1;
6891 }
6892
6893
6894
6895
6896
6897
6898
6899 if ((env->idle != CPU_NOT_IDLE) &&
6900 (env->src_rq->cfs.h_nr_running == 1)) {
6901 if ((check_cpu_capacity(env->src_rq, sd)) &&
6902 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
6903 return 1;
6904 }
6905
6906 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6907}
6908
6909static int active_load_balance_cpu_stop(void *data);
6910
6911static int should_we_balance(struct lb_env *env)
6912{
6913 struct sched_group *sg = env->sd->groups;
6914 struct cpumask *sg_cpus, *sg_mask;
6915 int cpu, balance_cpu = -1;
6916
6917
6918
6919
6920
6921 if (env->idle == CPU_NEWLY_IDLE)
6922 return 1;
6923
6924 sg_cpus = sched_group_cpus(sg);
6925 sg_mask = sched_group_mask(sg);
6926
6927 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6928 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6929 continue;
6930
6931 balance_cpu = cpu;
6932 break;
6933 }
6934
6935 if (balance_cpu == -1)
6936 balance_cpu = group_balance_cpu(sg);
6937
6938
6939
6940
6941
6942 return balance_cpu == env->dst_cpu;
6943}
6944
6945
6946
6947
6948
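/*
 * load_balance() - check this_cpu's domain for an imbalance and, if
 * one exists, try to move tasks from the busiest runqueue to this_rq,
 * possibly resorting to active balancing. Returns the number of tasks
 * moved.
 */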
6949static int load_balance(int this_cpu, struct rq *this_rq,
6950 struct sched_domain *sd, enum cpu_idle_type idle,
6951 int *continue_balancing)
6952{
6953 int ld_moved, cur_ld_moved, active_balance = 0;
6954 struct sched_domain *sd_parent = sd->parent;
6955 struct sched_group *group;
6956 struct rq *busiest;
6957 unsigned long flags;
6958 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
6959
6960 struct lb_env env = {
6961 .sd = sd,
6962 .dst_cpu = this_cpu,
6963 .dst_rq = this_rq,
6964 .dst_grpmask = sched_group_cpus(sd->groups),
6965 .idle = idle,
6966 .loop_break = sched_nr_migrate_break,
6967 .cpus = cpus,
6968 .fbq_type = all,
6969 .tasks = LIST_HEAD_INIT(env.tasks),
6970 };
6971
6972
6973
6974
6975
6976 if (idle == CPU_NEWLY_IDLE)
6977 env.dst_grpmask = NULL;
6978
6979 cpumask_copy(cpus, cpu_active_mask);
6980
6981 schedstat_inc(sd, lb_count[idle]);
6982
6983redo:
6984 if (!should_we_balance(&env)) {
6985 *continue_balancing = 0;
6986 goto out_balanced;
6987 }
6988
6989 group = find_busiest_group(&env);
6990 if (!group) {
6991 schedstat_inc(sd, lb_nobusyg[idle]);
6992 goto out_balanced;
6993 }
6994
6995 busiest = find_busiest_queue(&env, group);
6996 if (!busiest) {
6997 schedstat_inc(sd, lb_nobusyq[idle]);
6998 goto out_balanced;
6999 }
7000
7001 BUG_ON(busiest == env.dst_rq);
7002
7003 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7004
7005 env.src_cpu = busiest->cpu;
7006 env.src_rq = busiest;
7007
7008 ld_moved = 0;
7009 if (busiest->nr_running > 1) {
7010
7011
7012
7013
7014
7015
7016 env.flags |= LBF_ALL_PINNED;
7017 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
7018
7019more_balance:
7020 raw_spin_lock_irqsave(&busiest->lock, flags);
7021
7022
7023
7024
7025
7026 cur_ld_moved = detach_tasks(&env);
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036 raw_spin_unlock(&busiest->lock);
7037
7038 if (cur_ld_moved) {
7039 attach_tasks(&env);
7040 ld_moved += cur_ld_moved;
7041 }
7042
7043 local_irq_restore(flags);
7044
7045 if (env.flags & LBF_NEED_BREAK) {
7046 env.flags &= ~LBF_NEED_BREAK;
7047 goto more_balance;
7048 }
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7070
7071
7072 cpumask_clear_cpu(env.dst_cpu, env.cpus);
7073
7074 env.dst_rq = cpu_rq(env.new_dst_cpu);
7075 env.dst_cpu = env.new_dst_cpu;
7076 env.flags &= ~LBF_DST_PINNED;
7077 env.loop = 0;
7078 env.loop_break = sched_nr_migrate_break;
7079
7080
7081
7082
7083
7084 goto more_balance;
7085 }
7086
7087
7088
7089
7090 if (sd_parent) {
7091 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7092
7093 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7094 *group_imbalance = 1;
7095 }
7096
7097
7098 if (unlikely(env.flags & LBF_ALL_PINNED)) {
7099 cpumask_clear_cpu(cpu_of(busiest), cpus);
7100 if (!cpumask_empty(cpus)) {
7101 env.loop = 0;
7102 env.loop_break = sched_nr_migrate_break;
7103 goto redo;
7104 }
7105 goto out_all_pinned;
7106 }
7107 }
7108
7109 if (!ld_moved) {
7110 schedstat_inc(sd, lb_failed[idle]);
7111
7112
7113
7114
7115
7116
7117 if (idle != CPU_NEWLY_IDLE)
7118 sd->nr_balance_failed++;
7119
7120 if (need_active_balance(&env)) {
7121 raw_spin_lock_irqsave(&busiest->lock, flags);
7122
7123
7124
7125
7126
7127 if (!cpumask_test_cpu(this_cpu,
7128 tsk_cpus_allowed(busiest->curr))) {
7129 raw_spin_unlock_irqrestore(&busiest->lock,
7130 flags);
7131 env.flags |= LBF_ALL_PINNED;
7132 goto out_one_pinned;
7133 }
7134
7135
7136
7137
7138
7139
7140 if (!busiest->active_balance) {
7141 busiest->active_balance = 1;
7142 busiest->push_cpu = this_cpu;
7143 active_balance = 1;
7144 }
7145 raw_spin_unlock_irqrestore(&busiest->lock, flags);
7146
7147 if (active_balance) {
7148 stop_one_cpu_nowait(cpu_of(busiest),
7149 active_load_balance_cpu_stop, busiest,
7150 &busiest->active_balance_work);
7151 }
7152
7153
7154
7155
7156
7157 sd->nr_balance_failed = sd->cache_nice_tries+1;
7158 }
7159 } else
7160 sd->nr_balance_failed = 0;
7161
7162 if (likely(!active_balance)) {
7163
7164 sd->balance_interval = sd->min_interval;
7165 } else {
7166
7167
7168
7169
7170
7171
7172 if (sd->balance_interval < sd->max_interval)
7173 sd->balance_interval *= 2;
7174 }
7175
7176 goto out;
7177
7178out_balanced:
7179
7180
7181
7182
7183 if (sd_parent) {
7184 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7185
7186 if (*group_imbalance)
7187 *group_imbalance = 0;
7188 }
7189
7190out_all_pinned:
7191
7192
7193
7194
7195
7196 schedstat_inc(sd, lb_balanced[idle]);
7197
7198 sd->nr_balance_failed = 0;
7199
7200out_one_pinned:
7201
7202 if (((env.flags & LBF_ALL_PINNED) &&
7203 sd->balance_interval < MAX_PINNED_INTERVAL) ||
7204 (sd->balance_interval < sd->max_interval))
7205 sd->balance_interval *= 2;
7206
7207 ld_moved = 0;
7208out:
7209 return ld_moved;
7210}
7211
7212static inline unsigned long
7213get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7214{
7215 unsigned long interval = sd->balance_interval;
7216
7217 if (cpu_busy)
7218 interval *= sd->busy_factor;
7219
7220
7221 interval = msecs_to_jiffies(interval);
7222 interval = clamp(interval, 1UL, max_load_balance_interval);
7223
7224 return interval;
7225}
7226
7227static inline void
7228update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7229{
7230 unsigned long interval, next;
7231
7232 interval = get_sd_balance_interval(sd, cpu_busy);
7233 next = sd->last_balance + interval;
7234
7235 if (time_after(*next_balance, next))
7236 *next_balance = next;
7237}
7238
7239
7240
7241
7242
7243static int idle_balance(struct rq *this_rq)
7244{
7245 unsigned long next_balance = jiffies + HZ;
7246 int this_cpu = this_rq->cpu;
7247 struct sched_domain *sd;
7248 int pulled_task = 0;
7249 u64 curr_cost = 0;
7250
7251 idle_enter_fair(this_rq);
7252
7253
7254
7255
7256
7257 this_rq->idle_stamp = rq_clock(this_rq);
7258
7259 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7260 !this_rq->rd->overload) {
7261 rcu_read_lock();
7262 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7263 if (sd)
7264 update_next_balance(sd, 0, &next_balance);
7265 rcu_read_unlock();
7266
7267 goto out;
7268 }
7269
7270 raw_spin_unlock(&this_rq->lock);
7271
7272 update_blocked_averages(this_cpu);
7273 rcu_read_lock();
7274 for_each_domain(this_cpu, sd) {
7275 int continue_balancing = 1;
7276 u64 t0, domain_cost;
7277
7278 if (!(sd->flags & SD_LOAD_BALANCE))
7279 continue;
7280
7281 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7282 update_next_balance(sd, 0, &next_balance);
7283 break;
7284 }
7285
7286 if (sd->flags & SD_BALANCE_NEWIDLE) {
7287 t0 = sched_clock_cpu(this_cpu);
7288
7289 pulled_task = load_balance(this_cpu, this_rq,
7290 sd, CPU_NEWLY_IDLE,
7291 &continue_balancing);
7292
7293 domain_cost = sched_clock_cpu(this_cpu) - t0;
7294 if (domain_cost > sd->max_newidle_lb_cost)
7295 sd->max_newidle_lb_cost = domain_cost;
7296
7297 curr_cost += domain_cost;
7298 }
7299
7300 update_next_balance(sd, 0, &next_balance);
7301
7302
7303
7304
7305
7306 if (pulled_task || this_rq->nr_running > 0)
7307 break;
7308 }
7309 rcu_read_unlock();
7310
7311 raw_spin_lock(&this_rq->lock);
7312
7313 if (curr_cost > this_rq->max_idle_balance_cost)
7314 this_rq->max_idle_balance_cost = curr_cost;
7315
7316
7317
7318
7319
7320
7321 if (this_rq->cfs.h_nr_running && !pulled_task)
7322 pulled_task = 1;
7323
7324out:
7325
7326 if (time_after(this_rq->next_balance, next_balance))
7327 this_rq->next_balance = next_balance;
7328
7329
7330 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7331 pulled_task = -1;
7332
7333 if (pulled_task) {
7334 idle_exit_fair(this_rq);
7335 this_rq->idle_stamp = 0;
7336 }
7337
7338 return pulled_task;
7339}
7340
7341
7342
7343
7344
7345
7346
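/*
 * active_load_balance_cpu_stop() - run by the CPU stopper on the busy
 * CPU: pushes one running task from busiest_rq towards push_cpu, used
 * when regular balancing could not move anything.
 */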
7347static int active_load_balance_cpu_stop(void *data)
7348{
7349 struct rq *busiest_rq = data;
7350 int busiest_cpu = cpu_of(busiest_rq);
7351 int target_cpu = busiest_rq->push_cpu;
7352 struct rq *target_rq = cpu_rq(target_cpu);
7353 struct sched_domain *sd;
7354 struct task_struct *p = NULL;
7355
7356 raw_spin_lock_irq(&busiest_rq->lock);
7357
7358
7359 if (unlikely(busiest_cpu != smp_processor_id() ||
7360 !busiest_rq->active_balance))
7361 goto out_unlock;
7362
7363
7364 if (busiest_rq->nr_running <= 1)
7365 goto out_unlock;
7366
7367
7368
7369
7370
7371
7372 BUG_ON(busiest_rq == target_rq);
7373
7374
7375 rcu_read_lock();
7376 for_each_domain(target_cpu, sd) {
7377 if ((sd->flags & SD_LOAD_BALANCE) &&
7378 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7379 break;
7380 }
7381
7382 if (likely(sd)) {
7383 struct lb_env env = {
7384 .sd = sd,
7385 .dst_cpu = target_cpu,
7386 .dst_rq = target_rq,
7387 .src_cpu = busiest_rq->cpu,
7388 .src_rq = busiest_rq,
7389 .idle = CPU_IDLE,
7390 };
7391
7392 schedstat_inc(sd, alb_count);
7393
7394 p = detach_one_task(&env);
7395 if (p)
7396 schedstat_inc(sd, alb_pushed);
7397 else
7398 schedstat_inc(sd, alb_failed);
7399 }
7400 rcu_read_unlock();
7401out_unlock:
7402 busiest_rq->active_balance = 0;
7403 raw_spin_unlock(&busiest_rq->lock);
7404
7405 if (p)
7406 attach_one_task(target_rq, p);
7407
7408 local_irq_enable();
7409
7410 return 0;
7411}
7412
7413static inline int on_null_domain(struct rq *rq)
7414{
7415 return unlikely(!rcu_dereference_sched(rq->sd));
7416}
7417
7418#ifdef CONFIG_NO_HZ_COMMON
7419
7420
7421
7422
7423
7424
7425static struct {
7426 cpumask_var_t idle_cpus_mask;
7427 atomic_t nr_cpus;
7428 unsigned long next_balance;
7429} nohz ____cacheline_aligned;
7430
7431static inline int find_new_ilb(void)
7432{
7433 int ilb = cpumask_first(nohz.idle_cpus_mask);
7434
7435 if (ilb < nr_cpu_ids && idle_cpu(ilb))
7436 return ilb;
7437
7438 return nr_cpu_ids;
7439}
7440
7441
7442
7443
7444
7445
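/*
 * nohz_balancer_kick() - pick the first idle CPU from
 * nohz.idle_cpus_mask and send it a reschedule IPI so it performs
 * idle load balancing on behalf of the other nohz-idle CPUs.
 */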
7446static void nohz_balancer_kick(void)
7447{
7448 int ilb_cpu;
7449
7450 nohz.next_balance++;
7451
7452 ilb_cpu = find_new_ilb();
7453
7454 if (ilb_cpu >= nr_cpu_ids)
7455 return;
7456
7457 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7458 return;
7459
7460
7461
7462
7463
7464
7465 smp_send_reschedule(ilb_cpu);
7466 return;
7467}
7468
7469static inline void nohz_balance_exit_idle(int cpu)
7470{
7471 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7472
7473
7474
7475 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7476 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7477 atomic_dec(&nohz.nr_cpus);
7478 }
7479 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7480 }
7481}
7482
7483static inline void set_cpu_sd_state_busy(void)
7484{
7485 struct sched_domain *sd;
7486 int cpu = smp_processor_id();
7487
7488 rcu_read_lock();
7489 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7490
7491 if (!sd || !sd->nohz_idle)
7492 goto unlock;
7493 sd->nohz_idle = 0;
7494
7495 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7496unlock:
7497 rcu_read_unlock();
7498}
7499
7500void set_cpu_sd_state_idle(void)
7501{
7502 struct sched_domain *sd;
7503 int cpu = smp_processor_id();
7504
7505 rcu_read_lock();
7506 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7507
7508 if (!sd || sd->nohz_idle)
7509 goto unlock;
7510 sd->nohz_idle = 1;
7511
7512 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7513unlock:
7514 rcu_read_unlock();
7515}
7516
7517
7518
7519
7520
7521void nohz_balance_enter_idle(int cpu)
7522{
7523
7524
7525
7526 if (!cpu_active(cpu))
7527 return;
7528
7529 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7530 return;
7531
7532
7533
7534
7535 if (on_null_domain(cpu_rq(cpu)))
7536 return;
7537
7538 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7539 atomic_inc(&nohz.nr_cpus);
7540 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7541}
7542
7543static int sched_ilb_notifier(struct notifier_block *nfb,
7544 unsigned long action, void *hcpu)
7545{
7546 switch (action & ~CPU_TASKS_FROZEN) {
7547 case CPU_DYING:
7548 nohz_balance_exit_idle(smp_processor_id());
7549 return NOTIFY_OK;
7550 default:
7551 return NOTIFY_DONE;
7552 }
7553}
7554#endif
7555
7556static DEFINE_SPINLOCK(balancing);
7557
7558
7559
7560
7561
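/*
 * Scale the maximum load-balance interval with the number of online
 * CPUs. For example, with HZ == 1000 and 8 CPUs online, intervals are
 * clamped to at most 800 jiffies (800 ms).
 */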
7562void update_max_interval(void)
7563{
7564 max_load_balance_interval = HZ*num_online_cpus()/10;
7565}
7566
7567
7568
7569
7570
7571
7572
7573static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7574{
7575 int continue_balancing = 1;
7576 int cpu = rq->cpu;
7577 unsigned long interval;
7578 struct sched_domain *sd;
7579
7580 unsigned long next_balance = jiffies + 60*HZ;
7581 int update_next_balance = 0;
7582 int need_serialize, need_decay = 0;
7583 u64 max_cost = 0;
7584
7585 update_blocked_averages(cpu);
7586
7587 rcu_read_lock();
7588 for_each_domain(cpu, sd) {
7589
7590
7591
7592
7593 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
7594 sd->max_newidle_lb_cost =
7595 (sd->max_newidle_lb_cost * 253) / 256;
7596 sd->next_decay_max_lb_cost = jiffies + HZ;
7597 need_decay = 1;
7598 }
7599 max_cost += sd->max_newidle_lb_cost;
7600
7601 if (!(sd->flags & SD_LOAD_BALANCE))
7602 continue;
7603
7604
7605
7606
7607
7608
7609 if (!continue_balancing) {
7610 if (need_decay)
7611 continue;
7612 break;
7613 }
7614
7615 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7616
7617 need_serialize = sd->flags & SD_SERIALIZE;
7618 if (need_serialize) {
7619 if (!spin_trylock(&balancing))
7620 goto out;
7621 }
7622
7623 if (time_after_eq(jiffies, sd->last_balance + interval)) {
7624 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
7625
7626
7627
7628
7629
7630 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7631 }
7632 sd->last_balance = jiffies;
7633 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7634 }
7635 if (need_serialize)
7636 spin_unlock(&balancing);
7637out:
7638 if (time_after(next_balance, sd->last_balance + interval)) {
7639 next_balance = sd->last_balance + interval;
7640 update_next_balance = 1;
7641 }
7642 }
7643 if (need_decay) {
7644
7645
7646
7647
7648 rq->max_idle_balance_cost =
7649 max((u64)sysctl_sched_migration_cost, max_cost);
7650 }
7651 rcu_read_unlock();
7652
7653
7654
7655
7656
7657
7658 if (likely(update_next_balance)) {
7659 rq->next_balance = next_balance;
7660
7661#ifdef CONFIG_NO_HZ_COMMON
7662
7663
7664
7665
7666
7667
7668
7669
7670 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
7671 nohz.next_balance = rq->next_balance;
7672#endif
7673 }
7674}
7675
7676#ifdef CONFIG_NO_HZ_COMMON
7677
7678
7679
7680
7681static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7682{
7683 int this_cpu = this_rq->cpu;
7684 struct rq *rq;
7685 int balance_cpu;
7686
7687 unsigned long next_balance = jiffies + 60*HZ;
7688 int update_next_balance = 0;
7689
7690 if (idle != CPU_IDLE ||
7691 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7692 goto end;
7693
7694 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7695 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7696 continue;
7697
7698
7699
7700
7701
7702
7703 if (need_resched())
7704 break;
7705
7706 rq = cpu_rq(balance_cpu);
7707
7708
7709
7710
7711
7712 if (time_after_eq(jiffies, rq->next_balance)) {
7713 raw_spin_lock_irq(&rq->lock);
7714 update_rq_clock(rq);
7715 update_idle_cpu_load(rq);
7716 raw_spin_unlock_irq(&rq->lock);
7717 rebalance_domains(rq, CPU_IDLE);
7718 }
7719
7720 if (time_after(next_balance, rq->next_balance)) {
7721 next_balance = rq->next_balance;
7722 update_next_balance = 1;
7723 }
7724 }
7725
7726
7727
7728
7729
7730
7731 if (likely(update_next_balance))
7732 nohz.next_balance = next_balance;
7733end:
7734 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7735}
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
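/*
 * nohz_kick_needed() - decide whether this busy CPU should kick the
 * nohz idle balancer. Once nohz.next_balance is due and idle nohz
 * CPUs exist, kick when: the rq has two or more tasks, the local busy
 * group has more than one busy CPU, the rq is capacity-pressured
 * while running at least one CFS task, or an SD_ASYM domain has a
 * lower-numbered idle CPU to pack onto.
 */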
7748static inline bool nohz_kick_needed(struct rq *rq)
7749{
7750 unsigned long now = jiffies;
7751 struct sched_domain *sd;
7752 struct sched_group_capacity *sgc;
7753 int nr_busy, cpu = rq->cpu;
7754 bool kick = false;
7755
7756 if (unlikely(rq->idle_balance))
7757 return false;
7758
7759
7760
7761
7762
7763 set_cpu_sd_state_busy();
7764 nohz_balance_exit_idle(cpu);
7765
7766
7767
7768
7769
7770 if (likely(!atomic_read(&nohz.nr_cpus)))
7771 return false;
7772
7773 if (time_before(now, nohz.next_balance))
7774 return false;
7775
7776 if (rq->nr_running >= 2)
7777 return true;
7778
7779 rcu_read_lock();
7780 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7781 if (sd) {
7782 sgc = sd->groups->sgc;
7783 nr_busy = atomic_read(&sgc->nr_busy_cpus);
7784
7785 if (nr_busy > 1) {
7786 kick = true;
7787 goto unlock;
7788 }
7789
7790 }
7791
7792 sd = rcu_dereference(rq->sd);
7793 if (sd) {
7794 if ((rq->cfs.h_nr_running >= 1) &&
7795 check_cpu_capacity(rq, sd)) {
7796 kick = true;
7797 goto unlock;
7798 }
7799 }
7800
7801 sd = rcu_dereference(per_cpu(sd_asym, cpu));
7802 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7803 sched_domain_span(sd)) < cpu)) {
7804 kick = true;
7805 goto unlock;
7806 }
7807
7808unlock:
7809 rcu_read_unlock();
7810 return kick;
7811}
7812#else
7813static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7814#endif
7815
7816
7817
7818
7819
7820static void run_rebalance_domains(struct softirq_action *h)
7821{
7822 struct rq *this_rq = this_rq();
7823 enum cpu_idle_type idle = this_rq->idle_balance ?
7824 CPU_IDLE : CPU_NOT_IDLE;
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834 nohz_idle_balance(this_rq, idle);
7835 rebalance_domains(this_rq, idle);
7836}
7837
7838
7839
7840
7841void trigger_load_balance(struct rq *rq)
7842{
7843
7844 if (unlikely(on_null_domain(rq)))
7845 return;
7846
7847 if (time_after_eq(jiffies, rq->next_balance))
7848 raise_softirq(SCHED_SOFTIRQ);
7849#ifdef CONFIG_NO_HZ_COMMON
7850 if (nohz_kick_needed(rq))
7851 nohz_balancer_kick();
7852#endif
7853}
7854
7855static void rq_online_fair(struct rq *rq)
7856{
7857 update_sysctl();
7858
7859 update_runtime_enabled(rq);
7860}
7861
7862static void rq_offline_fair(struct rq *rq)
7863{
7864 update_sysctl();
7865
7866
7867 unthrottle_offline_cfs_rqs(rq);
7868}
7869
7870#endif
7871
7872
7873
7874
7875static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7876{
7877 struct cfs_rq *cfs_rq;
7878 struct sched_entity *se = &curr->se;
7879
7880 for_each_sched_entity(se) {
7881 cfs_rq = cfs_rq_of(se);
7882 entity_tick(cfs_rq, se, queued);
7883 }
7884
7885 if (static_branch_unlikely(&sched_numa_balancing))
7886 task_tick_numa(rq, curr);
7887}
7888
7889
7890
7891
7892
7893
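/*
 * task_fork_fair() - scheduler hook for a newly forked task: place
 * the child's vruntime next to the parent's and, if
 * sysctl_sched_child_runs_first is set and the parent would otherwise
 * run first, swap their vruntimes so the child runs first.
 */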
7894static void task_fork_fair(struct task_struct *p)
7895{
7896 struct cfs_rq *cfs_rq;
7897 struct sched_entity *se = &p->se, *curr;
7898 int this_cpu = smp_processor_id();
7899 struct rq *rq = this_rq();
7900 unsigned long flags;
7901
7902 raw_spin_lock_irqsave(&rq->lock, flags);
7903
7904 update_rq_clock(rq);
7905
7906 cfs_rq = task_cfs_rq(current);
7907 curr = cfs_rq->curr;
7908
7909
7910
7911
7912
7913
7914
7915 rcu_read_lock();
7916 __set_task_cpu(p, this_cpu);
7917 rcu_read_unlock();
7918
7919 update_curr(cfs_rq);
7920
7921 if (curr)
7922 se->vruntime = curr->vruntime;
7923 place_entity(cfs_rq, se, 1);
7924
7925 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7926
7927
7928
7929
7930 swap(curr->vruntime, se->vruntime);
7931 resched_curr(rq);
7932 }
7933
7934 se->vruntime -= cfs_rq->min_vruntime;
7935
7936 raw_spin_unlock_irqrestore(&rq->lock, flags);
7937}
7938
7939
7940
7941
7942
7943static void
7944prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7945{
7946 if (!task_on_rq_queued(p))
7947 return;
7948
7949
7950
7951
7952
7953
7954 if (rq->curr == p) {
7955 if (p->prio > oldprio)
7956 resched_curr(rq);
7957 } else
7958 check_preempt_curr(rq, p, 0);
7959}
7960
7961static inline bool vruntime_normalized(struct task_struct *p)
7962{
7963 struct sched_entity *se = &p->se;
7964
7965
7966
7967
7968
7969
7970 if (p->on_rq)
7971 return true;
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
7983 return true;
7984
7985 return false;
7986}
7987
7988static void detach_task_cfs_rq(struct task_struct *p)
7989{
7990 struct sched_entity *se = &p->se;
7991 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7992
7993 if (!vruntime_normalized(p)) {
7994
7995
7996
7997
7998 place_entity(cfs_rq, se, 0);
7999 se->vruntime -= cfs_rq->min_vruntime;
8000 }
8001
8002
8003 detach_entity_load_avg(cfs_rq, se);
8004}
8005
8006static void attach_task_cfs_rq(struct task_struct *p)
8007{
8008 struct sched_entity *se = &p->se;
8009 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8010
8011#ifdef CONFIG_FAIR_GROUP_SCHED
8012
8013
8014
8015
8016 se->depth = se->parent ? se->parent->depth + 1 : 0;
8017#endif
8018
8019
8020 attach_entity_load_avg(cfs_rq, se);
8021
8022 if (!vruntime_normalized(p))
8023 se->vruntime += cfs_rq->min_vruntime;
8024}
8025
8026static void switched_from_fair(struct rq *rq, struct task_struct *p)
8027{
8028 detach_task_cfs_rq(p);
8029}
8030
8031static void switched_to_fair(struct rq *rq, struct task_struct *p)
8032{
8033 attach_task_cfs_rq(p);
8034
8035 if (task_on_rq_queued(p)) {
8036
8037
8038
8039
8040
8041 if (rq->curr == p)
8042 resched_curr(rq);
8043 else
8044 check_preempt_curr(rq, p, 0);
8045 }
8046}
8047
8048
8049
8050
8051
8052
8053static void set_curr_task_fair(struct rq *rq)
8054{
8055 struct sched_entity *se = &rq->curr->se;
8056
8057 for_each_sched_entity(se) {
8058 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8059
8060 set_next_entity(cfs_rq, se);
8061
8062 account_cfs_rq_runtime(cfs_rq, 0);
8063 }
8064}
8065
8066void init_cfs_rq(struct cfs_rq *cfs_rq)
8067{
8068 cfs_rq->tasks_timeline = RB_ROOT;
8069 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8070#ifndef CONFIG_64BIT
8071 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8072#endif
8073#ifdef CONFIG_SMP
8074 atomic_long_set(&cfs_rq->removed_load_avg, 0);
8075 atomic_long_set(&cfs_rq->removed_util_avg, 0);
8076#endif
8077}
8078
8079#ifdef CONFIG_FAIR_GROUP_SCHED
8080static void task_move_group_fair(struct task_struct *p)
8081{
8082 detach_task_cfs_rq(p);
8083 set_task_rq(p, task_cpu(p));
8084
8085#ifdef CONFIG_SMP
8086
8087 p->se.avg.last_update_time = 0;
8088#endif
8089 attach_task_cfs_rq(p);
8090}
8091
8092void free_fair_sched_group(struct task_group *tg)
8093{
8094 int i;
8095
8096 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8097
8098 for_each_possible_cpu(i) {
8099 if (tg->cfs_rq)
8100 kfree(tg->cfs_rq[i]);
8101 if (tg->se) {
8102 if (tg->se[i])
8103 remove_entity_load_avg(tg->se[i]);
8104 kfree(tg->se[i]);
8105 }
8106 }
8107
8108 kfree(tg->cfs_rq);
8109 kfree(tg->se);
8110}
8111
8112int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8113{
8114 struct cfs_rq *cfs_rq;
8115 struct sched_entity *se;
8116 int i;
8117
8118 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8119 if (!tg->cfs_rq)
8120 goto err;
8121 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8122 if (!tg->se)
8123 goto err;
8124
8125 tg->shares = NICE_0_LOAD;
8126
8127 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8128
8129 for_each_possible_cpu(i) {
8130 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8131 GFP_KERNEL, cpu_to_node(i));
8132 if (!cfs_rq)
8133 goto err;
8134
8135 se = kzalloc_node(sizeof(struct sched_entity),
8136 GFP_KERNEL, cpu_to_node(i));
8137 if (!se)
8138 goto err_free_rq;
8139
8140 init_cfs_rq(cfs_rq);
8141 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8142 init_entity_runnable_average(se);
8143 }
8144
8145 return 1;
8146
8147err_free_rq:
8148 kfree(cfs_rq);
8149err:
8150 return 0;
8151}
8152
8153void unregister_fair_sched_group(struct task_group *tg, int cpu)
8154{
8155 struct rq *rq = cpu_rq(cpu);
8156 unsigned long flags;
8157
8158
8159
8160
8161
8162 if (!tg->cfs_rq[cpu]->on_list)
8163 return;
8164
8165 raw_spin_lock_irqsave(&rq->lock, flags);
8166 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8167 raw_spin_unlock_irqrestore(&rq->lock, flags);
8168}
8169
8170void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8171 struct sched_entity *se, int cpu,
8172 struct sched_entity *parent)
8173{
8174 struct rq *rq = cpu_rq(cpu);
8175
8176 cfs_rq->tg = tg;
8177 cfs_rq->rq = rq;
8178 init_cfs_rq_runtime(cfs_rq);
8179
8180 tg->cfs_rq[cpu] = cfs_rq;
8181 tg->se[cpu] = se;
8182
8183
8184 if (!se)
8185 return;
8186
8187 if (!parent) {
8188 se->cfs_rq = &rq->cfs;
8189 se->depth = 0;
8190 } else {
8191 se->cfs_rq = parent->my_q;
8192 se->depth = parent->depth + 1;
8193 }
8194
8195 se->my_q = cfs_rq;
8196
8197 update_load_set(&se->load, NICE_0_LOAD);
8198 se->parent = parent;
8199}
8200
8201static DEFINE_MUTEX(shares_mutex);
8202
8203int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8204{
8205 int i;
8206 unsigned long flags;
8207
8208
8209
8210
8211 if (!tg->se[0])
8212 return -EINVAL;
8213
8214 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8215
8216 mutex_lock(&shares_mutex);
8217 if (tg->shares == shares)
8218 goto done;
8219
8220 tg->shares = shares;
8221 for_each_possible_cpu(i) {
8222 struct rq *rq = cpu_rq(i);
8223 struct sched_entity *se;
8224
8225 se = tg->se[i];
8226
8227 raw_spin_lock_irqsave(&rq->lock, flags);
8228
8229
8230 update_rq_clock(rq);
8231 for_each_sched_entity(se)
8232 update_cfs_shares(group_cfs_rq(se));
8233 raw_spin_unlock_irqrestore(&rq->lock, flags);
8234 }
8235
8236done:
8237 mutex_unlock(&shares_mutex);
8238 return 0;
8239}
8240#else
8241
8242void free_fair_sched_group(struct task_group *tg) { }
8243
8244int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8245{
8246 return 1;
8247}
8248
8249void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
8250
8251#endif
8252
8253
8254static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8255{
8256 struct sched_entity *se = &task->se;
8257 unsigned int rr_interval = 0;
8258
8259
8260
8261
8262
8263 if (rq->cfs.load.weight)
8264 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8265
8266 return rr_interval;
8267}
8268
8269
8270
8271
8272const struct sched_class fair_sched_class = {
8273 .next = &idle_sched_class,
8274 .enqueue_task = enqueue_task_fair,
8275 .dequeue_task = dequeue_task_fair,
8276 .yield_task = yield_task_fair,
8277 .yield_to_task = yield_to_task_fair,
8278
8279 .check_preempt_curr = check_preempt_wakeup,
8280
8281 .pick_next_task = pick_next_task_fair,
8282 .put_prev_task = put_prev_task_fair,
8283
8284#ifdef CONFIG_SMP
8285 .select_task_rq = select_task_rq_fair,
8286 .migrate_task_rq = migrate_task_rq_fair,
8287
8288 .rq_online = rq_online_fair,
8289 .rq_offline = rq_offline_fair,
8290
8291 .task_waking = task_waking_fair,
8292 .task_dead = task_dead_fair,
8293 .set_cpus_allowed = set_cpus_allowed_common,
8294#endif
8295
8296 .set_curr_task = set_curr_task_fair,
8297 .task_tick = task_tick_fair,
8298 .task_fork = task_fork_fair,
8299
8300 .prio_changed = prio_changed_fair,
8301 .switched_from = switched_from_fair,
8302 .switched_to = switched_to_fair,
8303
8304 .get_rr_interval = get_rr_interval_fair,
8305
8306 .update_curr = update_curr_fair,
8307
8308#ifdef CONFIG_FAIR_GROUP_SCHED
8309 .task_move_group = task_move_group_fair,
8310#endif
8311};
8312
8313#ifdef CONFIG_SCHED_DEBUG
8314void print_cfs_stats(struct seq_file *m, int cpu)
8315{
8316 struct cfs_rq *cfs_rq;
8317
8318 rcu_read_lock();
8319 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8320 print_cfs_rq(m, cpu, cfs_rq);
8321 rcu_read_unlock();
8322}
8323
8324#ifdef CONFIG_NUMA_BALANCING
8325void show_numa_stats(struct task_struct *p, struct seq_file *m)
8326{
8327 int node;
8328 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8329
8330 for_each_online_node(node) {
8331 if (p->numa_faults) {
8332 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8333 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8334 }
8335 if (p->numa_group) {
8336 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
8337 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8338 }
8339 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8340 }
8341}
8342#endif
8343#endif
8344
8345__init void init_sched_fair_class(void)
8346{
8347#ifdef CONFIG_SMP
8348 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8349
8350#ifdef CONFIG_NO_HZ_COMMON
8351 nohz.next_balance = jiffies;
8352 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8353 cpu_notifier(sched_ilb_notifier, 0);
8354#endif
8355#endif
8356
8357}
8358