#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"
/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog2(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (sysctl_sched_tunable_scaling):
 *
 * 1) none:        not scaled
 * 2) linear:      scaled with the number of CPUs
 * 3) logarithmic: scaled logarithmically with the number of CPUs (default)
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog2(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * Kept equal to sysctl_sched_latency / sysctl_sched_min_granularity.
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * the parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog2(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for
 * shares distribution.
 * (default: 10 msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from the global (task-group) bandwidth
 * pool to a local (per-cpu) pool on each refill.
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
115
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}
133
/*
 * Increase the granularity values when there are more CPUs, because with
 * more CPUs the 'effective latency' visible to users decreases. The
 * relationship is not linear, so by default we scale with log2(ncpus),
 * capped at 8 CPUs.
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}
180
#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}
200
201
202
203
204
205
206
207
208
209
210
211
212
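/*
 * delta_exec * weight / lw->weight is computed below as
 *
 *   (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * with lw->inv_weight ~= 2^WMULT_SHIFT / lw->weight (see __update_inv_weight()
 * above).  The shift loops trim 'fact' back into 32 bits, reducing 'shift' by
 * the same amount, so the final mul_u64_u32_shr() cannot overflow.
 *
 * Rough example (a sketch, not taken from this file): with
 * weight == lw->weight the scale factor is ~2^32 and the result is simply
 * delta_exec; with weight == lw->weight / 2 the result is ~delta_exec / 2.
 */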
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
237
238
239const struct sched_class fair_sched_class;
240
241
242
243
244
245#ifdef CONFIG_FAIR_GROUP_SCHED
246
247
248static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
249{
250 return cfs_rq->rq;
251}
252
253
254#define entity_is_task(se) (!se->my_q)
255
256static inline struct task_struct *task_of(struct sched_entity *se)
257{
258#ifdef CONFIG_SCHED_DEBUG
259 WARN_ON_ONCE(!entity_is_task(se));
260#endif
261 return container_of(se, struct task_struct, se);
262}
263
264
265#define for_each_sched_entity(se) \
266 for (; se; se = se->parent)
267
268static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
269{
270 return p->se.cfs_rq;
271}
272
273
274static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
275{
276 return se->cfs_rq;
277}
278
279
280static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
281{
282 return grp->my_q;
283}
284
285static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
286 int force_update);
287
288static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
289{
290 if (!cfs_rq->on_list) {
291
292
293
294
295
296
297 if (cfs_rq->tg->parent &&
298 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
299 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
300 &rq_of(cfs_rq)->leaf_cfs_rq_list);
301 } else {
302 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
303 &rq_of(cfs_rq)->leaf_cfs_rq_list);
304 }
305
306 cfs_rq->on_list = 1;
307
308 update_cfs_rq_blocked_load(cfs_rq, 0);
309 }
310}
311
312static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
313{
314 if (cfs_rq->on_list) {
315 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
316 cfs_rq->on_list = 0;
317 }
318}
319
320
321#define for_each_leaf_cfs_rq(rq, cfs_rq) \
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323
324
325static inline int
326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{
328 if (se->cfs_rq == pse->cfs_rq)
329 return 1;
330
331 return 0;
332}
333
334static inline struct sched_entity *parent_entity(struct sched_entity *se)
335{
336 return se->parent;
337}
338
339
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{
353 int se_depth, pse_depth;
354
355
356
357
358
359
360
361
362
363 se_depth = depth_se(*se);
364 pse_depth = depth_se(*pse);
365
366 while (se_depth > pse_depth) {
367 se_depth--;
368 *se = parent_entity(*se);
369 }
370
371 while (pse_depth > se_depth) {
372 pse_depth--;
373 *pse = parent_entity(*pse);
374 }
375
376 while (!is_same_group(*se, *pse)) {
377 *se = parent_entity(*se);
378 *pse = parent_entity(*pse);
379 }
380}
381
382#else
383
384static inline struct task_struct *task_of(struct sched_entity *se)
385{
386 return container_of(se, struct task_struct, se);
387}
388
389static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
390{
391 return container_of(cfs_rq, struct rq, cfs);
392}
393
394#define entity_is_task(se) 1
395
396#define for_each_sched_entity(se) \
397 for (; se; se = NULL)
398
399static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
400{
401 return &task_rq(p)->cfs;
402}
403
404static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
405{
406 struct task_struct *p = task_of(se);
407 struct rq *rq = task_rq(p);
408
409 return &rq->cfs;
410}
411
412
413static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
414{
415 return NULL;
416}
417
418static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
419{
420}
421
422static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
423{
424}
425
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{
437 return NULL;
438}
439
440static inline void
441find_matching_se(struct sched_entity **se, struct sched_entity **pse)
442{
443}
444
445#endif
446
447static __always_inline
448void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
449
450
451
452
453
454static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
455{
456 s64 delta = (s64)(vruntime - max_vruntime);
457 if (delta > 0)
458 max_vruntime = vruntime;
459
460 return max_vruntime;
461}
462
463static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
464{
465 s64 delta = (s64)(vruntime - min_vruntime);
466 if (delta < 0)
467 min_vruntime = vruntime;
468
469 return min_vruntime;
470}
471
472static inline int entity_before(struct sched_entity *a,
473 struct sched_entity *b)
474{
475 return (s64)(a->vruntime - b->vruntime) < 0;
476}
477
478static void update_min_vruntime(struct cfs_rq *cfs_rq)
479{
480 u64 vruntime = cfs_rq->min_vruntime;
481
482 if (cfs_rq->curr)
483 vruntime = cfs_rq->curr->vruntime;
484
485 if (cfs_rq->rb_leftmost) {
486 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
487 struct sched_entity,
488 run_node);
489
490 if (!cfs_rq->curr)
491 vruntime = se->vruntime;
492 else
493 vruntime = min_vruntime(vruntime, se->vruntime);
494 }
495
496
497 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
498#ifndef CONFIG_64BIT
499 smp_wmb();
500 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
501#endif
502}
503
504
505
506
507static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
508{
509 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
510 struct rb_node *parent = NULL;
511 struct sched_entity *entry;
512 int leftmost = 1;
513
514
515
516
517 while (*link) {
518 parent = *link;
519 entry = rb_entry(parent, struct sched_entity, run_node);
520
521
522
523
524 if (entity_before(se, entry)) {
525 link = &parent->rb_left;
526 } else {
527 link = &parent->rb_right;
528 leftmost = 0;
529 }
530 }
531
532
533
534
535
536 if (leftmost)
537 cfs_rq->rb_leftmost = &se->run_node;
538
539 rb_link_node(&se->run_node, parent, link);
540 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
541}
542
543static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
544{
545 if (cfs_rq->rb_leftmost == &se->run_node) {
546 struct rb_node *next_node;
547
548 next_node = rb_next(&se->run_node);
549 cfs_rq->rb_leftmost = next_node;
550 }
551
552 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
553}
554
555struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
556{
557 struct rb_node *left = cfs_rq->rb_leftmost;
558
559 if (!left)
560 return NULL;
561
562 return rb_entry(left, struct sched_entity, run_node);
563}
564
565static struct sched_entity *__pick_next_entity(struct sched_entity *se)
566{
567 struct rb_node *next = rb_next(&se->run_node);
568
569 if (!next)
570 return NULL;
571
572 return rb_entry(next, struct sched_entity, run_node);
573}
574
575#ifdef CONFIG_SCHED_DEBUG
576struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
577{
578 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
579
580 if (!last)
581 return NULL;
582
583 return rb_entry(last, struct sched_entity, run_node);
584}
585
586
587
588
589
590int sched_proc_update_handler(struct ctl_table *table, int write,
591 void __user *buffer, size_t *lenp,
592 loff_t *ppos)
593{
594 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
595 int factor = get_update_sysctl_factor();
596
597 if (ret || !write)
598 return ret;
599
600 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
601 sysctl_sched_min_granularity);
602
603#define WRT_SYSCTL(name) \
604 (normalized_sysctl_##name = sysctl_##name / (factor))
605 WRT_SYSCTL(sched_min_granularity);
606 WRT_SYSCTL(sched_latency);
607 WRT_SYSCTL(sched_wakeup_granularity);
608#undef WRT_SYSCTL
609
610 return 0;
611}
612#endif
613
614
615
616
617static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
618{
619 if (unlikely(se->load.weight != NICE_0_LOAD))
620 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
621
622 return delta;
623}
624
625
626
627
628
629
630
631
632
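/*
 * The scheduling period is the window in which every runnable task should
 * get to run once:
 *
 *   period = sysctl_sched_latency                        if nr_running <= sched_nr_latency
 *   period = nr_running * sysctl_sched_min_granularity   otherwise
 *
 * e.g. with the defaults above (6ms latency, 0.75ms granularity): up to 8
 * runnable tasks share a 6ms period, a 9th stretches it to 9 * 0.75ms = 6.75ms.
 */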
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
645
646
647
648
649
650
651
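/*
 * The wall-time slice of an entity is its weighted share of the period:
 *
 *   slice = __sched_period(nr_running) * se->load.weight / cfs_rq->load.weight
 *
 * walked up the group hierarchy by for_each_sched_entity(); an entity that is
 * not yet queued is accounted for by temporarily adding its weight to a local
 * copy of the runqueue load.
 */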
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}
673
674
675
676
677
678
679static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
680{
681 return calc_delta_fair(sched_slice(cfs_rq, se), se);
682}
683
684#ifdef CONFIG_SMP
685static unsigned long task_h_load(struct task_struct *p);
686
687static inline void __update_task_entity_contrib(struct sched_entity *se);
688
689
690void init_task_runnable_average(struct task_struct *p)
691{
692 u32 slice;
693
694 p->se.avg.decay_count = 0;
695 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
696 p->se.avg.runnable_avg_sum = slice;
697 p->se.avg.runnable_avg_period = slice;
698 __update_task_entity_contrib(&p->se);
699}
700#else
701void init_task_runnable_average(struct task_struct *p)
702{
703}
704#endif
705
706
707
708
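/*
 * Update run-time statistics of the 'current' entity: charge the elapsed
 * (task) clock time to sum_exec_runtime, advance vruntime by the
 * weight-scaled delta (calc_delta_fair()), refresh min_vruntime, and account
 * the runtime against CFS bandwidth.
 */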
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
743
744static inline void
745update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
746{
747 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
748}
749
750
751
752
753static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
754{
755
756
757
758
759 if (se != cfs_rq->curr)
760 update_stats_wait_start(cfs_rq, se);
761}
762
763static void
764update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
765{
766 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
767 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
768 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
769 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
770 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
771#ifdef CONFIG_SCHEDSTATS
772 if (entity_is_task(se)) {
773 trace_sched_stat_wait(task_of(se),
774 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
775 }
776#endif
777 schedstat_set(se->statistics.wait_start, 0);
778}
779
780static inline void
781update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
782{
783
784
785
786
787 if (se != cfs_rq->curr)
788 update_stats_wait_end(cfs_rq, se);
789}
790
791
792
793
794static inline void
795update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
796{
797
798
799
800 se->exec_start = rq_clock_task(rq_of(cfs_rq));
801}
802
803
804
805
806
807#ifdef CONFIG_NUMA_BALANCING
808
809
810
811
812
813unsigned int sysctl_numa_balancing_scan_period_min = 1000;
814unsigned int sysctl_numa_balancing_scan_period_max = 60000;
815
816
817unsigned int sysctl_numa_balancing_scan_size = 256;
818
819
820unsigned int sysctl_numa_balancing_scan_delay = 1000;
821
822
823
824
825
826
827
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p)
831{
832 unsigned long rss = 0;
833 unsigned long nr_scan_pages;
834
835
836
837
838
839
840 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
841 rss = get_mm_rss(p->mm);
842 if (!rss)
843 rss = nr_scan_pages;
844
845 rss = round_up(rss, nr_scan_pages);
846 return rss / nr_scan_pages;
847}
848
849
850#define MAX_SCAN_WINDOW 2560
851
852static unsigned int task_scan_min(struct task_struct *p)
853{
854 unsigned int scan, floor;
855 unsigned int windows = 1;
856
857 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
858 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
859 floor = 1000 / windows;
860
861 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
862 return max_t(unsigned int, floor, scan);
863}
864
865static unsigned int task_scan_max(struct task_struct *p)
866{
867 unsigned int smin = task_scan_min(p);
868 unsigned int smax;
869
870
871 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
872 return max(smin, smax);
873}
874
875
876
877
878
879
880
881
882unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
883
884static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
885{
886 rq->nr_numa_running += (p->numa_preferred_nid != -1);
887 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
888}
889
890static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
891{
892 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
893 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
894}
895
896struct numa_group {
897 atomic_t refcount;
898
899 spinlock_t lock;
900 int nr_tasks;
901 pid_t gid;
902 struct list_head task_list;
903
904 struct rcu_head rcu;
905 unsigned long total_faults;
906 unsigned long faults[0];
907};
908
909pid_t task_numa_group_id(struct task_struct *p)
910{
911 return p->numa_group ? p->numa_group->gid : 0;
912}
913
914static inline int task_faults_idx(int nid, int priv)
915{
916 return 2 * nid + priv;
917}
918
919static inline unsigned long task_faults(struct task_struct *p, int nid)
920{
921 if (!p->numa_faults)
922 return 0;
923
924 return p->numa_faults[task_faults_idx(nid, 0)] +
925 p->numa_faults[task_faults_idx(nid, 1)];
926}
927
928static inline unsigned long group_faults(struct task_struct *p, int nid)
929{
930 if (!p->numa_group)
931 return 0;
932
933 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
934}
935
936
937
938
939
940
941
942static inline unsigned long task_weight(struct task_struct *p, int nid)
943{
944 unsigned long total_faults;
945
946 if (!p->numa_faults)
947 return 0;
948
949 total_faults = p->total_numa_faults;
950
951 if (!total_faults)
952 return 0;
953
954 return 1000 * task_faults(p, nid) / total_faults;
955}
956
957static inline unsigned long group_weight(struct task_struct *p, int nid)
958{
959 if (!p->numa_group || !p->numa_group->total_faults)
960 return 0;
961
962 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
963}
964
965static unsigned long weighted_cpuload(const int cpu);
966static unsigned long source_load(int cpu, int type);
967static unsigned long target_load(int cpu, int type);
968static unsigned long power_of(int cpu);
969static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
970
971
972struct numa_stats {
973 unsigned long nr_running;
974 unsigned long load;
975
976
977 unsigned long power;
978
979
980 unsigned long capacity;
981 int has_capacity;
982};
983
984
985
986
987static void update_numa_stats(struct numa_stats *ns, int nid)
988{
989 int cpu, cpus = 0;
990
991 memset(ns, 0, sizeof(*ns));
992 for_each_cpu(cpu, cpumask_of_node(nid)) {
993 struct rq *rq = cpu_rq(cpu);
994
995 ns->nr_running += rq->nr_running;
996 ns->load += weighted_cpuload(cpu);
997 ns->power += power_of(cpu);
998
999 cpus++;
1000 }
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010 if (!cpus)
1011 return;
1012
1013 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1014 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1015 ns->has_capacity = (ns->nr_running < ns->capacity);
1016}
1017
1018struct task_numa_env {
1019 struct task_struct *p;
1020
1021 int src_cpu, src_nid;
1022 int dst_cpu, dst_nid;
1023
1024 struct numa_stats src_stats, dst_stats;
1025
1026 int imbalance_pct, idx;
1027
1028 struct task_struct *best_task;
1029 long best_imp;
1030 int best_cpu;
1031};
1032
1033static void task_numa_assign(struct task_numa_env *env,
1034 struct task_struct *p, long imp)
1035{
1036 if (env->best_task)
1037 put_task_struct(env->best_task);
1038 if (p)
1039 get_task_struct(p);
1040
1041 env->best_task = p;
1042 env->best_imp = imp;
1043 env->best_cpu = env->dst_cpu;
1044}
1045
1046
1047
1048
1049
1050
1051
1052static void task_numa_compare(struct task_numa_env *env,
1053 long taskimp, long groupimp)
1054{
1055 struct rq *src_rq = cpu_rq(env->src_cpu);
1056 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1057 struct task_struct *cur;
1058 long dst_load, src_load;
1059 long load;
1060 long imp = (groupimp > 0) ? groupimp : taskimp;
1061
1062 rcu_read_lock();
1063 cur = ACCESS_ONCE(dst_rq->curr);
1064 if (cur->pid == 0)
1065 cur = NULL;
1066
1067
1068
1069
1070
1071
1072
1073
1074 if (cur) {
1075
1076 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1077 goto unlock;
1078
1079
1080
1081
1082
1083 if (cur->numa_group == env->p->numa_group) {
1084 imp = taskimp + task_weight(cur, env->src_nid) -
1085 task_weight(cur, env->dst_nid);
1086
1087
1088
1089
1090 if (cur->numa_group)
1091 imp -= imp/16;
1092 } else {
1093
1094
1095
1096
1097
1098 if (env->p->numa_group)
1099 imp = groupimp;
1100 else
1101 imp = taskimp;
1102
1103 if (cur->numa_group)
1104 imp += group_weight(cur, env->src_nid) -
1105 group_weight(cur, env->dst_nid);
1106 else
1107 imp += task_weight(cur, env->src_nid) -
1108 task_weight(cur, env->dst_nid);
1109 }
1110 }
1111
1112 if (imp < env->best_imp)
1113 goto unlock;
1114
1115 if (!cur) {
1116
1117 if (env->src_stats.has_capacity &&
1118 !env->dst_stats.has_capacity)
1119 goto unlock;
1120
1121 goto balance;
1122 }
1123
1124
1125 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1126 goto assign;
1127
1128
1129
1130
1131balance:
1132 dst_load = env->dst_stats.load;
1133 src_load = env->src_stats.load;
1134
1135
1136 load = task_h_load(env->p);
1137 dst_load += load;
1138 src_load -= load;
1139
1140 if (cur) {
1141 load = task_h_load(cur);
1142 dst_load -= load;
1143 src_load += load;
1144 }
1145
1146
1147 if (dst_load < src_load)
1148 swap(dst_load, src_load);
1149
1150 if (src_load * env->imbalance_pct < dst_load * 100)
1151 goto unlock;
1152
1153assign:
1154 task_numa_assign(env, cur, imp);
1155unlock:
1156 rcu_read_unlock();
1157}
1158
1159static void task_numa_find_cpu(struct task_numa_env *env,
1160 long taskimp, long groupimp)
1161{
1162 int cpu;
1163
1164 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1165
1166 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1167 continue;
1168
1169 env->dst_cpu = cpu;
1170 task_numa_compare(env, taskimp, groupimp);
1171 }
1172}
1173
1174static int task_numa_migrate(struct task_struct *p)
1175{
1176 struct task_numa_env env = {
1177 .p = p,
1178
1179 .src_cpu = task_cpu(p),
1180 .src_nid = task_node(p),
1181
1182 .imbalance_pct = 112,
1183
1184 .best_task = NULL,
1185 .best_imp = 0,
1186 .best_cpu = -1
1187 };
1188 struct sched_domain *sd;
1189 unsigned long taskweight, groupweight;
1190 int nid, ret;
1191 long taskimp, groupimp;
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201 rcu_read_lock();
1202 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1203 if (sd)
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207
1208
1209
1210
1211
1212
1213 if (unlikely(!sd)) {
1214 p->numa_preferred_nid = cpu_to_node(task_cpu(p));
1215 return -EINVAL;
1216 }
1217
1218 taskweight = task_weight(p, env.src_nid);
1219 groupweight = group_weight(p, env.src_nid);
1220 update_numa_stats(&env.src_stats, env.src_nid);
1221 env.dst_nid = p->numa_preferred_nid;
1222 taskimp = task_weight(p, env.dst_nid) - taskweight;
1223 groupimp = group_weight(p, env.dst_nid) - groupweight;
1224 update_numa_stats(&env.dst_stats, env.dst_nid);
1225
1226
1227 if (env.dst_stats.has_capacity)
1228 task_numa_find_cpu(&env, taskimp, groupimp);
1229
1230
1231 if (env.best_cpu == -1) {
1232 for_each_online_node(nid) {
1233 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1234 continue;
1235
1236
1237 taskimp = task_weight(p, nid) - taskweight;
1238 groupimp = group_weight(p, nid) - groupweight;
1239 if (taskimp < 0 && groupimp < 0)
1240 continue;
1241
1242 env.dst_nid = nid;
1243 update_numa_stats(&env.dst_stats, env.dst_nid);
1244 task_numa_find_cpu(&env, taskimp, groupimp);
1245 }
1246 }
1247
1248
1249 if (env.best_cpu == -1)
1250 return -EAGAIN;
1251
1252 sched_setnuma(p, env.dst_nid);
1253
1254
1255
1256
1257
1258 p->numa_scan_period = task_scan_min(p);
1259
1260 if (env.best_task == NULL) {
1261 int ret = migrate_task_to(p, env.best_cpu);
1262 return ret;
1263 }
1264
1265 ret = migrate_swap(p, env.best_task);
1266 put_task_struct(env.best_task);
1267 return ret;
1268}
1269
1270
1271static void numa_migrate_preferred(struct task_struct *p)
1272{
1273
1274 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1275 return;
1276
1277
1278 p->numa_migrate_retry = jiffies + HZ;
1279
1280
1281 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1282 return;
1283
1284
1285 task_numa_migrate(p);
1286}
1287
1288
1289
1290
1291
1292
1293
1294
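/*
 * The NUMA scan period is adjusted in steps of 1/NUMA_PERIOD_SLOTS of its
 * current value, based on the recent ratio of local to remote faults (see
 * update_task_scan_period() below): a workload faulting mostly locally slows
 * scanning down, one faulting remotely or on shared pages speeds it up.
 */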
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 3
1297
1298
1299
1300
1301
1302
1303
1304static void update_task_scan_period(struct task_struct *p,
1305 unsigned long shared, unsigned long private)
1306{
1307 unsigned int period_slot;
1308 int ratio;
1309 int diff;
1310
1311 unsigned long remote = p->numa_faults_locality[0];
1312 unsigned long local = p->numa_faults_locality[1];
1313
1314
1315
1316
1317
1318
1319 if (local + shared == 0) {
1320 p->numa_scan_period = min(p->numa_scan_period_max,
1321 p->numa_scan_period << 1);
1322
1323 p->mm->numa_next_scan = jiffies +
1324 msecs_to_jiffies(p->numa_scan_period);
1325
1326 return;
1327 }
1328
1329
1330
1331
1332
1333
1334
1335 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1336 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1337 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1338 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1339 if (!slot)
1340 slot = 1;
1341 diff = slot * period_slot;
1342 } else {
1343 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1354 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1355 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1356 }
1357
1358 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1359 task_scan_min(p), task_scan_max(p));
1360 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1361}
1362
1363static void task_numa_placement(struct task_struct *p)
1364{
1365 int seq, nid, max_nid = -1, max_group_nid = -1;
1366 unsigned long max_faults = 0, max_group_faults = 0;
1367 unsigned long fault_types[2] = { 0, 0 };
1368 spinlock_t *group_lock = NULL;
1369
1370 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1371 if (p->numa_scan_seq == seq)
1372 return;
1373 p->numa_scan_seq = seq;
1374 p->numa_scan_period_max = task_scan_max(p);
1375
1376
1377 if (p->numa_group) {
1378 group_lock = &p->numa_group->lock;
1379 spin_lock(group_lock);
1380 }
1381
1382
1383 for_each_online_node(nid) {
1384 unsigned long faults = 0, group_faults = 0;
1385 int priv, i;
1386
1387 for (priv = 0; priv < 2; priv++) {
1388 long diff;
1389
1390 i = task_faults_idx(nid, priv);
1391 diff = -p->numa_faults[i];
1392
1393
1394 p->numa_faults[i] >>= 1;
1395 p->numa_faults[i] += p->numa_faults_buffer[i];
1396 fault_types[priv] += p->numa_faults_buffer[i];
1397 p->numa_faults_buffer[i] = 0;
1398
1399 faults += p->numa_faults[i];
1400 diff += p->numa_faults[i];
1401 p->total_numa_faults += diff;
1402 if (p->numa_group) {
1403
1404 p->numa_group->faults[i] += diff;
1405 p->numa_group->total_faults += diff;
1406 group_faults += p->numa_group->faults[i];
1407 }
1408 }
1409
1410 if (faults > max_faults) {
1411 max_faults = faults;
1412 max_nid = nid;
1413 }
1414
1415 if (group_faults > max_group_faults) {
1416 max_group_faults = group_faults;
1417 max_group_nid = nid;
1418 }
1419 }
1420
1421 update_task_scan_period(p, fault_types[0], fault_types[1]);
1422
1423 if (p->numa_group) {
1424
1425
1426
1427
1428 if (max_nid != max_group_nid) {
1429 unsigned long weight, max_weight = 0;
1430
1431 for_each_online_node(nid) {
1432 weight = task_weight(p, nid) + group_weight(p, nid);
1433 if (weight > max_weight) {
1434 max_weight = weight;
1435 max_nid = nid;
1436 }
1437 }
1438 }
1439
1440 spin_unlock(group_lock);
1441 }
1442
1443
1444 if (max_faults && max_nid != p->numa_preferred_nid) {
1445
1446 sched_setnuma(p, max_nid);
1447 numa_migrate_preferred(p);
1448 }
1449}
1450
1451static inline int get_numa_group(struct numa_group *grp)
1452{
1453 return atomic_inc_not_zero(&grp->refcount);
1454}
1455
1456static inline void put_numa_group(struct numa_group *grp)
1457{
1458 if (atomic_dec_and_test(&grp->refcount))
1459 kfree_rcu(grp, rcu);
1460}
1461
1462static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1463 int *priv)
1464{
1465 struct numa_group *grp, *my_grp;
1466 struct task_struct *tsk;
1467 bool join = false;
1468 int cpu = cpupid_to_cpu(cpupid);
1469 int i;
1470
1471 if (unlikely(!p->numa_group)) {
1472 unsigned int size = sizeof(struct numa_group) +
1473 2*nr_node_ids*sizeof(unsigned long);
1474
1475 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1476 if (!grp)
1477 return;
1478
1479 atomic_set(&grp->refcount, 1);
1480 spin_lock_init(&grp->lock);
1481 INIT_LIST_HEAD(&grp->task_list);
1482 grp->gid = p->pid;
1483
1484 for (i = 0; i < 2*nr_node_ids; i++)
1485 grp->faults[i] = p->numa_faults[i];
1486
1487 grp->total_faults = p->total_numa_faults;
1488
1489 list_add(&p->numa_entry, &grp->task_list);
1490 grp->nr_tasks++;
1491 rcu_assign_pointer(p->numa_group, grp);
1492 }
1493
1494 rcu_read_lock();
1495 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1496
1497 if (!cpupid_match_pid(tsk, cpupid))
1498 goto no_join;
1499
1500 grp = rcu_dereference(tsk->numa_group);
1501 if (!grp)
1502 goto no_join;
1503
1504 my_grp = p->numa_group;
1505 if (grp == my_grp)
1506 goto no_join;
1507
1508
1509
1510
1511
1512 if (my_grp->nr_tasks > grp->nr_tasks)
1513 goto no_join;
1514
1515
1516
1517
1518 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1519 goto no_join;
1520
1521
1522 if (tsk->mm == current->mm)
1523 join = true;
1524
1525
1526 if (flags & TNF_SHARED)
1527 join = true;
1528
1529
1530 *priv = !join;
1531
1532 if (join && !get_numa_group(grp))
1533 goto no_join;
1534
1535 rcu_read_unlock();
1536
1537 if (!join)
1538 return;
1539
1540 double_lock(&my_grp->lock, &grp->lock);
1541
1542 for (i = 0; i < 2*nr_node_ids; i++) {
1543 my_grp->faults[i] -= p->numa_faults[i];
1544 grp->faults[i] += p->numa_faults[i];
1545 }
1546 my_grp->total_faults -= p->total_numa_faults;
1547 grp->total_faults += p->total_numa_faults;
1548
1549 list_move(&p->numa_entry, &grp->task_list);
1550 my_grp->nr_tasks--;
1551 grp->nr_tasks++;
1552
1553 spin_unlock(&my_grp->lock);
1554 spin_unlock(&grp->lock);
1555
1556 rcu_assign_pointer(p->numa_group, grp);
1557
1558 put_numa_group(my_grp);
1559 return;
1560
1561no_join:
1562 rcu_read_unlock();
1563 return;
1564}
1565
1566void task_numa_free(struct task_struct *p)
1567{
1568 struct numa_group *grp = p->numa_group;
1569 int i;
1570 void *numa_faults = p->numa_faults;
1571
1572 if (grp) {
1573 spin_lock(&grp->lock);
1574 for (i = 0; i < 2*nr_node_ids; i++)
1575 grp->faults[i] -= p->numa_faults[i];
1576 grp->total_faults -= p->total_numa_faults;
1577
1578 list_del(&p->numa_entry);
1579 grp->nr_tasks--;
1580 spin_unlock(&grp->lock);
1581 rcu_assign_pointer(p->numa_group, NULL);
1582 put_numa_group(grp);
1583 }
1584
1585 p->numa_faults = NULL;
1586 p->numa_faults_buffer = NULL;
1587 kfree(numa_faults);
1588}
1589
1590
1591
1592
1593void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1594{
1595 struct task_struct *p = current;
1596 bool migrated = flags & TNF_MIGRATED;
1597 int priv;
1598
1599 if (!numabalancing_enabled)
1600 return;
1601
1602
1603 if (!p->mm)
1604 return;
1605
1606
1607 if (p->state == TASK_DEAD)
1608 return;
1609
1610
1611 if (unlikely(!p->numa_faults)) {
1612 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1613
1614
1615 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1616 if (!p->numa_faults)
1617 return;
1618
1619 BUG_ON(p->numa_faults_buffer);
1620 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1621 p->total_numa_faults = 0;
1622 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1623 }
1624
1625
1626
1627
1628
1629 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
1630 priv = 1;
1631 } else {
1632 priv = cpupid_match_pid(p, last_cpupid);
1633 if (!priv && !(flags & TNF_NO_GROUP))
1634 task_numa_group(p, last_cpupid, flags, &priv);
1635 }
1636
1637 task_numa_placement(p);
1638
1639
1640
1641
1642
1643 if (time_after(jiffies, p->numa_migrate_retry))
1644 numa_migrate_preferred(p);
1645
1646 if (migrated)
1647 p->numa_pages_migrated += pages;
1648
1649 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1650 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1651}
1652
1653static void reset_ptenuma_scan(struct task_struct *p)
1654{
1655 ACCESS_ONCE(p->mm->numa_scan_seq)++;
1656 p->mm->numa_scan_offset = 0;
1657}
1658
1659
1660
1661
1662
1663void task_numa_work(struct callback_head *work)
1664{
1665 unsigned long migrate, next_scan, now = jiffies;
1666 struct task_struct *p = current;
1667 struct mm_struct *mm = p->mm;
1668 struct vm_area_struct *vma;
1669 unsigned long start, end;
1670 unsigned long nr_pte_updates = 0;
1671 long pages;
1672
1673 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
1674
1675 work->next = work;
1676
1677
1678
1679
1680
1681
1682
1683
1684 if (p->flags & PF_EXITING)
1685 return;
1686
1687 if (!mm->numa_next_scan) {
1688 mm->numa_next_scan = now +
1689 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1690 }
1691
1692
1693
1694
1695 migrate = mm->numa_next_scan;
1696 if (time_before(now, migrate))
1697 return;
1698
1699 if (p->numa_scan_period == 0) {
1700 p->numa_scan_period_max = task_scan_max(p);
1701 p->numa_scan_period = task_scan_min(p);
1702 }
1703
1704 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
1705 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
1706 return;
1707
1708
1709
1710
1711
1712 p->node_stamp += 2 * TICK_NSEC;
1713
1714 start = mm->numa_scan_offset;
1715 pages = sysctl_numa_balancing_scan_size;
1716 pages <<= 20 - PAGE_SHIFT;
1717 if (!pages)
1718 return;
1719
1720 down_read(&mm->mmap_sem);
1721 vma = find_vma(mm, start);
1722 if (!vma) {
1723 reset_ptenuma_scan(p);
1724 start = 0;
1725 vma = mm->mmap;
1726 }
1727 for (; vma; vma = vma->vm_next) {
1728 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
1729 continue;
1730
1731
1732
1733
1734
1735
1736
1737 if (!vma->vm_mm ||
1738 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1739 continue;
1740
1741
1742
1743
1744
1745 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1746 continue;
1747
1748 do {
1749 start = max(start, vma->vm_start);
1750 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1751 end = min(end, vma->vm_end);
1752 nr_pte_updates += change_prot_numa(vma, start, end);
1753
1754
1755
1756
1757
1758
1759 if (nr_pte_updates)
1760 pages -= (end - start) >> PAGE_SHIFT;
1761
1762 start = end;
1763 if (pages <= 0)
1764 goto out;
1765 } while (end != vma->vm_end);
1766 }
1767
1768out:
1769
1770
1771
1772
1773
1774
1775 if (vma)
1776 mm->numa_scan_offset = start;
1777 else
1778 reset_ptenuma_scan(p);
1779 up_read(&mm->mmap_sem);
1780}
1781
1782
1783
1784
1785void task_tick_numa(struct rq *rq, struct task_struct *curr)
1786{
1787 struct callback_head *work = &curr->numa_work;
1788 u64 period, now;
1789
1790
1791
1792
1793 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1794 return;
1795
1796
1797
1798
1799
1800
1801
1802 now = curr->se.sum_exec_runtime;
1803 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1804
1805 if (now - curr->node_stamp > period) {
1806 if (!curr->node_stamp)
1807 curr->numa_scan_period = task_scan_min(curr);
1808 curr->node_stamp += period;
1809
1810 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1811 init_task_work(work, task_numa_work);
1812 task_work_add(curr, work, true);
1813 }
1814 }
1815}
1816#else
1817static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1818{
1819}
1820
1821static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1822{
1823}
1824
1825static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1826{
1827}
1828#endif
1829
1830static void
1831account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1832{
1833 update_load_add(&cfs_rq->load, se->load.weight);
1834 if (!parent_entity(se))
1835 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1836#ifdef CONFIG_SMP
1837 if (entity_is_task(se)) {
1838 struct rq *rq = rq_of(cfs_rq);
1839
1840 account_numa_enqueue(rq, task_of(se));
1841 list_add(&se->group_node, &rq->cfs_tasks);
1842 }
1843#endif
1844 cfs_rq->nr_running++;
1845}
1846
1847static void
1848account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1849{
1850 update_load_sub(&cfs_rq->load, se->load.weight);
1851 if (!parent_entity(se))
1852 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1853 if (entity_is_task(se)) {
1854 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1855 list_del_init(&se->group_node);
1856 }
1857 cfs_rq->nr_running--;
1858}
1859
1860#ifdef CONFIG_FAIR_GROUP_SCHED
1861# ifdef CONFIG_SMP
1862static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1863{
1864 long tg_weight;
1865
1866
1867
1868
1869
1870
1871 tg_weight = atomic_long_read(&tg->load_avg);
1872 tg_weight -= cfs_rq->tg_load_contrib;
1873 tg_weight += cfs_rq->load.weight;
1874
1875 return tg_weight;
1876}
1877
1878static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1879{
1880 long tg_weight, load, shares;
1881
1882 tg_weight = calc_tg_weight(tg, cfs_rq);
1883 load = cfs_rq->load.weight;
1884
1885 shares = (tg->shares * load);
1886 if (tg_weight)
1887 shares /= tg_weight;
1888
1889 if (shares < MIN_SHARES)
1890 shares = MIN_SHARES;
1891 if (shares > tg->shares)
1892 shares = tg->shares;
1893
1894 return shares;
1895}
1896# else
1897static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1898{
1899 return tg->shares;
1900}
1901# endif
1902static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1903 unsigned long weight)
1904{
1905 if (se->on_rq) {
1906
1907 if (cfs_rq->curr == se)
1908 update_curr(cfs_rq);
1909 account_entity_dequeue(cfs_rq, se);
1910 }
1911
1912 update_load_set(&se->load, weight);
1913
1914 if (se->on_rq)
1915 account_entity_enqueue(cfs_rq, se);
1916}
1917
1918static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1919
1920static void update_cfs_shares(struct cfs_rq *cfs_rq)
1921{
1922 struct task_group *tg;
1923 struct sched_entity *se;
1924 long shares;
1925
1926 tg = cfs_rq->tg;
1927 se = tg->se[cpu_of(rq_of(cfs_rq))];
1928 if (!se || throttled_hierarchy(cfs_rq))
1929 return;
1930#ifndef CONFIG_SMP
1931 if (likely(se->load.weight == tg->shares))
1932 return;
1933#endif
1934 shares = calc_cfs_shares(cfs_rq, tg);
1935
1936 reweight_entity(cfs_rq_of(se), se, shares);
1937}
1938#else
1939static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1940{
1941}
1942#endif
1943
1944#ifdef CONFIG_SMP
1945
1946
1947
1948
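/*
 * Per-entity load-tracking constants:
 *
 * LOAD_AVG_PERIOD: number of ~1ms periods over which load halves, i.e. the
 *                  decay factor y satisfies y^32 ~= 0.5.
 * LOAD_AVG_MAX:    maximum value runnable_avg_sum can converge to,
 *                  ~= 1024 * (1 + y + y^2 + ...).
 * LOAD_AVG_MAX_N:  number of full periods needed to effectively reach
 *                  LOAD_AVG_MAX.
 */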
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742
#define LOAD_AVG_MAX_N 345
1952
1953
1954static const u32 runnable_avg_yN_inv[] = {
1955 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1956 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1957 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1958 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1959 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1960 0x85aac367, 0x82cd8698,
1961};
1962
1963
1964
1965
1966
1967static const u32 runnable_avg_yN_sum[] = {
1968 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1969 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1970 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1971};
1972
1973
1974
1975
1976
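/*
 * decay_load() approximates val * y^n with y^LOAD_AVG_PERIOD = 0.5:
 * runnable_avg_yN_inv[] above holds y^n scaled by 2^32 for n < 32, and each
 * full multiple of LOAD_AVG_PERIOD simply halves val.  For example,
 * decay_load(val, 32) ~= val / 2 and decay_load(val, 64) ~= val / 4.
 */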
static __always_inline u64 decay_load(u64 val, u64 n)
{
	unsigned int local_n;

	if (!n)
		return val;
	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
		return 0;

	local_n = n;

	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
		val >>= local_n / LOAD_AVG_PERIOD;
		local_n %= LOAD_AVG_PERIOD;
	}

	val *= runnable_avg_yN_inv[local_n];

	return val >> 32;
}
2005
2006
2007
2008
2009
2010
2011
2012
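/*
 * Contribution of n fully elapsed ~1ms periods to the runnable sum:
 *
 *   1024 * (y + y^2 + ... + y^n)
 *
 * computed by folding the precomputed partial sums in runnable_avg_yN_sum[]
 * (valid for n <= LOAD_AVG_PERIOD) with y^LOAD_AVG_PERIOD = 1/2, and capping
 * at LOAD_AVG_MAX once the series has effectively converged.
 */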
static u32 __compute_runnable_contrib(u64 n)
{
	u32 contrib = 0;

	if (likely(n <= LOAD_AVG_PERIOD))
		return runnable_avg_yN_sum[n];
	else if (unlikely(n >= LOAD_AVG_MAX_N))
		return LOAD_AVG_MAX;

	do {
		contrib /= 2;
		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

		n -= LOAD_AVG_PERIOD;
	} while (n > LOAD_AVG_PERIOD);

	contrib = decay_load(contrib, n);
	return contrib + runnable_avg_yN_sum[n];
}
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
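/*
 * Per-entity runnable average: runnable history is divided into ~1ms (1024us)
 * segments and accumulated as a geometrically decaying series
 *
 *   u_0 + u_1*y + u_2*y^2 + ...   with y^32 ~= 0.5,
 *
 * where u_i is the fraction of segment i (i segments ago) spent runnable.
 * Load from ~32ms ago therefore counts about half as much as current load.
 * runnable_avg_sum tracks the decayed runnable time and runnable_avg_period
 * the decayed total time, so their ratio approximates recent utilization.
 */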
static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable)
{
	u64 delta, periods;
	u32 runnable_contrib;
	int delta_w, decayed = 0;

	delta = now - sa->last_runnable_update;

	/* Clock went backwards; just resynchronize. */
	if ((s64)delta < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/* Track time in ~1us units (1024ns) so period arithmetic stays cheap. */
	delta >>= 10;
	if (!delta)
		return 0;
	sa->last_runnable_update = now;

	/* delta_w is the time already accrued against the current 1024us period. */
	delta_w = sa->runnable_avg_period % 1024;
	if (delta + delta_w >= 1024) {
		/* period roll-over */
		decayed = 1;

		delta_w = 1024 - delta_w;
		if (runnable)
			sa->runnable_avg_sum += delta_w;
		sa->runnable_avg_period += delta_w;

		delta -= delta_w;

		periods = delta / 1024;
		delta %= 1024;

		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
						  periods + 1);
		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
						     periods + 1);

		runnable_contrib = __compute_runnable_contrib(periods);
		if (runnable)
			sa->runnable_avg_sum += runnable_contrib;
		sa->runnable_avg_period += runnable_contrib;
	}

	if (runnable)
		sa->runnable_avg_sum += delta;
	sa->runnable_avg_period += delta;

	return decayed;
}
2130
2131
2132static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2133{
2134 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2135 u64 decays = atomic64_read(&cfs_rq->decay_counter);
2136
2137 decays -= se->avg.decay_count;
2138 if (!decays)
2139 return 0;
2140
2141 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2142 se->avg.decay_count = 0;
2143
2144 return decays;
2145}
2146
2147#ifdef CONFIG_FAIR_GROUP_SCHED
2148static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2149 int force_update)
2150{
2151 struct task_group *tg = cfs_rq->tg;
2152 long tg_contrib;
2153
2154 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2155 tg_contrib -= cfs_rq->tg_load_contrib;
2156
2157 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2158 atomic_long_add(tg_contrib, &tg->load_avg);
2159 cfs_rq->tg_load_contrib += tg_contrib;
2160 }
2161}
2162
2163
2164
2165
2166
2167static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2168 struct cfs_rq *cfs_rq)
2169{
2170 struct task_group *tg = cfs_rq->tg;
2171 long contrib;
2172
2173
2174 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2175 sa->runnable_avg_period + 1);
2176 contrib -= cfs_rq->tg_runnable_contrib;
2177
2178 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2179 atomic_add(contrib, &tg->runnable_avg);
2180 cfs_rq->tg_runnable_contrib += contrib;
2181 }
2182}
2183
2184static inline void __update_group_entity_contrib(struct sched_entity *se)
2185{
2186 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2187 struct task_group *tg = cfs_rq->tg;
2188 int runnable_avg;
2189
2190 u64 contrib;
2191
2192 contrib = cfs_rq->tg_load_contrib * tg->shares;
2193 se->avg.load_avg_contrib = div_u64(contrib,
2194 atomic_long_read(&tg->load_avg) + 1);
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219 runnable_avg = atomic_read(&tg->runnable_avg);
2220 if (runnable_avg < NICE_0_LOAD) {
2221 se->avg.load_avg_contrib *= runnable_avg;
2222 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2223 }
2224}
2225#else
2226static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2227 int force_update) {}
2228static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2229 struct cfs_rq *cfs_rq) {}
2230static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2231#endif
2232
2233static inline void __update_task_entity_contrib(struct sched_entity *se)
2234{
2235 u32 contrib;
2236
2237
2238 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2239 contrib /= (se->avg.runnable_avg_period + 1);
2240 se->avg.load_avg_contrib = scale_load(contrib);
2241}
2242
2243
2244static long __update_entity_load_avg_contrib(struct sched_entity *se)
2245{
2246 long old_contrib = se->avg.load_avg_contrib;
2247
2248 if (entity_is_task(se)) {
2249 __update_task_entity_contrib(se);
2250 } else {
2251 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2252 __update_group_entity_contrib(se);
2253 }
2254
2255 return se->avg.load_avg_contrib - old_contrib;
2256}
2257
2258static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2259 long load_contrib)
2260{
2261 if (likely(load_contrib < cfs_rq->blocked_load_avg))
2262 cfs_rq->blocked_load_avg -= load_contrib;
2263 else
2264 cfs_rq->blocked_load_avg = 0;
2265}
2266
2267static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2268
2269
2270static inline void update_entity_load_avg(struct sched_entity *se,
2271 int update_cfs_rq)
2272{
2273 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2274 long contrib_delta;
2275 u64 now;
2276
2277
2278
2279
2280
2281 if (entity_is_task(se))
2282 now = cfs_rq_clock_task(cfs_rq);
2283 else
2284 now = cfs_rq_clock_task(group_cfs_rq(se));
2285
2286 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
2287 return;
2288
2289 contrib_delta = __update_entity_load_avg_contrib(se);
2290
2291 if (!update_cfs_rq)
2292 return;
2293
2294 if (se->on_rq)
2295 cfs_rq->runnable_load_avg += contrib_delta;
2296 else
2297 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2298}
2299
2300
2301
2302
2303
2304static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2305{
2306 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2307 u64 decays;
2308
2309 decays = now - cfs_rq->last_decay;
2310 if (!decays && !force_update)
2311 return;
2312
2313 if (atomic_long_read(&cfs_rq->removed_load)) {
2314 unsigned long removed_load;
2315 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2316 subtract_blocked_load_contrib(cfs_rq, removed_load);
2317 }
2318
2319 if (decays) {
2320 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2321 decays);
2322 atomic64_add(decays, &cfs_rq->decay_counter);
2323 cfs_rq->last_decay = now;
2324 }
2325
2326 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2327}
2328
2329static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2330{
2331 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2332 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2333}
2334
2335
2336static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2337 struct sched_entity *se,
2338 int wakeup)
2339{
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349 if (unlikely(se->avg.decay_count <= 0)) {
2350 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2351 if (se->avg.decay_count) {
2352
2353
2354
2355
2356
2357
2358
2359
2360 se->avg.last_runnable_update -= (-se->avg.decay_count)
2361 << 20;
2362 update_entity_load_avg(se, 0);
2363
2364 se->avg.decay_count = 0;
2365 }
2366 wakeup = 0;
2367 } else {
2368
2369
2370
2371
2372
2373 se->avg.last_runnable_update += __synchronize_entity_decay(se)
2374 << 20;
2375 }
2376
2377
2378 if (wakeup) {
2379 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2380 update_entity_load_avg(se, 0);
2381 }
2382
2383 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2384
2385 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2386}
2387
2388
2389
2390
2391
2392
2393static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2394 struct sched_entity *se,
2395 int sleep)
2396{
2397 update_entity_load_avg(se, 1);
2398
2399 update_cfs_rq_blocked_load(cfs_rq, !sleep);
2400
2401 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2402 if (sleep) {
2403 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2404 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2405 }
2406}
2407
2408
2409
2410
2411
2412
2413void idle_enter_fair(struct rq *this_rq)
2414{
2415 update_rq_runnable_avg(this_rq, 1);
2416}
2417
2418
2419
2420
2421
2422
2423void idle_exit_fair(struct rq *this_rq)
2424{
2425 update_rq_runnable_avg(this_rq, 0);
2426}
2427
2428#else
2429static inline void update_entity_load_avg(struct sched_entity *se,
2430 int update_cfs_rq) {}
2431static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2432static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2433 struct sched_entity *se,
2434 int wakeup) {}
2435static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2436 struct sched_entity *se,
2437 int sleep) {}
2438static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2439 int force_update) {}
2440#endif
2441
2442static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2443{
2444#ifdef CONFIG_SCHEDSTATS
2445 struct task_struct *tsk = NULL;
2446
2447 if (entity_is_task(se))
2448 tsk = task_of(se);
2449
2450 if (se->statistics.sleep_start) {
2451 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2452
2453 if ((s64)delta < 0)
2454 delta = 0;
2455
2456 if (unlikely(delta > se->statistics.sleep_max))
2457 se->statistics.sleep_max = delta;
2458
2459 se->statistics.sleep_start = 0;
2460 se->statistics.sum_sleep_runtime += delta;
2461
2462 if (tsk) {
2463 account_scheduler_latency(tsk, delta >> 10, 1);
2464 trace_sched_stat_sleep(tsk, delta);
2465 }
2466 }
2467 if (se->statistics.block_start) {
2468 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2469
2470 if ((s64)delta < 0)
2471 delta = 0;
2472
2473 if (unlikely(delta > se->statistics.block_max))
2474 se->statistics.block_max = delta;
2475
2476 se->statistics.block_start = 0;
2477 se->statistics.sum_sleep_runtime += delta;
2478
2479 if (tsk) {
2480 if (tsk->in_iowait) {
2481 se->statistics.iowait_sum += delta;
2482 se->statistics.iowait_count++;
2483 trace_sched_stat_iowait(tsk, delta);
2484 }
2485
2486 trace_sched_stat_blocked(tsk, delta);
2487
2488
2489
2490
2491
2492
2493 if (unlikely(prof_on == SLEEP_PROFILING)) {
2494 profile_hits(SLEEP_PROFILING,
2495 (void *)get_wchan(tsk),
2496 delta >> 20);
2497 }
2498 account_scheduler_latency(tsk, delta >> 10, 0);
2499 }
2500 }
2501#endif
2502}
2503
2504static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2505{
2506#ifdef CONFIG_SCHED_DEBUG
2507 s64 d = se->vruntime - cfs_rq->min_vruntime;
2508
2509 if (d < 0)
2510 d = -d;
2511
2512 if (d > 3*sysctl_sched_latency)
2513 schedstat_inc(cfs_rq, nr_spread_over);
2514#endif
2515}
2516
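/*
 * Choose the vruntime at which a new or waking entity is placed: start from
 * the queue's min_vruntime, debit freshly forked tasks by one vslice
 * (START_DEBIT) so they don't get an unfair head start, and credit waking
 * sleepers by up to a full scheduling latency (halved when
 * GENTLE_FAIR_SLEEPERS is set) so they can preempt promptly without hoarding
 * unspent runtime.
 */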
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	se->vruntime = max_vruntime(se->vruntime, vruntime);
}
2548
2549static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2550
2551static void
2552enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2553{
2554
2555
2556
2557
2558 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
2559 se->vruntime += cfs_rq->min_vruntime;
2560
2561
2562
2563
2564 update_curr(cfs_rq);
2565 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
2566 account_entity_enqueue(cfs_rq, se);
2567 update_cfs_shares(cfs_rq);
2568
2569 if (flags & ENQUEUE_WAKEUP) {
2570 place_entity(cfs_rq, se, 0);
2571 enqueue_sleeper(cfs_rq, se);
2572 }
2573
2574 update_stats_enqueue(cfs_rq, se);
2575 check_spread(cfs_rq, se);
2576 if (se != cfs_rq->curr)
2577 __enqueue_entity(cfs_rq, se);
2578 se->on_rq = 1;
2579
2580 if (cfs_rq->nr_running == 1) {
2581 list_add_leaf_cfs_rq(cfs_rq);
2582 check_enqueue_throttle(cfs_rq);
2583 }
2584}
2585
2586static void __clear_buddies_last(struct sched_entity *se)
2587{
2588 for_each_sched_entity(se) {
2589 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 if (cfs_rq->last == se)
2591 cfs_rq->last = NULL;
2592 else
2593 break;
2594 }
2595}
2596
2597static void __clear_buddies_next(struct sched_entity *se)
2598{
2599 for_each_sched_entity(se) {
2600 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 if (cfs_rq->next == se)
2602 cfs_rq->next = NULL;
2603 else
2604 break;
2605 }
2606}
2607
2608static void __clear_buddies_skip(struct sched_entity *se)
2609{
2610 for_each_sched_entity(se) {
2611 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2612 if (cfs_rq->skip == se)
2613 cfs_rq->skip = NULL;
2614 else
2615 break;
2616 }
2617}
2618
2619static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
2620{
2621 if (cfs_rq->last == se)
2622 __clear_buddies_last(se);
2623
2624 if (cfs_rq->next == se)
2625 __clear_buddies_next(se);
2626
2627 if (cfs_rq->skip == se)
2628 __clear_buddies_skip(se);
2629}
2630
2631static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2632
2633static void
2634dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2635{
2636
2637
2638
2639 update_curr(cfs_rq);
2640 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
2641
2642 update_stats_dequeue(cfs_rq, se);
2643 if (flags & DEQUEUE_SLEEP) {
2644#ifdef CONFIG_SCHEDSTATS
2645 if (entity_is_task(se)) {
2646 struct task_struct *tsk = task_of(se);
2647
2648 if (tsk->state & TASK_INTERRUPTIBLE)
2649 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
2650 if (tsk->state & TASK_UNINTERRUPTIBLE)
2651 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
2652 }
2653#endif
2654 }
2655
2656 clear_buddies(cfs_rq, se);
2657
2658 if (se != cfs_rq->curr)
2659 __dequeue_entity(cfs_rq, se);
2660 se->on_rq = 0;
2661 account_entity_dequeue(cfs_rq, se);
2662
2663
2664
2665
2666
2667
2668 if (!(flags & DEQUEUE_SLEEP))
2669 se->vruntime -= cfs_rq->min_vruntime;
2670
2671
2672 return_cfs_rq_runtime(cfs_rq);
2673
2674 update_min_vruntime(cfs_rq);
2675 update_cfs_shares(cfs_rq);
2676}
2677
2678
2679
2680
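/*
 * Tick-driven preemption: reschedule the running entity once it has consumed
 * its ideal slice (sched_slice()), or - after at least
 * sysctl_sched_min_granularity of runtime - once its vruntime has moved more
 * than a slice ahead of the leftmost entity in the timeline.
 */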
2681static void
2682check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2683{
2684 unsigned long ideal_runtime, delta_exec;
2685 struct sched_entity *se;
2686 s64 delta;
2687
2688 ideal_runtime = sched_slice(cfs_rq, curr);
2689 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2690 if (delta_exec > ideal_runtime) {
2691 resched_task(rq_of(cfs_rq)->curr);
2692
2693
2694
2695
2696 clear_buddies(cfs_rq, curr);
2697 return;
2698 }
2699
2700
2701
2702
2703
2704
2705 if (delta_exec < sysctl_sched_min_granularity)
2706 return;
2707
2708 se = __pick_first_entity(cfs_rq);
2709 delta = curr->vruntime - se->vruntime;
2710
2711 if (delta < 0)
2712 return;
2713
2714 if (delta > ideal_runtime)
2715 resched_task(rq_of(cfs_rq)->curr);
2716}
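
/*
 * Worked example with the stock tunables (6ms latency target, 0.75ms
 * minimum granularity, before any CPU-count rescaling): three runnable
 * nice-0 tasks share one 6ms period, so sched_slice() above yields an
 * ideal_runtime of ~2ms each.  curr is rescheduled once it has run 2ms
 * since it was last picked, or earlier once it has run at least 0.75ms
 * and its vruntime leads the leftmost entity by more than 2ms.
 */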
2717
2718static void
2719set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2720{
2721
2722 if (se->on_rq) {
2723
2724
2725
2726
2727
2728 update_stats_wait_end(cfs_rq, se);
2729 __dequeue_entity(cfs_rq, se);
2730 }
2731
2732 update_stats_curr_start(cfs_rq, se);
2733 cfs_rq->curr = se;
2734#ifdef CONFIG_SCHEDSTATS
2735
2736
2737
2738
2739
2740 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2741 se->statistics.slice_max = max(se->statistics.slice_max,
2742 se->sum_exec_runtime - se->prev_sum_exec_runtime);
2743 }
2744#endif
2745 se->prev_sum_exec_runtime = se->sum_exec_runtime;
2746}
2747
2748static int
2749wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2750
2751
2752
2753
2754
2755
2756
2757
2758static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2759{
2760 struct sched_entity *se = __pick_first_entity(cfs_rq);
2761 struct sched_entity *left = se;
2762
2763
2764
2765
2766
2767 if (cfs_rq->skip == se) {
2768 struct sched_entity *second = __pick_next_entity(se);
2769 if (second && wakeup_preempt_entity(second, left) < 1)
2770 se = second;
2771 }
2772
2773
2774
2775
2776 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2777 se = cfs_rq->last;
2778
2779
2780
2781
2782 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2783 se = cfs_rq->next;
2784
2785 clear_buddies(cfs_rq, se);
2786
2787 return se;
2788}
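
/*
 * Buddy precedence in the pick above: the leftmost entity is the default;
 * a "skip" hint (set by yield) is honoured only if the runner-up would not
 * be unfairly far behind; the "last" buddy (a cache-hot preemptee) may then
 * override the pick, and the "next" buddy (an explicitly preferred wakee)
 * is tested last and so wins when both are eligible.  In each case
 * wakeup_preempt_entity(buddy, left) < 1 means the leftmost entity would
 * not itself preempt the buddy, i.e. honouring the hint keeps the
 * unfairness within the wakeup granularity.
 */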
2789
2790static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2791
2792static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2793{
2794
2795
2796
2797
2798 if (prev->on_rq)
2799 update_curr(cfs_rq);
2800
2801
2802 check_cfs_rq_runtime(cfs_rq);
2803
2804 check_spread(cfs_rq, prev);
2805 if (prev->on_rq) {
2806 update_stats_wait_start(cfs_rq, prev);
2807
2808 __enqueue_entity(cfs_rq, prev);
2809
2810 update_entity_load_avg(prev, 1);
2811 }
2812 cfs_rq->curr = NULL;
2813}
2814
2815static void
2816entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2817{
2818
2819
2820
2821 update_curr(cfs_rq);
2822
2823
2824
2825
2826 update_entity_load_avg(curr, 1);
2827 update_cfs_rq_blocked_load(cfs_rq, 1);
2828 update_cfs_shares(cfs_rq);
2829
2830#ifdef CONFIG_SCHED_HRTICK
2831
2832
2833
2834
2835 if (queued) {
2836 resched_task(rq_of(cfs_rq)->curr);
2837 return;
2838 }
2839
2840
2841
2842 if (!sched_feat(DOUBLE_TICK) &&
2843 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2844 return;
2845#endif
2846
2847 if (cfs_rq->nr_running > 1)
2848 check_preempt_tick(cfs_rq, curr);
2849}
2850
2851
2852
2853
2854
2855
2856#ifdef CONFIG_CFS_BANDWIDTH
2857
2858#ifdef HAVE_JUMP_LABEL
2859static struct static_key __cfs_bandwidth_used;
2860
2861static inline bool cfs_bandwidth_used(void)
2862{
2863 return static_key_false(&__cfs_bandwidth_used);
2864}
2865
2866void cfs_bandwidth_usage_inc(void)
2867{
2868 static_key_slow_inc(&__cfs_bandwidth_used);
2869}
2870
2871void cfs_bandwidth_usage_dec(void)
2872{
2873 static_key_slow_dec(&__cfs_bandwidth_used);
2874}
2875#else
2876static bool cfs_bandwidth_used(void)
2877{
2878 return true;
2879}
2880
2881void cfs_bandwidth_usage_inc(void) {}
2882void cfs_bandwidth_usage_dec(void) {}
2883#endif
2884
2885
2886
2887
2888
2889static inline u64 default_cfs_period(void)
2890{
2891 return 100000000ULL;
2892}
2893
2894static inline u64 sched_cfs_bandwidth_slice(void)
2895{
2896 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2897}
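
/*
 * With the defaults in this file (a 100ms period from default_cfs_period()
 * and a 5000us slice), a cfs_rq tops up its local runtime_remaining in 5ms
 * chunks, so a full-period quota is handed out in at most 20 slices.  A
 * build-excluded sketch of that arithmetic (illustrative only):
 */
#if 0
static void cfs_bandwidth_slice_example(void)
{
	u64 period_ns = 100000000ULL;			/* default_cfs_period() */
	u64 slice_ns  = 5000ULL * NSEC_PER_USEC;	/* default bandwidth slice */
	u64 slices_per_period = period_ns / slice_ns;	/* == 20 */

	(void)slices_per_period;
}
#endif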
2898
2899
2900
2901
2902
2903
2904
2905
2906void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2907{
2908 u64 now;
2909
2910 if (cfs_b->quota == RUNTIME_INF)
2911 return;
2912
2913 now = sched_clock_cpu(smp_processor_id());
2914 cfs_b->runtime = cfs_b->quota;
2915 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2916}
2917
2918static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2919{
2920 return &tg->cfs_bandwidth;
2921}
2922
2923
2924static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2925{
2926 if (unlikely(cfs_rq->throttle_count))
2927 return cfs_rq->throttled_clock_task;
2928
2929 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
2930}
2931
2932
2933static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2934{
2935 struct task_group *tg = cfs_rq->tg;
2936 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2937 u64 amount = 0, min_amount, expires;
2938
2939
2940 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2941
2942 raw_spin_lock(&cfs_b->lock);
2943 if (cfs_b->quota == RUNTIME_INF)
2944 amount = min_amount;
2945 else {
2946
2947
2948
2949
2950
2951
2952 if (!cfs_b->timer_active) {
2953 __refill_cfs_bandwidth_runtime(cfs_b);
2954 __start_cfs_bandwidth(cfs_b);
2955 }
2956
2957 if (cfs_b->runtime > 0) {
2958 amount = min(cfs_b->runtime, min_amount);
2959 cfs_b->runtime -= amount;
2960 cfs_b->idle = 0;
2961 }
2962 }
2963 expires = cfs_b->runtime_expires;
2964 raw_spin_unlock(&cfs_b->lock);
2965
2966 cfs_rq->runtime_remaining += amount;
2967
2968
2969
2970
2971
2972 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2973 cfs_rq->runtime_expires = expires;
2974
2975 return cfs_rq->runtime_remaining > 0;
2976}
2977
2978
2979
2980
2981
2982static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2983{
2984 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2985
2986
2987 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
2988 return;
2989
2990 if (cfs_rq->runtime_remaining < 0)
2991 return;
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
3003
3004 cfs_rq->runtime_expires += TICK_NSEC;
3005 } else {
3006
3007 cfs_rq->runtime_remaining = 0;
3008 }
3009}
3010
3011static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3012{
3013
3014 cfs_rq->runtime_remaining -= delta_exec;
3015 expire_cfs_rq_runtime(cfs_rq);
3016
3017 if (likely(cfs_rq->runtime_remaining > 0))
3018 return;
3019
3020
3021
3022
3023
3024 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3025 resched_task(rq_of(cfs_rq)->curr);
3026}
3027
3028static __always_inline
3029void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3030{
3031 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3032 return;
3033
3034 __account_cfs_rq_runtime(cfs_rq, delta_exec);
3035}
3036
3037static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3038{
3039 return cfs_bandwidth_used() && cfs_rq->throttled;
3040}
3041
3042
3043static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3044{
3045 return cfs_bandwidth_used() && cfs_rq->throttle_count;
3046}
3047
3048
3049
3050
3051
3052
3053static inline int throttled_lb_pair(struct task_group *tg,
3054 int src_cpu, int dest_cpu)
3055{
3056 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3057
3058 src_cfs_rq = tg->cfs_rq[src_cpu];
3059 dest_cfs_rq = tg->cfs_rq[dest_cpu];
3060
3061 return throttled_hierarchy(src_cfs_rq) ||
3062 throttled_hierarchy(dest_cfs_rq);
3063}
3064
3065
3066static int tg_unthrottle_up(struct task_group *tg, void *data)
3067{
3068 struct rq *rq = data;
3069 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3070
3071 cfs_rq->throttle_count--;
3072#ifdef CONFIG_SMP
3073 if (!cfs_rq->throttle_count) {
3074
3075 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3076 cfs_rq->throttled_clock_task;
3077 }
3078#endif
3079
3080 return 0;
3081}
3082
3083static int tg_throttle_down(struct task_group *tg, void *data)
3084{
3085 struct rq *rq = data;
3086 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3087
	/* group is entering the throttled state: freeze its task clock */
3089 if (!cfs_rq->throttle_count)
3090 cfs_rq->throttled_clock_task = rq_clock_task(rq);
3091 cfs_rq->throttle_count++;
3092
3093 return 0;
3094}
3095
3096static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3097{
3098 struct rq *rq = rq_of(cfs_rq);
3099 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3100 struct sched_entity *se;
3101 long task_delta, dequeue = 1;
3102
3103 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3104
3105
3106 rcu_read_lock();
3107 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3108 rcu_read_unlock();
3109
3110 task_delta = cfs_rq->h_nr_running;
3111 for_each_sched_entity(se) {
3112 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3113
3114 if (!se->on_rq)
3115 break;
3116
3117 if (dequeue)
3118 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3119 qcfs_rq->h_nr_running -= task_delta;
3120
3121 if (qcfs_rq->load.weight)
3122 dequeue = 0;
3123 }
3124
3125 if (!se)
3126 rq->nr_running -= task_delta;
3127
3128 cfs_rq->throttled = 1;
3129 cfs_rq->throttled_clock = rq_clock(rq);
3130 raw_spin_lock(&cfs_b->lock);
3131 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3132 if (!cfs_b->timer_active)
3133 __start_cfs_bandwidth(cfs_b);
3134 raw_spin_unlock(&cfs_b->lock);
3135}
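
/*
 * Throttling dequeues the group's entity from each ancestor until it hits
 * a level that still has other runnable load, while subtracting the
 * hierarchy's task count from every ancestor it visits; the cfs_rq is
 * then parked on the bandwidth pool's throttled list until the period or
 * slack timer hands it fresh runtime and unthrottle_cfs_rq() reverses the
 * operation.
 */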
3136
3137void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3138{
3139 struct rq *rq = rq_of(cfs_rq);
3140 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3141 struct sched_entity *se;
3142 int enqueue = 1;
3143 long task_delta;
3144
3145 se = cfs_rq->tg->se[cpu_of(rq)];
3146
3147 cfs_rq->throttled = 0;
3148
3149 update_rq_clock(rq);
3150
3151 raw_spin_lock(&cfs_b->lock);
3152 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3153 list_del_rcu(&cfs_rq->throttled_list);
3154 raw_spin_unlock(&cfs_b->lock);
3155
3156
3157 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3158
3159 if (!cfs_rq->load.weight)
3160 return;
3161
3162 task_delta = cfs_rq->h_nr_running;
3163 for_each_sched_entity(se) {
3164 if (se->on_rq)
3165 enqueue = 0;
3166
3167 cfs_rq = cfs_rq_of(se);
3168 if (enqueue)
3169 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3170 cfs_rq->h_nr_running += task_delta;
3171
3172 if (cfs_rq_throttled(cfs_rq))
3173 break;
3174 }
3175
3176 if (!se)
3177 rq->nr_running += task_delta;
3178
3179
3180 if (rq->curr == rq->idle && rq->cfs.nr_running)
3181 resched_task(rq->curr);
3182}
3183
3184static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3185 u64 remaining, u64 expires)
3186{
3187 struct cfs_rq *cfs_rq;
3188 u64 runtime = remaining;
3189
3190 rcu_read_lock();
3191 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3192 throttled_list) {
3193 struct rq *rq = rq_of(cfs_rq);
3194
3195 raw_spin_lock(&rq->lock);
3196 if (!cfs_rq_throttled(cfs_rq))
3197 goto next;
3198
3199 runtime = -cfs_rq->runtime_remaining + 1;
3200 if (runtime > remaining)
3201 runtime = remaining;
3202 remaining -= runtime;
3203
3204 cfs_rq->runtime_remaining += runtime;
3205 cfs_rq->runtime_expires = expires;
3206
3207
3208 if (cfs_rq->runtime_remaining > 0)
3209 unthrottle_cfs_rq(cfs_rq);
3210
3211next:
3212 raw_spin_unlock(&rq->lock);
3213
3214 if (!remaining)
3215 break;
3216 }
3217 rcu_read_unlock();
3218
3219 return remaining;
3220}
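
/*
 * Note the "-runtime_remaining + 1" above: a throttled cfs_rq has a zero
 * or negative local balance, so it is handed just enough runtime to go
 * one nanosecond positive, which is the unthrottle condition checked
 * here; whatever is left over stays available for the next cfs_rq on the
 * throttled list.
 */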
3221
3222
3223
3224
3225
3226
3227
3228static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3229{
3230 u64 runtime, runtime_expires;
3231 int idle = 1, throttled;
3232
3233 raw_spin_lock(&cfs_b->lock);
3234
3235 if (cfs_b->quota == RUNTIME_INF)
3236 goto out_unlock;
3237
3238 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3239
3240 idle = cfs_b->idle && !throttled;
3241 cfs_b->nr_periods += overrun;
3242
3243
3244 if (idle)
3245 goto out_unlock;
3246
3247
3248
3249
3250
3251
3252 cfs_b->timer_active = 1;
3253
3254 __refill_cfs_bandwidth_runtime(cfs_b);
3255
3256 if (!throttled) {
3257
3258 cfs_b->idle = 1;
3259 goto out_unlock;
3260 }
3261
3262
3263 cfs_b->nr_throttled += overrun;
3264
3265
3266
3267
3268
3269
3270
3271 runtime = cfs_b->runtime;
3272 runtime_expires = cfs_b->runtime_expires;
3273 cfs_b->runtime = 0;
3274
3275
3276
3277
3278
3279
3280 while (throttled && runtime > 0) {
3281 raw_spin_unlock(&cfs_b->lock);
3282
3283 runtime = distribute_cfs_runtime(cfs_b, runtime,
3284 runtime_expires);
3285 raw_spin_lock(&cfs_b->lock);
3286
3287 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3288 }
3289
3290
3291 cfs_b->runtime = runtime;
3292
3293
3294
3295
3296
3297
3298 cfs_b->idle = 0;
3299out_unlock:
3300 if (idle)
3301 cfs_b->timer_active = 0;
3302 raw_spin_unlock(&cfs_b->lock);
3303
3304 return idle;
3305}
3306
3307
3308static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3309
3310static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3311
3312static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3313
3314
3315
3316
3317
3318
3319
3320
3321static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3322{
3323 struct hrtimer *refresh_timer = &cfs_b->period_timer;
3324 u64 remaining;
3325
3326
3327 if (hrtimer_callback_running(refresh_timer))
3328 return 1;
3329
3330
3331 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3332 if (remaining < min_expire)
3333 return 1;
3334
3335 return 0;
3336}
3337
3338static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3339{
3340 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3341
3342
3343 if (runtime_refresh_within(cfs_b, min_left))
3344 return;
3345
3346 start_bandwidth_timer(&cfs_b->slack_timer,
3347 ns_to_ktime(cfs_bandwidth_slack_period));
3348}
3349
3350
3351static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3352{
3353 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3354 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3355
3356 if (slack_runtime <= 0)
3357 return;
3358
3359 raw_spin_lock(&cfs_b->lock);
3360 if (cfs_b->quota != RUNTIME_INF &&
3361 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3362 cfs_b->runtime += slack_runtime;
3363
3364
3365 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3366 !list_empty(&cfs_b->throttled_cfs_rq))
3367 start_cfs_slack_bandwidth(cfs_b);
3368 }
3369 raw_spin_unlock(&cfs_b->lock);
3370
3371
3372 cfs_rq->runtime_remaining -= slack_runtime;
3373}
3374
3375static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3376{
3377 if (!cfs_bandwidth_used())
3378 return;
3379
3380 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3381 return;
3382
3383 __return_cfs_rq_runtime(cfs_rq);
3384}
3385
3386
3387
3388
3389
3390static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3391{
3392 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3393 u64 expires;
3394
3395
3396 raw_spin_lock(&cfs_b->lock);
3397 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3398 raw_spin_unlock(&cfs_b->lock);
3399 return;
3400 }
3401
3402 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
3403 runtime = cfs_b->runtime;
3404 cfs_b->runtime = 0;
3405 }
3406 expires = cfs_b->runtime_expires;
3407 raw_spin_unlock(&cfs_b->lock);
3408
3409 if (!runtime)
3410 return;
3411
3412 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3413
3414 raw_spin_lock(&cfs_b->lock);
3415 if (expires == cfs_b->runtime_expires)
3416 cfs_b->runtime = runtime;
3417 raw_spin_unlock(&cfs_b->lock);
3418}
3419
3420
3421
3422
3423
3424
3425static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3426{
3427 if (!cfs_bandwidth_used())
3428 return;
3429
3430
3431 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3432 return;
3433
3434
3435 if (cfs_rq_throttled(cfs_rq))
3436 return;
3437
3438
3439 account_cfs_rq_runtime(cfs_rq, 0);
3440 if (cfs_rq->runtime_remaining <= 0)
3441 throttle_cfs_rq(cfs_rq);
3442}
3443
3444
3445static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3446{
3447 if (!cfs_bandwidth_used())
3448 return;
3449
3450 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3451 return;
3452
3453
3454
3455
3456
3457 if (cfs_rq_throttled(cfs_rq))
3458 return;
3459
3460 throttle_cfs_rq(cfs_rq);
3461}
3462
3463static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3464{
3465 struct cfs_bandwidth *cfs_b =
3466 container_of(timer, struct cfs_bandwidth, slack_timer);
3467 do_sched_cfs_slack_timer(cfs_b);
3468
3469 return HRTIMER_NORESTART;
3470}
3471
3472static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3473{
3474 struct cfs_bandwidth *cfs_b =
3475 container_of(timer, struct cfs_bandwidth, period_timer);
3476 ktime_t now;
3477 int overrun;
3478 int idle = 0;
3479
3480 for (;;) {
3481 now = hrtimer_cb_get_time(timer);
3482 overrun = hrtimer_forward(timer, now, cfs_b->period);
3483
3484 if (!overrun)
3485 break;
3486
3487 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3488 }
3489
3490 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3491}
3492
3493void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3494{
3495 raw_spin_lock_init(&cfs_b->lock);
3496 cfs_b->runtime = 0;
3497 cfs_b->quota = RUNTIME_INF;
3498 cfs_b->period = ns_to_ktime(default_cfs_period());
3499
3500 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3501 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3502 cfs_b->period_timer.function = sched_cfs_period_timer;
3503 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3504 cfs_b->slack_timer.function = sched_cfs_slack_timer;
3505}
3506
3507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3508{
3509 cfs_rq->runtime_enabled = 0;
3510 INIT_LIST_HEAD(&cfs_rq->throttled_list);
3511}
3512
3513
3514void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3515{
3516
3517
3518
3519
3520
3521
3522 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3523 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3524
3525 raw_spin_unlock(&cfs_b->lock);
3526 cpu_relax();
3527 raw_spin_lock(&cfs_b->lock);
3528
3529 if (cfs_b->timer_active)
3530 return;
3531 }
3532
3533 cfs_b->timer_active = 1;
3534 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
3535}
3536
3537static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3538{
3539 hrtimer_cancel(&cfs_b->period_timer);
3540 hrtimer_cancel(&cfs_b->slack_timer);
3541}
3542
3543static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3544{
3545 struct cfs_rq *cfs_rq;
3546
3547 for_each_leaf_cfs_rq(rq, cfs_rq) {
3548 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3549
3550 if (!cfs_rq->runtime_enabled)
3551 continue;
3552
3553
3554
3555
3556
3557 cfs_rq->runtime_remaining = cfs_b->quota;
3558 if (cfs_rq_throttled(cfs_rq))
3559 unthrottle_cfs_rq(cfs_rq);
3560 }
3561}
3562
3563#else
3564static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3565{
3566 return rq_clock_task(rq_of(cfs_rq));
3567}
3568
3569static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3570static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3571static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3572static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3573
3574static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3575{
3576 return 0;
3577}
3578
3579static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3580{
3581 return 0;
3582}
3583
3584static inline int throttled_lb_pair(struct task_group *tg,
3585 int src_cpu, int dest_cpu)
3586{
3587 return 0;
3588}
3589
3590void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3591
3592#ifdef CONFIG_FAIR_GROUP_SCHED
3593static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3594#endif
3595
3596static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3597{
3598 return NULL;
3599}
3600static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3601static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3602
3603#endif
3604
3605
3606
3607
3608
3609#ifdef CONFIG_SCHED_HRTICK
3610static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3611{
3612 struct sched_entity *se = &p->se;
3613 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3614
3615 WARN_ON(task_rq(p) != rq);
3616
3617 if (cfs_rq->nr_running > 1) {
3618 u64 slice = sched_slice(cfs_rq, se);
3619 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
3620 s64 delta = slice - ran;
3621
3622 if (delta < 0) {
3623 if (rq->curr == p)
3624 resched_task(p);
3625 return;
3626 }
3627
3628
3629
3630
3631
3632 if (rq->curr != p)
3633 delta = max_t(s64, 10000LL, delta);
3634
3635 hrtick_start(rq, delta);
3636 }
3637}
3638
3639
3640
3641
3642
3643
3644static void hrtick_update(struct rq *rq)
3645{
3646 struct task_struct *curr = rq->curr;
3647
3648 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
3649 return;
3650
3651 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
3652 hrtick_start_fair(rq, curr);
3653}
3654#else
3655static inline void
3656hrtick_start_fair(struct rq *rq, struct task_struct *p)
3657{
3658}
3659
3660static inline void hrtick_update(struct rq *rq)
3661{
3662}
3663#endif
3664
3665
3666
3667
3668
3669
3670static void
3671enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3672{
3673 struct cfs_rq *cfs_rq;
3674 struct sched_entity *se = &p->se;
3675
3676 for_each_sched_entity(se) {
3677 if (se->on_rq)
3678 break;
3679 cfs_rq = cfs_rq_of(se);
3680 enqueue_entity(cfs_rq, se, flags);
3681
3682
3683
3684
3685
3686
3687
3688 if (cfs_rq_throttled(cfs_rq))
3689 break;
3690 cfs_rq->h_nr_running++;
3691
3692 flags = ENQUEUE_WAKEUP;
3693 }
3694
3695 for_each_sched_entity(se) {
3696 cfs_rq = cfs_rq_of(se);
3697 cfs_rq->h_nr_running++;
3698
3699 if (cfs_rq_throttled(cfs_rq))
3700 break;
3701
3702 update_cfs_shares(cfs_rq);
3703 update_entity_load_avg(se, 1);
3704 }
3705
3706 if (!se) {
3707 update_rq_runnable_avg(rq, rq->nr_running);
3708 inc_nr_running(rq);
3709 }
3710 hrtick_update(rq);
3711}
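
/*
 * Two passes: the first enqueues group entities bottom-up until it meets
 * one that is already queued (its ancestors are then queued as well); the
 * second only bumps h_nr_running and refreshes shares and load averages
 * for the remaining ancestors.  Both stop at a throttled cfs_rq, whose
 * hierarchy statistics are fixed up at unthrottle time instead.
 */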
3712
3713static void set_next_buddy(struct sched_entity *se);
3714
3715
3716
3717
3718
3719
3720static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3721{
3722 struct cfs_rq *cfs_rq;
3723 struct sched_entity *se = &p->se;
3724 int task_sleep = flags & DEQUEUE_SLEEP;
3725
3726 for_each_sched_entity(se) {
3727 cfs_rq = cfs_rq_of(se);
3728 dequeue_entity(cfs_rq, se, flags);
3729
3730
3731
3732
3733
3734
3735
3736 if (cfs_rq_throttled(cfs_rq))
3737 break;
3738 cfs_rq->h_nr_running--;
3739
3740
3741 if (cfs_rq->load.weight) {
3742
3743
3744
3745
3746 if (task_sleep && parent_entity(se))
3747 set_next_buddy(parent_entity(se));
3748
3749
3750 se = parent_entity(se);
3751 break;
3752 }
3753 flags |= DEQUEUE_SLEEP;
3754 }
3755
3756 for_each_sched_entity(se) {
3757 cfs_rq = cfs_rq_of(se);
3758 cfs_rq->h_nr_running--;
3759
3760 if (cfs_rq_throttled(cfs_rq))
3761 break;
3762
3763 update_cfs_shares(cfs_rq);
3764 update_entity_load_avg(se, 1);
3765 }
3766
3767 if (!se) {
3768 dec_nr_running(rq);
3769 update_rq_runnable_avg(rq, 1);
3770 }
3771 hrtick_update(rq);
3772}
3773
3774#ifdef CONFIG_SMP
3775
3776static unsigned long weighted_cpuload(const int cpu)
3777{
3778 return cpu_rq(cpu)->cfs.runnable_load_avg;
3779}
3780
3781
3782
3783
3784
3785
3786
3787
3788static unsigned long source_load(int cpu, int type)
3789{
3790 struct rq *rq = cpu_rq(cpu);
3791 unsigned long total = weighted_cpuload(cpu);
3792
3793 if (type == 0 || !sched_feat(LB_BIAS))
3794 return total;
3795
3796 return min(rq->cpu_load[type-1], total);
3797}
3798
3799
3800
3801
3802
3803static unsigned long target_load(int cpu, int type)
3804{
3805 struct rq *rq = cpu_rq(cpu);
3806 unsigned long total = weighted_cpuload(cpu);
3807
3808 if (type == 0 || !sched_feat(LB_BIAS))
3809 return total;
3810
3811 return max(rq->cpu_load[type-1], total);
3812}
3813
3814static unsigned long power_of(int cpu)
3815{
3816 return cpu_rq(cpu)->cpu_power;
3817}
3818
3819static unsigned long cpu_avg_load_per_task(int cpu)
3820{
3821 struct rq *rq = cpu_rq(cpu);
3822 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3823 unsigned long load_avg = rq->cfs.runnable_load_avg;
3824
3825 if (nr_running)
3826 return load_avg / nr_running;
3827
3828 return 0;
3829}
3830
3831static void record_wakee(struct task_struct *p)
3832{
3833
3834
3835
3836
3837
3838 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3839 current->wakee_flips = 0;
3840 current->wakee_flip_decay_ts = jiffies;
3841 }
3842
3843 if (current->last_wakee != p) {
3844 current->last_wakee = p;
3845 current->wakee_flips++;
3846 }
3847}
3848
3849static void task_waking_fair(struct task_struct *p)
3850{
3851 struct sched_entity *se = &p->se;
3852 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3853 u64 min_vruntime;
3854
3855#ifndef CONFIG_64BIT
3856 u64 min_vruntime_copy;
3857
3858 do {
3859 min_vruntime_copy = cfs_rq->min_vruntime_copy;
3860 smp_rmb();
3861 min_vruntime = cfs_rq->min_vruntime;
3862 } while (min_vruntime != min_vruntime_copy);
3863#else
3864 min_vruntime = cfs_rq->min_vruntime;
3865#endif
3866
3867 se->vruntime -= min_vruntime;
3868 record_wakee(p);
3869}
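
/*
 * On 32-bit, min_vruntime cannot be loaded atomically, so the updater
 * publishes min_vruntime_copy after min_vruntime and the loop above spins
 * until both reads agree (a hand-rolled seqcount); 64-bit reads it
 * directly.  Subtracting it keeps the wakee's vruntime normalized until
 * enqueue_entity() adds the target runqueue's own min_vruntime back in.
 */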
3870
3871#ifdef CONFIG_FAIR_GROUP_SCHED
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3923{
3924 struct sched_entity *se = tg->se[cpu];
3925
3926 if (!tg->parent)
3927 return wl;
3928
3929 for_each_sched_entity(se) {
3930 long w, W;
3931
3932 tg = se->my_q->tg;
3933
3934
3935
3936
3937 W = wg + calc_tg_weight(tg, se->my_q);
3938
3939
3940
3941
3942 w = se->my_q->load.weight + wl;
3943
3944
3945
3946
3947 if (W > 0 && w < W)
3948 wl = (w * tg->shares) / W;
3949 else
3950 wl = tg->shares;
3951
3952
3953
3954
3955
3956
3957 if (wl < MIN_SHARES)
3958 wl = MIN_SHARES;
3959
3960
3961
3962
3963 wl -= se->load.weight;
3964
3965
3966
3967
3968
3969
3970
3971
3972 wg = 0;
3973 }
3974
3975 return wl;
3976}
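
/*
 * One level of the loop above as a build-excluded, userspace-style sketch.
 * The names are illustrative only: tg_weight stands in for whatever
 * calc_tg_weight() computes, the MIN_SHARES clamp is mentioned rather than
 * reproduced, and above the first level the kernel passes wg = 0 because
 * only this cpu's entity weight keeps changing further up.
 */
#if 0
static long effective_load_step(long shares, long tg_weight, long rq_weight,
				long se_weight, long wl, long wg)
{
	long W = wg + tg_weight;	/* group weight after the change */
	long w = rq_weight + wl;	/* this cpu's share of it */
	long new_se_weight;

	if (W > 0 && w < W)
		new_se_weight = (w * shares) / W;
	else
		new_se_weight = shares;

	/* the kernel additionally clamps new_se_weight at MIN_SHARES here */

	return new_se_weight - se_weight;	/* becomes wl one level up */
}
#endif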
3977#else
3978
3979static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3980{
3981 return wl;
3982}
3983
3984#endif
3985
3986static int wake_wide(struct task_struct *p)
3987{
3988 int factor = this_cpu_read(sd_llc_size);
3989
3990
3991
3992
3993
3994
3995 if (p->wakee_flips > factor) {
3996
3997
3998
3999
4000
4001 if (current->wakee_flips > (factor * p->wakee_flips))
4002 return 1;
4003 }
4004
4005 return 0;
4006}
4007
4008static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4009{
4010 s64 this_load, load;
4011 int idx, this_cpu, prev_cpu;
4012 unsigned long tl_per_task;
4013 struct task_group *tg;
4014 unsigned long weight;
4015 int balanced;
4016
4017
4018
4019
4020
4021 if (wake_wide(p))
4022 return 0;
4023
4024 idx = sd->wake_idx;
4025 this_cpu = smp_processor_id();
4026 prev_cpu = task_cpu(p);
4027 load = source_load(prev_cpu, idx);
4028 this_load = target_load(this_cpu, idx);
4029
4030
4031
4032
4033
4034
4035 if (sync) {
4036 tg = task_group(current);
4037 weight = current->se.load.weight;
4038
4039 this_load += effective_load(tg, this_cpu, -weight, -weight);
4040 load += effective_load(tg, prev_cpu, 0, -weight);
4041 }
4042
4043 tg = task_group(p);
4044 weight = p->se.load.weight;
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055 if (this_load > 0) {
4056 s64 this_eff_load, prev_eff_load;
4057
4058 this_eff_load = 100;
4059 this_eff_load *= power_of(prev_cpu);
4060 this_eff_load *= this_load +
4061 effective_load(tg, this_cpu, weight, weight);
4062
4063 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4064 prev_eff_load *= power_of(this_cpu);
4065 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4066
4067 balanced = this_eff_load <= prev_eff_load;
4068 } else
4069 balanced = true;
4070
4071
4072
4073
4074
4075
4076 if (sync && balanced)
4077 return 1;
4078
4079 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4080 tl_per_task = cpu_avg_load_per_task(this_cpu);
4081
4082 if (balanced ||
4083 (this_load <= load &&
4084 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4085
4086
4087
4088
4089
4090 schedstat_inc(sd, ttwu_move_affine);
4091 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4092
4093 return 1;
4094 }
4095 return 0;
4096}
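
/*
 * The balance test above, assuming the common imbalance_pct of 125 and
 * equal cpu power: prev_eff_load is weighted by 100 + (125 - 100) / 2 = 112
 * against 100 for this_eff_load, so pulling the wakee here counts as
 * balanced while this cpu's effective load stays within ~12% above the
 * previous cpu's.  Even when unbalanced, the wakeup may still be affine if
 * this cpu is the lighter one and the combined load fits under this cpu's
 * average load per task.
 */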
4097
4098
4099
4100
4101
4102static struct sched_group *
4103find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4104 int this_cpu, int load_idx)
4105{
4106 struct sched_group *idlest = NULL, *group = sd->groups;
4107 unsigned long min_load = ULONG_MAX, this_load = 0;
4108 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4109
4110 do {
4111 unsigned long load, avg_load;
4112 int local_group;
4113 int i;
4114
4115
4116 if (!cpumask_intersects(sched_group_cpus(group),
4117 tsk_cpus_allowed(p)))
4118 continue;
4119
4120 local_group = cpumask_test_cpu(this_cpu,
4121 sched_group_cpus(group));
4122
4123
4124 avg_load = 0;
4125
4126 for_each_cpu(i, sched_group_cpus(group)) {
4127
4128 if (local_group)
4129 load = source_load(i, load_idx);
4130 else
4131 load = target_load(i, load_idx);
4132
4133 avg_load += load;
4134 }
4135
4136
4137 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
4138
4139 if (local_group) {
4140 this_load = avg_load;
4141 } else if (avg_load < min_load) {
4142 min_load = avg_load;
4143 idlest = group;
4144 }
4145 } while (group = group->next, group != sd->groups);
4146
4147 if (!idlest || 100*this_load < imbalance*min_load)
4148 return NULL;
4149 return idlest;
4150}
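
/*
 * With the common imbalance_pct of 125 the threshold above works out to
 * 112, so an idlest remote group is returned only when the local group's
 * average load exceeds the remote minimum by roughly 12% or more;
 * otherwise the caller keeps searching within the local group's child
 * domain.
 */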
4151
4152
4153
4154
4155static int
4156find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4157{
4158 unsigned long load, min_load = ULONG_MAX;
4159 int idlest = -1;
4160 int i;
4161
4162
4163 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4164 load = weighted_cpuload(i);
4165
4166 if (load < min_load || (load == min_load && i == this_cpu)) {
4167 min_load = load;
4168 idlest = i;
4169 }
4170 }
4171
4172 return idlest;
4173}
4174
4175
4176
4177
4178static int select_idle_sibling(struct task_struct *p, int target)
4179{
4180 struct sched_domain *sd;
4181 struct sched_group *sg;
4182 int i = task_cpu(p);
4183
4184 if (idle_cpu(target))
4185 return target;
4186
4187
4188
4189
4190 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4191 return i;
4192
4193
4194
4195
4196 sd = rcu_dereference(per_cpu(sd_llc, target));
4197 for_each_lower_domain(sd) {
4198 sg = sd->groups;
4199 do {
4200 if (!cpumask_intersects(sched_group_cpus(sg),
4201 tsk_cpus_allowed(p)))
4202 goto next;
4203
4204 for_each_cpu(i, sched_group_cpus(sg)) {
4205 if (i == target || !idle_cpu(i))
4206 goto next;
4207 }
4208
4209 target = cpumask_first_and(sched_group_cpus(sg),
4210 tsk_cpus_allowed(p));
4211 goto done;
4212next:
4213 sg = sg->next;
4214 } while (sg != sd->groups);
4215 }
4216done:
4217 return target;
4218}
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231static int
4232select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4233{
4234 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4235 int cpu = smp_processor_id();
4236 int new_cpu = cpu;
4237 int want_affine = 0;
4238 int sync = wake_flags & WF_SYNC;
4239
4240 if (p->nr_cpus_allowed == 1)
4241 return prev_cpu;
4242
4243 if (sd_flag & SD_BALANCE_WAKE) {
4244 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4245 want_affine = 1;
4246 new_cpu = prev_cpu;
4247 }
4248
4249 rcu_read_lock();
4250 for_each_domain(cpu, tmp) {
4251 if (!(tmp->flags & SD_LOAD_BALANCE))
4252 continue;
4253
4254
4255
4256
4257
4258 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4259 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4260 affine_sd = tmp;
4261 break;
4262 }
4263
4264 if (tmp->flags & sd_flag)
4265 sd = tmp;
4266 }
4267
4268 if (affine_sd) {
4269 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4270 prev_cpu = cpu;
4271
4272 new_cpu = select_idle_sibling(p, prev_cpu);
4273 goto unlock;
4274 }
4275
4276 while (sd) {
4277 int load_idx = sd->forkexec_idx;
4278 struct sched_group *group;
4279 int weight;
4280
4281 if (!(sd->flags & sd_flag)) {
4282 sd = sd->child;
4283 continue;
4284 }
4285
4286 if (sd_flag & SD_BALANCE_WAKE)
4287 load_idx = sd->wake_idx;
4288
4289 group = find_idlest_group(sd, p, cpu, load_idx);
4290 if (!group) {
4291 sd = sd->child;
4292 continue;
4293 }
4294
4295 new_cpu = find_idlest_cpu(group, p, cpu);
4296 if (new_cpu == -1 || new_cpu == cpu) {
4297
4298 sd = sd->child;
4299 continue;
4300 }
4301
4302
4303 cpu = new_cpu;
4304 weight = sd->span_weight;
4305 sd = NULL;
4306 for_each_domain(cpu, tmp) {
4307 if (weight <= tmp->span_weight)
4308 break;
4309 if (tmp->flags & sd_flag)
4310 sd = tmp;
4311 }
4312
4313 }
4314unlock:
4315 rcu_read_unlock();
4316
4317 return new_cpu;
4318}
4319
4320
4321
4322
4323
4324
4325
4326static void
4327migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4328{
4329 struct sched_entity *se = &p->se;
4330 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4331
4332
4333
4334
4335
4336
4337
4338 if (se->avg.decay_count) {
4339 se->avg.decay_count = -__synchronize_entity_decay(se);
4340 atomic_long_add(se->avg.load_avg_contrib,
4341 &cfs_rq->removed_load);
4342 }
4343}
4344#endif
4345
4346static unsigned long
4347wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4348{
4349 unsigned long gran = sysctl_sched_wakeup_granularity;
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364 return calc_delta_fair(gran, se);
4365}
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381static int
4382wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
4383{
4384 s64 gran, vdiff = curr->vruntime - se->vruntime;
4385
4386 if (vdiff <= 0)
4387 return -1;
4388
4389 gran = wakeup_gran(curr, se);
4390 if (vdiff > gran)
4391 return 1;
4392
4393 return 0;
4394}
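
/*
 * Return values: -1 means se's vruntime is not behind curr's at all, so
 * there is no fairness case for preemption; 1 means se trails curr by
 * more than the wakeup granularity (scaled to se's weight) and should
 * preempt; 0 means the lag exists but is within the granularity.  With
 * the stock 1ms granularity, a nice-0 wakee therefore preempts only once
 * it is owed more than about 1ms of virtual runtime.
 */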
4395
4396static void set_last_buddy(struct sched_entity *se)
4397{
4398 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4399 return;
4400
4401 for_each_sched_entity(se)
4402 cfs_rq_of(se)->last = se;
4403}
4404
4405static void set_next_buddy(struct sched_entity *se)
4406{
4407 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4408 return;
4409
4410 for_each_sched_entity(se)
4411 cfs_rq_of(se)->next = se;
4412}
4413
4414static void set_skip_buddy(struct sched_entity *se)
4415{
4416 for_each_sched_entity(se)
4417 cfs_rq_of(se)->skip = se;
4418}
4419
4420
4421
4422
4423static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
4424{
4425 struct task_struct *curr = rq->curr;
4426 struct sched_entity *se = &curr->se, *pse = &p->se;
4427 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4428 int scale = cfs_rq->nr_running >= sched_nr_latency;
4429 int next_buddy_marked = 0;
4430
4431 if (unlikely(se == pse))
4432 return;
4433
4434
4435
4436
4437
4438
4439
4440 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
4441 return;
4442
4443 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
4444 set_next_buddy(pse);
4445 next_buddy_marked = 1;
4446 }
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458 if (test_tsk_need_resched(curr))
4459 return;
4460
4461
4462 if (unlikely(curr->policy == SCHED_IDLE) &&
4463 likely(p->policy != SCHED_IDLE))
4464 goto preempt;
4465
4466
4467
4468
4469
4470 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
4471 return;
4472
4473 find_matching_se(&se, &pse);
4474 update_curr(cfs_rq_of(se));
4475 BUG_ON(!pse);
4476 if (wakeup_preempt_entity(se, pse) == 1) {
4477
4478
4479
4480
4481 if (!next_buddy_marked)
4482 set_next_buddy(pse);
4483 goto preempt;
4484 }
4485
4486 return;
4487
4488preempt:
4489 resched_task(curr);
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499 if (unlikely(!se->on_rq || curr == rq->idle))
4500 return;
4501
4502 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
4503 set_last_buddy(se);
4504}
4505
4506static struct task_struct *pick_next_task_fair(struct rq *rq)
4507{
4508 struct task_struct *p;
4509 struct cfs_rq *cfs_rq = &rq->cfs;
4510 struct sched_entity *se;
4511
4512 if (!cfs_rq->nr_running)
4513 return NULL;
4514
4515 do {
4516 se = pick_next_entity(cfs_rq);
4517 set_next_entity(cfs_rq, se);
4518 cfs_rq = group_cfs_rq(se);
4519 } while (cfs_rq);
4520
4521 p = task_of(se);
4522 if (hrtick_enabled(rq))
4523 hrtick_start_fair(rq, p);
4524
4525 return p;
4526}
4527
4528
4529
4530
4531static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
4532{
4533 struct sched_entity *se = &prev->se;
4534 struct cfs_rq *cfs_rq;
4535
4536 for_each_sched_entity(se) {
4537 cfs_rq = cfs_rq_of(se);
4538 put_prev_entity(cfs_rq, se);
4539 }
4540}
4541
4542
4543
4544
4545
4546
4547static void yield_task_fair(struct rq *rq)
4548{
4549 struct task_struct *curr = rq->curr;
4550 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4551 struct sched_entity *se = &curr->se;
4552
4553
4554
4555
4556 if (unlikely(rq->nr_running == 1))
4557 return;
4558
4559 clear_buddies(cfs_rq, se);
4560
4561 if (curr->policy != SCHED_BATCH) {
4562 update_rq_clock(rq);
4563
4564
4565
4566 update_curr(cfs_rq);
4567
4568
4569
4570
4571
4572 rq->skip_clock_update = 1;
4573 }
4574
4575 set_skip_buddy(se);
4576}
4577
4578static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
4579{
4580 struct sched_entity *se = &p->se;
4581
4582
4583 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
4584 return false;
4585
4586
4587 set_next_buddy(se);
4588
4589 yield_task_fair(rq);
4590
4591 return true;
4592}
4593
4594#ifdef CONFIG_SMP
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
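
/*
 * What follows is the load-balancing machinery: per-domain statistics
 * (sg_lb_stats / sd_lb_stats), busiest-group and busiest-queue selection,
 * and task migration driven by the struct lb_env below.
 */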
4713static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4714
4715enum fbq_type { regular, remote, all };
4716
4717#define LBF_ALL_PINNED 0x01
4718#define LBF_NEED_BREAK 0x02
4719#define LBF_DST_PINNED 0x04
4720#define LBF_SOME_PINNED 0x08
4721
4722struct lb_env {
4723 struct sched_domain *sd;
4724
4725 struct rq *src_rq;
4726 int src_cpu;
4727
4728 int dst_cpu;
4729 struct rq *dst_rq;
4730
4731 struct cpumask *dst_grpmask;
4732 int new_dst_cpu;
4733 enum cpu_idle_type idle;
4734 long imbalance;
4735
4736 struct cpumask *cpus;
4737
4738 unsigned int flags;
4739
4740 unsigned int loop;
4741 unsigned int loop_break;
4742 unsigned int loop_max;
4743
4744 enum fbq_type fbq_type;
4745};
4746
4747
4748
4749
4750
4751static void move_task(struct task_struct *p, struct lb_env *env)
4752{
4753 deactivate_task(env->src_rq, p, 0);
4754 set_task_cpu(p, env->dst_cpu);
4755 activate_task(env->dst_rq, p, 0);
4756 check_preempt_curr(env->dst_rq, p, 0);
4757}
4758
4759
4760
4761
4762static int
4763task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
4764{
4765 s64 delta;
4766
4767 if (p->sched_class != &fair_sched_class)
4768 return 0;
4769
4770 if (unlikely(p->policy == SCHED_IDLE))
4771 return 0;
4772
4773
4774
4775
4776 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
4777 (&p->se == cfs_rq_of(&p->se)->next ||
4778 &p->se == cfs_rq_of(&p->se)->last))
4779 return 1;
4780
4781 if (sysctl_sched_migration_cost == -1)
4782 return 1;
4783 if (sysctl_sched_migration_cost == 0)
4784 return 0;
4785
4786 delta = now - p->se.exec_start;
4787
4788 return delta < (s64)sysctl_sched_migration_cost;
4789}
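
/*
 * A task counts as cache hot if it is a next/last buddy on its cfs_rq
 * (provided the balancing cpu already has runnable work), or if it last
 * started executing less than sysctl_sched_migration_cost ago; setting
 * that tunable to -1 makes every task hot, 0 makes none of them hot.
 */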
4790
4791#ifdef CONFIG_NUMA_BALANCING
4792
4793static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4794{
4795 int src_nid, dst_nid;
4796
4797 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4798 !(env->sd->flags & SD_NUMA)) {
4799 return false;
4800 }
4801
4802 src_nid = cpu_to_node(env->src_cpu);
4803 dst_nid = cpu_to_node(env->dst_cpu);
4804
4805 if (src_nid == dst_nid)
4806 return false;
4807
4808
4809 if (dst_nid == p->numa_preferred_nid)
4810 return true;
4811
4812
4813 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4814 group_weight(p, dst_nid) > group_weight(p, src_nid))
4815 return true;
4816
4817 return false;
4818}
4819
4820
4821static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4822{
4823 int src_nid, dst_nid;
4824
4825 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4826 return false;
4827
4828 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4829 return false;
4830
4831 src_nid = cpu_to_node(env->src_cpu);
4832 dst_nid = cpu_to_node(env->dst_cpu);
4833
4834 if (src_nid == dst_nid)
4835 return false;
4836
4837
4838 if (src_nid == p->numa_preferred_nid)
4839 return true;
4840
4841
4842 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4843 group_weight(p, dst_nid) < group_weight(p, src_nid))
4844 return true;
4845
4846 return false;
4847}
4848
4849#else
4850static inline bool migrate_improves_locality(struct task_struct *p,
4851 struct lb_env *env)
4852{
4853 return false;
4854}
4855
4856static inline bool migrate_degrades_locality(struct task_struct *p,
4857 struct lb_env *env)
4858{
4859 return false;
4860}
4861#endif
4862
4863
4864
4865
4866static
4867int can_migrate_task(struct task_struct *p, struct lb_env *env)
4868{
4869 int tsk_cache_hot = 0;
4870
4871
4872
4873
4874
4875
4876
4877 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4878 return 0;
4879
4880 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
4881 int cpu;
4882
4883 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
4884
4885 env->flags |= LBF_SOME_PINNED;
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4896 return 0;
4897
4898
4899 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4900 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4901 env->flags |= LBF_DST_PINNED;
4902 env->new_dst_cpu = cpu;
4903 break;
4904 }
4905 }
4906
4907 return 0;
4908 }
4909
	/* at least one task on this rq can run on dst_cpu */
4911 env->flags &= ~LBF_ALL_PINNED;
4912
4913 if (task_running(env->src_rq, p)) {
4914 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
4915 return 0;
4916 }
4917
4918
4919
4920
4921
4922
4923
4924 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4925 if (!tsk_cache_hot)
4926 tsk_cache_hot = migrate_degrades_locality(p, env);
4927
4928 if (migrate_improves_locality(p, env)) {
4929#ifdef CONFIG_SCHEDSTATS
4930 if (tsk_cache_hot) {
4931 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4932 schedstat_inc(p, se.statistics.nr_forced_migrations);
4933 }
4934#endif
4935 return 1;
4936 }
4937
4938 if (!tsk_cache_hot ||
4939 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4940
4941 if (tsk_cache_hot) {
4942 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4943 schedstat_inc(p, se.statistics.nr_forced_migrations);
4944 }
4945
4946 return 1;
4947 }
4948
4949 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
4950 return 0;
4951}
4952
4953
4954
4955
4956
4957
4958
4959
4960static int move_one_task(struct lb_env *env)
4961{
4962 struct task_struct *p, *n;
4963
4964 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
4965 if (!can_migrate_task(p, env))
4966 continue;
4967
4968 move_task(p, env);
4969
4970
4971
4972
4973
4974 schedstat_inc(env->sd, lb_gained[env->idle]);
4975 return 1;
4976 }
4977 return 0;
4978}
4979
4980static const unsigned int sched_nr_migrate_break = 32;
4981
4982
4983
4984
4985
4986
4987
4988
4989static int move_tasks(struct lb_env *env)
4990{
4991 struct list_head *tasks = &env->src_rq->cfs_tasks;
4992 struct task_struct *p;
4993 unsigned long load;
4994 int pulled = 0;
4995
4996 if (env->imbalance <= 0)
4997 return 0;
4998
4999 while (!list_empty(tasks)) {
5000 p = list_first_entry(tasks, struct task_struct, se.group_node);
5001
5002 env->loop++;
5003
5004 if (env->loop > env->loop_max)
5005 break;
5006
5007
5008 if (env->loop > env->loop_break) {
5009 env->loop_break += sched_nr_migrate_break;
5010 env->flags |= LBF_NEED_BREAK;
5011 break;
5012 }
5013
5014 if (!can_migrate_task(p, env))
5015 goto next;
5016
5017 load = task_h_load(p);
5018
5019 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5020 goto next;
5021
5022 if ((load / 2) > env->imbalance)
5023 goto next;
5024
5025 move_task(p, env);
5026 pulled++;
5027 env->imbalance -= load;
5028
5029#ifdef CONFIG_PREEMPT
5030
5031
5032
5033
5034
5035 if (env->idle == CPU_NEWLY_IDLE)
5036 break;
5037#endif
5038
5039
5040
5041
5042
5043 if (env->imbalance <= 0)
5044 break;
5045
5046 continue;
5047next:
5048 list_move_tail(&p->se.group_node, tasks);
5049 }
5050
5051
5052
5053
5054
5055
5056 schedstat_add(env->sd, lb_gained[env->idle], pulled);
5057
5058 return pulled;
5059}
5060
5061#ifdef CONFIG_FAIR_GROUP_SCHED
5062
5063
5064
5065static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
5066{
5067 struct sched_entity *se = tg->se[cpu];
5068 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
5069
5070
5071 if (throttled_hierarchy(cfs_rq))
5072 return;
5073
5074 update_cfs_rq_blocked_load(cfs_rq, 1);
5075
5076 if (se) {
5077 update_entity_load_avg(se, 1);
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
5088 list_del_leaf_cfs_rq(cfs_rq);
5089 } else {
5090 struct rq *rq = rq_of(cfs_rq);
5091 update_rq_runnable_avg(rq, rq->nr_running);
5092 }
5093}
5094
5095static void update_blocked_averages(int cpu)
5096{
5097 struct rq *rq = cpu_rq(cpu);
5098 struct cfs_rq *cfs_rq;
5099 unsigned long flags;
5100
5101 raw_spin_lock_irqsave(&rq->lock, flags);
5102 update_rq_clock(rq);
5103
5104
5105
5106
5107 for_each_leaf_cfs_rq(rq, cfs_rq) {
5108
5109
5110
5111
5112
5113 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
5114 }
5115
5116 raw_spin_unlock_irqrestore(&rq->lock, flags);
5117}
5118
5119
5120
5121
5122
5123
5124static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5125{
5126 struct rq *rq = rq_of(cfs_rq);
5127 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5128 unsigned long now = jiffies;
5129 unsigned long load;
5130
5131 if (cfs_rq->last_h_load_update == now)
5132 return;
5133
5134 cfs_rq->h_load_next = NULL;
5135 for_each_sched_entity(se) {
5136 cfs_rq = cfs_rq_of(se);
5137 cfs_rq->h_load_next = se;
5138 if (cfs_rq->last_h_load_update == now)
5139 break;
5140 }
5141
5142 if (!se) {
5143 cfs_rq->h_load = cfs_rq->runnable_load_avg;
5144 cfs_rq->last_h_load_update = now;
5145 }
5146
5147 while ((se = cfs_rq->h_load_next) != NULL) {
5148 load = cfs_rq->h_load;
5149 load = div64_ul(load * se->avg.load_avg_contrib,
5150 cfs_rq->runnable_load_avg + 1);
5151 cfs_rq = group_cfs_rq(se);
5152 cfs_rq->h_load = load;
5153 cfs_rq->last_h_load_update = now;
5154 }
5155}
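
/*
 * h_load is the share of the root runqueue's runnable load attributable
 * to this cfs_rq: walking top-down, each level scales the parent's h_load
 * by this entity's load contribution over the parent's runnable load,
 * with +1 guarding against a zero divisor.  One step of that walk, as a
 * build-excluded sketch with illustrative names:
 */
#if 0
static u64 h_load_step(u64 parent_h_load, u64 se_load_contrib,
		       u64 parent_runnable_load)
{
	/* same shape as the div64_ul() above, for one hierarchy level */
	return div64_u64(parent_h_load * se_load_contrib,
			 parent_runnable_load + 1);
}
#endif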
5156
5157static unsigned long task_h_load(struct task_struct *p)
5158{
5159 struct cfs_rq *cfs_rq = task_cfs_rq(p);
5160
5161 update_cfs_rq_h_load(cfs_rq);
5162 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
5163 cfs_rq->runnable_load_avg + 1);
5164}
5165#else
5166static inline void update_blocked_averages(int cpu)
5167{
5168}
5169
5170static unsigned long task_h_load(struct task_struct *p)
5171{
5172 return p->se.avg.load_avg_contrib;
5173}
5174#endif
5175
5176
5177
5178
5179
5180struct sg_lb_stats {
5181 unsigned long avg_load;
5182 unsigned long group_load;
5183 unsigned long sum_weighted_load;
5184 unsigned long load_per_task;
5185 unsigned long group_power;
5186 unsigned int sum_nr_running;
5187 unsigned int group_capacity;
5188 unsigned int idle_cpus;
5189 unsigned int group_weight;
5190 int group_imb;
5191 int group_has_capacity;
5192#ifdef CONFIG_NUMA_BALANCING
5193 unsigned int nr_numa_running;
5194 unsigned int nr_preferred_running;
5195#endif
5196};
5197
5198
5199
5200
5201
5202struct sd_lb_stats {
5203 struct sched_group *busiest;
5204 struct sched_group *local;
5205 unsigned long total_load;
5206 unsigned long total_pwr;
5207 unsigned long avg_load;
5208
5209 struct sg_lb_stats busiest_stat;
5210 struct sg_lb_stats local_stat;
5211};
5212
5213static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5214{
5215
5216
5217
5218
5219
5220
5221 *sds = (struct sd_lb_stats){
5222 .busiest = NULL,
5223 .local = NULL,
5224 .total_load = 0UL,
5225 .total_pwr = 0UL,
5226 .busiest_stat = {
5227 .avg_load = 0UL,
5228 },
5229 };
5230}
5231
5232
5233
5234
5235
5236
5237
5238
5239static inline int get_sd_load_idx(struct sched_domain *sd,
5240 enum cpu_idle_type idle)
5241{
5242 int load_idx;
5243
5244 switch (idle) {
5245 case CPU_NOT_IDLE:
5246 load_idx = sd->busy_idx;
5247 break;
5248
5249 case CPU_NEWLY_IDLE:
5250 load_idx = sd->newidle_idx;
5251 break;
5252 default:
5253 load_idx = sd->idle_idx;
5254 break;
5255 }
5256
5257 return load_idx;
5258}
5259
5260static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
5261{
5262 return SCHED_POWER_SCALE;
5263}
5264
5265unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
5266{
5267 return default_scale_freq_power(sd, cpu);
5268}
5269
5270static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5271{
5272 unsigned long weight = sd->span_weight;
5273 unsigned long smt_gain = sd->smt_gain;
5274
5275 smt_gain /= weight;
5276
5277 return smt_gain;
5278}
5279
5280unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
5281{
5282 return default_scale_smt_power(sd, cpu);
5283}
5284
5285static unsigned long scale_rt_power(int cpu)
5286{
5287 struct rq *rq = cpu_rq(cpu);
5288 u64 total, available, age_stamp, avg;
5289
5290
5291
5292
5293
5294 age_stamp = ACCESS_ONCE(rq->age_stamp);
5295 avg = ACCESS_ONCE(rq->rt_avg);
5296
5297 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
5298
5299 if (unlikely(total < avg)) {
5300
5301 available = 0;
5302 } else {
5303 available = total - avg;
5304 }
5305
5306 if (unlikely((s64)total < SCHED_POWER_SCALE))
5307 total = SCHED_POWER_SCALE;
5308
5309 total >>= SCHED_POWER_SHIFT;
5310
5311 return div_u64(available, total);
5312}
5313
5314static void update_cpu_power(struct sched_domain *sd, int cpu)
5315{
5316 unsigned long weight = sd->span_weight;
5317 unsigned long power = SCHED_POWER_SCALE;
5318 struct sched_group *sdg = sd->groups;
5319
5320 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
5321 if (sched_feat(ARCH_POWER))
5322 power *= arch_scale_smt_power(sd, cpu);
5323 else
5324 power *= default_scale_smt_power(sd, cpu);
5325
5326 power >>= SCHED_POWER_SHIFT;
5327 }
5328
5329 sdg->sgp->power_orig = power;
5330
5331 if (sched_feat(ARCH_POWER))
5332 power *= arch_scale_freq_power(sd, cpu);
5333 else
5334 power *= default_scale_freq_power(sd, cpu);
5335
5336 power >>= SCHED_POWER_SHIFT;
5337
5338 power *= scale_rt_power(cpu);
5339 power >>= SCHED_POWER_SHIFT;
5340
5341 if (!power)
5342 power = 1;
5343
5344 cpu_rq(cpu)->cpu_power = power;
5345 sdg->sgp->power = power;
5346}
5347
5348void update_group_power(struct sched_domain *sd, int cpu)
5349{
5350 struct sched_domain *child = sd->child;
5351 struct sched_group *group, *sdg = sd->groups;
5352 unsigned long power, power_orig;
5353 unsigned long interval;
5354
5355 interval = msecs_to_jiffies(sd->balance_interval);
5356 interval = clamp(interval, 1UL, max_load_balance_interval);
5357 sdg->sgp->next_update = jiffies + interval;
5358
5359 if (!child) {
5360 update_cpu_power(sd, cpu);
5361 return;
5362 }
5363
5364 power_orig = power = 0;
5365
5366 if (child->flags & SD_OVERLAP) {
5367
5368
5369
5370
5371
5372 for_each_cpu(cpu, sched_group_cpus(sdg)) {
5373 struct sched_group_power *sgp;
5374 struct rq *rq = cpu_rq(cpu);
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389 if (unlikely(!rq->sd)) {
5390 power_orig += power_of(cpu);
5391 power += power_of(cpu);
5392 continue;
5393 }
5394
5395 sgp = rq->sd->groups->sgp;
5396 power_orig += sgp->power_orig;
5397 power += sgp->power;
5398 }
5399 } else {
5400
5401
5402
5403
5404
5405 group = child->groups;
5406 do {
5407 power_orig += group->sgp->power_orig;
5408 power += group->sgp->power;
5409 group = group->next;
5410 } while (group != child->groups);
5411 }
5412
5413 sdg->sgp->power_orig = power_orig;
5414 sdg->sgp->power = power;
5415}
5416
5417
5418
5419
5420
5421
5422
5423
5424static inline int
5425fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5426{
5427
5428
5429
5430 if (!(sd->flags & SD_SHARE_CPUPOWER))
5431 return 0;
5432
5433
5434
5435
5436 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
5437 return 1;
5438
5439 return 0;
5440}
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471static inline int sg_imbalanced(struct sched_group *group)
5472{
5473 return group->sgp->imbalance;
5474}
5475
5476
5477
5478
5479
5480
5481
5482
5483static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
5484{
5485 unsigned int capacity, smt, cpus;
5486 unsigned int power, power_orig;
5487
5488 power = group->sgp->power;
5489 power_orig = group->sgp->power_orig;
5490 cpus = group->group_weight;
5491
5492
5493 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
5494 capacity = cpus / smt;
5495
5496 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5497 if (!capacity)
5498 capacity = fix_small_capacity(env->sd, group);
5499
5500 return capacity;
5501}
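
/*
 * Example (assuming the usual ~15% smt_gain): an SMT-2 sibling pair
 * reports a power_orig a bit above SCHED_POWER_SCALE, so
 * smt = DIV_ROUND_UP(2 * 1024, power_orig) evaluates to 2 and the pair's
 * capacity is 2 / 2 = 1 task; heavy real-time pressure that lowers
 * sgp->power can shrink the result further through the min() above.
 */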
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511static inline void update_sg_lb_stats(struct lb_env *env,
5512 struct sched_group *group, int load_idx,
5513 int local_group, struct sg_lb_stats *sgs)
5514{
5515 unsigned long nr_running;
5516 unsigned long load;
5517 int i;
5518
5519 memset(sgs, 0, sizeof(*sgs));
5520
5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5522 struct rq *rq = cpu_rq(i);
5523
5524 nr_running = rq->nr_running;
5525
5526
5527 if (local_group)
5528 load = target_load(i, load_idx);
5529 else
5530 load = source_load(i, load_idx);
5531
5532 sgs->group_load += load;
5533 sgs->sum_nr_running += nr_running;
5534#ifdef CONFIG_NUMA_BALANCING
5535 sgs->nr_numa_running += rq->nr_numa_running;
5536 sgs->nr_preferred_running += rq->nr_preferred_running;
5537#endif
5538 sgs->sum_weighted_load += weighted_cpuload(i);
5539 if (idle_cpu(i))
5540 sgs->idle_cpus++;
5541 }
5542
5543
5544 sgs->group_power = group->sgp->power;
5545 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
5546
5547 if (sgs->sum_nr_running)
5548 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5549
5550 sgs->group_weight = group->group_weight;
5551
5552 sgs->group_imb = sg_imbalanced(group);
5553 sgs->group_capacity = sg_capacity(env, group);
5554
5555 if (sgs->group_capacity > sgs->sum_nr_running)
5556 sgs->group_has_capacity = 1;
5557}
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572static bool update_sd_pick_busiest(struct lb_env *env,
5573 struct sd_lb_stats *sds,
5574 struct sched_group *sg,
5575 struct sg_lb_stats *sgs)
5576{
5577 if (sgs->avg_load <= sds->busiest_stat.avg_load)
5578 return false;
5579
5580 if (sgs->sum_nr_running > sgs->group_capacity)
5581 return true;
5582
5583 if (sgs->group_imb)
5584 return true;
5585
5586
5587
5588
5589
5590
5591 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
5592 env->dst_cpu < group_first_cpu(sg)) {
5593 if (!sds->busiest)
5594 return true;
5595
5596 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
5597 return true;
5598 }
5599
5600 return false;
5601}
5602
5603#ifdef CONFIG_NUMA_BALANCING
5604static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5605{
5606 if (sgs->sum_nr_running > sgs->nr_numa_running)
5607 return regular;
5608 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5609 return remote;
5610 return all;
5611}
5612
5613static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5614{
5615 if (rq->nr_running > rq->nr_numa_running)
5616 return regular;
5617 if (rq->nr_running > rq->nr_preferred_running)
5618 return remote;
5619 return all;
5620}
5621#else
5622static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5623{
5624 return all;
5625}
5626
5627static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5628{
5629 return regular;
5630}
5631#endif
5632
5633
5634
5635
5636
5637
5638static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
5639{
5640 struct sched_domain *child = env->sd->child;
5641 struct sched_group *sg = env->sd->groups;
5642 struct sg_lb_stats tmp_sgs;
5643 int load_idx, prefer_sibling = 0;
5644
5645 if (child && child->flags & SD_PREFER_SIBLING)
5646 prefer_sibling = 1;
5647
5648 load_idx = get_sd_load_idx(env->sd, env->idle);
5649
5650 do {
5651 struct sg_lb_stats *sgs = &tmp_sgs;
5652 int local_group;
5653
5654 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
5655 if (local_group) {
5656 sds->local = sg;
5657 sgs = &sds->local_stat;
5658
5659 if (env->idle != CPU_NEWLY_IDLE ||
5660 time_after_eq(jiffies, sg->sgp->next_update))
5661 update_group_power(env->sd, env->dst_cpu);
5662 }
5663
5664 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
5665
5666 if (local_group)
5667 goto next_group;
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
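		/*
		 * When the child domain has SD_PREFER_SIBLING set and the
		 * local group still has spare capacity, clamp a remote
		 * group's capacity to a single task so that its excess
		 * load is moved away toward the sibling domains.
		 */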
5679 if (prefer_sibling && sds->local &&
5680 sds->local_stat.group_has_capacity)
5681 sgs->group_capacity = min(sgs->group_capacity, 1U);
5682
5683 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
5684 sds->busiest = sg;
5685 sds->busiest_stat = *sgs;
5686 }
5687
5688next_group:
5689
5690 sds->total_load += sgs->group_load;
5691 sds->total_pwr += sgs->group_power;
5692
5693 sg = sg->next;
5694 } while (sg != env->sd->groups);
5695
5696 if (env->sd->flags & SD_NUMA)
5697 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
5698}
5699
5722
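/*
 * check_asym_packing - for SD_ASYM_PACKING domains, decide whether the
 * destination CPU should simply pull everything from the busiest group.
 *
 * If the domain packs work onto low-numbered CPUs and env->dst_cpu is
 * numbered below the busiest group's first CPU, set env->imbalance to
 * that group's whole load (rescaled from SCHED_POWER_SCALE units) and
 * return 1; otherwise return 0.
 */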
5723static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
5724{
5725 int busiest_cpu;
5726
5727 if (!(env->sd->flags & SD_ASYM_PACKING))
5728 return 0;
5729
5730 if (!sds->busiest)
5731 return 0;
5732
5733 busiest_cpu = group_first_cpu(sds->busiest);
5734 if (env->dst_cpu > busiest_cpu)
5735 return 0;
5736
5737 env->imbalance = DIV_ROUND_CLOSEST(
5738 sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
5739 SCHED_POWER_SCALE);
5740
5741 return 1;
5742}
5743
5744
5745
5746
5747
5748
5749
5750
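/*
 * fix_small_imbalance - used when the computed imbalance is too small to
 * justify moving a whole task.  Either bump env->imbalance straight up to
 * one busiest-group task's worth of load, or, in the marginal case,
 * estimate the aggregate CPU power utilisation before and after moving a
 * single task and only raise the imbalance when the move is a net win.
 */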
5751static inline
5752void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5753{
5754 unsigned long tmp, pwr_now = 0, pwr_move = 0;
5755 unsigned int imbn = 2;
5756 unsigned long scaled_busy_load_per_task;
5757 struct sg_lb_stats *local, *busiest;
5758
5759 local = &sds->local_stat;
5760 busiest = &sds->busiest_stat;
5761
5762 if (!local->sum_nr_running)
5763 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
5764 else if (busiest->load_per_task > local->load_per_task)
5765 imbn = 1;
5766
5767 scaled_busy_load_per_task =
5768 (busiest->load_per_task * SCHED_POWER_SCALE) /
5769 busiest->group_power;
5770
5771 if (busiest->avg_load + scaled_busy_load_per_task >=
5772 local->avg_load + (scaled_busy_load_per_task * imbn)) {
5773 env->imbalance = busiest->load_per_task;
5774 return;
5775 }
5776
5777
5778
5779
5780
5781
5782
5783 pwr_now += busiest->group_power *
5784 min(busiest->load_per_task, busiest->avg_load);
5785 pwr_now += local->group_power *
5786 min(local->load_per_task, local->avg_load);
5787 pwr_now /= SCHED_POWER_SCALE;
5788
5789
5790 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
5791 busiest->group_power;
5792 if (busiest->avg_load > tmp) {
5793 pwr_move += busiest->group_power *
5794 min(busiest->load_per_task,
5795 busiest->avg_load - tmp);
5796 }
5797
5798
5799 if (busiest->avg_load * busiest->group_power <
5800 busiest->load_per_task * SCHED_POWER_SCALE) {
5801 tmp = (busiest->avg_load * busiest->group_power) /
5802 local->group_power;
5803 } else {
5804 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
5805 local->group_power;
5806 }
5807 pwr_move += local->group_power *
5808 min(local->load_per_task, local->avg_load + tmp);
5809 pwr_move /= SCHED_POWER_SCALE;
5810
5811
5812 if (pwr_move > pwr_now)
5813 env->imbalance = busiest->load_per_task;
5814}
5815
5816
5817
5818
5819
5820
5821
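/*
 * calculate_imbalance - compute how much weighted load should be moved
 * from the busiest group to the local group and store it in
 * env->imbalance.  The amount is limited both by how far the busiest
 * group sits above the domain average and by how much headroom the local
 * group has below it; results smaller than one task are handed to
 * fix_small_imbalance().
 */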
5822static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5823{
5824 unsigned long max_pull, load_above_capacity = ~0UL;
5825 struct sg_lb_stats *local, *busiest;
5826
5827 local = &sds->local_stat;
5828 busiest = &sds->busiest_stat;
5829
5830 if (busiest->group_imb) {
5831
5832
5833
5834
5835 busiest->load_per_task =
5836 min(busiest->load_per_task, sds->avg_load);
5837 }
5838
5839
5840
5841
5842
5843
5844 if (busiest->avg_load <= sds->avg_load ||
5845 local->avg_load >= sds->avg_load) {
5846 env->imbalance = 0;
5847 return fix_small_imbalance(env, sds);
5848 }
5849
5850 if (!busiest->group_imb) {
5851
5852
5853
5854
5855
5856 load_above_capacity =
5857 (busiest->sum_nr_running - busiest->group_capacity);
5858
5859 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
5860 load_above_capacity /= busiest->group_power;
5861 }
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
5872
5873
5874 env->imbalance = min(
5875 max_pull * busiest->group_power,
5876 (sds->avg_load - local->avg_load) * local->group_power
5877 ) / SCHED_POWER_SCALE;
5878
5879
5880
5881
5882
5883
5884
5885 if (env->imbalance < busiest->load_per_task)
5886 return fix_small_imbalance(env, sds);
5887}
5888
5889
5907
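/*
 * find_busiest_group - if an imbalance exists within env->sd, return the
 * busiest group and set env->imbalance to the load that should be moved
 * toward env->dst_cpu; return NULL (with env->imbalance cleared) when the
 * domain is considered balanced.
 */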
5908static struct sched_group *find_busiest_group(struct lb_env *env)
5909{
5910 struct sg_lb_stats *local, *busiest;
5911 struct sd_lb_stats sds;
5912
5913 init_sd_lb_stats(&sds);
5914
5915
5916
5917
5918
5919 update_sd_lb_stats(env, &sds);
5920 local = &sds.local_stat;
5921 busiest = &sds.busiest_stat;
5922
5923 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
5924 check_asym_packing(env, &sds))
5925 return sds.busiest;
5926
5927
5928 if (!sds.busiest || busiest->sum_nr_running == 0)
5929 goto out_balanced;
5930
5931 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
5932
5933
5934
5935
5936
5937
5938 if (busiest->group_imb)
5939 goto force_balance;
5940
5941
5942 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
5943 !busiest->group_has_capacity)
5944 goto force_balance;
5945
5946
5947
5948
5949
5950 if (local->avg_load >= busiest->avg_load)
5951 goto out_balanced;
5952
5953
5954
5955
5956
5957 if (local->avg_load >= sds.avg_load)
5958 goto out_balanced;
5959
5960 if (env->idle == CPU_IDLE) {
5961
5962
5963
5964
5965
5966
5967 if ((local->idle_cpus < busiest->idle_cpus) &&
5968 busiest->sum_nr_running <= busiest->group_weight)
5969 goto out_balanced;
5970 } else {
5971
5972
5973
5974
5975 if (100 * busiest->avg_load <=
5976 env->sd->imbalance_pct * local->avg_load)
5977 goto out_balanced;
5978 }
5979
5980force_balance:
5981
5982 calculate_imbalance(env, &sds);
5983 return sds.busiest;
5984
5985out_balanced:
5986 env->imbalance = 0;
5987 return NULL;
5988}
5989
5990
5991
5992
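/*
 * find_busiest_queue - find the busiest runqueue among the CPUs of the
 * chosen group, comparing weighted load scaled by each CPU's power.
 */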
5993static struct rq *find_busiest_queue(struct lb_env *env,
5994 struct sched_group *group)
5995{
5996 struct rq *busiest = NULL, *rq;
5997 unsigned long busiest_load = 0, busiest_power = 1;
5998 int i;
5999
6000 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6001 unsigned long power, capacity, wl;
6002 enum fbq_type rt;
6003
6004 rq = cpu_rq(i);
6005 rt = fbq_classify_rq(rq);
6006
6025
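		/*
		 * Honour the NUMA classification of this balance pass:
		 * env->fbq_type says how "well placed" the busiest
		 * group's tasks are, and runqueues classified stricter
		 * than that (rt > env->fbq_type) hold only tasks we do
		 * not want to pull here, so skip them.
		 */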
6026 if (rt > env->fbq_type)
6027 continue;
6028
6029 power = power_of(i);
6030 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
6031 if (!capacity)
6032 capacity = fix_small_capacity(env->sd, group);
6033
6034 wl = weighted_cpuload(i);
6035
6036
6037
6038
6039
6040 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
6041 continue;
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054 if (wl * busiest_power > busiest_load * power) {
6055 busiest_load = wl;
6056 busiest_power = power;
6057 busiest = rq;
6058 }
6059 }
6060
6061 return busiest;
6062}
6063
6064
6065
6066
6067
6068#define MAX_PINNED_INTERVAL 512
6069
6070
6071DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6072
6073static int need_active_balance(struct lb_env *env)
6074{
6075 struct sched_domain *sd = env->sd;
6076
6077 if (env->idle == CPU_NEWLY_IDLE) {
6078
6079
6080
6081
6082
6083
6084 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6085 return 1;
6086 }
6087
6088 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6089}
6090
6091static int active_load_balance_cpu_stop(void *data);
6092
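/*
 * should_we_balance - only one CPU in a sched_group gets to run the
 * regular balance pass for that group: the first idle CPU in the group's
 * balance mask if there is one, otherwise the group's designated balance
 * CPU.  Newly-idle balancing is always allowed.  Returns 1 when
 * env->dst_cpu is the CPU that should do the work.
 */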
6093static int should_we_balance(struct lb_env *env)
6094{
6095 struct sched_group *sg = env->sd->groups;
6096 struct cpumask *sg_cpus, *sg_mask;
6097 int cpu, balance_cpu = -1;
6098
6099
6100
6101
6102
6103 if (env->idle == CPU_NEWLY_IDLE)
6104 return 1;
6105
6106 sg_cpus = sched_group_cpus(sg);
6107 sg_mask = sched_group_mask(sg);
6108
6109 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6110 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6111 continue;
6112
6113 balance_cpu = cpu;
6114 break;
6115 }
6116
6117 if (balance_cpu == -1)
6118 balance_cpu = group_balance_cpu(sg);
6119
6120
6121
6122
6123
6124 return balance_cpu == env->dst_cpu;
6125}
6126
6127
6128
6129
6130
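/*
 * load_balance - check this_cpu's domain for an imbalance and, if one is
 * found, try to move tasks from the busiest runqueue of the busiest group
 * to this_rq, falling back to an active (push) balance when repeated
 * attempts fail.  Returns the number of tasks moved.
 */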
6131static int load_balance(int this_cpu, struct rq *this_rq,
6132 struct sched_domain *sd, enum cpu_idle_type idle,
6133 int *continue_balancing)
6134{
6135 int ld_moved, cur_ld_moved, active_balance = 0;
6136 struct sched_domain *sd_parent = sd->parent;
6137 struct sched_group *group;
6138 struct rq *busiest;
6139 unsigned long flags;
6140 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
6141
6142 struct lb_env env = {
6143 .sd = sd,
6144 .dst_cpu = this_cpu,
6145 .dst_rq = this_rq,
6146 .dst_grpmask = sched_group_cpus(sd->groups),
6147 .idle = idle,
6148 .loop_break = sched_nr_migrate_break,
6149 .cpus = cpus,
6150 .fbq_type = all,
6151 };
6152
6153
6154
6155
6156
6157 if (idle == CPU_NEWLY_IDLE)
6158 env.dst_grpmask = NULL;
6159
6160 cpumask_copy(cpus, cpu_active_mask);
6161
6162 schedstat_inc(sd, lb_count[idle]);
6163
6164redo:
6165 if (!should_we_balance(&env)) {
6166 *continue_balancing = 0;
6167 goto out_balanced;
6168 }
6169
6170 group = find_busiest_group(&env);
6171 if (!group) {
6172 schedstat_inc(sd, lb_nobusyg[idle]);
6173 goto out_balanced;
6174 }
6175
6176 busiest = find_busiest_queue(&env, group);
6177 if (!busiest) {
6178 schedstat_inc(sd, lb_nobusyq[idle]);
6179 goto out_balanced;
6180 }
6181
6182 BUG_ON(busiest == env.dst_rq);
6183
6184 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
6185
6186 ld_moved = 0;
6187 if (busiest->nr_running > 1) {
6188
6189
6190
6191
6192
6193
6194 env.flags |= LBF_ALL_PINNED;
6195 env.src_cpu = busiest->cpu;
6196 env.src_rq = busiest;
6197 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6198
6199more_balance:
6200 local_irq_save(flags);
6201 double_rq_lock(env.dst_rq, busiest);
6202
6203
6204
6205
6206
6207 cur_ld_moved = move_tasks(&env);
6208 ld_moved += cur_ld_moved;
6209 double_rq_unlock(env.dst_rq, busiest);
6210 local_irq_restore(flags);
6211
6212
6213
6214
6215 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
6216 resched_cpu(env.dst_cpu);
6217
6218 if (env.flags & LBF_NEED_BREAK) {
6219 env.flags &= ~LBF_NEED_BREAK;
6220 goto more_balance;
6221 }
6222
6241
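		/*
		 * Some tasks could not run on env.dst_cpu but may run on
		 * env.new_dst_cpu, another CPU of the local group.  If an
		 * imbalance remains, retry the pass with that CPU as the
		 * new destination.
		 */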
6242 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6243
6244
6245 cpumask_clear_cpu(env.dst_cpu, env.cpus);
6246
6247 env.dst_rq = cpu_rq(env.new_dst_cpu);
6248 env.dst_cpu = env.new_dst_cpu;
6249 env.flags &= ~LBF_DST_PINNED;
6250 env.loop = 0;
6251 env.loop_break = sched_nr_migrate_break;
6252
6253
6254
6255
6256
6257 goto more_balance;
6258 }
6259
6260
6261
6262
6263 if (sd_parent) {
6264 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6265
6266 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6267 *group_imbalance = 1;
6268 } else if (*group_imbalance)
6269 *group_imbalance = 0;
6270 }
6271
6272
6273 if (unlikely(env.flags & LBF_ALL_PINNED)) {
6274 cpumask_clear_cpu(cpu_of(busiest), cpus);
6275 if (!cpumask_empty(cpus)) {
6276 env.loop = 0;
6277 env.loop_break = sched_nr_migrate_break;
6278 goto redo;
6279 }
6280 goto out_balanced;
6281 }
6282 }
6283
6284 if (!ld_moved) {
6285 schedstat_inc(sd, lb_failed[idle]);
6286
6287
6288
6289
6290
6291
6292 if (idle != CPU_NEWLY_IDLE)
6293 sd->nr_balance_failed++;
6294
6295 if (need_active_balance(&env)) {
6296 raw_spin_lock_irqsave(&busiest->lock, flags);
6297
6298
6299
6300
6301
6302 if (!cpumask_test_cpu(this_cpu,
6303 tsk_cpus_allowed(busiest->curr))) {
6304 raw_spin_unlock_irqrestore(&busiest->lock,
6305 flags);
6306 env.flags |= LBF_ALL_PINNED;
6307 goto out_one_pinned;
6308 }
6309
6310
6311
6312
6313
6314
6315 if (!busiest->active_balance) {
6316 busiest->active_balance = 1;
6317 busiest->push_cpu = this_cpu;
6318 active_balance = 1;
6319 }
6320 raw_spin_unlock_irqrestore(&busiest->lock, flags);
6321
6322 if (active_balance) {
6323 stop_one_cpu_nowait(cpu_of(busiest),
6324 active_load_balance_cpu_stop, busiest,
6325 &busiest->active_balance_work);
6326 }
6327
6328
6329
6330
6331
6332 sd->nr_balance_failed = sd->cache_nice_tries+1;
6333 }
6334 } else
6335 sd->nr_balance_failed = 0;
6336
6337 if (likely(!active_balance)) {
6338
6339 sd->balance_interval = sd->min_interval;
6340 } else {
6341
6342
6343
6344
6345
6346
6347 if (sd->balance_interval < sd->max_interval)
6348 sd->balance_interval *= 2;
6349 }
6350
6351 goto out;
6352
6353out_balanced:
6354 schedstat_inc(sd, lb_balanced[idle]);
6355
6356 sd->nr_balance_failed = 0;
6357
6358out_one_pinned:
6359
6360 if (((env.flags & LBF_ALL_PINNED) &&
6361 sd->balance_interval < MAX_PINNED_INTERVAL) ||
6362 (sd->balance_interval < sd->max_interval))
6363 sd->balance_interval *= 2;
6364
6365 ld_moved = 0;
6366out:
6367 return ld_moved;
6368}
6369
6370
6371
6372
6373
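/*
 * idle_balance - called when this_rq is about to become idle; try to pull
 * tasks from other CPUs, walking the sched_domains bottom-up and giving
 * up once the expected balancing cost exceeds the runqueue's measured
 * average idle time.
 */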
6374void idle_balance(int this_cpu, struct rq *this_rq)
6375{
6376 struct sched_domain *sd;
6377 int pulled_task = 0;
6378 unsigned long next_balance = jiffies + HZ;
6379 u64 curr_cost = 0;
6380
6381 this_rq->idle_stamp = rq_clock(this_rq);
6382
6383 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6384 return;
6385
6386
6387
6388
6389 raw_spin_unlock(&this_rq->lock);
6390
6391 update_blocked_averages(this_cpu);
6392 rcu_read_lock();
6393 for_each_domain(this_cpu, sd) {
6394 unsigned long interval;
6395 int continue_balancing = 1;
6396 u64 t0, domain_cost;
6397
6398 if (!(sd->flags & SD_LOAD_BALANCE))
6399 continue;
6400
6401 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6402 break;
6403
6404 if (sd->flags & SD_BALANCE_NEWIDLE) {
6405 t0 = sched_clock_cpu(this_cpu);
6406
6407
6408 pulled_task = load_balance(this_cpu, this_rq,
6409 sd, CPU_NEWLY_IDLE,
6410 &continue_balancing);
6411
6412 domain_cost = sched_clock_cpu(this_cpu) - t0;
6413 if (domain_cost > sd->max_newidle_lb_cost)
6414 sd->max_newidle_lb_cost = domain_cost;
6415
6416 curr_cost += domain_cost;
6417 }
6418
6419 interval = msecs_to_jiffies(sd->balance_interval);
6420 if (time_after(next_balance, sd->last_balance + interval))
6421 next_balance = sd->last_balance + interval;
6422 if (pulled_task) {
6423 this_rq->idle_stamp = 0;
6424 break;
6425 }
6426 }
6427 rcu_read_unlock();
6428
6429 raw_spin_lock(&this_rq->lock);
6430
6431 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6432
6433
6434
6435
6436 this_rq->next_balance = next_balance;
6437 }
6438
6439 if (curr_cost > this_rq->max_idle_balance_cost)
6440 this_rq->max_idle_balance_cost = curr_cost;
6441}
6442
6443
6444
6445
6446
6447
6448
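/*
 * active_load_balance_cpu_stop - run by the CPU stopper on the busiest
 * CPU; pushes one running task from the busiest runqueue over to the
 * target CPU (busiest_rq->push_cpu), which is expected to be idle.
 */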
6449static int active_load_balance_cpu_stop(void *data)
6450{
6451 struct rq *busiest_rq = data;
6452 int busiest_cpu = cpu_of(busiest_rq);
6453 int target_cpu = busiest_rq->push_cpu;
6454 struct rq *target_rq = cpu_rq(target_cpu);
6455 struct sched_domain *sd;
6456
6457 raw_spin_lock_irq(&busiest_rq->lock);
6458
6459
6460 if (unlikely(busiest_cpu != smp_processor_id() ||
6461 !busiest_rq->active_balance))
6462 goto out_unlock;
6463
6464
6465 if (busiest_rq->nr_running <= 1)
6466 goto out_unlock;
6467
6468
6469
6470
6471
6472
6473 BUG_ON(busiest_rq == target_rq);
6474
6475
6476 double_lock_balance(busiest_rq, target_rq);
6477
6478
6479 rcu_read_lock();
6480 for_each_domain(target_cpu, sd) {
6481 if ((sd->flags & SD_LOAD_BALANCE) &&
6482 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6483 break;
6484 }
6485
6486 if (likely(sd)) {
6487 struct lb_env env = {
6488 .sd = sd,
6489 .dst_cpu = target_cpu,
6490 .dst_rq = target_rq,
6491 .src_cpu = busiest_rq->cpu,
6492 .src_rq = busiest_rq,
6493 .idle = CPU_IDLE,
6494 };
6495
6496 schedstat_inc(sd, alb_count);
6497
6498 if (move_one_task(&env))
6499 schedstat_inc(sd, alb_pushed);
6500 else
6501 schedstat_inc(sd, alb_failed);
6502 }
6503 rcu_read_unlock();
6504 double_unlock_balance(busiest_rq, target_rq);
6505out_unlock:
6506 busiest_rq->active_balance = 0;
6507 raw_spin_unlock_irq(&busiest_rq->lock);
6508 return 0;
6509}
6510
6511#ifdef CONFIG_NO_HZ_COMMON
6512
6513
6514
6515
6516
6517
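/*
 * NOHZ idle load balancing: while CPUs are tickless-idle, one of them is
 * nominated as the idle load balancer (ilb) and kicked to run the
 * periodic balance on behalf of all of them.  The state below tracks the
 * set of tickless-idle CPUs and when the next nohz balance is due.
 */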
6518static struct {
6519 cpumask_var_t idle_cpus_mask;
6520 atomic_t nr_cpus;
6521 unsigned long next_balance;
6522} nohz ____cacheline_aligned;
6523
6524static inline int find_new_ilb(int call_cpu)
6525{
6526 int ilb = cpumask_first(nohz.idle_cpus_mask);
6527
6528 if (ilb < nr_cpu_ids && idle_cpu(ilb))
6529 return ilb;
6530
6531 return nr_cpu_ids;
6532}
6533
6534
6535
6536
6537
6538
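/*
 * nohz_balancer_kick - pick an idle load balancer CPU, mark it with
 * NOHZ_BALANCE_KICK and send it a reschedule IPI so that it performs
 * nohz idle balancing on behalf of all tickless-idle CPUs.
 */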
6539static void nohz_balancer_kick(int cpu)
6540{
6541 int ilb_cpu;
6542
6543 nohz.next_balance++;
6544
6545 ilb_cpu = find_new_ilb(cpu);
6546
6547 if (ilb_cpu >= nr_cpu_ids)
6548 return;
6549
6550 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
6551 return;
6552
6553
6554
6555
6556
6557
6558 smp_send_reschedule(ilb_cpu);
6559 return;
6560}
6561
6562static inline void nohz_balance_exit_idle(int cpu)
6563{
6564 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6565 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6566 atomic_dec(&nohz.nr_cpus);
6567 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6568 }
6569}
6570
6571static inline void set_cpu_sd_state_busy(void)
6572{
6573 struct sched_domain *sd;
6574 int cpu = smp_processor_id();
6575
6576 rcu_read_lock();
6577 sd = rcu_dereference(per_cpu(sd_busy, cpu));
6578
6579 if (!sd || !sd->nohz_idle)
6580 goto unlock;
6581 sd->nohz_idle = 0;
6582
6583 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
6584unlock:
6585 rcu_read_unlock();
6586}
6587
6588void set_cpu_sd_state_idle(void)
6589{
6590 struct sched_domain *sd;
6591 int cpu = smp_processor_id();
6592
6593 rcu_read_lock();
6594 sd = rcu_dereference(per_cpu(sd_busy, cpu));
6595
6596 if (!sd || sd->nohz_idle)
6597 goto unlock;
6598 sd->nohz_idle = 1;
6599
6600 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
6601unlock:
6602 rcu_read_unlock();
6603}
6604
6605
6606
6607
6608
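/*
 * nohz_balance_enter_idle - record that this CPU is entering tickless
 * idle, making it both a candidate idle load balancer and a client of
 * nohz idle balancing.
 */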
6609void nohz_balance_enter_idle(int cpu)
6610{
6611
6612
6613
6614 if (!cpu_active(cpu))
6615 return;
6616
6617 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6618 return;
6619
6620 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6621 atomic_inc(&nohz.nr_cpus);
6622 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6623}
6624
6625static int sched_ilb_notifier(struct notifier_block *nfb,
6626 unsigned long action, void *hcpu)
6627{
6628 switch (action & ~CPU_TASKS_FROZEN) {
6629 case CPU_DYING:
6630 nohz_balance_exit_idle(smp_processor_id());
6631 return NOTIFY_OK;
6632 default:
6633 return NOTIFY_DONE;
6634 }
6635}
6636#endif
6637
6638static DEFINE_SPINLOCK(balancing);
6639
6640
6641
6642
6643
6644void update_max_interval(void)
6645{
6646 max_load_balance_interval = HZ*num_online_cpus()/10;
6647}
6648
6649
6650
6651
6652
6653
6654
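/*
 * rebalance_domains - periodic (SCHED_SOFTIRQ driven) rebalance for @cpu:
 * walk its sched_domains bottom-up, balancing each one whose interval has
 * expired, serializing SD_SERIALIZE domains and decaying the cached
 * max_newidle_lb_cost over time.
 */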
6655static void rebalance_domains(int cpu, enum cpu_idle_type idle)
6656{
6657 int continue_balancing = 1;
6658 struct rq *rq = cpu_rq(cpu);
6659 unsigned long interval;
6660 struct sched_domain *sd;
6661
6662 unsigned long next_balance = jiffies + 60*HZ;
6663 int update_next_balance = 0;
6664 int need_serialize, need_decay = 0;
6665 u64 max_cost = 0;
6666
6667 update_blocked_averages(cpu);
6668
6669 rcu_read_lock();
6670 for_each_domain(cpu, sd) {
6671
6672
6673
6674
6675 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6676 sd->max_newidle_lb_cost =
6677 (sd->max_newidle_lb_cost * 253) / 256;
6678 sd->next_decay_max_lb_cost = jiffies + HZ;
6679 need_decay = 1;
6680 }
6681 max_cost += sd->max_newidle_lb_cost;
6682
6683 if (!(sd->flags & SD_LOAD_BALANCE))
6684 continue;
6685
6686
6687
6688
6689
6690
6691 if (!continue_balancing) {
6692 if (need_decay)
6693 continue;
6694 break;
6695 }
6696
6697 interval = sd->balance_interval;
6698 if (idle != CPU_IDLE)
6699 interval *= sd->busy_factor;
6700
6701
6702 interval = msecs_to_jiffies(interval);
6703 interval = clamp(interval, 1UL, max_load_balance_interval);
6704
6705 need_serialize = sd->flags & SD_SERIALIZE;
6706
6707 if (need_serialize) {
6708 if (!spin_trylock(&balancing))
6709 goto out;
6710 }
6711
6712 if (time_after_eq(jiffies, sd->last_balance + interval)) {
6713 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
6714
6715
6716
6717
6718
6719 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
6720 }
6721 sd->last_balance = jiffies;
6722 }
6723 if (need_serialize)
6724 spin_unlock(&balancing);
6725out:
6726 if (time_after(next_balance, sd->last_balance + interval)) {
6727 next_balance = sd->last_balance + interval;
6728 update_next_balance = 1;
6729 }
6730 }
6731 if (need_decay) {
6732
6733
6734
6735
6736 rq->max_idle_balance_cost =
6737 max((u64)sysctl_sched_migration_cost, max_cost);
6738 }
6739 rcu_read_unlock();
6740
6741
6742
6743
6744
6745
6746 if (likely(update_next_balance))
6747 rq->next_balance = next_balance;
6748}
6749
6750#ifdef CONFIG_NO_HZ_COMMON
6751
6752
6753
6754
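/*
 * nohz_idle_balance - as the kicked idle load balancer, rebalance the
 * domains of every tickless-idle CPU on their behalf, stopping early if
 * this CPU gets work of its own (need_resched()).
 */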
6755static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6756{
6757 struct rq *this_rq = cpu_rq(this_cpu);
6758 struct rq *rq;
6759 int balance_cpu;
6760
6761 if (idle != CPU_IDLE ||
6762 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
6763 goto end;
6764
6765 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
6766 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
6767 continue;
6768
6769
6770
6771
6772
6773
6774 if (need_resched())
6775 break;
6776
6777 rq = cpu_rq(balance_cpu);
6778
6779 raw_spin_lock_irq(&rq->lock);
6780 update_rq_clock(rq);
6781 update_idle_cpu_load(rq);
6782 raw_spin_unlock_irq(&rq->lock);
6783
6784 rebalance_domains(balance_cpu, CPU_IDLE);
6785
6786 if (time_after(this_rq->next_balance, rq->next_balance))
6787 this_rq->next_balance = rq->next_balance;
6788 }
6789 nohz.next_balance = this_rq->next_balance;
6790end:
6791 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
6792}
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
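/*
 * nohz_kick_needed - from the scheduler tick on a busy CPU, decide
 * whether the idle load balancer should be kicked: some CPUs must be
 * tickless-idle and nohz.next_balance due, and in addition this rq must
 * run more than one task, or more than one CPU in this CPU's sd_busy
 * group must be busy, or an SD_ASYM_PACKING domain must have a
 * tickless-idle CPU numbered below this one.
 */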
6803static inline int nohz_kick_needed(struct rq *rq, int cpu)
6804{
6805 unsigned long now = jiffies;
6806 struct sched_domain *sd;
6807 struct sched_group_power *sgp;
6808 int nr_busy;
6809
6810 if (unlikely(idle_cpu(cpu)))
6811 return 0;
6812
6813
6814
6815
6816
6817 set_cpu_sd_state_busy();
6818 nohz_balance_exit_idle(cpu);
6819
6820
6821
6822
6823
6824 if (likely(!atomic_read(&nohz.nr_cpus)))
6825 return 0;
6826
6827 if (time_before(now, nohz.next_balance))
6828 return 0;
6829
6830 if (rq->nr_running >= 2)
6831 goto need_kick;
6832
6833 rcu_read_lock();
6834 sd = rcu_dereference(per_cpu(sd_busy, cpu));
6835
6836 if (sd) {
6837 sgp = sd->groups->sgp;
6838 nr_busy = atomic_read(&sgp->nr_busy_cpus);
6839
6840 if (nr_busy > 1)
6841 goto need_kick_unlock;
6842 }
6843
6844 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6845
6846 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6847 sched_domain_span(sd)) < cpu))
6848 goto need_kick_unlock;
6849
6850 rcu_read_unlock();
6851 return 0;
6852
6853need_kick_unlock:
6854 rcu_read_unlock();
6855need_kick:
6856 return 1;
6857}
6858#else
6859static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6860#endif
6861
6862
6863
6864
6865
6866static void run_rebalance_domains(struct softirq_action *h)
6867{
6868 int this_cpu = smp_processor_id();
6869 struct rq *this_rq = cpu_rq(this_cpu);
6870 enum cpu_idle_type idle = this_rq->idle_balance ?
6871 CPU_IDLE : CPU_NOT_IDLE;
6872
6873 rebalance_domains(this_cpu, idle);
6874
6875
6876
6877
6878
6879
6880 nohz_idle_balance(this_cpu, idle);
6881}
6882
6883static inline int on_null_domain(int cpu)
6884{
6885 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
6886}
6887
6888
6889
6890
6891void trigger_load_balance(struct rq *rq, int cpu)
6892{
6893
6894 if (time_after_eq(jiffies, rq->next_balance) &&
6895 likely(!on_null_domain(cpu)))
6896 raise_softirq(SCHED_SOFTIRQ);
6897#ifdef CONFIG_NO_HZ_COMMON
6898 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
6899 nohz_balancer_kick(cpu);
6900#endif
6901}
6902
6903static void rq_online_fair(struct rq *rq)
6904{
6905 update_sysctl();
6906}
6907
6908static void rq_offline_fair(struct rq *rq)
6909{
6910 update_sysctl();
6911
6912
6913 unthrottle_offline_cfs_rqs(rq);
6914}
6915
6916#endif
6917
6918
6919
6920
6921static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
6922{
6923 struct cfs_rq *cfs_rq;
6924 struct sched_entity *se = &curr->se;
6925
6926 for_each_sched_entity(se) {
6927 cfs_rq = cfs_rq_of(se);
6928 entity_tick(cfs_rq, se, queued);
6929 }
6930
6931 if (numabalancing_enabled)
6932 task_tick_numa(rq, curr);
6933
6934 update_rq_runnable_avg(rq, 1);
6935}
6936
6937
6938
6939
6940
6941
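/*
 * task_fork_fair - scheduler-class fork callback: place the new child's
 * sched_entity relative to its parent's vruntime on the current CPU and,
 * if sysctl_sched_child_runs_first is set and the parent would otherwise
 * run first, swap their vruntimes so the child goes first.  The child's
 * vruntime is left relative to min_vruntime until it is enqueued.
 */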
6942static void task_fork_fair(struct task_struct *p)
6943{
6944 struct cfs_rq *cfs_rq;
6945 struct sched_entity *se = &p->se, *curr;
6946 int this_cpu = smp_processor_id();
6947 struct rq *rq = this_rq();
6948 unsigned long flags;
6949
6950 raw_spin_lock_irqsave(&rq->lock, flags);
6951
6952 update_rq_clock(rq);
6953
6954 cfs_rq = task_cfs_rq(current);
6955 curr = cfs_rq->curr;
6956
6957
6958
6959
6960
6961
6962
6963 rcu_read_lock();
6964 __set_task_cpu(p, this_cpu);
6965 rcu_read_unlock();
6966
6967 update_curr(cfs_rq);
6968
6969 if (curr)
6970 se->vruntime = curr->vruntime;
6971 place_entity(cfs_rq, se, 1);
6972
6973 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
6974
6975
6976
6977
6978 swap(curr->vruntime, se->vruntime);
6979 resched_task(rq->curr);
6980 }
6981
6982 se->vruntime -= cfs_rq->min_vruntime;
6983
6984 raw_spin_unlock_irqrestore(&rq->lock, flags);
6985}
6986
6987
6988
6989
6990
6991static void
6992prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
6993{
6994 if (!p->se.on_rq)
6995 return;
6996
6997
6998
6999
7000
7001
7002 if (rq->curr == p) {
7003 if (p->prio > oldprio)
7004 resched_task(rq->curr);
7005 } else
7006 check_preempt_curr(rq, p, 0);
7007}
7008
7009static void switched_from_fair(struct rq *rq, struct task_struct *p)
7010{
7011 struct sched_entity *se = &p->se;
7012 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023 if (!se->on_rq && p->state != TASK_RUNNING) {
7024
7025
7026
7027
7028 place_entity(cfs_rq, se, 0);
7029 se->vruntime -= cfs_rq->min_vruntime;
7030 }
7031
7032#ifdef CONFIG_SMP
7033
7034
7035
7036
7037
7038 if (se->avg.decay_count) {
7039 __synchronize_entity_decay(se);
7040 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
7041 }
7042#endif
7043}
7044
7045
7046
7047
7048static void switched_to_fair(struct rq *rq, struct task_struct *p)
7049{
7050 if (!p->se.on_rq)
7051 return;
7052
7053
7054
7055
7056
7057
7058 if (rq->curr == p)
7059 resched_task(rq->curr);
7060 else
7061 check_preempt_curr(rq, p, 0);
7062}
7063
7064
7065
7066
7067
7068
7069static void set_curr_task_fair(struct rq *rq)
7070{
7071 struct sched_entity *se = &rq->curr->se;
7072
7073 for_each_sched_entity(se) {
7074 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7075
7076 set_next_entity(cfs_rq, se);
7077
7078 account_cfs_rq_runtime(cfs_rq, 0);
7079 }
7080}
7081
7082void init_cfs_rq(struct cfs_rq *cfs_rq)
7083{
7084 cfs_rq->tasks_timeline = RB_ROOT;
7085 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7086#ifndef CONFIG_64BIT
7087 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7088#endif
7089#ifdef CONFIG_SMP
7090 atomic64_set(&cfs_rq->decay_counter, 1);
7091 atomic_long_set(&cfs_rq->removed_load, 0);
7092#endif
7093}
7094
7095#ifdef CONFIG_FAIR_GROUP_SCHED
7096static void task_move_group_fair(struct task_struct *p, int on_rq)
7097{
7098 struct cfs_rq *cfs_rq;
7099
7100
7123
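	/*
	 * A task that is not on a runqueue normally still carries an
	 * absolute vruntime from its old cfs_rq, so it must be
	 * re-normalized: subtract the old queue's min_vruntime before the
	 * group switch and add the new queue's afterwards.  Two cases
	 * already hold a relative vruntime and are skipped by pretending
	 * they are on_rq: a freshly forked task (task_fork_fair() already
	 * normalized it) and a TASK_WAKING task (task_waking_fair() did).
	 */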
7124 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7125 on_rq = 1;
7126
7127 if (!on_rq)
7128 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
7129 set_task_rq(p, task_cpu(p));
7130 if (!on_rq) {
7131 cfs_rq = cfs_rq_of(&p->se);
7132 p->se.vruntime += cfs_rq->min_vruntime;
7133#ifdef CONFIG_SMP
7134
7135
7136
7137
7138
7139 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7140 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
7141#endif
7142 }
7143}
7144
7145void free_fair_sched_group(struct task_group *tg)
7146{
7147 int i;
7148
7149 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7150
7151 for_each_possible_cpu(i) {
7152 if (tg->cfs_rq)
7153 kfree(tg->cfs_rq[i]);
7154 if (tg->se)
7155 kfree(tg->se[i]);
7156 }
7157
7158 kfree(tg->cfs_rq);
7159 kfree(tg->se);
7160}
7161
7162int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7163{
7164 struct cfs_rq *cfs_rq;
7165 struct sched_entity *se;
7166 int i;
7167
7168 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7169 if (!tg->cfs_rq)
7170 goto err;
7171 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7172 if (!tg->se)
7173 goto err;
7174
7175 tg->shares = NICE_0_LOAD;
7176
7177 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7178
7179 for_each_possible_cpu(i) {
7180 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7181 GFP_KERNEL, cpu_to_node(i));
7182 if (!cfs_rq)
7183 goto err;
7184
7185 se = kzalloc_node(sizeof(struct sched_entity),
7186 GFP_KERNEL, cpu_to_node(i));
7187 if (!se)
7188 goto err_free_rq;
7189
7190 init_cfs_rq(cfs_rq);
7191 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7192 }
7193
7194 return 1;
7195
7196err_free_rq:
7197 kfree(cfs_rq);
7198err:
7199 return 0;
7200}
7201
7202void unregister_fair_sched_group(struct task_group *tg, int cpu)
7203{
7204 struct rq *rq = cpu_rq(cpu);
7205 unsigned long flags;
7206
7207
7208
7209
7210
7211 if (!tg->cfs_rq[cpu]->on_list)
7212 return;
7213
7214 raw_spin_lock_irqsave(&rq->lock, flags);
7215 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
7216 raw_spin_unlock_irqrestore(&rq->lock, flags);
7217}
7218
7219void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7220 struct sched_entity *se, int cpu,
7221 struct sched_entity *parent)
7222{
7223 struct rq *rq = cpu_rq(cpu);
7224
7225 cfs_rq->tg = tg;
7226 cfs_rq->rq = rq;
7227 init_cfs_rq_runtime(cfs_rq);
7228
7229 tg->cfs_rq[cpu] = cfs_rq;
7230 tg->se[cpu] = se;
7231
7232
7233 if (!se)
7234 return;
7235
7236 if (!parent)
7237 se->cfs_rq = &rq->cfs;
7238 else
7239 se->cfs_rq = parent->my_q;
7240
7241 se->my_q = cfs_rq;
7242
7243 update_load_set(&se->load, NICE_0_LOAD);
7244 se->parent = parent;
7245}
7246
7247static DEFINE_MUTEX(shares_mutex);
7248
7249int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7250{
7251 int i;
7252 unsigned long flags;
7253
7254
7255
7256
7257 if (!tg->se[0])
7258 return -EINVAL;
7259
7260 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7261
7262 mutex_lock(&shares_mutex);
7263 if (tg->shares == shares)
7264 goto done;
7265
7266 tg->shares = shares;
7267 for_each_possible_cpu(i) {
7268 struct rq *rq = cpu_rq(i);
7269 struct sched_entity *se;
7270
7271 se = tg->se[i];
7272
7273 raw_spin_lock_irqsave(&rq->lock, flags);
7274
7275
7276 update_rq_clock(rq);
7277 for_each_sched_entity(se)
7278 update_cfs_shares(group_cfs_rq(se));
7279 raw_spin_unlock_irqrestore(&rq->lock, flags);
7280 }
7281
7282done:
7283 mutex_unlock(&shares_mutex);
7284 return 0;
7285}
7286#else
7287
7288void free_fair_sched_group(struct task_group *tg) { }
7289
7290int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7291{
7292 return 1;
7293}
7294
7295void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
7296
7297#endif
7298
7299
7300static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
7301{
7302 struct sched_entity *se = &task->se;
7303 unsigned int rr_interval = 0;
7304
7305
7306
7307
7308
7309 if (rq->cfs.load.weight)
7310 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
7311
7312 return rr_interval;
7313}
7314
7315
7316
7317
7318const struct sched_class fair_sched_class = {
7319 .next = &idle_sched_class,
7320 .enqueue_task = enqueue_task_fair,
7321 .dequeue_task = dequeue_task_fair,
7322 .yield_task = yield_task_fair,
7323 .yield_to_task = yield_to_task_fair,
7324
7325 .check_preempt_curr = check_preempt_wakeup,
7326
7327 .pick_next_task = pick_next_task_fair,
7328 .put_prev_task = put_prev_task_fair,
7329
7330#ifdef CONFIG_SMP
7331 .select_task_rq = select_task_rq_fair,
7332 .migrate_task_rq = migrate_task_rq_fair,
7333
7334 .rq_online = rq_online_fair,
7335 .rq_offline = rq_offline_fair,
7336
7337 .task_waking = task_waking_fair,
7338#endif
7339
7340 .set_curr_task = set_curr_task_fair,
7341 .task_tick = task_tick_fair,
7342 .task_fork = task_fork_fair,
7343
7344 .prio_changed = prio_changed_fair,
7345 .switched_from = switched_from_fair,
7346 .switched_to = switched_to_fair,
7347
7348 .get_rr_interval = get_rr_interval_fair,
7349
7350#ifdef CONFIG_FAIR_GROUP_SCHED
7351 .task_move_group = task_move_group_fair,
7352#endif
7353};
7354
7355#ifdef CONFIG_SCHED_DEBUG
7356void print_cfs_stats(struct seq_file *m, int cpu)
7357{
7358 struct cfs_rq *cfs_rq;
7359
7360 rcu_read_lock();
7361 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
7362 print_cfs_rq(m, cpu, cfs_rq);
7363 rcu_read_unlock();
7364}
7365#endif
7366
7367__init void init_sched_fair_class(void)
7368{
7369#ifdef CONFIG_SMP
7370 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7371
7372#ifdef CONFIG_NO_HZ_COMMON
7373 nohz.next_balance = jiffies;
7374 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7375 cpu_notifier(sched_ilb_notifier, 0);
7376#endif
7377#endif
7378
7379}
7380