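/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 */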
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"
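
/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */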
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *  SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *  SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 *  SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * is kept equal to sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from the global (tg) pool to the local
 * (per-cfs_rq) pool each time a cfs_rq requests quota.
 *
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}
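
/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 */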
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
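
/*
 * delta *= weight / lw
 */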
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}


const struct sched_class fair_sched_class;
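
/**************************************************************
 * CFS operations on generic schedulable entities:
 */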

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued.  The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * The preemption test can only be made between sibling entities
	 * that are in the same cfs_rq, i.e. that have a common parent.
	 * Walk up the hierarchy of both entities until we find their
	 * ancestors that are siblings of a common parent.
	 */

	/* First walk up until both entities are at the same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
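
/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */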

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
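
/*
 * Enqueue an entity into the rb-tree:
 */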
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}
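
/**************************************************************
 * Scheduling class statistics methods:
 */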

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif
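
/*
 * delta /= w
 */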
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}
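
/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */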
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
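
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */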
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}
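
/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */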
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#ifdef CONFIG_SMP
static inline void __update_task_entity_contrib(struct sched_entity *se);

/* Give new tasks initial runnable-average values so they carry load from the start */
void init_task_runnable_average(struct task_struct *p)
{
	u32 slice;

	p->se.avg.decay_count = 0;
	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
	p->se.avg.runnable_avg_sum = slice;
	p->se.avg.runnable_avg_period = slice;
	__update_task_entity_contrib(&p->se);
}
#else
void init_task_runnable_average(struct task_struct *p)
{
}
#endif
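
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */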
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
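
/**************************************************
 * Scheduling class queueing methods:
 */
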
#ifdef CONFIG_NUMA_BALANCING
/*
 * numa task sample period in ms
 */
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

static void task_numa_placement(struct task_struct *p)
{
	int seq;

	if (!p->mm)	/* for example, ksmd faulting in a user's mm */
		return;
	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;

	/* FIXME: Scheduling placement policy hints go here */
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int node, int pages, bool migrated)
{
	struct task_struct *p = current;

	if (!numabalancing_enabled)
		return;

	/*
	 * If pages are properly placed (did not migrate) then scan slower.
	 * This is reset periodically in case of phase changes
	 */
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
			p->numa_scan_period + jiffies_to_msecs(10));

	task_numa_placement(p);
}

static void reset_ptenuma_scan(struct task_struct *p)
{
	ACCESS_ONCE(p->mm->numa_scan_seq)++;
	p->mm->numa_scan_offset = 0;
}
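
/*
 * The expensive part of numa migration is done from task_work context.
 * Periodically we scan VMAs to detect NUMA hinting faults and
 * migrations.
 */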
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	unsigned long start, end;
	long pages;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * We do not care about task placement until a task runs on a node
	 * other than the first one used by the address space. This is
	 * largely because migrations are driven by what CPU the task
	 * is running on.
	 */
	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
		mm->first_nid = numa_node_id();
	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
		/* Are we running on a new node yet? */
		if (numa_node_id() == mm->first_nid &&
		    !sched_feat_numa(NUMA_FORCE))
			return;

		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
	}

	/*
	 * Reset the scan period to the minimum if enough time has gone by,
	 * in case workload phase changes made the slowed-down scan rate
	 * inappropriate.
	 */
	migrate = mm->numa_next_reset;
	if (time_after(now, migrate)) {
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
		xchg(&mm->numa_next_reset, next_scan);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Do not scan if the current running node is rate-limited for
	 * migrations. This loses statistics on the fault, but favours
	 * desirable behaviour over strict correctness.
	 */
	if (migrate_ratelimited(numa_node_id()))
		return;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	if (!pages)
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma))
			continue;

		/* Skip small VMAs. They are not likely to be of relevance */
		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			pages -= change_prot_numa(vma, start, end);

			start = end;
			if (pages <= 0)
				goto out;
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start, so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);
}
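
/*
 * Drive the periodic memory faults..
 */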
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now - curr->node_stamp > period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		curr->node_stamp = now;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work);
			task_work_add(curr, work, true);
		}
	}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se))
		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
	cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
	if (entity_is_task(se))
		list_del_init(&se->group_node);
	cfs_rq->nr_running--;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
	long tg_weight;

	/*
	 * Use this CPU's actual weight instead of the last load_contribution
	 * to gain a more accurate current total weight. See
	 * update_cfs_rq_load_contribution().
	 */
	tg_weight = atomic_long_read(&tg->load_avg);
	tg_weight -= cfs_rq->tg_load_contrib;
	tg_weight += cfs_rq->load.weight;

	return tg_weight;
}

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	long tg_weight, load, shares;

	tg_weight = calc_tg_weight(tg, cfs_rq);
	load = cfs_rq->load.weight;

	shares = (tg->shares * load);
	if (tg_weight)
		shares /= tg_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg->shares)
		shares = tg->shares;

	return shares;
}
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	return tg->shares;
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
	}

	update_load_set(&se->load, weight);

	if (se->on_rq)
		account_entity_enqueue(cfs_rq, se);
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
	struct task_group *tg;
	struct sched_entity *se;
	long shares;

	tg = cfs_rq->tg;
	se = tg->se[cpu_of(rq_of(cfs_rq))];
	if (!se || throttled_hierarchy(cfs_rq))
		return;
#ifndef CONFIG_SMP
	if (likely(se->load.weight == tg->shares))
		return;
#endif
	shares = calc_cfs_shares(cfs_rq, tg);

	reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_SMP
/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */

/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
	0x85aac367, 0x82cd8698,
};

/*
 * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
 * over-estimates when re-combining.
 */
static const u32 runnable_avg_yN_sum[] = {
	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};

/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static __always_inline u64 decay_load(u64 val, u64 n)
{
	unsigned int local_n;

	if (!n)
		return val;
	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
		return 0;

	/* after bounds checking we can collapse to 32-bit */
	local_n = n;

	/*
	 * As y^PERIOD = 1/2, we can combine
	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
	 * with a look-up table which covers y^n (n < PERIOD).
	 */
	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
		val >>= local_n / LOAD_AVG_PERIOD;
		local_n %= LOAD_AVG_PERIOD;
	}

	val *= runnable_avg_yN_inv[local_n];
	/* We don't use SRR here since we always want to round down. */
	return val >> 32;
}

/*
 * For updates fully spanning n periods, the contribution to runnable
 * average will be: \Sum 1024*y^n
 *
 * We can compute this reasonably efficiently by combining:
 *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n < PERIOD}
 */
static u32 __compute_runnable_contrib(u64 n)
{
	u32 contrib = 0;

	if (likely(n <= LOAD_AVG_PERIOD))
		return runnable_avg_yN_sum[n];
	else if (unlikely(n >= LOAD_AVG_MAX_N))
		return LOAD_AVG_MAX;

	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
	do {
		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

		n -= LOAD_AVG_PERIOD;
	} while (n > LOAD_AVG_PERIOD);

	contrib = decay_load(contrib, n);
	return contrib + runnable_avg_yN_sum[n];
}
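
/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *      p0            p1           p2
 *     (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */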
static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable)
{
	u64 delta, periods;
	u32 runnable_contrib;
	int delta_w, decayed = 0;

	delta = now - sa->last_runnable_update;
	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	delta >>= 10;
	if (!delta)
		return 0;
	sa->last_runnable_update = now;

	/* delta_w is the amount already accumulated against our next period */
	delta_w = sa->runnable_avg_period % 1024;
	if (delta + delta_w >= 1024) {
		/* period roll-over */
		decayed = 1;

		/*
		 * Now that we know we're crossing a period boundary, figure
		 * out how much from delta we need to complete the current
		 * period and accrue it.
		 */
		delta_w = 1024 - delta_w;
		if (runnable)
			sa->runnable_avg_sum += delta_w;
		sa->runnable_avg_period += delta_w;

		delta -= delta_w;

		/* Figure out how many additional periods this update spans */
		periods = delta / 1024;
		delta %= 1024;

		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
						  periods + 1);
		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
						     periods + 1);

		/* Efficiently calculate \Sum (1..n_period) 1024*y^i */
		runnable_contrib = __compute_runnable_contrib(periods);
		if (runnable)
			sa->runnable_avg_sum += runnable_contrib;
		sa->runnable_avg_period += runnable_contrib;
	}

	/* Remainder of delta accrued against u_0` */
	if (runnable)
		sa->runnable_avg_sum += delta;
	sa->runnable_avg_period += delta;

	return decayed;
}

/* Synchronize an entity's decay with its parenting cfs_rq. */
static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 decays = atomic64_read(&cfs_rq->decay_counter);

	decays -= se->avg.decay_count;
	if (!decays)
		return 0;

	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
	se->avg.decay_count = 0;

	return decays;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update)
{
	struct task_group *tg = cfs_rq->tg;
	long tg_contrib;

	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
	tg_contrib -= cfs_rq->tg_load_contrib;

	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
		atomic_long_add(tg_contrib, &tg->load_avg);
		cfs_rq->tg_load_contrib += tg_contrib;
	}
}

/*
 * Aggregate cfs_rq runnable averages into an equivalent task_group
 * representation for computing load contributions.
 */
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
						  struct cfs_rq *cfs_rq)
{
	struct task_group *tg = cfs_rq->tg;
	long contrib;

	/* The fraction of a cpu used by this cfs_rq */
	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
			  sa->runnable_avg_period + 1);
	contrib -= cfs_rq->tg_runnable_contrib;

	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
		atomic_add(contrib, &tg->runnable_avg);
		cfs_rq->tg_runnable_contrib += contrib;
	}
}

static inline void __update_group_entity_contrib(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = group_cfs_rq(se);
	struct task_group *tg = cfs_rq->tg;
	int runnable_avg;

	u64 contrib;

	contrib = cfs_rq->tg_load_contrib * tg->shares;
	se->avg.load_avg_contrib = div_u64(contrib,
				     atomic_long_read(&tg->load_avg) + 1);

	/*
	 * For group entities we need to compute a correction term in the case
	 * that they are consuming less than one cpu, so that they contribute
	 * the same load as a task of equal weight would.  We approximate the
	 * group's cpu usage with tg->runnable_avg (the aggregate of the
	 * per-cpu tg_runnable_contrib values) and, when it is below a full
	 * cpu's worth, scale the entity's contribution down accordingly.
	 */
	runnable_avg = atomic_read(&tg->runnable_avg);
	if (runnable_avg < NICE_0_LOAD) {
		se->avg.load_avg_contrib *= runnable_avg;
		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
	}
}
#else
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
						  struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
#endif

static inline void __update_task_entity_contrib(struct sched_entity *se)
{
	u32 contrib;

	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
	contrib /= (se->avg.runnable_avg_period + 1);
	se->avg.load_avg_contrib = scale_load(contrib);
}

/* Compute the current contribution to load_avg by se, return any delta */
static long __update_entity_load_avg_contrib(struct sched_entity *se)
{
	long old_contrib = se->avg.load_avg_contrib;

	if (entity_is_task(se)) {
		__update_task_entity_contrib(se);
	} else {
		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
		__update_group_entity_contrib(se);
	}

	return se->avg.load_avg_contrib - old_contrib;
}

static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
						 long load_contrib)
{
	if (likely(load_contrib < cfs_rq->blocked_load_avg))
		cfs_rq->blocked_load_avg -= load_contrib;
	else
		cfs_rq->blocked_load_avg = 0;
}

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);

/* Update a sched_entity's runnable average */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	long contrib_delta;
	u64 now;

	/*
	 * For a group entity we need to use their owned cfs_rq_clock_task()
	 * in case they are the parent of a throttled hierarchy.
	 */
	if (entity_is_task(se))
		now = cfs_rq_clock_task(cfs_rq);
	else
		now = cfs_rq_clock_task(group_cfs_rq(se));

	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
		return;

	contrib_delta = __update_entity_load_avg_contrib(se);

	if (!update_cfs_rq)
		return;

	if (se->on_rq)
		cfs_rq->runnable_load_avg += contrib_delta;
	else
		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
}

/*
 * Decay the load contributed by all blocked children and account this so that
 * their contribution may appropriately discounted when they wake up.
 */
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
	u64 decays;

	decays = now - cfs_rq->last_decay;
	if (!decays && !force_update)
		return;

	if (atomic_long_read(&cfs_rq->removed_load)) {
		unsigned long removed_load;
		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
		subtract_blocked_load_contrib(cfs_rq, removed_load);
	}

	if (decays) {
		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
						      decays);
		atomic64_add(decays, &cfs_rq->decay_counter);
		cfs_rq->last_decay = now;
	}

	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}

/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
						  struct sched_entity *se,
						  int wakeup)
{
	/*
	 * We track migrations using entity decay_count <= 0, on a wake-up
	 * migration we use a negative decay count to track the remote decays
	 * accumulated while sleeping.
	 *
	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
	 * are seen by enqueue_entity_load_avg() as a migration with an
	 * already constructed load_avg_contrib.
	 */
	if (unlikely(se->avg.decay_count <= 0)) {
		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
		if (se->avg.decay_count) {
			/*
			 * In a wake-up migration we have to approximate the
			 * time sleeping.  This is because we can't synchronize
			 * clock_task between the two cpus, and it is not
			 * guaranteed to be read-safe.  Instead, we can
			 * approximate this using our carried decays, which are
			 * explicitly atomically readable.
			 */
			se->avg.last_runnable_update -= (-se->avg.decay_count)
							<< 20;
			update_entity_load_avg(se, 0);
			/* Indicate that we're now synchronized and on-rq */
			se->avg.decay_count = 0;
		}
		wakeup = 0;
	} else {
		/*
		 * Task re-woke on same cpu (or else migrate_task_rq_fair()
		 * would have made count negative); we must be careful to avoid
		 * double-accounting blocked time after synchronizing decays.
		 */
		se->avg.last_runnable_update += __synchronize_entity_decay(se)
							<< 20;
	}

	/* migrated tasks did not contribute to our blocked load */
	if (wakeup) {
		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
		update_entity_load_avg(se, 0);
	}

	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
	/* we force update consideration on load-balancer moves */
	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}

/*
 * Remove se's load from this cfs_rq child load-average, if the entity is
 * transitioning to a blocked state we track its projected decay using
 * blocked_load_avg.
 */
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
						  struct sched_entity *se,
						  int sleep)
{
	update_entity_load_avg(se, 1);
	/* we force update consideration on load-balancer moves */
	update_cfs_rq_blocked_load(cfs_rq, !sleep);

	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
	if (sleep) {
		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
	}
}

/*
 * Update the rq's load with the elapsed running time before entering
 * idle. If the last scheduled task is not a CFS task, idle_enter will
 * be the last update_rq_runnable_avg.
 */
void idle_enter_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 1);
}

/*
 * Update the rq's load with the elapsed idle time before a task is
 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
 * be the last update_rq_runnable_avg.
 */
void idle_exit_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 0);
}

#else
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup) {}
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update) {}
#endif

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
	account_entity_enqueue(cfs_rq, se);
	update_cfs_shares(cfs_rq);

	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1) {
		list_add_leaf_cfs_rq(cfs_rq);
		check_enqueue_throttle(cfs_rq);
	}
}

static void __clear_buddies_last(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->last == se)
			cfs_rq->last = NULL;
		else
			break;
	}
}

static void __clear_buddies_next(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->next == se)
			cfs_rq->next = NULL;
		else
			break;
	}
}

static void __clear_buddies_skip(struct sched_entity *se)
{
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);
		if (cfs_rq->skip == se)
			cfs_rq->skip = NULL;
		else
			break;
	}
}

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		__clear_buddies_last(se);

	if (cfs_rq->next == se)
		__clear_buddies_next(se);

	if (cfs_rq->skip == se)
		__clear_buddies_skip(se);
}

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	if (delta > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}

static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it get to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. dont track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->statistics.slice_max = max(se->statistics.slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *left = se;

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
		struct sched_entity *second = __pick_next_entity(se);
		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	/*
	 * Someone really wants this to run. If it's not unfair, run it.
	 */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	clear_buddies(cfs_rq, se);

	return se;
}

static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	/* throttle cfs_rqs exceeding runtime */
	check_cfs_rq_runtime(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
		/* in !on_rq case, update occurred at dequeue */
		update_entity_load_avg(prev, 1);
	}
	cfs_rq->curr = NULL;
}

static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_entity_load_avg(curr, 1);
	update_cfs_rq_blocked_load(cfs_rq, 1);
	update_cfs_shares(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
}
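
/**************************************************
 * CFS bandwidth control machinery
 */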

#ifdef CONFIG_CFS_BANDWIDTH

#ifdef HAVE_JUMP_LABEL
static struct static_key __cfs_bandwidth_used;

static inline bool cfs_bandwidth_used(void)
{
	return static_key_false(&__cfs_bandwidth_used);
}

void account_cfs_bandwidth_used(int enabled, int was_enabled)
{
	/* only need to count groups transitioning between enabled/!enabled */
	if (enabled && !was_enabled)
		static_key_slow_inc(&__cfs_bandwidth_used);
	else if (!enabled && was_enabled)
		static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
{
	return true;
}

void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
#endif /* HAVE_JUMP_LABEL */

/*
 * default period for cfs group bandwidth.
 * default: 0.1s, units: nanoseconds
 */
static inline u64 default_cfs_period(void)
{
	return 100000000ULL;
}

static inline u64 sched_cfs_bandwidth_slice(void)
{
	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
}
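
/*
 * Replenish runtime according to assigned quota and update expiration time.
 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
 * additional synchronization around rq->lock.
 *
 * requires cfs_b->lock
 */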
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
	u64 now;

	if (cfs_b->quota == RUNTIME_INF)
		return;

	now = sched_clock_cpu(smp_processor_id());
	cfs_b->runtime = cfs_b->quota;
	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
}

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return &tg->cfs_bandwidth;
}

/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
	if (unlikely(cfs_rq->throttle_count))
		return cfs_rq->throttled_clock_task;

	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}

/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	struct task_group *tg = cfs_rq->tg;
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
	u64 amount = 0, min_amount, expires;

	/* note: this is a positive sum as runtime_remaining <= 0 */
	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

	raw_spin_lock(&cfs_b->lock);
	if (cfs_b->quota == RUNTIME_INF)
		amount = min_amount;
	else {
		/*
		 * If the bandwidth pool has become inactive, then at least one
		 * period must have elapsed since the last consumption.
		 * Refresh the global state and ensure the bandwidth timer
		 * becomes active.
		 */
		if (!cfs_b->timer_active) {
			__refill_cfs_bandwidth_runtime(cfs_b);
			__start_cfs_bandwidth(cfs_b);
		}

		if (cfs_b->runtime > 0) {
			amount = min(cfs_b->runtime, min_amount);
			cfs_b->runtime -= amount;
			cfs_b->idle = 0;
		}
	}
	expires = cfs_b->runtime_expires;
	raw_spin_unlock(&cfs_b->lock);

	cfs_rq->runtime_remaining += amount;
	/*
	 * we may have advanced our local expiration to account for allowed
	 * spread between our sched_clock and the one on which runtime was
	 * issued.
	 */
	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
		cfs_rq->runtime_expires = expires;

	return cfs_rq->runtime_remaining > 0;
}

/*
 * Note: This depends on the synchronization provided by sched_clock and the
 * fact that rq->clock snapshots this value.
 */
static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

	/* if the deadline is ahead of our clock, nothing to do */
	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
		return;

	if (cfs_rq->runtime_remaining < 0)
		return;

	/*
	 * If the local deadline has passed we have to consider the
	 * possibility that our sched_clock is 'fast' and the global deadline
	 * has not truly expired.
	 *
	 * Fortunately we can determine whether this is the case by checking
	 * whether the global deadline has advanced.
	 */
	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
		/* extend local deadline, drift is bounded above by 2 ticks */
		cfs_rq->runtime_expires += TICK_NSEC;
	} else {
		/* global deadline is ahead, expiration has passed */
		cfs_rq->runtime_remaining = 0;
	}
}

static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				     unsigned long delta_exec)
{
	/* dock delta_exec before expiring quota (as it could span periods) */
	cfs_rq->runtime_remaining -= delta_exec;
	expire_cfs_rq_runtime(cfs_rq);

	if (likely(cfs_rq->runtime_remaining > 0))
		return;

	/*
	 * if we're unable to extend our runtime we resched so that the active
	 * hierarchy can be throttled
	 */
	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
		resched_task(rq_of(cfs_rq)->curr);
}

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
{
	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
		return;

	__account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return cfs_bandwidth_used() && cfs_rq->throttled;
}

/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return cfs_bandwidth_used() && cfs_rq->throttle_count;
}

/*
 * Ensure that neither of the group entities corresponding to src_cpu or
 * dest_cpu are members of a throttled hierarchy when performing group
 * load-balance operations.
 */
static inline int throttled_lb_pair(struct task_group *tg,
				    int src_cpu, int dest_cpu)
{
	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;

	src_cfs_rq = tg->cfs_rq[src_cpu];
	dest_cfs_rq = tg->cfs_rq[dest_cpu];

	return throttled_hierarchy(src_cfs_rq) ||
	       throttled_hierarchy(dest_cfs_rq);
}

/* updated child weight may affect parent so we have to do this bottom up */
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
	struct rq *rq = data;
	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

	cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
	if (!cfs_rq->throttle_count) {
		/* adjust cfs_rq_clock_task() */
		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
					     cfs_rq->throttled_clock_task;
	}
#endif

	return 0;
}

static int tg_throttle_down(struct task_group *tg, void *data)
{
	struct rq *rq = data;
	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

	/* group is entering throttled state, stop time */
	if (!cfs_rq->throttle_count)
		cfs_rq->throttled_clock_task = rq_clock_task(rq);
	cfs_rq->throttle_count++;

	return 0;
}

static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct sched_entity *se;
	long task_delta, dequeue = 1;

	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

	/* freeze hierarchy runnable averages while throttled */
	rcu_read_lock();
	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
	rcu_read_unlock();

	task_delta = cfs_rq->h_nr_running;
	for_each_sched_entity(se) {
		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
		/* throttled entity or throttle-on-deactivate */
		if (!se->on_rq)
			break;

		if (dequeue)
			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
		qcfs_rq->h_nr_running -= task_delta;

		if (qcfs_rq->load.weight)
			dequeue = 0;
	}

	if (!se)
		rq->nr_running -= task_delta;

	cfs_rq->throttled = 1;
	cfs_rq->throttled_clock = rq_clock(rq);
	raw_spin_lock(&cfs_b->lock);
	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
	raw_spin_unlock(&cfs_b->lock);
}

void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct sched_entity *se;
	int enqueue = 1;
	long task_delta;

	se = cfs_rq->tg->se[cpu_of(rq)];

	cfs_rq->throttled = 0;

	update_rq_clock(rq);

	raw_spin_lock(&cfs_b->lock);
	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
	list_del_rcu(&cfs_rq->throttled_list);
	raw_spin_unlock(&cfs_b->lock);

	/* update hierarchical throttle state */
	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

	if (!cfs_rq->load.weight)
		return;

	task_delta = cfs_rq->h_nr_running;
	for_each_sched_entity(se) {
		if (se->on_rq)
			enqueue = 0;

		cfs_rq = cfs_rq_of(se);
		if (enqueue)
			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
		cfs_rq->h_nr_running += task_delta;

		if (cfs_rq_throttled(cfs_rq))
			break;
	}

	if (!se)
		rq->nr_running += task_delta;

	/* determine whether we need to wake up potentially idle cpu */
	if (rq->curr == rq->idle && rq->cfs.nr_running)
		resched_task(rq->curr);
}

static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
		u64 remaining, u64 expires)
{
	struct cfs_rq *cfs_rq;
	u64 runtime = remaining;

	rcu_read_lock();
	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				throttled_list) {
		struct rq *rq = rq_of(cfs_rq);

		raw_spin_lock(&rq->lock);
		if (!cfs_rq_throttled(cfs_rq))
			goto next;

		runtime = -cfs_rq->runtime_remaining + 1;
		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;

		cfs_rq->runtime_remaining += runtime;
		cfs_rq->runtime_expires = expires;

		/* we check whether we're throttled above */
		if (cfs_rq->runtime_remaining > 0)
			unthrottle_cfs_rq(cfs_rq);

next:
		raw_spin_unlock(&rq->lock);

		if (!remaining)
			break;
	}
	rcu_read_unlock();

	return remaining;
}
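
/*
 * Responsible for refilling a task_group's bandwidth and unthrottling its
 * cfs_rqs as appropriate. If there has been no activity within the last
 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
 * used to track this state.
 */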
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
	u64 runtime, runtime_expires;
	int idle = 1, throttled;

	raw_spin_lock(&cfs_b->lock);
	/* no need to continue the timer with no bandwidth constraint */
	if (cfs_b->quota == RUNTIME_INF)
		goto out_unlock;

	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
	/* idle depends on !throttled (for the case of a large deficit) */
	idle = cfs_b->idle && !throttled;
	cfs_b->nr_periods += overrun;

	/* if we're going inactive then everything else can be deferred */
	if (idle)
		goto out_unlock;

	__refill_cfs_bandwidth_runtime(cfs_b);

	if (!throttled) {
		/* mark as potentially idle for the upcoming period */
		cfs_b->idle = 1;
		goto out_unlock;
	}

	/* account preceding periods in which throttling occurred */
	cfs_b->nr_throttled += overrun;

	/*
	 * There are throttled entities so we must first use the new bandwidth
	 * to unthrottle them before making it generally available.  This
	 * ensures that all existing debts will be paid before a new cfs_rq is
	 * allowed to run.
	 */
	runtime = cfs_b->runtime;
	runtime_expires = cfs_b->runtime_expires;
	cfs_b->runtime = 0;

	/*
	 * This check is repeated as we are holding onto the new bandwidth
	 * while we unthrottle.  This can potentially race with an unthrottled
	 * group trying to acquire new bandwidth from the global pool.
	 */
	while (throttled && runtime > 0) {
		raw_spin_unlock(&cfs_b->lock);
		/* we can't nest cfs_b->lock while distributing bandwidth */
		runtime = distribute_cfs_runtime(cfs_b, runtime,
						 runtime_expires);
		raw_spin_lock(&cfs_b->lock);

		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
	}

	/* return (any) remaining runtime */
	cfs_b->runtime = runtime;

	/*
	 * While we are ensured activity in the period following an
	 * unthrottle, this also covers the case in which the new bandwidth is
	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
	 * period timer to remain active while there are any throttled
	 * entities.)
	 */
	cfs_b->idle = 0;
out_unlock:
	if (idle)
		cfs_b->timer_active = 0;
	raw_spin_unlock(&cfs_b->lock);

	return idle;
}

/* a cfs_rq won't donate quota below this amount */
static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
/* minimum remaining period time to redistribute slack quota */
static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

/* are we near the end of the current quota period? */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
	struct hrtimer *refresh_timer = &cfs_b->period_timer;
	u64 remaining;

	/* if the call-back is running a quota refresh is already occurring */
	if (hrtimer_callback_running(refresh_timer))
		return 1;

	/* is a quota refresh about to occur? */
	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
	if (remaining < min_expire)
		return 1;

	return 0;
}

static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
{
	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;

	/* if there's a quota refresh soon don't bother with slack */
	if (runtime_refresh_within(cfs_b, min_left))
		return;

	start_bandwidth_timer(&cfs_b->slack_timer,
				ns_to_ktime(cfs_bandwidth_slack_period));
}

/* we know any runtime found here is valid as update_curr() precedes return */
static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

	if (slack_runtime <= 0)
		return;

	raw_spin_lock(&cfs_b->lock);
	if (cfs_b->quota != RUNTIME_INF &&
	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
		cfs_b->runtime += slack_runtime;

		/* we are under rq->lock, defer unthrottling using a timer */
		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
		    !list_empty(&cfs_b->throttled_cfs_rq))
			start_cfs_slack_bandwidth(cfs_b);
	}
	raw_spin_unlock(&cfs_b->lock);

	/* even if it's not valid for return we don't want to touch it */
	cfs_rq->runtime_remaining -= slack_runtime;
}

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	if (!cfs_bandwidth_used())
		return;

	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
		return;

	__return_cfs_rq_runtime(cfs_rq);
}
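
/*
 * This is done with a timer (instead of inline with bandwidth return) since
 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
 */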
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
	u64 expires;

	/* confirm we're still not at a refresh boundary */
	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
		return;

	raw_spin_lock(&cfs_b->lock);
	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
		runtime = cfs_b->runtime;
		cfs_b->runtime = 0;
	}
	expires = cfs_b->runtime_expires;
	raw_spin_unlock(&cfs_b->lock);

	if (!runtime)
		return;

	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);

	raw_spin_lock(&cfs_b->lock);
	if (expires == cfs_b->runtime_expires)
		cfs_b->runtime = runtime;
	raw_spin_unlock(&cfs_b->lock);
}
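
/*
 * When a group wakes up we want to make sure that its quota is not already
 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime since update_curr() throttling can not trigger until it's on-rq.
 */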
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
	if (!cfs_bandwidth_used())
		return;

	/* an active group must be handled by the update_curr()->put() path */
	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
		return;

	/* ensure the group is not already throttled */
	if (cfs_rq_throttled(cfs_rq))
		return;

	/* update runtime allocation */
	account_cfs_rq_runtime(cfs_rq, 0);
	if (cfs_rq->runtime_remaining <= 0)
		throttle_cfs_rq(cfs_rq);
}

/* conditionally throttle active cfs_rq's from put_prev_entity() */
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	if (!cfs_bandwidth_used())
		return;

	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
		return;

	/*
	 * it's possible for a throttled entity to be forced into a running
	 * state (e.g. set_curr_task), in this case we're finished.
	 */
	if (cfs_rq_throttled(cfs_rq))
		return;

	throttle_cfs_rq(cfs_rq);
}

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
	struct cfs_bandwidth *cfs_b =
		container_of(timer, struct cfs_bandwidth, slack_timer);
	do_sched_cfs_slack_timer(cfs_b);

	return HRTIMER_NORESTART;
}

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
	struct cfs_bandwidth *cfs_b =
		container_of(timer, struct cfs_bandwidth, period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, cfs_b->period);

		if (!overrun)
			break;

		idle = do_sched_cfs_period_timer(cfs_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	raw_spin_lock_init(&cfs_b->lock);
	cfs_b->runtime = 0;
	cfs_b->quota = RUNTIME_INF;
	cfs_b->period = ns_to_ktime(default_cfs_period());

	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	cfs_b->period_timer.function = sched_cfs_period_timer;
	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	cfs_b->slack_timer.function = sched_cfs_slack_timer;
}

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	cfs_rq->runtime_enabled = 0;
	INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
2701
/* requires cfs_b->lock, may release to reprogram timer */
2703void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2704{
	/*
	 * The timer may be active because we're trying to set a new bandwidth
	 * period or because we're racing with the tear-down path
	 * (timer_active==0 becomes visible before the hrtimer call-back
	 * terminates).  In either case we ensure that it's re-programmed
	 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2712 raw_spin_unlock(&cfs_b->lock);
		/* ensure cfs_b->lock is available while we wait */
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock);
		/* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active)
2719 return;
2720 }
2721
2722 cfs_b->timer_active = 1;
2723 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2724}
2725
2726static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2727{
2728 hrtimer_cancel(&cfs_b->period_timer);
2729 hrtimer_cancel(&cfs_b->slack_timer);
2730}
2731
2732static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2733{
2734 struct cfs_rq *cfs_rq;
2735
2736 for_each_leaf_cfs_rq(rq, cfs_rq) {
2737 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2738
2739 if (!cfs_rq->runtime_enabled)
2740 continue;
2741
		/*
		 * clock_task is not advancing so we just need to make sure
		 * there's some valid quota amount
		 */
2746 cfs_rq->runtime_remaining = cfs_b->quota;
2747 if (cfs_rq_throttled(cfs_rq))
2748 unthrottle_cfs_rq(cfs_rq);
2749 }
2750}
2751
#else /* CONFIG_CFS_BANDWIDTH */
2753static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2754{
2755 return rq_clock_task(rq_of(cfs_rq));
2756}
2757
2758static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2759 unsigned long delta_exec) {}
2760static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2761static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2762static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2763
2764static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2765{
2766 return 0;
2767}
2768
2769static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2770{
2771 return 0;
2772}
2773
2774static inline int throttled_lb_pair(struct task_group *tg,
2775 int src_cpu, int dest_cpu)
2776{
2777 return 0;
2778}
2779
2780void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2781
2782#ifdef CONFIG_FAIR_GROUP_SCHED
2783static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2784#endif
2785
2786static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2787{
2788 return NULL;
2789}
2790static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2791static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2792
#endif /* CONFIG_CFS_BANDWIDTH */
2794
/**************************************************
 * CFS operations on tasks:
 */

2799#ifdef CONFIG_SCHED_HRTICK
2800static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2801{
2802 struct sched_entity *se = &p->se;
2803 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2804
2805 WARN_ON(task_rq(p) != rq);
2806
2807 if (cfs_rq->nr_running > 1) {
2808 u64 slice = sched_slice(cfs_rq, se);
2809 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2810 s64 delta = slice - ran;
2811
2812 if (delta < 0) {
2813 if (rq->curr == p)
2814 resched_task(p);
2815 return;
2816 }
2817
		/*
		 * Don't schedule slices shorter than 10000ns, that just
		 * doesn't make sense. Rely on vruntime for fairness.
		 */
2822 if (rq->curr != p)
2823 delta = max_t(s64, 10000LL, delta);
2824
2825 hrtick_start(rq, delta);
2826 }
2827}
2828
/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
2834static void hrtick_update(struct rq *rq)
2835{
2836 struct task_struct *curr = rq->curr;
2837
2838 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2839 return;
2840
2841 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2842 hrtick_start_fair(rq, curr);
2843}
2844#else
2845static inline void
2846hrtick_start_fair(struct rq *rq, struct task_struct *p)
2847{
2848}
2849
2850static inline void hrtick_update(struct rq *rq)
2851{
2852}
2853#endif
2854
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
2860static void
2861enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2862{
2863 struct cfs_rq *cfs_rq;
2864 struct sched_entity *se = &p->se;
2865
2866 for_each_sched_entity(se) {
2867 if (se->on_rq)
2868 break;
2869 cfs_rq = cfs_rq_of(se);
2870 enqueue_entity(cfs_rq, se, flags);
2871
		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running increment below.
		 */
2878 if (cfs_rq_throttled(cfs_rq))
2879 break;
2880 cfs_rq->h_nr_running++;
2881
2882 flags = ENQUEUE_WAKEUP;
2883 }
2884
2885 for_each_sched_entity(se) {
2886 cfs_rq = cfs_rq_of(se);
2887 cfs_rq->h_nr_running++;
2888
2889 if (cfs_rq_throttled(cfs_rq))
2890 break;
2891
2892 update_cfs_shares(cfs_rq);
2893 update_entity_load_avg(se, 1);
2894 }
2895
2896 if (!se) {
2897 update_rq_runnable_avg(rq, rq->nr_running);
2898 inc_nr_running(rq);
2899 }
2900 hrtick_update(rq);
2901}
2902
2903static void set_next_buddy(struct sched_entity *se);
2904
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
2910static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2911{
2912 struct cfs_rq *cfs_rq;
2913 struct sched_entity *se = &p->se;
2914 int task_sleep = flags & DEQUEUE_SLEEP;
2915
2916 for_each_sched_entity(se) {
2917 cfs_rq = cfs_rq_of(se);
2918 dequeue_entity(cfs_rq, se, flags);
		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		 */
2926 if (cfs_rq_throttled(cfs_rq))
2927 break;
2928 cfs_rq->h_nr_running--;
2929
		/* Don't dequeue parent if it has other entities besides us */
2931 if (cfs_rq->load.weight) {
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
2936 if (task_sleep && parent_entity(se))
2937 set_next_buddy(parent_entity(se));
2938
			/* avoid re-evaluating load for this entity */
2940 se = parent_entity(se);
2941 break;
2942 }
2943 flags |= DEQUEUE_SLEEP;
2944 }
2945
2946 for_each_sched_entity(se) {
2947 cfs_rq = cfs_rq_of(se);
2948 cfs_rq->h_nr_running--;
2949
2950 if (cfs_rq_throttled(cfs_rq))
2951 break;
2952
2953 update_cfs_shares(cfs_rq);
2954 update_entity_load_avg(se, 1);
2955 }
2956
2957 if (!se) {
2958 dec_nr_running(rq);
2959 update_rq_runnable_avg(rq, 1);
2960 }
2961 hrtick_update(rq);
2962}
2963
2964#ifdef CONFIG_SMP
/* Used instead of source_load when we know the type == 0 */
2966static unsigned long weighted_cpuload(const int cpu)
2967{
2968 return cpu_rq(cpu)->cfs.runnable_load_avg;
2969}
2970
/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
2978static unsigned long source_load(int cpu, int type)
2979{
2980 struct rq *rq = cpu_rq(cpu);
2981 unsigned long total = weighted_cpuload(cpu);
2982
2983 if (type == 0 || !sched_feat(LB_BIAS))
2984 return total;
2985
2986 return min(rq->cpu_load[type-1], total);
2987}
2988
/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
2993static unsigned long target_load(int cpu, int type)
2994{
2995 struct rq *rq = cpu_rq(cpu);
2996 unsigned long total = weighted_cpuload(cpu);
2997
2998 if (type == 0 || !sched_feat(LB_BIAS))
2999 return total;
3000
3001 return max(rq->cpu_load[type-1], total);
3002}
3003
3004static unsigned long power_of(int cpu)
3005{
3006 return cpu_rq(cpu)->cpu_power;
3007}
3008
3009static unsigned long cpu_avg_load_per_task(int cpu)
3010{
3011 struct rq *rq = cpu_rq(cpu);
3012 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3013 unsigned long load_avg = rq->cfs.runnable_load_avg;
3014
3015 if (nr_running)
3016 return load_avg / nr_running;
3017
3018 return 0;
3019}
3020
3021
3022static void task_waking_fair(struct task_struct *p)
3023{
3024 struct sched_entity *se = &p->se;
3025 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3026 u64 min_vruntime;
3027
3028#ifndef CONFIG_64BIT
3029 u64 min_vruntime_copy;
3030
3031 do {
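		/* retry until both halves of the 64-bit read are consistent */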
3032 min_vruntime_copy = cfs_rq->min_vruntime_copy;
3033 smp_rmb();
3034 min_vruntime = cfs_rq->min_vruntime;
3035 } while (min_vruntime != min_vruntime_copy);
3036#else
3037 min_vruntime = cfs_rq->min_vruntime;
3038#endif
3039
3040 se->vruntime -= min_vruntime;
3041}
3042
3043#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
 *
 * Calculate the effective load difference if @wl is added (subtracted) to @tg
 * on this @cpu and results in a total addition (subtraction) of @wg to the
 * total group weight.
 *
 * Given a runqueue weight distribution (rw_i) we can compute a shares
 * distribution (s_i) using:
 *
 *   s_i = rw_i / \Sum rw_j						(1)
 *
 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
 * shares distribution (s_i):
 *
 *   rw_i = {   2,   4,   1,   0 }
 *   s_i  = { 2/7, 4/7, 1/7,   0 }
 *
 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
 * task used to run on and the CPU the waker is running on), we need to
 * compute the effect of waking a task on either CPU and, in case of a sync
 * wakeup, compute the effect of the current task going to sleep.
 *
 * So for a change of @wl to the local @cpu with an overall group weight change
 * of @wl we can compute the new shares distribution (s'_i) using:
 *
 *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
 *
 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
 * differences in waking a task to CPU 0. The additional task changes the
 * weight and shares distributions like:
 *
 *   rw'_i = {   3,   4,   1,   0 }
 *   s'_i  = { 3/8, 4/8, 1/8,   0 }
 *
 * We can then compute the difference in effective weight by using:
 *
 *   dw_i = S * (s'_i - s_i)						(3)
 *
 * Where 'S' is the group weight as seen by its parent.
 *
 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
 * times the group weight, while the effective change on CPU 1 would be -4/56
 * (4/8 - 4/7) times the group weight.
 */
3094static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3095{
3096 struct sched_entity *se = tg->se[cpu];
3097
3098 if (!tg->parent)
3099 return wl;
3100
3101 for_each_sched_entity(se) {
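		/* walk up the hierarchy, recomputing the share at each level */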
3102 long w, W;
3103
3104 tg = se->my_q->tg;
3105
		/*
		 * W = @wg + \Sum rw_j
		 */
3109 W = wg + calc_tg_weight(tg, se->my_q);
3110
		/*
		 * w = rw_i + @wl
		 */
3114 w = se->my_q->load.weight + wl;
3115
		/*
		 * wl = S * s'_i; see (2)
		 */
3119 if (W > 0 && w < W)
3120 wl = (w * tg->shares) / W;
3121 else
3122 wl = tg->shares;
3123
		/*
		 * Per the above, wl is the new se->load.weight value; since
		 * those are clipped to [MIN_SHARES, ...) do so now. See
		 * calc_cfs_shares().
		 */
3129 if (wl < MIN_SHARES)
3130 wl = MIN_SHARES;
3131
		/*
		 * wl = dw_i = S * (s'_i - s_i); see (3)
		 */
3135 wl -= se->load.weight;
3136
		/*
		 * Recursively apply this logic to all parent groups to compute
		 * the final effective load change on the root group. Since
		 * only the @tg group gets extra weight, all parent groups can
		 * only redistribute existing shares. @wl is the shift in shares
		 * resulting from this level per the above.
		 */
3144 wg = 0;
3145 }
3146
3147 return wl;
3148}
3149#else
3150
3151static inline unsigned long effective_load(struct task_group *tg, int cpu,
3152 unsigned long wl, unsigned long wg)
3153{
3154 return wl;
3155}
3156
3157#endif
3158
3159static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3160{
3161 s64 this_load, load;
3162 int idx, this_cpu, prev_cpu;
3163 unsigned long tl_per_task;
3164 struct task_group *tg;
3165 unsigned long weight;
3166 int balanced;
3167
3168 idx = sd->wake_idx;
3169 this_cpu = smp_processor_id();
3170 prev_cpu = task_cpu(p);
3171 load = source_load(prev_cpu, idx);
3172 this_load = target_load(this_cpu, idx);
3173
	/*
	 * If sync wakeup then subtract the (maximum possible)
	 * effect of the currently running task from the load
	 * of the current CPU:
	 */
3179 if (sync) {
3180 tg = task_group(current);
3181 weight = current->se.load.weight;
3182
3183 this_load += effective_load(tg, this_cpu, -weight, -weight);
3184 load += effective_load(tg, prev_cpu, 0, -weight);
3185 }
3186
3187 tg = task_group(p);
3188 weight = p->se.load.weight;
3189
	/*
	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
	 * due to the sync cause above having dropped this_load to 0, we'll
	 * always have an imbalance, but there's really nothing you can do
	 * about that, so that's good too.
	 *
	 * Otherwise check if either cpus are near enough in load to allow this
	 * task to be woken on this_cpu.
	 */
3199 if (this_load > 0) {
3200 s64 this_eff_load, prev_eff_load;
3201
3202 this_eff_load = 100;
3203 this_eff_load *= power_of(prev_cpu);
3204 this_eff_load *= this_load +
3205 effective_load(tg, this_cpu, weight, weight);
3206
3207 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3208 prev_eff_load *= power_of(this_cpu);
3209 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3210
3211 balanced = this_eff_load <= prev_eff_load;
3212 } else
3213 balanced = true;
3214
	/*
	 * If the currently running task will sleep within
	 * a reasonable amount of time then attract this newly
	 * woken task:
	 */
3220 if (sync && balanced)
3221 return 1;
3222
3223 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3224 tl_per_task = cpu_avg_load_per_task(this_cpu);
3225
3226 if (balanced ||
3227 (this_load <= load &&
3228 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
		/*
		 * The domain is balanced, or this cpu is lightly loaded
		 * enough (relative to prev_cpu and the average task load)
		 * that an affine wakeup here is worthwhile:
		 */
3234 schedstat_inc(sd, ttwu_move_affine);
3235 schedstat_inc(p, se.statistics.nr_wakeups_affine);
3236
3237 return 1;
3238 }
3239 return 0;
3240}
3241
/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
3246static struct sched_group *
3247find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3248 int this_cpu, int load_idx)
3249{
3250 struct sched_group *idlest = NULL, *group = sd->groups;
3251 unsigned long min_load = ULONG_MAX, this_load = 0;
3252 int imbalance = 100 + (sd->imbalance_pct-100)/2;
3253
3254 do {
3255 unsigned long load, avg_load;
3256 int local_group;
3257 int i;
3258
		/* Skip over this group if it has no CPUs allowed */
3260 if (!cpumask_intersects(sched_group_cpus(group),
3261 tsk_cpus_allowed(p)))
3262 continue;
3263
3264 local_group = cpumask_test_cpu(this_cpu,
3265 sched_group_cpus(group));
3266
3267
3268 avg_load = 0;
3269
3270 for_each_cpu(i, sched_group_cpus(group)) {
3271
3272 if (local_group)
3273 load = source_load(i, load_idx);
3274 else
3275 load = target_load(i, load_idx);
3276
3277 avg_load += load;
3278 }
3279
3280
3281 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3282
3283 if (local_group) {
3284 this_load = avg_load;
3285 } else if (avg_load < min_load) {
3286 min_load = avg_load;
3287 idlest = group;
3288 }
3289 } while (group = group->next, group != sd->groups);
3290
3291 if (!idlest || 100*this_load < imbalance*min_load)
3292 return NULL;
3293 return idlest;
3294}
3295
/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */
3299static int
3300find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3301{
3302 unsigned long load, min_load = ULONG_MAX;
3303 int idlest = -1;
3304 int i;
3305
	/* Traverse only the allowed CPUs */
3307 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3308 load = weighted_cpuload(i);
3309
3310 if (load < min_load || (load == min_load && i == this_cpu)) {
3311 min_load = load;
3312 idlest = i;
3313 }
3314 }
3315
3316 return idlest;
3317}
3318
/*
 * Try and locate an idle CPU in the sched_domain.
 */
3322static int select_idle_sibling(struct task_struct *p, int target)
3323{
3324 struct sched_domain *sd;
3325 struct sched_group *sg;
3326 int i = task_cpu(p);
3327
3328 if (idle_cpu(target))
3329 return target;
3330
	/*
	 * If the previous cpu is cache affine and idle, don't be stupid.
	 */
3334 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3335 return i;
3336
	/*
	 * Otherwise, iterate the domains and find an eligible idle cpu.
	 */
3340 sd = rcu_dereference(per_cpu(sd_llc, target));
3341 for_each_lower_domain(sd) {
3342 sg = sd->groups;
3343 do {
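			/* pick a group only if all its allowed cpus are idle */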
3344 if (!cpumask_intersects(sched_group_cpus(sg),
3345 tsk_cpus_allowed(p)))
3346 goto next;
3347
3348 for_each_cpu(i, sched_group_cpus(sg)) {
3349 if (i == target || !idle_cpu(i))
3350 goto next;
3351 }
3352
3353 target = cpumask_first_and(sched_group_cpus(sg),
3354 tsk_cpus_allowed(p));
3355 goto done;
3356next:
3357 sg = sg->next;
3358 } while (sg != sd->groups);
3359 }
3360done:
3361 return target;
3362}
3363
/*
 * select_task_rq_fair: balance the current task (running on cpu) in domains
 * that have the relevant flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
3375static int
3376select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3377{
3378 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3379 int cpu = smp_processor_id();
3380 int prev_cpu = task_cpu(p);
3381 int new_cpu = cpu;
3382 int want_affine = 0;
3383 int sync = wake_flags & WF_SYNC;
3384
3385 if (p->nr_cpus_allowed == 1)
3386 return prev_cpu;
3387
3388 if (sd_flag & SD_BALANCE_WAKE) {
3389 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3390 want_affine = 1;
3391 new_cpu = prev_cpu;
3392 }
3393
3394 rcu_read_lock();
3395 for_each_domain(cpu, tmp) {
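		/* remember the widest affine domain and the last domain with sd_flag set */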
3396 if (!(tmp->flags & SD_LOAD_BALANCE))
3397 continue;
3398
		/*
		 * If both cpu and prev_cpu are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
		 */
3403 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3404 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3405 affine_sd = tmp;
3406 break;
3407 }
3408
3409 if (tmp->flags & sd_flag)
3410 sd = tmp;
3411 }
3412
3413 if (affine_sd) {
3414 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3415 prev_cpu = cpu;
3416
3417 new_cpu = select_idle_sibling(p, prev_cpu);
3418 goto unlock;
3419 }
3420
3421 while (sd) {
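		/* descend the domain hierarchy, moving toward the idlest group/cpu */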
3422 int load_idx = sd->forkexec_idx;
3423 struct sched_group *group;
3424 int weight;
3425
3426 if (!(sd->flags & sd_flag)) {
3427 sd = sd->child;
3428 continue;
3429 }
3430
3431 if (sd_flag & SD_BALANCE_WAKE)
3432 load_idx = sd->wake_idx;
3433
3434 group = find_idlest_group(sd, p, cpu, load_idx);
3435 if (!group) {
3436 sd = sd->child;
3437 continue;
3438 }
3439
3440 new_cpu = find_idlest_cpu(group, p, cpu);
3441 if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
3443 sd = sd->child;
3444 continue;
3445 }
3446
		/* Now try balancing at a lower domain level of new_cpu */
3448 cpu = new_cpu;
3449 weight = sd->span_weight;
3450 sd = NULL;
3451 for_each_domain(cpu, tmp) {
3452 if (weight <= tmp->span_weight)
3453 break;
3454 if (tmp->flags & sd_flag)
3455 sd = tmp;
3456 }
		/* while loop will break here if sd == NULL */
3458 }
3459unlock:
3460 rcu_read_unlock();
3461
3462 return new_cpu;
3463}
3464
/*
 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
 * cfs_rq_of(p) references at time of call are still valid and identify the
 * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
 * other assumptions, including the state of rq->lock, should be made.
 */
3471static void
3472migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3473{
3474 struct sched_entity *se = &p->se;
3475 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3476
	/*
	 * Load tracking: accumulate removed load so that it can be processed
	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
	 * to blocked load iff they have a positive decay-count.  It can never
	 * be negative here since on-rq tasks have decay-count == 0.
	 */
3483 if (se->avg.decay_count) {
3484 se->avg.decay_count = -__synchronize_entity_decay(se);
3485 atomic_long_add(se->avg.load_avg_contrib,
3486 &cfs_rq->removed_load);
3487 }
3488}
#endif /* CONFIG_SMP */
3490
3491static unsigned long
3492wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3493{
3494 unsigned long gran = sysctl_sched_wakeup_granularity;
3495
	/*
	 * Since 'curr' is running now, convert the gran from real-time
	 * to virtual-time in its units.
	 *
	 * By using 'se' instead of 'curr' we penalize light tasks, so
	 * they get preempted easier. That is, if 'se' < 'curr' then
	 * the resulting gran will be larger, therefore penalizing the
	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
	 * be smaller, again penalizing the lighter task.
	 *
	 * This is especially important for buddies when the leftmost
	 * task is higher priority than the buddy.
	 */
3509 return calc_delta_fair(gran, se);
3510}
3511
/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
3526static int
3527wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3528{
3529 s64 gran, vdiff = curr->vruntime - se->vruntime;
3530
3531 if (vdiff <= 0)
3532 return -1;
3533
3534 gran = wakeup_gran(curr, se);
3535 if (vdiff > gran)
3536 return 1;
3537
3538 return 0;
3539}
3540
3541static void set_last_buddy(struct sched_entity *se)
3542{
3543 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3544 return;
3545
3546 for_each_sched_entity(se)
3547 cfs_rq_of(se)->last = se;
3548}
3549
3550static void set_next_buddy(struct sched_entity *se)
3551{
3552 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3553 return;
3554
3555 for_each_sched_entity(se)
3556 cfs_rq_of(se)->next = se;
3557}
3558
3559static void set_skip_buddy(struct sched_entity *se)
3560{
3561 for_each_sched_entity(se)
3562 cfs_rq_of(se)->skip = se;
3563}
3564
/*
 * Preempt the current task with a newly woken task if needed:
 */
3568static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3569{
3570 struct task_struct *curr = rq->curr;
3571 struct sched_entity *se = &curr->se, *pse = &p->se;
3572 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3573 int scale = cfs_rq->nr_running >= sched_nr_latency;
3574 int next_buddy_marked = 0;
3575
3576 if (unlikely(se == pse))
3577 return;
3578
	/*
	 * This is possible from callers such as move_task(), in which we
	 * unconditionally check_preempt_curr() after an enqueue (which may
	 * have led to a throttle).  This both saves work and prevents false
	 * next-buddy nomination below.
	 */
3585 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3586 return;
3587
3588 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3589 set_next_buddy(pse);
3590 next_buddy_marked = 1;
3591 }
3592
	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 *
	 * Note: this also catches the edge-case of curr being in a throttled
	 * group (e.g. via set_curr_task), since update_curr() (in the
	 * enqueue of curr) will have resulted in resched being set.  This
	 * prevents us from potentially nominating it as a false LAST_BUDDY
	 * below.
	 */
3603 if (test_tsk_need_resched(curr))
3604 return;
3605
	/* Idle tasks are by definition preempted by non-idle tasks. */
3607 if (unlikely(curr->policy == SCHED_IDLE) &&
3608 likely(p->policy != SCHED_IDLE))
3609 goto preempt;

	/*
	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
	 * is driven by the tick):
	 */
3615 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3616 return;
3617
3618 find_matching_se(&se, &pse);
3619 update_curr(cfs_rq_of(se));
3620 BUG_ON(!pse);
3621 if (wakeup_preempt_entity(se, pse) == 1) {
		/*
		 * Bias pick_next to pick the sched entity that is
		 * triggering this preemption.
		 */
3626 if (!next_buddy_marked)
3627 set_next_buddy(pse);
3628 goto preempt;
3629 }
3630
3631 return;
3632
3633preempt:
3634 resched_task(curr);
3635
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons its a bad idea to schedule back to it.
	 */
3644 if (unlikely(!se->on_rq || curr == rq->idle))
3645 return;
3646
3647 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3648 set_last_buddy(se);
3649}
3650
3651static struct task_struct *pick_next_task_fair(struct rq *rq)
3652{
3653 struct task_struct *p;
3654 struct cfs_rq *cfs_rq = &rq->cfs;
3655 struct sched_entity *se;
3656
3657 if (!cfs_rq->nr_running)
3658 return NULL;
3659
3660 do {
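		/* descend the group hierarchy, picking an entity at each level */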
3661 se = pick_next_entity(cfs_rq);
3662 set_next_entity(cfs_rq, se);
3663 cfs_rq = group_cfs_rq(se);
3664 } while (cfs_rq);
3665
3666 p = task_of(se);
3667 if (hrtick_enabled(rq))
3668 hrtick_start_fair(rq, p);
3669
3670 return p;
3671}
3672
/*
 * Account for a descheduled task:
 */
3676static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3677{
3678 struct sched_entity *se = &prev->se;
3679 struct cfs_rq *cfs_rq;
3680
3681 for_each_sched_entity(se) {
3682 cfs_rq = cfs_rq_of(se);
3683 put_prev_entity(cfs_rq, se);
3684 }
3685}
3686
/*
 * sched_yield() is very simple
 *
 * The magic of dealing with the ->skip buddy is in pick_next_entity.
 */
3692static void yield_task_fair(struct rq *rq)
3693{
3694 struct task_struct *curr = rq->curr;
3695 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3696 struct sched_entity *se = &curr->se;
3697
	/*
	 * Are we the only task in the tree?
	 */
3701 if (unlikely(rq->nr_running == 1))
3702 return;
3703
3704 clear_buddies(cfs_rq, se);
3705
3706 if (curr->policy != SCHED_BATCH) {
3707 update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
3711 update_curr(cfs_rq);
3712
		/*
		 * Tell update_rq_clock() that we've just updated,
		 * so we don't do microscopic update in schedule()
		 * and double the fastpath cost.
		 */
3717 rq->skip_clock_update = 1;
3718 }
3719
3720 set_skip_buddy(se);
3721}
3722
3723static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3724{
3725 struct sched_entity *se = &p->se;
3726
	/* throttled hierarchies are not runnable */
3728 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3729 return false;
3730
	/* Tell the scheduler that we'd really like se to run next. */
3732 set_next_buddy(se);
3733
3734 yield_task_fair(rq);
3735
3736 return true;
3737}
3738
3739#ifdef CONFIG_SMP
/**************************************************
 * Fair scheduling class load-balancing methods.
 *
 * BASICS
 *
 * The purpose of load-balancing is to achieve the same basic fairness the
 * per-cpu scheduler provides, namely provide a proportional amount of compute
 * time to each task. This is expressed in the following equation:
 *
 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
 *
 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
 * W_i,0 is defined as:
 *
 *   W_i,0 = \Sum_j w_i,j                                             (2)
 *
 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
 * is derived from the nice value as per prio_to_weight[].
 *
 * The weight average is an exponential decay average of the instantaneous
 * weight:
 *
 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 *
 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 * can also include other factors.
 *
 * To achieve this balance we define a measure of imbalance which follows
 * directly from (1):
 *
 *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
 *
 * We then move tasks around to minimize the imbalance. In the continuous
 * function space it is obvious this converges, in the discrete case we get
 * a few fun cases like infeasible weights and over-loaded interleave
 * patterns (when two or more tasks have equal weight on a single cpu).
 *
 *
 * SCHED DOMAINS
 *
 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 * for all i,j solution, we create a tree of cpus that follows the hardware
 * topology where each level pairs two lower groups (or better). This results
 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
 * tree by doing round-robin on the lower groups. Coupled with a limit on how
 * many tasks we can migrate every balance pass, this keeps each balance pass
 * at O(n) total work, and task movement converges in:
 *
 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (5)
 *
 * An important property is that each CPU is still (indirectly) connected
 * to every other cpu in at most O(log n) steps.
 *
 *
 * WORK CONSERVING
 *
 * In order to avoid CPUs going idle while there's still work to do, new idle
 * balancing is more aggressive and has the newly idle cpu iterate up the
 * domain tree itself instead of relying on other CPUs to bring it work.
 *
 *
 * CGROUPS
 *
 * Cgroups make (2) more complicated; instead of a simple sum we get:
 *
 *                                s_k,i
 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (6)
 *                                 S_k
 *
 * Where s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i, with w_i,j,k the
 * weight of the j-th runnable task in the k-th cgroup on cpu i. The big
 * problem is that S_k is a global sum needed to compute a local (W_i)
 * property.
 */
3858static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3859
3860#define LBF_ALL_PINNED 0x01
3861#define LBF_NEED_BREAK 0x02
3862#define LBF_SOME_PINNED 0x04
3863
3864struct lb_env {
3865 struct sched_domain *sd;
3866
3867 struct rq *src_rq;
3868 int src_cpu;
3869
3870 int dst_cpu;
3871 struct rq *dst_rq;
3872
3873 struct cpumask *dst_grpmask;
3874 int new_dst_cpu;
3875 enum cpu_idle_type idle;
3876 long imbalance;
3877
3878 struct cpumask *cpus;
3879
3880 unsigned int flags;
3881
3882 unsigned int loop;
3883 unsigned int loop_break;
3884 unsigned int loop_max;
3885};
3886
/*
 * move_task - move a task from one runqueue to another runqueue.
 * Both runqueues must be locked.
 */
3891static void move_task(struct task_struct *p, struct lb_env *env)
3892{
3893 deactivate_task(env->src_rq, p, 0);
3894 set_task_cpu(p, env->dst_cpu);
3895 activate_task(env->dst_rq, p, 0);
3896 check_preempt_curr(env->dst_rq, p, 0);
3897}
3898
/*
 * Is this task likely cache-hot:
 */
3902static int
3903task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3904{
3905 s64 delta;
3906
3907 if (p->sched_class != &fair_sched_class)
3908 return 0;
3909
3910 if (unlikely(p->policy == SCHED_IDLE))
3911 return 0;
3912
	/*
	 * Buddy candidates are cache hot:
	 */
3916 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3917 (&p->se == cfs_rq_of(&p->se)->next ||
3918 &p->se == cfs_rq_of(&p->se)->last))
3919 return 1;
3920
3921 if (sysctl_sched_migration_cost == -1)
3922 return 1;
3923 if (sysctl_sched_migration_cost == 0)
3924 return 0;
3925
3926 delta = now - p->se.exec_start;
3927
3928 return delta < (s64)sysctl_sched_migration_cost;
3929}
3930
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
3934static
3935int can_migrate_task(struct task_struct *p, struct lb_env *env)
3936{
3937 int tsk_cache_hot = 0;
3938
	/*
	 * We do not migrate tasks that are:
	 * 1) throttled_lb_pair, or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) running (obviously), or
	 * 4) are cache-hot on their current CPU.
	 */
3945 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3946 return 0;
3947
3948 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3949 int cpu;
3950
3951 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3952
		/*
		 * Remember if this task can be migrated to any other cpu in
		 * our sched_group. We may want to revisit it if we couldn't
		 * meet load balance goals by pulling other tasks on src_cpu.
		 *
		 * Also avoid computing new_dst_cpu if we have already computed
		 * one in current iteration.
		 */
3961 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3962 return 0;
3963
3964
3965 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3966 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3967 env->flags |= LBF_SOME_PINNED;
3968 env->new_dst_cpu = cpu;
3969 break;
3970 }
3971 }
3972
3973 return 0;
3974 }
3975
	/* Record that we found at least one task that could run on dst_cpu */
3977 env->flags &= ~LBF_ALL_PINNED;
3978
3979 if (task_running(env->src_rq, p)) {
3980 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3981 return 0;
3982 }
3983
	/*
	 * Aggressive migration if:
	 * 1) task is cache cold, or
	 * 2) too many balance attempts have failed.
	 */
3989
3990 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
3991 if (!tsk_cache_hot ||
3992 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3993
3994 if (tsk_cache_hot) {
3995 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3996 schedstat_inc(p, se.statistics.nr_forced_migrations);
3997 }
3998
3999 return 1;
4000 }
4001
4002 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
4003 return 0;
4004}
4005
/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "domain".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
4013static int move_one_task(struct lb_env *env)
4014{
4015 struct task_struct *p, *n;
4016
4017 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
4018 if (!can_migrate_task(p, env))
4019 continue;
4020
4021 move_task(p, env);
4022
		/*
		 * Right now, this is only the second place move_task()
		 * is called, so we can safely collect move_task()
		 * stats here rather than inside move_task().
		 */
4027 schedstat_inc(env->sd, lb_gained[env->idle]);
4028 return 1;
4029 }
4030 return 0;
4031}
4032
4033static unsigned long task_h_load(struct task_struct *p);
4034
4035static const unsigned int sched_nr_migrate_break = 32;
4036
/*
 * move_tasks tries to move up to imbalance weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
4044static int move_tasks(struct lb_env *env)
4045{
4046 struct list_head *tasks = &env->src_rq->cfs_tasks;
4047 struct task_struct *p;
4048 unsigned long load;
4049 int pulled = 0;
4050
4051 if (env->imbalance <= 0)
4052 return 0;
4053
4054 while (!list_empty(tasks)) {
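		/* candidates that can't be moved are rotated to the tail below */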
4055 p = list_first_entry(tasks, struct task_struct, se.group_node);
4056
4057 env->loop++;
		/* We've more or less seen every task there is, call it quits */
4059 if (env->loop > env->loop_max)
4060 break;
4061
		/* take a breather every nr_migrate tasks */
4063 if (env->loop > env->loop_break) {
4064 env->loop_break += sched_nr_migrate_break;
4065 env->flags |= LBF_NEED_BREAK;
4066 break;
4067 }
4068
4069 if (!can_migrate_task(p, env))
4070 goto next;
4071
4072 load = task_h_load(p);
4073
4074 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4075 goto next;
4076
4077 if ((load / 2) > env->imbalance)
4078 goto next;
4079
4080 move_task(p, env);
4081 pulled++;
4082 env->imbalance -= load;
4083
4084#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is pulled to minimize
		 * the critical section.
		 */
4090 if (env->idle == CPU_NEWLY_IDLE)
4091 break;
4092#endif
4093
		/*
		 * We only want to steal up to the prescribed amount of
		 * weighted load.
		 */
4098 if (env->imbalance <= 0)
4099 break;
4100
4101 continue;
4102next:
4103 list_move_tail(&p->se.group_node, tasks);
4104 }
4105
	/*
	 * Right now, this is one of only two places move_task() is called,
	 * so we can safely collect move_task() stats here rather than
	 * inside move_task().
	 */
4111 schedstat_add(env->sd, lb_gained[env->idle], pulled);
4112
4113 return pulled;
4114}
4115
4116#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * update tg->load_weight by folding its per-cpu load_avg
 */
4120static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4121{
4122 struct sched_entity *se = tg->se[cpu];
4123 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4124
	/* throttled entities do not contribute to load */
4126 if (throttled_hierarchy(cfs_rq))
4127 return;
4128
4129 update_cfs_rq_blocked_load(cfs_rq, 1);
4130
4131 if (se) {
4132 update_entity_load_avg(se, 1);
4133
		/*
		 * We pivot on our runnable average having decayed to zero for
		 * list removal.  This generally implies that all our children
		 * have also been removed (modulo rounding error or bandwidth
		 * control); however, such cases are rare and we can fix these
		 * at enqueue.
		 *
		 * TODO: fix up out-of-order children on enqueue.
		 */
4142 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4143 list_del_leaf_cfs_rq(cfs_rq);
4144 } else {
4145 struct rq *rq = rq_of(cfs_rq);
4146 update_rq_runnable_avg(rq, rq->nr_running);
4147 }
4148}
4149
4150static void update_blocked_averages(int cpu)
4151{
4152 struct rq *rq = cpu_rq(cpu);
4153 struct cfs_rq *cfs_rq;
4154 unsigned long flags;
4155
4156 raw_spin_lock_irqsave(&rq->lock, flags);
4157 update_rq_clock(rq);
	/*
	 * Iterates the task_group tree in a bottom up fashion, see
	 * list_add_leaf_cfs_rq() for details.
	 */
4162 for_each_leaf_cfs_rq(rq, cfs_rq) {
		/*
		 * Note: We may want to consider periodically releasing
		 * rq->lock about these updates so that creating many task
		 * groups does not result in continually extending hold time.
		 */
4168 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
4169 }
4170
4171 raw_spin_unlock_irqrestore(&rq->lock, flags);
4172}
4173
/*
 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
4179static int tg_load_down(struct task_group *tg, void *data)
4180{
4181 unsigned long load;
4182 long cpu = (long)data;
4183
4184 if (!tg->parent) {
4185 load = cpu_rq(cpu)->avg.load_avg_contrib;
4186 } else {
4187 load = tg->parent->cfs_rq[cpu]->h_load;
4188 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4189 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4190 }
4191
4192 tg->cfs_rq[cpu]->h_load = load;
4193
4194 return 0;
4195}
4196
4197static void update_h_load(long cpu)
4198{
4199 struct rq *rq = cpu_rq(cpu);
4200 unsigned long now = jiffies;
4201
4202 if (rq->h_load_throttle == now)
4203 return;
4204
4205 rq->h_load_throttle = now;
4206
4207 rcu_read_lock();
4208 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
4209 rcu_read_unlock();
4210}
4211
4212static unsigned long task_h_load(struct task_struct *p)
4213{
4214 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4215
4216 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4217 cfs_rq->runnable_load_avg + 1);
4218}
4219#else
4220static inline void update_blocked_averages(int cpu)
4221{
4222}
4223
4224static inline void update_h_load(long cpu)
4225{
4226}
4227
4228static unsigned long task_h_load(struct task_struct *p)
4229{
4230 return p->se.avg.load_avg_contrib;
4231}
4232#endif
4233
/********** Helpers for find_busiest_group ************************/
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *		during load balancing.
 */
4239struct sd_lb_stats {
4240 struct sched_group *busiest;
4241 struct sched_group *this;
4242 unsigned long total_load;
4243 unsigned long total_pwr;
4244 unsigned long avg_load;
4245
	/* Statistics of this group */
4247 unsigned long this_load;
4248 unsigned long this_load_per_task;
4249 unsigned long this_nr_running;
4250 unsigned long this_has_capacity;
4251 unsigned int this_idle_cpus;
4252
	/* Statistics of the busiest group */
4254 unsigned int busiest_idle_cpus;
4255 unsigned long max_load;
4256 unsigned long busiest_load_per_task;
4257 unsigned long busiest_nr_running;
4258 unsigned long busiest_group_capacity;
4259 unsigned long busiest_has_capacity;
4260 unsigned int busiest_group_weight;
4261
4262 int group_imb;
4263};
4264
/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
4268struct sg_lb_stats {
4269 unsigned long avg_load;
4270 unsigned long group_load;
4271 unsigned long sum_nr_running;
4272 unsigned long sum_weighted_load;
4273 unsigned long group_capacity;
4274 unsigned long idle_cpus;
4275 unsigned long group_weight;
4276 int group_imb;
4277 int group_has_capacity;
4278};
4279
/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 *
 * Return: The load index.
 */
4287static inline int get_sd_load_idx(struct sched_domain *sd,
4288 enum cpu_idle_type idle)
4289{
4290 int load_idx;
4291
4292 switch (idle) {
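	/* each idle state biases toward a different cpu_load[] history index */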
4293 case CPU_NOT_IDLE:
4294 load_idx = sd->busy_idx;
4295 break;
4296
4297 case CPU_NEWLY_IDLE:
4298 load_idx = sd->newidle_idx;
4299 break;
4300 default:
4301 load_idx = sd->idle_idx;
4302 break;
4303 }
4304
4305 return load_idx;
4306}
4307
4308static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4309{
4310 return SCHED_POWER_SCALE;
4311}
4312
4313unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4314{
4315 return default_scale_freq_power(sd, cpu);
4316}
4317
4318static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4319{
4320 unsigned long weight = sd->span_weight;
4321 unsigned long smt_gain = sd->smt_gain;
4322
4323 smt_gain /= weight;
4324
4325 return smt_gain;
4326}
4327
4328unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4329{
4330 return default_scale_smt_power(sd, cpu);
4331}
4332
4333static unsigned long scale_rt_power(int cpu)
4334{
4335 struct rq *rq = cpu_rq(cpu);
4336 u64 total, available, age_stamp, avg;
4337
	/*
	 * Since we're reading these variables without serialization make sure
	 * we read them once before doing sanity checks on them.
	 */
4342 age_stamp = ACCESS_ONCE(rq->age_stamp);
4343 avg = ACCESS_ONCE(rq->rt_avg);
4344
4345 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
4346
4347 if (unlikely(total < avg)) {
		/* Ensures that power won't end up being negative */
4349 available = 0;
4350 } else {
4351 available = total - avg;
4352 }
4353
4354 if (unlikely((s64)total < SCHED_POWER_SCALE))
4355 total = SCHED_POWER_SCALE;
4356
4357 total >>= SCHED_POWER_SHIFT;
4358
4359 return div_u64(available, total);
4360}
4361
4362static void update_cpu_power(struct sched_domain *sd, int cpu)
4363{
4364 unsigned long weight = sd->span_weight;
4365 unsigned long power = SCHED_POWER_SCALE;
4366 struct sched_group *sdg = sd->groups;
4367
4368 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4369 if (sched_feat(ARCH_POWER))
4370 power *= arch_scale_smt_power(sd, cpu);
4371 else
4372 power *= default_scale_smt_power(sd, cpu);
4373
4374 power >>= SCHED_POWER_SHIFT;
4375 }
4376
4377 sdg->sgp->power_orig = power;
4378
4379 if (sched_feat(ARCH_POWER))
4380 power *= arch_scale_freq_power(sd, cpu);
4381 else
4382 power *= default_scale_freq_power(sd, cpu);
4383
4384 power >>= SCHED_POWER_SHIFT;
4385
4386 power *= scale_rt_power(cpu);
4387 power >>= SCHED_POWER_SHIFT;
4388
4389 if (!power)
4390 power = 1;
4391
4392 cpu_rq(cpu)->cpu_power = power;
4393 sdg->sgp->power = power;
4394}
4395
4396void update_group_power(struct sched_domain *sd, int cpu)
4397{
4398 struct sched_domain *child = sd->child;
4399 struct sched_group *group, *sdg = sd->groups;
4400 unsigned long power;
4401 unsigned long interval;
4402
4403 interval = msecs_to_jiffies(sd->balance_interval);
4404 interval = clamp(interval, 1UL, max_load_balance_interval);
4405 sdg->sgp->next_update = jiffies + interval;
4406
4407 if (!child) {
4408 update_cpu_power(sd, cpu);
4409 return;
4410 }
4411
4412 power = 0;
4413
4414 if (child->flags & SD_OVERLAP) {
		/*
		 * SD_OVERLAP domains cannot assume that child groups
		 * span the current group.
		 */
4420 for_each_cpu(cpu, sched_group_cpus(sdg))
4421 power += power_of(cpu);
4422 } else {
		/*
		 * !SD_OVERLAP domains can assume that child groups
		 * span the current group.
		 */
4428 group = child->groups;
4429 do {
4430 power += group->sgp->power;
4431 group = group->next;
4432 } while (group != child->groups);
4433 }
4434
4435 sdg->sgp->power_orig = sdg->sgp->power = power;
4436}
4437
/*
 * Try and fix up capacity for tiny siblings, this is needed when
 * things like SD_ASYM_PACKING need f_b_g to select another sibling
 * which on its own isn't powerful enough.
 *
 * See update_sd_pick_busiest() and check_asym_packing().
 */
4445static inline int
4446fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4447{
	/*
	 * Only siblings can have significantly less than SCHED_POWER_SCALE
	 */
4451 if (!(sd->flags & SD_SHARE_CPUPOWER))
4452 return 0;
4453
	/*
	 * If ~90% of the cpu_power is still there, we're good.
	 */
4457 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4458 return 1;
4459
4460 return 0;
4461}
4462
/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
4472static inline void update_sg_lb_stats(struct lb_env *env,
4473 struct sched_group *group, int load_idx,
4474 int local_group, int *balance, struct sg_lb_stats *sgs)
4475{
4476 unsigned long nr_running, max_nr_running, min_nr_running;
4477 unsigned long load, max_cpu_load, min_cpu_load;
4478 unsigned int balance_cpu = -1, first_idle_cpu = 0;
4479 unsigned long avg_load_per_task = 0;
4480 int i;
4481
4482 if (local_group)
4483 balance_cpu = group_balance_cpu(group);
4484
	/* Tally up the load of all CPUs in the group */
4486 max_cpu_load = 0;
4487 min_cpu_load = ~0UL;
4488 max_nr_running = 0;
4489 min_nr_running = ~0UL;
4490
4491 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4492 struct rq *rq = cpu_rq(i);
4493
4494 nr_running = rq->nr_running;
4495
		/* Bias balancing toward cpus of our domain */
4497 if (local_group) {
4498 if (idle_cpu(i) && !first_idle_cpu &&
4499 cpumask_test_cpu(i, sched_group_mask(group))) {
4500 first_idle_cpu = 1;
4501 balance_cpu = i;
4502 }
4503
4504 load = target_load(i, load_idx);
4505 } else {
4506 load = source_load(i, load_idx);
4507 if (load > max_cpu_load)
4508 max_cpu_load = load;
4509 if (min_cpu_load > load)
4510 min_cpu_load = load;
4511
4512 if (nr_running > max_nr_running)
4513 max_nr_running = nr_running;
4514 if (min_nr_running > nr_running)
4515 min_nr_running = nr_running;
4516 }
4517
4518 sgs->group_load += load;
4519 sgs->sum_nr_running += nr_running;
4520 sgs->sum_weighted_load += weighted_cpuload(i);
4521 if (idle_cpu(i))
4522 sgs->idle_cpus++;
4523 }
4524
	/*
	 * First idle cpu or the first cpu(busiest) in this sched group
	 * is eligible for doing load balancing at this and above
	 * domains. In the newly idle case, we will allow all the cpu's
	 * to do the newly idle load balance.
	 */
4531 if (local_group) {
4532 if (env->idle != CPU_NEWLY_IDLE) {
4533 if (balance_cpu != env->dst_cpu) {
4534 *balance = 0;
4535 return;
4536 }
4537 update_group_power(env->sd, env->dst_cpu);
4538 } else if (time_after_eq(jiffies, group->sgp->next_update))
4539 update_group_power(env->sd, env->dst_cpu);
4540 }
4541
	/* Adjust by relative CPU power of the group */
4543 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
4544
	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of a task.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 *      might not be a suitable number - should we keep a
	 *      normalized nr_running number somewhere that negates
	 *      the hierarchy?
	 */
4554 if (sgs->sum_nr_running)
4555 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4556
4557 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4558 (max_nr_running - min_nr_running) > 1)
4559 sgs->group_imb = 1;
4560
4561 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4562 SCHED_POWER_SCALE);
4563 if (!sgs->group_capacity)
4564 sgs->group_capacity = fix_small_capacity(env->sd, group);
4565 sgs->group_weight = group->group_weight;
4566
4567 if (sgs->group_capacity > sgs->sum_nr_running)
4568 sgs->group_has_capacity = 1;
4569}
4570
/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected
 * busiest group. %false otherwise.
 */
4584static bool update_sd_pick_busiest(struct lb_env *env,
4585 struct sd_lb_stats *sds,
4586 struct sched_group *sg,
4587 struct sg_lb_stats *sgs)
4588{
4589 if (sgs->avg_load <= sds->max_load)
4590 return false;
4591
4592 if (sgs->sum_nr_running > sgs->group_capacity)
4593 return true;
4594
4595 if (sgs->group_imb)
4596 return true;
4597
	/*
	 * ASYM_PACKING needs to move all the work to the lowest
	 * numbered CPUs in the group, therefore mark all groups
	 * higher than ourself as busy.
	 */
4603 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4604 env->dst_cpu < group_first_cpu(sg)) {
4605 if (!sds->busiest)
4606 return true;
4607
4608 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4609 return true;
4610 }
4611
4612 return false;
4613}
4614
/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
4621static inline void update_sd_lb_stats(struct lb_env *env,
4622 int *balance, struct sd_lb_stats *sds)
4623{
4624 struct sched_domain *child = env->sd->child;
4625 struct sched_group *sg = env->sd->groups;
4626 struct sg_lb_stats sgs;
4627 int load_idx, prefer_sibling = 0;
4628
4629 if (child && child->flags & SD_PREFER_SIBLING)
4630 prefer_sibling = 1;
4631
4632 load_idx = get_sd_load_idx(env->sd, env->idle);
4633
4634 do {
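		/* accumulate per-group statistics over every group in this domain */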
4635 int local_group;
4636
4637 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4638 memset(&sgs, 0, sizeof(sgs));
4639 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
4640
4641 if (local_group && !(*balance))
4642 return;
4643
4644 sds->total_load += sgs.group_load;
4645 sds->total_pwr += sg->sgp->power;
		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the sg capacity to one so that we'll try
		 * and move all the excess tasks away. We lower the capacity
		 * of a group only if the local group has the capacity to fit
		 * these excess tasks, i.e. nr_running < group_capacity. The
		 * extra check prevents the case where you always pull from the
		 * heaviest group when it is already under-utilized (possible
		 * with a large weight task outweighs the tasks on the system).
		 */
4657 if (prefer_sibling && !local_group && sds->this_has_capacity)
4658 sgs.group_capacity = min(sgs.group_capacity, 1UL);
4659
4660 if (local_group) {
4661 sds->this_load = sgs.avg_load;
4662 sds->this = sg;
4663 sds->this_nr_running = sgs.sum_nr_running;
4664 sds->this_load_per_task = sgs.sum_weighted_load;
4665 sds->this_has_capacity = sgs.group_has_capacity;
4666 sds->this_idle_cpus = sgs.idle_cpus;
4667 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4668 sds->max_load = sgs.avg_load;
4669 sds->busiest = sg;
4670 sds->busiest_nr_running = sgs.sum_nr_running;
4671 sds->busiest_idle_cpus = sgs.idle_cpus;
4672 sds->busiest_group_capacity = sgs.group_capacity;
4673 sds->busiest_load_per_task = sgs.sum_weighted_load;
4674 sds->busiest_has_capacity = sgs.group_has_capacity;
4675 sds->busiest_group_weight = sgs.group_weight;
4676 sds->group_imb = sgs.group_imb;
4677 }
4678
4679 sg = sg->next;
4680 } while (sg != env->sd->groups);
4681}
4682
/**
 * check_asym_packing - Check to see if the group is packed into the
 *			sched domain.
 *
 * This is primarily intended to be used at the sibling level.  Some
 * cores like POWER7 prefer to use lower numbered SMT threads.  In the
 * case of POWER7, it can move to lower SMT modes only when higher
 * threads are idle.  When in lower SMT modes, the threads will
 * perform better since they share less core resources.  Hence when we
 * have idle threads, we want them to be the higher ones.
 *
 * This packing function is run on idle threads.  It checks to see if
 * the busiest CPU in this domain (core in the P7 case) has a higher
 * CPU number than the packing function is being run on.  Here we are
 * assuming a lower CPU number will be equivalent to a lower SMT thread
 * number.
 *
 * Return: 1 when packing is required and a task should be moved to
 * this CPU.  The amount of the imbalance is returned in env->imbalance.
 *
 * @env: The load balancing environment.
 * @sds: Statistics of the sched_domain which is to be packed
 */
4706static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4707{
4708 int busiest_cpu;
4709
4710 if (!(env->sd->flags & SD_ASYM_PACKING))
4711 return 0;
4712
4713 if (!sds->busiest)
4714 return 0;
4715
4716 busiest_cpu = group_first_cpu(sds->busiest);
4717 if (env->dst_cpu > busiest_cpu)
4718 return 0;
4719
4720 env->imbalance = DIV_ROUND_CLOSEST(
4721 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4722
4723 return 1;
4724}
4725
/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *			amongst the groups of a sched_domain, during
 *			load balancing.
 * @env: The load balancing environment.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 */
4733static inline
4734void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4735{
4736 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4737 unsigned int imbn = 2;
4738 unsigned long scaled_busy_load_per_task;
4739
4740 if (sds->this_nr_running) {
4741 sds->this_load_per_task /= sds->this_nr_running;
4742 if (sds->busiest_load_per_task >
4743 sds->this_load_per_task)
4744 imbn = 1;
4745 } else {
4746 sds->this_load_per_task =
4747 cpu_avg_load_per_task(env->dst_cpu);
4748 }
4749
4750 scaled_busy_load_per_task = sds->busiest_load_per_task
4751 * SCHED_POWER_SCALE;
4752 scaled_busy_load_per_task /= sds->busiest->sgp->power;
4753
4754 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4755 (scaled_busy_load_per_task * imbn)) {
4756 env->imbalance = sds->busiest_load_per_task;
4757 return;
4758 }
4759
	/*
	 * OK, we don't have enough imbalance to justify moving tasks,
	 * however we may be able to increase total CPU power used by
	 * moving them.
	 */
4765
4766 pwr_now += sds->busiest->sgp->power *
4767 min(sds->busiest_load_per_task, sds->max_load);
4768 pwr_now += sds->this->sgp->power *
4769 min(sds->this_load_per_task, sds->this_load);
4770 pwr_now /= SCHED_POWER_SCALE;
4771
	/* Amount of load we'd subtract */
4773 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4774 sds->busiest->sgp->power;
4775 if (sds->max_load > tmp)
4776 pwr_move += sds->busiest->sgp->power *
4777 min(sds->busiest_load_per_task, sds->max_load - tmp);
4778
	/* Amount of load we'd add */
4780 if (sds->max_load * sds->busiest->sgp->power <
4781 sds->busiest_load_per_task * SCHED_POWER_SCALE)
4782 tmp = (sds->max_load * sds->busiest->sgp->power) /
4783 sds->this->sgp->power;
4784 else
4785 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4786 sds->this->sgp->power;
4787 pwr_move += sds->this->sgp->power *
4788 min(sds->this_load_per_task, sds->this_load + tmp);
4789 pwr_move /= SCHED_POWER_SCALE;
4790
	/* Move if we gain throughput */
4792 if (pwr_move > pwr_now)
4793 env->imbalance = sds->busiest_load_per_task;
4794}
4795
/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 */
4802static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4803{
4804 unsigned long max_pull, load_above_capacity = ~0UL;
4805
4806 sds->busiest_load_per_task /= sds->busiest_nr_running;
4807 if (sds->group_imb) {
4808 sds->busiest_load_per_task =
4809 min(sds->busiest_load_per_task, sds->avg_load);
4810 }
4811
	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load(as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
4817 if (sds->max_load < sds->avg_load) {
4818 env->imbalance = 0;
4819 return fix_small_imbalance(env, sds);
4820 }
4821
4822 if (!sds->group_imb) {
		/*
		 * Don't want to pull so many tasks that a group would go idle.
		 */
4826 load_above_capacity = (sds->busiest_nr_running -
4827 sds->busiest_group_capacity);
4828
4829 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4830
4831 load_above_capacity /= sds->busiest->sgp->power;
4832 }
4833
	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load. At the same time,
	 * we also don't want to reduce the group load below the group capacity
	 * (so that we can implement power-savings policies etc). Thus we look
	 * for the minimum possible imbalance.
	 * Be careful of negative numbers as they'll appear as very large values
	 * with unsigned longs.
	 */
4844 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4845
	/* How much load to actually move to equalise the imbalance */
4847 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4848 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4849 / SCHED_POWER_SCALE;
4850
	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
4857 if (env->imbalance < sds->busiest_load_per_task)
4858 return fix_small_imbalance(env, sds);
4859
4860}
4861
/******* find_busiest_group() helpers end here *********************/

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @env: The load balancing environment.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this_level.
 *
 * Return:	- The busiest group if imbalance exists.
 *		- If no imbalance and user has opted for power-savings balance,
 *		   return the least loaded group whose CPUs can be
 *		   put to idle by rebalancing its tasks onto our group.
 */
4883static struct sched_group *
4884find_busiest_group(struct lb_env *env, int *balance)
4885{
4886 struct sd_lb_stats sds;
4887
4888 memset(&sds, 0, sizeof(sds));
4889
	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
4894 update_sd_lb_stats(env, balance, &sds);
4895
	/*
	 * this_cpu is not the appropriate cpu to perform load balancing at
	 * this level.
	 */
4900 if (!(*balance))
4901 goto ret;
4902
4903 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4904 check_asym_packing(env, &sds))
4905 return sds.busiest;
4906
	/* There is no busy sibling group to pull tasks from */
4908 if (!sds.busiest || sds.busiest_nr_running == 0)
4909 goto out_balanced;
4910
4911 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4912
	/*
	 * If the busiest group is imbalanced the below checks don't
	 * work because they assume all things are equal, which typically
	 * isn't true due to cpus_allowed constraints and the like.
	 */
4918 if (sds.group_imb)
4919 goto force_balance;
4920
	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4922 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4923 !sds.busiest_has_capacity)
4924 goto force_balance;
4925
	/*
	 * If the local group is more busy than the selected busiest group
	 * don't try and pull any tasks.
	 */
4930 if (sds.this_load >= sds.max_load)
4931 goto out_balanced;
4932
	/*
	 * Don't pull any tasks if this group is already above the domain
	 * average load.
	 */
4937 if (sds.this_load >= sds.avg_load)
4938 goto out_balanced;
4939
4940 if (env->idle == CPU_IDLE) {
		/*
		 * This cpu is idle. If the busiest group load doesn't
		 * have more tasks than the number of available cpu's and
		 * there is no imbalance between this and busiest group
		 * wrt to idle cpu's, it is balanced.
		 */
4947 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4948 sds.busiest_nr_running <= sds.busiest_group_weight)
4949 goto out_balanced;
4950 } else {
		/*
		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
		 * imbalance_pct to be conservative.
		 */
4955 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4956 goto out_balanced;
4957 }
4958
4959force_balance:
	/* Looks like there is an imbalance. Compute it */
4961 calculate_imbalance(env, &sds);
4962 return sds.busiest;
4963
4964out_balanced:
4965ret:
4966 env->imbalance = 0;
4967 return NULL;
4968}
4969
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
4973static struct rq *find_busiest_queue(struct lb_env *env,
4974 struct sched_group *group)
4975{
4976 struct rq *busiest = NULL, *rq;
4977 unsigned long max_load = 0;
4978 int i;
4979
4980 for_each_cpu(i, sched_group_cpus(group)) {
4981 unsigned long power = power_of(i);
4982 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4983 SCHED_POWER_SCALE);
4984 unsigned long wl;
4985
4986 if (!capacity)
4987 capacity = fix_small_capacity(env->sd, group);
4988
4989 if (!cpumask_test_cpu(i, env->cpus))
4990 continue;
4991
4992 rq = cpu_rq(i);
4993 wl = weighted_cpuload(i);
4994
		/*
		 * When comparing with imbalance, use weighted_cpuload()
		 * which is not scaled with the cpu power.
		 */
4999 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
5000 continue;
5001
		/*
		 * For the load comparisons with the other cpu's, consider
		 * the weighted_cpuload() scaled with the cpu power, so that
		 * the load can be moved away from the cpu that is potentially
		 * running at a lower capacity.
		 */
5008 wl = (wl * SCHED_POWER_SCALE) / power;
5009
5010 if (wl > max_load) {
5011 max_load = wl;
5012 busiest = rq;
5013 }
5014 }
5015
5016 return busiest;
5017}
5018
/*
 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 * so long as it is large enough.
 */
5023#define MAX_PINNED_INTERVAL 512
5024
/* Working cpumask for load_balance and load_balance_newidle. */
5026DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5027
5028static int need_active_balance(struct lb_env *env)
5029{
5030 struct sched_domain *sd = env->sd;
5031
5032 if (env->idle == CPU_NEWLY_IDLE) {
5033
		/*
		 * ASYM_PACKING needs to force migrate tasks from busy but
		 * higher numbered CPUs in order to pack all tasks in the
		 * lowest numbered CPUs.
		 */
5039 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
5040 return 1;
5041 }
5042
5043 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
5044}
5045
5046static int active_load_balance_cpu_stop(void *data);
5047
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
5052static int load_balance(int this_cpu, struct rq *this_rq,
5053 struct sched_domain *sd, enum cpu_idle_type idle,
5054 int *balance)
5055{
5056 int ld_moved, cur_ld_moved, active_balance = 0;
5057 struct sched_group *group;
5058 struct rq *busiest;
5059 unsigned long flags;
5060 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
5061
5062 struct lb_env env = {
5063 .sd = sd,
5064 .dst_cpu = this_cpu,
5065 .dst_rq = this_rq,
5066 .dst_grpmask = sched_group_cpus(sd->groups),
5067 .idle = idle,
5068 .loop_break = sched_nr_migrate_break,
5069 .cpus = cpus,
5070 };
5071
	/*
	 * For NEWLY_IDLE load_balancing, we don't need to consider
	 * other cpus in our group
	 */
5076 if (idle == CPU_NEWLY_IDLE)
5077 env.dst_grpmask = NULL;
5078
5079 cpumask_copy(cpus, cpu_active_mask);
5080
5081 schedstat_inc(sd, lb_count[idle]);
5082
5083redo:
5084 group = find_busiest_group(&env, balance);
5085
5086 if (*balance == 0)
5087 goto out_balanced;
5088
5089 if (!group) {
5090 schedstat_inc(sd, lb_nobusyg[idle]);
5091 goto out_balanced;
5092 }
5093
5094 busiest = find_busiest_queue(&env, group);
5095 if (!busiest) {
5096 schedstat_inc(sd, lb_nobusyq[idle]);
5097 goto out_balanced;
5098 }
5099
5100 BUG_ON(busiest == env.dst_rq);
5101
5102 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5103
5104 ld_moved = 0;
5105 if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
5112 env.flags |= LBF_ALL_PINNED;
5113 env.src_cpu = busiest->cpu;
5114 env.src_rq = busiest;
5115 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5116
5117 update_h_load(env.src_cpu);
5118more_balance:
5119 local_irq_save(flags);
5120 double_rq_lock(env.dst_rq, busiest);
5121
		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved     - cumulative load moved across iterations
		 */
5126 cur_ld_moved = move_tasks(&env);
5127 ld_moved += cur_ld_moved;
5128 double_rq_unlock(env.dst_rq, busiest);
5129 local_irq_restore(flags);
5130
		/*
		 * some other cpu did the load balance for us.
		 */
5134 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5135 resched_cpu(env.dst_cpu);
5136
5137 if (env.flags & LBF_NEED_BREAK) {
5138 env.flags &= ~LBF_NEED_BREAK;
5139 goto more_balance;
5140 }
5141
		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of cpus in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given cpu. In addition to that cpu itself (or an
		 * ilb_cpu acting on its behalf when it is nohz-idle), we now
		 * have balance_cpu in a position to move load to it. In rare
		 * situations this may cause conflicts, with both deciding
		 * independently to move load to the same cpu.
		 */
5161 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5162
5163 env.dst_rq = cpu_rq(env.new_dst_cpu);
5164 env.dst_cpu = env.new_dst_cpu;
5165 env.flags &= ~LBF_SOME_PINNED;
5166 env.loop = 0;
5167 env.loop_break = sched_nr_migrate_break;
5168
			/* Prevent to re-select dst_cpu via env's cpus */
5170 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5171
			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
5176 goto more_balance;
5177 }
5178
		/* All tasks on this runqueue were pinned by CPU affinity */
5180 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5181 cpumask_clear_cpu(cpu_of(busiest), cpus);
5182 if (!cpumask_empty(cpus)) {
5183 env.loop = 0;
5184 env.loop_break = sched_nr_migrate_break;
5185 goto redo;
5186 }
5187 goto out_balanced;
5188 }
5189 }
5190
5191 if (!ld_moved) {
5192 schedstat_inc(sd, lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
5199 if (idle != CPU_NEWLY_IDLE)
5200 sd->nr_balance_failed++;
5201
5202 if (need_active_balance(&env)) {
5203 raw_spin_lock_irqsave(&busiest->lock, flags);
5204
			/* don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest cpu can't be
			 * moved to this_cpu
			 */
5209 if (!cpumask_test_cpu(this_cpu,
5210 tsk_cpus_allowed(busiest->curr))) {
5211 raw_spin_unlock_irqrestore(&busiest->lock,
5212 flags);
5213 env.flags |= LBF_ALL_PINNED;
5214 goto out_one_pinned;
5215 }
			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work.  Once set, it's cleared
			 * only after active load balance is finished.
			 */
5221
5222 if (!busiest->active_balance) {
5223 busiest->active_balance = 1;
5224 busiest->push_cpu = this_cpu;
5225 active_balance = 1;
5226 }
5227 raw_spin_unlock_irqrestore(&busiest->lock, flags);
5228
5229 if (active_balance) {
5230 stop_one_cpu_nowait(cpu_of(busiest),
5231 active_load_balance_cpu_stop, busiest,
5232 &busiest->active_balance_work);
5233 }
5234
			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
5239 sd->nr_balance_failed = sd->cache_nice_tries+1;
5240 }
5241 } else
5242 sd->nr_balance_failed = 0;
5243
5244 if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
5246 sd->balance_interval = sd->min_interval;
5247 } else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
5253
5254 if (sd->balance_interval < sd->max_interval)
5255 sd->balance_interval *= 2;
5256 }
5257
5258 goto out;
5259
5260out_balanced:
5261 schedstat_inc(sd, lb_balanced[idle]);
5262
5263 sd->nr_balance_failed = 0;
5264
5265out_one_pinned:
	/* tune up the balancing interval */
5267 if (((env.flags & LBF_ALL_PINNED) &&
5268 sd->balance_interval < MAX_PINNED_INTERVAL) ||
5269 (sd->balance_interval < sd->max_interval))
5270 sd->balance_interval *= 2;
5271
5272 ld_moved = 0;
5273out:
5274 return ld_moved;
5275}
5276
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
5281void idle_balance(int this_cpu, struct rq *this_rq)
5282{
5283 struct sched_domain *sd;
5284 int pulled_task = 0;
5285 unsigned long next_balance = jiffies + HZ;
5286
5287 this_rq->idle_stamp = rq_clock(this_rq);
5288
5289 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5290 return;
5291
	/*
	 * Drop the rq->lock, but keep IRQ/preempt disabled.
	 */
5295 raw_spin_unlock(&this_rq->lock);
5296
5297 update_blocked_averages(this_cpu);
5298 rcu_read_lock();
5299 for_each_domain(this_cpu, sd) {
5300 unsigned long interval;
5301 int balance = 1;
5302
5303 if (!(sd->flags & SD_LOAD_BALANCE))
5304 continue;
5305
5306 if (sd->flags & SD_BALANCE_NEWIDLE) {
			/* If we've pulled tasks over stop searching: */
5308 pulled_task = load_balance(this_cpu, this_rq,
5309 sd, CPU_NEWLY_IDLE, &balance);
5310 }
5311
5312 interval = msecs_to_jiffies(sd->balance_interval);
5313 if (time_after(next_balance, sd->last_balance + interval))
5314 next_balance = sd->last_balance + interval;
5315 if (pulled_task) {
5316 this_rq->idle_stamp = 0;
5317 break;
5318 }
5319 }
5320 rcu_read_unlock();
5321
5322 raw_spin_lock(&this_rq->lock);
5323
5324 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
5329 this_rq->next_balance = next_balance;
5330 }
5331}
5332
/*
 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */
5339static int active_load_balance_cpu_stop(void *data)
5340{
5341 struct rq *busiest_rq = data;
5342 int busiest_cpu = cpu_of(busiest_rq);
5343 int target_cpu = busiest_rq->push_cpu;
5344 struct rq *target_rq = cpu_rq(target_cpu);
5345 struct sched_domain *sd;
5346
5347 raw_spin_lock_irq(&busiest_rq->lock);
5348
	/* make sure the requested cpu hasn't gone down in the meantime */
5350 if (unlikely(busiest_cpu != smp_processor_id() ||
5351 !busiest_rq->active_balance))
5352 goto out_unlock;
5353
	/* Is there any task to move? */
5355 if (busiest_rq->nr_running <= 1)
5356 goto out_unlock;
5357
	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
5363 BUG_ON(busiest_rq == target_rq);
5364
	/* move a task from busiest_rq to target_rq */
5366 double_lock_balance(busiest_rq, target_rq);
5367
	/* Search for an sd spanning us and the target CPU. */
5369 rcu_read_lock();
5370 for_each_domain(target_cpu, sd) {
5371 if ((sd->flags & SD_LOAD_BALANCE) &&
5372 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5373 break;
5374 }
5375
5376 if (likely(sd)) {
5377 struct lb_env env = {
5378 .sd = sd,
5379 .dst_cpu = target_cpu,
5380 .dst_rq = target_rq,
5381 .src_cpu = busiest_rq->cpu,
5382 .src_rq = busiest_rq,
5383 .idle = CPU_IDLE,
5384 };
5385
5386 schedstat_inc(sd, alb_count);
5387
5388 if (move_one_task(&env))
5389 schedstat_inc(sd, alb_pushed);
5390 else
5391 schedstat_inc(sd, alb_failed);
5392 }
5393 rcu_read_unlock();
5394 double_unlock_balance(busiest_rq, target_rq);
5395out_unlock:
5396 busiest_rq->active_balance = 0;
5397 raw_spin_unlock_irq(&busiest_rq->lock);
5398 return 0;
5399}
5400
5401#ifdef CONFIG_NO_HZ_COMMON
/*
 * idle load balancing details
 * - When one of the busy CPUs notices that there may be an idle rebalancing
 *   needed, they will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
5408static struct {
5409 cpumask_var_t idle_cpus_mask;
5410 atomic_t nr_cpus;
5411 unsigned long next_balance;
5412} nohz ____cacheline_aligned;
5413
5414static inline int find_new_ilb(int call_cpu)
5415{
5416 int ilb = cpumask_first(nohz.idle_cpus_mask);
5417
5418 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5419 return ilb;
5420
5421 return nr_cpu_ids;
5422}
5423
/*
 * Kick an idle cpu to do the nohz balancing, if it is time for it; the
 * first cpu in nohz.idle_cpus_mask that is actually idle is chosen to
 * run the idle load balance on behalf of all the tick-stopped cpus.
 */
5429static void nohz_balancer_kick(int cpu)
5430{
5431 int ilb_cpu;
5432
5433 nohz.next_balance++;
5434
5435 ilb_cpu = find_new_ilb(cpu);
5436
5437 if (ilb_cpu >= nr_cpu_ids)
5438 return;
5439
5440 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5441 return;
5442
	/*
	 * Use smp_send_reschedule() instead of resched_cpu().
	 * This way we generate a sched IPI on the target cpu which
	 * is idle. And the softirq performing nohz idle load balance
	 * will be run before returning from the IPI.
	 */
5448 smp_send_reschedule(ilb_cpu);
5449 return;
5450}
5451
5452static inline void nohz_balance_exit_idle(int cpu)
5453{
5454 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5455 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5456 atomic_dec(&nohz.nr_cpus);
5457 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5458 }
5459}
5460
5461static inline void set_cpu_sd_state_busy(void)
5462{
5463 struct sched_domain *sd;
5464
5465 rcu_read_lock();
5466 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5467
5468 if (!sd || !sd->nohz_idle)
5469 goto unlock;
5470 sd->nohz_idle = 0;
5471
5472 for (; sd; sd = sd->parent)
5473 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5474unlock:
5475 rcu_read_unlock();
5476}
5477
5478void set_cpu_sd_state_idle(void)
5479{
5480 struct sched_domain *sd;
5481
5482 rcu_read_lock();
5483 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5484
5485 if (!sd || sd->nohz_idle)
5486 goto unlock;
5487 sd->nohz_idle = 1;
5488
5489 for (; sd; sd = sd->parent)
5490 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5491unlock:
5492 rcu_read_unlock();
5493}
5494
/*
 * This routine will record that the cpu is going idle with tick stopped.
 * This info will be used in performing idle load balancing in the future.
 */
5499void nohz_balance_enter_idle(int cpu)
5500{
	/*
	 * If this cpu is going down, then nothing needs to be done.
	 */
5504 if (!cpu_active(cpu))
5505 return;
5506
5507 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5508 return;
5509
5510 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5511 atomic_inc(&nohz.nr_cpus);
5512 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5513}
5514
5515static int sched_ilb_notifier(struct notifier_block *nfb,
5516 unsigned long action, void *hcpu)
5517{
5518 switch (action & ~CPU_TASKS_FROZEN) {
5519 case CPU_DYING:
5520 nohz_balance_exit_idle(smp_processor_id());
5521 return NOTIFY_OK;
5522 default:
5523 return NOTIFY_DONE;
5524 }
5525}
5526#endif
5527
5528static DEFINE_SPINLOCK(balancing);
5529
/*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross talk.
 */
5534void update_max_interval(void)
5535{
5536 max_load_balance_interval = HZ*num_online_cpus()/10;
5537}
5538
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 */
5545static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5546{
5547 int balance = 1;
5548 struct rq *rq = cpu_rq(cpu);
5549 unsigned long interval;
5550 struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
5552 unsigned long next_balance = jiffies + 60*HZ;
5553 int update_next_balance = 0;
5554 int need_serialize;
5555
5556 update_blocked_averages(cpu);
5557
5558 rcu_read_lock();
5559 for_each_domain(cpu, sd) {
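		/* walk the domains bottom-up; each level has its own balance interval */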
5560 if (!(sd->flags & SD_LOAD_BALANCE))
5561 continue;
5562
5563 interval = sd->balance_interval;
5564 if (idle != CPU_IDLE)
5565 interval *= sd->busy_factor;
5566
		/* scale ms to jiffies */
5568 interval = msecs_to_jiffies(interval);
5569 interval = clamp(interval, 1UL, max_load_balance_interval);
5570
5571 need_serialize = sd->flags & SD_SERIALIZE;
5572
5573 if (need_serialize) {
5574 if (!spin_trylock(&balancing))
5575 goto out;
5576 }
5577
5578 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5579 if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * The LBF_SOME_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
5585 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5586 }
5587 sd->last_balance = jiffies;
5588 }
5589 if (need_serialize)
5590 spin_unlock(&balancing);
5591out:
5592 if (time_after(next_balance, sd->last_balance + interval)) {
5593 next_balance = sd->last_balance + interval;
5594 update_next_balance = 1;
5595 }
5596
		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
5602 if (!balance)
5603 break;
5604 }
5605 rcu_read_unlock();
5606
	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
5612 if (likely(update_next_balance))
5613 rq->next_balance = next_balance;
5614}
5615
5616#ifdef CONFIG_NO_HZ_COMMON
/*
 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
5621static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5622{
5623 struct rq *this_rq = cpu_rq(this_cpu);
5624 struct rq *rq;
5625 int balance_cpu;
5626
5627 if (idle != CPU_IDLE ||
5628 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5629 goto end;
5630
5631 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
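		/* rebalance on behalf of each tick-stopped idle cpu */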
5632 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5633 continue;
5634
		/*
		 * If this cpu gets work to do, stop the load balancing
		 * work being done for other cpus. Next load
		 * balancing owner will pick it up.
		 */
5640 if (need_resched())
5641 break;
5642
5643 rq = cpu_rq(balance_cpu);
5644
5645 raw_spin_lock_irq(&rq->lock);
5646 update_rq_clock(rq);
5647 update_idle_cpu_load(rq);
5648 raw_spin_unlock_irq(&rq->lock);
5649
5650 rebalance_domains(balance_cpu, CPU_IDLE);
5651
5652 if (time_after(this_rq->next_balance, rq->next_balance))
5653 this_rq->next_balance = rq->next_balance;
5654 }
5655 nohz.next_balance = this_rq->next_balance;
5656end:
5657 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5658}
5659
/*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle cpu in the system.
 *   - This rq has more than one task.
 *   - At any scheduler domain level, this cpu's scheduler group has multiple
 *     busy cpu's exceeding the group's power.
 *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
 *     domain span are idle.
 */
5669static inline int nohz_kick_needed(struct rq *rq, int cpu)
5670{
5671 unsigned long now = jiffies;
5672 struct sched_domain *sd;
5673
5674 if (unlikely(idle_cpu(cpu)))
5675 return 0;
5676
	/*
	 * We may be recently in ticked or tickless idle mode. At the first
	 * busy tick after returning from idle, we will update the busy stats.
	 */
5681 set_cpu_sd_state_busy();
5682 nohz_balance_exit_idle(cpu);
5683
	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
5688 if (likely(!atomic_read(&nohz.nr_cpus)))
5689 return 0;
5690
5691 if (time_before(now, nohz.next_balance))
5692 return 0;
5693
5694 if (rq->nr_running >= 2)
5695 goto need_kick;
5696
5697 rcu_read_lock();
5698 for_each_domain(cpu, sd) {
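		/* look for a domain level that satisfies one of the kick conditions above */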
5699 struct sched_group *sg = sd->groups;
5700 struct sched_group_power *sgp = sg->sgp;
5701 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5702
5703 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5704 goto need_kick_unlock;
5705
5706 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5707 && (cpumask_first_and(nohz.idle_cpus_mask,
5708 sched_domain_span(sd)) < cpu))
5709 goto need_kick_unlock;
5710
5711 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5712 break;
5713 }
5714 rcu_read_unlock();
5715 return 0;
5716
5717need_kick_unlock:
5718 rcu_read_unlock();
5719need_kick:
5720 return 1;
5721}
5722#else
5723static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5724#endif
5725
5726
5727
5728
5729
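/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */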
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;

	rebalance_domains(this_cpu, idle);

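	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */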
	nohz_idle_balance(this_cpu, idle);
}

static inline int on_null_domain(int cpu)
{
	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
}

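/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */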
void trigger_load_balance(struct rq *rq, int cpu)
{
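	/* Don't need to rebalance while attached to NULL domain */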
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
		nohz_balancer_kick(cpu);
#endif
}

static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}

static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

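	/* Ensure any throttled groups are reachable by pick_next_task */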
	unthrottle_offline_cfs_rqs(rq);
}

#endif /* CONFIG_SMP */

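/*
 * scheduler tick hitting a task of our scheduling class:
 */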
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (numabalancing_enabled)
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}

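/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */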
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	if (unlikely(task_cpu(p) != this_cpu)) {
		rcu_read_lock();
		__set_task_cpu(p, this_cpu);
		rcu_read_unlock();
	}

	update_curr(cfs_rq);

	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
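		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */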
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

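/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */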
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!p->se.on_rq)
		return;

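	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's
	 */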
	if (rq->curr == p) {
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

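	/*
	 * Ensure the task's vruntime is normalized, so that when it's
	 * switched back to the fair class the enqueue_entity(.flags=0) will
	 * do the right thing.
	 *
	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
	 * have normalized the vruntime, if it's !on_rq, then only when
	 * the task is sleeping will it still have non-normalized vruntime.
	 */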
	if (!se->on_rq && p->state != TASK_RUNNING) {
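		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */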
		place_entity(cfs_rq, se, 0);
		se->vruntime -= cfs_rq->min_vruntime;
	}

#ifdef CONFIG_SMP
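	/*
	 * Remove our load from contribution when we leave sched_fair
	 * and ensure we don't carry in an old decay_count if we
	 * switch back.
	 */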
	if (p->se.avg.decay_count) {
		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
		__synchronize_entity_decay(&p->se);
		subtract_blocked_load_contrib(cfs_rq,
				p->se.avg.load_avg_contrib);
	}
#endif
}

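/*
 * We switched to the sched_fair class.
 */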
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	if (!p->se.on_rq)
		return;

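	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */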
	if (rq->curr == p)
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}

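/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */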
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
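		/* ensure bandwidth has been allocated on our new cfs_rq */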
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}

void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT;
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
	atomic64_set(&cfs_rq->decay_counter, 1);
	atomic_long_set(&cfs_rq->removed_load, 0);
#endif
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
	struct cfs_rq *cfs_rq;

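	/*
	 * If the task was not on the rq at the time of this cgroup movement
	 * it must have been asleep; sleeping tasks keep their ->vruntime
	 * absolute on their old rq until wakeup (needed for the fair sleeper
	 * bonus in place_entity()), so renormalize it against the new rq
	 * below.
	 *
	 * However, on_rq alone cannot tell the whole story: a task that has
	 * never run (!se.sum_exec_runtime) is still waiting to be woken by
	 * wake_up_new_task(), and a task in TASK_WAKING has been woken by
	 * try_to_wake_up() but not yet enqueued by sched_ttwu_pending().
	 * Both already carry a normalized vruntime, so treat them as if
	 * they were on the rq and skip the adjustment.
	 */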
	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
		on_rq = 1;

	if (!on_rq)
		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
	set_task_rq(p, task_cpu(p));
	if (!on_rq) {
		cfs_rq = cfs_rq_of(&p->se);
		p->se.vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
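		/*
		 * migrate_task_rq_fair() will have removed our previous
		 * contribution, but we must synchronize for ongoing future
		 * decay.
		 */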
		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
#endif
	}
}

void free_fair_sched_group(struct task_group *tg)
{
	int i;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
		if (tg->se)
			kfree(tg->se[i]);
	}

	kfree(tg->cfs_rq);
	kfree(tg->se);
}

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se;
	int i;

	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->se)
		goto err;

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;

		init_cfs_rq(cfs_rq);
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}

void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

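	/*
	 * Only empty task groups can be destroyed; so we can speculatively
	 * check on_list without danger of it being re-added.
	 */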
	if (!tg->cfs_rq[cpu]->on_list)
		return;

	raw_spin_lock_irqsave(&rq->lock, flags);
	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

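	/* se could be NULL for root_task_group */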
	if (!se)
		return;

	if (!parent)
		se->cfs_rq = &rq->cfs;
	else
		se->cfs_rq = parent->my_q;

	se->my_q = cfs_rq;
	update_load_set(&se->load, 0);
	se->parent = parent;
}

static DEFINE_MUTEX(shares_mutex);

int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

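	/*
	 * We can't change the weight of the root cgroup.
	 */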
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se;

		se = tg->se[i];
		/* Propagate contribution to hierarchy */
		raw_spin_lock_irqsave(&rq->lock, flags);

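		/* Possible calls to update_curr() need rq clock */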
		update_rq_clock(rq);
		for_each_sched_entity(se)
			update_cfs_shares(group_cfs_rq(se));
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
#else /* CONFIG_FAIR_GROUP_SCHED */

void free_fair_sched_group(struct task_group *tg) { }

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

void unregister_fair_sched_group(struct task_group *tg, int cpu) { }

#endif /* CONFIG_FAIR_GROUP_SCHED */

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

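	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */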
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

	return rr_interval;
}

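/*
 * All the scheduling class methods:
 */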
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};

#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */

}