// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
9#define CREATE_TRACE_POINTS
10#include <trace/events/sched.h>
11#undef CREATE_TRACE_POINTS
12
13#include "sched.h"
14
15#include <linux/nospec.h>
16
17#include <linux/kcov.h>
18#include <linux/scs.h>
19
20#include <asm/switch_to.h>
21#include <asm/tlb.h>
22
23#include "../workqueue_internal.h"
24#include "../../fs/io-wq.h"
25#include "../smpboot.h"
26
27#include "pelt.h"
28#include "smp.h"
29
/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
35EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
36EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
37EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
38EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
39EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
40EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
41EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
42EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
43EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
44
45DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
46
47#ifdef CONFIG_SCHED_DEBUG
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
55#define SCHED_FEAT(name, enabled) \
56 (1UL << __SCHED_FEAT_##name) * enabled |
57const_debug unsigned int sysctl_sched_features =
58#include "features.h"
59 0;
60#undef SCHED_FEAT
61#endif
62
/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
67const_debug unsigned int sysctl_sched_nr_migrate = 32;
68
/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
73unsigned int sysctl_sched_rt_period = 1000000;
74
75__read_mostly int scheduler_running;
76
/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
81int sysctl_sched_rt_runtime = 950000;
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
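/*
 * __task_rq_lock - lock the rq @p resides on.
 *
 * Caller must hold p->pi_lock.  The loop re-checks task_rq(p) after taking
 * rq->lock because the task may have been migrated to a different runqueue
 * in the meantime.
 */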
180struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
181 __acquires(rq->lock)
182{
183 struct rq *rq;
184
185 lockdep_assert_held(&p->pi_lock);
186
187 for (;;) {
188 rq = task_rq(p);
189 raw_spin_lock(&rq->lock);
190 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
191 rq_pin_lock(rq, rf);
192 return rq;
193 }
194 raw_spin_unlock(&rq->lock);
195
196 while (unlikely(task_on_rq_migrating(p)))
197 cpu_relax();
198 }
199}
200
201
202
203
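/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 *
 * Disables interrupts; pairs with task_rq_unlock().
 */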
204struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
205 __acquires(p->pi_lock)
206 __acquires(rq->lock)
207{
208 struct rq *rq;
209
210 for (;;) {
211 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
212 rq = task_rq(p);
213 raw_spin_lock(&rq->lock);
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
232 rq_pin_lock(rq, rf);
233 return rq;
234 }
235 raw_spin_unlock(&rq->lock);
236 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
237
238 while (unlikely(task_on_rq_migrating(p)))
239 cpu_relax();
240 }
241}
242
243
244
245
246
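/*
 * Advance rq->clock_task by @delta, less any time consumed by IRQ handling
 * (CONFIG_IRQ_TIME_ACCOUNTING) or stolen by the hypervisor
 * (CONFIG_PARAVIRT_TIME_ACCOUNTING), so that task runtime accounting only
 * sees time during which the task could actually have run.
 */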
247static void update_rq_clock_task(struct rq *rq, s64 delta)
248{
249
250
251
252
253 s64 __maybe_unused steal = 0, irq_delta = 0;
254
255#ifdef CONFIG_IRQ_TIME_ACCOUNTING
256 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273 if (irq_delta > delta)
274 irq_delta = delta;
275
276 rq->prev_irq_time += irq_delta;
277 delta -= irq_delta;
278#endif
279#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
281 steal = paravirt_steal_clock(cpu_of(rq));
282 steal -= rq->prev_steal_time_rq;
283
284 if (unlikely(steal > delta))
285 steal = delta;
286
287 rq->prev_steal_time_rq += steal;
288 delta -= steal;
289 }
290#endif
291
292 rq->clock_task += delta;
293
294#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
295 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
296 update_irq_load_avg(rq, irq_delta + steal);
297#endif
298 update_rq_clock_pelt(rq, delta);
299}
300
301void update_rq_clock(struct rq *rq)
302{
303 s64 delta;
304
305 lockdep_assert_held(&rq->lock);
306
307 if (rq->clock_update_flags & RQCF_ACT_SKIP)
308 return;
309
310#ifdef CONFIG_SCHED_DEBUG
311 if (sched_feat(WARN_DOUBLE_CLOCK))
312 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
313 rq->clock_update_flags |= RQCF_UPDATED;
314#endif
315
316 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
317 if (delta < 0)
318 return;
319 rq->clock += delta;
320 update_rq_clock_task(rq, delta);
321}
322
323static inline void
324rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
325{
326 csd->flags = 0;
327 csd->func = func;
328 csd->info = rq;
329}
330
331#ifdef CONFIG_SCHED_HRTICK
332
333
334
335
336static void hrtick_clear(struct rq *rq)
337{
338 if (hrtimer_active(&rq->hrtick_timer))
339 hrtimer_cancel(&rq->hrtick_timer);
340}
341
342
343
344
345
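/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */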
346static enum hrtimer_restart hrtick(struct hrtimer *timer)
347{
348 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
349 struct rq_flags rf;
350
351 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
352
353 rq_lock(rq, &rf);
354 update_rq_clock(rq);
355 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
356 rq_unlock(rq, &rf);
357
358 return HRTIMER_NORESTART;
359}
360
361#ifdef CONFIG_SMP
362
363static void __hrtick_restart(struct rq *rq)
364{
365 struct hrtimer *timer = &rq->hrtick_timer;
366
367 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
368}
369
370
371
372
373static void __hrtick_start(void *arg)
374{
375 struct rq *rq = arg;
376 struct rq_flags rf;
377
378 rq_lock(rq, &rf);
379 __hrtick_restart(rq);
380 rq_unlock(rq, &rf);
381}
382
383
384
385
386
387
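/*
 * Called to set the hrtick timer state; called with rq->lock held and IRQs
 * disabled.  If @rq belongs to another CPU, the timer is armed there via an
 * asynchronous smp_call_function_single_async() call so it stays CPU-pinned.
 */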
388void hrtick_start(struct rq *rq, u64 delay)
389{
390 struct hrtimer *timer = &rq->hrtick_timer;
391 ktime_t time;
392 s64 delta;
393
394
395
396
397
398 delta = max_t(s64, delay, 10000LL);
399 time = ktime_add_ns(timer->base->get_time(), delta);
400
401 hrtimer_set_expires(timer, time);
402
403 if (rq == this_rq())
404 __hrtick_restart(rq);
405 else
406 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
407}
408
409#else
410
411
412
413
414
415void hrtick_start(struct rq *rq, u64 delay)
416{
417
418
419
420
421 delay = max_t(u64, delay, 10000LL);
422 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
423 HRTIMER_MODE_REL_PINNED_HARD);
424}
425
426#endif
427
428static void hrtick_rq_init(struct rq *rq)
429{
430#ifdef CONFIG_SMP
431 rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
432#endif
433 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
434 rq->hrtick_timer.function = hrtick;
435}
436#else
437static inline void hrtick_clear(struct rq *rq)
438{
439}
440
441static inline void hrtick_rq_init(struct rq *rq)
442{
443}
444#endif
445
446
447
448
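/*
 * fetch_or - cmpxchg()-based atomic fetch-or: OR @mask into *@ptr and return
 * the previous value.  Open-coded as a macro so it works for whatever integer
 * type thread_info::flags happens to be.
 */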
449#define fetch_or(ptr, mask) \
450 ({ \
451 typeof(ptr) _ptr = (ptr); \
452 typeof(mask) _mask = (mask); \
453 typeof(*_ptr) _old, _val = *_ptr; \
454 \
455 for (;;) { \
456 _old = cmpxchg(_ptr, _val, _val | _mask); \
457 if (_old == _val) \
458 break; \
459 _val = _old; \
460 } \
461 _old; \
462})
463
464#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
465
466
467
468
469
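/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG; this
 * avoids a remote wakeup IPI when the target CPU is polling on
 * need_resched().  Returns true if a reschedule IPI is still required.
 */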
470static bool set_nr_and_not_polling(struct task_struct *p)
471{
472 struct thread_info *ti = task_thread_info(p);
473 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
474}
475
476
477
478
479
480
481
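/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * Returns true when no IPI is needed: either TIF_NEED_RESCHED was already
 * set, or the target was polling and will notice the new flag.  Returns
 * false if the target was not polling and must be IPI'd.
 */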
482static bool set_nr_if_polling(struct task_struct *p)
483{
484 struct thread_info *ti = task_thread_info(p);
485 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
486
487 for (;;) {
488 if (!(val & _TIF_POLLING_NRFLAG))
489 return false;
490 if (val & _TIF_NEED_RESCHED)
491 return true;
492 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
493 if (old == val)
494 break;
495 val = old;
496 }
497 return true;
498}
499
500#else
501static bool set_nr_and_not_polling(struct task_struct *p)
502{
503 set_tsk_need_resched(p);
504 return true;
505}
506
507#ifdef CONFIG_SMP
508static bool set_nr_if_polling(struct task_struct *p)
509{
510 return false;
511}
512#endif
513#endif
514
515static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
516{
517 struct wake_q_node *node = &task->wake_q;
518
519
520
521
522
523
524
525
526
527 smp_mb__before_atomic();
528 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
529 return false;
530
531
532
533
534 *head->lastp = node;
535 head->lastp = &node->next;
536 return true;
537}
538
539
540
541
542
543
544
545
546
547
548
549
550
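/*
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' waking
 *
 * Queue a task for a later wakeup via wake_up_q().  A reference on @task is
 * taken here and dropped by wake_up_q(); if the task is already on some
 * wake_q the call is a no-op and the existing queueing wins.
 */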
551void wake_q_add(struct wake_q_head *head, struct task_struct *task)
552{
553 if (__wake_q_add(head, task))
554 get_task_struct(task);
555}
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
575{
576 if (!__wake_q_add(head, task))
577 put_task_struct(task);
578}
579
580void wake_up_q(struct wake_q_head *head)
581{
582 struct wake_q_node *node = head->first;
583
584 while (node != WAKE_Q_TAIL) {
585 struct task_struct *task;
586
587 task = container_of(node, struct task_struct, wake_q);
588 BUG_ON(!task);
589
590 node = node->next;
591 task->wake_q.next = NULL;
592
593
594
595
596
597 wake_up_process(task);
598 put_task_struct(task);
599 }
600}
601
602
603
604
605
606
607
608
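/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */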
609void resched_curr(struct rq *rq)
610{
611 struct task_struct *curr = rq->curr;
612 int cpu;
613
614 lockdep_assert_held(&rq->lock);
615
616 if (test_tsk_need_resched(curr))
617 return;
618
619 cpu = cpu_of(rq);
620
621 if (cpu == smp_processor_id()) {
622 set_tsk_need_resched(curr);
623 set_preempt_need_resched();
624 return;
625 }
626
627 if (set_nr_and_not_polling(curr))
628 smp_send_reschedule(cpu);
629 else
630 trace_sched_wake_idle_without_ipi(cpu);
631}
632
633void resched_cpu(int cpu)
634{
635 struct rq *rq = cpu_rq(cpu);
636 unsigned long flags;
637
638 raw_spin_lock_irqsave(&rq->lock, flags);
639 if (cpu_online(cpu) || cpu == smp_processor_id())
640 resched_curr(rq);
641 raw_spin_unlock_irqrestore(&rq->lock, flags);
642}
643
644#ifdef CONFIG_SMP
645#ifdef CONFIG_NO_HZ_COMMON
646
647
648
649
650
651
652
653
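/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * Selection is restricted to housekeeping CPUs; if no busy housekeeping CPU
 * is found in this CPU's sched domains, fall back to any housekeeping CPU.
 */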
654int get_nohz_timer_target(void)
655{
656 int i, cpu = smp_processor_id(), default_cpu = -1;
657 struct sched_domain *sd;
658
659 if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
660 if (!idle_cpu(cpu))
661 return cpu;
662 default_cpu = cpu;
663 }
664
665 rcu_read_lock();
666 for_each_domain(cpu, sd) {
667 for_each_cpu_and(i, sched_domain_span(sd),
668 housekeeping_cpumask(HK_FLAG_TIMER)) {
669 if (cpu == i)
670 continue;
671
672 if (!idle_cpu(i)) {
673 cpu = i;
674 goto unlock;
675 }
676 }
677 }
678
679 if (default_cpu == -1)
680 default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
681 cpu = default_cpu;
682unlock:
683 rcu_read_unlock();
684 return cpu;
685}
686
687
688
689
690
691
692
693
694
695
696
697static void wake_up_idle_cpu(int cpu)
698{
699 struct rq *rq = cpu_rq(cpu);
700
701 if (cpu == smp_processor_id())
702 return;
703
704 if (set_nr_and_not_polling(rq->idle))
705 smp_send_reschedule(cpu);
706 else
707 trace_sched_wake_idle_without_ipi(cpu);
708}
709
710static bool wake_up_full_nohz_cpu(int cpu)
711{
712
713
714
715
716
717
718 if (cpu_is_offline(cpu))
719 return true;
720 if (tick_nohz_full_cpu(cpu)) {
721 if (cpu != smp_processor_id() ||
722 tick_nohz_tick_stopped())
723 tick_nohz_full_kick_cpu(cpu);
724 return true;
725 }
726
727 return false;
728}
729
730
731
732
733
734
735void wake_up_nohz_cpu(int cpu)
736{
737 if (!wake_up_full_nohz_cpu(cpu))
738 wake_up_idle_cpu(cpu);
739}
740
741static void nohz_csd_func(void *info)
742{
743 struct rq *rq = info;
744 int cpu = cpu_of(rq);
745 unsigned int flags;
746
747
748
749
750 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
751 WARN_ON(!(flags & NOHZ_KICK_MASK));
752
753 rq->idle_balance = idle_cpu(cpu);
754 if (rq->idle_balance && !need_resched()) {
755 rq->nohz_idle_balance = flags;
756 raise_softirq_irqoff(SCHED_SOFTIRQ);
757 }
758}
759
760#endif
761
762#ifdef CONFIG_NO_HZ_FULL
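/*
 * Can the periodic tick be stopped on this runqueue?  Not while deadline
 * tasks are runnable, not with more than one round-robin task (RR needs the
 * tick to rotate the run order), and not with more than one runnable task
 * overall; FIFO-only or single-task runqueues do not need the tick.
 */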
763bool sched_can_stop_tick(struct rq *rq)
764{
765 int fifo_nr_running;
766
767
768 if (rq->dl.dl_nr_running)
769 return false;
770
771
772
773
774
775 if (rq->rt.rr_nr_running) {
776 if (rq->rt.rr_nr_running == 1)
777 return true;
778 else
779 return false;
780 }
781
782
783
784
785
786 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
787 if (fifo_nr_running)
788 return true;
789
790
791
792
793
794
795 if (rq->nr_running > 1)
796 return false;
797
798 return true;
799}
800#endif
801#endif
802
803#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
804 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
805
806
807
808
809
810
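/*
 * Iterate task_group tree rooted at *from, calling @down when first entering
 * a node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */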
811int walk_tg_tree_from(struct task_group *from,
812 tg_visitor down, tg_visitor up, void *data)
813{
814 struct task_group *parent, *child;
815 int ret;
816
817 parent = from;
818
819down:
820 ret = (*down)(parent, data);
821 if (ret)
822 goto out;
823 list_for_each_entry_rcu(child, &parent->children, siblings) {
824 parent = child;
825 goto down;
826
827up:
828 continue;
829 }
830 ret = (*up)(parent, data);
831 if (ret || parent == from)
832 goto out;
833
834 child = parent;
835 parent = parent->parent;
836 if (parent)
837 goto up;
838out:
839 return ret;
840}
841
842int tg_nop(struct task_group *tg, void *data)
843{
844 return 0;
845}
846#endif
847
848static void set_load_weight(struct task_struct *p, bool update_load)
849{
850 int prio = p->static_prio - MAX_RT_PRIO;
851 struct load_weight *load = &p->se.load;
852
853
854
855
856 if (task_has_idle_policy(p)) {
857 load->weight = scale_load(WEIGHT_IDLEPRIO);
858 load->inv_weight = WMULT_IDLEPRIO;
859 return;
860 }
861
862
863
864
865
866 if (update_load && p->sched_class == &fair_sched_class) {
867 reweight_task(p, prio);
868 } else {
869 load->weight = scale_load(sched_prio_to_weight[prio]);
870 load->inv_weight = sched_prio_to_wmult[prio];
871 }
872}
873
874#ifdef CONFIG_UCLAMP_TASK
875
876
877
878
879
880
881
882
883
884
885static DEFINE_MUTEX(uclamp_mutex);
886
887
888unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
889
890
891unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
909
910
911static struct uclamp_se uclamp_default[UCLAMP_CNT];
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
932
933
934#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
935
936#define for_each_clamp_id(clamp_id) \
937 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
938
939static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
940{
941 return clamp_value / UCLAMP_BUCKET_DELTA;
942}
943
944static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
945{
946 if (clamp_id == UCLAMP_MIN)
947 return 0;
948 return SCHED_CAPACITY_SCALE;
949}
950
951static inline void uclamp_se_set(struct uclamp_se *uc_se,
952 unsigned int value, bool user_defined)
953{
954 uc_se->value = value;
955 uc_se->bucket_id = uclamp_bucket_id(value);
956 uc_se->user_defined = user_defined;
957}
958
959static inline unsigned int
960uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
961 unsigned int clamp_value)
962{
963
964
965
966
967
968 if (clamp_id == UCLAMP_MAX) {
969 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
970 return clamp_value;
971 }
972
973 return uclamp_none(UCLAMP_MIN);
974}
975
976static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
977 unsigned int clamp_value)
978{
979
980 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
981 return;
982
983 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
984}
985
986static inline
987unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
988 unsigned int clamp_value)
989{
990 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
991 int bucket_id = UCLAMP_BUCKETS - 1;
992
993
994
995
996
997 for ( ; bucket_id >= 0; bucket_id--) {
998 if (!bucket[bucket_id].tasks)
999 continue;
1000 return bucket[bucket_id].value;
1001 }
1002
1003
1004 return uclamp_idle_value(rq, clamp_id, clamp_value);
1005}
1006
1007static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1008{
1009 unsigned int default_util_min;
1010 struct uclamp_se *uc_se;
1011
1012 lockdep_assert_held(&p->pi_lock);
1013
1014 uc_se = &p->uclamp_req[UCLAMP_MIN];
1015
1016
1017 if (uc_se->user_defined)
1018 return;
1019
1020 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1021 uclamp_se_set(uc_se, default_util_min, false);
1022}
1023
1024static void uclamp_update_util_min_rt_default(struct task_struct *p)
1025{
1026 struct rq_flags rf;
1027 struct rq *rq;
1028
1029 if (!rt_task(p))
1030 return;
1031
1032
1033 rq = task_rq_lock(p, &rf);
1034 __uclamp_update_util_min_rt_default(p);
1035 task_rq_unlock(rq, p, &rf);
1036}
1037
1038static void uclamp_sync_util_min_rt_default(void)
1039{
1040 struct task_struct *g, *p;
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055 read_lock(&tasklist_lock);
1056 smp_mb__after_spinlock();
1057 read_unlock(&tasklist_lock);
1058
1059 rcu_read_lock();
1060 for_each_process_thread(g, p)
1061 uclamp_update_util_min_rt_default(p);
1062 rcu_read_unlock();
1063}
1064
1065static inline struct uclamp_se
1066uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1067{
1068 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1069#ifdef CONFIG_UCLAMP_TASK_GROUP
1070 struct uclamp_se uc_max;
1071
1072
1073
1074
1075
1076 if (task_group_is_autogroup(task_group(p)))
1077 return uc_req;
1078 if (task_group(p) == &root_task_group)
1079 return uc_req;
1080
1081 uc_max = task_group(p)->uclamp[clamp_id];
1082 if (uc_req.value > uc_max.value || !uc_req.user_defined)
1083 return uc_max;
1084#endif
1085
1086 return uc_req;
1087}
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097static inline struct uclamp_se
1098uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1099{
1100 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1101 struct uclamp_se uc_max = uclamp_default[clamp_id];
1102
1103
1104 if (unlikely(uc_req.value > uc_max.value))
1105 return uc_max;
1106
1107 return uc_req;
1108}
1109
1110unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1111{
1112 struct uclamp_se uc_eff;
1113
1114
1115 if (p->uclamp[clamp_id].active)
1116 return (unsigned long)p->uclamp[clamp_id].value;
1117
1118 uc_eff = uclamp_eff_get(p, clamp_id);
1119
1120 return (unsigned long)uc_eff.value;
1121}
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
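/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq.  This also immediately
 * updates the rq's clamp value if required.
 */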
1133static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1134 enum uclamp_id clamp_id)
1135{
1136 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1137 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1138 struct uclamp_bucket *bucket;
1139
1140 lockdep_assert_held(&rq->lock);
1141
1142
1143 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1144
1145 bucket = &uc_rq->bucket[uc_se->bucket_id];
1146 bucket->tasks++;
1147 uc_se->active = true;
1148
1149 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1150
1151
1152
1153
1154
1155 if (bucket->tasks == 1 || uc_se->value > bucket->value)
1156 bucket->value = uc_se->value;
1157
1158 if (uc_se->value > READ_ONCE(uc_rq->value))
1159 WRITE_ONCE(uc_rq->value, uc_se->value);
1160}
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
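/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released.  If this was the bucket defining the current rq clamp value,
 * the rq's clamp value is recomputed from the remaining runnable tasks.
 */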
1171static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1172 enum uclamp_id clamp_id)
1173{
1174 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1175 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1176 struct uclamp_bucket *bucket;
1177 unsigned int bkt_clamp;
1178 unsigned int rq_clamp;
1179
1180 lockdep_assert_held(&rq->lock);
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205 if (unlikely(!uc_se->active))
1206 return;
1207
1208 bucket = &uc_rq->bucket[uc_se->bucket_id];
1209
1210 SCHED_WARN_ON(!bucket->tasks);
1211 if (likely(bucket->tasks))
1212 bucket->tasks--;
1213
1214 uc_se->active = false;
1215
1216
1217
1218
1219
1220
1221
1222 if (likely(bucket->tasks))
1223 return;
1224
1225 rq_clamp = READ_ONCE(uc_rq->value);
1226
1227
1228
1229
1230 SCHED_WARN_ON(bucket->value > rq_clamp);
1231 if (bucket->value >= rq_clamp) {
1232 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1233 WRITE_ONCE(uc_rq->value, bkt_clamp);
1234 }
1235}
1236
1237static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1238{
1239 enum uclamp_id clamp_id;
1240
1241
1242
1243
1244
1245
1246
1247 if (!static_branch_unlikely(&sched_uclamp_used))
1248 return;
1249
1250 if (unlikely(!p->sched_class->uclamp_enabled))
1251 return;
1252
1253 for_each_clamp_id(clamp_id)
1254 uclamp_rq_inc_id(rq, p, clamp_id);
1255
1256
1257 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1258 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1259}
1260
1261static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1262{
1263 enum uclamp_id clamp_id;
1264
1265
1266
1267
1268
1269
1270
1271 if (!static_branch_unlikely(&sched_uclamp_used))
1272 return;
1273
1274 if (unlikely(!p->sched_class->uclamp_enabled))
1275 return;
1276
1277 for_each_clamp_id(clamp_id)
1278 uclamp_rq_dec_id(rq, p, clamp_id);
1279}
1280
1281static inline void
1282uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1283{
1284 struct rq_flags rf;
1285 struct rq *rq;
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295 rq = task_rq_lock(p, &rf);
1296
1297
1298
1299
1300
1301
1302
1303 if (p->uclamp[clamp_id].active) {
1304 uclamp_rq_dec_id(rq, p, clamp_id);
1305 uclamp_rq_inc_id(rq, p, clamp_id);
1306 }
1307
1308 task_rq_unlock(rq, p, &rf);
1309}
1310
1311#ifdef CONFIG_UCLAMP_TASK_GROUP
1312static inline void
1313uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1314 unsigned int clamps)
1315{
1316 enum uclamp_id clamp_id;
1317 struct css_task_iter it;
1318 struct task_struct *p;
1319
1320 css_task_iter_start(css, 0, &it);
1321 while ((p = css_task_iter_next(&it))) {
1322 for_each_clamp_id(clamp_id) {
1323 if ((0x1 << clamp_id) & clamps)
1324 uclamp_update_active(p, clamp_id);
1325 }
1326 }
1327 css_task_iter_end(&it);
1328}
1329
1330static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1331static void uclamp_update_root_tg(void)
1332{
1333 struct task_group *tg = &root_task_group;
1334
1335 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1336 sysctl_sched_uclamp_util_min, false);
1337 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1338 sysctl_sched_uclamp_util_max, false);
1339
1340 rcu_read_lock();
1341 cpu_util_update_eff(&root_task_group.css);
1342 rcu_read_unlock();
1343}
1344#else
1345static void uclamp_update_root_tg(void) { }
1346#endif
1347
1348int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1349 void *buffer, size_t *lenp, loff_t *ppos)
1350{
1351 bool update_root_tg = false;
1352 int old_min, old_max, old_min_rt;
1353 int result;
1354
1355 mutex_lock(&uclamp_mutex);
1356 old_min = sysctl_sched_uclamp_util_min;
1357 old_max = sysctl_sched_uclamp_util_max;
1358 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1359
1360 result = proc_dointvec(table, write, buffer, lenp, ppos);
1361 if (result)
1362 goto undo;
1363 if (!write)
1364 goto done;
1365
1366 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1367 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1368 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1369
1370 result = -EINVAL;
1371 goto undo;
1372 }
1373
1374 if (old_min != sysctl_sched_uclamp_util_min) {
1375 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1376 sysctl_sched_uclamp_util_min, false);
1377 update_root_tg = true;
1378 }
1379 if (old_max != sysctl_sched_uclamp_util_max) {
1380 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1381 sysctl_sched_uclamp_util_max, false);
1382 update_root_tg = true;
1383 }
1384
1385 if (update_root_tg) {
1386 static_branch_enable(&sched_uclamp_used);
1387 uclamp_update_root_tg();
1388 }
1389
1390 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1391 static_branch_enable(&sched_uclamp_used);
1392 uclamp_sync_util_min_rt_default();
1393 }
1394
1395
1396
1397
1398
1399
1400
1401 goto done;
1402
1403undo:
1404 sysctl_sched_uclamp_util_min = old_min;
1405 sysctl_sched_uclamp_util_max = old_max;
1406 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1407done:
1408 mutex_unlock(&uclamp_mutex);
1409
1410 return result;
1411}
1412
1413static int uclamp_validate(struct task_struct *p,
1414 const struct sched_attr *attr)
1415{
1416 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1417 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1418
1419 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1420 lower_bound = attr->sched_util_min;
1421 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1422 upper_bound = attr->sched_util_max;
1423
1424 if (lower_bound > upper_bound)
1425 return -EINVAL;
1426 if (upper_bound > SCHED_CAPACITY_SCALE)
1427 return -EINVAL;
1428
1429
1430
1431
1432
1433
1434
1435
1436 static_branch_enable(&sched_uclamp_used);
1437
1438 return 0;
1439}
1440
1441static void __setscheduler_uclamp(struct task_struct *p,
1442 const struct sched_attr *attr)
1443{
1444 enum uclamp_id clamp_id;
1445
1446
1447
1448
1449
1450 for_each_clamp_id(clamp_id) {
1451 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1452
1453
1454 if (uc_se->user_defined)
1455 continue;
1456
1457
1458
1459
1460
1461 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1462 __uclamp_update_util_min_rt_default(p);
1463 else
1464 uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
1465
1466 }
1467
1468 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1469 return;
1470
1471 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1472 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1473 attr->sched_util_min, true);
1474 }
1475
1476 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1477 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1478 attr->sched_util_max, true);
1479 }
1480}
1481
1482static void uclamp_fork(struct task_struct *p)
1483{
1484 enum uclamp_id clamp_id;
1485
1486
1487
1488
1489
1490 for_each_clamp_id(clamp_id)
1491 p->uclamp[clamp_id].active = false;
1492
1493 if (likely(!p->sched_reset_on_fork))
1494 return;
1495
1496 for_each_clamp_id(clamp_id) {
1497 uclamp_se_set(&p->uclamp_req[clamp_id],
1498 uclamp_none(clamp_id), false);
1499 }
1500}
1501
1502static void uclamp_post_fork(struct task_struct *p)
1503{
1504 uclamp_update_util_min_rt_default(p);
1505}
1506
1507static void __init init_uclamp_rq(struct rq *rq)
1508{
1509 enum uclamp_id clamp_id;
1510 struct uclamp_rq *uc_rq = rq->uclamp;
1511
1512 for_each_clamp_id(clamp_id) {
1513 uc_rq[clamp_id] = (struct uclamp_rq) {
1514 .value = uclamp_none(clamp_id)
1515 };
1516 }
1517
1518 rq->uclamp_flags = 0;
1519}
1520
1521static void __init init_uclamp(void)
1522{
1523 struct uclamp_se uc_max = {};
1524 enum uclamp_id clamp_id;
1525 int cpu;
1526
1527 for_each_possible_cpu(cpu)
1528 init_uclamp_rq(cpu_rq(cpu));
1529
1530 for_each_clamp_id(clamp_id) {
1531 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1532 uclamp_none(clamp_id), false);
1533 }
1534
1535
1536 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1537 for_each_clamp_id(clamp_id) {
1538 uclamp_default[clamp_id] = uc_max;
1539#ifdef CONFIG_UCLAMP_TASK_GROUP
1540 root_task_group.uclamp_req[clamp_id] = uc_max;
1541 root_task_group.uclamp[clamp_id] = uc_max;
1542#endif
1543 }
1544}
1545
1546#else
1547static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1548static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1549static inline int uclamp_validate(struct task_struct *p,
1550 const struct sched_attr *attr)
1551{
1552 return -EOPNOTSUPP;
1553}
1554static void __setscheduler_uclamp(struct task_struct *p,
1555 const struct sched_attr *attr) { }
1556static inline void uclamp_fork(struct task_struct *p) { }
1557static inline void uclamp_post_fork(struct task_struct *p) { }
1558static inline void init_uclamp(void) { }
1559#endif
1560
1561static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1562{
1563 if (!(flags & ENQUEUE_NOCLOCK))
1564 update_rq_clock(rq);
1565
1566 if (!(flags & ENQUEUE_RESTORE)) {
1567 sched_info_queued(rq, p);
1568 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1569 }
1570
1571 uclamp_rq_inc(rq, p);
1572 p->sched_class->enqueue_task(rq, p, flags);
1573}
1574
1575static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1576{
1577 if (!(flags & DEQUEUE_NOCLOCK))
1578 update_rq_clock(rq);
1579
1580 if (!(flags & DEQUEUE_SAVE)) {
1581 sched_info_dequeued(rq, p);
1582 psi_dequeue(p, flags & DEQUEUE_SLEEP);
1583 }
1584
1585 uclamp_rq_dec(rq, p);
1586 p->sched_class->dequeue_task(rq, p, flags);
1587}
1588
1589void activate_task(struct rq *rq, struct task_struct *p, int flags)
1590{
1591 enqueue_task(rq, p, flags);
1592
1593 p->on_rq = TASK_ON_RQ_QUEUED;
1594}
1595
1596void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1597{
1598 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1599
1600 dequeue_task(rq, p, flags);
1601}
1602
1603
1604
1605
1606static inline int __normal_prio(struct task_struct *p)
1607{
1608 return p->static_prio;
1609}
1610
1611
1612
1613
1614
1615
1616
1617
1618static inline int normal_prio(struct task_struct *p)
1619{
1620 int prio;
1621
1622 if (task_has_dl_policy(p))
1623 prio = MAX_DL_PRIO-1;
1624 else if (task_has_rt_policy(p))
1625 prio = MAX_RT_PRIO-1 - p->rt_priority;
1626 else
1627 prio = __normal_prio(p);
1628 return prio;
1629}
1630
1631
1632
1633
1634
1635
1636
1637
1638static int effective_prio(struct task_struct *p)
1639{
1640 p->normal_prio = normal_prio(p);
1641
1642
1643
1644
1645
1646 if (!rt_prio(p->prio))
1647 return p->normal_prio;
1648 return p->prio;
1649}
1650
1651
1652
1653
1654
1655
1656
1657inline int task_curr(const struct task_struct *p)
1658{
1659 return cpu_curr(task_cpu(p)) == p;
1660}
1661
1662
1663
1664
1665
1666
1667
1668
1669static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 const struct sched_class *prev_class,
1671 int oldprio)
1672{
1673 if (prev_class != p->sched_class) {
1674 if (prev_class->switched_from)
1675 prev_class->switched_from(rq, p);
1676
1677 p->sched_class->switched_to(rq, p);
1678 } else if (oldprio != p->prio || dl_task(p))
1679 p->sched_class->prio_changed(rq, p, oldprio);
1680}
1681
1682void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1683{
1684 if (p->sched_class == rq->curr->sched_class)
1685 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1686 else if (p->sched_class > rq->curr->sched_class)
1687 resched_curr(rq);
1688
1689
1690
1691
1692
1693 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1694 rq_clock_skip_update(rq);
1695}
1696
1697#ifdef CONFIG_SMP
1698
1699
1700
1701
1702
1703static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1704{
1705 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
1706 return false;
1707
1708 if (is_per_cpu_kthread(p))
1709 return cpu_online(cpu);
1710
1711 return cpu_active(cpu);
1712}
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
1734 struct task_struct *p, int new_cpu)
1735{
1736 lockdep_assert_held(&rq->lock);
1737
1738 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1739 set_task_cpu(p, new_cpu);
1740 rq_unlock(rq, rf);
1741
1742 rq = cpu_rq(new_cpu);
1743
1744 rq_lock(rq, rf);
1745 BUG_ON(task_cpu(p) != new_cpu);
1746 activate_task(rq, p, 0);
1747 check_preempt_curr(rq, p, 0);
1748
1749 return rq;
1750}
1751
1752struct migration_arg {
1753 struct task_struct *task;
1754 int dest_cpu;
1755};
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
1767 struct task_struct *p, int dest_cpu)
1768{
1769
1770 if (!is_cpu_allowed(p, dest_cpu))
1771 return rq;
1772
1773 update_rq_clock(rq);
1774 rq = move_queued_task(rq, rf, p, dest_cpu);
1775
1776 return rq;
1777}
1778
1779
1780
1781
1782
1783
1784static int migration_cpu_stop(void *data)
1785{
1786 struct migration_arg *arg = data;
1787 struct task_struct *p = arg->task;
1788 struct rq *rq = this_rq();
1789 struct rq_flags rf;
1790
1791
1792
1793
1794
1795 local_irq_disable();
1796
1797
1798
1799
1800
1801 flush_smp_call_function_from_idle();
1802
1803 raw_spin_lock(&p->pi_lock);
1804 rq_lock(rq, &rf);
1805
1806
1807
1808
1809
1810 if (task_rq(p) == rq) {
1811 if (task_on_rq_queued(p))
1812 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1813 else
1814 p->wake_cpu = arg->dest_cpu;
1815 }
1816 rq_unlock(rq, &rf);
1817 raw_spin_unlock(&p->pi_lock);
1818
1819 local_irq_enable();
1820 return 0;
1821}
1822
1823
1824
1825
1826
1827void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1828{
1829 cpumask_copy(&p->cpus_mask, new_mask);
1830 p->nr_cpus_allowed = cpumask_weight(new_mask);
1831}
1832
1833void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1834{
1835 struct rq *rq = task_rq(p);
1836 bool queued, running;
1837
1838 lockdep_assert_held(&p->pi_lock);
1839
1840 queued = task_on_rq_queued(p);
1841 running = task_current(rq, p);
1842
1843 if (queued) {
1844
1845
1846
1847
1848 lockdep_assert_held(&rq->lock);
1849 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1850 }
1851 if (running)
1852 put_prev_task(rq, p);
1853
1854 p->sched_class->set_cpus_allowed(p, new_mask);
1855
1856 if (queued)
1857 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1858 if (running)
1859 set_next_task(rq, p);
1860}
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871static int __set_cpus_allowed_ptr(struct task_struct *p,
1872 const struct cpumask *new_mask, bool check)
1873{
1874 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1875 unsigned int dest_cpu;
1876 struct rq_flags rf;
1877 struct rq *rq;
1878 int ret = 0;
1879
1880 rq = task_rq_lock(p, &rf);
1881 update_rq_clock(rq);
1882
1883 if (p->flags & PF_KTHREAD) {
1884
1885
1886
1887 cpu_valid_mask = cpu_online_mask;
1888 }
1889
1890
1891
1892
1893
1894 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1895 ret = -EINVAL;
1896 goto out;
1897 }
1898
1899 if (cpumask_equal(&p->cpus_mask, new_mask))
1900 goto out;
1901
1902
1903
1904
1905
1906
1907 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
1908 if (dest_cpu >= nr_cpu_ids) {
1909 ret = -EINVAL;
1910 goto out;
1911 }
1912
1913 do_set_cpus_allowed(p, new_mask);
1914
1915 if (p->flags & PF_KTHREAD) {
1916
1917
1918
1919
1920 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1921 !cpumask_intersects(new_mask, cpu_active_mask) &&
1922 p->nr_cpus_allowed != 1);
1923 }
1924
1925
1926 if (cpumask_test_cpu(task_cpu(p), new_mask))
1927 goto out;
1928
1929 if (task_running(rq, p) || p->state == TASK_WAKING) {
1930 struct migration_arg arg = { p, dest_cpu };
1931
1932 task_rq_unlock(rq, p, &rf);
1933 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1934 return 0;
1935 } else if (task_on_rq_queued(p)) {
1936
1937
1938
1939
1940 rq = move_queued_task(rq, &rf, p, dest_cpu);
1941 }
1942out:
1943 task_rq_unlock(rq, p, &rf);
1944
1945 return ret;
1946}
1947
1948int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1949{
1950 return __set_cpus_allowed_ptr(p, new_mask, false);
1951}
1952EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1953
1954void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1955{
1956#ifdef CONFIG_SCHED_DEBUG
1957
1958
1959
1960
1961 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1962 !p->on_rq);
1963
1964
1965
1966
1967
1968
1969 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1970 p->sched_class == &fair_sched_class &&
1971 (p->on_rq && !task_on_rq_migrating(p)));
1972
1973#ifdef CONFIG_LOCKDEP
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1985 lockdep_is_held(&task_rq(p)->lock)));
1986#endif
1987
1988
1989
1990 WARN_ON_ONCE(!cpu_online(new_cpu));
1991#endif
1992
1993 trace_sched_migrate_task(p, new_cpu);
1994
1995 if (task_cpu(p) != new_cpu) {
1996 if (p->sched_class->migrate_task_rq)
1997 p->sched_class->migrate_task_rq(p, new_cpu);
1998 p->se.nr_migrations++;
1999 rseq_migrate(p);
2000 perf_event_task_migrate(p);
2001 }
2002
2003 __set_task_cpu(p, new_cpu);
2004}
2005
2006#ifdef CONFIG_NUMA_BALANCING
2007static void __migrate_swap_task(struct task_struct *p, int cpu)
2008{
2009 if (task_on_rq_queued(p)) {
2010 struct rq *src_rq, *dst_rq;
2011 struct rq_flags srf, drf;
2012
2013 src_rq = task_rq(p);
2014 dst_rq = cpu_rq(cpu);
2015
2016 rq_pin_lock(src_rq, &srf);
2017 rq_pin_lock(dst_rq, &drf);
2018
2019 deactivate_task(src_rq, p, 0);
2020 set_task_cpu(p, cpu);
2021 activate_task(dst_rq, p, 0);
2022 check_preempt_curr(dst_rq, p, 0);
2023
2024 rq_unpin_lock(dst_rq, &drf);
2025 rq_unpin_lock(src_rq, &srf);
2026
2027 } else {
2028
2029
2030
2031
2032
2033 p->wake_cpu = cpu;
2034 }
2035}
2036
2037struct migration_swap_arg {
2038 struct task_struct *src_task, *dst_task;
2039 int src_cpu, dst_cpu;
2040};
2041
2042static int migrate_swap_stop(void *data)
2043{
2044 struct migration_swap_arg *arg = data;
2045 struct rq *src_rq, *dst_rq;
2046 int ret = -EAGAIN;
2047
2048 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
2049 return -EAGAIN;
2050
2051 src_rq = cpu_rq(arg->src_cpu);
2052 dst_rq = cpu_rq(arg->dst_cpu);
2053
2054 double_raw_lock(&arg->src_task->pi_lock,
2055 &arg->dst_task->pi_lock);
2056 double_rq_lock(src_rq, dst_rq);
2057
2058 if (task_cpu(arg->dst_task) != arg->dst_cpu)
2059 goto unlock;
2060
2061 if (task_cpu(arg->src_task) != arg->src_cpu)
2062 goto unlock;
2063
2064 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
2065 goto unlock;
2066
2067 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
2068 goto unlock;
2069
2070 __migrate_swap_task(arg->src_task, arg->dst_cpu);
2071 __migrate_swap_task(arg->dst_task, arg->src_cpu);
2072
2073 ret = 0;
2074
2075unlock:
2076 double_rq_unlock(src_rq, dst_rq);
2077 raw_spin_unlock(&arg->dst_task->pi_lock);
2078 raw_spin_unlock(&arg->src_task->pi_lock);
2079
2080 return ret;
2081}
2082
2083
2084
2085
2086int migrate_swap(struct task_struct *cur, struct task_struct *p,
2087 int target_cpu, int curr_cpu)
2088{
2089 struct migration_swap_arg arg;
2090 int ret = -EINVAL;
2091
2092 arg = (struct migration_swap_arg){
2093 .src_task = cur,
2094 .src_cpu = curr_cpu,
2095 .dst_task = p,
2096 .dst_cpu = target_cpu,
2097 };
2098
2099 if (arg.src_cpu == arg.dst_cpu)
2100 goto out;
2101
2102
2103
2104
2105
2106 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
2107 goto out;
2108
2109 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
2110 goto out;
2111
2112 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
2113 goto out;
2114
2115 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2116 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2117
2118out:
2119 return ret;
2120}
2121#endif
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
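/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and not
 * expected to change.  If it changes, i.e. @p might have woken up, then
 * return zero.  On success (i.e. @p was observed off its CPU in the given
 * state) return a positive number: the task's context switch count.
 */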
2139unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2140{
2141 int running, queued;
2142 struct rq_flags rf;
2143 unsigned long ncsw;
2144 struct rq *rq;
2145
2146 for (;;) {
2147
2148
2149
2150
2151
2152
2153 rq = task_rq(p);
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166 while (task_running(rq, p)) {
2167 if (match_state && unlikely(p->state != match_state))
2168 return 0;
2169 cpu_relax();
2170 }
2171
2172
2173
2174
2175
2176
2177 rq = task_rq_lock(p, &rf);
2178 trace_sched_wait_task(p);
2179 running = task_running(rq, p);
2180 queued = task_on_rq_queued(p);
2181 ncsw = 0;
2182 if (!match_state || p->state == match_state)
2183 ncsw = p->nvcsw | LONG_MIN;
2184 task_rq_unlock(rq, p, &rf);
2185
2186
2187
2188
2189 if (unlikely(!ncsw))
2190 break;
2191
2192
2193
2194
2195
2196
2197
2198 if (unlikely(running)) {
2199 cpu_relax();
2200 continue;
2201 }
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212 if (unlikely(queued)) {
2213 ktime_t to = NSEC_PER_SEC / HZ;
2214
2215 set_current_state(TASK_UNINTERRUPTIBLE);
2216 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2217 continue;
2218 }
2219
2220
2221
2222
2223
2224
2225 break;
2226 }
2227
2228 return ncsw;
2229}
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
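/*
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter kernel-mode,
 * without any delay (to get signals handled.)
 */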
2244void kick_process(struct task_struct *p)
2245{
2246 int cpu;
2247
2248 preempt_disable();
2249 cpu = task_cpu(p);
2250 if ((cpu != smp_processor_id()) && task_curr(p))
2251 smp_send_reschedule(cpu);
2252 preempt_enable();
2253}
2254EXPORT_SYMBOL_GPL(kick_process);
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
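/*
 * Pick a fallback runqueue for @p when its current CPU is not allowed or not
 * available: prefer an allowed, active CPU in the same NUMA node, then any
 * allowed CPU, then widen the affinity via the cpuset fallback and finally
 * to all possible CPUs.
 */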
2278static int select_fallback_rq(int cpu, struct task_struct *p)
2279{
2280 int nid = cpu_to_node(cpu);
2281 const struct cpumask *nodemask = NULL;
2282 enum { cpuset, possible, fail } state = cpuset;
2283 int dest_cpu;
2284
2285
2286
2287
2288
2289
2290 if (nid != -1) {
2291 nodemask = cpumask_of_node(nid);
2292
2293
2294 for_each_cpu(dest_cpu, nodemask) {
2295 if (!cpu_active(dest_cpu))
2296 continue;
2297 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
2298 return dest_cpu;
2299 }
2300 }
2301
2302 for (;;) {
2303
2304 for_each_cpu(dest_cpu, p->cpus_ptr) {
2305 if (!is_cpu_allowed(p, dest_cpu))
2306 continue;
2307
2308 goto out;
2309 }
2310
2311
2312 switch (state) {
2313 case cpuset:
2314 if (IS_ENABLED(CONFIG_CPUSETS)) {
2315 cpuset_cpus_allowed_fallback(p);
2316 state = possible;
2317 break;
2318 }
2319 fallthrough;
2320 case possible:
2321 do_set_cpus_allowed(p, cpu_possible_mask);
2322 state = fail;
2323 break;
2324
2325 case fail:
2326 BUG();
2327 break;
2328 }
2329 }
2330
2331out:
2332 if (state != cpuset) {
2333
2334
2335
2336
2337
2338 if (p->mm && printk_ratelimit()) {
2339 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
2340 task_pid_nr(p), p->comm, cpu);
2341 }
2342 }
2343
2344 return dest_cpu;
2345}
2346
2347
2348
2349
2350static inline
2351int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
2352{
2353 lockdep_assert_held(&p->pi_lock);
2354
2355 if (p->nr_cpus_allowed > 1)
2356 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
2357 else
2358 cpu = cpumask_any(p->cpus_ptr);
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370 if (unlikely(!is_cpu_allowed(p, cpu)))
2371 cpu = select_fallback_rq(task_cpu(p), p);
2372
2373 return cpu;
2374}
2375
2376void sched_set_stop_task(int cpu, struct task_struct *stop)
2377{
2378 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2379 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2380
2381 if (stop) {
2382
2383
2384
2385
2386
2387
2388
2389
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2391
2392 stop->sched_class = &stop_sched_class;
2393 }
2394
2395 cpu_rq(cpu)->stop = stop;
2396
2397 if (old_stop) {
2398
2399
2400
2401
2402 old_stop->sched_class = &rt_sched_class;
2403 }
2404}
2405
2406#else
2407
2408static inline int __set_cpus_allowed_ptr(struct task_struct *p,
2409 const struct cpumask *new_mask, bool check)
2410{
2411 return set_cpus_allowed_ptr(p, new_mask);
2412}
2413
2414#endif
2415
2416static void
2417ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2418{
2419 struct rq *rq;
2420
2421 if (!schedstat_enabled())
2422 return;
2423
2424 rq = this_rq();
2425
2426#ifdef CONFIG_SMP
2427 if (cpu == rq->cpu) {
2428 __schedstat_inc(rq->ttwu_local);
2429 __schedstat_inc(p->se.statistics.nr_wakeups_local);
2430 } else {
2431 struct sched_domain *sd;
2432
2433 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
2434 rcu_read_lock();
2435 for_each_domain(rq->cpu, sd) {
2436 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2437 __schedstat_inc(sd->ttwu_wake_remote);
2438 break;
2439 }
2440 }
2441 rcu_read_unlock();
2442 }
2443
2444 if (wake_flags & WF_MIGRATED)
2445 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
2446#endif
2447
2448 __schedstat_inc(rq->ttwu_count);
2449 __schedstat_inc(p->se.statistics.nr_wakeups);
2450
2451 if (wake_flags & WF_SYNC)
2452 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
2453}
2454
2455
2456
2457
2458static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
2459 struct rq_flags *rf)
2460{
2461 check_preempt_curr(rq, p, wake_flags);
2462 p->state = TASK_RUNNING;
2463 trace_sched_wakeup(p);
2464
2465#ifdef CONFIG_SMP
2466 if (p->sched_class->task_woken) {
2467
2468
2469
2470
2471 rq_unpin_lock(rq, rf);
2472 p->sched_class->task_woken(rq, p);
2473 rq_repin_lock(rq, rf);
2474 }
2475
2476 if (rq->idle_stamp) {
2477 u64 delta = rq_clock(rq) - rq->idle_stamp;
2478 u64 max = 2*rq->max_idle_balance_cost;
2479
2480 update_avg(&rq->avg_idle, delta);
2481
2482 if (rq->avg_idle > max)
2483 rq->avg_idle = max;
2484
2485 rq->idle_stamp = 0;
2486 }
2487#endif
2488}
2489
2490static void
2491ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
2492 struct rq_flags *rf)
2493{
2494 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
2495
2496 lockdep_assert_held(&rq->lock);
2497
2498 if (p->sched_contributes_to_load)
2499 rq->nr_uninterruptible--;
2500
2501#ifdef CONFIG_SMP
2502 if (wake_flags & WF_MIGRATED)
2503 en_flags |= ENQUEUE_MIGRATED;
2504 else
2505#endif
2506 if (p->in_iowait) {
2507 delayacct_blkio_end(p);
2508 atomic_dec(&task_rq(p)->nr_iowait);
2509 }
2510
2511 activate_task(rq, p, en_flags);
2512 ttwu_do_wakeup(rq, p, wake_flags, rf);
2513}
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540static int ttwu_runnable(struct task_struct *p, int wake_flags)
2541{
2542 struct rq_flags rf;
2543 struct rq *rq;
2544 int ret = 0;
2545
2546 rq = __task_rq_lock(p, &rf);
2547 if (task_on_rq_queued(p)) {
2548
2549 update_rq_clock(rq);
2550 ttwu_do_wakeup(rq, p, wake_flags, &rf);
2551 ret = 1;
2552 }
2553 __task_rq_unlock(rq, &rf);
2554
2555 return ret;
2556}
2557
2558#ifdef CONFIG_SMP
2559void sched_ttwu_pending(void *arg)
2560{
2561 struct llist_node *llist = arg;
2562 struct rq *rq = this_rq();
2563 struct task_struct *p, *t;
2564 struct rq_flags rf;
2565
2566 if (!llist)
2567 return;
2568
2569
2570
2571
2572
2573
2574 WRITE_ONCE(rq->ttwu_pending, 0);
2575
2576 rq_lock_irqsave(rq, &rf);
2577 update_rq_clock(rq);
2578
2579 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
2580 if (WARN_ON_ONCE(p->on_cpu))
2581 smp_cond_load_acquire(&p->on_cpu, !VAL);
2582
2583 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
2584 set_task_cpu(p, cpu_of(rq));
2585
2586 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2587 }
2588
2589 rq_unlock_irqrestore(rq, &rf);
2590}
2591
2592void send_call_function_single_ipi(int cpu)
2593{
2594 struct rq *rq = cpu_rq(cpu);
2595
2596 if (!set_nr_if_polling(rq->idle))
2597 arch_send_call_function_single_ipi(cpu);
2598 else
2599 trace_sched_wake_idle_without_ipi(cpu);
2600}
2601
2602
2603
2604
2605
2606
2607
2608static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2609{
2610 struct rq *rq = cpu_rq(cpu);
2611
2612 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
2613
2614 WRITE_ONCE(rq->ttwu_pending, 1);
2615 __smp_call_single_queue(cpu, &p->wake_entry.llist);
2616}
2617
2618void wake_up_if_idle(int cpu)
2619{
2620 struct rq *rq = cpu_rq(cpu);
2621 struct rq_flags rf;
2622
2623 rcu_read_lock();
2624
2625 if (!is_idle_task(rcu_dereference(rq->curr)))
2626 goto out;
2627
2628 if (set_nr_if_polling(rq->idle)) {
2629 trace_sched_wake_idle_without_ipi(cpu);
2630 } else {
2631 rq_lock_irqsave(rq, &rf);
2632 if (is_idle_task(rq->curr))
2633 smp_send_reschedule(cpu);
2634
2635 rq_unlock_irqrestore(rq, &rf);
2636 }
2637
2638out:
2639 rcu_read_unlock();
2640}
2641
2642bool cpus_share_cache(int this_cpu, int that_cpu)
2643{
2644 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
2645}
2646
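/*
 * Decide whether a wakeup should be queued on the target CPU's wake list and
 * finished there via an IPI, rather than taking the remote rq->lock here: do
 * so when the waker and wakee CPUs do not share a cache, or when the target
 * CPU is still descheduling the wakee and has (almost) nothing else to run.
 */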
2647static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2648{
2649
2650
2651
2652
2653 if (!cpus_share_cache(smp_processor_id(), cpu))
2654 return true;
2655
2656
2657
2658
2659
2660
2661
2662 if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
2663 return true;
2664
2665 return false;
2666}
2667
2668static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2669{
2670 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
2671 if (WARN_ON_ONCE(cpu == smp_processor_id()))
2672 return false;
2673
2674 sched_clock_cpu(cpu);
2675 __ttwu_queue_wakelist(p, cpu, wake_flags);
2676 return true;
2677 }
2678
2679 return false;
2680}
2681
2682#else
2683
2684static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2685{
2686 return false;
2687}
2688
2689#endif
2690
2691static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
2692{
2693 struct rq *rq = cpu_rq(cpu);
2694 struct rq_flags rf;
2695
2696 if (ttwu_queue_wakelist(p, cpu, wake_flags))
2697 return;
2698
2699 rq_lock(rq, &rf);
2700 update_rq_clock(rq);
2701 ttwu_do_activate(rq, p, wake_flags, &rf);
2702 rq_unlock(rq, &rf);
2703}
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
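/*
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Conceptually does:  if (@p->state & @state) @p->state = TASK_RUNNING.
 *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *	   %false otherwise.
 */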
2825static int
2826try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2827{
2828 unsigned long flags;
2829 int cpu, success = 0;
2830
2831 preempt_disable();
2832 if (p == current) {
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844 if (!(p->state & state))
2845 goto out;
2846
2847 success = 1;
2848 trace_sched_waking(p);
2849 p->state = TASK_RUNNING;
2850 trace_sched_wakeup(p);
2851 goto out;
2852 }
2853
2854
2855
2856
2857
2858
2859
2860 raw_spin_lock_irqsave(&p->pi_lock, flags);
2861 smp_mb__after_spinlock();
2862 if (!(p->state & state))
2863 goto unlock;
2864
2865 trace_sched_waking(p);
2866
2867
2868 success = 1;
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892 smp_rmb();
2893 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
2894 goto unlock;
2895
2896#ifdef CONFIG_SMP
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920 smp_acquire__after_ctrl_dep();
2921
2922
2923
2924
2925
2926
2927
2928 p->state = TASK_WAKING;
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949 if (smp_load_acquire(&p->on_cpu) &&
2950 ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
2951 goto unlock;
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962 smp_cond_load_acquire(&p->on_cpu, !VAL);
2963
2964 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2965 if (task_cpu(p) != cpu) {
2966 if (p->in_iowait) {
2967 delayacct_blkio_end(p);
2968 atomic_dec(&task_rq(p)->nr_iowait);
2969 }
2970
2971 wake_flags |= WF_MIGRATED;
2972 psi_ttwu_dequeue(p);
2973 set_task_cpu(p, cpu);
2974 }
2975#else
2976 cpu = task_cpu(p);
2977#endif
2978
2979 ttwu_queue(p, cpu, wake_flags);
2980unlock:
2981 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2982out:
2983 if (success)
2984 ttwu_stat(p, task_cpu(p), wake_flags);
2985 preempt_enable();
2986
2987 return success;
2988}
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3009{
3010 bool ret = false;
3011 struct rq_flags rf;
3012 struct rq *rq;
3013
3014 lockdep_assert_irqs_enabled();
3015 raw_spin_lock_irq(&p->pi_lock);
3016 if (p->on_rq) {
3017 rq = __task_rq_lock(p, &rf);
3018 if (task_rq(p) == rq)
3019 ret = func(p, arg);
3020 rq_unlock(rq, &rf);
3021 } else {
3022 switch (p->state) {
3023 case TASK_RUNNING:
3024 case TASK_WAKING:
3025 break;
3026 default:
3027 smp_rmb();
3028 if (!p->on_rq)
3029 ret = func(p, arg);
3030 }
3031 }
3032 raw_spin_unlock_irq(&p->pi_lock);
3033 return ret;
3034}
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047int wake_up_process(struct task_struct *p)
3048{
3049 return try_to_wake_up(p, TASK_NORMAL, 0);
3050}
3051EXPORT_SYMBOL(wake_up_process);
3052
3053int wake_up_state(struct task_struct *p, unsigned int state)
3054{
3055 return try_to_wake_up(p, state, 0);
3056}
3057
3058
3059
3060
3061
3062
3063
3064static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3065{
3066 p->on_rq = 0;
3067
3068 p->se.on_rq = 0;
3069 p->se.exec_start = 0;
3070 p->se.sum_exec_runtime = 0;
3071 p->se.prev_sum_exec_runtime = 0;
3072 p->se.nr_migrations = 0;
3073 p->se.vruntime = 0;
3074 INIT_LIST_HEAD(&p->se.group_node);
3075
3076#ifdef CONFIG_FAIR_GROUP_SCHED
3077 p->se.cfs_rq = NULL;
3078#endif
3079
3080#ifdef CONFIG_SCHEDSTATS
3081
3082 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
3083#endif
3084
3085 RB_CLEAR_NODE(&p->dl.rb_node);
3086 init_dl_task_timer(&p->dl);
3087 init_dl_inactive_task_timer(&p->dl);
3088 __dl_clear_params(p);
3089
3090 INIT_LIST_HEAD(&p->rt.run_list);
3091 p->rt.timeout = 0;
3092 p->rt.time_slice = sched_rr_timeslice;
3093 p->rt.on_rq = 0;
3094 p->rt.on_list = 0;
3095
3096#ifdef CONFIG_PREEMPT_NOTIFIERS
3097 INIT_HLIST_HEAD(&p->preempt_notifiers);
3098#endif
3099
3100#ifdef CONFIG_COMPACTION
3101 p->capture_control = NULL;
3102#endif
3103 init_numa_balancing(clone_flags, p);
3104#ifdef CONFIG_SMP
3105 p->wake_entry.u_flags = CSD_TYPE_TTWU;
3106#endif
3107}
3108
3109DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3110
3111#ifdef CONFIG_NUMA_BALANCING
3112
3113void set_numabalancing_state(bool enabled)
3114{
3115 if (enabled)
3116 static_branch_enable(&sched_numa_balancing);
3117 else
3118 static_branch_disable(&sched_numa_balancing);
3119}
3120
3121#ifdef CONFIG_PROC_SYSCTL
3122int sysctl_numa_balancing(struct ctl_table *table, int write,
3123 void *buffer, size_t *lenp, loff_t *ppos)
3124{
3125 struct ctl_table t;
3126 int err;
3127 int state = static_branch_likely(&sched_numa_balancing);
3128
3129 if (write && !capable(CAP_SYS_ADMIN))
3130 return -EPERM;
3131
3132 t = *table;
3133 t.data = &state;
3134 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3135 if (err < 0)
3136 return err;
3137 if (write)
3138 set_numabalancing_state(state);
3139 return err;
3140}
3141#endif
3142#endif
3143
3144#ifdef CONFIG_SCHEDSTATS
3145
3146DEFINE_STATIC_KEY_FALSE(sched_schedstats);
3147static bool __initdata __sched_schedstats = false;
3148
3149static void set_schedstats(bool enabled)
3150{
3151 if (enabled)
3152 static_branch_enable(&sched_schedstats);
3153 else
3154 static_branch_disable(&sched_schedstats);
3155}
3156
3157void force_schedstat_enabled(void)
3158{
3159 if (!schedstat_enabled()) {
3160 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
3161 static_branch_enable(&sched_schedstats);
3162 }
3163}
3164
3165static int __init setup_schedstats(char *str)
3166{
3167 int ret = 0;
3168 if (!str)
3169 goto out;
3170
3171
3172
3173
3174
3175
3176 if (!strcmp(str, "enable")) {
3177 __sched_schedstats = true;
3178 ret = 1;
3179 } else if (!strcmp(str, "disable")) {
3180 __sched_schedstats = false;
3181 ret = 1;
3182 }
3183out:
3184 if (!ret)
3185 pr_warn("Unable to parse schedstats=\n");
3186
3187 return ret;
3188}
3189__setup("schedstats=", setup_schedstats);
3190
3191static void __init init_schedstats(void)
3192{
3193 set_schedstats(__sched_schedstats);
3194}
3195
3196#ifdef CONFIG_PROC_SYSCTL
3197int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3198 size_t *lenp, loff_t *ppos)
3199{
3200 struct ctl_table t;
3201 int err;
3202 int state = static_branch_likely(&sched_schedstats);
3203
3204 if (write && !capable(CAP_SYS_ADMIN))
3205 return -EPERM;
3206
3207 t = *table;
3208 t.data = &state;
3209 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3210 if (err < 0)
3211 return err;
3212 if (write)
3213 set_schedstats(state);
3214 return err;
3215}
3216#endif
3217#else
3218static inline void init_schedstats(void) {}
3219#endif
3220
3221
3222
3223
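/*
 * fork()/clone()-time setup:
 */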
3224int sched_fork(unsigned long clone_flags, struct task_struct *p)
3225{
3226 unsigned long flags;
3227
3228 __sched_fork(clone_flags, p);
3229
3230
3231
3232
3233
3234 p->state = TASK_NEW;
3235
3236
3237
3238
3239 p->prio = current->normal_prio;
3240
3241 uclamp_fork(p);
3242
3243
3244
3245
3246 if (unlikely(p->sched_reset_on_fork)) {
3247 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3248 p->policy = SCHED_NORMAL;
3249 p->static_prio = NICE_TO_PRIO(0);
3250 p->rt_priority = 0;
3251 } else if (PRIO_TO_NICE(p->static_prio) < 0)
3252 p->static_prio = NICE_TO_PRIO(0);
3253
3254 p->prio = p->normal_prio = __normal_prio(p);
3255 set_load_weight(p, false);
3256
3257
3258
3259
3260
3261 p->sched_reset_on_fork = 0;
3262 }
3263
3264 if (dl_prio(p->prio))
3265 return -EAGAIN;
3266 else if (rt_prio(p->prio))
3267 p->sched_class = &rt_sched_class;
3268 else
3269 p->sched_class = &fair_sched_class;
3270
3271 init_entity_runnable_average(&p->se);
3272
3273
3274
3275
3276
3277
3278
3279
3280 raw_spin_lock_irqsave(&p->pi_lock, flags);
3281 rseq_migrate(p);
3282
3283
3284
3285
3286 __set_task_cpu(p, smp_processor_id());
3287 if (p->sched_class->task_fork)
3288 p->sched_class->task_fork(p);
3289 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3290
3291#ifdef CONFIG_SCHED_INFO
3292 if (likely(sched_info_on()))
3293 memset(&p->sched_info, 0, sizeof(p->sched_info));
3294#endif
3295#if defined(CONFIG_SMP)
3296 p->on_cpu = 0;
3297#endif
3298 init_task_preempt_count(p);
3299#ifdef CONFIG_SMP
3300 plist_node_init(&p->pushable_tasks, MAX_PRIO);
3301 RB_CLEAR_NODE(&p->pushable_dl_tasks);
3302#endif
3303 return 0;
3304}
3305
3306void sched_post_fork(struct task_struct *p)
3307{
3308 uclamp_post_fork(p);
3309}
3310
3311unsigned long to_ratio(u64 period, u64 runtime)
3312{
3313 if (runtime == RUNTIME_INF)
3314 return BW_UNIT;
3315
3316
3317
3318
3319
3320
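/*
 * Doing the check here saves a lot of checks in the calling paths:
 * a zero period is mapped to zero bandwidth instead of causing a
 * division by zero below.
 */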
3321 if (period == 0)
3322 return 0;
3323
3324 return div64_u64(runtime << BW_SHIFT, period);
3325}
3326
3327
3328
3329
3330
3331
3332
3333
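/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */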
3334void wake_up_new_task(struct task_struct *p)
3335{
3336 struct rq_flags rf;
3337 struct rq *rq;
3338
3339 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3340 p->state = TASK_RUNNING;
3341#ifdef CONFIG_SMP
3342
3343
3344
3345
3346
3347
3348
3349
3350 p->recent_used_cpu = task_cpu(p);
3351 rseq_migrate(p);
3352 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
3353#endif
3354 rq = __task_rq_lock(p, &rf);
3355 update_rq_clock(rq);
3356 post_init_entity_util_avg(p);
3357
3358 activate_task(rq, p, ENQUEUE_NOCLOCK);
3359 trace_sched_wakeup_new(p);
3360 check_preempt_curr(rq, p, WF_FORK);
3361#ifdef CONFIG_SMP
3362 if (p->sched_class->task_woken) {
3363
3364
3365
3366
3367 rq_unpin_lock(rq, &rf);
3368 p->sched_class->task_woken(rq, p);
3369 rq_repin_lock(rq, &rf);
3370 }
3371#endif
3372 task_rq_unlock(rq, p, &rf);
3373}
3374
3375#ifdef CONFIG_PREEMPT_NOTIFIERS
3376
3377static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
3378
3379void preempt_notifier_inc(void)
3380{
3381 static_branch_inc(&preempt_notifier_key);
3382}
3383EXPORT_SYMBOL_GPL(preempt_notifier_inc);
3384
3385void preempt_notifier_dec(void)
3386{
3387 static_branch_dec(&preempt_notifier_key);
3388}
3389EXPORT_SYMBOL_GPL(preempt_notifier_dec);
3390
3391
3392
3393
3394
3395void preempt_notifier_register(struct preempt_notifier *notifier)
3396{
3397 if (!static_branch_unlikely(&preempt_notifier_key))
3398 WARN(1, "registering preempt_notifier while notifiers disabled\n");
3399
3400 hlist_add_head(&notifier->link, &current->preempt_notifiers);
3401}
3402EXPORT_SYMBOL_GPL(preempt_notifier_register);
3403
3404
3405
3406
3407
3408
3409
3410void preempt_notifier_unregister(struct preempt_notifier *notifier)
3411{
3412 hlist_del(&notifier->link);
3413}
3414EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3415
3416static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
3417{
3418 struct preempt_notifier *notifier;
3419
3420 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3421 notifier->ops->sched_in(notifier, raw_smp_processor_id());
3422}
3423
3424static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3425{
3426 if (static_branch_unlikely(&preempt_notifier_key))
3427 __fire_sched_in_preempt_notifiers(curr);
3428}
3429
3430static void
3431__fire_sched_out_preempt_notifiers(struct task_struct *curr,
3432 struct task_struct *next)
3433{
3434 struct preempt_notifier *notifier;
3435
3436 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3437 notifier->ops->sched_out(notifier, next);
3438}
3439
3440static __always_inline void
3441fire_sched_out_preempt_notifiers(struct task_struct *curr,
3442 struct task_struct *next)
3443{
3444 if (static_branch_unlikely(&preempt_notifier_key))
3445 __fire_sched_out_preempt_notifiers(curr, next);
3446}
3447
3448#else
3449
3450static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3451{
3452}
3453
3454static inline void
3455fire_sched_out_preempt_notifiers(struct task_struct *curr,
3456 struct task_struct *next)
3457{
3458}
3459
3460#endif
3461
3462static inline void prepare_task(struct task_struct *next)
3463{
3464#ifdef CONFIG_SMP
3465
3466
3467
3468
3469
3470
3471 WRITE_ONCE(next->on_cpu, 1);
3472#endif
3473}
3474
3475static inline void finish_task(struct task_struct *prev)
3476{
3477#ifdef CONFIG_SMP
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489 smp_store_release(&prev->on_cpu, 0);
3490#endif
3491}
3492
3493static inline void
3494prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
3495{
3496
3497
3498
3499
3500
3501
3502 rq_unpin_lock(rq, rf);
3503 spin_release(&rq->lock.dep_map, _THIS_IP_);
3504#ifdef CONFIG_DEBUG_SPINLOCK
3505
3506 rq->lock.owner = next;
3507#endif
3508}
3509
3510static inline void finish_lock_switch(struct rq *rq)
3511{
3512
3513
3514
3515
3516
3517 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
3518 raw_spin_unlock_irq(&rq->lock);
3519}
3520
3521
3522
3523
3524
3525#ifndef prepare_arch_switch
3526# define prepare_arch_switch(next) do { } while (0)
3527#endif
3528
3529#ifndef finish_arch_post_lock_switch
3530# define finish_arch_post_lock_switch() do { } while (0)
3531#endif
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
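/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch() after the context
 * switch.
 */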
3546static inline void
3547prepare_task_switch(struct rq *rq, struct task_struct *prev,
3548 struct task_struct *next)
3549{
3550 kcov_prepare_switch(prev);
3551 sched_info_switch(rq, prev, next);
3552 perf_event_task_sched_out(prev, next);
3553 rseq_preempt(prev);
3554 fire_sched_out_preempt_notifiers(prev, next);
3555 prepare_task(next);
3556 prepare_arch_switch(next);
3557}
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
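/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * Called with interrupts off, right after the context switch. It
 * releases the runqueue lock, drops the mm reference that was held for
 * a lazy (kernel-thread) switch, and frees the stack and task_struct of
 * tasks that have exited. Returns the runqueue of this CPU.
 */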
3578static struct rq *finish_task_switch(struct task_struct *prev)
3579 __releases(rq->lock)
3580{
3581 struct rq *rq = this_rq();
3582 struct mm_struct *mm = rq->prev_mm;
3583 long prev_state;
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
3597 "corrupted preempt_count: %s/%d/0x%x\n",
3598 current->comm, current->pid, preempt_count()))
3599 preempt_count_set(FORK_PREEMPT_COUNT);
3600
3601 rq->prev_mm = NULL;
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614 prev_state = prev->state;
3615 vtime_task_switch(prev);
3616 perf_event_task_sched_in(prev, current);
3617 finish_task(prev);
3618 finish_lock_switch(rq);
3619 finish_arch_post_lock_switch();
3620 kcov_finish_switch(current);
3621
3622 fire_sched_in_preempt_notifiers(current);
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635 if (mm) {
3636 membarrier_mm_sync_core_before_usermode(mm);
3637 mmdrop(mm);
3638 }
3639 if (unlikely(prev_state == TASK_DEAD)) {
3640 if (prev->sched_class->task_dead)
3641 prev->sched_class->task_dead(prev);
3642
3643
3644
3645
3646
3647 kprobe_flush_task(prev);
3648
3649
3650 put_task_stack(prev);
3651
3652 put_task_struct_rcu_user(prev);
3653 }
3654
3655 tick_nohz_task_switch();
3656 return rq;
3657}
3658
3659#ifdef CONFIG_SMP
3660
3661
3662static void __balance_callback(struct rq *rq)
3663{
3664 struct callback_head *head, *next;
3665 void (*func)(struct rq *rq);
3666 unsigned long flags;
3667
3668 raw_spin_lock_irqsave(&rq->lock, flags);
3669 head = rq->balance_callback;
3670 rq->balance_callback = NULL;
3671 while (head) {
3672 func = (void (*)(struct rq *))head->func;
3673 next = head->next;
3674 head->next = NULL;
3675 head = next;
3676
3677 func(rq);
3678 }
3679 raw_spin_unlock_irqrestore(&rq->lock, flags);
3680}
3681
3682static inline void balance_callback(struct rq *rq)
3683{
3684 if (unlikely(rq->balance_callback))
3685 __balance_callback(rq);
3686}
3687
3688#else
3689
3690static inline void balance_callback(struct rq *rq)
3691{
3692}
3693
3694#endif
3695
3696
3697
3698
3699
3700asmlinkage __visible void schedule_tail(struct task_struct *prev)
3701 __releases(rq->lock)
3702{
3703 struct rq *rq;
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714 rq = finish_task_switch(prev);
3715 balance_callback(rq);
3716 preempt_enable();
3717
3718 if (current->set_child_tid)
3719 put_user(task_pid_vnr(current), current->set_child_tid);
3720
3721 calculate_sigpending();
3722}
3723
3724
3725
3726
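/*
 * context_switch - switch to the new MM and the new thread's register state.
 */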
3727static __always_inline struct rq *
3728context_switch(struct rq *rq, struct task_struct *prev,
3729 struct task_struct *next, struct rq_flags *rf)
3730{
3731 prepare_task_switch(rq, prev, next);
3732
3733
3734
3735
3736
3737
3738 arch_start_context_switch(prev);
3739
3740
3741
3742
3743
3744
3745
3746
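/*
 * kernel -> kernel   lazy + transfer active
 *   user -> kernel   lazy + mmgrab() active
 *
 * kernel ->   user   switch + mmdrop() active
 *   user ->   user   switch
 */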
3747 if (!next->mm) {
3748 enter_lazy_tlb(prev->active_mm, next);
3749
3750 next->active_mm = prev->active_mm;
3751 if (prev->mm)
3752 mmgrab(prev->active_mm);
3753 else
3754 prev->active_mm = NULL;
3755 } else {
3756 membarrier_switch_mm(rq, prev->active_mm, next->mm);
3757
3758
3759
3760
3761
3762
3763
3764
3765 switch_mm_irqs_off(prev->active_mm, next->mm, next);
3766
3767 if (!prev->mm) {
3768
3769 rq->prev_mm = prev->active_mm;
3770 prev->active_mm = NULL;
3771 }
3772 }
3773
3774 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3775
3776 prepare_lock_switch(rq, next, rf);
3777
3778
3779 switch_to(prev, next, prev);
3780 barrier();
3781
3782 return finish_task_switch(prev);
3783}
3784
3785
3786
3787
3788
3789
3790
3791unsigned long nr_running(void)
3792{
3793 unsigned long i, sum = 0;
3794
3795 for_each_online_cpu(i)
3796 sum += cpu_rq(i)->nr_running;
3797
3798 return sum;
3799}
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814bool single_task_running(void)
3815{
3816 return raw_rq()->nr_running == 1;
3817}
3818EXPORT_SYMBOL(single_task_running);
3819
3820unsigned long long nr_context_switches(void)
3821{
3822 int i;
3823 unsigned long long sum = 0;
3824
3825 for_each_possible_cpu(i)
3826 sum += cpu_rq(i)->nr_switches;
3827
3828 return sum;
3829}
3830
3831
3832
3833
3834
3835
3836
3837
3838unsigned long nr_iowait_cpu(int cpu)
3839{
3840 return atomic_read(&cpu_rq(cpu)->nr_iowait);
3841}
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
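/*
 * Total number of tasks currently in iowait, summed over all possible
 * CPUs. The per-CPU counters are read without synchronization, so the
 * result is only a rough, racy statistic.
 */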
3873unsigned long nr_iowait(void)
3874{
3875 unsigned long i, sum = 0;
3876
3877 for_each_possible_cpu(i)
3878 sum += nr_iowait_cpu(i);
3879
3880 return sum;
3881}
3882
3883#ifdef CONFIG_SMP
3884
3885
3886
3887
3888
3889void sched_exec(void)
3890{
3891 struct task_struct *p = current;
3892 unsigned long flags;
3893 int dest_cpu;
3894
3895 raw_spin_lock_irqsave(&p->pi_lock, flags);
3896 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
3897 if (dest_cpu == smp_processor_id())
3898 goto unlock;
3899
3900 if (likely(cpu_active(dest_cpu))) {
3901 struct migration_arg arg = { p, dest_cpu };
3902
3903 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3904 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3905 return;
3906 }
3907unlock:
3908 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3909}
3910
3911#endif
3912
3913DEFINE_PER_CPU(struct kernel_stat, kstat);
3914DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3915
3916EXPORT_PER_CPU_SYMBOL(kstat);
3917EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3918
3919
3920
3921
3922
3923
3924
3925static inline void prefetch_curr_exec_start(struct task_struct *p)
3926{
3927#ifdef CONFIG_FAIR_GROUP_SCHED
3928 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3929#else
3930 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3931#endif
3932 prefetch(curr);
3933 prefetch(&curr->exec_start);
3934}
3935
3936
3937
3938
3939
3940
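/*
 * Return accumulated execution time for the task. If the task is
 * currently running, its runtime is updated first so the value is as
 * up to date as possible.
 */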
3941unsigned long long task_sched_runtime(struct task_struct *p)
3942{
3943 struct rq_flags rf;
3944 struct rq *rq;
3945 u64 ns;
3946
3947#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959 if (!p->on_cpu || !task_on_rq_queued(p))
3960 return p->se.sum_exec_runtime;
3961#endif
3962
3963 rq = task_rq_lock(p, &rf);
3964
3965
3966
3967
3968
3969 if (task_current(rq, p) && task_on_rq_queued(p)) {
3970 prefetch_curr_exec_start(p);
3971 update_rq_clock(rq);
3972 p->sched_class->update_curr(rq);
3973 }
3974 ns = p->se.sum_exec_runtime;
3975 task_rq_unlock(rq, p, &rf);
3976
3977 return ns;
3978}
3979
3980
3981
3982
3983
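/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */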
3984void scheduler_tick(void)
3985{
3986 int cpu = smp_processor_id();
3987 struct rq *rq = cpu_rq(cpu);
3988 struct task_struct *curr = rq->curr;
3989 struct rq_flags rf;
3990 unsigned long thermal_pressure;
3991
3992 arch_scale_freq_tick();
3993 sched_clock_tick();
3994
3995 rq_lock(rq, &rf);
3996
3997 update_rq_clock(rq);
3998 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
3999 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
4000 curr->sched_class->task_tick(rq, curr, 0);
4001 calc_global_load_tick(rq);
4002 psi_task_tick(rq);
4003
4004 rq_unlock(rq, &rf);
4005
4006 perf_event_task_tick();
4007
4008#ifdef CONFIG_SMP
4009 rq->idle_balance = idle_cpu(cpu);
4010 trigger_load_balance(rq);
4011#endif
4012}
4013
4014#ifdef CONFIG_NO_HZ_FULL
4015
4016struct tick_work {
4017 int cpu;
4018 atomic_t state;
4019 struct delayed_work work;
4020};
4021
4022#define TICK_SCHED_REMOTE_OFFLINE 0
4023#define TICK_SCHED_REMOTE_OFFLINING 1
4024#define TICK_SCHED_REMOTE_RUNNING 2
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049static struct tick_work __percpu *tick_work_cpu;
4050
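/*
 * sched_tick_remote() runs from a workqueue and provides a periodic tick
 * on behalf of a nohz_full CPU whose own tick is stopped. The work
 * requeues itself roughly once per second for as long as the per-CPU
 * state remains TICK_SCHED_REMOTE_RUNNING; the tick body itself is
 * skipped if the remote tick is not actually stopped or the CPU has
 * gone offline.
 */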
4051static void sched_tick_remote(struct work_struct *work)
4052{
4053 struct delayed_work *dwork = to_delayed_work(work);
4054 struct tick_work *twork = container_of(dwork, struct tick_work, work);
4055 int cpu = twork->cpu;
4056 struct rq *rq = cpu_rq(cpu);
4057 struct task_struct *curr;
4058 struct rq_flags rf;
4059 u64 delta;
4060 int os;
4061
4062
4063
4064
4065
4066
4067
4068
4069 if (!tick_nohz_tick_stopped_cpu(cpu))
4070 goto out_requeue;
4071
4072 rq_lock_irq(rq, &rf);
4073 curr = rq->curr;
4074 if (cpu_is_offline(cpu))
4075 goto out_unlock;
4076
4077 update_rq_clock(rq);
4078
4079 if (!is_idle_task(curr)) {
4080
4081
4082
4083
4084 delta = rq_clock_task(rq) - curr->se.exec_start;
4085 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4086 }
4087 curr->sched_class->task_tick(rq, curr, 0);
4088
4089 calc_load_nohz_remote(rq);
4090out_unlock:
4091 rq_unlock_irq(rq, &rf);
4092out_requeue:
4093
4094
4095
4096
4097
4098
4099
4100 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
4101 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
4102 if (os == TICK_SCHED_REMOTE_RUNNING)
4103 queue_delayed_work(system_unbound_wq, dwork, HZ);
4104}
4105
4106static void sched_tick_start(int cpu)
4107{
4108 int os;
4109 struct tick_work *twork;
4110
4111 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4112 return;
4113
4114 WARN_ON_ONCE(!tick_work_cpu);
4115
4116 twork = per_cpu_ptr(tick_work_cpu, cpu);
4117 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4118 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4119 if (os == TICK_SCHED_REMOTE_OFFLINE) {
4120 twork->cpu = cpu;
4121 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4122 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4123 }
4124}
4125
4126#ifdef CONFIG_HOTPLUG_CPU
4127static void sched_tick_stop(int cpu)
4128{
4129 struct tick_work *twork;
4130 int os;
4131
4132 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4133 return;
4134
4135 WARN_ON_ONCE(!tick_work_cpu);
4136
4137 twork = per_cpu_ptr(tick_work_cpu, cpu);
4138
4139 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
4140 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
4141
4142}
4143#endif
4144
4145int __init sched_tick_offload_init(void)
4146{
4147 tick_work_cpu = alloc_percpu(struct tick_work);
4148 BUG_ON(!tick_work_cpu);
4149 return 0;
4150}
4151
4152#else
4153static inline void sched_tick_start(int cpu) { }
4154static inline void sched_tick_stop(int cpu) { }
4155#endif
4156
4157#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
4158 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
4159
4160
4161
4162
4163static inline void preempt_latency_start(int val)
4164{
4165 if (preempt_count() == val) {
4166 unsigned long ip = get_lock_parent_ip();
4167#ifdef CONFIG_DEBUG_PREEMPT
4168 current->preempt_disable_ip = ip;
4169#endif
4170 trace_preempt_off(CALLER_ADDR0, ip);
4171 }
4172}
4173
4174void preempt_count_add(int val)
4175{
4176#ifdef CONFIG_DEBUG_PREEMPT
4177
4178
4179
4180 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4181 return;
4182#endif
4183 __preempt_count_add(val);
4184#ifdef CONFIG_DEBUG_PREEMPT
4185
4186
4187
4188 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4189 PREEMPT_MASK - 10);
4190#endif
4191 preempt_latency_start(val);
4192}
4193EXPORT_SYMBOL(preempt_count_add);
4194NOKPROBE_SYMBOL(preempt_count_add);
4195
4196
4197
4198
4199
4200static inline void preempt_latency_stop(int val)
4201{
4202 if (preempt_count() == val)
4203 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
4204}
4205
4206void preempt_count_sub(int val)
4207{
4208#ifdef CONFIG_DEBUG_PREEMPT
4209
4210
4211
4212 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4213 return;
4214
4215
4216
4217 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4218 !(preempt_count() & PREEMPT_MASK)))
4219 return;
4220#endif
4221
4222 preempt_latency_stop(val);
4223 __preempt_count_sub(val);
4224}
4225EXPORT_SYMBOL(preempt_count_sub);
4226NOKPROBE_SYMBOL(preempt_count_sub);
4227
4228#else
4229static inline void preempt_latency_start(int val) { }
4230static inline void preempt_latency_stop(int val) { }
4231#endif
4232
4233static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
4234{
4235#ifdef CONFIG_DEBUG_PREEMPT
4236 return p->preempt_disable_ip;
4237#else
4238 return 0;
4239#endif
4240}
4241
4242
4243
4244
4245static noinline void __schedule_bug(struct task_struct *prev)
4246{
4247
4248 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
4249
4250 if (oops_in_progress)
4251 return;
4252
4253 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4254 prev->comm, prev->pid, preempt_count());
4255
4256 debug_show_held_locks(prev);
4257 print_modules();
4258 if (irqs_disabled())
4259 print_irqtrace_events(prev);
4260 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
4261 && in_atomic_preempt_off()) {
4262 pr_err("Preemption disabled at:");
4263 print_ip_sym(KERN_ERR, preempt_disable_ip);
4264 }
4265 if (panic_on_warn)
4266 panic("scheduling while atomic\n");
4267
4268 dump_stack();
4269 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4270}
4271
4272
4273
4274
4275static inline void schedule_debug(struct task_struct *prev, bool preempt)
4276{
4277#ifdef CONFIG_SCHED_STACK_END_CHECK
4278 if (task_stack_end_corrupted(prev))
4279 panic("corrupted stack end detected inside scheduler\n");
4280
4281 if (task_scs_end_corrupted(prev))
4282 panic("corrupted shadow stack detected inside scheduler\n");
4283#endif
4284
4285#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4286 if (!preempt && prev->state && prev->non_block_count) {
4287 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4288 prev->comm, prev->pid, prev->non_block_count);
4289 dump_stack();
4290 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4291 }
4292#endif
4293
4294 if (unlikely(in_atomic_preempt_off())) {
4295 __schedule_bug(prev);
4296 preempt_count_set(PREEMPT_DISABLED);
4297 }
4298 rcu_sleep_check();
4299
4300 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4301
4302 schedstat_inc(this_rq()->sched_count);
4303}
4304
4305static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4306 struct rq_flags *rf)
4307{
4308#ifdef CONFIG_SMP
4309 const struct sched_class *class;
4310
4311
4312
4313
4314
4315
4316
4317
4318 for_class_range(class, prev->sched_class, &idle_sched_class) {
4319 if (class->balance(rq, prev, rf))
4320 break;
4321 }
4322#endif
4323
4324 put_prev_task(rq, prev);
4325}
4326
4327
4328
4329
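/*
 * Pick up the highest-priority task. The common case is optimized:
 * when the previous task is of the fair class or below and every
 * runnable task is in the CFS runqueue, call pick_next_task_fair()
 * directly instead of iterating over all scheduling classes.
 */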
4330static inline struct task_struct *
4331pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
4332{
4333 const struct sched_class *class;
4334 struct task_struct *p;
4335
4336
4337
4338
4339
4340
4341
4342 if (likely(prev->sched_class <= &fair_sched_class &&
4343 rq->nr_running == rq->cfs.h_nr_running)) {
4344
4345 p = pick_next_task_fair(rq, prev, rf);
4346 if (unlikely(p == RETRY_TASK))
4347 goto restart;
4348
4349
4350 if (!p) {
4351 put_prev_task(rq, prev);
4352 p = pick_next_task_idle(rq);
4353 }
4354
4355 return p;
4356 }
4357
4358restart:
4359 put_prev_task_balance(rq, prev, rf);
4360
4361 for_each_class(class) {
4362 p = class->pick_next_task(rq);
4363 if (p)
4364 return p;
4365 }
4366
4367
4368 BUG();
4369}
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
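/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths, and acted upon by calling schedule() or one of the
 *      preempt_schedule*() variants.
 *
 *   3. Wakeups don't really cause entry into schedule(); they add a task
 *      to the run-queue and set TIF_NEED_RESCHED if preemption is due.
 *
 * WARNING: must be called with preemption disabled!
 */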
4410static void __sched notrace __schedule(bool preempt)
4411{
4412 struct task_struct *prev, *next;
4413 unsigned long *switch_count;
4414 unsigned long prev_state;
4415 struct rq_flags rf;
4416 struct rq *rq;
4417 int cpu;
4418
4419 cpu = smp_processor_id();
4420 rq = cpu_rq(cpu);
4421 prev = rq->curr;
4422
4423 schedule_debug(prev, preempt);
4424
4425 if (sched_feat(HRTICK))
4426 hrtick_clear(rq);
4427
4428 local_irq_disable();
4429 rcu_note_context_switch(preempt);
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446 rq_lock(rq, &rf);
4447 smp_mb__after_spinlock();
4448
4449
4450 rq->clock_update_flags <<= 1;
4451 update_rq_clock(rq);
4452
4453 switch_count = &prev->nivcsw;
4454
4455
4456
4457
4458
4459
4460
4461
4462 prev_state = prev->state;
4463 if (!preempt && prev_state) {
4464 if (signal_pending_state(prev_state, prev)) {
4465 prev->state = TASK_RUNNING;
4466 } else {
4467 prev->sched_contributes_to_load =
4468 (prev_state & TASK_UNINTERRUPTIBLE) &&
4469 !(prev_state & TASK_NOLOAD) &&
4470 !(prev->flags & PF_FROZEN);
4471
4472 if (prev->sched_contributes_to_load)
4473 rq->nr_uninterruptible++;
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4487
4488 if (prev->in_iowait) {
4489 atomic_inc(&rq->nr_iowait);
4490 delayacct_blkio_start();
4491 }
4492 }
4493 switch_count = &prev->nvcsw;
4494 }
4495
4496 next = pick_next_task(rq, prev, &rf);
4497 clear_tsk_need_resched(prev);
4498 clear_preempt_need_resched();
4499
4500 if (likely(prev != next)) {
4501 rq->nr_switches++;
4502
4503
4504
4505
4506 RCU_INIT_POINTER(rq->curr, next);
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521 ++*switch_count;
4522
4523 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
4524
4525 trace_sched_switch(preempt, prev, next);
4526
4527
4528 rq = context_switch(rq, prev, next, &rf);
4529 } else {
4530 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4531 rq_unlock_irq(rq, &rf);
4532 }
4533
4534 balance_callback(rq);
4535}
4536
4537void __noreturn do_task_dead(void)
4538{
4539
4540 set_special_state(TASK_DEAD);
4541
4542
4543 current->flags |= PF_NOFREEZE;
4544
4545 __schedule(false);
4546 BUG();
4547
4548
4549 for (;;)
4550 cpu_relax();
4551}
4552
4553static inline void sched_submit_work(struct task_struct *tsk)
4554{
4555 unsigned int task_flags;
4556
4557 if (!tsk->state)
4558 return;
4559
4560 task_flags = tsk->flags;
4561
4562
4563
4564
4565
4566
4567
4568
4569 if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4570 preempt_disable();
4571 if (task_flags & PF_WQ_WORKER)
4572 wq_worker_sleeping(tsk);
4573 else
4574 io_wq_worker_sleeping(tsk);
4575 preempt_enable_no_resched();
4576 }
4577
4578 if (tsk_is_pi_blocked(tsk))
4579 return;
4580
4581
4582
4583
4584
4585 if (blk_needs_flush_plug(tsk))
4586 blk_schedule_flush_plug(tsk);
4587}
4588
4589static void sched_update_worker(struct task_struct *tsk)
4590{
4591 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4592 if (tsk->flags & PF_WQ_WORKER)
4593 wq_worker_running(tsk);
4594 else
4595 io_wq_worker_running(tsk);
4596 }
4597}
4598
4599asmlinkage __visible void __sched schedule(void)
4600{
4601 struct task_struct *tsk = current;
4602
4603 sched_submit_work(tsk);
4604 do {
4605 preempt_disable();
4606 __schedule(false);
4607 sched_preempt_enable_no_resched();
4608 } while (need_resched());
4609 sched_update_worker(tsk);
4610}
4611EXPORT_SYMBOL(schedule);
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623void __sched schedule_idle(void)
4624{
4625
4626
4627
4628
4629
4630
4631
4632 WARN_ON_ONCE(current->state);
4633 do {
4634 __schedule(false);
4635 } while (need_resched());
4636}
4637
4638#ifdef CONFIG_CONTEXT_TRACKING
4639asmlinkage __visible void __sched schedule_user(void)
4640{
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651 enum ctx_state prev_state = exception_enter();
4652 schedule();
4653 exception_exit(prev_state);
4654}
4655#endif
4656
4657
4658
4659
4660
4661
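/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Re-enables preemption (without rescheduling), calls schedule(), and
 * returns with preemption disabled again.
 */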
4662void __sched schedule_preempt_disabled(void)
4663{
4664 sched_preempt_enable_no_resched();
4665 schedule();
4666 preempt_disable();
4667}
4668
4669static void __sched notrace preempt_schedule_common(void)
4670{
4671 do {
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685 preempt_disable_notrace();
4686 preempt_latency_start(1);
4687 __schedule(true);
4688 preempt_latency_stop(1);
4689 preempt_enable_no_resched_notrace();
4690
4691
4692
4693
4694
4695 } while (need_resched());
4696}
4697
4698#ifdef CONFIG_PREEMPTION
4699
4700
4701
4702
4703asmlinkage __visible void __sched notrace preempt_schedule(void)
4704{
4705
4706
4707
4708
4709 if (likely(!preemptible()))
4710 return;
4711
4712 preempt_schedule_common();
4713}
4714NOKPROBE_SYMBOL(preempt_schedule);
4715EXPORT_SYMBOL(preempt_schedule);
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
4732{
4733 enum ctx_state prev_ctx;
4734
4735 if (likely(!preemptible()))
4736 return;
4737
4738 do {
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752 preempt_disable_notrace();
4753 preempt_latency_start(1);
4754
4755
4756
4757
4758
4759 prev_ctx = exception_enter();
4760 __schedule(true);
4761 exception_exit(prev_ctx);
4762
4763 preempt_latency_stop(1);
4764 preempt_enable_no_resched_notrace();
4765 } while (need_resched());
4766}
4767EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
4768
4769#endif
4770
4771
4772
4773
4774
4775
4776
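/*
 * This is the entry point to schedule() from kernel preemption off of
 * IRQ context. Note that it is called and returns with IRQs disabled;
 * interrupts are only re-enabled around the __schedule() call itself.
 */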
4777asmlinkage __visible void __sched preempt_schedule_irq(void)
4778{
4779 enum ctx_state prev_state;
4780
4781
4782 BUG_ON(preempt_count() || !irqs_disabled());
4783
4784 prev_state = exception_enter();
4785
4786 do {
4787 preempt_disable();
4788 local_irq_enable();
4789 __schedule(true);
4790 local_irq_disable();
4791 sched_preempt_enable_no_resched();
4792 } while (need_resched());
4793
4794 exception_exit(prev_state);
4795}
4796
4797int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
4798 void *key)
4799{
4800 WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
4801 return try_to_wake_up(curr->private, mode, wake_flags);
4802}
4803EXPORT_SYMBOL(default_wake_function);
4804
4805#ifdef CONFIG_RT_MUTEXES
4806
4807static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
4808{
4809 if (pi_task)
4810 prio = min(prio, pi_task->prio);
4811
4812 return prio;
4813}
4814
4815static inline int rt_effective_prio(struct task_struct *p, int prio)
4816{
4817 struct task_struct *pi_task = rt_mutex_get_top_task(p);
4818
4819 return __rt_effective_prio(pi_task, prio);
4820}
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
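/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */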
4833void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
4834{
4835 int prio, oldprio, queued, running, queue_flag =
4836 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4837 const struct sched_class *prev_class;
4838 struct rq_flags rf;
4839 struct rq *rq;
4840
4841
4842 prio = __rt_effective_prio(pi_task, p->normal_prio);
4843
4844
4845
4846
4847 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
4848 return;
4849
4850 rq = __task_rq_lock(p, &rf);
4851 update_rq_clock(rq);
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862 p->pi_top_task = pi_task;
4863
4864
4865
4866
4867 if (prio == p->prio && !dl_prio(prio))
4868 goto out_unlock;
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882 if (unlikely(p == rq->idle)) {
4883 WARN_ON(p != rq->curr);
4884 WARN_ON(p->pi_blocked_on);
4885 goto out_unlock;
4886 }
4887
4888 trace_sched_pi_setprio(p, pi_task);
4889 oldprio = p->prio;
4890
4891 if (oldprio == prio)
4892 queue_flag &= ~DEQUEUE_MOVE;
4893
4894 prev_class = p->sched_class;
4895 queued = task_on_rq_queued(p);
4896 running = task_current(rq, p);
4897 if (queued)
4898 dequeue_task(rq, p, queue_flag);
4899 if (running)
4900 put_prev_task(rq, p);
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911 if (dl_prio(prio)) {
4912 if (!dl_prio(p->normal_prio) ||
4913 (pi_task && dl_prio(pi_task->prio) &&
4914 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4915 p->dl.pi_se = pi_task->dl.pi_se;
4916 queue_flag |= ENQUEUE_REPLENISH;
4917 } else {
4918 p->dl.pi_se = &p->dl;
4919 }
4920 p->sched_class = &dl_sched_class;
4921 } else if (rt_prio(prio)) {
4922 if (dl_prio(oldprio))
4923 p->dl.pi_se = &p->dl;
4924 if (oldprio < prio)
4925 queue_flag |= ENQUEUE_HEAD;
4926 p->sched_class = &rt_sched_class;
4927 } else {
4928 if (dl_prio(oldprio))
4929 p->dl.pi_se = &p->dl;
4930 if (rt_prio(oldprio))
4931 p->rt.timeout = 0;
4932 p->sched_class = &fair_sched_class;
4933 }
4934
4935 p->prio = prio;
4936
4937 if (queued)
4938 enqueue_task(rq, p, queue_flag);
4939 if (running)
4940 set_next_task(rq, p);
4941
4942 check_class_changed(rq, p, prev_class, oldprio);
4943out_unlock:
4944
4945 preempt_disable();
4946 __task_rq_unlock(rq, &rf);
4947
4948 balance_callback(rq);
4949 preempt_enable();
4950}
4951#else
4952static inline int rt_effective_prio(struct task_struct *p, int prio)
4953{
4954 return prio;
4955}
4956#endif
4957
4958void set_user_nice(struct task_struct *p, long nice)
4959{
4960 bool queued, running;
4961 int old_prio;
4962 struct rq_flags rf;
4963 struct rq *rq;
4964
4965 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
4966 return;
4967
4968
4969
4970
4971 rq = task_rq_lock(p, &rf);
4972 update_rq_clock(rq);
4973
4974
4975
4976
4977
4978
4979
4980 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4981 p->static_prio = NICE_TO_PRIO(nice);
4982 goto out_unlock;
4983 }
4984 queued = task_on_rq_queued(p);
4985 running = task_current(rq, p);
4986 if (queued)
4987 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
4988 if (running)
4989 put_prev_task(rq, p);
4990
4991 p->static_prio = NICE_TO_PRIO(nice);
4992 set_load_weight(p, true);
4993 old_prio = p->prio;
4994 p->prio = effective_prio(p);
4995
4996 if (queued)
4997 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4998 if (running)
4999 set_next_task(rq, p);
5000
5001
5002
5003
5004
5005 p->sched_class->prio_changed(rq, p, old_prio);
5006
5007out_unlock:
5008 task_rq_unlock(rq, p, &rf);
5009}
5010EXPORT_SYMBOL(set_user_nice);
5011
5012
5013
5014
5015
5016
5017int can_nice(const struct task_struct *p, const int nice)
5018{
5019
5020 int nice_rlim = nice_to_rlimit(nice);
5021
5022 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
5023 capable(CAP_SYS_NICE));
5024}
5025
5026#ifdef __ARCH_WANT_SYS_NICE
5027
5028
5029
5030
5031
5032
5033
5034
5035SYSCALL_DEFINE1(nice, int, increment)
5036{
5037 long nice, retval;
5038
5039
5040
5041
5042
5043
5044 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
5045 nice = task_nice(current) + increment;
5046
5047 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
5048 if (increment < 0 && !can_nice(current, nice))
5049 return -EPERM;
5050
5051 retval = security_task_setnice(current, nice);
5052 if (retval)
5053 return retval;
5054
5055 set_user_nice(current, nice);
5056 return 0;
5057}
5058
5059#endif
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069int task_prio(const struct task_struct *p)
5070{
5071 return p->prio - MAX_RT_PRIO;
5072}
5073
5074
5075
5076
5077
5078
5079
5080int idle_cpu(int cpu)
5081{
5082 struct rq *rq = cpu_rq(cpu);
5083
5084 if (rq->curr != rq->idle)
5085 return 0;
5086
5087 if (rq->nr_running)
5088 return 0;
5089
5090#ifdef CONFIG_SMP
5091 if (rq->ttwu_pending)
5092 return 0;
5093#endif
5094
5095 return 1;
5096}
5097
5098
5099
5100
5101
5102
5103
5104int available_idle_cpu(int cpu)
5105{
5106 if (!idle_cpu(cpu))
5107 return 0;
5108
5109 if (vcpu_is_preempted(cpu))
5110 return 0;
5111
5112 return 1;
5113}
5114
5115
5116
5117
5118
5119
5120
5121struct task_struct *idle_task(int cpu)
5122{
5123 return cpu_rq(cpu)->idle;
5124}
5125
5126
5127
5128
5129
5130
5131
5132static struct task_struct *find_process_by_pid(pid_t pid)
5133{
5134 return pid ? find_task_by_vpid(pid) : current;
5135}
5136
5137
5138
5139
5140
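/*
 * Magic "policy" value telling __sched_setscheduler() to leave the
 * task's current policy untouched and only update the scheduling
 * parameters.
 */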
5141#define SETPARAM_POLICY -1
5142
5143static void __setscheduler_params(struct task_struct *p,
5144 const struct sched_attr *attr)
5145{
5146 int policy = attr->sched_policy;
5147
5148 if (policy == SETPARAM_POLICY)
5149 policy = p->policy;
5150
5151 p->policy = policy;
5152
5153 if (dl_policy(policy))
5154 __setparam_dl(p, attr);
5155 else if (fair_policy(policy))
5156 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5157
5158
5159
5160
5161
5162
5163 p->rt_priority = attr->sched_priority;
5164 p->normal_prio = normal_prio(p);
5165 set_load_weight(p, true);
5166}
5167
5168
5169static void __setscheduler(struct rq *rq, struct task_struct *p,
5170 const struct sched_attr *attr, bool keep_boost)
5171{
5172
5173
5174
5175
5176 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
5177 return;
5178
5179 __setscheduler_params(p, attr);
5180
5181
5182
5183
5184
5185 p->prio = normal_prio(p);
5186 if (keep_boost)
5187 p->prio = rt_effective_prio(p, p->prio);
5188
5189 if (dl_prio(p->prio))
5190 p->sched_class = &dl_sched_class;
5191 else if (rt_prio(p->prio))
5192 p->sched_class = &rt_sched_class;
5193 else
5194 p->sched_class = &fair_sched_class;
5195}
5196
5197
5198
5199
5200static bool check_same_owner(struct task_struct *p)
5201{
5202 const struct cred *cred = current_cred(), *pcred;
5203 bool match;
5204
5205 rcu_read_lock();
5206 pcred = __task_cred(p);
5207 match = (uid_eq(cred->euid, pcred->euid) ||
5208 uid_eq(cred->euid, pcred->uid));
5209 rcu_read_unlock();
5210 return match;
5211}
5212
5213static int __sched_setscheduler(struct task_struct *p,
5214 const struct sched_attr *attr,
5215 bool user, bool pi)
5216{
5217 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
5218 MAX_RT_PRIO - 1 - attr->sched_priority;
5219 int retval, oldprio, oldpolicy = -1, queued, running;
5220 int new_effective_prio, policy = attr->sched_policy;
5221 const struct sched_class *prev_class;
5222 struct rq_flags rf;
5223 int reset_on_fork;
5224 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
5225 struct rq *rq;
5226
5227
5228 BUG_ON(pi && in_interrupt());
5229recheck:
5230
5231 if (policy < 0) {
5232 reset_on_fork = p->sched_reset_on_fork;
5233 policy = oldpolicy = p->policy;
5234 } else {
5235 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
5236
5237 if (!valid_policy(policy))
5238 return -EINVAL;
5239 }
5240
5241 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
5242 return -EINVAL;
5243
5244
5245
5246
5247
5248
5249 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
5250 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
5251 return -EINVAL;
5252 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
5253 (rt_policy(policy) != (attr->sched_priority != 0)))
5254 return -EINVAL;
5255
5256
5257
5258
5259 if (user && !capable(CAP_SYS_NICE)) {
5260 if (fair_policy(policy)) {
5261 if (attr->sched_nice < task_nice(p) &&
5262 !can_nice(p, attr->sched_nice))
5263 return -EPERM;
5264 }
5265
5266 if (rt_policy(policy)) {
5267 unsigned long rlim_rtprio =
5268 task_rlimit(p, RLIMIT_RTPRIO);
5269
5270
5271 if (policy != p->policy && !rlim_rtprio)
5272 return -EPERM;
5273
5274
5275 if (attr->sched_priority > p->rt_priority &&
5276 attr->sched_priority > rlim_rtprio)
5277 return -EPERM;
5278 }
5279
5280
5281
5282
5283
5284
5285
5286 if (dl_policy(policy))
5287 return -EPERM;
5288
5289
5290
5291
5292
5293 if (task_has_idle_policy(p) && !idle_policy(policy)) {
5294 if (!can_nice(p, task_nice(p)))
5295 return -EPERM;
5296 }
5297
5298
5299 if (!check_same_owner(p))
5300 return -EPERM;
5301
5302
5303 if (p->sched_reset_on_fork && !reset_on_fork)
5304 return -EPERM;
5305 }
5306
5307 if (user) {
5308 if (attr->sched_flags & SCHED_FLAG_SUGOV)
5309 return -EINVAL;
5310
5311 retval = security_task_setscheduler(p);
5312 if (retval)
5313 return retval;
5314 }
5315
5316
5317 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
5318 retval = uclamp_validate(p, attr);
5319 if (retval)
5320 return retval;
5321 }
5322
5323 if (pi)
5324 cpuset_read_lock();
5325
5326
5327
5328
5329
5330
5331
5332
5333 rq = task_rq_lock(p, &rf);
5334 update_rq_clock(rq);
5335
5336
5337
5338
5339 if (p == rq->stop) {
5340 retval = -EINVAL;
5341 goto unlock;
5342 }
5343
5344
5345
5346
5347
5348 if (unlikely(policy == p->policy)) {
5349 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
5350 goto change;
5351 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
5352 goto change;
5353 if (dl_policy(policy) && dl_param_changed(p, attr))
5354 goto change;
5355 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
5356 goto change;
5357
5358 p->sched_reset_on_fork = reset_on_fork;
5359 retval = 0;
5360 goto unlock;
5361 }
5362change:
5363
5364 if (user) {
5365#ifdef CONFIG_RT_GROUP_SCHED
5366
5367
5368
5369
5370 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5371 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5372 !task_group_is_autogroup(task_group(p))) {
5373 retval = -EPERM;
5374 goto unlock;
5375 }
5376#endif
5377#ifdef CONFIG_SMP
5378 if (dl_bandwidth_enabled() && dl_policy(policy) &&
5379 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
5380 cpumask_t *span = rq->rd->span;
5381
5382
5383
5384
5385
5386
5387 if (!cpumask_subset(span, p->cpus_ptr) ||
5388 rq->rd->dl_bw.bw == 0) {
5389 retval = -EPERM;
5390 goto unlock;
5391 }
5392 }
5393#endif
5394 }
5395
5396
5397 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5398 policy = oldpolicy = -1;
5399 task_rq_unlock(rq, p, &rf);
5400 if (pi)
5401 cpuset_read_unlock();
5402 goto recheck;
5403 }
5404
5405
5406
5407
5408
5409
5410 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
5411 retval = -EBUSY;
5412 goto unlock;
5413 }
5414
5415 p->sched_reset_on_fork = reset_on_fork;
5416 oldprio = p->prio;
5417
5418 if (pi) {
5419
5420
5421
5422
5423
5424
5425
5426 new_effective_prio = rt_effective_prio(p, newprio);
5427 if (new_effective_prio == oldprio)
5428 queue_flags &= ~DEQUEUE_MOVE;
5429 }
5430
5431 queued = task_on_rq_queued(p);
5432 running = task_current(rq, p);
5433 if (queued)
5434 dequeue_task(rq, p, queue_flags);
5435 if (running)
5436 put_prev_task(rq, p);
5437
5438 prev_class = p->sched_class;
5439
5440 __setscheduler(rq, p, attr, pi);
5441 __setscheduler_uclamp(p, attr);
5442
5443 if (queued) {
5444
5445
5446
5447
5448 if (oldprio < p->prio)
5449 queue_flags |= ENQUEUE_HEAD;
5450
5451 enqueue_task(rq, p, queue_flags);
5452 }
5453 if (running)
5454 set_next_task(rq, p);
5455
5456 check_class_changed(rq, p, prev_class, oldprio);
5457
5458
5459 preempt_disable();
5460 task_rq_unlock(rq, p, &rf);
5461
5462 if (pi) {
5463 cpuset_read_unlock();
5464 rt_mutex_adjust_pi(p);
5465 }
5466
5467
5468 balance_callback(rq);
5469 preempt_enable();
5470
5471 return 0;
5472
5473unlock:
5474 task_rq_unlock(rq, p, &rf);
5475 if (pi)
5476 cpuset_read_unlock();
5477 return retval;
5478}
5479
5480static int _sched_setscheduler(struct task_struct *p, int policy,
5481 const struct sched_param *param, bool check)
5482{
5483 struct sched_attr attr = {
5484 .sched_policy = policy,
5485 .sched_priority = param->sched_priority,
5486 .sched_nice = PRIO_TO_NICE(p->static_prio),
5487 };
5488
5489
5490 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
5491 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5492 policy &= ~SCHED_RESET_ON_FORK;
5493 attr.sched_policy = policy;
5494 }
5495
5496 return __sched_setscheduler(p, &attr, check, true);
5497}
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
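/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */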
5510int sched_setscheduler(struct task_struct *p, int policy,
5511 const struct sched_param *param)
5512{
5513 return _sched_setscheduler(p, policy, param, true);
5514}
5515
5516int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
5517{
5518 return __sched_setscheduler(p, attr, true, true);
5519}
5520
5521int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
5522{
5523 return __sched_setscheduler(p, attr, false, true);
5524}
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5540 const struct sched_param *param)
5541{
5542 return _sched_setscheduler(p, policy, param, false);
5543}
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
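/*
 * In-kernel helpers for giving a task a SCHED_FIFO priority without
 * going through the permission checks of the syscall path:
 * sched_set_fifo() picks a mid-range RT priority, sched_set_fifo_low()
 * the lowest one.
 */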
5563void sched_set_fifo(struct task_struct *p)
5564{
5565 struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5566 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5567}
5568EXPORT_SYMBOL_GPL(sched_set_fifo);
5569
5570
5571
5572
5573void sched_set_fifo_low(struct task_struct *p)
5574{
5575 struct sched_param sp = { .sched_priority = 1 };
5576 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5577}
5578EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5579
5580void sched_set_normal(struct task_struct *p, int nice)
5581{
5582 struct sched_attr attr = {
5583 .sched_policy = SCHED_NORMAL,
5584 .sched_nice = nice,
5585 };
5586 WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5587}
5588EXPORT_SYMBOL_GPL(sched_set_normal);
5589
5590static int
5591do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5592{
5593 struct sched_param lparam;
5594 struct task_struct *p;
5595 int retval;
5596
5597 if (!param || pid < 0)
5598 return -EINVAL;
5599 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5600 return -EFAULT;
5601
5602 rcu_read_lock();
5603 retval = -ESRCH;
5604 p = find_process_by_pid(pid);
5605 if (likely(p))
5606 get_task_struct(p);
5607 rcu_read_unlock();
5608
5609 if (likely(p)) {
5610 retval = sched_setscheduler(p, policy, &lparam);
5611 put_task_struct(p);
5612 }
5613
5614 return retval;
5615}
5616
5617
5618
5619
5620static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
5621{
5622 u32 size;
5623 int ret;
5624
5625
5626 memset(attr, 0, sizeof(*attr));
5627
5628 ret = get_user(size, &uattr->size);
5629 if (ret)
5630 return ret;
5631
5632
5633 if (!size)
5634 size = SCHED_ATTR_SIZE_VER0;
5635 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
5636 goto err_size;
5637
5638 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5639 if (ret) {
5640 if (ret == -E2BIG)
5641 goto err_size;
5642 return ret;
5643 }
5644
5645 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
5646 size < SCHED_ATTR_SIZE_VER1)
5647 return -EINVAL;
5648
5649
5650
5651
5652
5653 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
5654
5655 return 0;
5656
5657err_size:
5658 put_user(sizeof(*attr), &uattr->size);
5659 return -E2BIG;
5660}
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
5671{
5672 if (policy < 0)
5673 return -EINVAL;
5674
5675 return do_sched_setscheduler(pid, policy, param);
5676}
5677
5678
5679
5680
5681
5682
5683
5684
5685SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5686{
5687 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
5688}
5689
5690
5691
5692
5693
5694
5695
5696SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
5697 unsigned int, flags)
5698{
5699 struct sched_attr attr;
5700 struct task_struct *p;
5701 int retval;
5702
5703 if (!uattr || pid < 0 || flags)
5704 return -EINVAL;
5705
5706 retval = sched_copy_attr(uattr, &attr);
5707 if (retval)
5708 return retval;
5709
5710 if ((int)attr.sched_policy < 0)
5711 return -EINVAL;
5712 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
5713 attr.sched_policy = SETPARAM_POLICY;
5714
5715 rcu_read_lock();
5716 retval = -ESRCH;
5717 p = find_process_by_pid(pid);
5718 if (likely(p))
5719 get_task_struct(p);
5720 rcu_read_unlock();
5721
5722 if (likely(p)) {
5723 retval = sched_setattr(p, &attr);
5724 put_task_struct(p);
5725 }
5726
5727 return retval;
5728}
5729
5730
5731
5732
5733
5734
5735
5736
5737SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5738{
5739 struct task_struct *p;
5740 int retval;
5741
5742 if (pid < 0)
5743 return -EINVAL;
5744
5745 retval = -ESRCH;
5746 rcu_read_lock();
5747 p = find_process_by_pid(pid);
5748 if (p) {
5749 retval = security_task_getscheduler(p);
5750 if (!retval)
5751 retval = p->policy
5752 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5753 }
5754 rcu_read_unlock();
5755 return retval;
5756}
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5767{
5768 struct sched_param lp = { .sched_priority = 0 };
5769 struct task_struct *p;
5770 int retval;
5771
5772 if (!param || pid < 0)
5773 return -EINVAL;
5774
5775 rcu_read_lock();
5776 p = find_process_by_pid(pid);
5777 retval = -ESRCH;
5778 if (!p)
5779 goto out_unlock;
5780
5781 retval = security_task_getscheduler(p);
5782 if (retval)
5783 goto out_unlock;
5784
5785 if (task_has_rt_policy(p))
5786 lp.sched_priority = p->rt_priority;
5787 rcu_read_unlock();
5788
5789
5790
5791
5792 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5793
5794 return retval;
5795
5796out_unlock:
5797 rcu_read_unlock();
5798 return retval;
5799}
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809static int
5810sched_attr_copy_to_user(struct sched_attr __user *uattr,
5811 struct sched_attr *kattr,
5812 unsigned int usize)
5813{
5814 unsigned int ksize = sizeof(*kattr);
5815
5816 if (!access_ok(uattr, usize))
5817 return -EFAULT;
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832 kattr->size = min(usize, ksize);
5833
5834 if (copy_to_user(uattr, kattr, kattr->size))
5835 return -EFAULT;
5836
5837 return 0;
5838}
5839
5840
5841
5842
5843
5844
5845
5846
5847SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
5848 unsigned int, usize, unsigned int, flags)
5849{
5850 struct sched_attr kattr = { };
5851 struct task_struct *p;
5852 int retval;
5853
5854 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
5855 usize < SCHED_ATTR_SIZE_VER0 || flags)
5856 return -EINVAL;
5857
5858 rcu_read_lock();
5859 p = find_process_by_pid(pid);
5860 retval = -ESRCH;
5861 if (!p)
5862 goto out_unlock;
5863
5864 retval = security_task_getscheduler(p);
5865 if (retval)
5866 goto out_unlock;
5867
5868 kattr.sched_policy = p->policy;
5869 if (p->sched_reset_on_fork)
5870 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5871 if (task_has_dl_policy(p))
5872 __getparam_dl(p, &kattr);
5873 else if (task_has_rt_policy(p))
5874 kattr.sched_priority = p->rt_priority;
5875 else
5876 kattr.sched_nice = task_nice(p);
5877
5878#ifdef CONFIG_UCLAMP_TASK
5879
5880
5881
5882
5883
5884 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5885 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5886#endif
5887
5888 rcu_read_unlock();
5889
5890 return sched_attr_copy_to_user(uattr, &kattr, usize);
5891
5892out_unlock:
5893 rcu_read_unlock();
5894 return retval;
5895}
5896
5897long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5898{
5899 cpumask_var_t cpus_allowed, new_mask;
5900 struct task_struct *p;
5901 int retval;
5902
5903 rcu_read_lock();
5904
5905 p = find_process_by_pid(pid);
5906 if (!p) {
5907 rcu_read_unlock();
5908 return -ESRCH;
5909 }
5910
5911
5912 get_task_struct(p);
5913 rcu_read_unlock();
5914
5915 if (p->flags & PF_NO_SETAFFINITY) {
5916 retval = -EINVAL;
5917 goto out_put_task;
5918 }
5919 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5920 retval = -ENOMEM;
5921 goto out_put_task;
5922 }
5923 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5924 retval = -ENOMEM;
5925 goto out_free_cpus_allowed;
5926 }
5927 retval = -EPERM;
5928 if (!check_same_owner(p)) {
5929 rcu_read_lock();
5930 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
5931 rcu_read_unlock();
5932 goto out_free_new_mask;
5933 }
5934 rcu_read_unlock();
5935 }
5936
5937 retval = security_task_setscheduler(p);
5938 if (retval)
5939 goto out_free_new_mask;
5940
5941
5942 cpuset_cpus_allowed(p, cpus_allowed);
5943 cpumask_and(new_mask, in_mask, cpus_allowed);
5944
5945
5946
5947
5948
5949
5950
5951#ifdef CONFIG_SMP
5952 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
5953 rcu_read_lock();
5954 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
5955 retval = -EBUSY;
5956 rcu_read_unlock();
5957 goto out_free_new_mask;
5958 }
5959 rcu_read_unlock();
5960 }
5961#endif
5962again:
5963 retval = __set_cpus_allowed_ptr(p, new_mask, true);
5964
5965 if (!retval) {
5966 cpuset_cpus_allowed(p, cpus_allowed);
5967 if (!cpumask_subset(new_mask, cpus_allowed)) {
5968
5969
5970
5971
5972
5973 cpumask_copy(new_mask, cpus_allowed);
5974 goto again;
5975 }
5976 }
5977out_free_new_mask:
5978 free_cpumask_var(new_mask);
5979out_free_cpus_allowed:
5980 free_cpumask_var(cpus_allowed);
5981out_put_task:
5982 put_task_struct(p);
5983 return retval;
5984}
5985
5986static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5987 struct cpumask *new_mask)
5988{
5989 if (len < cpumask_size())
5990 cpumask_clear(new_mask);
5991 else if (len > cpumask_size())
5992 len = cpumask_size();
5993
5994 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5995}
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6006 unsigned long __user *, user_mask_ptr)
6007{
6008 cpumask_var_t new_mask;
6009 int retval;
6010
6011 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6012 return -ENOMEM;
6013
6014 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6015 if (retval == 0)
6016 retval = sched_setaffinity(pid, new_mask);
6017 free_cpumask_var(new_mask);
6018 return retval;
6019}
6020
6021long sched_getaffinity(pid_t pid, struct cpumask *mask)
6022{
6023 struct task_struct *p;
6024 unsigned long flags;
6025 int retval;
6026
6027 rcu_read_lock();
6028
6029 retval = -ESRCH;
6030 p = find_process_by_pid(pid);
6031 if (!p)
6032 goto out_unlock;
6033
6034 retval = security_task_getscheduler(p);
6035 if (retval)
6036 goto out_unlock;
6037
6038 raw_spin_lock_irqsave(&p->pi_lock, flags);
6039 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6040 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6041
6042out_unlock:
6043 rcu_read_unlock();
6044
6045 return retval;
6046}
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6058 unsigned long __user *, user_mask_ptr)
6059{
6060 int ret;
6061 cpumask_var_t mask;
6062
6063 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
6064 return -EINVAL;
6065 if (len & (sizeof(unsigned long)-1))
6066 return -EINVAL;
6067
6068 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6069 return -ENOMEM;
6070
6071 ret = sched_getaffinity(pid, mask);
6072 if (ret == 0) {
6073 unsigned int retlen = min(len, cpumask_size());
6074
6075 if (copy_to_user(user_mask_ptr, mask, retlen))
6076 ret = -EFAULT;
6077 else
6078 ret = retlen;
6079 }
6080 free_cpumask_var(mask);
6081
6082 return ret;
6083}
6084
6085
6086
6087
6088
6089
6090
6091
6092
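/*
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */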
6093static void do_sched_yield(void)
6094{
6095 struct rq_flags rf;
6096 struct rq *rq;
6097
6098 rq = this_rq_lock_irq(&rf);
6099
6100 schedstat_inc(rq->yld_count);
6101 current->sched_class->yield_task(rq);
6102
6103
6104
6105
6106
6107 preempt_disable();
6108 rq_unlock(rq, &rf);
6109 sched_preempt_enable_no_resched();
6110
6111 schedule();
6112}
6113
6114SYSCALL_DEFINE0(sched_yield)
6115{
6116 do_sched_yield();
6117 return 0;
6118}
6119
6120#ifndef CONFIG_PREEMPTION
6121int __sched _cond_resched(void)
6122{
6123 if (should_resched(0)) {
6124 preempt_schedule_common();
6125 return 1;
6126 }
6127 rcu_all_qs();
6128 return 0;
6129}
6130EXPORT_SYMBOL(_cond_resched);
6131#endif
6132
6133
6134
6135
6136
6137
6138
6139
6140
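/*
 * __cond_resched_lock() - if a reschedule is pending or the lock is
 * contended, drop the given lock, reschedule (or just relax the CPU
 * when only contended), then reacquire the lock. Returns 1 if the lock
 * was dropped.
 */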
6141int __cond_resched_lock(spinlock_t *lock)
6142{
6143 int resched = should_resched(PREEMPT_LOCK_OFFSET);
6144 int ret = 0;
6145
6146 lockdep_assert_held(lock);
6147
6148 if (spin_needbreak(lock) || resched) {
6149 spin_unlock(lock);
6150 if (resched)
6151 preempt_schedule_common();
6152 else
6153 cpu_relax();
6154 ret = 1;
6155 spin_lock(lock);
6156 }
6157 return ret;
6158}
6159EXPORT_SYMBOL(__cond_resched_lock);
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183void __sched yield(void)
6184{
6185 set_current_state(TASK_RUNNING);
6186 do_sched_yield();
6187}
6188EXPORT_SYMBOL(yield);
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
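/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */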
6205int __sched yield_to(struct task_struct *p, bool preempt)
6206{
6207 struct task_struct *curr = current;
6208 struct rq *rq, *p_rq;
6209 unsigned long flags;
6210 int yielded = 0;
6211
6212 local_irq_save(flags);
6213 rq = this_rq();
6214
6215again:
6216 p_rq = task_rq(p);
6217
6218
6219
6220
6221 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
6222 yielded = -ESRCH;
6223 goto out_irq;
6224 }
6225
6226 double_rq_lock(rq, p_rq);
6227 if (task_rq(p) != p_rq) {
6228 double_rq_unlock(rq, p_rq);
6229 goto again;
6230 }
6231
6232 if (!curr->sched_class->yield_to_task)
6233 goto out_unlock;
6234
6235 if (curr->sched_class != p->sched_class)
6236 goto out_unlock;
6237
6238 if (task_running(p_rq, p) || p->state)
6239 goto out_unlock;
6240
6241 yielded = curr->sched_class->yield_to_task(rq, p);
6242 if (yielded) {
6243 schedstat_inc(rq->yld_count);
6244
6245
6246
6247
6248 if (preempt && rq != p_rq)
6249 resched_curr(p_rq);
6250 }
6251
6252out_unlock:
6253 double_rq_unlock(rq, p_rq);
6254out_irq:
6255 local_irq_restore(flags);
6256
6257 if (yielded > 0)
6258 schedule();
6259
6260 return yielded;
6261}
6262EXPORT_SYMBOL_GPL(yield_to);
6263
6264int io_schedule_prepare(void)
6265{
6266 int old_iowait = current->in_iowait;
6267
6268 current->in_iowait = 1;
6269 blk_schedule_flush_plug(current);
6270
6271 return old_iowait;
6272}
6273
6274void io_schedule_finish(int token)
6275{
6276 current->in_iowait = token;
6277}
6278
6279
6280
6281
6282
6283long __sched io_schedule_timeout(long timeout)
6284{
6285 int token;
6286 long ret;
6287
6288 token = io_schedule_prepare();
6289 ret = schedule_timeout(timeout);
6290 io_schedule_finish(token);
6291
6292 return ret;
6293}
6294EXPORT_SYMBOL(io_schedule_timeout);
6295
6296void __sched io_schedule(void)
6297{
6298 int token;
6299
6300 token = io_schedule_prepare();
6301 schedule();
6302 io_schedule_finish(token);
6303}
6304EXPORT_SYMBOL(io_schedule);
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6315{
6316 int ret = -EINVAL;
6317
6318 switch (policy) {
6319 case SCHED_FIFO:
6320 case SCHED_RR:
6321 ret = MAX_USER_RT_PRIO-1;
6322 break;
6323 case SCHED_DEADLINE:
6324 case SCHED_NORMAL:
6325 case SCHED_BATCH:
6326 case SCHED_IDLE:
6327 ret = 0;
6328 break;
6329 }
6330 return ret;
6331}
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6342{
6343 int ret = -EINVAL;
6344
6345 switch (policy) {
6346 case SCHED_FIFO:
6347 case SCHED_RR:
6348 ret = 1;
6349 break;
6350 case SCHED_DEADLINE:
6351 case SCHED_NORMAL:
6352 case SCHED_BATCH:
6353 case SCHED_IDLE:
6354 ret = 0;
6355 }
6356 return ret;
6357}
6358
6359static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
6360{
6361 struct task_struct *p;
6362 unsigned int time_slice;
6363 struct rq_flags rf;
6364 struct rq *rq;
6365 int retval;
6366
6367 if (pid < 0)
6368 return -EINVAL;
6369
6370 retval = -ESRCH;
6371 rcu_read_lock();
6372 p = find_process_by_pid(pid);
6373 if (!p)
6374 goto out_unlock;
6375
6376 retval = security_task_getscheduler(p);
6377 if (retval)
6378 goto out_unlock;
6379
6380 rq = task_rq_lock(p, &rf);
6381 time_slice = 0;
6382 if (p->sched_class->get_rr_interval)
6383 time_slice = p->sched_class->get_rr_interval(rq, p);
6384 task_rq_unlock(rq, p, &rf);
6385
6386 rcu_read_unlock();
6387 jiffies_to_timespec64(time_slice, t);
6388 return 0;
6389
6390out_unlock:
6391 rcu_read_unlock();
6392 return retval;
6393}
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
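/*
 * sched_rr_get_interval - report the round-robin timeslice of the task
 * identified by @pid in the user-supplied timespec.  The value comes
 * from the task's scheduling class via ->get_rr_interval() and is 0 for
 * classes that do not implement it.
 */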
6406SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6407 struct __kernel_timespec __user *, interval)
6408{
6409 struct timespec64 t;
6410 int retval = sched_rr_get_interval(pid, &t);
6411
6412 if (retval == 0)
6413 retval = put_timespec64(&t, interval);
6414
6415 return retval;
6416}
6417
6418#ifdef CONFIG_COMPAT_32BIT_TIME
6419SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
6420 struct old_timespec32 __user *, interval)
6421{
6422 struct timespec64 t;
6423 int retval = sched_rr_get_interval(pid, &t);
6424
6425 if (retval == 0)
6426 retval = put_old_timespec32(&t, interval);
6427 return retval;
6428}
6429#endif
6430
6431void sched_show_task(struct task_struct *p)
6432{
6433 unsigned long free = 0;
6434 int ppid;
6435
6436 if (!try_get_task_stack(p))
6437 return;
6438
6439 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
6440
6441 if (p->state == TASK_RUNNING)
6442 pr_cont(" running task ");
6443#ifdef CONFIG_DEBUG_STACK_USAGE
6444 free = stack_not_used(p);
6445#endif
6446 ppid = 0;
6447 rcu_read_lock();
6448 if (pid_alive(p))
6449 ppid = task_pid_nr(rcu_dereference(p->real_parent));
6450 rcu_read_unlock();
6451 pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6452 free, task_pid_nr(p), ppid,
6453 (unsigned long)task_thread_info(p)->flags);
6454
6455 print_worker_info(KERN_INFO, p);
6456 show_stack(p, NULL, KERN_INFO);
6457 put_task_stack(p);
6458}
6459EXPORT_SYMBOL_GPL(sched_show_task);
6460
6461static inline bool
6462state_filter_match(unsigned long state_filter, struct task_struct *p)
6463{
6464
6465 if (!state_filter)
6466 return true;
6467
6468
6469 if (!(p->state & state_filter))
6470 return false;
6471
6472
6473
6474
6475
6476 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
6477 return false;
6478
6479 return true;
6480}
6481
6482
6483void show_state_filter(unsigned long state_filter)
6484{
6485 struct task_struct *g, *p;
6486
6487 rcu_read_lock();
6488 for_each_process_thread(g, p) {
6489
6490
6491
6492
6493
6494
6495
6496 touch_nmi_watchdog();
6497 touch_all_softlockup_watchdogs();
6498 if (state_filter_match(state_filter, p))
6499 sched_show_task(p);
6500 }
6501
6502#ifdef CONFIG_SCHED_DEBUG
6503 if (!state_filter)
6504 sysrq_sched_debug_show();
6505#endif
6506 rcu_read_unlock();
6507
6508
6509
6510 if (!state_filter)
6511 debug_show_all_locks();
6512}
6513
6514
6515
6516
6517
6518
6519
6520
6521
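/*
 * init_idle - set up the per-CPU idle thread for @cpu: basic fork-style
 * initialization, pinning to @cpu, installing it as rq->idle and
 * rq->curr, and switching it to the idle scheduling class.
 */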
6522void init_idle(struct task_struct *idle, int cpu)
6523{
6524 struct rq *rq = cpu_rq(cpu);
6525 unsigned long flags;
6526
6527 __sched_fork(0, idle);
6528
6529 raw_spin_lock_irqsave(&idle->pi_lock, flags);
6530 raw_spin_lock(&rq->lock);
6531
6532 idle->state = TASK_RUNNING;
6533 idle->se.exec_start = sched_clock();
6534 idle->flags |= PF_IDLE;
6535
6536 scs_task_reset(idle);
6537 kasan_unpoison_task_stack(idle);
6538
6539#ifdef CONFIG_SMP
6540
6541
6542
6543
6544
6545
6546 set_cpus_allowed_common(idle, cpumask_of(cpu));
6547#endif
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558 rcu_read_lock();
6559 __set_task_cpu(idle, cpu);
6560 rcu_read_unlock();
6561
6562 rq->idle = idle;
6563 rcu_assign_pointer(rq->curr, idle);
6564 idle->on_rq = TASK_ON_RQ_QUEUED;
6565#ifdef CONFIG_SMP
6566 idle->on_cpu = 1;
6567#endif
6568 raw_spin_unlock(&rq->lock);
6569 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
6570
6571
6572 init_idle_preempt_count(idle, cpu);
6573
6574
6575
6576
6577 idle->sched_class = &idle_sched_class;
6578 ftrace_graph_init_idle_task(idle, cpu);
6579 vtime_init_idle(idle, cpu);
6580#ifdef CONFIG_SMP
6581 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6582#endif
6583}
6584
6585#ifdef CONFIG_SMP
6586
6587int cpuset_cpumask_can_shrink(const struct cpumask *cur,
6588 const struct cpumask *trial)
6589{
6590 int ret = 1;
6591
6592 if (!cpumask_weight(cur))
6593 return ret;
6594
6595 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
6596
6597 return ret;
6598}
6599
6600int task_can_attach(struct task_struct *p,
6601 const struct cpumask *cs_cpus_allowed)
6602{
6603 int ret = 0;
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614 if (p->flags & PF_NO_SETAFFINITY) {
6615 ret = -EINVAL;
6616 goto out;
6617 }
6618
6619 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6620 cs_cpus_allowed))
6621 ret = dl_task_can_attach(p, cs_cpus_allowed);
6622
6623out:
6624 return ret;
6625}
6626
6627bool sched_smp_initialized __read_mostly;
6628
6629#ifdef CONFIG_NUMA_BALANCING
6630
6631int migrate_task_to(struct task_struct *p, int target_cpu)
6632{
6633 struct migration_arg arg = { p, target_cpu };
6634 int curr_cpu = task_cpu(p);
6635
6636 if (curr_cpu == target_cpu)
6637 return 0;
6638
6639 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
6640 return -EINVAL;
6641
6642
6643
6644 trace_sched_move_numa(p, curr_cpu, target_cpu);
6645 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
6646}
6647
6648
6649
6650
6651
6652void sched_setnuma(struct task_struct *p, int nid)
6653{
6654 bool queued, running;
6655 struct rq_flags rf;
6656 struct rq *rq;
6657
6658 rq = task_rq_lock(p, &rf);
6659 queued = task_on_rq_queued(p);
6660 running = task_current(rq, p);
6661
6662 if (queued)
6663 dequeue_task(rq, p, DEQUEUE_SAVE);
6664 if (running)
6665 put_prev_task(rq, p);
6666
6667 p->numa_preferred_nid = nid;
6668
6669 if (queued)
6670 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
6671 if (running)
6672 set_next_task(rq, p);
6673 task_rq_unlock(rq, p, &rf);
6674}
6675#endif
6676
6677#ifdef CONFIG_HOTPLUG_CPU
6678
6679
6680
6681
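/*
 * idle_task_exit - run by the idle thread of a CPU that is going away;
 * make sure it is no longer borrowing a user mm and is back on init_mm.
 */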
6682void idle_task_exit(void)
6683{
6684 struct mm_struct *mm = current->active_mm;
6685
6686 BUG_ON(cpu_online(smp_processor_id()));
6687 BUG_ON(current != this_rq()->idle);
6688
6689 if (mm != &init_mm) {
6690 switch_mm(mm, &init_mm, current);
6691 finish_arch_post_lock_switch();
6692 }
6693
6694
6695}
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
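/*
 * calc_load_migrate - fold the dying runqueue's remaining contribution
 * into the global calc_load_tasks bookkeeping so the tasks migrated away
 * are not lost from the load average.
 */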
6706static void calc_load_migrate(struct rq *rq)
6707{
6708 long delta = calc_load_fold_active(rq, 1);
6709 if (delta)
6710 atomic_long_add(delta, &calc_load_tasks);
6711}
6712
6713static struct task_struct *__pick_migrate_task(struct rq *rq)
6714{
6715 const struct sched_class *class;
6716 struct task_struct *next;
6717
6718 for_each_class(class) {
6719 next = class->pick_next_task(rq);
6720 if (next) {
6721 next->sched_class->put_prev_task(rq, next);
6722 return next;
6723 }
6724 }
6725
6726
6727 BUG();
6728}
6729
6730
6731
6732
6733
6734
6735
6736
6737
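/*
 * migrate_tasks - push every runnable task off @dead_rq onto a fallback
 * CPU.  rq->stop is temporarily cleared so the class loop never picks
 * the stopper thread, and each victim's pi_lock is taken to keep its
 * affinity stable while a destination CPU is selected.
 */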
6738static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
6739{
6740 struct rq *rq = dead_rq;
6741 struct task_struct *next, *stop = rq->stop;
6742 struct rq_flags orf = *rf;
6743 int dest_cpu;
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754 rq->stop = NULL;
6755
6756
6757
6758
6759
6760
6761 update_rq_clock(rq);
6762
6763 for (;;) {
6764
6765
6766
6767
6768 if (rq->nr_running == 1)
6769 break;
6770
6771 next = __pick_migrate_task(rq);
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782 rq_unlock(rq, rf);
6783 raw_spin_lock(&next->pi_lock);
6784 rq_relock(rq, rf);
6785
6786
6787
6788
6789
6790
6791 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
6792 raw_spin_unlock(&next->pi_lock);
6793 continue;
6794 }
6795
6796
6797 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
6798 rq = __migrate_task(rq, rf, next, dest_cpu);
6799 if (rq != dead_rq) {
6800 rq_unlock(rq, rf);
6801 rq = dead_rq;
6802 *rf = orf;
6803 rq_relock(rq, rf);
6804 }
6805 raw_spin_unlock(&next->pi_lock);
6806 }
6807
6808 rq->stop = stop;
6809}
6810#endif
6811
6812void set_rq_online(struct rq *rq)
6813{
6814 if (!rq->online) {
6815 const struct sched_class *class;
6816
6817 cpumask_set_cpu(rq->cpu, rq->rd->online);
6818 rq->online = 1;
6819
6820 for_each_class(class) {
6821 if (class->rq_online)
6822 class->rq_online(rq);
6823 }
6824 }
6825}
6826
6827void set_rq_offline(struct rq *rq)
6828{
6829 if (rq->online) {
6830 const struct sched_class *class;
6831
6832 for_each_class(class) {
6833 if (class->rq_offline)
6834 class->rq_offline(rq);
6835 }
6836
6837 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6838 rq->online = 0;
6839 }
6840}
6841
6842
6843
6844
6845static int num_cpus_frozen;
6846
6847
6848
6849
6850
6851
6852
6853
6854
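/*
 * Suspend/resume aware cpuset hooks: while tasks are frozen we fall back
 * to a single scheduler domain instead of rebuilding the full cpuset
 * topology on every CPU transition, and only force a rebuild once the
 * last frozen CPU comes back online.
 */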
6855static void cpuset_cpu_active(void)
6856{
6857 if (cpuhp_tasks_frozen) {
6858
6859
6860
6861
6862
6863
6864 partition_sched_domains(1, NULL, NULL);
6865 if (--num_cpus_frozen)
6866 return;
6867
6868
6869
6870
6871
6872 cpuset_force_rebuild();
6873 }
6874 cpuset_update_active_cpus();
6875}
6876
6877static int cpuset_cpu_inactive(unsigned int cpu)
6878{
6879 if (!cpuhp_tasks_frozen) {
6880 if (dl_cpu_busy(cpu))
6881 return -EBUSY;
6882 cpuset_update_active_cpus();
6883 } else {
6884 num_cpus_frozen++;
6885 partition_sched_domains(1, NULL, NULL);
6886 }
6887 return 0;
6888}
6889
6890int sched_cpu_activate(unsigned int cpu)
6891{
6892 struct rq *rq = cpu_rq(cpu);
6893 struct rq_flags rf;
6894
6895#ifdef CONFIG_SCHED_SMT
6896
6897
6898
6899 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
6900 static_branch_inc_cpuslocked(&sched_smt_present);
6901#endif
6902 set_cpu_active(cpu, true);
6903
6904 if (sched_smp_initialized) {
6905 sched_domains_numa_masks_set(cpu);
6906 cpuset_cpu_active();
6907 }
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918 rq_lock_irqsave(rq, &rf);
6919 if (rq->rd) {
6920 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6921 set_rq_online(rq);
6922 }
6923 rq_unlock_irqrestore(rq, &rf);
6924
6925 return 0;
6926}
6927
6928int sched_cpu_deactivate(unsigned int cpu)
6929{
6930 int ret;
6931
6932 set_cpu_active(cpu, false);
6933
6934
6935
6936
6937
6938
6939
6940 synchronize_rcu();
6941
6942#ifdef CONFIG_SCHED_SMT
6943
6944
6945
6946 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
6947 static_branch_dec_cpuslocked(&sched_smt_present);
6948#endif
6949
6950 if (!sched_smp_initialized)
6951 return 0;
6952
6953 ret = cpuset_cpu_inactive(cpu);
6954 if (ret) {
6955 set_cpu_active(cpu, true);
6956 return ret;
6957 }
6958 sched_domains_numa_masks_clear(cpu);
6959 return 0;
6960}
6961
6962static void sched_rq_cpu_starting(unsigned int cpu)
6963{
6964 struct rq *rq = cpu_rq(cpu);
6965
6966 rq->calc_load_update = calc_load_update;
6967 update_max_interval();
6968}
6969
6970int sched_cpu_starting(unsigned int cpu)
6971{
6972 sched_rq_cpu_starting(cpu);
6973 sched_tick_start(cpu);
6974 return 0;
6975}
6976
6977#ifdef CONFIG_HOTPLUG_CPU
6978int sched_cpu_dying(unsigned int cpu)
6979{
6980 struct rq *rq = cpu_rq(cpu);
6981 struct rq_flags rf;
6982
6983
6984 sched_tick_stop(cpu);
6985
6986 rq_lock_irqsave(rq, &rf);
6987 if (rq->rd) {
6988 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6989 set_rq_offline(rq);
6990 }
6991 migrate_tasks(rq, &rf);
6992 BUG_ON(rq->nr_running != 1);
6993 rq_unlock_irqrestore(rq, &rf);
6994
6995 calc_load_migrate(rq);
6996 update_max_interval();
6997 nohz_balance_exit_idle(rq);
6998 hrtick_clear(rq);
6999 return 0;
7000}
7001#endif
7002
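/*
 * sched_init_smp - late SMP scheduler setup: discover the NUMA topology,
 * build the initial sched domains for the active CPUs, move the boot
 * thread onto a housekeeping CPU and finish RT/DL class initialization.
 */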
7003void __init sched_init_smp(void)
7004{
7005 sched_init_numa();
7006
7007
7008
7009
7010
7011
7012 mutex_lock(&sched_domains_mutex);
7013 sched_init_domains(cpu_active_mask);
7014 mutex_unlock(&sched_domains_mutex);
7015
7016
7017 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
7018 BUG();
7019 sched_init_granularity();
7020
7021 init_sched_rt_class();
7022 init_sched_dl_class();
7023
7024 sched_smp_initialized = true;
7025}
7026
7027static int __init migration_init(void)
7028{
7029 sched_cpu_starting(smp_processor_id());
7030 return 0;
7031}
7032early_initcall(migration_init);
7033
7034#else
7035void __init sched_init_smp(void)
7036{
7037 sched_init_granularity();
7038}
7039#endif
7040
7041int in_sched_functions(unsigned long addr)
7042{
7043 return in_lock_functions(addr) ||
7044 (addr >= (unsigned long)__sched_text_start
7045 && addr < (unsigned long)__sched_text_end);
7046}
7047
7048#ifdef CONFIG_CGROUP_SCHED
7049
7050
7051
7052
7053struct task_group root_task_group;
7054LIST_HEAD(task_groups);
7055
7056
7057static struct kmem_cache *task_group_cache __read_mostly;
7058#endif
7059
7060DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7061DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
7062
7063void __init sched_init(void)
7064{
7065 unsigned long ptr = 0;
7066 int i;
7067
7068
7069 BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
7070 &fair_sched_class + 1 != &rt_sched_class ||
7071 &rt_sched_class + 1 != &dl_sched_class);
7072#ifdef CONFIG_SMP
7073 BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
7074#endif
7075
7076 wait_bit_init();
7077
7078#ifdef CONFIG_FAIR_GROUP_SCHED
7079 ptr += 2 * nr_cpu_ids * sizeof(void **);
7080#endif
7081#ifdef CONFIG_RT_GROUP_SCHED
7082 ptr += 2 * nr_cpu_ids * sizeof(void **);
7083#endif
7084 if (ptr) {
7085 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
7086
7087#ifdef CONFIG_FAIR_GROUP_SCHED
7088 root_task_group.se = (struct sched_entity **)ptr;
7089 ptr += nr_cpu_ids * sizeof(void **);
7090
7091 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7092 ptr += nr_cpu_ids * sizeof(void **);
7093
7094 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7095 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
7096#endif
7097#ifdef CONFIG_RT_GROUP_SCHED
7098 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7099 ptr += nr_cpu_ids * sizeof(void **);
7100
7101 root_task_group.rt_rq = (struct rt_rq **)ptr;
7102 ptr += nr_cpu_ids * sizeof(void **);
7103
7104#endif
7105 }
7106#ifdef CONFIG_CPUMASK_OFFSTACK
7107 for_each_possible_cpu(i) {
7108 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7109 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7110 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
7111 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7112 }
7113#endif
7114
7115 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
7116 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
7117
7118#ifdef CONFIG_SMP
7119 init_defrootdomain();
7120#endif
7121
7122#ifdef CONFIG_RT_GROUP_SCHED
7123 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7124 global_rt_period(), global_rt_runtime());
7125#endif
7126
7127#ifdef CONFIG_CGROUP_SCHED
7128 task_group_cache = KMEM_CACHE(task_group, 0);
7129
7130 list_add(&root_task_group.list, &task_groups);
7131 INIT_LIST_HEAD(&root_task_group.children);
7132 INIT_LIST_HEAD(&root_task_group.siblings);
7133 autogroup_init(&init_task);
7134#endif
7135
7136 for_each_possible_cpu(i) {
7137 struct rq *rq;
7138
7139 rq = cpu_rq(i);
7140 raw_spin_lock_init(&rq->lock);
7141 rq->nr_running = 0;
7142 rq->calc_load_active = 0;
7143 rq->calc_load_update = jiffies + LOAD_FREQ;
7144 init_cfs_rq(&rq->cfs);
7145 init_rt_rq(&rq->rt);
7146 init_dl_rq(&rq->dl);
7147#ifdef CONFIG_FAIR_GROUP_SCHED
7148 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7149 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7170#endif
7171
7172 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7173#ifdef CONFIG_RT_GROUP_SCHED
7174 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7175#endif
7176#ifdef CONFIG_SMP
7177 rq->sd = NULL;
7178 rq->rd = NULL;
7179 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
7180 rq->balance_callback = NULL;
7181 rq->active_balance = 0;
7182 rq->next_balance = jiffies;
7183 rq->push_cpu = 0;
7184 rq->cpu = i;
7185 rq->online = 0;
7186 rq->idle_stamp = 0;
7187 rq->avg_idle = 2*sysctl_sched_migration_cost;
7188 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
7189
7190 INIT_LIST_HEAD(&rq->cfs_tasks);
7191
7192 rq_attach_root(rq, &def_root_domain);
7193#ifdef CONFIG_NO_HZ_COMMON
7194 rq->last_blocked_load_update_tick = jiffies;
7195 atomic_set(&rq->nohz_flags, 0);
7196
7197 rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
7198#endif
7199#endif
7200 hrtick_rq_init(rq);
7201 atomic_set(&rq->nr_iowait, 0);
7202 }
7203
7204 set_load_weight(&init_task, false);
7205
7206
7207
7208
7209 mmgrab(&init_mm);
7210 enter_lazy_tlb(&init_mm, current);
7211
7212
7213
7214
7215
7216
7217
7218 init_idle(current, smp_processor_id());
7219
7220 calc_load_update = jiffies + LOAD_FREQ;
7221
7222#ifdef CONFIG_SMP
7223 idle_thread_set_boot_cpu();
7224#endif
7225 init_sched_fair_class();
7226
7227 init_schedstats();
7228
7229 psi_init();
7230
7231 init_uclamp();
7232
7233 scheduler_running = 1;
7234}
7235
7236#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7237static inline int preempt_count_equals(int preempt_offset)
7238{
7239 int nested = preempt_count() + rcu_preempt_depth();
7240
7241 return (nested == preempt_offset);
7242}
7243
7244void __might_sleep(const char *file, int line, int preempt_offset)
7245{
7246
7247
7248
7249
7250
7251 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
7252 "do not call blocking ops when !TASK_RUNNING; "
7253 "state=%lx set at [<%p>] %pS\n",
7254 current->state,
7255 (void *)current->task_state_change,
7256 (void *)current->task_state_change);
7257
7258 ___might_sleep(file, line, preempt_offset);
7259}
7260EXPORT_SYMBOL(__might_sleep);
7261
7262void ___might_sleep(const char *file, int line, int preempt_offset)
7263{
7264
7265 static unsigned long prev_jiffy;
7266
7267 unsigned long preempt_disable_ip;
7268
7269
7270 rcu_sleep_check();
7271
7272 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
7273 !is_idle_task(current) && !current->non_block_count) ||
7274 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
7275 oops_in_progress)
7276 return;
7277
7278 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7279 return;
7280 prev_jiffy = jiffies;
7281
7282
7283 preempt_disable_ip = get_preempt_disable_ip(current);
7284
7285 printk(KERN_ERR
7286 "BUG: sleeping function called from invalid context at %s:%d\n",
7287 file, line);
7288 printk(KERN_ERR
7289 "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7290 in_atomic(), irqs_disabled(), current->non_block_count,
7291 current->pid, current->comm);
7292
7293 if (task_stack_end_corrupted(current))
7294 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7295
7296 debug_show_held_locks(current);
7297 if (irqs_disabled())
7298 print_irqtrace_events(current);
7299 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
7300 && !preempt_count_equals(preempt_offset)) {
7301 pr_err("Preemption disabled at:");
7302 print_ip_sym(KERN_ERR, preempt_disable_ip);
7303 }
7304 dump_stack();
7305 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7306}
7307EXPORT_SYMBOL(___might_sleep);
7308
7309void __cant_sleep(const char *file, int line, int preempt_offset)
7310{
7311 static unsigned long prev_jiffy;
7312
7313 if (irqs_disabled())
7314 return;
7315
7316 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7317 return;
7318
7319 if (preempt_count() > preempt_offset)
7320 return;
7321
7322 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7323 return;
7324 prev_jiffy = jiffies;
7325
7326 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7327 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7328 in_atomic(), irqs_disabled(),
7329 current->pid, current->comm);
7330
7331 debug_show_held_locks(current);
7332 dump_stack();
7333 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7334}
7335EXPORT_SYMBOL_GPL(__cant_sleep);
7336#endif
7337
7338#ifdef CONFIG_MAGIC_SYSRQ
7339void normalize_rt_tasks(void)
7340{
7341 struct task_struct *g, *p;
7342 struct sched_attr attr = {
7343 .sched_policy = SCHED_NORMAL,
7344 };
7345
7346 read_lock(&tasklist_lock);
7347 for_each_process_thread(g, p) {
7348
7349
7350
7351 if (p->flags & PF_KTHREAD)
7352 continue;
7353
7354 p->se.exec_start = 0;
7355 schedstat_set(p->se.statistics.wait_start, 0);
7356 schedstat_set(p->se.statistics.sleep_start, 0);
7357 schedstat_set(p->se.statistics.block_start, 0);
7358
7359 if (!dl_task(p) && !rt_task(p)) {
7360
7361
7362
7363
7364 if (task_nice(p) < 0)
7365 set_user_nice(p, 0);
7366 continue;
7367 }
7368
7369 __sched_setscheduler(p, &attr, false, false);
7370 }
7371 read_unlock(&tasklist_lock);
7372}
7373
7374#endif
7375
7376#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395struct task_struct *curr_task(int cpu)
7396{
7397 return cpu_curr(cpu);
7398}
7399
7400#endif
7401
7402#ifdef CONFIG_IA64
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418void ia64_set_curr_task(int cpu, struct task_struct *p)
7419{
7420 cpu_curr(cpu) = p;
7421}
7422
7423#endif
7424
7425#ifdef CONFIG_CGROUP_SCHED
7426
7427static DEFINE_SPINLOCK(task_group_lock);
7428
7429static inline void alloc_uclamp_sched_group(struct task_group *tg,
7430 struct task_group *parent)
7431{
7432#ifdef CONFIG_UCLAMP_TASK_GROUP
7433 enum uclamp_id clamp_id;
7434
7435 for_each_clamp_id(clamp_id) {
7436 uclamp_se_set(&tg->uclamp_req[clamp_id],
7437 uclamp_none(clamp_id), false);
7438 tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
7439 }
7440#endif
7441}
7442
7443static void sched_free_group(struct task_group *tg)
7444{
7445 free_fair_sched_group(tg);
7446 free_rt_sched_group(tg);
7447 autogroup_free(tg);
7448 kmem_cache_free(task_group_cache, tg);
7449}
7450
7451
7452struct task_group *sched_create_group(struct task_group *parent)
7453{
7454 struct task_group *tg;
7455
7456 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
7457 if (!tg)
7458 return ERR_PTR(-ENOMEM);
7459
7460 if (!alloc_fair_sched_group(tg, parent))
7461 goto err;
7462
7463 if (!alloc_rt_sched_group(tg, parent))
7464 goto err;
7465
7466 alloc_uclamp_sched_group(tg, parent);
7467
7468 return tg;
7469
7470err:
7471 sched_free_group(tg);
7472 return ERR_PTR(-ENOMEM);
7473}
7474
7475void sched_online_group(struct task_group *tg, struct task_group *parent)
7476{
7477 unsigned long flags;
7478
7479 spin_lock_irqsave(&task_group_lock, flags);
7480 list_add_rcu(&tg->list, &task_groups);
7481
7482
7483 WARN_ON(!parent);
7484
7485 tg->parent = parent;
7486 INIT_LIST_HEAD(&tg->children);
7487 list_add_rcu(&tg->siblings, &parent->children);
7488 spin_unlock_irqrestore(&task_group_lock, flags);
7489
7490 online_fair_sched_group(tg);
7491}
7492
7493
7494static void sched_free_group_rcu(struct rcu_head *rhp)
7495{
7496
7497 sched_free_group(container_of(rhp, struct task_group, rcu));
7498}
7499
7500void sched_destroy_group(struct task_group *tg)
7501{
7502
7503 call_rcu(&tg->rcu, sched_free_group_rcu);
7504}
7505
7506void sched_offline_group(struct task_group *tg)
7507{
7508 unsigned long flags;
7509
7510
7511 unregister_fair_sched_group(tg);
7512
7513 spin_lock_irqsave(&task_group_lock, flags);
7514 list_del_rcu(&tg->list);
7515 list_del_rcu(&tg->siblings);
7516 spin_unlock_irqrestore(&task_group_lock, flags);
7517}
7518
7519static void sched_change_group(struct task_struct *tsk, int type)
7520{
7521 struct task_group *tg;
7522
7523
7524
7525
7526
7527
7528 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
7529 struct task_group, css);
7530 tg = autogroup_task_group(tsk, tg);
7531 tsk->sched_task_group = tg;
7532
7533#ifdef CONFIG_FAIR_GROUP_SCHED
7534 if (tsk->sched_class->task_change_group)
7535 tsk->sched_class->task_change_group(tsk, type);
7536 else
7537#endif
7538 set_task_rq(tsk, task_cpu(tsk));
7539}
7540
7541
7542
7543
7544
7545
7546
7547
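/*
 * sched_move_task - move @tsk to the task_group of its current cgroup,
 * dequeueing/requeueing it (or re-setting it as the running task) so the
 * group change takes effect on the runqueue immediately.
 */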
7548void sched_move_task(struct task_struct *tsk)
7549{
7550 int queued, running, queue_flags =
7551 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
7552 struct rq_flags rf;
7553 struct rq *rq;
7554
7555 rq = task_rq_lock(tsk, &rf);
7556 update_rq_clock(rq);
7557
7558 running = task_current(rq, tsk);
7559 queued = task_on_rq_queued(tsk);
7560
7561 if (queued)
7562 dequeue_task(rq, tsk, queue_flags);
7563 if (running)
7564 put_prev_task(rq, tsk);
7565
7566 sched_change_group(tsk, TASK_MOVE_GROUP);
7567
7568 if (queued)
7569 enqueue_task(rq, tsk, queue_flags);
7570 if (running) {
7571 set_next_task(rq, tsk);
7572
7573
7574
7575
7576
7577 resched_curr(rq);
7578 }
7579
7580 task_rq_unlock(rq, tsk, &rf);
7581}
7582
7583static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7584{
7585 return css ? container_of(css, struct task_group, css) : NULL;
7586}
7587
7588static struct cgroup_subsys_state *
7589cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7590{
7591 struct task_group *parent = css_tg(parent_css);
7592 struct task_group *tg;
7593
7594 if (!parent) {
7595
7596 return &root_task_group.css;
7597 }
7598
7599 tg = sched_create_group(parent);
7600 if (IS_ERR(tg))
7601 return ERR_PTR(-ENOMEM);
7602
7603 return &tg->css;
7604}
7605
7606
7607static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7608{
7609 struct task_group *tg = css_tg(css);
7610 struct task_group *parent = css_tg(css->parent);
7611
7612 if (parent)
7613 sched_online_group(tg, parent);
7614
7615#ifdef CONFIG_UCLAMP_TASK_GROUP
7616
7617 cpu_util_update_eff(css);
7618#endif
7619
7620 return 0;
7621}
7622
7623static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
7624{
7625 struct task_group *tg = css_tg(css);
7626
7627 sched_offline_group(tg);
7628}
7629
7630static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7631{
7632 struct task_group *tg = css_tg(css);
7633
7634
7635
7636
7637 sched_free_group(tg);
7638}
7639
7640
7641
7642
7643
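/*
 * cpu_cgroup_fork - a newly forked task has become visible to the cpu
 * controller; attach it to the right task_group before it ever runs.
 */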
7644static void cpu_cgroup_fork(struct task_struct *task)
7645{
7646 struct rq_flags rf;
7647 struct rq *rq;
7648
7649 rq = task_rq_lock(task, &rf);
7650
7651 update_rq_clock(rq);
7652 sched_change_group(task, TASK_SET_GROUP);
7653
7654 task_rq_unlock(rq, task, &rf);
7655}
7656
7657static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
7658{
7659 struct task_struct *task;
7660 struct cgroup_subsys_state *css;
7661 int ret = 0;
7662
7663 cgroup_taskset_for_each(task, css, tset) {
7664#ifdef CONFIG_RT_GROUP_SCHED
7665 if (!sched_rt_can_attach(css_tg(css), task))
7666 return -EINVAL;
7667#endif
7668
7669
7670
7671
7672 raw_spin_lock_irq(&task->pi_lock);
7673
7674
7675
7676
7677
7678 if (task->state == TASK_NEW)
7679 ret = -EINVAL;
7680 raw_spin_unlock_irq(&task->pi_lock);
7681
7682 if (ret)
7683 break;
7684 }
7685 return ret;
7686}
7687
7688static void cpu_cgroup_attach(struct cgroup_taskset *tset)
7689{
7690 struct task_struct *task;
7691 struct cgroup_subsys_state *css;
7692
7693 cgroup_taskset_for_each(task, css, tset)
7694 sched_move_task(task);
7695}
7696
7697#ifdef CONFIG_UCLAMP_TASK_GROUP
7698static void cpu_util_update_eff(struct cgroup_subsys_state *css)
7699{
7700 struct cgroup_subsys_state *top_css = css;
7701 struct uclamp_se *uc_parent = NULL;
7702 struct uclamp_se *uc_se = NULL;
7703 unsigned int eff[UCLAMP_CNT];
7704 enum uclamp_id clamp_id;
7705 unsigned int clamps;
7706
7707 css_for_each_descendant_pre(css, top_css) {
7708 uc_parent = css_tg(css)->parent
7709 ? css_tg(css)->parent->uclamp : NULL;
7710
7711 for_each_clamp_id(clamp_id) {
7712
7713 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
7714
7715 if (uc_parent &&
7716 eff[clamp_id] > uc_parent[clamp_id].value) {
7717 eff[clamp_id] = uc_parent[clamp_id].value;
7718 }
7719 }
7720
7721 eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
7722
7723
7724 clamps = 0x0;
7725 uc_se = css_tg(css)->uclamp;
7726 for_each_clamp_id(clamp_id) {
7727 if (eff[clamp_id] == uc_se[clamp_id].value)
7728 continue;
7729 uc_se[clamp_id].value = eff[clamp_id];
7730 uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
7731 clamps |= (0x1 << clamp_id);
7732 }
7733 if (!clamps) {
7734 css = css_rightmost_descendant(css);
7735 continue;
7736 }
7737
7738
7739 uclamp_update_active_tasks(css, clamps);
7740 }
7741}
7742
7743
7744
7745
7746
7747
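/*
 * The cgroup uclamp files accept either "max" or a percentage with up to
 * two decimal places; the request is converted into a capacity value in
 * the [0..SCHED_CAPACITY_SCALE] range.  Illustrative use:
 *	echo "20.5" > cpu.uclamp.min	requests a 20.5% minimum clamp
 */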
7748#define _POW10(exp) ((unsigned int)1e##exp)
7749#define POW10(exp) _POW10(exp)
7750
7751struct uclamp_request {
7752#define UCLAMP_PERCENT_SHIFT 2
7753#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
7754 s64 percent;
7755 u64 util;
7756 int ret;
7757};
7758
7759static inline struct uclamp_request
7760capacity_from_percent(char *buf)
7761{
7762 struct uclamp_request req = {
7763 .percent = UCLAMP_PERCENT_SCALE,
7764 .util = SCHED_CAPACITY_SCALE,
7765 .ret = 0,
7766 };
7767
7768 buf = strim(buf);
7769 if (strcmp(buf, "max")) {
7770 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
7771 &req.percent);
7772 if (req.ret)
7773 return req;
7774 if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
7775 req.ret = -ERANGE;
7776 return req;
7777 }
7778
7779 req.util = req.percent << SCHED_CAPACITY_SHIFT;
7780 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
7781 }
7782
7783 return req;
7784}
7785
7786static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
7787 size_t nbytes, loff_t off,
7788 enum uclamp_id clamp_id)
7789{
7790 struct uclamp_request req;
7791 struct task_group *tg;
7792
7793 req = capacity_from_percent(buf);
7794 if (req.ret)
7795 return req.ret;
7796
7797 static_branch_enable(&sched_uclamp_used);
7798
7799 mutex_lock(&uclamp_mutex);
7800 rcu_read_lock();
7801
7802 tg = css_tg(of_css(of));
7803 if (tg->uclamp_req[clamp_id].value != req.util)
7804 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
7805
7806
7807
7808
7809
7810 tg->uclamp_pct[clamp_id] = req.percent;
7811
7812
7813 cpu_util_update_eff(of_css(of));
7814
7815 rcu_read_unlock();
7816 mutex_unlock(&uclamp_mutex);
7817
7818 return nbytes;
7819}
7820
7821static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
7822 char *buf, size_t nbytes,
7823 loff_t off)
7824{
7825 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
7826}
7827
7828static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
7829 char *buf, size_t nbytes,
7830 loff_t off)
7831{
7832 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
7833}
7834
7835static inline void cpu_uclamp_print(struct seq_file *sf,
7836 enum uclamp_id clamp_id)
7837{
7838 struct task_group *tg;
7839 u64 util_clamp;
7840 u64 percent;
7841 u32 rem;
7842
7843 rcu_read_lock();
7844 tg = css_tg(seq_css(sf));
7845 util_clamp = tg->uclamp_req[clamp_id].value;
7846 rcu_read_unlock();
7847
7848 if (util_clamp == SCHED_CAPACITY_SCALE) {
7849 seq_puts(sf, "max\n");
7850 return;
7851 }
7852
7853 percent = tg->uclamp_pct[clamp_id];
7854 percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
7855 seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
7856}
7857
7858static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
7859{
7860 cpu_uclamp_print(sf, UCLAMP_MIN);
7861 return 0;
7862}
7863
7864static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
7865{
7866 cpu_uclamp_print(sf, UCLAMP_MAX);
7867 return 0;
7868}
7869#endif
7870
7871#ifdef CONFIG_FAIR_GROUP_SCHED
7872static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7873 struct cftype *cftype, u64 shareval)
7874{
7875 if (shareval > scale_load_down(ULONG_MAX))
7876 shareval = MAX_SHARES;
7877 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7878}
7879
7880static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7881 struct cftype *cft)
7882{
7883 struct task_group *tg = css_tg(css);
7884
7885 return (u64) scale_load_down(tg->shares);
7886}
7887
7888#ifdef CONFIG_CFS_BANDWIDTH
7889static DEFINE_MUTEX(cfs_constraints_mutex);
7890
7891const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7892static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
7893
7894static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
7895
7896static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7897
7898static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7899{
7900 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7901 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7902
7903 if (tg == &root_task_group)
7904 return -EINVAL;
7905
7906
7907
7908
7909
7910
7911 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7912 return -EINVAL;
7913
7914
7915
7916
7917
7918
7919 if (period > max_cfs_quota_period)
7920 return -EINVAL;
7921
7922
7923
7924
7925 if (quota != RUNTIME_INF && quota > max_cfs_runtime)
7926 return -EINVAL;
7927
7928
7929
7930
7931
7932 get_online_cpus();
7933 mutex_lock(&cfs_constraints_mutex);
7934 ret = __cfs_schedulable(tg, period, quota);
7935 if (ret)
7936 goto out_unlock;
7937
7938 runtime_enabled = quota != RUNTIME_INF;
7939 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7940
7941
7942
7943
7944 if (runtime_enabled && !runtime_was_enabled)
7945 cfs_bandwidth_usage_inc();
7946 raw_spin_lock_irq(&cfs_b->lock);
7947 cfs_b->period = ns_to_ktime(period);
7948 cfs_b->quota = quota;
7949
7950 __refill_cfs_bandwidth_runtime(cfs_b);
7951
7952
7953 if (runtime_enabled)
7954 start_cfs_bandwidth(cfs_b);
7955
7956 raw_spin_unlock_irq(&cfs_b->lock);
7957
7958 for_each_online_cpu(i) {
7959 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7960 struct rq *rq = cfs_rq->rq;
7961 struct rq_flags rf;
7962
7963 rq_lock_irq(rq, &rf);
7964 cfs_rq->runtime_enabled = runtime_enabled;
7965 cfs_rq->runtime_remaining = 0;
7966
7967 if (cfs_rq->throttled)
7968 unthrottle_cfs_rq(cfs_rq);
7969 rq_unlock_irq(rq, &rf);
7970 }
7971 if (runtime_was_enabled && !runtime_enabled)
7972 cfs_bandwidth_usage_dec();
7973out_unlock:
7974 mutex_unlock(&cfs_constraints_mutex);
7975 put_online_cpus();
7976
7977 return ret;
7978}
7979
7980static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7981{
7982 u64 quota, period;
7983
7984 period = ktime_to_ns(tg->cfs_bandwidth.period);
7985 if (cfs_quota_us < 0)
7986 quota = RUNTIME_INF;
7987 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
7988 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7989 else
7990 return -EINVAL;
7991
7992 return tg_set_cfs_bandwidth(tg, period, quota);
7993}
7994
7995static long tg_get_cfs_quota(struct task_group *tg)
7996{
7997 u64 quota_us;
7998
7999 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
8000 return -1;
8001
8002 quota_us = tg->cfs_bandwidth.quota;
8003 do_div(quota_us, NSEC_PER_USEC);
8004
8005 return quota_us;
8006}
8007
8008static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8009{
8010 u64 quota, period;
8011
8012 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
8013 return -EINVAL;
8014
8015 period = (u64)cfs_period_us * NSEC_PER_USEC;
8016 quota = tg->cfs_bandwidth.quota;
8017
8018 return tg_set_cfs_bandwidth(tg, period, quota);
8019}
8020
8021static long tg_get_cfs_period(struct task_group *tg)
8022{
8023 u64 cfs_period_us;
8024
8025 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
8026 do_div(cfs_period_us, NSEC_PER_USEC);
8027
8028 return cfs_period_us;
8029}
8030
8031static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
8032 struct cftype *cft)
8033{
8034 return tg_get_cfs_quota(css_tg(css));
8035}
8036
8037static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
8038 struct cftype *cftype, s64 cfs_quota_us)
8039{
8040 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
8041}
8042
8043static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
8044 struct cftype *cft)
8045{
8046 return tg_get_cfs_period(css_tg(css));
8047}
8048
8049static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
8050 struct cftype *cftype, u64 cfs_period_us)
8051{
8052 return tg_set_cfs_period(css_tg(css), cfs_period_us);
8053}
8054
8055struct cfs_schedulable_data {
8056 struct task_group *tg;
8057 u64 period, quota;
8058};
8059
8060
8061
8062
8063
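/*
 * normalize_cfs_quota - express a group's bandwidth limit as a
 * quota/period ratio (or RUNTIME_INF when unlimited) so parent and child
 * limits can be compared in tg_cfs_schedulable_down().
 */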
8064static u64 normalize_cfs_quota(struct task_group *tg,
8065 struct cfs_schedulable_data *d)
8066{
8067 u64 quota, period;
8068
8069 if (tg == d->tg) {
8070 period = d->period;
8071 quota = d->quota;
8072 } else {
8073 period = tg_get_cfs_period(tg);
8074 quota = tg_get_cfs_quota(tg);
8075 }
8076
8077
8078 if (quota == RUNTIME_INF || quota == -1)
8079 return RUNTIME_INF;
8080
8081 return to_ratio(period, quota);
8082}
8083
8084static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8085{
8086 struct cfs_schedulable_data *d = data;
8087 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8088 s64 quota = 0, parent_quota = -1;
8089
8090 if (!tg->parent) {
8091 quota = RUNTIME_INF;
8092 } else {
8093 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8094
8095 quota = normalize_cfs_quota(tg, d);
8096 parent_quota = parent_b->hierarchical_quota;
8097
8098
8099
8100
8101
8102
8103 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
8104 quota = min(quota, parent_quota);
8105 } else {
8106 if (quota == RUNTIME_INF)
8107 quota = parent_quota;
8108 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8109 return -EINVAL;
8110 }
8111 }
8112 cfs_b->hierarchical_quota = quota;
8113
8114 return 0;
8115}
8116
8117static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
8118{
8119 int ret;
8120 struct cfs_schedulable_data data = {
8121 .tg = tg,
8122 .period = period,
8123 .quota = quota,
8124 };
8125
8126 if (quota != RUNTIME_INF) {
8127 do_div(data.period, NSEC_PER_USEC);
8128 do_div(data.quota, NSEC_PER_USEC);
8129 }
8130
8131 rcu_read_lock();
8132 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
8133 rcu_read_unlock();
8134
8135 return ret;
8136}
8137
8138static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
8139{
8140 struct task_group *tg = css_tg(seq_css(sf));
8141 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8142
8143 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
8144 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
8145 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
8146
8147 if (schedstat_enabled() && tg != &root_task_group) {
8148 u64 ws = 0;
8149 int i;
8150
8151 for_each_possible_cpu(i)
8152 ws += schedstat_val(tg->se[i]->statistics.wait_sum);
8153
8154 seq_printf(sf, "wait_sum %llu\n", ws);
8155 }
8156
8157 return 0;
8158}
8159#endif
8160#endif
8161
8162#ifdef CONFIG_RT_GROUP_SCHED
8163static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
8164 struct cftype *cft, s64 val)
8165{
8166 return sched_group_set_rt_runtime(css_tg(css), val);
8167}
8168
8169static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
8170 struct cftype *cft)
8171{
8172 return sched_group_rt_runtime(css_tg(css));
8173}
8174
8175static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
8176 struct cftype *cftype, u64 rt_period_us)
8177{
8178 return sched_group_set_rt_period(css_tg(css), rt_period_us);
8179}
8180
8181static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
8182 struct cftype *cft)
8183{
8184 return sched_group_rt_period(css_tg(css));
8185}
8186#endif
8187
8188static struct cftype cpu_legacy_files[] = {
8189#ifdef CONFIG_FAIR_GROUP_SCHED
8190 {
8191 .name = "shares",
8192 .read_u64 = cpu_shares_read_u64,
8193 .write_u64 = cpu_shares_write_u64,
8194 },
8195#endif
8196#ifdef CONFIG_CFS_BANDWIDTH
8197 {
8198 .name = "cfs_quota_us",
8199 .read_s64 = cpu_cfs_quota_read_s64,
8200 .write_s64 = cpu_cfs_quota_write_s64,
8201 },
8202 {
8203 .name = "cfs_period_us",
8204 .read_u64 = cpu_cfs_period_read_u64,
8205 .write_u64 = cpu_cfs_period_write_u64,
8206 },
8207 {
8208 .name = "stat",
8209 .seq_show = cpu_cfs_stat_show,
8210 },
8211#endif
8212#ifdef CONFIG_RT_GROUP_SCHED
8213 {
8214 .name = "rt_runtime_us",
8215 .read_s64 = cpu_rt_runtime_read,
8216 .write_s64 = cpu_rt_runtime_write,
8217 },
8218 {
8219 .name = "rt_period_us",
8220 .read_u64 = cpu_rt_period_read_uint,
8221 .write_u64 = cpu_rt_period_write_uint,
8222 },
8223#endif
8224#ifdef CONFIG_UCLAMP_TASK_GROUP
8225 {
8226 .name = "uclamp.min",
8227 .flags = CFTYPE_NOT_ON_ROOT,
8228 .seq_show = cpu_uclamp_min_show,
8229 .write = cpu_uclamp_min_write,
8230 },
8231 {
8232 .name = "uclamp.max",
8233 .flags = CFTYPE_NOT_ON_ROOT,
8234 .seq_show = cpu_uclamp_max_show,
8235 .write = cpu_uclamp_max_write,
8236 },
8237#endif
8238 { }
8239};
8240
8241static int cpu_extra_stat_show(struct seq_file *sf,
8242 struct cgroup_subsys_state *css)
8243{
8244#ifdef CONFIG_CFS_BANDWIDTH
8245 {
8246 struct task_group *tg = css_tg(css);
8247 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8248 u64 throttled_usec;
8249
8250 throttled_usec = cfs_b->throttled_time;
8251 do_div(throttled_usec, NSEC_PER_USEC);
8252
8253 seq_printf(sf, "nr_periods %d\n"
8254 "nr_throttled %d\n"
8255 "throttled_usec %llu\n",
8256 cfs_b->nr_periods, cfs_b->nr_throttled,
8257 throttled_usec);
8258 }
8259#endif
8260 return 0;
8261}
8262
8263#ifdef CONFIG_FAIR_GROUP_SCHED
8264static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
8265 struct cftype *cft)
8266{
8267 struct task_group *tg = css_tg(css);
8268 u64 weight = scale_load_down(tg->shares);
8269
8270 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
8271}
8272
8273static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
8274 struct cftype *cft, u64 weight)
8275{
8276
8277
8278
8279
8280
8281
8282
8283 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
8284 return -ERANGE;
8285
8286 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
8287
8288 return sched_group_set_shares(css_tg(css), scale_load(weight));
8289}
8290
8291static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
8292 struct cftype *cft)
8293{
8294 unsigned long weight = scale_load_down(css_tg(css)->shares);
8295 int last_delta = INT_MAX;
8296 int prio, delta;
8297
8298
8299 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
8300 delta = abs(sched_prio_to_weight[prio] - weight);
8301 if (delta >= last_delta)
8302 break;
8303 last_delta = delta;
8304 }
8305
8306 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
8307}
8308
8309static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
8310 struct cftype *cft, s64 nice)
8311{
8312 unsigned long weight;
8313 int idx;
8314
8315 if (nice < MIN_NICE || nice > MAX_NICE)
8316 return -ERANGE;
8317
8318 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
8319 idx = array_index_nospec(idx, 40);
8320 weight = sched_prio_to_weight[idx];
8321
8322 return sched_group_set_shares(css_tg(css), scale_load(weight));
8323}
8324#endif
8325
8326static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
8327 long period, long quota)
8328{
8329 if (quota < 0)
8330 seq_puts(sf, "max");
8331 else
8332 seq_printf(sf, "%ld", quota);
8333
8334 seq_printf(sf, " %ld\n", period);
8335}
8336
8337
8338static int __maybe_unused cpu_period_quota_parse(char *buf,
8339 u64 *periodp, u64 *quotap)
8340{
8341 char tok[21];
8342
8343 if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
8344 return -EINVAL;
8345
8346 *periodp *= NSEC_PER_USEC;
8347
8348 if (sscanf(tok, "%llu", quotap))
8349 *quotap *= NSEC_PER_USEC;
8350 else if (!strcmp(tok, "max"))
8351 *quotap = RUNTIME_INF;
8352 else
8353 return -EINVAL;
8354
8355 return 0;
8356}
8357
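/*
 * The cgroup v2 "cpu.max" file reads and writes "$QUOTA $PERIOD" in
 * microseconds, with "max" meaning an unlimited quota.  Illustrative
 * use:
 *	echo "50000 100000" > cpu.max	allows 50ms of CPU time per 100ms
 */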
8358#ifdef CONFIG_CFS_BANDWIDTH
8359static int cpu_max_show(struct seq_file *sf, void *v)
8360{
8361 struct task_group *tg = css_tg(seq_css(sf));
8362
8363 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
8364 return 0;
8365}
8366
8367static ssize_t cpu_max_write(struct kernfs_open_file *of,
8368 char *buf, size_t nbytes, loff_t off)
8369{
8370 struct task_group *tg = css_tg(of_css(of));
8371 u64 period = tg_get_cfs_period(tg);
8372 u64 quota;
8373 int ret;
8374
8375	ret = cpu_period_quota_parse(buf, &period, &quota);
8376 if (!ret)
8377 ret = tg_set_cfs_bandwidth(tg, period, quota);
8378 return ret ?: nbytes;
8379}
8380#endif
8381
8382static struct cftype cpu_files[] = {
8383#ifdef CONFIG_FAIR_GROUP_SCHED
8384 {
8385 .name = "weight",
8386 .flags = CFTYPE_NOT_ON_ROOT,
8387 .read_u64 = cpu_weight_read_u64,
8388 .write_u64 = cpu_weight_write_u64,
8389 },
8390 {
8391 .name = "weight.nice",
8392 .flags = CFTYPE_NOT_ON_ROOT,
8393 .read_s64 = cpu_weight_nice_read_s64,
8394 .write_s64 = cpu_weight_nice_write_s64,
8395 },
8396#endif
8397#ifdef CONFIG_CFS_BANDWIDTH
8398 {
8399 .name = "max",
8400 .flags = CFTYPE_NOT_ON_ROOT,
8401 .seq_show = cpu_max_show,
8402 .write = cpu_max_write,
8403 },
8404#endif
8405#ifdef CONFIG_UCLAMP_TASK_GROUP
8406 {
8407 .name = "uclamp.min",
8408 .flags = CFTYPE_NOT_ON_ROOT,
8409 .seq_show = cpu_uclamp_min_show,
8410 .write = cpu_uclamp_min_write,
8411 },
8412 {
8413 .name = "uclamp.max",
8414 .flags = CFTYPE_NOT_ON_ROOT,
8415 .seq_show = cpu_uclamp_max_show,
8416 .write = cpu_uclamp_max_write,
8417 },
8418#endif
8419 { }
8420};
8421
8422struct cgroup_subsys cpu_cgrp_subsys = {
8423 .css_alloc = cpu_cgroup_css_alloc,
8424 .css_online = cpu_cgroup_css_online,
8425 .css_released = cpu_cgroup_css_released,
8426 .css_free = cpu_cgroup_css_free,
8427 .css_extra_stat_show = cpu_extra_stat_show,
8428 .fork = cpu_cgroup_fork,
8429 .can_attach = cpu_cgroup_can_attach,
8430 .attach = cpu_cgroup_attach,
8431 .legacy_cftypes = cpu_legacy_files,
8432 .dfl_cftypes = cpu_files,
8433 .early_init = true,
8434 .threaded = true,
8435};
8436
8437#endif
8438
8439void dump_cpu_task(int cpu)
8440{
8441 pr_info("Task dump for CPU %d:\n", cpu);
8442 sched_show_task(cpu_curr(cpu));
8443}
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
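/*
 * Nice levels are multiplicative: each step changes a task's CPU share
 * by roughly 10%, so the weights below follow approximately
 * 1024 / 1.25^nice, with nice 0 mapping to the canonical weight 1024.
 */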
8457const int sched_prio_to_weight[40] = {
8458 88761, 71755, 56483, 46273, 36291,
8459 29154, 23254, 18705, 14949, 11916,
8460 9548, 7620, 6100, 4904, 3906,
8461 3121, 2501, 1991, 1586, 1277,
8462 1024, 820, 655, 526, 423,
8463 335, 272, 215, 172, 137,
8464 110, 87, 70, 56, 45,
8465 36, 29, 23, 18, 15,
8466};
8467
8468
8469
8470
8471
8472
8473
8474
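/*
 * Pre-calculated inverse (2^32 / weight) values of the table above, so
 * hot paths can replace a division by a multiply and shift.
 */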
8475const u32 sched_prio_to_wmult[40] = {
8476 48388, 59856, 76040, 92818, 118348,
8477 147320, 184698, 229616, 287308, 360437,
8478 449829, 563644, 704093, 875809, 1099582,
8479 1376151, 1717300, 2157191, 2708050, 3363326,
8480 4194304, 5237765, 6557202, 8165337, 10153587,
8481 12820798, 15790321, 19976592, 24970740, 31350126,
8482 39045157, 49367440, 61356676, 76695844, 95443717,
8483 119304647, 148102320, 186737708, 238609294, 286331153,
8484};
8485
8486void call_trace_sched_update_nr_running(struct rq *rq, int count)
8487{
8488 trace_sched_update_nr_running_tp(rq, count);
8489}
8490