1
2
3
4
5
6
7
8
9#define CREATE_TRACE_POINTS
10#include <trace/events/sched.h>
11#undef CREATE_TRACE_POINTS
12
13#include "sched.h"
14
15#include <linux/nospec.h>
16
17#include <linux/kcov.h>
18#include <linux/scs.h>
19
20#include <asm/switch_to.h>
21#include <asm/tlb.h>
22
23#include "../workqueue_internal.h"
24#include "../../fs/io-wq.h"
25#include "../smpboot.h"
26
27#include "pelt.h"
28#include "smp.h"
29
30
31
32
33
/*
 * Export the tracepoint symbols below through
 * EXPORT_TRACEPOINT_SYMBOL_GPL() so that they can be referenced from
 * outside this translation unit.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
43
/* Per-CPU variable "runqueues": one struct rq for every CPU. */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
45
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
 * Initial value of sysctl_sched_features.
 *
 * features.h is included while SCHED_FEAT() is temporarily defined so that
 * every SCHED_FEAT(name, enabled) entry expands to
 * "(1UL << __SCHED_FEAT_name) * enabled |".  The trailing "0;" terminates
 * the resulting OR-chain.  This definition is only built when both
 * CONFIG_SCHED_DEBUG and CONFIG_JUMP_LABEL are set.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT
#endif
61
62
63
64
65
/*
 * Scheduler tunable, default 32.
 * NOTE(review): judging by the name this bounds how many tasks one
 * migration pass may move -- confirm against the users of the variable.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * RT period tunable, default 1000000.
 * NOTE(review): presumably expressed in microseconds (1s) -- confirm.
 */
unsigned int sysctl_sched_rt_period = 1000000;

/* NOTE(review): not written anywhere in this chunk; presumably set once the scheduler is initialized. */
__read_mostly int scheduler_running;

/*
 * RT runtime tunable, default 950000.
 * NOTE(review): presumably the share of sysctl_sched_rt_period, in the
 * same unit, that RT tasks may consume -- confirm.
 */
int sysctl_sched_rt_runtime = 950000;
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
180 __acquires(rq->lock)
181{
182 struct rq *rq;
183
184 lockdep_assert_held(&p->pi_lock);
185
186 for (;;) {
187 rq = task_rq(p);
188 raw_spin_lock(&rq->lock);
189 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
190 rq_pin_lock(rq, rf);
191 return rq;
192 }
193 raw_spin_unlock(&rq->lock);
194
195 while (unlikely(task_on_rq_migrating(p)))
196 cpu_relax();
197 }
198}
199
200
201
202
203struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
204 __acquires(p->pi_lock)
205 __acquires(rq->lock)
206{
207 struct rq *rq;
208
209 for (;;) {
210 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
211 rq = task_rq(p);
212 raw_spin_lock(&rq->lock);
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
231 rq_pin_lock(rq, rf);
232 return rq;
233 }
234 raw_spin_unlock(&rq->lock);
235 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
236
237 while (unlikely(task_on_rq_migrating(p)))
238 cpu_relax();
239 }
240}
241
242
243
244
245
246static void update_rq_clock_task(struct rq *rq, s64 delta)
247{
248
249
250
251
252 s64 __maybe_unused steal = 0, irq_delta = 0;
253
254#ifdef CONFIG_IRQ_TIME_ACCOUNTING
255 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272 if (irq_delta > delta)
273 irq_delta = delta;
274
275 rq->prev_irq_time += irq_delta;
276 delta -= irq_delta;
277#endif
278#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
279 if (static_key_false((¶virt_steal_rq_enabled))) {
280 steal = paravirt_steal_clock(cpu_of(rq));
281 steal -= rq->prev_steal_time_rq;
282
283 if (unlikely(steal > delta))
284 steal = delta;
285
286 rq->prev_steal_time_rq += steal;
287 delta -= steal;
288 }
289#endif
290
291 rq->clock_task += delta;
292
293#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
294 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
295 update_irq_load_avg(rq, irq_delta + steal);
296#endif
297 update_rq_clock_pelt(rq, delta);
298}
299
300void update_rq_clock(struct rq *rq)
301{
302 s64 delta;
303
304 lockdep_assert_held(&rq->lock);
305
306 if (rq->clock_update_flags & RQCF_ACT_SKIP)
307 return;
308
309#ifdef CONFIG_SCHED_DEBUG
310 if (sched_feat(WARN_DOUBLE_CLOCK))
311 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
312 rq->clock_update_flags |= RQCF_UPDATED;
313#endif
314
315 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
316 if (delta < 0)
317 return;
318 rq->clock += delta;
319 update_rq_clock_task(rq, delta);
320}
321
322static inline void
323rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
324{
325 csd->flags = 0;
326 csd->func = func;
327 csd->info = rq;
328}
329
330#ifdef CONFIG_SCHED_HRTICK
331
332
333
334
335static void hrtick_clear(struct rq *rq)
336{
337 if (hrtimer_active(&rq->hrtick_timer))
338 hrtimer_cancel(&rq->hrtick_timer);
339}
340
341
342
343
344
345static enum hrtimer_restart hrtick(struct hrtimer *timer)
346{
347 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
348 struct rq_flags rf;
349
350 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
351
352 rq_lock(rq, &rf);
353 update_rq_clock(rq);
354 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
355 rq_unlock(rq, &rf);
356
357 return HRTIMER_NORESTART;
358}
359
360#ifdef CONFIG_SMP
361
362static void __hrtick_restart(struct rq *rq)
363{
364 struct hrtimer *timer = &rq->hrtick_timer;
365
366 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
367}
368
369
370
371
372static void __hrtick_start(void *arg)
373{
374 struct rq *rq = arg;
375 struct rq_flags rf;
376
377 rq_lock(rq, &rf);
378 __hrtick_restart(rq);
379 rq_unlock(rq, &rf);
380}
381
382
383
384
385
386
387void hrtick_start(struct rq *rq, u64 delay)
388{
389 struct hrtimer *timer = &rq->hrtick_timer;
390 ktime_t time;
391 s64 delta;
392
393
394
395
396
397 delta = max_t(s64, delay, 10000LL);
398 time = ktime_add_ns(timer->base->get_time(), delta);
399
400 hrtimer_set_expires(timer, time);
401
402 if (rq == this_rq())
403 __hrtick_restart(rq);
404 else
405 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
406}
407
408#else
409
410
411
412
413
414void hrtick_start(struct rq *rq, u64 delay)
415{
416
417
418
419
420 delay = max_t(u64, delay, 10000LL);
421 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
422 HRTIMER_MODE_REL_PINNED_HARD);
423}
424
425#endif
426
427static void hrtick_rq_init(struct rq *rq)
428{
429#ifdef CONFIG_SMP
430 rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
431#endif
432 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
433 rq->hrtick_timer.function = hrtick;
434}
435#else
/* No-op: there is no hrtick timer without CONFIG_SCHED_HRTICK. */
static inline void hrtick_clear(struct rq *rq) { }
439
/* No-op: there is nothing to initialize without CONFIG_SCHED_HRTICK. */
static inline void hrtick_rq_init(struct rq *rq) { }
443#endif
444
445
446
447
/*
 * fetch_or - OR @mask into *@ptr with a cmpxchg() retry loop.
 *
 * Evaluates to the value *@ptr held immediately before the successful
 * update.  Each macro argument is evaluated exactly once.
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _val = *_ptr, _old;			\
									\
		while ((_old = cmpxchg(_ptr, _val, _val | _mask)) != _val) \
			_val = _old;					\
		_old;							\
	})
462
463#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
464
465
466
467
468
469static bool set_nr_and_not_polling(struct task_struct *p)
470{
471 struct thread_info *ti = task_thread_info(p);
472 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
473}
474
475
476
477
478
479
480
481static bool set_nr_if_polling(struct task_struct *p)
482{
483 struct thread_info *ti = task_thread_info(p);
484 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
485
486 for (;;) {
487 if (!(val & _TIF_POLLING_NRFLAG))
488 return false;
489 if (val & _TIF_NEED_RESCHED)
490 return true;
491 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
492 if (old == val)
493 break;
494 val = old;
495 }
496 return true;
497}
498
499#else
500static bool set_nr_and_not_polling(struct task_struct *p)
501{
502 set_tsk_need_resched(p);
503 return true;
504}
505
506#ifdef CONFIG_SMP
507static bool set_nr_if_polling(struct task_struct *p)
508{
509 return false;
510}
511#endif
512#endif
513
514static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
515{
516 struct wake_q_node *node = &task->wake_q;
517
518
519
520
521
522
523
524
525
526 smp_mb__before_atomic();
527 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
528 return false;
529
530
531
532
533 *head->lastp = node;
534 head->lastp = &node->next;
535 return true;
536}
537
538
539
540
541
542
543
544
545
546
547
548
549
/*
 * wake_q_add - queue @task on @head for a later wake_up_q().
 *
 * A reference on @task is taken only if the task was actually queued.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	if (!__wake_q_add(head, task))
		return;

	get_task_struct(task);
}
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
574{
575 if (!__wake_q_add(head, task))
576 put_task_struct(task);
577}
578
579void wake_up_q(struct wake_q_head *head)
580{
581 struct wake_q_node *node = head->first;
582
583 while (node != WAKE_Q_TAIL) {
584 struct task_struct *task;
585
586 task = container_of(node, struct task_struct, wake_q);
587 BUG_ON(!task);
588
589 node = node->next;
590 task->wake_q.next = NULL;
591
592
593
594
595
596 wake_up_process(task);
597 put_task_struct(task);
598 }
599}
600
601
602
603
604
605
606
607
608void resched_curr(struct rq *rq)
609{
610 struct task_struct *curr = rq->curr;
611 int cpu;
612
613 lockdep_assert_held(&rq->lock);
614
615 if (test_tsk_need_resched(curr))
616 return;
617
618 cpu = cpu_of(rq);
619
620 if (cpu == smp_processor_id()) {
621 set_tsk_need_resched(curr);
622 set_preempt_need_resched();
623 return;
624 }
625
626 if (set_nr_and_not_polling(curr))
627 smp_send_reschedule(cpu);
628 else
629 trace_sched_wake_idle_without_ipi(cpu);
630}
631
632void resched_cpu(int cpu)
633{
634 struct rq *rq = cpu_rq(cpu);
635 unsigned long flags;
636
637 raw_spin_lock_irqsave(&rq->lock, flags);
638 if (cpu_online(cpu) || cpu == smp_processor_id())
639 resched_curr(rq);
640 raw_spin_unlock_irqrestore(&rq->lock, flags);
641}
642
643#ifdef CONFIG_SMP
644#ifdef CONFIG_NO_HZ_COMMON
645
646
647
648
649
650
651
652
653int get_nohz_timer_target(void)
654{
655 int i, cpu = smp_processor_id(), default_cpu = -1;
656 struct sched_domain *sd;
657
658 if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
659 if (!idle_cpu(cpu))
660 return cpu;
661 default_cpu = cpu;
662 }
663
664 rcu_read_lock();
665 for_each_domain(cpu, sd) {
666 for_each_cpu_and(i, sched_domain_span(sd),
667 housekeeping_cpumask(HK_FLAG_TIMER)) {
668 if (cpu == i)
669 continue;
670
671 if (!idle_cpu(i)) {
672 cpu = i;
673 goto unlock;
674 }
675 }
676 }
677
678 if (default_cpu == -1)
679 default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
680 cpu = default_cpu;
681unlock:
682 rcu_read_unlock();
683 return cpu;
684}
685
686
687
688
689
690
691
692
693
694
695
696static void wake_up_idle_cpu(int cpu)
697{
698 struct rq *rq = cpu_rq(cpu);
699
700 if (cpu == smp_processor_id())
701 return;
702
703 if (set_nr_and_not_polling(rq->idle))
704 smp_send_reschedule(cpu);
705 else
706 trace_sched_wake_idle_without_ipi(cpu);
707}
708
709static bool wake_up_full_nohz_cpu(int cpu)
710{
711
712
713
714
715
716
717 if (cpu_is_offline(cpu))
718 return true;
719 if (tick_nohz_full_cpu(cpu)) {
720 if (cpu != smp_processor_id() ||
721 tick_nohz_tick_stopped())
722 tick_nohz_full_kick_cpu(cpu);
723 return true;
724 }
725
726 return false;
727}
728
729
730
731
732
733
/*
 * wake_up_nohz_cpu - wake @cpu, trying the nohz_full path first and
 * falling back to the idle-CPU path when that one does not apply.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (wake_up_full_nohz_cpu(cpu))
		return;

	wake_up_idle_cpu(cpu);
}
739
740static void nohz_csd_func(void *info)
741{
742 struct rq *rq = info;
743 int cpu = cpu_of(rq);
744 unsigned int flags;
745
746
747
748
749 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
750 WARN_ON(!(flags & NOHZ_KICK_MASK));
751
752 rq->idle_balance = idle_cpu(cpu);
753 if (rq->idle_balance && !need_resched()) {
754 rq->nohz_idle_balance = flags;
755 raise_softirq_irqoff(SCHED_SOFTIRQ);
756 }
757}
758
759#endif
760
761#ifdef CONFIG_NO_HZ_FULL
762bool sched_can_stop_tick(struct rq *rq)
763{
764 int fifo_nr_running;
765
766
767 if (rq->dl.dl_nr_running)
768 return false;
769
770
771
772
773
774 if (rq->rt.rr_nr_running) {
775 if (rq->rt.rr_nr_running == 1)
776 return true;
777 else
778 return false;
779 }
780
781
782
783
784
785 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
786 if (fifo_nr_running)
787 return true;
788
789
790
791
792
793
794 if (rq->nr_running > 1)
795 return false;
796
797 return true;
798}
799#endif
800#endif
801
802#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
803 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
804
805
806
807
808
809
/*
 * walk_tg_tree_from - walk the task-group tree rooted at @from.
 * @from: group at which the walk starts and ends.
 * @down: visitor called when a group is entered.
 * @up:   visitor called when a group is left, after its children.
 * @data: opaque pointer handed to both visitors.
 *
 * The walk is iterative.  A non-zero return value from either visitor
 * stops it immediately.
 * NOTE(review): the children lists are traversed with
 * list_for_each_entry_rcu(), so the caller presumably has to provide RCU
 * (or equivalent) protection -- confirm.
 *
 * Return: the first non-zero visitor result, otherwise the result of the
 * last visitor call.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		/* Descend: the child becomes the group being visited. */
		parent = child;
		goto down;

		/*
		 * Re-entry point when coming back up: @child is the group
		 * just finished and @parent its parent, so "continue" moves
		 * on to the next sibling.
		 */
up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	/* Climb one level and resume iterating over the siblings. */
	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}
840
/* Task-group visitor that does nothing and always returns 0. */
int tg_nop(struct task_group *tg, void *data)
{
	const int ok = 0;

	return ok;
}
845#endif
846
847static void set_load_weight(struct task_struct *p, bool update_load)
848{
849 int prio = p->static_prio - MAX_RT_PRIO;
850 struct load_weight *load = &p->se.load;
851
852
853
854
855 if (task_has_idle_policy(p)) {
856 load->weight = scale_load(WEIGHT_IDLEPRIO);
857 load->inv_weight = WMULT_IDLEPRIO;
858 return;
859 }
860
861
862
863
864
865 if (update_load && p->sched_class == &fair_sched_class) {
866 reweight_task(p, prio);
867 } else {
868 load->weight = scale_load(sched_prio_to_weight[prio]);
869 load->inv_weight = sched_prio_to_wmult[prio];
870 }
871}
872
873#ifdef CONFIG_UCLAMP_TASK
874
875
876
877
878
879
880
881
882
883
/*
 * Taken by sysctl_sched_uclamp_handler() around its update of the uclamp
 * sysctl values.
 * NOTE(review): further users may exist outside the part of the file
 * reviewed here.
 */
static DEFINE_MUTEX(uclamp_mutex);

/* Sysctl-backed minimum utilization clamp; defaults to SCHED_CAPACITY_SCALE. */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;

/* Sysctl-backed maximum utilization clamp; defaults to SCHED_CAPACITY_SCALE. */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
 * Default UCLAMP_MIN request given to RT tasks whose request is not
 * user-defined; applied by __uclamp_update_util_min_rt_default().
 */
unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;

/*
 * System-wide clamp values, one per clamp id; uclamp_eff_get() caps task
 * requests at these values.
 */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/*
 * Static key, initially false.  While it is false uclamp_rq_inc() and
 * uclamp_rq_dec() return immediately.  It is enabled by uclamp_validate()
 * and by sysctl_sched_uclamp_handler().
 */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
931
932
/* Width of one clamp bucket: SCHED_CAPACITY_SCALE / UCLAMP_BUCKETS, rounded to closest. */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

/* Iterate @clamp_id over every clamp index from 0 up to UCLAMP_CNT - 1. */
#define for_each_clamp_id(clamp_id) \
	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
937
938static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
939{
940 return clamp_value / UCLAMP_BUCKET_DELTA;
941}
942
943static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
944{
945 return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
946}
947
948static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
949{
950 if (clamp_id == UCLAMP_MIN)
951 return 0;
952 return SCHED_CAPACITY_SCALE;
953}
954
955static inline void uclamp_se_set(struct uclamp_se *uc_se,
956 unsigned int value, bool user_defined)
957{
958 uc_se->value = value;
959 uc_se->bucket_id = uclamp_bucket_id(value);
960 uc_se->user_defined = user_defined;
961}
962
963static inline unsigned int
964uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
965 unsigned int clamp_value)
966{
967
968
969
970
971
972 if (clamp_id == UCLAMP_MAX) {
973 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
974 return clamp_value;
975 }
976
977 return uclamp_none(UCLAMP_MIN);
978}
979
980static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
981 unsigned int clamp_value)
982{
983
984 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
985 return;
986
987 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
988}
989
990static inline
991unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
992 unsigned int clamp_value)
993{
994 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
995 int bucket_id = UCLAMP_BUCKETS - 1;
996
997
998
999
1000
1001 for ( ; bucket_id >= 0; bucket_id--) {
1002 if (!bucket[bucket_id].tasks)
1003 continue;
1004 return bucket[bucket_id].value;
1005 }
1006
1007
1008 return uclamp_idle_value(rq, clamp_id, clamp_value);
1009}
1010
1011static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1012{
1013 unsigned int default_util_min;
1014 struct uclamp_se *uc_se;
1015
1016 lockdep_assert_held(&p->pi_lock);
1017
1018 uc_se = &p->uclamp_req[UCLAMP_MIN];
1019
1020
1021 if (uc_se->user_defined)
1022 return;
1023
1024 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1025 uclamp_se_set(uc_se, default_util_min, false);
1026}
1027
1028static void uclamp_update_util_min_rt_default(struct task_struct *p)
1029{
1030 struct rq_flags rf;
1031 struct rq *rq;
1032
1033 if (!rt_task(p))
1034 return;
1035
1036
1037 rq = task_rq_lock(p, &rf);
1038 __uclamp_update_util_min_rt_default(p);
1039 task_rq_unlock(rq, p, &rf);
1040}
1041
1042static void uclamp_sync_util_min_rt_default(void)
1043{
1044 struct task_struct *g, *p;
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059 read_lock(&tasklist_lock);
1060 smp_mb__after_spinlock();
1061 read_unlock(&tasklist_lock);
1062
1063 rcu_read_lock();
1064 for_each_process_thread(g, p)
1065 uclamp_update_util_min_rt_default(p);
1066 rcu_read_unlock();
1067}
1068
1069static inline struct uclamp_se
1070uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1071{
1072 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1073#ifdef CONFIG_UCLAMP_TASK_GROUP
1074 struct uclamp_se uc_max;
1075
1076
1077
1078
1079
1080 if (task_group_is_autogroup(task_group(p)))
1081 return uc_req;
1082 if (task_group(p) == &root_task_group)
1083 return uc_req;
1084
1085 uc_max = task_group(p)->uclamp[clamp_id];
1086 if (uc_req.value > uc_max.value || !uc_req.user_defined)
1087 return uc_max;
1088#endif
1089
1090 return uc_req;
1091}
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101static inline struct uclamp_se
1102uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1103{
1104 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1105 struct uclamp_se uc_max = uclamp_default[clamp_id];
1106
1107
1108 if (unlikely(uc_req.value > uc_max.value))
1109 return uc_max;
1110
1111 return uc_req;
1112}
1113
1114unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1115{
1116 struct uclamp_se uc_eff;
1117
1118
1119 if (p->uclamp[clamp_id].active)
1120 return (unsigned long)p->uclamp[clamp_id].value;
1121
1122 uc_eff = uclamp_eff_get(p, clamp_id);
1123
1124 return (unsigned long)uc_eff.value;
1125}
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1138 enum uclamp_id clamp_id)
1139{
1140 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1141 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1142 struct uclamp_bucket *bucket;
1143
1144 lockdep_assert_held(&rq->lock);
1145
1146
1147 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1148
1149 bucket = &uc_rq->bucket[uc_se->bucket_id];
1150 bucket->tasks++;
1151 uc_se->active = true;
1152
1153 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1154
1155
1156
1157
1158
1159 if (bucket->tasks == 1 || uc_se->value > bucket->value)
1160 bucket->value = uc_se->value;
1161
1162 if (uc_se->value > READ_ONCE(uc_rq->value))
1163 WRITE_ONCE(uc_rq->value, uc_se->value);
1164}
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1176 enum uclamp_id clamp_id)
1177{
1178 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1179 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1180 struct uclamp_bucket *bucket;
1181 unsigned int bkt_clamp;
1182 unsigned int rq_clamp;
1183
1184 lockdep_assert_held(&rq->lock);
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209 if (unlikely(!uc_se->active))
1210 return;
1211
1212 bucket = &uc_rq->bucket[uc_se->bucket_id];
1213
1214 SCHED_WARN_ON(!bucket->tasks);
1215 if (likely(bucket->tasks))
1216 bucket->tasks--;
1217
1218 uc_se->active = false;
1219
1220
1221
1222
1223
1224
1225
1226 if (likely(bucket->tasks))
1227 return;
1228
1229 rq_clamp = READ_ONCE(uc_rq->value);
1230
1231
1232
1233
1234 SCHED_WARN_ON(bucket->value > rq_clamp);
1235 if (bucket->value >= rq_clamp) {
1236 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1237 WRITE_ONCE(uc_rq->value, bkt_clamp);
1238 }
1239}
1240
1241static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1242{
1243 enum uclamp_id clamp_id;
1244
1245
1246
1247
1248
1249
1250
1251 if (!static_branch_unlikely(&sched_uclamp_used))
1252 return;
1253
1254 if (unlikely(!p->sched_class->uclamp_enabled))
1255 return;
1256
1257 for_each_clamp_id(clamp_id)
1258 uclamp_rq_inc_id(rq, p, clamp_id);
1259
1260
1261 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1262 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1263}
1264
1265static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1266{
1267 enum uclamp_id clamp_id;
1268
1269
1270
1271
1272
1273
1274
1275 if (!static_branch_unlikely(&sched_uclamp_used))
1276 return;
1277
1278 if (unlikely(!p->sched_class->uclamp_enabled))
1279 return;
1280
1281 for_each_clamp_id(clamp_id)
1282 uclamp_rq_dec_id(rq, p, clamp_id);
1283}
1284
1285static inline void
1286uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1287{
1288 struct rq_flags rf;
1289 struct rq *rq;
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299 rq = task_rq_lock(p, &rf);
1300
1301
1302
1303
1304
1305
1306
1307 if (p->uclamp[clamp_id].active) {
1308 uclamp_rq_dec_id(rq, p, clamp_id);
1309 uclamp_rq_inc_id(rq, p, clamp_id);
1310 }
1311
1312 task_rq_unlock(rq, p, &rf);
1313}
1314
1315#ifdef CONFIG_UCLAMP_TASK_GROUP
1316static inline void
1317uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1318 unsigned int clamps)
1319{
1320 enum uclamp_id clamp_id;
1321 struct css_task_iter it;
1322 struct task_struct *p;
1323
1324 css_task_iter_start(css, 0, &it);
1325 while ((p = css_task_iter_next(&it))) {
1326 for_each_clamp_id(clamp_id) {
1327 if ((0x1 << clamp_id) & clamps)
1328 uclamp_update_active(p, clamp_id);
1329 }
1330 }
1331 css_task_iter_end(&it);
1332}
1333
1334static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1335static void uclamp_update_root_tg(void)
1336{
1337 struct task_group *tg = &root_task_group;
1338
1339 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1340 sysctl_sched_uclamp_util_min, false);
1341 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1342 sysctl_sched_uclamp_util_max, false);
1343
1344 rcu_read_lock();
1345 cpu_util_update_eff(&root_task_group.css);
1346 rcu_read_unlock();
1347}
1348#else
/* Without CONFIG_UCLAMP_TASK_GROUP there is no root group clamp to update. */
static void uclamp_update_root_tg(void)
{
}
1350#endif
1351
1352int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1353 void *buffer, size_t *lenp, loff_t *ppos)
1354{
1355 bool update_root_tg = false;
1356 int old_min, old_max, old_min_rt;
1357 int result;
1358
1359 mutex_lock(&uclamp_mutex);
1360 old_min = sysctl_sched_uclamp_util_min;
1361 old_max = sysctl_sched_uclamp_util_max;
1362 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1363
1364 result = proc_dointvec(table, write, buffer, lenp, ppos);
1365 if (result)
1366 goto undo;
1367 if (!write)
1368 goto done;
1369
1370 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1371 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1372 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1373
1374 result = -EINVAL;
1375 goto undo;
1376 }
1377
1378 if (old_min != sysctl_sched_uclamp_util_min) {
1379 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1380 sysctl_sched_uclamp_util_min, false);
1381 update_root_tg = true;
1382 }
1383 if (old_max != sysctl_sched_uclamp_util_max) {
1384 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1385 sysctl_sched_uclamp_util_max, false);
1386 update_root_tg = true;
1387 }
1388
1389 if (update_root_tg) {
1390 static_branch_enable(&sched_uclamp_used);
1391 uclamp_update_root_tg();
1392 }
1393
1394 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1395 static_branch_enable(&sched_uclamp_used);
1396 uclamp_sync_util_min_rt_default();
1397 }
1398
1399
1400
1401
1402
1403
1404
1405 goto done;
1406
1407undo:
1408 sysctl_sched_uclamp_util_min = old_min;
1409 sysctl_sched_uclamp_util_max = old_max;
1410 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1411done:
1412 mutex_unlock(&uclamp_mutex);
1413
1414 return result;
1415}
1416
1417static int uclamp_validate(struct task_struct *p,
1418 const struct sched_attr *attr)
1419{
1420 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1421 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1422
1423 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1424 lower_bound = attr->sched_util_min;
1425 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1426 upper_bound = attr->sched_util_max;
1427
1428 if (lower_bound > upper_bound)
1429 return -EINVAL;
1430 if (upper_bound > SCHED_CAPACITY_SCALE)
1431 return -EINVAL;
1432
1433
1434
1435
1436
1437
1438
1439
1440 static_branch_enable(&sched_uclamp_used);
1441
1442 return 0;
1443}
1444
1445static void __setscheduler_uclamp(struct task_struct *p,
1446 const struct sched_attr *attr)
1447{
1448 enum uclamp_id clamp_id;
1449
1450
1451
1452
1453
1454 for_each_clamp_id(clamp_id) {
1455 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1456
1457
1458 if (uc_se->user_defined)
1459 continue;
1460
1461
1462
1463
1464
1465 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1466 __uclamp_update_util_min_rt_default(p);
1467 else
1468 uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
1469
1470 }
1471
1472 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1473 return;
1474
1475 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1476 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1477 attr->sched_util_min, true);
1478 }
1479
1480 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1481 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1482 attr->sched_util_max, true);
1483 }
1484}
1485
1486static void uclamp_fork(struct task_struct *p)
1487{
1488 enum uclamp_id clamp_id;
1489
1490
1491
1492
1493
1494 for_each_clamp_id(clamp_id)
1495 p->uclamp[clamp_id].active = false;
1496
1497 if (likely(!p->sched_reset_on_fork))
1498 return;
1499
1500 for_each_clamp_id(clamp_id) {
1501 uclamp_se_set(&p->uclamp_req[clamp_id],
1502 uclamp_none(clamp_id), false);
1503 }
1504}
1505
/* Apply the RT default UCLAMP_MIN request to the new task @p, if it is an RT task. */
static void uclamp_post_fork(struct task_struct *p)
{
	uclamp_update_util_min_rt_default(p);
}
1510
1511static void __init init_uclamp_rq(struct rq *rq)
1512{
1513 enum uclamp_id clamp_id;
1514 struct uclamp_rq *uc_rq = rq->uclamp;
1515
1516 for_each_clamp_id(clamp_id) {
1517 uc_rq[clamp_id] = (struct uclamp_rq) {
1518 .value = uclamp_none(clamp_id)
1519 };
1520 }
1521
1522 rq->uclamp_flags = 0;
1523}
1524
1525static void __init init_uclamp(void)
1526{
1527 struct uclamp_se uc_max = {};
1528 enum uclamp_id clamp_id;
1529 int cpu;
1530
1531 for_each_possible_cpu(cpu)
1532 init_uclamp_rq(cpu_rq(cpu));
1533
1534 for_each_clamp_id(clamp_id) {
1535 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1536 uclamp_none(clamp_id), false);
1537 }
1538
1539
1540 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1541 for_each_clamp_id(clamp_id) {
1542 uclamp_default[clamp_id] = uc_max;
1543#ifdef CONFIG_UCLAMP_TASK_GROUP
1544 root_task_group.uclamp_req[clamp_id] = uc_max;
1545 root_task_group.uclamp[clamp_id] = uc_max;
1546#endif
1547 }
1548}
1549
1550#else
1551static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1552static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1553static inline int uclamp_validate(struct task_struct *p,
1554 const struct sched_attr *attr)
1555{
1556 return -EOPNOTSUPP;
1557}
1558static void __setscheduler_uclamp(struct task_struct *p,
1559 const struct sched_attr *attr) { }
1560static inline void uclamp_fork(struct task_struct *p) { }
1561static inline void uclamp_post_fork(struct task_struct *p) { }
1562static inline void init_uclamp(void) { }
1563#endif
1564
1565static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1566{
1567 if (!(flags & ENQUEUE_NOCLOCK))
1568 update_rq_clock(rq);
1569
1570 if (!(flags & ENQUEUE_RESTORE)) {
1571 sched_info_queued(rq, p);
1572 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1573 }
1574
1575 uclamp_rq_inc(rq, p);
1576 p->sched_class->enqueue_task(rq, p, flags);
1577}
1578
1579static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1580{
1581 if (!(flags & DEQUEUE_NOCLOCK))
1582 update_rq_clock(rq);
1583
1584 if (!(flags & DEQUEUE_SAVE)) {
1585 sched_info_dequeued(rq, p);
1586 psi_dequeue(p, flags & DEQUEUE_SLEEP);
1587 }
1588
1589 uclamp_rq_dec(rq, p);
1590 p->sched_class->dequeue_task(rq, p, flags);
1591}
1592
1593void activate_task(struct rq *rq, struct task_struct *p, int flags)
1594{
1595 enqueue_task(rq, p, flags);
1596
1597 p->on_rq = TASK_ON_RQ_QUEUED;
1598}
1599
1600void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1601{
1602 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1603
1604 dequeue_task(rq, p, flags);
1605}
1606
1607
1608
1609
1610static inline int __normal_prio(struct task_struct *p)
1611{
1612 return p->static_prio;
1613}
1614
1615
1616
1617
1618
1619
1620
1621
1622static inline int normal_prio(struct task_struct *p)
1623{
1624 int prio;
1625
1626 if (task_has_dl_policy(p))
1627 prio = MAX_DL_PRIO-1;
1628 else if (task_has_rt_policy(p))
1629 prio = MAX_RT_PRIO-1 - p->rt_priority;
1630 else
1631 prio = __normal_prio(p);
1632 return prio;
1633}
1634
1635
1636
1637
1638
1639
1640
1641
1642static int effective_prio(struct task_struct *p)
1643{
1644 p->normal_prio = normal_prio(p);
1645
1646
1647
1648
1649
1650 if (!rt_prio(p->prio))
1651 return p->normal_prio;
1652 return p->prio;
1653}
1654
1655
1656
1657
1658
1659
1660
/*
 * task_curr - is @p the task currently running on its CPU?
 *
 * Return: non-zero if @p is the current task of the CPU it is assigned
 * to, 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
1665
1666
1667
1668
1669
1670
1671
1672
1673static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1674 const struct sched_class *prev_class,
1675 int oldprio)
1676{
1677 if (prev_class != p->sched_class) {
1678 if (prev_class->switched_from)
1679 prev_class->switched_from(rq, p);
1680
1681 p->sched_class->switched_to(rq, p);
1682 } else if (oldprio != p->prio || dl_task(p))
1683 p->sched_class->prio_changed(rq, p, oldprio);
1684}
1685
1686void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1687{
1688 if (p->sched_class == rq->curr->sched_class)
1689 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1690 else if (p->sched_class > rq->curr->sched_class)
1691 resched_curr(rq);
1692
1693
1694
1695
1696
1697 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1698 rq_clock_skip_update(rq);
1699}
1700
1701#ifdef CONFIG_SMP
1702
1703
1704
1705
1706
1707static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1708{
1709 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
1710 return false;
1711
1712 if (is_per_cpu_kthread(p))
1713 return cpu_online(cpu);
1714
1715 return cpu_active(cpu);
1716}
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
1738 struct task_struct *p, int new_cpu)
1739{
1740 lockdep_assert_held(&rq->lock);
1741
1742 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1743 set_task_cpu(p, new_cpu);
1744 rq_unlock(rq, rf);
1745
1746 rq = cpu_rq(new_cpu);
1747
1748 rq_lock(rq, rf);
1749 BUG_ON(task_cpu(p) != new_cpu);
1750 activate_task(rq, p, 0);
1751 check_preempt_curr(rq, p, 0);
1752
1753 return rq;
1754}
1755
/* Argument block read by migration_cpu_stop(). */
struct migration_arg {
	struct task_struct		*task;		/* task to migrate */
	int				dest_cpu;	/* CPU the task is to be moved to */
};
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
/*
 * __migrate_task - move @p to @dest_cpu if it is allowed to run there.
 *
 * When is_cpu_allowed() rejects @dest_cpu nothing is done and @rq is
 * returned unchanged.  Otherwise the clock of @rq is updated and the task is
 * moved with move_queued_task(), whose (locked) destination runqueue is
 * returned.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	if (is_cpu_allowed(p, dest_cpu)) {
		update_rq_clock(rq);
		rq = move_queued_task(rq, rf, p, dest_cpu);
	}

	return rq;
}
1782
1783
1784
1785
1786
1787
/*
 * migration_cpu_stop - callback passed to stop_one_cpu() that moves a task
 * away from the CPU it executes on.
 *
 * @data points to a struct migration_arg.  Always returns 0.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	struct rq_flags rf;

	/*
	 * Interrupts stay disabled for the whole operation; they are
	 * re-enabled right before returning.
	 */
	local_irq_disable();

	/*
	 * Flush pending smp-call-function work before looking at the task.
	 * NOTE(review): presumably so queued remote wakeups are processed
	 * before the migration decision is made -- confirm.
	 */
	flush_smp_call_function_from_idle();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);

	/*
	 * Only act if the task still belongs to this runqueue.  A queued task
	 * is migrated right away; for a task that is not queued only
	 * ->wake_cpu is updated.
	 */
	if (task_rq(p) == rq) {
		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;
	}
	/* @rq may be the destination runqueue at this point. */
	rq_unlock(rq, &rf);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}
1826
1827
1828
1829
1830
1831void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1832{
1833 cpumask_copy(&p->cpus_mask, new_mask);
1834 p->nr_cpus_allowed = cpumask_weight(new_mask);
1835}
1836
/*
 * do_set_cpus_allowed - apply @new_mask to @p through its scheduling class.
 *
 * The caller must hold @p->pi_lock (checked through lockdep).  If the task
 * is queued it is dequeued before and re-enqueued after the class's
 * ->set_cpus_allowed() hook runs; if it is the current task of its runqueue
 * it is likewise bracketed by put_prev_task() / set_next_task().
 */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Touching the runqueue of a queued task additionally
		 * requires rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	/* Undo the bracket above in the same order it was set up. */
	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);
}
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
/*
 * __set_cpus_allowed_ptr - change the CPU affinity of @p to @new_mask and
 * move the task off its current CPU if that CPU is no longer allowed.
 *
 * @check: when true, tasks flagged PF_NO_SETAFFINITY are rejected.
 *
 * Returns 0 on success (including the case where the mask is unchanged) and
 * -EINVAL when @check rejects the task or when @new_mask contains no usable
 * CPU.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	unsigned int dest_cpu;
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	if (p->flags & PF_KTHREAD) {
		/*
		 * Kernel threads are validated against the online mask
		 * instead of the active mask.
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	/*
	 * The flag is tested with the task's locks held.
	 * NOTE(review): presumably this re-check closes a race with a
	 * concurrent setter of PF_NO_SETAFFINITY -- confirm against callers.
	 */
	if (check && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	/* Nothing to do when the mask does not change. */
	if (cpumask_equal(&p->cpus_mask, new_mask))
		goto out;

	/*
	 * Pick a destination among the CPUs that are both valid and in
	 * @new_mask; a result >= nr_cpu_ids means there is no such CPU.
	 */
	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
	if (dest_cpu >= nr_cpu_ids) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	if (p->flags & PF_KTHREAD) {
		/*
		 * Warn about a kernel thread whose new mask reaches online
		 * CPUs but no active CPU unless it is bound to exactly one
		 * CPU.
		 */
		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
			!cpumask_intersects(new_mask, cpu_active_mask) &&
			p->nr_cpus_allowed != 1);
	}

	/* The task's current CPU is still allowed: no migration needed. */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	if (task_running(rq, p) || p->state == TASK_WAKING) {
		struct migration_arg arg = { p, dest_cpu };

		/*
		 * Drop the locks and let migration_cpu_stop() do the move on
		 * the task's CPU.  This path returns directly because the
		 * locks have already been released.
		 */
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		return 0;
	} else if (task_on_rq_queued(p)) {
		/*
		 * Queued but not running: move it directly.  @rq becomes the
		 * destination runqueue, which is the one unlocked below.
		 */
		rq = move_queued_task(rq, &rf, p, dest_cpu);
	}
out:
	task_rq_unlock(rq, p, &rf);

	return ret;
}
1951
1952int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1953{
1954 return __set_cpus_allowed_ptr(p, new_mask, false);
1955}
1956EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1957
/*
 * set_task_cpu - record that @p now belongs to @new_cpu.
 *
 * Emits the sched_migrate_task tracepoint unconditionally.  When the CPU
 * really changes, the scheduling class's optional ->migrate_task_rq() hook
 * runs, the migration counter is bumped and rseq / perf are notified.
 * Finally the CPU is stored with __set_task_cpu().
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * Debug check: the task should be TASK_RUNNING, TASK_WAKING, or
	 * on a runqueue when its CPU is changed.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!p->on_rq);

	/*
	 * Debug check: a TASK_RUNNING task of the fair class that is on a
	 * runqueue must be marked as migrating.
	 */
	WARN_ON_ONCE(p->state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * Debug check: the caller must hold either p->pi_lock or the lock of
	 * the task's current runqueue.
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
	/* Debug check: the destination CPU must be online. */
	WARN_ON_ONCE(!cpu_online(new_cpu));
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		rseq_migrate(p);
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}
2009
2010#ifdef CONFIG_NUMA_BALANCING
/*
 * __migrate_swap_task - move @p to @cpu as one half of a task swap.
 *
 * A queued task is deactivated on its current runqueue and activated on the
 * runqueue of @cpu.  No runqueue lock is taken here, the locks are only
 * pinned, so the caller is expected to hold both of them already.  A task
 * that is not queued only has ->wake_cpu updated.
 */
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;
		struct rq_flags srf, drf;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		rq_pin_lock(src_rq, &srf);
		rq_pin_lock(dst_rq, &drf);

		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);

		rq_unpin_lock(dst_rq, &drf);
		rq_unpin_lock(src_rq, &srf);

	} else {
		/*
		 * Not queued: only record the requested CPU in ->wake_cpu.
		 * NOTE(review): presumably consumed by the next wakeup (see
		 * the select_task_rq() call in try_to_wake_up()) -- confirm.
		 */
		p->wake_cpu = cpu;
	}
}
2040
/*
 * Argument block for migrate_swap_stop(): the two tasks to exchange and the
 * CPU each of them is expected to be on.
 */
struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};
2045
/*
 * migrate_swap_stop - callback passed to stop_two_cpus() that exchanges the
 * CPUs of two tasks.
 *
 * @data points to a struct migration_swap_arg.  Returns 0 when the swap was
 * carried out and -EAGAIN when one of the CPUs is not active, a task is no
 * longer on its expected CPU, or a task is not allowed on the other CPU.
 */
static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;
	int ret = -EAGAIN;

	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
		return -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	/* Both pi_locks are taken first, then both runqueue locks. */
	double_raw_lock(&arg->src_task->pi_lock,
			&arg->dst_task->pi_lock);
	double_rq_lock(src_rq, dst_rq);

	/* Re-validate everything now that all locks are held. */
	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		goto unlock;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		goto unlock;

	if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
		goto unlock;

	if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
		goto unlock;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	ret = 0;

unlock:
	double_rq_unlock(src_rq, dst_rq);
	raw_spin_unlock(&arg->dst_task->pi_lock);
	raw_spin_unlock(&arg->src_task->pi_lock);

	return ret;
}
2086
2087
2088
2089
2090int migrate_swap(struct task_struct *cur, struct task_struct *p,
2091 int target_cpu, int curr_cpu)
2092{
2093 struct migration_swap_arg arg;
2094 int ret = -EINVAL;
2095
2096 arg = (struct migration_swap_arg){
2097 .src_task = cur,
2098 .src_cpu = curr_cpu,
2099 .dst_task = p,
2100 .dst_cpu = target_cpu,
2101 };
2102
2103 if (arg.src_cpu == arg.dst_cpu)
2104 goto out;
2105
2106
2107
2108
2109
2110 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
2111 goto out;
2112
2113 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
2114 goto out;
2115
2116 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
2117 goto out;
2118
2119 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2120 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2121
2122out:
2123 return ret;
2124}
2125#endif
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
/*
 * wait_task_inactive - wait until @p is neither running nor queued.
 *
 * @match_state: if non-zero, give up as soon as @p->state is observed to
 *               differ from it.
 *
 * Returns 0 when @match_state was given and did not match.  Otherwise
 * returns @p->nvcsw with the LONG_MIN bit or'ed in, which makes the value
 * non-zero even when the counter itself is 0.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	int running, queued;
	struct rq_flags rf;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * Lockless snapshot of the task's runqueue; it is only used
		 * for the busy-wait below and re-read under the lock later.
		 */
		rq = task_rq(p);

		/*
		 * Spin without holding any lock while the task appears to be
		 * running, bailing out if the state stops matching.
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Take the task's runqueue lock and sample a consistent view
		 * of its running / queued status and state.
		 */
		rq = task_rq_lock(p, &rf);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		queued = task_on_rq_queued(p);
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &rf);

		/* State mismatch: report failure. */
		if (unlikely(!ncsw))
			break;

		/*
		 * The task started running again between the lockless spin
		 * and taking the lock: retry from the top.
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * Still queued but not running: sleep for roughly one tick
		 * (NSEC_PER_SEC / HZ nanoseconds) instead of spinning, then
		 * retry.
		 */
		if (unlikely(queued)) {
			ktime_t to = NSEC_PER_SEC / HZ;

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/* Neither running nor queued: done. */
		break;
	}

	return ncsw;
}
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
/*
 * kick_process - send a reschedule IPI to the CPU @p is currently running on.
 *
 * Nothing is sent when @p's CPU is the calling CPU or when @p is not the
 * current task of its CPU.  Preemption is disabled around the check so the
 * comparison against smp_processor_id() stays meaningful.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if (cpu != smp_processor_id()) {
		if (task_curr(p))
			smp_send_reschedule(cpu);
	}
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
/*
 * select_fallback_rq - find some CPU @p can run on when its preferred CPU
 * is not usable.
 *
 * @cpu: the CPU the task was last associated with; used to prefer a CPU on
 *       the same node and for the diagnostic message.
 *
 * The search widens in stages: active CPUs on @cpu's node that are in the
 * task's mask, then any allowed CPU in the task's mask, then (after asking
 * cpusets for a fallback mask) again, and finally after resetting the mask
 * to cpu_possible_mask.  If even that fails the kernel BUG()s.
 *
 * Returns the chosen CPU.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * cpu_to_node() may return -1; in that case the node-local search is
	 * skipped.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for an allowed, active CPU on the same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed CPU in the task's current mask? */
		for_each_cpu(dest_cpu, p->cpus_ptr) {
			if (!is_cpu_allowed(p, dest_cpu))
				continue;

			goto out;
		}

		/* No usable CPU: widen the task's mask and try again. */
		switch (state) {
		case cpuset:
			if (IS_ENABLED(CONFIG_CPUSETS)) {
				cpuset_cpus_allowed_fallback(p);
				state = possible;
				break;
			}
			fallthrough;
		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * The task's affinity had to be widened.  Report it
		 * (rate-limited, deferred printk) for tasks that have an mm.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}
2350
2351
2352
2353
2354static inline
2355int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
2356{
2357 lockdep_assert_held(&p->pi_lock);
2358
2359 if (p->nr_cpus_allowed > 1)
2360 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
2361 else
2362 cpu = cpumask_any(p->cpus_ptr);
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374 if (unlikely(!is_cpu_allowed(p, cpu)))
2375 cpu = select_fallback_rq(task_cpu(p), p);
2376
2377 return cpu;
2378}
2379
2380void sched_set_stop_task(int cpu, struct task_struct *stop)
2381{
2382 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2383 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2384
2385 if (stop) {
2386
2387
2388
2389
2390
2391
2392
2393
2394 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
2395
2396 stop->sched_class = &stop_sched_class;
2397 }
2398
2399 cpu_rq(cpu)->stop = stop;
2400
2401 if (old_stop) {
2402
2403
2404
2405
2406 old_stop->sched_class = &rt_sched_class;
2407 }
2408}
2409
2410#else
2411
/*
 * !CONFIG_SMP variant of __set_cpus_allowed_ptr(): @check is ignored and
 * the request is forwarded to set_cpus_allowed_ptr().
 */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
					 const struct cpumask *new_mask, bool check)
{
	return set_cpus_allowed_ptr(p, new_mask);
}
2417
2418#endif
2419
/*
 * ttwu_stat - account wakeup statistics for @p being woken on @cpu.
 *
 * Does nothing unless schedstats are enabled.  All counters are updated
 * relative to the runqueue of the CPU executing this function.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq;

	if (!schedstat_enabled())
		return;

	rq = this_rq();

#ifdef CONFIG_SMP
	if (cpu == rq->cpu) {
		/* Woken on the CPU that performs the wakeup. */
		__schedstat_inc(rq->ttwu_local);
		__schedstat_inc(p->se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		__schedstat_inc(p->se.statistics.nr_wakeups_remote);
		/*
		 * Charge the remote wakeup to the first domain of this CPU
		 * whose span contains the target CPU.
		 */
		rcu_read_lock();
		for_each_domain(rq->cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				__schedstat_inc(sd->ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		__schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */

	__schedstat_inc(rq->ttwu_count);
	__schedstat_inc(p->se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		__schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
2458
2459
2460
2461
/*
 * ttwu_do_wakeup - mark @p runnable after it has been placed on @rq.
 *
 * Checks whether @p should preempt the current task, sets @p->state to
 * TASK_RUNNING and emits the sched_wakeup tracepoint.  On SMP it also runs
 * the class's optional ->task_woken() hook and updates the runqueue's
 * average idle time if an idle stamp is pending.
 */
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
			   struct rq_flags *rf)
{
	check_preempt_curr(rq, p, wake_flags);
	p->state = TASK_RUNNING;
	trace_sched_wakeup(p);

#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * The lock pin is released around the hook and re-established
		 * afterwards.
		 * NOTE(review): presumably because the hook may drop
		 * rq->lock -- confirm.
		 */
		rq_unpin_lock(rq, rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, rf);
	}

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		/* Clamp the average to twice the maximum idle-balance cost. */
		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->idle_stamp = 0;
	}
#endif
}
2493
2494static void
2495ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
2496 struct rq_flags *rf)
2497{
2498 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
2499
2500 lockdep_assert_held(&rq->lock);
2501
2502 if (p->sched_contributes_to_load)
2503 rq->nr_uninterruptible--;
2504
2505#ifdef CONFIG_SMP
2506 if (wake_flags & WF_MIGRATED)
2507 en_flags |= ENQUEUE_MIGRATED;
2508#endif
2509
2510 activate_task(rq, p, en_flags);
2511 ttwu_do_wakeup(rq, p, wake_flags, rf);
2512}
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539static int ttwu_runnable(struct task_struct *p, int wake_flags)
2540{
2541 struct rq_flags rf;
2542 struct rq *rq;
2543 int ret = 0;
2544
2545 rq = __task_rq_lock(p, &rf);
2546 if (task_on_rq_queued(p)) {
2547
2548 update_rq_clock(rq);
2549 ttwu_do_wakeup(rq, p, wake_flags, &rf);
2550 ret = 1;
2551 }
2552 __task_rq_unlock(rq, &rf);
2553
2554 return ret;
2555}
2556
2557#ifdef CONFIG_SMP
/*
 * sched_ttwu_pending - activate all tasks on a list of queued wakeups.
 *
 * @arg is the head llist_node of the list; entries are task_structs linked
 * through wake_entry.llist.  Every task is activated on the runqueue of the
 * CPU executing this function.
 */
void sched_ttwu_pending(void *arg)
{
	struct llist_node *llist = arg;
	struct rq *rq = this_rq();
	struct task_struct *p, *t;
	struct rq_flags rf;

	if (!llist)
		return;

	/*
	 * Clear the pending marker before the list is processed.
	 * NOTE(review): rq->ttwu_pending is set by __ttwu_queue_wakelist();
	 * who reads it is not visible here -- confirm the ordering
	 * requirements.
	 */
	WRITE_ONCE(rq->ttwu_pending, 0);

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

	llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
		/* Unexpected: warn, then wait until the task is off its CPU. */
		if (WARN_ON_ONCE(p->on_cpu))
			smp_cond_load_acquire(&p->on_cpu, !VAL);

		/* Unexpected: warn, then force the task onto this CPU. */
		if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
			set_task_cpu(p, cpu_of(rq));

		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
	}

	rq_unlock_irqrestore(rq, &rf);
}
2590
2591void send_call_function_single_ipi(int cpu)
2592{
2593 struct rq *rq = cpu_rq(cpu);
2594
2595 if (!set_nr_if_polling(rq->idle))
2596 arch_send_call_function_single_ipi(cpu);
2597 else
2598 trace_sched_wake_idle_without_ipi(cpu);
2599}
2600
2601
2602
2603
2604
2605
2606
/*
 * __ttwu_queue_wakelist - hand the wakeup of @p over to @cpu.
 *
 * Records in @p->sched_remote_wakeup whether WF_MIGRATED was set, marks the
 * target runqueue as having a wakeup pending, and queues the task's
 * wake_entry on @cpu's smp-call queue.  sched_ttwu_pending() in this file
 * processes such a list and reads sched_remote_wakeup back.
 */
static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);

	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

	WRITE_ONCE(rq->ttwu_pending, 1);
	__smp_call_single_queue(cpu, &p->wake_entry.llist);
}
2616
/*
 * wake_up_if_idle - kick @cpu if it is currently running its idle task.
 *
 * A polling idle task is notified via set_nr_if_polling() without an IPI;
 * otherwise a reschedule IPI is sent, after re-checking under the runqueue
 * lock that the CPU is still idle.
 */
void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	rcu_read_lock();

	/* Lockless check first; rq->curr is dereferenced under RCU. */
	if (!is_idle_task(rcu_dereference(rq->curr)))
		goto out;

	if (set_nr_if_polling(rq->idle)) {
		trace_sched_wake_idle_without_ipi(cpu);
	} else {
		rq_lock_irqsave(rq, &rf);
		if (is_idle_task(rq->curr))
			smp_send_reschedule(cpu);
		/* Else: the CPU is no longer idle, nothing to do. */
		rq_unlock_irqrestore(rq, &rf);
	}

out:
	rcu_read_unlock();
}
2640
2641bool cpus_share_cache(int this_cpu, int that_cpu)
2642{
2643 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
2644}
2645
2646static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2647{
2648
2649
2650
2651
2652 if (!cpus_share_cache(smp_processor_id(), cpu))
2653 return true;
2654
2655
2656
2657
2658
2659
2660
2661 if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
2662 return true;
2663
2664 return false;
2665}
2666
2667static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2668{
2669 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
2670 if (WARN_ON_ONCE(cpu == smp_processor_id()))
2671 return false;
2672
2673 sched_clock_cpu(cpu);
2674 __ttwu_queue_wakelist(p, cpu, wake_flags);
2675 return true;
2676 }
2677
2678 return false;
2679}
2680
2681#else
2682
/*
 * !CONFIG_SMP variant of ttwu_queue_wakelist(): nothing is ever queued, so
 * the caller always performs the wakeup itself.
 */
static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	return false;
}
2687
2688#endif
2689
/*
 * ttwu_queue - make @p runnable on @cpu.
 *
 * First tries to hand the wakeup over to @cpu via ttwu_queue_wakelist().
 * If that is not possible, the target runqueue is locked and the task is
 * activated there directly.
 */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	if (ttwu_queue_wakelist(p, cpu, wake_flags))
		return;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	ttwu_do_activate(rq, p, wake_flags, &rf);
	rq_unlock(rq, &rf);
}
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
/*
 * try_to_wake_up - wake up @p if its state matches @state.
 *
 * @p:          the task to wake.
 * @state:      mask of task states that may be woken.
 * @wake_flags: WF_* flags describing the wakeup.
 *
 * Returns 1 when @p->state intersected @state (the task was woken, or was
 * still queued and has been marked runnable), 0 otherwise.  Wakeup
 * statistics are accounted whenever 1 is returned.
 *
 * The memory barriers below pair with code outside this function; their
 * exact pairings are not visible from here and are deliberately not
 * restated.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	preempt_disable();
	if (p == current) {
		/*
		 * Waking ourselves: no lock is taken.  If the state matches,
		 * the task simply goes back to TASK_RUNNING; no runqueue
		 * manipulation happens on this path.
		 */
		if (!(p->state & state))
			goto out;

		success = 1;
		trace_sched_waking(p);
		p->state = TASK_RUNNING;
		trace_sched_wakeup(p);
		goto out;
	}

	/*
	 * Waking another task: serialize on p->pi_lock and issue a full
	 * barrier after taking it, before p->state is inspected.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	smp_mb__after_spinlock();
	if (!(p->state & state))
		goto unlock;

	trace_sched_waking(p);

	/* From here on the wakeup is reported as successful. */
	success = 1;

	/*
	 * Order the p->state load above against the p->on_rq load below.
	 * If the task is still on a runqueue, ttwu_runnable() completes the
	 * wakeup under the runqueue lock.
	 */
	smp_rmb();
	if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
		goto unlock;

	if (p->in_iowait) {
		delayacct_blkio_end(p);
		atomic_dec(&task_rq(p)->nr_iowait);
	}

#ifdef CONFIG_SMP
	/*
	 * Upgrade the control dependency on the p->on_rq test above to
	 * acquire ordering before p->on_cpu is examined.
	 */
	smp_acquire__after_ctrl_dep();

	/*
	 * Mark the wakeup as in progress.  __set_cpus_allowed_ptr() tests
	 * for TASK_WAKING when deciding how to migrate a task.
	 */
	p->state = TASK_WAKING;

	/*
	 * If the task is still on a CPU, try to queue the wakeup on that CPU
	 * instead of waiting for it here.
	 */
	if (smp_load_acquire(&p->on_cpu) &&
	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
		goto unlock;

	/*
	 * Otherwise wait until p->on_cpu is clear (finish_task() stores 0
	 * with release semantics) before a new CPU is selected.
	 */
	smp_cond_load_acquire(&p->on_cpu, !VAL);

	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		psi_ttwu_dequeue(p);
		set_task_cpu(p, cpu);
	}
#else
	cpu = task_cpu(p);
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu, wake_flags);
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
	if (success)
		ttwu_stat(p, task_cpu(p), wake_flags);
	preempt_enable();

	return success;
}
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
/*
 * try_invoke_on_locked_down_task - call @func on @p while @p is pinned down
 * by its locks.
 *
 * @p:    the task to operate on.
 * @func: callback; its return value becomes this function's return value.
 * @arg:  opaque argument passed through to @func.
 *
 * Must be called with interrupts enabled (asserted via lockdep).  With
 * p->pi_lock held, @func is invoked either under the task's runqueue lock
 * (task is on a runqueue) or directly (task is off the runqueue and neither
 * TASK_RUNNING nor TASK_WAKING).
 *
 * Returns false when @func was not invoked, otherwise whatever @func
 * returned.
 */
bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
{
	bool ret = false;
	struct rq_flags rf;
	struct rq *rq;

	lockdep_assert_irqs_enabled();
	raw_spin_lock_irq(&p->pi_lock);
	if (p->on_rq) {
		rq = __task_rq_lock(p, &rf);
		if (task_rq(p) == rq)
			ret = func(p, arg);
		rq_unlock(rq, &rf);
	} else {
		switch (p->state) {
		case TASK_RUNNING:
		case TASK_WAKING:
			/* Task is runnable or being woken: do not invoke. */
			break;
		default:
			/* Order the p->state load before re-reading p->on_rq. */
			smp_rmb();
			if (!p->on_rq)
				ret = func(p, arg);
		}
	}
	raw_spin_unlock_irq(&p->pi_lock);
	return ret;
}
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
/*
 * wake_up_process - wake up @p if it is in one of the TASK_NORMAL states.
 *
 * Returns the result of try_to_wake_up(): 1 if the task's state matched,
 * 0 otherwise.
 */
int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
3051
/*
 * wake_up_state - wake up @p if its state intersects the @state mask.
 *
 * Returns the result of try_to_wake_up(): 1 if the task's state matched,
 * 0 otherwise.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
3056
3057
3058
3059
3060
3061
3062
/*
 * __sched_fork - reset the scheduler-private state of a newly forked task.
 *
 * Clears the runqueue membership and the fair, deadline and RT entity
 * fields of @p, and initializes the optional per-config members (group
 * scheduling, schedstats, preempt notifiers, compaction capture, NUMA
 * balancing, remote-wakeup entry).
 */
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	p->on_rq			= 0;

	/* Fair-class entity. */
	p->se.on_rq			= 0;
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.nr_migrations		= 0;
	p->se.vruntime			= 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq			= NULL;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* Start the new task with zeroed statistics. */
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	/* Deadline-class entity. */
	RB_CLEAR_NODE(&p->dl.rb_node);
	init_dl_task_timer(&p->dl);
	init_dl_inactive_task_timer(&p->dl);
	__dl_clear_params(p);

	/* RT-class entity. */
	INIT_LIST_HEAD(&p->rt.run_list);
	p->rt.timeout		= 0;
	p->rt.time_slice	= sched_rr_timeslice;
	p->rt.on_rq		= 0;
	p->rt.on_list		= 0;

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_COMPACTION
	p->capture_control = NULL;
#endif
	init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
	/* Tag the wake entry so queued wakeups are recognized as TTWU work. */
	p->wake_entry.u_flags = CSD_TYPE_TTWU;
#endif
}
3107
/* Static key for NUMA balancing; defined in the false (disabled) state. */
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3109
3110#ifdef CONFIG_NUMA_BALANCING
3111
3112void set_numabalancing_state(bool enabled)
3113{
3114 if (enabled)
3115 static_branch_enable(&sched_numa_balancing);
3116 else
3117 static_branch_disable(&sched_numa_balancing);
3118}
3119
3120#ifdef CONFIG_PROC_SYSCTL
/*
 * sysctl_numa_balancing - sysctl handler that exposes the NUMA-balancing
 * static key as an integer.
 *
 * Writes require CAP_SYS_ADMIN (-EPERM otherwise).  The value is parsed
 * into a local copy through proc_dointvec_minmax() on a private copy of
 * @table; on a successful write the static key is updated to match.
 *
 * Returns -EPERM, or the result of proc_dointvec_minmax().
 */
int sysctl_numa_balancing(struct ctl_table *table, int write,
			  void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = static_branch_likely(&sched_numa_balancing);

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Point a private copy of the table at the local state variable. */
	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0)
		return err;
	if (write)
		set_numabalancing_state(state);
	return err;
}
3140#endif
3141#endif
3142
3143#ifdef CONFIG_SCHEDSTATS
3144
/* Static key gating schedstats; defined in the false (disabled) state. */
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
/* Boot-time choice from "schedstats=", applied by init_schedstats(). */
static bool __initdata __sched_schedstats = false;
3147
3148static void set_schedstats(bool enabled)
3149{
3150 if (enabled)
3151 static_branch_enable(&sched_schedstats);
3152 else
3153 static_branch_disable(&sched_schedstats);
3154}
3155
3156void force_schedstat_enabled(void)
3157{
3158 if (!schedstat_enabled()) {
3159 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
3160 static_branch_enable(&sched_schedstats);
3161 }
3162}
3163
3164static int __init setup_schedstats(char *str)
3165{
3166 int ret = 0;
3167 if (!str)
3168 goto out;
3169
3170
3171
3172
3173
3174
3175 if (!strcmp(str, "enable")) {
3176 __sched_schedstats = true;
3177 ret = 1;
3178 } else if (!strcmp(str, "disable")) {
3179 __sched_schedstats = false;
3180 ret = 1;
3181 }
3182out:
3183 if (!ret)
3184 pr_warn("Unable to parse schedstats=\n");
3185
3186 return ret;
3187}
3188__setup("schedstats=", setup_schedstats);
3189
/* Apply the boot-time "schedstats=" choice to the static key. */
static void __init init_schedstats(void)
{
	set_schedstats(__sched_schedstats);
}
3194
3195#ifdef CONFIG_PROC_SYSCTL
/*
 * sysctl_schedstats - sysctl handler that exposes the schedstats static key
 * as an integer.
 *
 * Writes require CAP_SYS_ADMIN (-EPERM otherwise).  The value is parsed
 * into a local copy through proc_dointvec_minmax() on a private copy of
 * @table; on a successful write the static key is updated to match.
 *
 * Returns -EPERM, or the result of proc_dointvec_minmax().
 */
int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = static_branch_likely(&sched_schedstats);

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Point a private copy of the table at the local state variable. */
	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0)
		return err;
	if (write)
		set_schedstats(state);
	return err;
}
3215#endif
3216#else
/* !CONFIG_SCHEDSTATS: there is nothing to initialize. */
static inline void init_schedstats(void) {}
3218#endif
3219
3220
3221
3222
/*
 * sched_fork - set up the scheduler state of a newly forked task @p.
 *
 * Resets the scheduling entities, puts the task in TASK_NEW, derives its
 * priority from the parent's normal priority, honours sched_reset_on_fork,
 * selects the scheduling class and assigns the task to the current CPU.
 *
 * Returns 0 on success, or -EAGAIN when the resulting priority is a
 * deadline priority.
 */
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long flags;

	__sched_fork(clone_flags, p);
	/*
	 * TASK_NEW rather than TASK_RUNNING; wake_up_new_task() is what
	 * switches the task to TASK_RUNNING.
	 */
	p->state = TASK_NEW;

	/* The child starts from the parent's normal (non-boosted) priority. */
	p->prio = current->normal_prio;

	uclamp_fork(p);

	/* Revert to default priority/policy on fork if requested. */
	if (unlikely(p->sched_reset_on_fork)) {
		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p, false);

		/* The reset is a one-shot request: clear the flag again. */
		p->sched_reset_on_fork = 0;
	}

	if (dl_prio(p->prio))
		return -EAGAIN;
	else if (rt_prio(p->prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	init_entity_runnable_average(&p->se);

	/*
	 * Assign the CPU and run the class's optional ->task_fork() hook
	 * with p->pi_lock held and interrupts disabled.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	rseq_migrate(p);
	/*
	 * Only the CPU field is set here (__set_task_cpu()); the task is not
	 * queued anywhere yet.
	 */
	__set_task_cpu(p, smp_processor_id());
	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#ifdef CONFIG_SCHED_INFO
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
	init_task_preempt_count(p);
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
	RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
	return 0;
}
3304
/* sched_post_fork - post-fork scheduler hook; forwards to uclamp_post_fork(). */
void sched_post_fork(struct task_struct *p)
{
	uclamp_post_fork(p);
}
3309
3310unsigned long to_ratio(u64 period, u64 runtime)
3311{
3312 if (runtime == RUNTIME_INF)
3313 return BW_UNIT;
3314
3315
3316
3317
3318
3319
3320 if (period == 0)
3321 return 0;
3322
3323 return div64_u64(runtime << BW_SHIFT, period);
3324}
3325
3326
3327
3328
3329
3330
3331
3332
/*
 * wake_up_new_task - make a freshly forked task runnable for the first time.
 *
 * Sets the task to TASK_RUNNING, (on SMP) selects a CPU for it, enqueues it
 * on that CPU's runqueue and checks whether it should preempt the task
 * currently running there.
 */
void wake_up_new_task(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	/*
	 * Re-select the CPU with p->pi_lock held.  __set_task_cpu() is used,
	 * so only the CPU field is updated; the set_task_cpu() side effects
	 * (class hook, migration count, perf) do not apply.  rseq is
	 * notified explicitly.
	 */
	p->recent_used_cpu = task_cpu(p);
	rseq_migrate(p);
	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
	post_init_entity_util_avg(p);

	activate_task(rq, p, ENQUEUE_NOCLOCK);
	trace_sched_wakeup_new(p);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * The lock pin is released around the hook and re-established
		 * afterwards.
		 * NOTE(review): presumably because the hook may drop
		 * rq->lock -- confirm.
		 */
		rq_unpin_lock(rq, &rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, &rf);
	}
#endif
	/* Releases both the runqueue lock and p->pi_lock taken above. */
	task_rq_unlock(rq, p, &rf);
}
3373
3374#ifdef CONFIG_PREEMPT_NOTIFIERS
3375
/*
 * Static key gating the preempt-notifier hooks; adjusted through
 * preempt_notifier_inc() / preempt_notifier_dec().
 */
static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
3377
/* preempt_notifier_inc - take one reference on the preempt-notifier static key. */
void preempt_notifier_inc(void)
{
	static_branch_inc(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_inc);
3383
/* preempt_notifier_dec - drop one reference on the preempt-notifier static key. */
void preempt_notifier_dec(void)
{
	static_branch_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_dec);
3389
3390
3391
3392
3393
3394void preempt_notifier_register(struct preempt_notifier *notifier)
3395{
3396 if (!static_branch_unlikely(&preempt_notifier_key))
3397 WARN(1, "registering preempt_notifier while notifiers disabled\n");
3398
3399 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
3400}
3401EXPORT_SYMBOL_GPL(preempt_notifier_register);
3402
3403
3404
3405
3406
3407
3408
3409void preempt_notifier_unregister(struct preempt_notifier *notifier)
3410{
3411 hlist_del(¬ifier->link);
3412}
3413EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3414
/*
 * Invoke the ->sched_in() callback of every preempt notifier registered on
 * @curr, passing the id of the CPU executing this code.
 */
static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
3422
3423static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3424{
3425 if (static_branch_unlikely(&preempt_notifier_key))
3426 __fire_sched_in_preempt_notifiers(curr);
3427}
3428
/*
 * Invoke the ->sched_out() callback of every preempt notifier registered on
 * @curr, passing @next (the task being switched to).
 */
static void
__fire_sched_out_preempt_notifiers(struct task_struct *curr,
				   struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}
3438
3439static __always_inline void
3440fire_sched_out_preempt_notifiers(struct task_struct *curr,
3441 struct task_struct *next)
3442{
3443 if (static_branch_unlikely(&preempt_notifier_key))
3444 __fire_sched_out_preempt_notifiers(curr, next);
3445}
3446
3447#else
3448
/* !CONFIG_PREEMPT_NOTIFIERS: there are no sched-in notifiers to run. */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
3452
/* !CONFIG_PREEMPT_NOTIFIERS: there are no sched-out notifiers to run. */
static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
3458
3459#endif
3460
/*
 * prepare_task - mark @next as executing on a CPU before switching to it.
 * Compiles to nothing on !CONFIG_SMP.
 */
static inline void prepare_task(struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * Flag the task as being on a CPU.  try_to_wake_up() reads ->on_cpu
	 * locklessly, hence WRITE_ONCE().
	 */
	WRITE_ONCE(next->on_cpu, 1);
#endif
}
3473
/*
 * finish_task - mark @prev as no longer executing on a CPU after the switch
 * away from it.  Compiles to nothing on !CONFIG_SMP.
 */
static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * Clear ->on_cpu with release semantics so that everything done on
	 * behalf of @prev up to here is visible to whoever observes the 0.
	 * In this file the observers are the smp_load_acquire() /
	 * smp_cond_load_acquire() calls on ->on_cpu in try_to_wake_up() and
	 * sched_ttwu_pending().
	 */
	smp_store_release(&prev->on_cpu, 0);
#endif
}
3491
/*
 * prepare_lock_switch - hand the bookkeeping of rq->lock over to @next
 * ahead of a context switch.
 *
 * The lock itself stays held; only the debugging state is adjusted: the pin
 * is dropped, lockdep is told the lock has been released, and (with
 * CONFIG_DEBUG_SPINLOCK) the recorded owner becomes @next.
 * finish_lock_switch() performs the matching lockdep acquire and the real
 * unlock.
 */
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
	rq_unpin_lock(rq, rf);
	spin_release(&rq->lock.dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
	/* The next task is the one that will actually unlock rq->lock. */
	rq->lock.owner = next;
#endif
}
3508
/*
 * finish_lock_switch - release rq->lock after a context switch.
 *
 * Counterpart of prepare_lock_switch(): lockdep is first told that the lock
 * is held (it was reported as released before the switch), then the lock is
 * really dropped and interrupts are enabled.
 */
static inline void finish_lock_switch(struct rq *rq)
{
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
	raw_spin_unlock_irq(&rq->lock);
}
3519
3520
3521
3522
3523
/* No-op fallback for when prepare_arch_switch() is not already defined. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif

/* No-op fallback for when finish_arch_post_lock_switch() is not already defined. */
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
/*
 * prepare_task_switch - work done right before switching from @prev to
 * @next on @rq.
 *
 * Notifies kcov, sched_info, perf, rseq and the preempt notifiers about the
 * upcoming switch, marks @next as on-CPU, and runs the architecture's
 * prepare hook.  finish_task_switch() is the counterpart run after the
 * switch.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	kcov_prepare_switch(prev);
	sched_info_switch(rq, prev, next);
	perf_event_task_sched_out(prev, next);
	rseq_preempt(prev);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_task(next);
	prepare_arch_switch(next);
}
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
/*
 * finish_task_switch - clean up after a context switch away from @prev.
 *
 * Runs on the new task.  Finishes the bookkeeping started by
 * prepare_task_switch(), releases rq->lock, drops the mm reference stashed
 * in rq->prev_mm (if any) and, when @prev was TASK_DEAD, releases the
 * scheduler's references on it.
 *
 * Returns the runqueue of the CPU this function runs on.
 */
static struct rq *finish_task_switch(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	/*
	 * The preempt count is expected to be exactly
	 * 2*PREEMPT_DISABLE_OFFSET here; if it is not, warn once and reset
	 * it to FORK_PREEMPT_COUNT.
	 */
	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
		      "corrupted preempt_count: %s/%d/0x%x\n",
		      current->comm, current->pid, preempt_count()))
		preempt_count_set(FORK_PREEMPT_COUNT);

	rq->prev_mm = NULL;

	/*
	 * Sample prev->state before finish_task() clears prev->on_cpu.
	 * NOTE(review): presumably because @prev may be woken and change
	 * state on another CPU once ->on_cpu is clear -- confirm.
	 */
	prev_state = prev->state;
	vtime_task_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_task(prev);
	finish_lock_switch(rq);
	finish_arch_post_lock_switch();
	kcov_finish_switch(current);

	fire_sched_in_preempt_notifiers(current);
	/*
	 * Drop the mm reference handed over in rq->prev_mm; the membarrier
	 * hook runs just before the reference is dropped.
	 */
	if (mm) {
		membarrier_mm_sync_core_before_usermode(mm);
		mmdrop(mm);
	}
	if (unlikely(prev_state == TASK_DEAD)) {
		if (prev->sched_class->task_dead)
			prev->sched_class->task_dead(prev);

		/* Let kprobes release whatever it still holds for @prev. */
		kprobe_flush_task(prev);

		/* Drop the reference on the task's stack. */
		put_task_stack(prev);

		put_task_struct_rcu_user(prev);
	}

	tick_nohz_task_switch();
	return rq;
}
3657
3658#ifdef CONFIG_SMP
3659
3660
3661static void __balance_callback(struct rq *rq)
3662{
3663 struct callback_head *head, *next;
3664 void (*func)(struct rq *rq);
3665 unsigned long flags;
3666
3667 raw_spin_lock_irqsave(&rq->lock, flags);
3668 head = rq->balance_callback;
3669 rq->balance_callback = NULL;
3670 while (head) {
3671 func = (void (*)(struct rq *))head->func;
3672 next = head->next;
3673 head->next = NULL;
3674 head = next;
3675
3676 func(rq);
3677 }
3678 raw_spin_unlock_irqrestore(&rq->lock, flags);
3679}
3680
3681static inline void balance_callback(struct rq *rq)
3682{
3683 if (unlikely(rq->balance_callback))
3684 __balance_callback(rq);
3685}
3686
3687#else
3688
/* !CONFIG_SMP: there are no balance callbacks, so this is a no-op. */
static inline void balance_callback(struct rq *rq)
{
}
3692
3693#endif
3694
3695
3696
3697
3698
/*
 * schedule_tail - finish a context switch on behalf of the current task.
 * @prev: the task we switched away from.
 *
 * NOTE(review): presumably the first code a freshly forked task runs once
 * it is scheduled in - confirm against the architecture fork-return path.
 */
asmlinkage __visible void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq;

	/*
	 * finish_task_switch() releases rq->lock; the preempt_enable() below
	 * then drops a preemption level that this function did not take
	 * itself.
	 *
	 * NOTE(review): that level is presumably part of the initial
	 * FORK_PREEMPT_COUNT - confirm.
	 */
	rq = finish_task_switch(prev);
	balance_callback(rq);
	preempt_enable();

	/* Report our (namespace-relative) pid to user space if requested. */
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);

	calculate_sigpending();
}
3722
3723
3724
3725
/*
 * context_switch - switch to the new MM and the new thread's register state.
 * @rq:   runqueue of this CPU (lock held).
 * @prev: task being switched out.
 * @next: task being switched in.
 * @rf:   rq_flags cookie for the held rq->lock.
 *
 * Return: the runqueue returned by finish_task_switch(), i.e. the runqueue
 * of the CPU the resumed task is running on.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next, struct rq_flags *rf)
{
	prepare_task_switch(rq, prev, next);

	/* Architecture hook run before the mm / register switch. */
	arch_start_context_switch(prev);

	/*
	 * mm handling, by case:
	 *
	 *  - @next has no mm of its own: it borrows prev->active_mm in lazy
	 *    TLB mode. If @prev owns that mm (prev->mm set) an extra
	 *    reference is taken with mmgrab(); otherwise @prev was itself
	 *    borrowing it and simply passes its borrowed reference along.
	 *
	 *  - @next has an mm: switch to it. If @prev was borrowing its
	 *    active_mm, stash it in rq->prev_mm so finish_task_switch() can
	 *    mmdrop() it after the switch.
	 */
	if (!next->mm) {
		enter_lazy_tlb(prev->active_mm, next);

		next->active_mm = prev->active_mm;
		if (prev->mm)
			mmgrab(prev->active_mm);
		else
			prev->active_mm = NULL;
	} else {
		membarrier_switch_mm(rq, prev->active_mm, next->mm);

		/* Interrupts are already off here, hence the _irqs_off variant. */
		switch_mm_irqs_off(prev->active_mm, next->mm, next);

		if (!prev->mm) {
			/* Dropped by mmdrop() in finish_task_switch(). */
			rq->prev_mm = prev->active_mm;
			prev->active_mm = NULL;
		}
	}

	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

	prepare_lock_switch(rq, next, rf);

	/* Switch register state and stack; execution resumes here as @next. */
	switch_to(prev, next, prev);
	barrier();

	return finish_task_switch(prev);
}
3783
3784
3785
3786
3787
3788
3789
3790unsigned long nr_running(void)
3791{
3792 unsigned long i, sum = 0;
3793
3794 for_each_online_cpu(i)
3795 sum += cpu_rq(i)->nr_running;
3796
3797 return sum;
3798}
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813bool single_task_running(void)
3814{
3815 return raw_rq()->nr_running == 1;
3816}
3817EXPORT_SYMBOL(single_task_running);
3818
3819unsigned long long nr_context_switches(void)
3820{
3821 int i;
3822 unsigned long long sum = 0;
3823
3824 for_each_possible_cpu(i)
3825 sum += cpu_rq(i)->nr_switches;
3826
3827 return sum;
3828}
3829
3830
3831
3832
3833
3834
3835
3836
3837unsigned long nr_iowait_cpu(int cpu)
3838{
3839 return atomic_read(&cpu_rq(cpu)->nr_iowait);
3840}
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872unsigned long nr_iowait(void)
3873{
3874 unsigned long i, sum = 0;
3875
3876 for_each_possible_cpu(i)
3877 sum += nr_iowait_cpu(i);
3878
3879 return sum;
3880}
3881
3882#ifdef CONFIG_SMP
3883
3884
3885
3886
3887
3888void sched_exec(void)
3889{
3890 struct task_struct *p = current;
3891 unsigned long flags;
3892 int dest_cpu;
3893
3894 raw_spin_lock_irqsave(&p->pi_lock, flags);
3895 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
3896 if (dest_cpu == smp_processor_id())
3897 goto unlock;
3898
3899 if (likely(cpu_active(dest_cpu))) {
3900 struct migration_arg arg = { p, dest_cpu };
3901
3902 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3903 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3904 return;
3905 }
3906unlock:
3907 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3908}
3909
3910#endif
3911
/* Per-CPU kernel statistics storage, defined here and exported to modules. */
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3917
3918
3919
3920
3921
3922
3923
3924static inline void prefetch_curr_exec_start(struct task_struct *p)
3925{
3926#ifdef CONFIG_FAIR_GROUP_SCHED
3927 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3928#else
3929 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3930#endif
3931 prefetch(curr);
3932 prefetch(&curr->exec_start);
3933}
3934
3935
3936
3937
3938
3939
/*
 * task_sched_runtime - return @p's accumulated execution time.
 * @p: task to query.
 *
 * Return: p->se.sum_exec_runtime, brought up to date first when @p is
 * currently running on its runqueue.
 */
unsigned long long task_sched_runtime(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;
	u64 ns;

#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
	/*
	 * Lockless fast path: if @p is not on a CPU or not queued, its
	 * runtime is returned without taking the runqueue lock.
	 *
	 * NOTE(review): presumably restricted to 64-bit because only there
	 * the u64 read is a single access - confirm.
	 */
	if (!p->on_cpu || !task_on_rq_queued(p))
		return p->se.sum_exec_runtime;
#endif

	rq = task_rq_lock(p, &rf);

	/*
	 * Only a task that is the runqueue's current task and still queued
	 * has un-accounted runtime; fold it in via the class' update_curr().
	 */
	if (task_current(rq, p) && task_on_rq_queued(p)) {
		prefetch_curr_exec_start(p);
		update_rq_clock(rq);
		p->sched_class->update_curr(rq);
	}
	ns = p->se.sum_exec_runtime;
	task_rq_unlock(rq, p, &rf);

	return ns;
}
3978
3979
3980
3981
3982
/*
 * scheduler_tick - periodic scheduler bookkeeping for the local CPU.
 *
 * NOTE(review): presumably invoked from the timer tick with interrupts
 * disabled (rq_lock() is used without an _irq variant) - confirm.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;
	struct rq_flags rf;
	unsigned long thermal_pressure;

	arch_scale_freq_tick();
	sched_clock_tick();

	rq_lock(rq, &rf);

	/* Everything below, up to rq_unlock(), runs under rq->lock. */
	update_rq_clock(rq);
	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
	/* Let the current task's scheduling class account the tick. */
	curr->sched_class->task_tick(rq, curr, 0);
	calc_global_load_tick(rq);
	psi_task_tick(rq);

	rq_unlock(rq, &rf);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	/* Record whether this CPU is idle, then maybe kick load balancing. */
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
#endif
}
4012
4013#ifdef CONFIG_NO_HZ_FULL
4014
/*
 * Per-CPU bookkeeping for the remote scheduler tick.
 * @cpu:   CPU this work item ticks on behalf of (set in sched_tick_start()).
 * @state: one of the TICK_SCHED_REMOTE_* values below.
 * @work:  delayed work item whose handler is sched_tick_remote().
 */
struct tick_work {
	int			cpu;
	atomic_t		state;
	struct delayed_work	work;
};

/* Values of tick_work::state. */
#define TICK_SCHED_REMOTE_OFFLINE	0
#define TICK_SCHED_REMOTE_OFFLINING	1
#define TICK_SCHED_REMOTE_RUNNING	2

/*
 * State transitions, as implemented by the functions in this file:
 *
 *   sched_tick_start():  any state -> RUNNING (xchg); the work item is
 *                        only initialised and queued when the previous
 *                        state was OFFLINE.
 *   sched_tick_stop():   RUNNING -> OFFLINING (xchg).
 *   sched_tick_remote(): decrements the state unless it is RUNNING, i.e.
 *                        OFFLINING -> OFFLINE, and requeues itself only
 *                        while the state is RUNNING.
 */

/* Per-CPU tick_work storage, allocated by sched_tick_offload_init(). */
static struct tick_work __percpu *tick_work_cpu;
4049
/*
 * sched_tick_remote - delayed-work handler ticking on behalf of another CPU.
 * @work: the work_struct embedded in a tick_work's delayed_work.
 *
 * If the target CPU has stopped its own tick and is online, run the current
 * task's task_tick() hook and the nohz load calculation for it under its
 * runqueue lock. Afterwards requeue the work HZ jiffies later, but only
 * while the tick_work state is still TICK_SCHED_REMOTE_RUNNING.
 */
static void sched_tick_remote(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct tick_work *twork = container_of(dwork, struct tick_work, work);
	int cpu = twork->cpu;
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr;
	struct rq_flags rf;
	u64 delta;
	int os;

	/*
	 * Nothing to do if the target CPU still runs its own tick; just
	 * requeue.
	 */
	if (!tick_nohz_tick_stopped_cpu(cpu))
		goto out_requeue;

	rq_lock_irq(rq, &rf);
	curr = rq->curr;
	if (cpu_is_offline(cpu))
		goto out_unlock;

	update_rq_clock(rq);

	if (!is_idle_task(curr)) {
		/*
		 * Sanity check: warn once if more than 3 seconds have passed
		 * since the current task's exec_start timestamp.
		 */
		delta = rq_clock_task(rq) - curr->se.exec_start;
		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
	}
	curr->sched_class->task_tick(rq, curr, 0);

	calc_load_nohz_remote(rq);
out_unlock:
	rq_unlock_irq(rq, &rf);
out_requeue:

	/*
	 * Decrement the state unless it is RUNNING: this moves OFFLINING to
	 * OFFLINE. The handler must never observe OFFLINE itself. Requeue
	 * only if the state was (and therefore still is) RUNNING.
	 */
	os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
	if (os == TICK_SCHED_REMOTE_RUNNING)
		queue_delayed_work(system_unbound_wq, dwork, HZ);
}
4104
4105static void sched_tick_start(int cpu)
4106{
4107 int os;
4108 struct tick_work *twork;
4109
4110 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4111 return;
4112
4113 WARN_ON_ONCE(!tick_work_cpu);
4114
4115 twork = per_cpu_ptr(tick_work_cpu, cpu);
4116 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4117 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4118 if (os == TICK_SCHED_REMOTE_OFFLINE) {
4119 twork->cpu = cpu;
4120 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4121 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4122 }
4123}
4124
4125#ifdef CONFIG_HOTPLUG_CPU
4126static void sched_tick_stop(int cpu)
4127{
4128 struct tick_work *twork;
4129 int os;
4130
4131 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4132 return;
4133
4134 WARN_ON_ONCE(!tick_work_cpu);
4135
4136 twork = per_cpu_ptr(tick_work_cpu, cpu);
4137
4138 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
4139 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
4140
4141}
4142#endif
4143
4144int __init sched_tick_offload_init(void)
4145{
4146 tick_work_cpu = alloc_percpu(struct tick_work);
4147 BUG_ON(!tick_work_cpu);
4148 return 0;
4149}
4150
4151#else
/* !CONFIG_NO_HZ_FULL: there is no remote tick, so both hooks are no-ops. */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
4154#endif
4155
4156#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
4157 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
4158
4159
4160
4161
4162static inline void preempt_latency_start(int val)
4163{
4164 if (preempt_count() == val) {
4165 unsigned long ip = get_lock_parent_ip();
4166#ifdef CONFIG_DEBUG_PREEMPT
4167 current->preempt_disable_ip = ip;
4168#endif
4169 trace_preempt_off(CALLER_ADDR0, ip);
4170 }
4171}
4172
/*
 * preempt_count_add - out-of-line, instrumented preempt count increment.
 * @val: amount to add to the preempt count.
 *
 * Built only with CONFIG_PREEMPTION and preempt debugging or tracing
 * enabled. Adds @val, runs sanity checks under CONFIG_DEBUG_PREEMPT and
 * notifies the preempt-off latency tracing.
 */
void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Underflow check: refuse to touch an already negative count. */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Overflow check: warn when the preempt-disable nesting field gets
	 * within 10 of its maximum.
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	preempt_latency_start(val);
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
4194
4195
4196
4197
4198
4199static inline void preempt_latency_stop(int val)
4200{
4201 if (preempt_count() == val)
4202 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
4203}
4204
/*
 * preempt_count_sub - out-of-line, instrumented preempt count decrement.
 * @val: amount to subtract from the preempt count.
 *
 * Counterpart of preempt_count_add(). With CONFIG_DEBUG_PREEMPT the
 * decrement is refused (after a warning) if it looks unbalanced.
 */
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Underflow check: cannot subtract more than the current count. */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * A decrement smaller than PREEMPT_MASK is only valid when the
	 * preempt-disable nesting field is non-zero.
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	/* Trace before decrementing, while the count still equals @val. */
	preempt_latency_stop(val);
	__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
4226
4227#else
/* No preempt debugging/tracing configured: the latency hooks are no-ops. */
static inline void preempt_latency_start(int val) { }
static inline void preempt_latency_stop(int val) { }
4230#endif
4231
/*
 * get_preempt_disable_ip - address where @p last disabled preemption.
 * @p: task to query.
 *
 * Return: p->preempt_disable_ip when CONFIG_DEBUG_PREEMPT is enabled,
 * 0 otherwise.
 */
static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
	unsigned long ip = 0;

#ifdef CONFIG_DEBUG_PREEMPT
	ip = p->preempt_disable_ip;
#endif
	return ip;
}
4240
4241
4242
4243
/*
 * __schedule_bug - report a "scheduling while atomic" bug.
 * @prev: the task that called into the scheduler.
 *
 * Prints diagnostics (held locks, modules, IRQ trace, where preemption was
 * disabled, stack dump), panics if panic_on_warn is set, and taints the
 * kernel. Silent while an oops is already in progress.
 */
static noinline void __schedule_bug(struct task_struct *prev)
{
	/* Save this before calling printk(), since that will clobber it */
	unsigned long preempt_disable_ip = get_preempt_disable_ip(current);

	if (oops_in_progress)
		return;

	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		prev->comm, prev->pid, preempt_count());

	debug_show_held_locks(prev);
	print_modules();
	if (irqs_disabled())
		print_irqtrace_events(prev);
	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
	    && in_atomic_preempt_off()) {
		pr_err("Preemption disabled at:");
		print_ip_sym(KERN_ERR, preempt_disable_ip);
	}
	if (panic_on_warn)
		panic("scheduling while atomic\n");

	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
4270
4271
4272
4273
/*
 * schedule_debug - sanity checks and statistics on entry to __schedule().
 * @prev:    the task entering the scheduler.
 * @preempt: true when this is a preemption rather than a voluntary call.
 */
static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
	/* A corrupted stack or shadow stack end is fatal. */
	if (task_stack_end_corrupted(prev))
		panic("corrupted stack end detected inside scheduler\n");

	if (task_scs_end_corrupted(prev))
		panic("corrupted shadow stack detected inside scheduler\n");
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	/*
	 * A voluntary schedule with a non-running state while
	 * non_block_count is set means the task is blocking inside a
	 * section marked non-blocking.
	 */
	if (!preempt && prev->state && prev->non_block_count) {
		printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
			prev->comm, prev->pid, prev->non_block_count);
		dump_stack();
		add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
	}
#endif

	/* Report scheduling while atomic, then repair the preempt count. */
	if (unlikely(in_atomic_preempt_off())) {
		__schedule_bug(prev);
		preempt_count_set(PREEMPT_DISABLED);
	}
	rcu_sleep_check();

	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq()->sched_count);
}
4303
/*
 * put_prev_task_balance - balance, then put the previous task.
 * @rq:   runqueue being scheduled (lock held).
 * @prev: task that was running.
 * @rf:   rq_flags cookie, passed to the classes' balance() hooks.
 *
 * Used by the slow path of pick_next_task().
 */
static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
				  struct rq_flags *rf)
{
#ifdef CONFIG_SMP
	const struct sched_class *class;
	/*
	 * Starting at @prev's class and stopping before the idle class, give
	 * each class a chance to balance; stop at the first class whose
	 * balance() returns non-zero.
	 *
	 * NOTE(review): non-zero presumably means "this class has a runnable
	 * task now" - confirm against the balance() implementations.
	 */
	for_class_range(class, prev->sched_class, &idle_sched_class) {
		if (class->balance(rq, prev, rf))
			break;
	}
#endif

	put_prev_task(rq, prev);
}
4325
4326
4327
4328
/*
 * pick_next_task - choose the task to run next on @rq.
 * @rq:   runqueue being scheduled (lock held).
 * @prev: task that was running.
 * @rf:   rq_flags cookie for the held lock.
 *
 * Return: the chosen task; never NULL (the function BUG()s if no class
 * returns a task).
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Fast path: if @prev's class is the fair class or one that compares
	 * at or below it, and every runnable task on this runqueue is a CFS
	 * task (nr_running == cfs.h_nr_running), ask the fair class directly.
	 */
	if (likely(prev->sched_class <= &fair_sched_class &&
		   rq->nr_running == rq->cfs.h_nr_running)) {

		p = pick_next_task_fair(rq, prev, rf);
		if (unlikely(p == RETRY_TASK))
			goto restart;

		/* The fair class found nothing: fall back to the idle task. */
		if (!p) {
			put_prev_task(rq, prev);
			p = pick_next_task_idle(rq);
		}

		return p;
	}

restart:
	put_prev_task_balance(rq, prev, rf);

	/* Slow path: ask every class in turn and take the first task offered. */
	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	/* Reaching this point means no class offered a task at all. */
	BUG();
}
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
/*
 * __schedule - the main scheduler function.
 * @preempt: true if the current task is being preempted, false if it
 *           entered the scheduler voluntarily.
 *
 * Picks the next task for this CPU and switches to it. Callers in this
 * file (schedule(), preempt_schedule_common(), ...) disable preemption
 * around the call; schedule_idle() and do_task_dead() call it directly.
 *
 * Outline:
 *  1. Debug checks, clear the hrtick, disable interrupts, note the context
 *     switch to RCU.
 *  2. Take rq->lock and update the runqueue clock.
 *  3. On a voluntary call with a non-running state: either make the task
 *     runnable again (pending signal) or dequeue it.
 *  4. Pick the next task and, if it differs from the current one,
 *     context-switch to it; otherwise just unlock.
 *  5. Run any queued balance callbacks.
 */
static void __sched notrace __schedule(bool preempt)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	unsigned long prev_state;
	struct rq_flags rf;
	struct rq *rq;
	int cpu;

	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;

	schedule_debug(prev, preempt);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	local_irq_disable();
	rcu_note_context_switch(preempt);

	/*
	 * Take the runqueue lock and upgrade it to a full memory barrier.
	 *
	 * NOTE(review): the barrier presumably orders the caller's earlier
	 * store to ->state against the loads below (signal_pending_state(),
	 * ->state) - confirm against the wakeup path.
	 */
	rq_lock(rq, &rf);
	smp_mb__after_spinlock();

	/*
	 * Shift the clock-update flag bits left by one.
	 *
	 * NOTE(review): presumably promotes a requested clock-update skip
	 * (RQCF_REQ_SKIP) to an active one (RQCF_ACT_SKIP) - confirm against
	 * the flag definitions.
	 */
	rq->clock_update_flags <<= 1;
	update_rq_clock(rq);

	/* Default: count this as an involuntary context switch. */
	switch_count = &prev->nivcsw;

	/*
	 * Read ->state once; all decisions below use this snapshot so that a
	 * concurrent change of prev->state cannot produce an inconsistent
	 * view.
	 */
	prev_state = prev->state;
	if (!preempt && prev_state) {
		if (signal_pending_state(prev_state, prev)) {
			/* A signal is pending: do not block, stay runnable. */
			prev->state = TASK_RUNNING;
		} else {
			/*
			 * Uninterruptible, not TASK_NOLOAD and not frozen
			 * tasks are accounted in nr_uninterruptible.
			 */
			prev->sched_contributes_to_load =
				(prev_state & TASK_UNINTERRUPTIBLE) &&
				!(prev_state & TASK_NOLOAD) &&
				!(prev->flags & PF_FROZEN);

			if (prev->sched_contributes_to_load)
				rq->nr_uninterruptible++;

			/*
			 * Take the task off the runqueue. The clock was
			 * updated above, hence DEQUEUE_NOCLOCK.
			 */
			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);

			if (prev->in_iowait) {
				atomic_inc(&rq->nr_iowait);
				delayacct_blkio_start();
			}
		}
		/* Voluntary call: count a voluntary context switch instead. */
		switch_count = &prev->nvcsw;
	}

	next = pick_next_task(rq, prev, &rf);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();

	if (likely(prev != next)) {
		rq->nr_switches++;
		/*
		 * Publish the new current task. RCU_INIT_POINTER() is used,
		 * i.e. no release ordering is added by this store itself.
		 */
		RCU_INIT_POINTER(rq->curr, next);
		++*switch_count;

		/* The last argument tells psi whether @prev went to sleep. */
		psi_sched_switch(prev, next, !task_on_rq_queued(prev));

		trace_sched_switch(preempt, prev, next);

		/* Also unlocks the rq: */
		rq = context_switch(rq, prev, next, &rf);
	} else {
		/* Same task keeps running: clear the skip flags and unlock. */
		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
		rq_unlock_irq(rq, &rf);
	}

	balance_callback(rq);
}
4535
/*
 * do_task_dead - final scheduling of a dying task; never returns.
 *
 * Marks the current task TASK_DEAD and schedules away. finish_task_switch()
 * on the next task releases this task's resources when it sees TASK_DEAD.
 */
void __noreturn do_task_dead(void)
{
	/* Causes final put_task_struct in finish_task_switch(): */
	set_special_state(TASK_DEAD);

	/* Tell freezer to ignore us: */
	current->flags |= PF_NOFREEZE;

	__schedule(false);
	/* A dead task must never be scheduled back in. */
	BUG();

	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
	for (;;)
		cpu_relax();
}
4551
/*
 * sched_submit_work - housekeeping before a task blocks in schedule().
 * @tsk: the task about to schedule (the caller passes current).
 *
 * Does nothing when the task state is 0, i.e. it is not going to sleep.
 */
static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state)
		return;

	/*
	 * Workqueue and io-wq workers tell their subsystem that they are
	 * about to sleep. The notification runs with preemption disabled and
	 * re-enables it without triggering a reschedule.
	 *
	 * NOTE(review): presumably so the worker pool can start another
	 * worker to keep concurrency up - confirm in workqueue.c / io-wq.c.
	 */
	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
		preempt_disable();
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_sleeping(tsk);
		else
			io_wq_worker_sleeping(tsk);
		preempt_enable_no_resched();
	}

	/* Skip the plug flush when the task is blocked on a PI lock. */
	if (tsk_is_pi_blocked(tsk))
		return;

	/*
	 * Submit plugged block I/O before sleeping.
	 *
	 * NOTE(review): presumably to avoid deadlocks on I/O that would
	 * otherwise stay queued in the plug - confirm.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}
4584
4585static void sched_update_worker(struct task_struct *tsk)
4586{
4587 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4588 if (tsk->flags & PF_WQ_WORKER)
4589 wq_worker_running(tsk);
4590 else
4591 io_wq_worker_running(tsk);
4592 }
4593}
4594
/*
 * schedule - voluntarily give up the CPU.
 *
 * Runs the pre-sleep housekeeping, then calls __schedule(false) with
 * preemption disabled, repeating for as long as a reschedule is pending,
 * and finally notifies the worker subsystems that the task runs again.
 */
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	do {
		preempt_disable();
		__schedule(false);
		/* Re-enable without rescheduling; the loop condition handles it. */
		sched_preempt_enable_no_resched();
	} while (need_resched());
	sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
/*
 * schedule_idle - scheduler entry used by the idle task.
 *
 * Unlike schedule() it skips sched_submit_work()/sched_update_worker() and
 * does not touch the preempt count itself.
 *
 * NOTE(review): the caller is therefore presumably expected to have
 * preemption disabled already - confirm against the idle loop.
 */
void __sched schedule_idle(void)
{
	/*
	 * The caller is expected to be in the running state (state 0); warn
	 * once otherwise.
	 */
	WARN_ON_ONCE(current->state);
	do {
		__schedule(false);
	} while (need_resched());
}
4633
4634#ifdef CONFIG_CONTEXT_TRACKING
/*
 * schedule_user - schedule() wrapped in context-tracking enter/exit.
 *
 * NOTE(review): presumably called on the return-to-user path while context
 * tracking still believes the CPU is in user mode - confirm against the
 * architecture entry code.
 */
asmlinkage __visible void __sched schedule_user(void)
{
	/*
	 * exception_enter() returns the previous context state so that
	 * exception_exit() can restore it after scheduling.
	 */
	enum ctx_state prev_state = exception_enter();
	schedule();
	exception_exit(prev_state);
}
4651#endif
4652
4653
4654
4655
4656
4657
/*
 * schedule_preempt_disabled - schedule() for callers with preemption off.
 *
 * Drops one preemption level without triggering a reschedule, calls
 * schedule(), and takes the preemption level again, so the caller returns
 * with the same preempt count it entered with.
 */
void __sched schedule_preempt_disabled(void)
{
	sched_preempt_enable_no_resched();
	schedule();
	preempt_disable();
}
4664
/*
 * preempt_schedule_common - common body of the preemption entry points.
 *
 * Repeatedly calls __schedule(true) with preemption disabled until no
 * reschedule is pending.
 */
static void __sched notrace preempt_schedule_common(void)
{
	do {
		/*
		 * The notrace preempt_disable() variant is used here, so the
		 * preempt-off latency hooks are called by hand with the
		 * expected count of 1.
		 *
		 * NOTE(review): presumably because the function tracer may
		 * itself end up in this path - confirm.
		 */
		preempt_disable_notrace();
		preempt_latency_start(1);
		__schedule(true);
		preempt_latency_stop(1);
		preempt_enable_no_resched_notrace();

		/* Check again in case we missed a preemption opportunity. */
	} while (need_resched());
}
4693
4694#ifdef CONFIG_PREEMPTION
4695
4696
4697
4698
4699asmlinkage __visible void __sched notrace preempt_schedule(void)
4700{
4701
4702
4703
4704
4705 if (likely(!preemptible()))
4706 return;
4707
4708 preempt_schedule_common();
4709}
4710NOKPROBE_SYMBOL(preempt_schedule);
4711EXPORT_SYMBOL(preempt_schedule);
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
/*
 * preempt_schedule_notrace - preemption entry point for notrace callers.
 *
 * Like preempt_schedule(), but each __schedule() call is additionally
 * wrapped in exception_enter()/exception_exit() for context tracking.
 *
 * NOTE(review): presumably needed because tracing code can call this while
 * context tracking still considers the CPU to be in user mode - confirm.
 */
asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
	enum ctx_state prev_ctx;

	if (likely(!preemptible()))
		return;

	do {
		/*
		 * notrace preempt_disable() variant plus hand-called latency
		 * hooks, as in preempt_schedule_common().
		 */
		preempt_disable_notrace();
		preempt_latency_start(1);
		/*
		 * Make context tracking see kernel mode for the duration of
		 * the schedule, then restore the previous state.
		 */
		prev_ctx = exception_enter();
		__schedule(true);
		exception_exit(prev_ctx);

		preempt_latency_stop(1);
		preempt_enable_no_resched_notrace();
	} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
4764
4765#endif
4766
4767
4768
4769
4770
4771
4772
/*
 * preempt_schedule_irq - preemption entry point for interrupts-off callers.
 *
 * Must be called with a zero preempt count and interrupts disabled (this is
 * enforced with BUG_ON). Interrupts are enabled around each __schedule()
 * call and disabled again before returning.
 *
 * NOTE(review): presumably the entry point used on return from interrupt
 * context - confirm against the architecture entry code.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
	enum ctx_state prev_state;

	/* Catch callers which need to be fixed */
	BUG_ON(preempt_count() || !irqs_disabled());

	prev_state = exception_enter();

	do {
		preempt_disable();
		local_irq_enable();
		__schedule(true);
		local_irq_disable();
		sched_preempt_enable_no_resched();
	} while (need_resched());

	exception_exit(prev_state);
}
4792
4793int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
4794 void *key)
4795{
4796 WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
4797 return try_to_wake_up(curr->private, mode, wake_flags);
4798}
4799EXPORT_SYMBOL(default_wake_function);
4800
4801#ifdef CONFIG_RT_MUTEXES
4802
4803static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
4804{
4805 if (pi_task)
4806 prio = min(prio, pi_task->prio);
4807
4808 return prio;
4809}
4810
/*
 * rt_effective_prio - effective priority of @p given its top PI donor.
 * @p:    task being queried.
 * @prio: the priority to start from.
 *
 * Looks up the donor with rt_mutex_get_top_task() and defers to
 * __rt_effective_prio().
 */
static inline int rt_effective_prio(struct task_struct *p, int prio)
{
	return __rt_effective_prio(rt_mutex_get_top_task(p), prio);
}
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
/*
 * rt_mutex_setprio - set the effective priority of a task for PI.
 * @p:       task whose priority is (de)boosted.
 * @pi_task: donor task, or NULL to remove the boost.
 *
 * Computes the effective priority from @p's normal priority and the donor,
 * switches scheduling class if required, and requeues the task.
 *
 * The caller must hold p->pi_lock: __task_rq_lock() asserts it.
 */
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
	int prio, oldprio, queued, running, queue_flag =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	const struct sched_class *prev_class;
	struct rq_flags rf;
	struct rq *rq;

	/* XXX used to be waiter->prio, not waiter->task->prio */
	prio = __rt_effective_prio(pi_task, p->normal_prio);

	/*
	 * Lockless early exit: same donor, same priority, and not a deadline
	 * priority - nothing to change.
	 */
	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
		return;

	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
	/*
	 * Record the new donor under rq->lock.
	 *
	 * NOTE(review): presumably so that readers holding either p->pi_lock
	 * or rq->lock see a stable pi_top_task - confirm.
	 */
	p->pi_top_task = pi_task;

	/* Re-check under the lock: only the donor changed, not the priority. */
	if (prio == p->prio && !dl_prio(prio))
		goto out_unlock;

	/*
	 * The idle task must never be boosted; warn if it is not the current
	 * task or is blocked on a PI lock, and bail out.
	 */
	if (unlikely(p == rq->idle)) {
		WARN_ON(p != rq->curr);
		WARN_ON(p->pi_blocked_on);
		goto out_unlock;
	}

	trace_sched_pi_setprio(p, pi_task);
	oldprio = p->prio;

	/* Same priority: no need to move the task within its queue. */
	if (oldprio == prio)
		queue_flag &= ~DEQUEUE_MOVE;

	/* Standard "change pattern": dequeue/put, modify, enqueue/set. */
	prev_class = p->sched_class;
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flag);
	if (running)
		put_prev_task(rq, p);

	/*
	 * Pick the scheduling class from the new priority.
	 *
	 * Deadline: mark the task as boosted (and ask for replenishment) when
	 * it is not normally a deadline task, or when the donor is a deadline
	 * task whose deadline entity preempts @p's.
	 */
	if (dl_prio(prio)) {
		if (!dl_prio(p->normal_prio) ||
		    (pi_task && dl_prio(pi_task->prio) &&
		     dl_entity_preempt(&pi_task->dl, &p->dl))) {
			p->dl.dl_boosted = 1;
			queue_flag |= ENQUEUE_REPLENISH;
		} else
			p->dl.dl_boosted = 0;
		p->sched_class = &dl_sched_class;
	} else if (rt_prio(prio)) {
		if (dl_prio(oldprio))
			p->dl.dl_boosted = 0;
		/* Numerically larger new prio: requeue at the head. */
		if (oldprio < prio)
			queue_flag |= ENQUEUE_HEAD;
		p->sched_class = &rt_sched_class;
	} else {
		if (dl_prio(oldprio))
			p->dl.dl_boosted = 0;
		if (rt_prio(oldprio))
			p->rt.timeout = 0;
		p->sched_class = &fair_sched_class;
	}

	p->prio = prio;

	if (queued)
		enqueue_task(rq, p, queue_flag);
	if (running)
		set_next_task(rq, p);

	check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
	/* Avoid rq from going away on us: */
	preempt_disable();
	__task_rq_unlock(rq, &rf);

	balance_callback(rq);
	preempt_enable();
}
4946#else
/* !CONFIG_RT_MUTEXES: no priority inheritance, @prio is returned unchanged. */
static inline int rt_effective_prio(struct task_struct *p, int prio)
{
	return prio;
}
4951#endif
4952
/*
 * set_user_nice - change the nice value of a task.
 * @p:    task to modify.
 * @nice: new nice value; silently ignored if out of [MIN_NICE, MAX_NICE]
 *        or equal to the current value.
 */
void set_user_nice(struct task_struct *p, long nice)
{
	bool queued, running;
	int old_prio;
	struct rq_flags rf;
	struct rq *rq;

	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
		return;
	/*
	 * The runqueue lock is needed even for a sleeping task, to serialise
	 * against other priority changes.
	 */
	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/*
	 * For deadline and realtime tasks only static_prio is recorded; the
	 * load weight and effective priority are left alone.
	 *
	 * NOTE(review): the stored value presumably takes effect when the
	 * task returns to a fair policy - confirm.
	 */
	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	/* Standard "change pattern": dequeue/put, modify, enqueue/set. */
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	if (running)
		put_prev_task(rq, p);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p, true);
	old_prio = p->prio;
	p->prio = effective_prio(p);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);

	/*
	 * Let the class react to the priority change; it is called
	 * unconditionally, whether or not the task is queued or running.
	 */
	p->sched_class->prio_changed(rq, p, old_prio);

out_unlock:
	task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
5006
5007
5008
5009
5010
5011
5012int can_nice(const struct task_struct *p, const int nice)
5013{
5014
5015 int nice_rlim = nice_to_rlimit(nice);
5016
5017 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
5018 capable(CAP_SYS_NICE));
5019}
5020
5021#ifdef __ARCH_WANT_SYS_NICE
5022
5023
5024
5025
5026
5027
5028
5029
/*
 * sys_nice - change the nice value of the current task by @increment.
 * @increment: amount to add to the current nice value.
 *
 * Return: 0 on success, -EPERM if lowering the nice value is not permitted,
 * or the error returned by the security hook.
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Clamp the increment first so the addition below cannot overflow,
	 * then clamp the result to the valid nice range.
	 */
	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
	nice = task_nice(current) + increment;

	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
	/* Only a negative increment needs a permission check. */
	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	retval = security_task_setnice(current, nice);
	if (retval)
		return retval;

	set_user_nice(current, nice);
	return 0;
}
5053
5054#endif
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064int task_prio(const struct task_struct *p)
5065{
5066 return p->prio - MAX_RT_PRIO;
5067}
5068
5069
5070
5071
5072
5073
5074
5075int idle_cpu(int cpu)
5076{
5077 struct rq *rq = cpu_rq(cpu);
5078
5079 if (rq->curr != rq->idle)
5080 return 0;
5081
5082 if (rq->nr_running)
5083 return 0;
5084
5085#ifdef CONFIG_SMP
5086 if (rq->ttwu_pending)
5087 return 0;
5088#endif
5089
5090 return 1;
5091}
5092
5093
5094
5095
5096
5097
5098
/*
 * available_idle_cpu - is @cpu idle and not a preempted virtual CPU?
 * @cpu: CPU to check.
 *
 * Return: 1 if idle_cpu() holds and vcpu_is_preempted() does not,
 * 0 otherwise.
 */
int available_idle_cpu(int cpu)
{
	return idle_cpu(cpu) && !vcpu_is_preempted(cpu);
}
5109
5110
5111
5112
5113
5114
5115
5116struct task_struct *idle_task(int cpu)
5117{
5118 return cpu_rq(cpu)->idle;
5119}
5120
5121
5122
5123
5124
5125
5126
5127static struct task_struct *find_process_by_pid(pid_t pid)
5128{
5129 return pid ? find_task_by_vpid(pid) : current;
5130}
5131
5132
5133
5134
5135
/*
 * Sentinel sched_policy value meaning "keep the task's current policy":
 * __setscheduler_params() and __sched_setscheduler() substitute p->policy
 * when they see it.
 */
#define SETPARAM_POLICY	-1
5137
5138static void __setscheduler_params(struct task_struct *p,
5139 const struct sched_attr *attr)
5140{
5141 int policy = attr->sched_policy;
5142
5143 if (policy == SETPARAM_POLICY)
5144 policy = p->policy;
5145
5146 p->policy = policy;
5147
5148 if (dl_policy(policy))
5149 __setparam_dl(p, attr);
5150 else if (fair_policy(policy))
5151 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5152
5153
5154
5155
5156
5157
5158 p->rt_priority = attr->sched_priority;
5159 p->normal_prio = normal_prio(p);
5160 set_load_weight(p, true);
5161}
5162
5163
5164static void __setscheduler(struct rq *rq, struct task_struct *p,
5165 const struct sched_attr *attr, bool keep_boost)
5166{
5167
5168
5169
5170
5171 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
5172 return;
5173
5174 __setscheduler_params(p, attr);
5175
5176
5177
5178
5179
5180 p->prio = normal_prio(p);
5181 if (keep_boost)
5182 p->prio = rt_effective_prio(p, p->prio);
5183
5184 if (dl_prio(p->prio))
5185 p->sched_class = &dl_sched_class;
5186 else if (rt_prio(p->prio))
5187 p->sched_class = &rt_sched_class;
5188 else
5189 p->sched_class = &fair_sched_class;
5190}
5191
5192
5193
5194
5195static bool check_same_owner(struct task_struct *p)
5196{
5197 const struct cred *cred = current_cred(), *pcred;
5198 bool match;
5199
5200 rcu_read_lock();
5201 pcred = __task_cred(p);
5202 match = (uid_eq(cred->euid, pcred->euid) ||
5203 uid_eq(cred->euid, pcred->uid));
5204 rcu_read_unlock();
5205 return match;
5206}
5207
/*
 * __sched_setscheduler - change the scheduling policy/parameters of a task.
 * @p:    task to modify.
 * @attr: requested policy, priority, nice value, flags, ...
 * @user: true when the request originates from user space; enables the
 *        permission, security-hook and bandwidth checks.
 * @pi:   true when priority inheritance must be honoured (keep boosts,
 *        take the cpuset read lock, adjust the PI chain afterwards). Must
 *        not be set when called from interrupt context.
 *
 * Return: 0 on success; -EINVAL for invalid parameters or for the stop
 * task, -EPERM when a permission or bandwidth check fails, -EBUSY when
 * deadline admission control rejects the change, or the error from the
 * security / uclamp validation hooks.
 */
static int __sched_setscheduler(struct task_struct *p,
				const struct sched_attr *attr,
				bool user, bool pi)
{
	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
		      MAX_RT_PRIO - 1 - attr->sched_priority;
	int retval, oldprio, oldpolicy = -1, queued, running;
	int new_effective_prio, policy = attr->sched_policy;
	const struct sched_class *prev_class;
	struct rq_flags rf;
	int reset_on_fork;
	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq *rq;

	/* The pi code expects interrupts enabled */
	BUG_ON(pi && in_interrupt());
recheck:
	/*
	 * A negative policy (SETPARAM_POLICY) means "keep the current one".
	 * It is read without the lock here; oldpolicy remembers the value so
	 * the change can be detected and retried once the lock is held.
	 */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

		if (!valid_policy(policy))
			return -EINVAL;
	}

	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
		return -EINVAL;

	/*
	 * Priority range check: tasks with an mm are limited to
	 * MAX_USER_RT_PRIO-1, tasks without one to MAX_RT_PRIO-1. A non-zero
	 * sched_priority is required for, and only valid with, realtime
	 * policies; deadline parameters must pass __checkparam_dl().
	 */
	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
	    (rt_policy(policy) != (attr->sched_priority != 0)))
		return -EINVAL;

	/*
	 * Permission checks for user requests without CAP_SYS_NICE.
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (fair_policy(policy)) {
			if (attr->sched_nice < task_nice(p) &&
			    !can_nice(p, attr->sched_nice))
				return -EPERM;
		}

		if (rt_policy(policy)) {
			unsigned long rlim_rtprio =
					task_rlimit(p, RLIMIT_RTPRIO);

			/* Can't set/change the rt policy: */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* Can't increase priority: */
			if (attr->sched_priority > p->rt_priority &&
			    attr->sched_priority > rlim_rtprio)
				return -EPERM;
		}

		/* Unprivileged callers may not select a deadline policy. */
		if (dl_policy(policy))
			return -EPERM;

		/*
		 * Leaving the idle policy requires that the task's current
		 * nice value is permitted by can_nice().
		 */
		if (task_has_idle_policy(p) && !idle_policy(policy)) {
			if (!can_nice(p, task_nice(p)))
				return -EPERM;
		}

		/* Can't change other user's priorities: */
		if (!check_same_owner(p))
			return -EPERM;

		/* Normal users shall not reset the sched_reset_on_fork flag: */
		if (p->sched_reset_on_fork && !reset_on_fork)
			return -EPERM;
	}

	if (user) {
		/* SCHED_FLAG_SUGOV is not accepted from user requests. */
		if (attr->sched_flags & SCHED_FLAG_SUGOV)
			return -EINVAL;

		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	/* Update task specific "requested" clamps */
	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
		retval = uclamp_validate(p, attr);
		if (retval)
			return retval;
	}

	if (pi)
		cpuset_read_lock();

	/*
	 * task_rq_lock() pins the task to its runqueue for the rest of the
	 * function; every exit below this point must go through the unlock
	 * paths.
	 */
	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/*
	 * Changing the policy of the stop task is not allowed.
	 */
	if (p == rq->stop) {
		retval = -EINVAL;
		goto unlock;
	}

	/*
	 * If nothing actually changes, skip the dequeue/enqueue cycle and
	 * only record the reset_on_fork setting.
	 */
	if (unlikely(policy == p->policy)) {
		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
			goto change;
		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
			goto change;
		if (dl_policy(policy) && dl_param_changed(p, attr))
			goto change;
		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
			goto change;

		p->sched_reset_on_fork = reset_on_fork;
		retval = 0;
		goto unlock;
	}
change:

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Refuse a realtime policy for a task whose (non-autogroup)
		 * task group has no realtime runtime assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			retval = -EPERM;
			goto unlock;
		}
#endif
#ifdef CONFIG_SMP
		if (dl_bandwidth_enabled() && dl_policy(policy) &&
				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
			cpumask_t *span = rq->rd->span;

			/*
			 * Refuse a deadline policy unless the task may run on
			 * every CPU of its root domain and that domain has
			 * deadline bandwidth available.
			 */
			if (!cpumask_subset(span, p->cpus_ptr) ||
			    rq->rd->dl_bw.bw == 0) {
				retval = -EPERM;
				goto unlock;
			}
		}
#endif
	}

	/* Re-check policy now with rq lock held: */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &rf);
		if (pi)
			cpuset_read_unlock();
		goto recheck;
	}

	/*
	 * Deadline admission control, run when the task is or becomes a
	 * deadline task; a non-zero result rejects the change with -EBUSY.
	 */
	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
		retval = -EBUSY;
		goto unlock;
	}

	p->sched_reset_on_fork = reset_on_fork;
	oldprio = p->prio;

	if (pi) {
		/*
		 * If the effective priority (including any PI boost) does not
		 * change, the task need not be moved within its queue.
		 */
		new_effective_prio = rt_effective_prio(p, newprio);
		if (new_effective_prio == oldprio)
			queue_flags &= ~DEQUEUE_MOVE;
	}

	/* Standard "change pattern": dequeue/put, modify, enqueue/set. */
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flags);
	if (running)
		put_prev_task(rq, p);

	prev_class = p->sched_class;

	__setscheduler(rq, p, attr, pi);
	__setscheduler_uclamp(p, attr);

	if (queued) {
		/*
		 * Numerically larger new prio than before: requeue at the
		 * head of the queue.
		 */
		if (oldprio < p->prio)
			queue_flags |= ENQUEUE_HEAD;

		enqueue_task(rq, p, queue_flags);
	}
	if (running)
		set_next_task(rq, p);

	check_class_changed(rq, p, prev_class, oldprio);

	/* Avoid rq from going away on us: */
	preempt_disable();
	task_rq_unlock(rq, p, &rf);

	if (pi) {
		cpuset_read_unlock();
		rt_mutex_adjust_pi(p);
	}

	/* Run balance callbacks after we've adjusted the PI chain: */
	balance_callback(rq);
	preempt_enable();

	return 0;

unlock:
	task_rq_unlock(rq, p, &rf);
	if (pi)
		cpuset_read_unlock();
	return retval;
}
5474
5475static int _sched_setscheduler(struct task_struct *p, int policy,
5476 const struct sched_param *param, bool check)
5477{
5478 struct sched_attr attr = {
5479 .sched_policy = policy,
5480 .sched_priority = param->sched_priority,
5481 .sched_nice = PRIO_TO_NICE(p->static_prio),
5482 };
5483
5484
5485 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
5486 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5487 policy &= ~SCHED_RESET_ON_FORK;
5488 attr.sched_policy = policy;
5489 }
5490
5491 return __sched_setscheduler(p, &attr, check, true);
5492}
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505int sched_setscheduler(struct task_struct *p, int policy,
5506 const struct sched_param *param)
5507{
5508 return _sched_setscheduler(p, policy, param, true);
5509}
5510
5511int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
5512{
5513 return __sched_setscheduler(p, attr, true, true);
5514}
5515
5516int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
5517{
5518 return __sched_setscheduler(p, attr, false, true);
5519}
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5535 const struct sched_param *param)
5536{
5537 return _sched_setscheduler(p, policy, param, false);
5538}
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558void sched_set_fifo(struct task_struct *p)
5559{
5560 struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5561 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5562}
5563EXPORT_SYMBOL_GPL(sched_set_fifo);
5564
5565
5566
5567
5568void sched_set_fifo_low(struct task_struct *p)
5569{
5570 struct sched_param sp = { .sched_priority = 1 };
5571 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5572}
5573EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5574
5575void sched_set_normal(struct task_struct *p, int nice)
5576{
5577 struct sched_attr attr = {
5578 .sched_policy = SCHED_NORMAL,
5579 .sched_nice = nice,
5580 };
5581 WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5582}
5583EXPORT_SYMBOL_GPL(sched_set_normal);
5584
5585static int
5586do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5587{
5588 struct sched_param lparam;
5589 struct task_struct *p;
5590 int retval;
5591
5592 if (!param || pid < 0)
5593 return -EINVAL;
5594 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5595 return -EFAULT;
5596
5597 rcu_read_lock();
5598 retval = -ESRCH;
5599 p = find_process_by_pid(pid);
5600 if (likely(p))
5601 get_task_struct(p);
5602 rcu_read_unlock();
5603
5604 if (likely(p)) {
5605 retval = sched_setscheduler(p, policy, &lparam);
5606 put_task_struct(p);
5607 }
5608
5609 return retval;
5610}
5611
5612
5613
5614
5615static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
5616{
5617 u32 size;
5618 int ret;
5619
5620
5621 memset(attr, 0, sizeof(*attr));
5622
5623 ret = get_user(size, &uattr->size);
5624 if (ret)
5625 return ret;
5626
5627
5628 if (!size)
5629 size = SCHED_ATTR_SIZE_VER0;
5630 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
5631 goto err_size;
5632
5633 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5634 if (ret) {
5635 if (ret == -E2BIG)
5636 goto err_size;
5637 return ret;
5638 }
5639
5640 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
5641 size < SCHED_ATTR_SIZE_VER1)
5642 return -EINVAL;
5643
5644
5645
5646
5647
5648 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
5649
5650 return 0;
5651
5652err_size:
5653 put_user(sizeof(*attr), &uattr->size);
5654 return -E2BIG;
5655}
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
5666{
5667 if (policy < 0)
5668 return -EINVAL;
5669
5670 return do_sched_setscheduler(pid, policy, param);
5671}
5672
5673
5674
5675
5676
5677
5678
5679
5680SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5681{
5682 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
5683}
5684
5685
5686
5687
5688
5689
5690
5691SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
5692 unsigned int, flags)
5693{
5694 struct sched_attr attr;
5695 struct task_struct *p;
5696 int retval;
5697
5698 if (!uattr || pid < 0 || flags)
5699 return -EINVAL;
5700
5701 retval = sched_copy_attr(uattr, &attr);
5702 if (retval)
5703 return retval;
5704
5705 if ((int)attr.sched_policy < 0)
5706 return -EINVAL;
5707 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
5708 attr.sched_policy = SETPARAM_POLICY;
5709
5710 rcu_read_lock();
5711 retval = -ESRCH;
5712 p = find_process_by_pid(pid);
5713 if (likely(p))
5714 get_task_struct(p);
5715 rcu_read_unlock();
5716
5717 if (likely(p)) {
5718 retval = sched_setattr(p, &attr);
5719 put_task_struct(p);
5720 }
5721
5722 return retval;
5723}
5724
5725
5726
5727
5728
5729
5730
5731
5732SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5733{
5734 struct task_struct *p;
5735 int retval;
5736
5737 if (pid < 0)
5738 return -EINVAL;
5739
5740 retval = -ESRCH;
5741 rcu_read_lock();
5742 p = find_process_by_pid(pid);
5743 if (p) {
5744 retval = security_task_getscheduler(p);
5745 if (!retval)
5746 retval = p->policy
5747 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5748 }
5749 rcu_read_unlock();
5750 return retval;
5751}
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5762{
5763 struct sched_param lp = { .sched_priority = 0 };
5764 struct task_struct *p;
5765 int retval;
5766
5767 if (!param || pid < 0)
5768 return -EINVAL;
5769
5770 rcu_read_lock();
5771 p = find_process_by_pid(pid);
5772 retval = -ESRCH;
5773 if (!p)
5774 goto out_unlock;
5775
5776 retval = security_task_getscheduler(p);
5777 if (retval)
5778 goto out_unlock;
5779
5780 if (task_has_rt_policy(p))
5781 lp.sched_priority = p->rt_priority;
5782 rcu_read_unlock();
5783
5784
5785
5786
5787 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5788
5789 return retval;
5790
5791out_unlock:
5792 rcu_read_unlock();
5793 return retval;
5794}
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804static int
5805sched_attr_copy_to_user(struct sched_attr __user *uattr,
5806 struct sched_attr *kattr,
5807 unsigned int usize)
5808{
5809 unsigned int ksize = sizeof(*kattr);
5810
5811 if (!access_ok(uattr, usize))
5812 return -EFAULT;
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827 kattr->size = min(usize, ksize);
5828
5829 if (copy_to_user(uattr, kattr, kattr->size))
5830 return -EFAULT;
5831
5832 return 0;
5833}
5834
5835
5836
5837
5838
5839
5840
5841
5842SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
5843 unsigned int, usize, unsigned int, flags)
5844{
5845 struct sched_attr kattr = { };
5846 struct task_struct *p;
5847 int retval;
5848
5849 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
5850 usize < SCHED_ATTR_SIZE_VER0 || flags)
5851 return -EINVAL;
5852
5853 rcu_read_lock();
5854 p = find_process_by_pid(pid);
5855 retval = -ESRCH;
5856 if (!p)
5857 goto out_unlock;
5858
5859 retval = security_task_getscheduler(p);
5860 if (retval)
5861 goto out_unlock;
5862
5863 kattr.sched_policy = p->policy;
5864 if (p->sched_reset_on_fork)
5865 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5866 if (task_has_dl_policy(p))
5867 __getparam_dl(p, &kattr);
5868 else if (task_has_rt_policy(p))
5869 kattr.sched_priority = p->rt_priority;
5870 else
5871 kattr.sched_nice = task_nice(p);
5872
5873#ifdef CONFIG_UCLAMP_TASK
5874
5875
5876
5877
5878
5879 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5880 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5881#endif
5882
5883 rcu_read_unlock();
5884
5885 return sched_attr_copy_to_user(uattr, &kattr, usize);
5886
5887out_unlock:
5888 rcu_read_unlock();
5889 return retval;
5890}
5891
5892long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5893{
5894 cpumask_var_t cpus_allowed, new_mask;
5895 struct task_struct *p;
5896 int retval;
5897
5898 rcu_read_lock();
5899
5900 p = find_process_by_pid(pid);
5901 if (!p) {
5902 rcu_read_unlock();
5903 return -ESRCH;
5904 }
5905
5906
5907 get_task_struct(p);
5908 rcu_read_unlock();
5909
5910 if (p->flags & PF_NO_SETAFFINITY) {
5911 retval = -EINVAL;
5912 goto out_put_task;
5913 }
5914 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5915 retval = -ENOMEM;
5916 goto out_put_task;
5917 }
5918 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5919 retval = -ENOMEM;
5920 goto out_free_cpus_allowed;
5921 }
5922 retval = -EPERM;
5923 if (!check_same_owner(p)) {
5924 rcu_read_lock();
5925 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
5926 rcu_read_unlock();
5927 goto out_free_new_mask;
5928 }
5929 rcu_read_unlock();
5930 }
5931
5932 retval = security_task_setscheduler(p);
5933 if (retval)
5934 goto out_free_new_mask;
5935
5936
5937 cpuset_cpus_allowed(p, cpus_allowed);
5938 cpumask_and(new_mask, in_mask, cpus_allowed);
5939
5940
5941
5942
5943
5944
5945
5946#ifdef CONFIG_SMP
5947 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
5948 rcu_read_lock();
5949 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
5950 retval = -EBUSY;
5951 rcu_read_unlock();
5952 goto out_free_new_mask;
5953 }
5954 rcu_read_unlock();
5955 }
5956#endif
5957again:
5958 retval = __set_cpus_allowed_ptr(p, new_mask, true);
5959
5960 if (!retval) {
5961 cpuset_cpus_allowed(p, cpus_allowed);
5962 if (!cpumask_subset(new_mask, cpus_allowed)) {
5963
5964
5965
5966
5967
5968 cpumask_copy(new_mask, cpus_allowed);
5969 goto again;
5970 }
5971 }
5972out_free_new_mask:
5973 free_cpumask_var(new_mask);
5974out_free_cpus_allowed:
5975 free_cpumask_var(cpus_allowed);
5976out_put_task:
5977 put_task_struct(p);
5978 return retval;
5979}
5980
5981static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5982 struct cpumask *new_mask)
5983{
5984 if (len < cpumask_size())
5985 cpumask_clear(new_mask);
5986 else if (len > cpumask_size())
5987 len = cpumask_size();
5988
5989 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5990}
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6001 unsigned long __user *, user_mask_ptr)
6002{
6003 cpumask_var_t new_mask;
6004 int retval;
6005
6006 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6007 return -ENOMEM;
6008
6009 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6010 if (retval == 0)
6011 retval = sched_setaffinity(pid, new_mask);
6012 free_cpumask_var(new_mask);
6013 return retval;
6014}
6015
6016long sched_getaffinity(pid_t pid, struct cpumask *mask)
6017{
6018 struct task_struct *p;
6019 unsigned long flags;
6020 int retval;
6021
6022 rcu_read_lock();
6023
6024 retval = -ESRCH;
6025 p = find_process_by_pid(pid);
6026 if (!p)
6027 goto out_unlock;
6028
6029 retval = security_task_getscheduler(p);
6030 if (retval)
6031 goto out_unlock;
6032
6033 raw_spin_lock_irqsave(&p->pi_lock, flags);
6034 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6036
6037out_unlock:
6038 rcu_read_unlock();
6039
6040 return retval;
6041}
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6053 unsigned long __user *, user_mask_ptr)
6054{
6055 int ret;
6056 cpumask_var_t mask;
6057
6058 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
6059 return -EINVAL;
6060 if (len & (sizeof(unsigned long)-1))
6061 return -EINVAL;
6062
6063 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6064 return -ENOMEM;
6065
6066 ret = sched_getaffinity(pid, mask);
6067 if (ret == 0) {
6068 unsigned int retlen = min(len, cpumask_size());
6069
6070 if (copy_to_user(user_mask_ptr, mask, retlen))
6071 ret = -EFAULT;
6072 else
6073 ret = retlen;
6074 }
6075 free_cpumask_var(mask);
6076
6077 return ret;
6078}
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088static void do_sched_yield(void)
6089{
6090 struct rq_flags rf;
6091 struct rq *rq;
6092
6093 rq = this_rq_lock_irq(&rf);
6094
6095 schedstat_inc(rq->yld_count);
6096 current->sched_class->yield_task(rq);
6097
6098
6099
6100
6101
6102 preempt_disable();
6103 rq_unlock(rq, &rf);
6104 sched_preempt_enable_no_resched();
6105
6106 schedule();
6107}
6108
/*
 * sys_sched_yield - yield the current processor to other tasks.
 *
 * Return: always 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	do_sched_yield();

	return 0;
}
6114
6115#ifndef CONFIG_PREEMPTION
6116int __sched _cond_resched(void)
6117{
6118 if (should_resched(0)) {
6119 preempt_schedule_common();
6120 return 1;
6121 }
6122 rcu_all_qs();
6123 return 0;
6124}
6125EXPORT_SYMBOL(_cond_resched);
6126#endif
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136int __cond_resched_lock(spinlock_t *lock)
6137{
6138 int resched = should_resched(PREEMPT_LOCK_OFFSET);
6139 int ret = 0;
6140
6141 lockdep_assert_held(lock);
6142
6143 if (spin_needbreak(lock) || resched) {
6144 spin_unlock(lock);
6145 if (resched)
6146 preempt_schedule_common();
6147 else
6148 cpu_relax();
6149 ret = 1;
6150 spin_lock(lock);
6151 }
6152 return ret;
6153}
6154EXPORT_SYMBOL(__cond_resched_lock);
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178void __sched yield(void)
6179{
6180 set_current_state(TASK_RUNNING);
6181 do_sched_yield();
6182}
6183EXPORT_SYMBOL(yield);
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200int __sched yield_to(struct task_struct *p, bool preempt)
6201{
6202 struct task_struct *curr = current;
6203 struct rq *rq, *p_rq;
6204 unsigned long flags;
6205 int yielded = 0;
6206
6207 local_irq_save(flags);
6208 rq = this_rq();
6209
6210again:
6211 p_rq = task_rq(p);
6212
6213
6214
6215
6216 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
6217 yielded = -ESRCH;
6218 goto out_irq;
6219 }
6220
6221 double_rq_lock(rq, p_rq);
6222 if (task_rq(p) != p_rq) {
6223 double_rq_unlock(rq, p_rq);
6224 goto again;
6225 }
6226
6227 if (!curr->sched_class->yield_to_task)
6228 goto out_unlock;
6229
6230 if (curr->sched_class != p->sched_class)
6231 goto out_unlock;
6232
6233 if (task_running(p_rq, p) || p->state)
6234 goto out_unlock;
6235
6236 yielded = curr->sched_class->yield_to_task(rq, p);
6237 if (yielded) {
6238 schedstat_inc(rq->yld_count);
6239
6240
6241
6242
6243 if (preempt && rq != p_rq)
6244 resched_curr(p_rq);
6245 }
6246
6247out_unlock:
6248 double_rq_unlock(rq, p_rq);
6249out_irq:
6250 local_irq_restore(flags);
6251
6252 if (yielded > 0)
6253 schedule();
6254
6255 return yielded;
6256}
6257EXPORT_SYMBOL_GPL(yield_to);
6258
6259int io_schedule_prepare(void)
6260{
6261 int old_iowait = current->in_iowait;
6262
6263 current->in_iowait = 1;
6264 blk_schedule_flush_plug(current);
6265
6266 return old_iowait;
6267}
6268
6269void io_schedule_finish(int token)
6270{
6271 current->in_iowait = token;
6272}
6273
6274
6275
6276
6277
6278long __sched io_schedule_timeout(long timeout)
6279{
6280 int token;
6281 long ret;
6282
6283 token = io_schedule_prepare();
6284 ret = schedule_timeout(timeout);
6285 io_schedule_finish(token);
6286
6287 return ret;
6288}
6289EXPORT_SYMBOL(io_schedule_timeout);
6290
6291void __sched io_schedule(void)
6292{
6293 int token;
6294
6295 token = io_schedule_prepare();
6296 schedule();
6297 io_schedule_finish(token);
6298}
6299EXPORT_SYMBOL(io_schedule);
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6310{
6311 int ret = -EINVAL;
6312
6313 switch (policy) {
6314 case SCHED_FIFO:
6315 case SCHED_RR:
6316 ret = MAX_USER_RT_PRIO-1;
6317 break;
6318 case SCHED_DEADLINE:
6319 case SCHED_NORMAL:
6320 case SCHED_BATCH:
6321 case SCHED_IDLE:
6322 ret = 0;
6323 break;
6324 }
6325 return ret;
6326}
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6337{
6338 int ret = -EINVAL;
6339
6340 switch (policy) {
6341 case SCHED_FIFO:
6342 case SCHED_RR:
6343 ret = 1;
6344 break;
6345 case SCHED_DEADLINE:
6346 case SCHED_NORMAL:
6347 case SCHED_BATCH:
6348 case SCHED_IDLE:
6349 ret = 0;
6350 }
6351 return ret;
6352}
6353
6354static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
6355{
6356 struct task_struct *p;
6357 unsigned int time_slice;
6358 struct rq_flags rf;
6359 struct rq *rq;
6360 int retval;
6361
6362 if (pid < 0)
6363 return -EINVAL;
6364
6365 retval = -ESRCH;
6366 rcu_read_lock();
6367 p = find_process_by_pid(pid);
6368 if (!p)
6369 goto out_unlock;
6370
6371 retval = security_task_getscheduler(p);
6372 if (retval)
6373 goto out_unlock;
6374
6375 rq = task_rq_lock(p, &rf);
6376 time_slice = 0;
6377 if (p->sched_class->get_rr_interval)
6378 time_slice = p->sched_class->get_rr_interval(rq, p);
6379 task_rq_unlock(rq, p, &rf);
6380
6381 rcu_read_unlock();
6382 jiffies_to_timespec64(time_slice, t);
6383 return 0;
6384
6385out_unlock:
6386 rcu_read_unlock();
6387 return retval;
6388}
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6402 struct __kernel_timespec __user *, interval)
6403{
6404 struct timespec64 t;
6405 int retval = sched_rr_get_interval(pid, &t);
6406
6407 if (retval == 0)
6408 retval = put_timespec64(&t, interval);
6409
6410 return retval;
6411}
6412
#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * 32-bit time variant of sys_sched_rr_get_interval: the result is written
 * as a struct old_timespec32.
 *
 * Return: the error from sched_rr_get_interval() if it fails, otherwise
 * the result of put_old_timespec32().
 */
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval)
		return retval;

	return put_old_timespec32(&t, interval);
}
#endif
6425
6426void sched_show_task(struct task_struct *p)
6427{
6428 unsigned long free = 0;
6429 int ppid;
6430
6431 if (!try_get_task_stack(p))
6432 return;
6433
6434 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
6435
6436 if (p->state == TASK_RUNNING)
6437 pr_cont(" running task ");
6438#ifdef CONFIG_DEBUG_STACK_USAGE
6439 free = stack_not_used(p);
6440#endif
6441 ppid = 0;
6442 rcu_read_lock();
6443 if (pid_alive(p))
6444 ppid = task_pid_nr(rcu_dereference(p->real_parent));
6445 rcu_read_unlock();
6446 pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6447 free, task_pid_nr(p), ppid,
6448 (unsigned long)task_thread_info(p)->flags);
6449
6450 print_worker_info(KERN_INFO, p);
6451 show_stack(p, NULL, KERN_INFO);
6452 put_task_stack(p);
6453}
6454EXPORT_SYMBOL_GPL(sched_show_task);
6455
6456static inline bool
6457state_filter_match(unsigned long state_filter, struct task_struct *p)
6458{
6459
6460 if (!state_filter)
6461 return true;
6462
6463
6464 if (!(p->state & state_filter))
6465 return false;
6466
6467
6468
6469
6470
6471 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
6472 return false;
6473
6474 return true;
6475}
6476
6477
6478void show_state_filter(unsigned long state_filter)
6479{
6480 struct task_struct *g, *p;
6481
6482 rcu_read_lock();
6483 for_each_process_thread(g, p) {
6484
6485
6486
6487
6488
6489
6490
6491 touch_nmi_watchdog();
6492 touch_all_softlockup_watchdogs();
6493 if (state_filter_match(state_filter, p))
6494 sched_show_task(p);
6495 }
6496
6497#ifdef CONFIG_SCHED_DEBUG
6498 if (!state_filter)
6499 sysrq_sched_debug_show();
6500#endif
6501 rcu_read_unlock();
6502
6503
6504
6505 if (!state_filter)
6506 debug_show_all_locks();
6507}
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517void init_idle(struct task_struct *idle, int cpu)
6518{
6519 struct rq *rq = cpu_rq(cpu);
6520 unsigned long flags;
6521
6522 __sched_fork(0, idle);
6523
6524 raw_spin_lock_irqsave(&idle->pi_lock, flags);
6525 raw_spin_lock(&rq->lock);
6526
6527 idle->state = TASK_RUNNING;
6528 idle->se.exec_start = sched_clock();
6529 idle->flags |= PF_IDLE;
6530
6531 scs_task_reset(idle);
6532 kasan_unpoison_task_stack(idle);
6533
6534#ifdef CONFIG_SMP
6535
6536
6537
6538
6539
6540
6541 set_cpus_allowed_common(idle, cpumask_of(cpu));
6542#endif
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553 rcu_read_lock();
6554 __set_task_cpu(idle, cpu);
6555 rcu_read_unlock();
6556
6557 rq->idle = idle;
6558 rcu_assign_pointer(rq->curr, idle);
6559 idle->on_rq = TASK_ON_RQ_QUEUED;
6560#ifdef CONFIG_SMP
6561 idle->on_cpu = 1;
6562#endif
6563 raw_spin_unlock(&rq->lock);
6564 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
6565
6566
6567 init_idle_preempt_count(idle, cpu);
6568
6569
6570
6571
6572 idle->sched_class = &idle_sched_class;
6573 ftrace_graph_init_idle_task(idle, cpu);
6574 vtime_init_idle(idle, cpu);
6575#ifdef CONFIG_SMP
6576 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6577#endif
6578}
6579
6580#ifdef CONFIG_SMP
6581
/*
 * cpuset_cpumask_can_shrink - may a cpuset's mask change from @cur to
 * @trial?
 *
 * An empty @cur can always be changed. Otherwise the decision is left to
 * dl_cpuset_cpumask_can_shrink().
 *
 * Return: 1 if @cur is empty, else the result of
 * dl_cpuset_cpumask_can_shrink().
 */
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
			      const struct cpumask *trial)
{
	if (!cpumask_weight(cur))
		return 1;

	return dl_cpuset_cpumask_can_shrink(cur, trial);
}
6594
6595int task_can_attach(struct task_struct *p,
6596 const struct cpumask *cs_cpus_allowed)
6597{
6598 int ret = 0;
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609 if (p->flags & PF_NO_SETAFFINITY) {
6610 ret = -EINVAL;
6611 goto out;
6612 }
6613
6614 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6615 cs_cpus_allowed))
6616 ret = dl_task_can_attach(p, cs_cpus_allowed);
6617
6618out:
6619 return ret;
6620}
6621
6622bool sched_smp_initialized __read_mostly;
6623
6624#ifdef CONFIG_NUMA_BALANCING
6625
6626int migrate_task_to(struct task_struct *p, int target_cpu)
6627{
6628 struct migration_arg arg = { p, target_cpu };
6629 int curr_cpu = task_cpu(p);
6630
6631 if (curr_cpu == target_cpu)
6632 return 0;
6633
6634 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
6635 return -EINVAL;
6636
6637
6638
6639 trace_sched_move_numa(p, curr_cpu, target_cpu);
6640 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
6641}
6642
6643
6644
6645
6646
6647void sched_setnuma(struct task_struct *p, int nid)
6648{
6649 bool queued, running;
6650 struct rq_flags rf;
6651 struct rq *rq;
6652
6653 rq = task_rq_lock(p, &rf);
6654 queued = task_on_rq_queued(p);
6655 running = task_current(rq, p);
6656
6657 if (queued)
6658 dequeue_task(rq, p, DEQUEUE_SAVE);
6659 if (running)
6660 put_prev_task(rq, p);
6661
6662 p->numa_preferred_nid = nid;
6663
6664 if (queued)
6665 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
6666 if (running)
6667 set_next_task(rq, p);
6668 task_rq_unlock(rq, p, &rf);
6669}
6670#endif
6671
6672#ifdef CONFIG_HOTPLUG_CPU
6673
6674
6675
6676
6677void idle_task_exit(void)
6678{
6679 struct mm_struct *mm = current->active_mm;
6680
6681 BUG_ON(cpu_online(smp_processor_id()));
6682 BUG_ON(current != this_rq()->idle);
6683
6684 if (mm != &init_mm) {
6685 switch_mm(mm, &init_mm, current);
6686 finish_arch_post_lock_switch();
6687 }
6688
6689
6690}
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701static void calc_load_migrate(struct rq *rq)
6702{
6703 long delta = calc_load_fold_active(rq, 1);
6704 if (delta)
6705 atomic_long_add(delta, &calc_load_tasks);
6706}
6707
6708static struct task_struct *__pick_migrate_task(struct rq *rq)
6709{
6710 const struct sched_class *class;
6711 struct task_struct *next;
6712
6713 for_each_class(class) {
6714 next = class->pick_next_task(rq);
6715 if (next) {
6716 next->sched_class->put_prev_task(rq, next);
6717 return next;
6718 }
6719 }
6720
6721
6722 BUG();
6723}
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
6734{
6735 struct rq *rq = dead_rq;
6736 struct task_struct *next, *stop = rq->stop;
6737 struct rq_flags orf = *rf;
6738 int dest_cpu;
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749 rq->stop = NULL;
6750
6751
6752
6753
6754
6755
6756 update_rq_clock(rq);
6757
6758 for (;;) {
6759
6760
6761
6762
6763 if (rq->nr_running == 1)
6764 break;
6765
6766 next = __pick_migrate_task(rq);
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777 rq_unlock(rq, rf);
6778 raw_spin_lock(&next->pi_lock);
6779 rq_relock(rq, rf);
6780
6781
6782
6783
6784
6785
6786 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
6787 raw_spin_unlock(&next->pi_lock);
6788 continue;
6789 }
6790
6791
6792 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
6793 rq = __migrate_task(rq, rf, next, dest_cpu);
6794 if (rq != dead_rq) {
6795 rq_unlock(rq, rf);
6796 rq = dead_rq;
6797 *rf = orf;
6798 rq_relock(rq, rf);
6799 }
6800 raw_spin_unlock(&next->pi_lock);
6801 }
6802
6803 rq->stop = stop;
6804}
6805#endif
6806
6807void set_rq_online(struct rq *rq)
6808{
6809 if (!rq->online) {
6810 const struct sched_class *class;
6811
6812 cpumask_set_cpu(rq->cpu, rq->rd->online);
6813 rq->online = 1;
6814
6815 for_each_class(class) {
6816 if (class->rq_online)
6817 class->rq_online(rq);
6818 }
6819 }
6820}
6821
6822void set_rq_offline(struct rq *rq)
6823{
6824 if (rq->online) {
6825 const struct sched_class *class;
6826
6827 for_each_class(class) {
6828 if (class->rq_offline)
6829 class->rq_offline(rq);
6830 }
6831
6832 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6833 rq->online = 0;
6834 }
6835}
6836
6837
6838
6839
/*
 * Incremented by cpuset_cpu_inactive() and decremented by
 * cpuset_cpu_active() while cpuhp_tasks_frozen is set.
 */
static int num_cpus_frozen;
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850static void cpuset_cpu_active(void)
6851{
6852 if (cpuhp_tasks_frozen) {
6853
6854
6855
6856
6857
6858
6859 partition_sched_domains(1, NULL, NULL);
6860 if (--num_cpus_frozen)
6861 return;
6862
6863
6864
6865
6866
6867 cpuset_force_rebuild();
6868 }
6869 cpuset_update_active_cpus();
6870}
6871
6872static int cpuset_cpu_inactive(unsigned int cpu)
6873{
6874 if (!cpuhp_tasks_frozen) {
6875 if (dl_cpu_busy(cpu))
6876 return -EBUSY;
6877 cpuset_update_active_cpus();
6878 } else {
6879 num_cpus_frozen++;
6880 partition_sched_domains(1, NULL, NULL);
6881 }
6882 return 0;
6883}
6884
6885int sched_cpu_activate(unsigned int cpu)
6886{
6887 struct rq *rq = cpu_rq(cpu);
6888 struct rq_flags rf;
6889
6890#ifdef CONFIG_SCHED_SMT
6891
6892
6893
6894 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
6895 static_branch_inc_cpuslocked(&sched_smt_present);
6896#endif
6897 set_cpu_active(cpu, true);
6898
6899 if (sched_smp_initialized) {
6900 sched_domains_numa_masks_set(cpu);
6901 cpuset_cpu_active();
6902 }
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913 rq_lock_irqsave(rq, &rf);
6914 if (rq->rd) {
6915 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6916 set_rq_online(rq);
6917 }
6918 rq_unlock_irqrestore(rq, &rf);
6919
6920 return 0;
6921}
6922
6923int sched_cpu_deactivate(unsigned int cpu)
6924{
6925 int ret;
6926
6927 set_cpu_active(cpu, false);
6928
6929
6930
6931
6932
6933
6934
6935 synchronize_rcu();
6936
6937#ifdef CONFIG_SCHED_SMT
6938
6939
6940
6941 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
6942 static_branch_dec_cpuslocked(&sched_smt_present);
6943#endif
6944
6945 if (!sched_smp_initialized)
6946 return 0;
6947
6948 ret = cpuset_cpu_inactive(cpu);
6949 if (ret) {
6950 set_cpu_active(cpu, true);
6951 return ret;
6952 }
6953 sched_domains_numa_masks_clear(cpu);
6954 return 0;
6955}
6956
6957static void sched_rq_cpu_starting(unsigned int cpu)
6958{
6959 struct rq *rq = cpu_rq(cpu);
6960
6961 rq->calc_load_update = calc_load_update;
6962 update_max_interval();
6963}
6964
/*
 * sched_cpu_starting - scheduler callback for a starting CPU: initialise
 * its runqueue bookkeeping and start the scheduler tick on it.
 *
 * Return: always 0.
 */
int sched_cpu_starting(unsigned int cpu)
{
	sched_rq_cpu_starting(cpu);
	sched_tick_start(cpu);

	return 0;
}
6971
#ifdef CONFIG_HOTPLUG_CPU
/*
 * sched_cpu_dying - scheduler callback for a CPU that is going down.
 *
 * Stops the scheduler tick, sets the runqueue offline (if attached to a
 * root domain), migrates tasks away with migrate_tasks() and insists that
 * exactly one task is left on the runqueue afterwards. After dropping the
 * runqueue lock it folds the load-average contribution, updates the max
 * balance interval, leaves nohz idle balancing and clears the hrtick.
 *
 * Return: always 0.
 */
int sched_cpu_dying(unsigned int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	sched_tick_stop(cpu);

	rq_lock_irqsave(rq, &rf);
	if (rq->rd) {
		/* The CPU must be part of the span of its root domain. */
		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
		set_rq_offline(rq);
	}
	migrate_tasks(rq, &rf);
	BUG_ON(rq->nr_running != 1);
	rq_unlock_irqrestore(rq, &rf);

	calc_load_migrate(rq);
	update_max_interval();
	nohz_balance_exit_idle(rq);
	hrtick_clear(rq);

	return 0;
}
#endif
6997
6998void __init sched_init_smp(void)
6999{
7000 sched_init_numa();
7001
7002
7003
7004
7005
7006
7007 mutex_lock(&sched_domains_mutex);
7008 sched_init_domains(cpu_active_mask);
7009 mutex_unlock(&sched_domains_mutex);
7010
7011
7012 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
7013 BUG();
7014 sched_init_granularity();
7015
7016 init_sched_rt_class();
7017 init_sched_dl_class();
7018
7019 sched_smp_initialized = true;
7020}
7021
7022static int __init migration_init(void)
7023{
7024 sched_cpu_starting(smp_processor_id());
7025 return 0;
7026}
7027early_initcall(migration_init);
7028
7029#else
7030void __init sched_init_smp(void)
7031{
7032 sched_init_granularity();
7033}
7034#endif
7035
7036int in_sched_functions(unsigned long addr)
7037{
7038 return in_lock_functions(addr) ||
7039 (addr >= (unsigned long)__sched_text_start
7040 && addr < (unsigned long)__sched_text_end);
7041}
7042
#ifdef CONFIG_CGROUP_SCHED
/* The root task group; sched_init() links it into task_groups. */
struct task_group root_task_group;

/* List of task groups; sched_init() adds root_task_group to it. */
LIST_HEAD(task_groups);

/* Slab cache for struct task_group, created in sched_init(). */
static struct kmem_cache *task_group_cache __read_mostly;
#endif

/* Per-CPU cpumasks; sched_init() allocates them under CONFIG_CPUMASK_OFFSTACK. */
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
7057
/*
 * One-time boot initialisation of the scheduler.
 *
 * Sets up the root task group's per-CPU pointer arrays, the default
 * RT/deadline bandwidth, every possible CPU's runqueue and the boot task's
 * idle state, then sets scheduler_running.
 */
void __init sched_init(void)
{
	unsigned long ptr = 0;
	int i;

	/*
	 * The sched_class objects must sit back to back in memory in the
	 * order idle, fair, rt, dl (followed by stop on SMP).
	 */
	BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
	       &fair_sched_class + 1 != &rt_sched_class ||
	       &rt_sched_class + 1 != &dl_sched_class);
#ifdef CONFIG_SMP
	BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
#endif

	wait_bit_init();

	/*
	 * Compute the size of a single allocation that holds two
	 * nr_cpu_ids-sized pointer arrays per enabled group-scheduling type.
	 */
#ifdef CONFIG_FAIR_GROUP_SCHED
	ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
	if (ptr) {
		/* NOTE(review): the allocation result is not checked for NULL. */
		ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);

		/* Carve the buffer up, advancing ptr past each array. */
#ifdef CONFIG_FAIR_GROUP_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#endif /* CONFIG_RT_GROUP_SCHED */
	}
#ifdef CONFIG_CPUMASK_OFFSTACK
	/* Allocate each CPU's masks on that CPU's memory node. */
	for_each_possible_cpu(i) {
		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
	}
#endif /* CONFIG_CPUMASK_OFFSTACK */

	/* Both defaults are seeded from the global RT period and runtime. */
	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
	init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
	task_group_cache = KMEM_CACHE(task_group, 0);

	list_add(&root_task_group.list, &task_groups);
	INIT_LIST_HEAD(&root_task_group.children);
	INIT_LIST_HEAD(&root_task_group.siblings);
	autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */

	/* Initialise the runqueue of every possible CPU. */
	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		raw_spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs);
		init_rt_rq(&rq->rt);
		init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		/*
		 * Register this runqueue's cfs_rq with the root task group for
		 * CPU i. The two NULL arguments are presumably the group
		 * entity and its parent -- confirm against
		 * init_tg_cfs_entry().
		 */
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
		rq->balance_callback = NULL;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->idle_stamp = 0;
		rq->avg_idle = 2*sysctl_sched_migration_cost;
		rq->max_idle_balance_cost = sysctl_sched_migration_cost;

		INIT_LIST_HEAD(&rq->cfs_tasks);

		/* Every runqueue starts out attached to the default root domain. */
		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
		rq->last_blocked_load_update_tick = jiffies;
		atomic_set(&rq->nohz_flags, 0);

		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
#endif /* CONFIG_SMP */
		hrtick_rq_init(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

	set_load_weight(&init_task, false);

	/*
	 * Take a reference on init_mm and put the current (boot) task into
	 * lazy-TLB mode on it.
	 */
	mmgrab(&init_mm);
	enter_lazy_tlb(&init_mm, current);

	/* Set up the currently running boot task as this CPU's idle task. */
	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

#ifdef CONFIG_SMP
	idle_thread_set_boot_cpu();
#endif
	init_sched_fair_class();

	init_schedstats();

	psi_init();

	init_uclamp();

	scheduler_running = 1;
}
7230
7231#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/*
 * Return non-zero when the sum of preempt_count() and rcu_preempt_depth()
 * equals @preempt_offset.
 */
static inline int preempt_count_equals(int preempt_offset)
{
	int depth = preempt_count() + rcu_preempt_depth();

	if (depth != preempt_offset)
		return 0;

	return 1;
}
7238
/*
 * Debug hook for code that may sleep.
 *
 * @file, @line:     location of the caller, used in the diagnostic output.
 * @preempt_offset:  preempt count that the caller considers acceptable.
 *
 * Warns once if the current task is not TASK_RUNNING while a state-change
 * address has been recorded in current->task_state_change, then performs
 * the atomic-context check in ___might_sleep().
 */
void __might_sleep(const char *file, int line, int preempt_offset)
{
	/*
	 * The recorded task_state_change value is printed twice: once as a
	 * raw pointer (%p) and once symbolised (%pS).
	 */
	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
			"do not call blocking ops when !TASK_RUNNING; "
			"state=%lx set at [<%p>] %pS\n",
			current->state,
			(void *)current->task_state_change,
			(void *)current->task_state_change);

	___might_sleep(file, line, preempt_offset);
}
EXPORT_SYMBOL(__might_sleep);
7256
/*
 * Report a "sleeping function called from invalid context" bug if the
 * caller is in a context where sleeping is not allowed.
 *
 * @file, @line:     location printed in the report.
 * @preempt_offset:  preempt count (including RCU preempt depth) that is
 *                   considered acceptable.
 *
 * Reports are rate limited to one per HZ jiffies via a function-static
 * timestamp, and taint the kernel with TAINT_WARN.
 */
void ___might_sleep(const char *file, int line, int preempt_offset)
{
	/* Time of the last report; shared by all callers for rate limiting. */
	static unsigned long prev_jiffy;

	unsigned long preempt_disable_ip;

	rcu_sleep_check();

	/*
	 * Stay silent when the context is fine (expected preempt count, IRQs
	 * enabled, not the idle task, no non-block section), or when the
	 * system is booting, past SYSTEM_RUNNING, or already oopsing.
	 */
	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
	     !is_idle_task(current) && !current->non_block_count) ||
	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
	    oops_in_progress)
		return;

	/* Rate limit; prev_jiffy == 0 means nothing has been reported yet. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	/* Sample the IP now, before the reporting code below runs. */
	preempt_disable_ip = get_preempt_disable_ip(current);

	printk(KERN_ERR
		"BUG: sleeping function called from invalid context at %s:%d\n",
			file, line);
	printk(KERN_ERR
		"in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(), current->non_block_count,
			current->pid, current->comm);

	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	/* Only show where preemption was disabled if the count is off. */
	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
	    && !preempt_count_equals(preempt_offset)) {
		pr_err("Preemption disabled at:");
		print_ip_sym(KERN_ERR, preempt_disable_ip);
	}
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
7303
7304void __cant_sleep(const char *file, int line, int preempt_offset)
7305{
7306 static unsigned long prev_jiffy;
7307
7308 if (irqs_disabled())
7309 return;
7310
7311 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7312 return;
7313
7314 if (preempt_count() > preempt_offset)
7315 return;
7316
7317 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7318 return;
7319 prev_jiffy = jiffies;
7320
7321 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7322 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7323 in_atomic(), irqs_disabled(),
7324 current->pid, current->comm);
7325
7326 debug_show_held_locks(current);
7327 dump_stack();
7328 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7329}
7330EXPORT_SYMBOL_GPL(__cant_sleep);
7331#endif
7332
7333#ifdef CONFIG_MAGIC_SYSRQ
7334void normalize_rt_tasks(void)
7335{
7336 struct task_struct *g, *p;
7337 struct sched_attr attr = {
7338 .sched_policy = SCHED_NORMAL,
7339 };
7340
7341 read_lock(&tasklist_lock);
7342 for_each_process_thread(g, p) {
7343
7344
7345
7346 if (p->flags & PF_KTHREAD)
7347 continue;
7348
7349 p->se.exec_start = 0;
7350 schedstat_set(p->se.statistics.wait_start, 0);
7351 schedstat_set(p->se.statistics.sleep_start, 0);
7352 schedstat_set(p->se.statistics.block_start, 0);
7353
7354 if (!dl_task(p) && !rt_task(p)) {
7355
7356
7357
7358
7359 if (task_nice(p) < 0)
7360 set_user_nice(p, 0);
7361 continue;
7362 }
7363
7364 __sched_setscheduler(p, &attr, false, false);
7365 }
7366 read_unlock(&tasklist_lock);
7367}
7368
7369#endif
7370
7371#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
/* Return the task that cpu_curr() reports as current on @cpu. */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *task = cpu_curr(cpu);

	return task;
}
7394
7395#endif
7396
7397#ifdef CONFIG_IA64
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
/*
 * Overwrite the task recorded as current on @cpu with @p.
 *
 * This is a plain store with no locking in this function.
 * NOTE(review): callers presumably guarantee the CPU is quiescent --
 * confirm before adding new users.
 */
void ia64_set_curr_task(int cpu, struct task_struct *p)
{
	cpu_curr(cpu) = p;
}
7417
7418#endif
7419
7420#ifdef CONFIG_CGROUP_SCHED
7421
/* Held while adding/removing groups on task_groups and on sibling lists. */
static DEFINE_SPINLOCK(task_group_lock);
7423
/*
 * Initialise the utilization clamps of a new task group @tg.
 *
 * For every clamp id, the requested clamp is set to uclamp_none() for that
 * id and the effective clamp is copied from @parent. This is a no-op
 * unless CONFIG_UCLAMP_TASK_GROUP is enabled.
 */
static inline void alloc_uclamp_sched_group(struct task_group *tg,
					    struct task_group *parent)
{
#ifdef CONFIG_UCLAMP_TASK_GROUP
	enum uclamp_id clamp_id;

	for_each_clamp_id(clamp_id) {
		uclamp_se_set(&tg->uclamp_req[clamp_id],
			      uclamp_none(clamp_id), false);
		tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
	}
#endif
}
7437
/*
 * Release everything owned by @tg: its fair and RT group state, its
 * autogroup state, and finally the task_group object itself, which is
 * returned to task_group_cache.
 */
static void sched_free_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	autogroup_free(tg);
	kmem_cache_free(task_group_cache, tg);
}
7445
7446
7447struct task_group *sched_create_group(struct task_group *parent)
7448{
7449 struct task_group *tg;
7450
7451 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
7452 if (!tg)
7453 return ERR_PTR(-ENOMEM);
7454
7455 if (!alloc_fair_sched_group(tg, parent))
7456 goto err;
7457
7458 if (!alloc_rt_sched_group(tg, parent))
7459 goto err;
7460
7461 alloc_uclamp_sched_group(tg, parent);
7462
7463 return tg;
7464
7465err:
7466 sched_free_group(tg);
7467 return ERR_PTR(-ENOMEM);
7468}
7469
/*
 * Make @tg visible: link it onto the global task_groups list and onto
 * @parent's children list (both with RCU list primitives, under
 * task_group_lock), then bring its fair group state online.
 */
void sched_online_group(struct task_group *tg, struct task_group *parent)
{
	unsigned long flags;

	spin_lock_irqsave(&task_group_lock, flags);
	list_add_rcu(&tg->list, &task_groups);

	/* A NULL parent is unexpected; warn, the code below still uses it. */
	WARN_ON(!parent);

	tg->parent = parent;
	INIT_LIST_HEAD(&tg->children);
	list_add_rcu(&tg->siblings, &parent->children);
	spin_unlock_irqrestore(&task_group_lock, flags);

	online_fair_sched_group(tg);
}
7487
7488
7489static void sched_free_group_rcu(struct rcu_head *rhp)
7490{
7491
7492 sched_free_group(container_of(rhp, struct task_group, rcu));
7493}
7494
/*
 * Schedule @tg to be freed by sched_free_group_rcu() via call_rcu(), i.e.
 * the actual free is deferred rather than done synchronously here.
 */
void sched_destroy_group(struct task_group *tg)
{
	call_rcu(&tg->rcu, sched_free_group_rcu);
}
7500
/*
 * Take @tg offline: unregister its fair group state, then unlink it from
 * the global task_groups list and from its parent's children list under
 * task_group_lock. The group's memory is not freed here.
 */
void sched_offline_group(struct task_group *tg)
{
	unsigned long flags;

	/* Done before the list removal and outside task_group_lock. */
	unregister_fair_sched_group(tg);

	spin_lock_irqsave(&task_group_lock, flags);
	list_del_rcu(&tg->list);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);
}
7513
/*
 * Point @tsk at the task group derived from its cpu cgroup css (possibly
 * replaced by its autogroup) and update its runqueue association.
 *
 * @type is forwarded to the scheduling class's task_change_group() hook
 * when that hook exists; the callers in this file pass TASK_MOVE_GROUP or
 * TASK_SET_GROUP.
 */
static void sched_change_group(struct task_struct *tsk, int type)
{
	struct task_group *tg;

	/*
	 * The 'true' passed to task_css_check() is the caller-supplied
	 * check condition. NOTE(review): presumably this relies on the
	 * callers' locking to keep the css stable -- confirm.
	 */
	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
			  struct task_group, css);
	tg = autogroup_task_group(tsk, tg);
	tsk->sched_task_group = tg;

	/*
	 * With CONFIG_FAIR_GROUP_SCHED the class hook takes precedence and
	 * set_task_rq() is only the fallback; without it, set_task_rq() is
	 * called unconditionally.
	 */
#ifdef CONFIG_FAIR_GROUP_SCHED
	if (tsk->sched_class->task_change_group)
		tsk->sched_class->task_change_group(tsk, type);
	else
#endif
		set_task_rq(tsk, task_cpu(tsk));
}
7535
7536
7537
7538
7539
7540
7541
7542
/*
 * Re-home @tsk into the task group that matches its current cgroup.
 *
 * With the task's runqueue locked, the task is dequeued (if queued) and
 * put (if it is the rq's current task), its group is changed, and it is
 * then re-enqueued / set as next again in the same order.
 */
void sched_move_task(struct task_struct *tsk)
{
	int queued, running, queue_flags =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(tsk, &rf);
	/* The clock is updated once here; queue_flags carries NOCLOCK. */
	update_rq_clock(rq);

	running = task_current(rq, tsk);
	queued = task_on_rq_queued(tsk);

	if (queued)
		dequeue_task(rq, tsk, queue_flags);
	if (running)
		put_prev_task(rq, tsk);

	sched_change_group(tsk, TASK_MOVE_GROUP);

	if (queued)
		enqueue_task(rq, tsk, queue_flags);
	if (running) {
		set_next_task(rq, tsk);
		/* Request a reschedule of the rq's current task after the move. */
		resched_curr(rq);
	}

	task_rq_unlock(rq, tsk, &rf);
}
7577
7578static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7579{
7580 return css ? container_of(css, struct task_group, css) : NULL;
7581}
7582
7583static struct cgroup_subsys_state *
7584cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7585{
7586 struct task_group *parent = css_tg(parent_css);
7587 struct task_group *tg;
7588
7589 if (!parent) {
7590
7591 return &root_task_group.css;
7592 }
7593
7594 tg = sched_create_group(parent);
7595 if (IS_ERR(tg))
7596 return ERR_PTR(-ENOMEM);
7597
7598 return &tg->css;
7599}
7600
7601
7602static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7603{
7604 struct task_group *tg = css_tg(css);
7605 struct task_group *parent = css_tg(css->parent);
7606
7607 if (parent)
7608 sched_online_group(tg, parent);
7609
7610#ifdef CONFIG_UCLAMP_TASK_GROUP
7611
7612 cpu_util_update_eff(css);
7613#endif
7614
7615 return 0;
7616}
7617
/* css_released callback: take the task group behind @css offline. */
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
	sched_offline_group(css_tg(css));
}
7624
/*
 * css_free callback: free the task group behind @css immediately (no
 * call_rcu() deferral is used on this path).
 */
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
	sched_free_group(css_tg(css));
}
7634
7635
7636
7637
7638
/*
 * fork callback of the cpu controller: with @task's runqueue locked and
 * its clock updated, set the task's scheduling group (TASK_SET_GROUP).
 */
static void cpu_cgroup_fork(struct task_struct *task)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(task, &rf);

	update_rq_clock(rq);
	sched_change_group(task, TASK_SET_GROUP);

	task_rq_unlock(rq, task, &rf);
}
7651
/*
 * can_attach callback of the cpu controller.
 *
 * Returns 0 if every task in @tset may be moved, or -EINVAL as soon as
 * one task is refused: either sched_rt_can_attach() rejects it
 * (CONFIG_RT_GROUP_SCHED only) or the task is still in state TASK_NEW.
 */
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	int ret = 0;

	cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
		if (!sched_rt_can_attach(css_tg(css), task))
			return -EINVAL;
#endif
		/* The task state is sampled with the task's pi_lock held. */
		raw_spin_lock_irq(&task->pi_lock);
		/*
		 * A TASK_NEW task is refused. NOTE(review): presumably because
		 * its scheduler setup has not completed yet -- confirm
		 * against the fork path.
		 */
		if (task->state == TASK_NEW)
			ret = -EINVAL;
		raw_spin_unlock_irq(&task->pi_lock);

		if (ret)
			break;
	}
	return ret;
}
7682
/*
 * attach callback of the cpu controller: move every task in @tset to the
 * task group of its cgroup. The css iterator variable is required by
 * cgroup_taskset_for_each() but is otherwise unused here.
 */
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		sched_move_task(task);
}
7691
7692#ifdef CONFIG_UCLAMP_TASK_GROUP
/*
 * Recompute the effective utilization clamps of @css and of its
 * descendants, visiting them in pre-order.
 *
 * For each group the effective value of every clamp is its requested
 * value, capped by the parent's effective value; the effective minimum is
 * additionally capped by the effective maximum. Groups whose values did
 * not change are not propagated further.
 *
 * The visible caller cpu_uclamp_write() invokes this with uclamp_mutex
 * held inside an RCU read-side section.
 */
static void cpu_util_update_eff(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *top_css = css;
	struct uclamp_se *uc_parent = NULL;
	struct uclamp_se *uc_se = NULL;
	unsigned int eff[UCLAMP_CNT];
	enum uclamp_id clamp_id;
	unsigned int clamps;

	/* @css is reused as the iteration cursor; top_css keeps the root. */
	css_for_each_descendant_pre(css, top_css) {
		uc_parent = css_tg(css)->parent
			? css_tg(css)->parent->uclamp : NULL;

		for_each_clamp_id(clamp_id) {
			/* Start from the requested value ... */
			eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
			/* ... and cap it by the parent's effective value. */
			if (uc_parent &&
			    eff[clamp_id] > uc_parent[clamp_id].value) {
				eff[clamp_id] = uc_parent[clamp_id].value;
			}
		}
		/* The effective minimum never exceeds the effective maximum. */
		eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);

		/* Store changed values; 'clamps' is a bitmask of changed ids. */
		clamps = 0x0;
		uc_se = css_tg(css)->uclamp;
		for_each_clamp_id(clamp_id) {
			if (eff[clamp_id] == uc_se[clamp_id].value)
				continue;
			uc_se[clamp_id].value = eff[clamp_id];
			uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
			clamps |= (0x1 << clamp_id);
		}
		if (!clamps) {
			/*
			 * Nothing changed here: move the cursor to this
			 * group's rightmost descendant so the walk continues
			 * after this subtree.
			 */
			css = css_rightmost_descendant(css);
			continue;
		}

		/* Apply the changed clamps to the group's active tasks. */
		uclamp_update_active_tasks(css, clamps);
	}
}
7737
7738
7739
7740
7741
7742
/*
 * POW10(exp) expands to 10^exp as an unsigned int by pasting @exp onto
 * "1e". The two-level macro lets @exp itself be a macro that is expanded
 * before the token pasting happens.
 */
#define _POW10(exp) ((unsigned int)1e##exp)
#define POW10(exp) _POW10(exp)

/*
 * Result of parsing a utilization clamp written as a percentage.
 *
 * Percentages are fixed-point numbers with UCLAMP_PERCENT_SHIFT decimal
 * digits, so UCLAMP_PERCENT_SCALE represents 100%.
 */
struct uclamp_request {
#define UCLAMP_PERCENT_SHIFT	2
#define UCLAMP_PERCENT_SCALE	(100 * POW10(UCLAMP_PERCENT_SHIFT))
	s64 percent;	/* parsed percentage, fixed-point as described above */
	u64 util;	/* the same request on the SCHED_CAPACITY_SCALE range */
	int ret;	/* 0 on success, negative errno on parse/range error */
};
7753
7754static inline struct uclamp_request
7755capacity_from_percent(char *buf)
7756{
7757 struct uclamp_request req = {
7758 .percent = UCLAMP_PERCENT_SCALE,
7759 .util = SCHED_CAPACITY_SCALE,
7760 .ret = 0,
7761 };
7762
7763 buf = strim(buf);
7764 if (strcmp(buf, "max")) {
7765 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
7766 &req.percent);
7767 if (req.ret)
7768 return req;
7769 if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
7770 req.ret = -ERANGE;
7771 return req;
7772 }
7773
7774 req.util = req.percent << SCHED_CAPACITY_SHIFT;
7775 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
7776 }
7777
7778 return req;
7779}
7780
/*
 * Common write handler for the uclamp.min / uclamp.max cgroup files.
 *
 * @buf holds "max" or a percentage; @clamp_id selects which clamp is
 * updated. Returns @nbytes on success or the negative errno produced by
 * capacity_from_percent().
 */
static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off,
				enum uclamp_id clamp_id)
{
	struct uclamp_request req;
	struct task_group *tg;

	req = capacity_from_percent(buf);
	if (req.ret)
		return req.ret;

	/* Enabled on every successful parse; it is never disabled here. */
	static_branch_enable(&sched_uclamp_used);

	mutex_lock(&uclamp_mutex);
	rcu_read_lock();

	tg = css_tg(of_css(of));
	if (tg->uclamp_req[clamp_id].value != req.util)
		uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);

	/*
	 * Keep the percentage as written, so that cpu_uclamp_print() can
	 * show it without converting back from the utilization value.
	 */
	tg->uclamp_pct[clamp_id] = req.percent;

	/* Recompute effective clamps for this group and its descendants. */
	cpu_util_update_eff(of_css(of));

	rcu_read_unlock();
	mutex_unlock(&uclamp_mutex);

	return nbytes;
}
7815
/* Write handler for the "uclamp.min" file: cpu_uclamp_write() for UCLAMP_MIN. */
static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes,
				    loff_t off)
{
	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
}
7822
/* Write handler for the "uclamp.max" file: cpu_uclamp_write() for UCLAMP_MAX. */
static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes,
				    loff_t off)
{
	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
}
7829
/*
 * Print the requested value of clamp @clamp_id for the task group behind
 * @sf: "max" when the request equals SCHED_CAPACITY_SCALE, otherwise the
 * stored percentage with UCLAMP_PERCENT_SHIFT decimal digits.
 */
static inline void cpu_uclamp_print(struct seq_file *sf,
				    enum uclamp_id clamp_id)
{
	struct task_group *tg;
	u64 util_clamp;
	u64 percent;
	u32 rem;

	rcu_read_lock();
	tg = css_tg(seq_css(sf));
	util_clamp = tg->uclamp_req[clamp_id].value;
	rcu_read_unlock();

	if (util_clamp == SCHED_CAPACITY_SCALE) {
		seq_puts(sf, "max\n");
		return;
	}

	/* NOTE(review): tg->uclamp_pct is read outside the RCU section above. */
	percent = tg->uclamp_pct[clamp_id];
	/* Split the fixed-point value into integer part and remainder. */
	percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
	seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
}
7852
/* seq_show handler for "uclamp.min"; always returns 0. */
static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
{
	cpu_uclamp_print(sf, UCLAMP_MIN);
	return 0;
}
7858
/* seq_show handler for "uclamp.max"; always returns 0. */
static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
{
	cpu_uclamp_print(sf, UCLAMP_MAX);
	return 0;
}
7864#endif
7865
7866#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Write handler for the legacy "shares" file.
 *
 * Values larger than scale_load_down(ULONG_MAX) are replaced by MAX_SHARES
 * before scale_load() is applied. Returns the result of
 * sched_group_set_shares().
 */
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
	if (shareval > scale_load_down(ULONG_MAX))
		shareval = MAX_SHARES;
	return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
7874
7875static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7876 struct cftype *cft)
7877{
7878 struct task_group *tg = css_tg(css);
7879
7880 return (u64) scale_load_down(tg->shares);
7881}
7882
7883#ifdef CONFIG_CFS_BANDWIDTH
/* Held by tg_set_cfs_bandwidth() around the schedulability check and update. */
static DEFINE_MUTEX(cfs_constraints_mutex);

/* Bounds enforced by tg_set_cfs_bandwidth(), in nanoseconds. */
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
/* Largest finite quota accepted: MAX_BW microseconds, in nanoseconds. */
static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

/* Defined further down in this file; needed by tg_set_cfs_bandwidth(). */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota);
7892
/*
 * Set the CFS bandwidth of @tg.
 *
 * @period: period length in nanoseconds.
 * @quota:  runtime per period in nanoseconds, or RUNTIME_INF for no limit.
 *
 * Returns 0 on success, -EINVAL when @tg is the root group or a parameter
 * is out of range, or the error from __cfs_schedulable().
 */
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
	int i, ret = 0, runtime_enabled, runtime_was_enabled;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	if (tg == &root_task_group)
		return -EINVAL;

	/* Both quota and period must be at least min_cfs_quota_period. */
	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

	/* The period is also bounded from above. */
	if (period > max_cfs_quota_period)
		return -EINVAL;

	/* A finite quota may not exceed max_cfs_runtime. */
	if (quota != RUNTIME_INF && quota > max_cfs_runtime)
		return -EINVAL;

	/*
	 * Hold off CPU hotplug while the per-CPU cfs_rq state is updated
	 * below, and serialise against other bandwidth updates.
	 */
	get_online_cpus();
	mutex_lock(&cfs_constraints_mutex);
	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		goto out_unlock;

	runtime_enabled = quota != RUNTIME_INF;
	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
	/*
	 * The usage count is raised before the change when bandwidth
	 * control is being switched on, and dropped after the change (see
	 * the end of this function) when it is being switched off.
	 */
	if (runtime_enabled && !runtime_was_enabled)
		cfs_bandwidth_usage_inc();
	raw_spin_lock_irq(&cfs_b->lock);
	cfs_b->period = ns_to_ktime(period);
	cfs_b->quota = quota;

	__refill_cfs_bandwidth_runtime(cfs_b);

	/* (Re)start the bandwidth machinery when a finite quota is in effect. */
	if (runtime_enabled)
		start_cfs_bandwidth(cfs_b);

	raw_spin_unlock_irq(&cfs_b->lock);

	/* Push the new setting to the group's cfs_rq on every online CPU. */
	for_each_online_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
		struct rq *rq = cfs_rq->rq;
		struct rq_flags rf;

		rq_lock_irq(rq, &rf);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
		rq_unlock_irq(rq, &rf);
	}
	if (runtime_was_enabled && !runtime_enabled)
		cfs_bandwidth_usage_dec();
out_unlock:
	mutex_unlock(&cfs_constraints_mutex);
	put_online_cpus();

	return ret;
}
7974
7975static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7976{
7977 u64 quota, period;
7978
7979 period = ktime_to_ns(tg->cfs_bandwidth.period);
7980 if (cfs_quota_us < 0)
7981 quota = RUNTIME_INF;
7982 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
7983 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7984 else
7985 return -EINVAL;
7986
7987 return tg_set_cfs_bandwidth(tg, period, quota);
7988}
7989
7990static long tg_get_cfs_quota(struct task_group *tg)
7991{
7992 u64 quota_us;
7993
7994 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7995 return -1;
7996
7997 quota_us = tg->cfs_bandwidth.quota;
7998 do_div(quota_us, NSEC_PER_USEC);
7999
8000 return quota_us;
8001}
8002
8003static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8004{
8005 u64 quota, period;
8006
8007 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
8008 return -EINVAL;
8009
8010 period = (u64)cfs_period_us * NSEC_PER_USEC;
8011 quota = tg->cfs_bandwidth.quota;
8012
8013 return tg_set_cfs_bandwidth(tg, period, quota);
8014}
8015
8016static long tg_get_cfs_period(struct task_group *tg)
8017{
8018 u64 cfs_period_us;
8019
8020 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
8021 do_div(cfs_period_us, NSEC_PER_USEC);
8022
8023 return cfs_period_us;
8024}
8025
/* Read handler for "cfs_quota_us": quota in microseconds, -1 if unlimited. */
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	return tg_get_cfs_quota(css_tg(css));
}
8031
/* Write handler for "cfs_quota_us"; forwards to tg_set_cfs_quota(). */
static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, s64 cfs_quota_us)
{
	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}
8037
/* Read handler for "cfs_period_us": the group's period in microseconds. */
static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return tg_get_cfs_period(css_tg(css));
}
8043
/* Write handler for "cfs_period_us"; forwards to tg_set_cfs_period(). */
static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 cfs_period_us)
{
	return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
8049
/*
 * Argument passed through walk_tg_tree() by __cfs_schedulable(): the group
 * being changed together with its proposed period and quota.
 */
struct cfs_schedulable_data {
	struct task_group *tg;	/* group whose bandwidth is being changed */
	u64 period, quota;	/* proposed values used for @tg in the check */
};
8054
8055
8056
8057
8058
8059static u64 normalize_cfs_quota(struct task_group *tg,
8060 struct cfs_schedulable_data *d)
8061{
8062 u64 quota, period;
8063
8064 if (tg == d->tg) {
8065 period = d->period;
8066 quota = d->quota;
8067 } else {
8068 period = tg_get_cfs_period(tg);
8069 quota = tg_get_cfs_quota(tg);
8070 }
8071
8072
8073 if (quota == RUNTIME_INF || quota == -1)
8074 return RUNTIME_INF;
8075
8076 return to_ratio(period, quota);
8077}
8078
/*
 * walk_tg_tree() "down" callback used by __cfs_schedulable().
 *
 * Computes and stores the hierarchical quota of @tg from its own
 * normalized quota and its parent's hierarchical quota. @data points to a
 * struct cfs_schedulable_data. Returns 0, or -EINVAL on the legacy
 * hierarchy when a child asks for more than a limited parent allows.
 */
static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		/* A group without a parent is never limited. */
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		/*
		 * On the default hierarchy the smaller of the two values is
		 * taken. Otherwise an unlimited child inherits the parent's
		 * value, and a limited child may not exceed a limited parent.
		 */
		if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
			quota = min(quota, parent_quota);
		} else {
			if (quota == RUNTIME_INF)
				quota = parent_quota;
			else if (parent_quota != RUNTIME_INF && quota > parent_quota)
				return -EINVAL;
		}
	}
	cfs_b->hierarchical_quota = quota;

	return 0;
}
8111
/*
 * Check whether giving @tg the bandwidth (@period, @quota), both in
 * nanoseconds, is acceptable within the task group hierarchy.
 *
 * Walks the task group tree under rcu_read_lock() with
 * tg_cfs_schedulable_down() and returns that walk's result.
 */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	/*
	 * For a finite quota, convert both values to microseconds, the unit
	 * that tg_get_cfs_period()/tg_get_cfs_quota() report for the other
	 * groups.
	 */
	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}
8132
/*
 * seq_show handler for the legacy "stat" file: prints the group's
 * bandwidth counters and, when schedstats are enabled and the group is not
 * the root group, the sum of wait_sum over all possible CPUs.
 * Always returns 0.
 */
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	/* The root group is excluded from the per-CPU tg->se[] walk. */
	if (schedstat_enabled() && tg != &root_task_group) {
		u64 ws = 0;
		int i;

		for_each_possible_cpu(i)
			ws += schedstat_val(tg->se[i]->statistics.wait_sum);

		seq_printf(sf, "wait_sum %llu\n", ws);
	}

	return 0;
}
8154#endif
8155#endif
8156
8157#ifdef CONFIG_RT_GROUP_SCHED
/* Write handler for "rt_runtime_us"; forwards to sched_group_set_rt_runtime(). */
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	return sched_group_set_rt_runtime(css_tg(css), val);
}
8163
/* Read handler for "rt_runtime_us"; returns sched_group_rt_runtime(). */
static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return sched_group_rt_runtime(css_tg(css));
}
8169
/* Write handler for "rt_period_us"; forwards to sched_group_set_rt_period(). */
static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	return sched_group_set_rt_period(css_tg(css), rt_period_us);
}
8175
/* Read handler for "rt_period_us"; returns sched_group_rt_period(). */
static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return sched_group_rt_period(css_tg(css));
}
8181#endif
8182
/*
 * Control files of the cpu controller that are registered as
 * .legacy_cftypes in cpu_cgrp_subsys. Each group of entries exists only
 * when the matching config option is enabled; the empty entry terminates
 * the array.
 */
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "cfs_quota_us",
		.read_s64 = cpu_cfs_quota_read_s64,
		.write_s64 = cpu_cfs_quota_write_s64,
	},
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_cfs_period_read_u64,
		.write_u64 = cpu_cfs_period_write_u64,
	},
	{
		.name = "stat",
		.seq_show = cpu_cfs_stat_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
	{
		.name = "uclamp.min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_min_show,
		.write = cpu_uclamp_min_write,
	},
	{
		.name = "uclamp.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_max_show,
		.write = cpu_uclamp_max_write,
	},
#endif
	{ }	/* Terminate */
};
8235
/*
 * css_extra_stat_show callback of the cpu controller.
 *
 * With CONFIG_CFS_BANDWIDTH, prints nr_periods, nr_throttled and the
 * throttled time converted from nanoseconds to microseconds; without it
 * nothing is printed. Always returns 0.
 */
static int cpu_extra_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	{
		struct task_group *tg = css_tg(css);
		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
		u64 throttled_usec;

		throttled_usec = cfs_b->throttled_time;
		/* do_div() divides in place. */
		do_div(throttled_usec, NSEC_PER_USEC);

		seq_printf(sf, "nr_periods %d\n"
			   "nr_throttled %d\n"
			   "throttled_usec %llu\n",
			   cfs_b->nr_periods, cfs_b->nr_throttled,
			   throttled_usec);
	}
#endif
	return 0;
}
8257
8258#ifdef CONFIG_FAIR_GROUP_SCHED
8259static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
8260 struct cftype *cft)
8261{
8262 struct task_group *tg = css_tg(css);
8263 u64 weight = scale_load_down(tg->shares);
8264
8265 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
8266}
8267
/*
 * Write handler for the "weight" file.
 *
 * Accepts weights in [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX] (-ERANGE
 * otherwise) and converts them to shares so that CGROUP_WEIGHT_DFL maps
 * to 1024, rounding to the closest integer. Returns the result of
 * sched_group_set_shares().
 */
static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 weight)
{
	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
		return -ERANGE;

	/* Inverse of the conversion done in cpu_weight_read_u64(). */
	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}
8285
/*
 * Read handler for the "weight.nice" file: returns the nice value whose
 * sched_prio_to_weight[] entry is closest to the group's current weight.
 */
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
				    struct cftype *cft)
{
	unsigned long weight = scale_load_down(css_tg(css)->shares);
	int last_delta = INT_MAX;
	int prio, delta;

	/*
	 * Scan from index 0 and stop at the first entry that is no closer
	 * than its predecessor; the predecessor (prio - 1) is then the
	 * closest entry.
	 */
	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
		delta = abs(sched_prio_to_weight[prio] - weight);
		if (delta >= last_delta)
			break;
		last_delta = delta;
	}

	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
}
8303
8304static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
8305 struct cftype *cft, s64 nice)
8306{
8307 unsigned long weight;
8308 int idx;
8309
8310 if (nice < MIN_NICE || nice > MAX_NICE)
8311 return -ERANGE;
8312
8313 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
8314 idx = array_index_nospec(idx, 40);
8315 weight = sched_prio_to_weight[idx];
8316
8317 return sched_group_set_shares(css_tg(css), scale_load(weight));
8318}
8319#endif
8320
8321static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
8322 long period, long quota)
8323{
8324 if (quota < 0)
8325 seq_puts(sf, "max");
8326 else
8327 seq_printf(sf, "%ld", quota);
8328
8329 seq_printf(sf, " %ld\n", period);
8330}
8331
8332
/*
 * Parse "<quota|max> [<period>]" from @buf.
 *
 * @periodp: in/out. On entry it holds the period (in microseconds) to use
 *           when @buf does not supply one; on return it holds the period
 *           in nanoseconds.
 * @quotap:  out. The quota in nanoseconds, or RUNTIME_INF for "max".
 *
 * Returns 0 on success, -EINVAL if no token could be read or the quota
 * token is neither a number nor "max".
 */
static int __maybe_unused cpu_period_quota_parse(char *buf,
						 u64 *periodp, u64 *quotap)
{
	char tok[21];	/* up to 20 characters plus the terminating NUL */

	/* The period is optional: one converted item is enough. */
	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
		return -EINVAL;

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap))
		*quotap *= NSEC_PER_USEC;
	else if (!strcmp(tok, "max"))
		*quotap = RUNTIME_INF;
	else
		return -EINVAL;

	return 0;
}
8352
8353#ifdef CONFIG_CFS_BANDWIDTH
/*
 * seq_show handler for the "max" file: prints the group's quota and period
 * in microseconds. Always returns 0.
 */
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	long period_us = tg_get_cfs_period(tg);
	long quota_us = tg_get_cfs_quota(tg);

	cpu_period_quota_print(sf, period_us, quota_us);

	return 0;
}
8361
8362static ssize_t cpu_max_write(struct kernfs_open_file *of,
8363 char *buf, size_t nbytes, loff_t off)
8364{
8365 struct task_group *tg = css_tg(of_css(of));
8366 u64 period = tg_get_cfs_period(tg);
8367 u64 quota;
8368 int ret;
8369
8370 ret = cpu_period_quota_parse(buf, &period, "a);
8371 if (!ret)
8372 ret = tg_set_cfs_bandwidth(tg, period, quota);
8373 return ret ?: nbytes;
8374}
8375#endif
8376
/*
 * Control files of the cpu controller that are registered as .dfl_cftypes
 * in cpu_cgrp_subsys. All entries are flagged CFTYPE_NOT_ON_ROOT; each
 * group of entries exists only when the matching config option is
 * enabled, and the empty entry terminates the array.
 */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = cpu_weight_read_u64,
		.write_u64 = cpu_weight_write_u64,
	},
	{
		.name = "weight.nice",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = cpu_weight_nice_read_s64,
		.write_s64 = cpu_weight_nice_write_s64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_max_show,
		.write = cpu_max_write,
	},
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
	{
		.name = "uclamp.min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_min_show,
		.write = cpu_uclamp_min_write,
	},
	{
		.name = "uclamp.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_max_show,
		.write = cpu_uclamp_max_write,
	},
#endif
	{ }	/* terminate */
};
8416
/*
 * The cpu cgroup controller: wires the css lifecycle, fork and attach
 * callbacks defined above into the cgroup core, together with the legacy
 * and default-hierarchy control files.
 */
struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc	= cpu_cgroup_css_alloc,
	.css_online	= cpu_cgroup_css_online,
	.css_released	= cpu_cgroup_css_released,
	.css_free	= cpu_cgroup_css_free,
	.css_extra_stat_show = cpu_extra_stat_show,
	.fork		= cpu_cgroup_fork,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.legacy_cftypes	= cpu_legacy_files,
	.dfl_cftypes	= cpu_files,
	.early_init	= true,
	.threaded	= true,
};
8431
8432#endif
8433
/* Log a header line and then show the task that is current on @cpu. */
void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
/*
 * Load weight for each of the 40 nice levels.
 *
 * The table is indexed by NICE_TO_PRIO(nice) - MAX_RT_PRIO (see
 * cpu_weight_nice_write_s64()); weights decrease monotonically with the
 * index and entry 20 is 1024. Each step changes the weight by roughly a
 * factor of 1.25.
 */
const int sched_prio_to_weight[40] = {
	88761, 71755, 56483, 46273, 36291,
	29154, 23254, 18705, 14949, 11916,
	9548, 7620, 6100, 4904, 3906,
	3121, 2501, 1991, 1586, 1277,
	1024, 820, 655, 526, 423,
	335, 272, 215, 172, 137,
	110, 87, 70, 56, 45,
	36, 29, 23, 18, 15,
};
8462
8463
8464
8465
8466
8467
8468
8469
/*
 * Inverse weights: entry i is 2^32 / sched_prio_to_weight[i], truncated to
 * an integer (e.g. 2^32 / 1024 = 4194304 at index 20).
 * NOTE(review): presumably used to replace divisions by the weight with
 * multiplications -- confirm against the users of this table.
 */
const u32 sched_prio_to_wmult[40] = {
	48388, 59856, 76040, 92818, 118348,
	147320, 184698, 229616, 287308, 360437,
	449829, 563644, 704093, 875809, 1099582,
	1376151, 1717300, 2157191, 2708050, 3363326,
	4194304, 5237765, 6557202, 8165337, 10153587,
	12820798, 15790321, 19976592, 24970740, 31350126,
	39045157, 49367440, 61356676, 76695844, 95443717,
	119304647, 148102320, 186737708, 238609294, 286331153,
};
8480
/*
 * Out-of-line wrapper that fires the sched_update_nr_running_tp
 * tracepoint with @rq and @count.
 */
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
	trace_sched_update_nr_running_tp(rq, count);
}
8485