1
2
3
4
5
6
7
8
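/*
 * Core kernel scheduler code and related syscalls.
 */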
9#define CREATE_TRACE_POINTS
10#include <trace/events/sched.h>
11#undef CREATE_TRACE_POINTS
12
13#include "sched.h"
14
15#include <linux/nospec.h>
16
17#include <linux/kcov.h>
18#include <linux/scs.h>
19
20#include <asm/switch_to.h>
21#include <asm/tlb.h>
22
23#include "../workqueue_internal.h"
24#include "../../fs/io-wq.h"
25#include "../smpboot.h"
26
27#include "pelt.h"
28#include "smp.h"
29
30
31
32
33
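/*
 * Export the scheduler PELT, capacity and utilization tracepoints so that
 * GPL modules can attach probes to them.
 */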
34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
35EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
36EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
37EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
38EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
39EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
40EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
41EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
42EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
43EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
44
45DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
46
47#ifdef CONFIG_SCHED_DEBUG
48
49
50
51
52
53
54
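/*
 * Build the default sched_features bitmask from features.h: each
 * SCHED_FEAT(name, enabled) entry contributes its bit when 'enabled' is 1.
 */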
55#define SCHED_FEAT(name, enabled) \
56 (1UL << __SCHED_FEAT_##name) * enabled |
57const_debug unsigned int sysctl_sched_features =
58#include "features.h"
59 0;
60#undef SCHED_FEAT
61
62
63
64
65
66
67
68
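/*
 * Threshold (in ms) for warning about a long-delayed reschedule of the
 * current task; the _warn_once knob limits the report to one occurrence.
 */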
69__read_mostly int sysctl_resched_latency_warn_ms = 100;
70__read_mostly int sysctl_resched_latency_warn_once = 1;
71#endif
72
73
74
75
76
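/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */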
77const_debug unsigned int sysctl_sched_nr_migrate = 32;
78
79
80
81
82
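/* Period over which RT task CPU usage is measured, in us (default: 1s). */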
83unsigned int sysctl_sched_rt_period = 1000000;
84
85__read_mostly int scheduler_running;
86
87
88
89
90
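/* Part of the period that RT tasks are allowed to run, in us (default: 0.95s). */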
91int sysctl_sched_rt_runtime = 950000;
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
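/*
 * __task_rq_lock - lock the rq @p resides on; the caller must already
 * hold p->pi_lock.
 */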
190struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
191 __acquires(rq->lock)
192{
193 struct rq *rq;
194
195 lockdep_assert_held(&p->pi_lock);
196
197 for (;;) {
198 rq = task_rq(p);
199 raw_spin_lock(&rq->lock);
200 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
201 rq_pin_lock(rq, rf);
202 return rq;
203 }
204 raw_spin_unlock(&rq->lock);
205
206 while (unlikely(task_on_rq_migrating(p)))
207 cpu_relax();
208 }
209}
210
211
212
213
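/*
 * task_rq_lock - lock p->pi_lock and the rq @p resides on.
 */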
214struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
215 __acquires(p->pi_lock)
216 __acquires(rq->lock)
217{
218 struct rq *rq;
219
220 for (;;) {
221 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
222 rq = task_rq(p);
223 raw_spin_lock(&rq->lock);
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
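		/*
		 * Both locks are held, but @p may have been moved to another
		 * rq before rq->lock was taken; re-check and, if a migration
		 * is still in flight, wait it out and retry.
		 */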
241 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
242 rq_pin_lock(rq, rf);
243 return rq;
244 }
245 raw_spin_unlock(&rq->lock);
246 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
247
248 while (unlikely(task_on_rq_migrating(p)))
249 cpu_relax();
250 }
251}
252
253
254
255
256
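/*
 * update_rq_clock_task(): advance rq->clock_task by @delta, minus any time
 * accounted to hard IRQs and to paravirt steal time.
 */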
257static void update_rq_clock_task(struct rq *rq, s64 delta)
258{
259
260
261
262
263 s64 __maybe_unused steal = 0, irq_delta = 0;
264
265#ifdef CONFIG_IRQ_TIME_ACCOUNTING
266 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283 if (irq_delta > delta)
284 irq_delta = delta;
285
286 rq->prev_irq_time += irq_delta;
287 delta -= irq_delta;
288#endif
289#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
290	if (static_key_false((&paravirt_steal_rq_enabled))) {
291 steal = paravirt_steal_clock(cpu_of(rq));
292 steal -= rq->prev_steal_time_rq;
293
294 if (unlikely(steal > delta))
295 steal = delta;
296
297 rq->prev_steal_time_rq += steal;
298 delta -= steal;
299 }
300#endif
301
302 rq->clock_task += delta;
303
304#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
305 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
306 update_irq_load_avg(rq, irq_delta + steal);
307#endif
308 update_rq_clock_pelt(rq, delta);
309}
310
311void update_rq_clock(struct rq *rq)
312{
313 s64 delta;
314
315 lockdep_assert_held(&rq->lock);
316
317 if (rq->clock_update_flags & RQCF_ACT_SKIP)
318 return;
319
320#ifdef CONFIG_SCHED_DEBUG
321 if (sched_feat(WARN_DOUBLE_CLOCK))
322 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
323 rq->clock_update_flags |= RQCF_UPDATED;
324#endif
325
326 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
327 if (delta < 0)
328 return;
329 rq->clock += delta;
330 update_rq_clock_task(rq, delta);
331}
332
333#ifdef CONFIG_SCHED_HRTICK
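/*
 * Use hrtimers to deliver accurate preemption points.
 */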
334
335
336
337
338static void hrtick_clear(struct rq *rq)
339{
340 if (hrtimer_active(&rq->hrtick_timer))
341 hrtimer_cancel(&rq->hrtick_timer);
342}
343
344
345
346
347
348static enum hrtimer_restart hrtick(struct hrtimer *timer)
349{
350 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
351 struct rq_flags rf;
352
353 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
354
355 rq_lock(rq, &rf);
356 update_rq_clock(rq);
357 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
358 rq_unlock(rq, &rf);
359
360 return HRTIMER_NORESTART;
361}
362
363#ifdef CONFIG_SMP
364
365static void __hrtick_restart(struct rq *rq)
366{
367 struct hrtimer *timer = &rq->hrtick_timer;
368 ktime_t time = rq->hrtick_time;
369
370 hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
371}
372
373
374
375
376static void __hrtick_start(void *arg)
377{
378 struct rq *rq = arg;
379 struct rq_flags rf;
380
381 rq_lock(rq, &rf);
382 __hrtick_restart(rq);
383 rq_unlock(rq, &rf);
384}
385
386
387
388
389
390
391void hrtick_start(struct rq *rq, u64 delay)
392{
393 struct hrtimer *timer = &rq->hrtick_timer;
394 s64 delta;
395
396
397
398
399
400 delta = max_t(s64, delay, 10000LL);
401 rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
402
403 if (rq == this_rq())
404 __hrtick_restart(rq);
405 else
406 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
407}
408
409#else
410
411
412
413
414
415void hrtick_start(struct rq *rq, u64 delay)
416{
417
418
419
420
421 delay = max_t(u64, delay, 10000LL);
422 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
423 HRTIMER_MODE_REL_PINNED_HARD);
424}
425
426#endif
427
428static void hrtick_rq_init(struct rq *rq)
429{
430#ifdef CONFIG_SMP
431 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
432#endif
433 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
434 rq->hrtick_timer.function = hrtick;
435}
436#else
437static inline void hrtick_clear(struct rq *rq)
438{
439}
440
441static inline void hrtick_rq_init(struct rq *rq)
442{
443}
444#endif
445
446
447
448
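/*
 * Atomically OR @mask into *@ptr and return the previous value
 * (cmpxchg loop).
 */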
449#define fetch_or(ptr, mask) \
450 ({ \
451 typeof(ptr) _ptr = (ptr); \
452 typeof(mask) _mask = (mask); \
453 typeof(*_ptr) _old, _val = *_ptr; \
454 \
455 for (;;) { \
456 _old = cmpxchg(_ptr, _val, _val | _mask); \
457 if (_old == _val) \
458 break; \
459 _val = _old; \
460 } \
461 _old; \
462})
463
464#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
465
466
467
468
469
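/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG.
 * Returns true if a reschedule IPI is required (the remote task was not
 * polling), false if the polling idle loop will notice the flag on its own.
 */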
470static bool set_nr_and_not_polling(struct task_struct *p)
471{
472 struct thread_info *ti = task_thread_info(p);
473 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
474}
475
476
477
478
479
480
481
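/*
 * Atomically set TIF_NEED_RESCHED, but only if TIF_POLLING_NRFLAG is set.
 * Returns false if the flag could not be set (the task is not polling),
 * true otherwise.
 */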
482static bool set_nr_if_polling(struct task_struct *p)
483{
484 struct thread_info *ti = task_thread_info(p);
485 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
486
487 for (;;) {
488 if (!(val & _TIF_POLLING_NRFLAG))
489 return false;
490 if (val & _TIF_NEED_RESCHED)
491 return true;
492 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
493 if (old == val)
494 break;
495 val = old;
496 }
497 return true;
498}
499
500#else
501static bool set_nr_and_not_polling(struct task_struct *p)
502{
503 set_tsk_need_resched(p);
504 return true;
505}
506
507#ifdef CONFIG_SMP
508static bool set_nr_if_polling(struct task_struct *p)
509{
510 return false;
511}
512#endif
513#endif
514
515static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
516{
517 struct wake_q_node *node = &task->wake_q;
518
519
520
521
522
523
524
525
526
527 smp_mb__before_atomic();
528 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
529 return false;
530
531
532
533
534 *head->lastp = node;
535 head->lastp = &node->next;
536 return true;
537}
538
539
540
541
542
543
544
545
546
547
548
549
550
551void wake_q_add(struct wake_q_head *head, struct task_struct *task)
552{
553 if (__wake_q_add(head, task))
554 get_task_struct(task);
555}
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
575{
576 if (!__wake_q_add(head, task))
577 put_task_struct(task);
578}
579
580void wake_up_q(struct wake_q_head *head)
581{
582 struct wake_q_node *node = head->first;
583
584 while (node != WAKE_Q_TAIL) {
585 struct task_struct *task;
586
587 task = container_of(node, struct task_struct, wake_q);
588 BUG_ON(!task);
589
590 node = node->next;
591 task->wake_q.next = NULL;
592
593
594
595
596
597 wake_up_process(task);
598 put_task_struct(task);
599 }
600}
601
602
603
604
605
606
607
608
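/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On the local CPU this just sets TIF_NEED_RESCHED; for a remote CPU an
 * IPI is sent unless that CPU is polling and will notice the flag itself.
 */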
609void resched_curr(struct rq *rq)
610{
611 struct task_struct *curr = rq->curr;
612 int cpu;
613
614 lockdep_assert_held(&rq->lock);
615
616 if (test_tsk_need_resched(curr))
617 return;
618
619 cpu = cpu_of(rq);
620
621 if (cpu == smp_processor_id()) {
622 set_tsk_need_resched(curr);
623 set_preempt_need_resched();
624 return;
625 }
626
627 if (set_nr_and_not_polling(curr))
628 smp_send_reschedule(cpu);
629 else
630 trace_sched_wake_idle_without_ipi(cpu);
631}
632
633void resched_cpu(int cpu)
634{
635 struct rq *rq = cpu_rq(cpu);
636 unsigned long flags;
637
638 raw_spin_lock_irqsave(&rq->lock, flags);
639 if (cpu_online(cpu) || cpu == smp_processor_id())
640 resched_curr(rq);
641 raw_spin_unlock_irqrestore(&rq->lock, flags);
642}
643
644#ifdef CONFIG_SMP
645#ifdef CONFIG_NO_HZ_COMMON
646
647
648
649
650
651
652
653
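/*
 * When the local CPU is idle, deferrable timers are migrated to a busy
 * housekeeping CPU: pick the nearest non-idle one, falling back to any
 * housekeeping CPU.
 */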
654int get_nohz_timer_target(void)
655{
656 int i, cpu = smp_processor_id(), default_cpu = -1;
657 struct sched_domain *sd;
658
659 if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
660 if (!idle_cpu(cpu))
661 return cpu;
662 default_cpu = cpu;
663 }
664
665 rcu_read_lock();
666 for_each_domain(cpu, sd) {
667 for_each_cpu_and(i, sched_domain_span(sd),
668 housekeeping_cpumask(HK_FLAG_TIMER)) {
669 if (cpu == i)
670 continue;
671
672 if (!idle_cpu(i)) {
673 cpu = i;
674 goto unlock;
675 }
676 }
677 }
678
679 if (default_cpu == -1)
680 default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
681 cpu = default_cpu;
682unlock:
683 rcu_read_unlock();
684 return cpu;
685}
686
687
688
689
690
691
692
693
694
695
696
697static void wake_up_idle_cpu(int cpu)
698{
699 struct rq *rq = cpu_rq(cpu);
700
701 if (cpu == smp_processor_id())
702 return;
703
704 if (set_nr_and_not_polling(rq->idle))
705 smp_send_reschedule(cpu);
706 else
707 trace_sched_wake_idle_without_ipi(cpu);
708}
709
710static bool wake_up_full_nohz_cpu(int cpu)
711{
712
713
714
715
716
717
718 if (cpu_is_offline(cpu))
719 return true;
720 if (tick_nohz_full_cpu(cpu)) {
721 if (cpu != smp_processor_id() ||
722 tick_nohz_tick_stopped())
723 tick_nohz_full_kick_cpu(cpu);
724 return true;
725 }
726
727 return false;
728}
729
730
731
732
733
734
735void wake_up_nohz_cpu(int cpu)
736{
737 if (!wake_up_full_nohz_cpu(cpu))
738 wake_up_idle_cpu(cpu);
739}
740
741static void nohz_csd_func(void *info)
742{
743 struct rq *rq = info;
744 int cpu = cpu_of(rq);
745 unsigned int flags;
746
747
748
749
750 flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
751 WARN_ON(!(flags & NOHZ_KICK_MASK));
752
753 rq->idle_balance = idle_cpu(cpu);
754 if (rq->idle_balance && !need_resched()) {
755 rq->nohz_idle_balance = flags;
756 raise_softirq_irqoff(SCHED_SOFTIRQ);
757 }
758}
759
760#endif
761
762#ifdef CONFIG_NO_HZ_FULL
763bool sched_can_stop_tick(struct rq *rq)
764{
765 int fifo_nr_running;
766
767
768 if (rq->dl.dl_nr_running)
769 return false;
770
771
772
773
774
775 if (rq->rt.rr_nr_running) {
776 if (rq->rt.rr_nr_running == 1)
777 return true;
778 else
779 return false;
780 }
781
782
783
784
785
786 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
787 if (fifo_nr_running)
788 return true;
789
790
791
792
793
794
795 if (rq->nr_running > 1)
796 return false;
797
798 return true;
799}
800#endif
801#endif
802
803#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
804 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
805
806
807
808
809
810
811int walk_tg_tree_from(struct task_group *from,
812 tg_visitor down, tg_visitor up, void *data)
813{
814 struct task_group *parent, *child;
815 int ret;
816
817 parent = from;
818
819down:
820 ret = (*down)(parent, data);
821 if (ret)
822 goto out;
823 list_for_each_entry_rcu(child, &parent->children, siblings) {
824 parent = child;
825 goto down;
826
827up:
828 continue;
829 }
830 ret = (*up)(parent, data);
831 if (ret || parent == from)
832 goto out;
833
834 child = parent;
835 parent = parent->parent;
836 if (parent)
837 goto up;
838out:
839 return ret;
840}
841
842int tg_nop(struct task_group *tg, void *data)
843{
844 return 0;
845}
846#endif
847
848static void set_load_weight(struct task_struct *p, bool update_load)
849{
850 int prio = p->static_prio - MAX_RT_PRIO;
851 struct load_weight *load = &p->se.load;
852
853
854
855
856 if (task_has_idle_policy(p)) {
857 load->weight = scale_load(WEIGHT_IDLEPRIO);
858 load->inv_weight = WMULT_IDLEPRIO;
859 return;
860 }
861
862
863
864
865
866 if (update_load && p->sched_class == &fair_sched_class) {
867 reweight_task(p, prio);
868 } else {
869 load->weight = scale_load(sched_prio_to_weight[prio]);
870 load->inv_weight = sched_prio_to_wmult[prio];
871 }
872}
873
874#ifdef CONFIG_UCLAMP_TASK
875
876
877
878
879
880
881
882
883
884
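/* Serialize updates to the utilization clamp sysctls and defaults. */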
885static DEFINE_MUTEX(uclamp_mutex);
886
887
888unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
889
890
891unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
909
910
911static struct uclamp_se uclamp_default[UCLAMP_CNT];
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
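/*
 * Static key toggled on first use of utilization clamping, so that the
 * uclamp fast paths are skipped entirely until userspace actually uses it.
 */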
931DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
932
933
934#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
935
936#define for_each_clamp_id(clamp_id) \
937 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
938
939static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
940{
941 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
942}
943
944static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
945{
946 if (clamp_id == UCLAMP_MIN)
947 return 0;
948 return SCHED_CAPACITY_SCALE;
949}
950
951static inline void uclamp_se_set(struct uclamp_se *uc_se,
952 unsigned int value, bool user_defined)
953{
954 uc_se->value = value;
955 uc_se->bucket_id = uclamp_bucket_id(value);
956 uc_se->user_defined = user_defined;
957}
958
959static inline unsigned int
960uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
961 unsigned int clamp_value)
962{
963
964
965
966
967
968 if (clamp_id == UCLAMP_MAX) {
969 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
970 return clamp_value;
971 }
972
973 return uclamp_none(UCLAMP_MIN);
974}
975
976static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
977 unsigned int clamp_value)
978{
979
980 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
981 return;
982
983 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
984}
985
986static inline
987unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
988 unsigned int clamp_value)
989{
990 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
991 int bucket_id = UCLAMP_BUCKETS - 1;
992
993
994
995
996
997 for ( ; bucket_id >= 0; bucket_id--) {
998 if (!bucket[bucket_id].tasks)
999 continue;
1000 return bucket[bucket_id].value;
1001 }
1002
1003
1004 return uclamp_idle_value(rq, clamp_id, clamp_value);
1005}
1006
1007static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1008{
1009 unsigned int default_util_min;
1010 struct uclamp_se *uc_se;
1011
1012 lockdep_assert_held(&p->pi_lock);
1013
1014 uc_se = &p->uclamp_req[UCLAMP_MIN];
1015
1016
1017 if (uc_se->user_defined)
1018 return;
1019
1020 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1021 uclamp_se_set(uc_se, default_util_min, false);
1022}
1023
1024static void uclamp_update_util_min_rt_default(struct task_struct *p)
1025{
1026 struct rq_flags rf;
1027 struct rq *rq;
1028
1029 if (!rt_task(p))
1030 return;
1031
1032
1033 rq = task_rq_lock(p, &rf);
1034 __uclamp_update_util_min_rt_default(p);
1035 task_rq_unlock(rq, p, &rf);
1036}
1037
1038static void uclamp_sync_util_min_rt_default(void)
1039{
1040 struct task_struct *g, *p;
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055 read_lock(&tasklist_lock);
1056 smp_mb__after_spinlock();
1057 read_unlock(&tasklist_lock);
1058
1059 rcu_read_lock();
1060 for_each_process_thread(g, p)
1061 uclamp_update_util_min_rt_default(p);
1062 rcu_read_unlock();
1063}
1064
1065static inline struct uclamp_se
1066uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1067{
1068 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1069#ifdef CONFIG_UCLAMP_TASK_GROUP
1070 struct uclamp_se uc_max;
1071
1072
1073
1074
1075
1076 if (task_group_is_autogroup(task_group(p)))
1077 return uc_req;
1078 if (task_group(p) == &root_task_group)
1079 return uc_req;
1080
1081 uc_max = task_group(p)->uclamp[clamp_id];
1082 if (uc_req.value > uc_max.value || !uc_req.user_defined)
1083 return uc_max;
1084#endif
1085
1086 return uc_req;
1087}
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097static inline struct uclamp_se
1098uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1099{
1100 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1101 struct uclamp_se uc_max = uclamp_default[clamp_id];
1102
1103
1104 if (unlikely(uc_req.value > uc_max.value))
1105 return uc_max;
1106
1107 return uc_req;
1108}
1109
1110unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1111{
1112 struct uclamp_se uc_eff;
1113
1114
1115 if (p->uclamp[clamp_id].active)
1116 return (unsigned long)p->uclamp[clamp_id].value;
1117
1118 uc_eff = uclamp_eff_get(p, clamp_id);
1119
1120 return (unsigned long)uc_eff.value;
1121}
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1134 enum uclamp_id clamp_id)
1135{
1136 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1137 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1138 struct uclamp_bucket *bucket;
1139
1140 lockdep_assert_held(&rq->lock);
1141
1142
1143 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1144
1145 bucket = &uc_rq->bucket[uc_se->bucket_id];
1146 bucket->tasks++;
1147 uc_se->active = true;
1148
1149 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1150
1151
1152
1153
1154
1155 if (bucket->tasks == 1 || uc_se->value > bucket->value)
1156 bucket->value = uc_se->value;
1157
1158 if (uc_se->value > READ_ONCE(uc_rq->value))
1159 WRITE_ONCE(uc_rq->value, uc_se->value);
1160}
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1172 enum uclamp_id clamp_id)
1173{
1174 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1175 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1176 struct uclamp_bucket *bucket;
1177 unsigned int bkt_clamp;
1178 unsigned int rq_clamp;
1179
1180 lockdep_assert_held(&rq->lock);
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205 if (unlikely(!uc_se->active))
1206 return;
1207
1208 bucket = &uc_rq->bucket[uc_se->bucket_id];
1209
1210 SCHED_WARN_ON(!bucket->tasks);
1211 if (likely(bucket->tasks))
1212 bucket->tasks--;
1213
1214 uc_se->active = false;
1215
1216
1217
1218
1219
1220
1221
1222 if (likely(bucket->tasks))
1223 return;
1224
1225 rq_clamp = READ_ONCE(uc_rq->value);
1226
1227
1228
1229
1230 SCHED_WARN_ON(bucket->value > rq_clamp);
1231 if (bucket->value >= rq_clamp) {
1232 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1233 WRITE_ONCE(uc_rq->value, bkt_clamp);
1234 }
1235}
1236
1237static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1238{
1239 enum uclamp_id clamp_id;
1240
1241
1242
1243
1244
1245
1246
1247 if (!static_branch_unlikely(&sched_uclamp_used))
1248 return;
1249
1250 if (unlikely(!p->sched_class->uclamp_enabled))
1251 return;
1252
1253 for_each_clamp_id(clamp_id)
1254 uclamp_rq_inc_id(rq, p, clamp_id);
1255
1256
1257 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1258 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1259}
1260
1261static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1262{
1263 enum uclamp_id clamp_id;
1264
1265
1266
1267
1268
1269
1270
1271 if (!static_branch_unlikely(&sched_uclamp_used))
1272 return;
1273
1274 if (unlikely(!p->sched_class->uclamp_enabled))
1275 return;
1276
1277 for_each_clamp_id(clamp_id)
1278 uclamp_rq_dec_id(rq, p, clamp_id);
1279}
1280
1281static inline void
1282uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1283{
1284 struct rq_flags rf;
1285 struct rq *rq;
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295 rq = task_rq_lock(p, &rf);
1296
1297
1298
1299
1300
1301
1302
1303 if (p->uclamp[clamp_id].active) {
1304 uclamp_rq_dec_id(rq, p, clamp_id);
1305 uclamp_rq_inc_id(rq, p, clamp_id);
1306 }
1307
1308 task_rq_unlock(rq, p, &rf);
1309}
1310
1311#ifdef CONFIG_UCLAMP_TASK_GROUP
1312static inline void
1313uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1314 unsigned int clamps)
1315{
1316 enum uclamp_id clamp_id;
1317 struct css_task_iter it;
1318 struct task_struct *p;
1319
1320 css_task_iter_start(css, 0, &it);
1321 while ((p = css_task_iter_next(&it))) {
1322 for_each_clamp_id(clamp_id) {
1323 if ((0x1 << clamp_id) & clamps)
1324 uclamp_update_active(p, clamp_id);
1325 }
1326 }
1327 css_task_iter_end(&it);
1328}
1329
1330static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1331static void uclamp_update_root_tg(void)
1332{
1333 struct task_group *tg = &root_task_group;
1334
1335 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1336 sysctl_sched_uclamp_util_min, false);
1337 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1338 sysctl_sched_uclamp_util_max, false);
1339
1340 rcu_read_lock();
1341 cpu_util_update_eff(&root_task_group.css);
1342 rcu_read_unlock();
1343}
1344#else
1345static void uclamp_update_root_tg(void) { }
1346#endif
1347
1348int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1349 void *buffer, size_t *lenp, loff_t *ppos)
1350{
1351 bool update_root_tg = false;
1352 int old_min, old_max, old_min_rt;
1353 int result;
1354
1355 mutex_lock(&uclamp_mutex);
1356 old_min = sysctl_sched_uclamp_util_min;
1357 old_max = sysctl_sched_uclamp_util_max;
1358 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1359
1360 result = proc_dointvec(table, write, buffer, lenp, ppos);
1361 if (result)
1362 goto undo;
1363 if (!write)
1364 goto done;
1365
1366 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1367 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1368 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1369
1370 result = -EINVAL;
1371 goto undo;
1372 }
1373
1374 if (old_min != sysctl_sched_uclamp_util_min) {
1375 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1376 sysctl_sched_uclamp_util_min, false);
1377 update_root_tg = true;
1378 }
1379 if (old_max != sysctl_sched_uclamp_util_max) {
1380 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1381 sysctl_sched_uclamp_util_max, false);
1382 update_root_tg = true;
1383 }
1384
1385 if (update_root_tg) {
1386 static_branch_enable(&sched_uclamp_used);
1387 uclamp_update_root_tg();
1388 }
1389
1390 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1391 static_branch_enable(&sched_uclamp_used);
1392 uclamp_sync_util_min_rt_default();
1393 }
1394
1395
1396
1397
1398
1399
1400
1401 goto done;
1402
1403undo:
1404 sysctl_sched_uclamp_util_min = old_min;
1405 sysctl_sched_uclamp_util_max = old_max;
1406 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1407done:
1408 mutex_unlock(&uclamp_mutex);
1409
1410 return result;
1411}
1412
1413static int uclamp_validate(struct task_struct *p,
1414 const struct sched_attr *attr)
1415{
1416 int util_min = p->uclamp_req[UCLAMP_MIN].value;
1417 int util_max = p->uclamp_req[UCLAMP_MAX].value;
1418
1419 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1420 util_min = attr->sched_util_min;
1421
1422 if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1423 return -EINVAL;
1424 }
1425
1426 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1427 util_max = attr->sched_util_max;
1428
1429 if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1430 return -EINVAL;
1431 }
1432
1433 if (util_min != -1 && util_max != -1 && util_min > util_max)
1434 return -EINVAL;
1435
1436
1437
1438
1439
1440
1441
1442
1443 static_branch_enable(&sched_uclamp_used);
1444
1445 return 0;
1446}
1447
1448static bool uclamp_reset(const struct sched_attr *attr,
1449 enum uclamp_id clamp_id,
1450 struct uclamp_se *uc_se)
1451{
1452
1453 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1454 !uc_se->user_defined)
1455 return true;
1456
1457
1458 if (clamp_id == UCLAMP_MIN &&
1459 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1460 attr->sched_util_min == -1) {
1461 return true;
1462 }
1463
1464 if (clamp_id == UCLAMP_MAX &&
1465 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1466 attr->sched_util_max == -1) {
1467 return true;
1468 }
1469
1470 return false;
1471}
1472
1473static void __setscheduler_uclamp(struct task_struct *p,
1474 const struct sched_attr *attr)
1475{
1476 enum uclamp_id clamp_id;
1477
1478 for_each_clamp_id(clamp_id) {
1479 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1480 unsigned int value;
1481
1482 if (!uclamp_reset(attr, clamp_id, uc_se))
1483 continue;
1484
1485
1486
1487
1488
1489 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1490 value = sysctl_sched_uclamp_util_min_rt_default;
1491 else
1492 value = uclamp_none(clamp_id);
1493
1494 uclamp_se_set(uc_se, value, false);
1495
1496 }
1497
1498 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1499 return;
1500
1501 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1502 attr->sched_util_min != -1) {
1503 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1504 attr->sched_util_min, true);
1505 }
1506
1507 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1508 attr->sched_util_max != -1) {
1509 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1510 attr->sched_util_max, true);
1511 }
1512}
1513
1514static void uclamp_fork(struct task_struct *p)
1515{
1516 enum uclamp_id clamp_id;
1517
1518
1519
1520
1521
1522 for_each_clamp_id(clamp_id)
1523 p->uclamp[clamp_id].active = false;
1524
1525 if (likely(!p->sched_reset_on_fork))
1526 return;
1527
1528 for_each_clamp_id(clamp_id) {
1529 uclamp_se_set(&p->uclamp_req[clamp_id],
1530 uclamp_none(clamp_id), false);
1531 }
1532}
1533
1534static void uclamp_post_fork(struct task_struct *p)
1535{
1536 uclamp_update_util_min_rt_default(p);
1537}
1538
1539static void __init init_uclamp_rq(struct rq *rq)
1540{
1541 enum uclamp_id clamp_id;
1542 struct uclamp_rq *uc_rq = rq->uclamp;
1543
1544 for_each_clamp_id(clamp_id) {
1545 uc_rq[clamp_id] = (struct uclamp_rq) {
1546 .value = uclamp_none(clamp_id)
1547 };
1548 }
1549
1550 rq->uclamp_flags = 0;
1551}
1552
1553static void __init init_uclamp(void)
1554{
1555 struct uclamp_se uc_max = {};
1556 enum uclamp_id clamp_id;
1557 int cpu;
1558
1559 for_each_possible_cpu(cpu)
1560 init_uclamp_rq(cpu_rq(cpu));
1561
1562 for_each_clamp_id(clamp_id) {
1563 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1564 uclamp_none(clamp_id), false);
1565 }
1566
1567
1568 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1569 for_each_clamp_id(clamp_id) {
1570 uclamp_default[clamp_id] = uc_max;
1571#ifdef CONFIG_UCLAMP_TASK_GROUP
1572 root_task_group.uclamp_req[clamp_id] = uc_max;
1573 root_task_group.uclamp[clamp_id] = uc_max;
1574#endif
1575 }
1576}
1577
1578#else
1579static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1580static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1581static inline int uclamp_validate(struct task_struct *p,
1582 const struct sched_attr *attr)
1583{
1584 return -EOPNOTSUPP;
1585}
1586static void __setscheduler_uclamp(struct task_struct *p,
1587 const struct sched_attr *attr) { }
1588static inline void uclamp_fork(struct task_struct *p) { }
1589static inline void uclamp_post_fork(struct task_struct *p) { }
1590static inline void init_uclamp(void) { }
1591#endif
1592
1593static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1594{
1595 if (!(flags & ENQUEUE_NOCLOCK))
1596 update_rq_clock(rq);
1597
1598 if (!(flags & ENQUEUE_RESTORE)) {
1599 sched_info_queued(rq, p);
1600 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1601 }
1602
1603 uclamp_rq_inc(rq, p);
1604 p->sched_class->enqueue_task(rq, p, flags);
1605}
1606
1607static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1608{
1609 if (!(flags & DEQUEUE_NOCLOCK))
1610 update_rq_clock(rq);
1611
1612 if (!(flags & DEQUEUE_SAVE)) {
1613 sched_info_dequeued(rq, p);
1614 psi_dequeue(p, flags & DEQUEUE_SLEEP);
1615 }
1616
1617 uclamp_rq_dec(rq, p);
1618 p->sched_class->dequeue_task(rq, p, flags);
1619}
1620
1621void activate_task(struct rq *rq, struct task_struct *p, int flags)
1622{
1623 enqueue_task(rq, p, flags);
1624
1625 p->on_rq = TASK_ON_RQ_QUEUED;
1626}
1627
1628void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1629{
1630 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1631
1632 dequeue_task(rq, p, flags);
1633}
1634
1635
1636
1637
1638static inline int __normal_prio(struct task_struct *p)
1639{
1640 return p->static_prio;
1641}
1642
1643
1644
1645
1646
1647
1648
1649
1650static inline int normal_prio(struct task_struct *p)
1651{
1652 int prio;
1653
1654 if (task_has_dl_policy(p))
1655 prio = MAX_DL_PRIO-1;
1656 else if (task_has_rt_policy(p))
1657 prio = MAX_RT_PRIO-1 - p->rt_priority;
1658 else
1659 prio = __normal_prio(p);
1660 return prio;
1661}
1662
1663
1664
1665
1666
1667
1668
1669
1670static int effective_prio(struct task_struct *p)
1671{
1672 p->normal_prio = normal_prio(p);
1673
1674
1675
1676
1677
1678 if (!rt_prio(p->prio))
1679 return p->normal_prio;
1680 return p->prio;
1681}
1682
1683
1684
1685
1686
1687
1688
1689inline int task_curr(const struct task_struct *p)
1690{
1691 return cpu_curr(task_cpu(p)) == p;
1692}
1693
1694
1695
1696
1697
1698
1699
1700
1701static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1702 const struct sched_class *prev_class,
1703 int oldprio)
1704{
1705 if (prev_class != p->sched_class) {
1706 if (prev_class->switched_from)
1707 prev_class->switched_from(rq, p);
1708
1709 p->sched_class->switched_to(rq, p);
1710 } else if (oldprio != p->prio || dl_task(p))
1711 p->sched_class->prio_changed(rq, p, oldprio);
1712}
1713
1714void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1715{
1716 if (p->sched_class == rq->curr->sched_class)
1717 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1718 else if (p->sched_class > rq->curr->sched_class)
1719 resched_curr(rq);
1720
1721
1722
1723
1724
1725 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1726 rq_clock_skip_update(rq);
1727}
1728
1729#ifdef CONFIG_SMP
1730
1731static void
1732__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
1733
1734static int __set_cpus_allowed_ptr(struct task_struct *p,
1735 const struct cpumask *new_mask,
1736 u32 flags);
1737
1738static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
1739{
1740 if (likely(!p->migration_disabled))
1741 return;
1742
1743 if (p->cpus_ptr != &p->cpus_mask)
1744 return;
1745
1746
1747
1748
1749 __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
1750}
1751
1752void migrate_disable(void)
1753{
1754 struct task_struct *p = current;
1755
1756 if (p->migration_disabled) {
1757 p->migration_disabled++;
1758 return;
1759 }
1760
1761 preempt_disable();
1762 this_rq()->nr_pinned++;
1763 p->migration_disabled = 1;
1764 preempt_enable();
1765}
1766EXPORT_SYMBOL_GPL(migrate_disable);
1767
1768void migrate_enable(void)
1769{
1770 struct task_struct *p = current;
1771
1772 if (p->migration_disabled > 1) {
1773 p->migration_disabled--;
1774 return;
1775 }
1776
1777
1778
1779
1780
1781 preempt_disable();
1782 if (p->cpus_ptr != &p->cpus_mask)
1783 __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
1784
1785
1786
1787
1788
1789 barrier();
1790 p->migration_disabled = 0;
1791 this_rq()->nr_pinned--;
1792 preempt_enable();
1793}
1794EXPORT_SYMBOL_GPL(migrate_enable);
1795
1796static inline bool rq_has_pinned_tasks(struct rq *rq)
1797{
1798 return rq->nr_pinned;
1799}
1800
1801
1802
1803
1804
1805static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1806{
1807
1808 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
1809 return false;
1810
1811
1812 if (is_migration_disabled(p))
1813 return cpu_online(cpu);
1814
1815
1816 if (!(p->flags & PF_KTHREAD))
1817 return cpu_active(cpu);
1818
1819
1820 if (kthread_is_per_cpu(p))
1821 return cpu_online(cpu);
1822
1823
1824 if (cpu_dying(cpu))
1825 return false;
1826
1827
1828 return cpu_online(cpu);
1829}
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
1851 struct task_struct *p, int new_cpu)
1852{
1853 lockdep_assert_held(&rq->lock);
1854
1855 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1856 set_task_cpu(p, new_cpu);
1857 rq_unlock(rq, rf);
1858
1859 rq = cpu_rq(new_cpu);
1860
1861 rq_lock(rq, rf);
1862 BUG_ON(task_cpu(p) != new_cpu);
1863 activate_task(rq, p, 0);
1864 check_preempt_curr(rq, p, 0);
1865
1866 return rq;
1867}
1868
1869struct migration_arg {
1870 struct task_struct *task;
1871 int dest_cpu;
1872 struct set_affinity_pending *pending;
1873};
1874
1875
1876
1877
1878
1879struct set_affinity_pending {
1880 refcount_t refs;
1881 unsigned int stop_pending;
1882 struct completion done;
1883 struct cpu_stop_work stop_work;
1884 struct migration_arg arg;
1885};
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
1897 struct task_struct *p, int dest_cpu)
1898{
1899
1900 if (!is_cpu_allowed(p, dest_cpu))
1901 return rq;
1902
1903 update_rq_clock(rq);
1904 rq = move_queued_task(rq, rf, p, dest_cpu);
1905
1906 return rq;
1907}
1908
1909
1910
1911
1912
1913
1914static int migration_cpu_stop(void *data)
1915{
1916 struct migration_arg *arg = data;
1917 struct set_affinity_pending *pending = arg->pending;
1918 struct task_struct *p = arg->task;
1919 int dest_cpu = arg->dest_cpu;
1920 struct rq *rq = this_rq();
1921 bool complete = false;
1922 struct rq_flags rf;
1923
1924
1925
1926
1927
1928 local_irq_save(rf.flags);
1929
1930
1931
1932
1933
1934 flush_smp_call_function_from_idle();
1935
1936 raw_spin_lock(&p->pi_lock);
1937 rq_lock(rq, &rf);
1938
1939
1940
1941
1942
1943 WARN_ON_ONCE(pending && pending != p->migration_pending);
1944
1945
1946
1947
1948
1949
1950 if (task_rq(p) == rq) {
1951 if (is_migration_disabled(p))
1952 goto out;
1953
1954 if (pending) {
1955 p->migration_pending = NULL;
1956 complete = true;
1957 }
1958
1959 if (dest_cpu < 0) {
1960 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
1961 goto out;
1962
1963 dest_cpu = cpumask_any_distribute(&p->cpus_mask);
1964 }
1965
1966 if (task_on_rq_queued(p))
1967 rq = __migrate_task(rq, &rf, p, dest_cpu);
1968 else
1969 p->wake_cpu = dest_cpu;
1970
1971
1972
1973
1974
1975
1976
1977
1978 } else if (pending) {
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993 if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
1994 p->migration_pending = NULL;
1995 complete = true;
1996 goto out;
1997 }
1998
1999
2000
2001
2002
2003
2004 WARN_ON_ONCE(!pending->stop_pending);
2005 task_rq_unlock(rq, p, &rf);
2006 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2007 &pending->arg, &pending->stop_work);
2008 return 0;
2009 }
2010out:
2011 if (pending)
2012 pending->stop_pending = false;
2013 task_rq_unlock(rq, p, &rf);
2014
2015 if (complete)
2016 complete_all(&pending->done);
2017
2018 return 0;
2019}
2020
2021int push_cpu_stop(void *arg)
2022{
2023 struct rq *lowest_rq = NULL, *rq = this_rq();
2024 struct task_struct *p = arg;
2025
2026 raw_spin_lock_irq(&p->pi_lock);
2027 raw_spin_lock(&rq->lock);
2028
2029 if (task_rq(p) != rq)
2030 goto out_unlock;
2031
2032 if (is_migration_disabled(p)) {
2033 p->migration_flags |= MDF_PUSH;
2034 goto out_unlock;
2035 }
2036
2037 p->migration_flags &= ~MDF_PUSH;
2038
2039 if (p->sched_class->find_lock_rq)
2040 lowest_rq = p->sched_class->find_lock_rq(p, rq);
2041
2042 if (!lowest_rq)
2043 goto out_unlock;
2044
2045
2046 if (task_rq(p) == rq) {
2047 deactivate_task(rq, p, 0);
2048 set_task_cpu(p, lowest_rq->cpu);
2049 activate_task(lowest_rq, p, 0);
2050 resched_curr(lowest_rq);
2051 }
2052
2053 double_unlock_balance(rq, lowest_rq);
2054
2055out_unlock:
2056 rq->push_busy = false;
2057 raw_spin_unlock(&rq->lock);
2058 raw_spin_unlock_irq(&p->pi_lock);
2059
2060 put_task_struct(p);
2061 return 0;
2062}
2063
2064
2065
2066
2067
2068void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2069{
2070 if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2071 p->cpus_ptr = new_mask;
2072 return;
2073 }
2074
2075 cpumask_copy(&p->cpus_mask, new_mask);
2076 p->nr_cpus_allowed = cpumask_weight(new_mask);
2077}
2078
2079static void
2080__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2081{
2082 struct rq *rq = task_rq(p);
2083 bool queued, running;
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097 if (flags & SCA_MIGRATE_DISABLE)
2098 SCHED_WARN_ON(!p->on_cpu);
2099 else
2100 lockdep_assert_held(&p->pi_lock);
2101
2102 queued = task_on_rq_queued(p);
2103 running = task_current(rq, p);
2104
2105 if (queued) {
2106
2107
2108
2109
2110 lockdep_assert_held(&rq->lock);
2111 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2112 }
2113 if (running)
2114 put_prev_task(rq, p);
2115
2116 p->sched_class->set_cpus_allowed(p, new_mask, flags);
2117
2118 if (queued)
2119 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2120 if (running)
2121 set_next_task(rq, p);
2122}
2123
2124void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2125{
2126 __do_set_cpus_allowed(p, new_mask, 0);
2127}
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
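/*
 * affine_move_task - handle the migration implied by an affinity change:
 * if @p already sits on an allowed CPU just clean up any pending request,
 * otherwise queue work on the stopper thread and (except when called from
 * migrate_enable()) wait for the migration to complete.
 */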
2205static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2206 int dest_cpu, unsigned int flags)
2207{
2208 struct set_affinity_pending my_pending = { }, *pending = NULL;
2209 bool stop_pending, complete = false;
2210
2211
2212 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2213 struct task_struct *push_task = NULL;
2214
2215 if ((flags & SCA_MIGRATE_ENABLE) &&
2216 (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2217 rq->push_busy = true;
2218 push_task = get_task_struct(p);
2219 }
2220
2221
2222
2223
2224
2225 pending = p->migration_pending;
2226 if (pending && !pending->stop_pending) {
2227 p->migration_pending = NULL;
2228 complete = true;
2229 }
2230
2231 task_rq_unlock(rq, p, rf);
2232
2233 if (push_task) {
2234 stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2235 p, &rq->push_work);
2236 }
2237
2238 if (complete)
2239 complete_all(&pending->done);
2240
2241 return 0;
2242 }
2243
2244 if (!(flags & SCA_MIGRATE_ENABLE)) {
2245
2246 if (!p->migration_pending) {
2247
2248 refcount_set(&my_pending.refs, 1);
2249 init_completion(&my_pending.done);
2250 my_pending.arg = (struct migration_arg) {
2251 .task = p,
2252 .dest_cpu = -1,
2253 .pending = &my_pending,
2254 };
2255
2256 p->migration_pending = &my_pending;
2257 } else {
2258 pending = p->migration_pending;
2259 refcount_inc(&pending->refs);
2260 }
2261 }
2262 pending = p->migration_pending;
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275 if (WARN_ON_ONCE(!pending)) {
2276 task_rq_unlock(rq, p, rf);
2277 return -EINVAL;
2278 }
2279
2280 if (task_running(rq, p) || p->state == TASK_WAKING) {
2281
2282
2283
2284
2285
2286 stop_pending = pending->stop_pending;
2287 if (!stop_pending)
2288 pending->stop_pending = true;
2289
2290 if (flags & SCA_MIGRATE_ENABLE)
2291 p->migration_flags &= ~MDF_PUSH;
2292
2293 task_rq_unlock(rq, p, rf);
2294
2295 if (!stop_pending) {
2296 stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2297 &pending->arg, &pending->stop_work);
2298 }
2299
2300 if (flags & SCA_MIGRATE_ENABLE)
2301 return 0;
2302 } else {
2303
2304 if (!is_migration_disabled(p)) {
2305 if (task_on_rq_queued(p))
2306 rq = move_queued_task(rq, rf, p, dest_cpu);
2307
2308 if (!pending->stop_pending) {
2309 p->migration_pending = NULL;
2310 complete = true;
2311 }
2312 }
2313 task_rq_unlock(rq, p, rf);
2314
2315 if (complete)
2316 complete_all(&pending->done);
2317 }
2318
2319 wait_for_completion(&pending->done);
2320
2321 if (refcount_dec_and_test(&pending->refs))
2322 wake_up_var(&pending->refs);
2323
2324
2325
2326
2327
2328 wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2329
2330
2331 WARN_ON_ONCE(my_pending.stop_pending);
2332
2333 return 0;
2334}
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345static int __set_cpus_allowed_ptr(struct task_struct *p,
2346 const struct cpumask *new_mask,
2347 u32 flags)
2348{
2349 const struct cpumask *cpu_valid_mask = cpu_active_mask;
2350 unsigned int dest_cpu;
2351 struct rq_flags rf;
2352 struct rq *rq;
2353 int ret = 0;
2354
2355 rq = task_rq_lock(p, &rf);
2356 update_rq_clock(rq);
2357
2358 if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369 cpu_valid_mask = cpu_online_mask;
2370 }
2371
2372
2373
2374
2375
2376 if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2377 ret = -EINVAL;
2378 goto out;
2379 }
2380
2381 if (!(flags & SCA_MIGRATE_ENABLE)) {
2382 if (cpumask_equal(&p->cpus_mask, new_mask))
2383 goto out;
2384
2385 if (WARN_ON_ONCE(p == current &&
2386 is_migration_disabled(p) &&
2387 !cpumask_test_cpu(task_cpu(p), new_mask))) {
2388 ret = -EBUSY;
2389 goto out;
2390 }
2391 }
2392
2393
2394
2395
2396
2397
2398 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2399 if (dest_cpu >= nr_cpu_ids) {
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403
2404 __do_set_cpus_allowed(p, new_mask, flags);
2405
2406 return affine_move_task(rq, p, &rf, dest_cpu, flags);
2407
2408out:
2409 task_rq_unlock(rq, p, &rf);
2410
2411 return ret;
2412}
2413
2414int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2415{
2416 return __set_cpus_allowed_ptr(p, new_mask, 0);
2417}
2418EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2419
2420void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2421{
2422#ifdef CONFIG_SCHED_DEBUG
2423
2424
2425
2426
2427 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2428 !p->on_rq);
2429
2430
2431
2432
2433
2434
2435 WARN_ON_ONCE(p->state == TASK_RUNNING &&
2436 p->sched_class == &fair_sched_class &&
2437 (p->on_rq && !task_on_rq_migrating(p)));
2438
2439#ifdef CONFIG_LOCKDEP
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2451 lockdep_is_held(&task_rq(p)->lock)));
2452#endif
2453
2454
2455
2456 WARN_ON_ONCE(!cpu_online(new_cpu));
2457
2458 WARN_ON_ONCE(is_migration_disabled(p));
2459#endif
2460
2461 trace_sched_migrate_task(p, new_cpu);
2462
2463 if (task_cpu(p) != new_cpu) {
2464 if (p->sched_class->migrate_task_rq)
2465 p->sched_class->migrate_task_rq(p, new_cpu);
2466 p->se.nr_migrations++;
2467 rseq_migrate(p);
2468 perf_event_task_migrate(p);
2469 }
2470
2471 __set_task_cpu(p, new_cpu);
2472}
2473
2474#ifdef CONFIG_NUMA_BALANCING
2475static void __migrate_swap_task(struct task_struct *p, int cpu)
2476{
2477 if (task_on_rq_queued(p)) {
2478 struct rq *src_rq, *dst_rq;
2479 struct rq_flags srf, drf;
2480
2481 src_rq = task_rq(p);
2482 dst_rq = cpu_rq(cpu);
2483
2484 rq_pin_lock(src_rq, &srf);
2485 rq_pin_lock(dst_rq, &drf);
2486
2487 deactivate_task(src_rq, p, 0);
2488 set_task_cpu(p, cpu);
2489 activate_task(dst_rq, p, 0);
2490 check_preempt_curr(dst_rq, p, 0);
2491
2492 rq_unpin_lock(dst_rq, &drf);
2493 rq_unpin_lock(src_rq, &srf);
2494
2495 } else {
2496
2497
2498
2499
2500
2501 p->wake_cpu = cpu;
2502 }
2503}
2504
2505struct migration_swap_arg {
2506 struct task_struct *src_task, *dst_task;
2507 int src_cpu, dst_cpu;
2508};
2509
2510static int migrate_swap_stop(void *data)
2511{
2512 struct migration_swap_arg *arg = data;
2513 struct rq *src_rq, *dst_rq;
2514 int ret = -EAGAIN;
2515
2516 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
2517 return -EAGAIN;
2518
2519 src_rq = cpu_rq(arg->src_cpu);
2520 dst_rq = cpu_rq(arg->dst_cpu);
2521
2522 double_raw_lock(&arg->src_task->pi_lock,
2523 &arg->dst_task->pi_lock);
2524 double_rq_lock(src_rq, dst_rq);
2525
2526 if (task_cpu(arg->dst_task) != arg->dst_cpu)
2527 goto unlock;
2528
2529 if (task_cpu(arg->src_task) != arg->src_cpu)
2530 goto unlock;
2531
2532 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
2533 goto unlock;
2534
2535 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
2536 goto unlock;
2537
2538 __migrate_swap_task(arg->src_task, arg->dst_cpu);
2539 __migrate_swap_task(arg->dst_task, arg->src_cpu);
2540
2541 ret = 0;
2542
2543unlock:
2544 double_rq_unlock(src_rq, dst_rq);
2545 raw_spin_unlock(&arg->dst_task->pi_lock);
2546 raw_spin_unlock(&arg->src_task->pi_lock);
2547
2548 return ret;
2549}
2550
2551
2552
2553
2554int migrate_swap(struct task_struct *cur, struct task_struct *p,
2555 int target_cpu, int curr_cpu)
2556{
2557 struct migration_swap_arg arg;
2558 int ret = -EINVAL;
2559
2560 arg = (struct migration_swap_arg){
2561 .src_task = cur,
2562 .src_cpu = curr_cpu,
2563 .dst_task = p,
2564 .dst_cpu = target_cpu,
2565 };
2566
2567 if (arg.src_cpu == arg.dst_cpu)
2568 goto out;
2569
2570
2571
2572
2573
2574 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
2575 goto out;
2576
2577 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
2578 goto out;
2579
2580 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
2581 goto out;
2582
2583 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2584 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2585
2586out:
2587 return ret;
2588}
2589#endif
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2608{
2609 int running, queued;
2610 struct rq_flags rf;
2611 unsigned long ncsw;
2612 struct rq *rq;
2613
2614 for (;;) {
2615
2616
2617
2618
2619
2620
2621 rq = task_rq(p);
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634 while (task_running(rq, p)) {
2635 if (match_state && unlikely(p->state != match_state))
2636 return 0;
2637 cpu_relax();
2638 }
2639
2640
2641
2642
2643
2644
2645 rq = task_rq_lock(p, &rf);
2646 trace_sched_wait_task(p);
2647 running = task_running(rq, p);
2648 queued = task_on_rq_queued(p);
2649 ncsw = 0;
2650 if (!match_state || p->state == match_state)
2651 ncsw = p->nvcsw | LONG_MIN;
2652 task_rq_unlock(rq, p, &rf);
2653
2654
2655
2656
2657 if (unlikely(!ncsw))
2658 break;
2659
2660
2661
2662
2663
2664
2665
2666 if (unlikely(running)) {
2667 cpu_relax();
2668 continue;
2669 }
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680 if (unlikely(queued)) {
2681 ktime_t to = NSEC_PER_SEC / HZ;
2682
2683 set_current_state(TASK_UNINTERRUPTIBLE);
2684 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2685 continue;
2686 }
2687
2688
2689
2690
2691
2692
2693 break;
2694 }
2695
2696 return ncsw;
2697}
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712void kick_process(struct task_struct *p)
2713{
2714 int cpu;
2715
2716 preempt_disable();
2717 cpu = task_cpu(p);
2718 if ((cpu != smp_processor_id()) && task_curr(p))
2719 smp_send_reschedule(cpu);
2720 preempt_enable();
2721}
2722EXPORT_SYMBOL_GPL(kick_process);
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
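/*
 * select_fallback_rq - pick a usable CPU for @p when the chosen one is not
 * allowed or not online: prefer CPUs on the same NUMA node, then any allowed
 * CPU, progressively relaxing the cpuset/possible-mask restrictions.
 */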
2746static int select_fallback_rq(int cpu, struct task_struct *p)
2747{
2748 int nid = cpu_to_node(cpu);
2749 const struct cpumask *nodemask = NULL;
2750 enum { cpuset, possible, fail } state = cpuset;
2751 int dest_cpu;
2752
2753
2754
2755
2756
2757
2758 if (nid != -1) {
2759 nodemask = cpumask_of_node(nid);
2760
2761
2762 for_each_cpu(dest_cpu, nodemask) {
2763 if (!cpu_active(dest_cpu))
2764 continue;
2765 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
2766 return dest_cpu;
2767 }
2768 }
2769
2770 for (;;) {
2771
2772 for_each_cpu(dest_cpu, p->cpus_ptr) {
2773 if (!is_cpu_allowed(p, dest_cpu))
2774 continue;
2775
2776 goto out;
2777 }
2778
2779
2780 switch (state) {
2781 case cpuset:
2782 if (IS_ENABLED(CONFIG_CPUSETS)) {
2783 cpuset_cpus_allowed_fallback(p);
2784 state = possible;
2785 break;
2786 }
2787 fallthrough;
2788 case possible:
2789
2790
2791
2792
2793
2794
2795 do_set_cpus_allowed(p, cpu_possible_mask);
2796 state = fail;
2797 break;
2798
2799 case fail:
2800 BUG();
2801 break;
2802 }
2803 }
2804
2805out:
2806 if (state != cpuset) {
2807
2808
2809
2810
2811
2812 if (p->mm && printk_ratelimit()) {
2813 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
2814 task_pid_nr(p), p->comm, cpu);
2815 }
2816 }
2817
2818 return dest_cpu;
2819}
2820
2821
2822
2823
2824static inline
2825int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
2826{
2827 lockdep_assert_held(&p->pi_lock);
2828
2829 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
2830 cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
2831 else
2832 cpu = cpumask_any(p->cpus_ptr);
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844 if (unlikely(!is_cpu_allowed(p, cpu)))
2845 cpu = select_fallback_rq(task_cpu(p), p);
2846
2847 return cpu;
2848}
2849
2850void sched_set_stop_task(int cpu, struct task_struct *stop)
2851{
2852 static struct lock_class_key stop_pi_lock;
2853 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2854 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2855
2856 if (stop) {
2857
2858
2859
2860
2861
2862
2863
2864
2865		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2866
2867 stop->sched_class = &stop_sched_class;
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881 lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
2882 }
2883
2884 cpu_rq(cpu)->stop = stop;
2885
2886 if (old_stop) {
2887
2888
2889
2890
2891 old_stop->sched_class = &rt_sched_class;
2892 }
2893}
2894
2895#else
2896
2897static inline int __set_cpus_allowed_ptr(struct task_struct *p,
2898 const struct cpumask *new_mask,
2899 u32 flags)
2900{
2901 return set_cpus_allowed_ptr(p, new_mask);
2902}
2903
2904static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
2905
2906static inline bool rq_has_pinned_tasks(struct rq *rq)
2907{
2908 return false;
2909}
2910
2911#endif
2912
2913static void
2914ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2915{
2916 struct rq *rq;
2917
2918 if (!schedstat_enabled())
2919 return;
2920
2921 rq = this_rq();
2922
2923#ifdef CONFIG_SMP
2924 if (cpu == rq->cpu) {
2925 __schedstat_inc(rq->ttwu_local);
2926 __schedstat_inc(p->se.statistics.nr_wakeups_local);
2927 } else {
2928 struct sched_domain *sd;
2929
2930 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
2931 rcu_read_lock();
2932 for_each_domain(rq->cpu, sd) {
2933 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2934 __schedstat_inc(sd->ttwu_wake_remote);
2935 break;
2936 }
2937 }
2938 rcu_read_unlock();
2939 }
2940
2941 if (wake_flags & WF_MIGRATED)
2942 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
2943#endif
2944
2945 __schedstat_inc(rq->ttwu_count);
2946 __schedstat_inc(p->se.statistics.nr_wakeups);
2947
2948 if (wake_flags & WF_SYNC)
2949 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
2950}
2951
2952
2953
2954
2955static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
2956 struct rq_flags *rf)
2957{
2958 check_preempt_curr(rq, p, wake_flags);
2959 p->state = TASK_RUNNING;
2960 trace_sched_wakeup(p);
2961
2962#ifdef CONFIG_SMP
2963 if (p->sched_class->task_woken) {
2964
2965
2966
2967
2968 rq_unpin_lock(rq, rf);
2969 p->sched_class->task_woken(rq, p);
2970 rq_repin_lock(rq, rf);
2971 }
2972
2973 if (rq->idle_stamp) {
2974 u64 delta = rq_clock(rq) - rq->idle_stamp;
2975 u64 max = 2*rq->max_idle_balance_cost;
2976
2977 update_avg(&rq->avg_idle, delta);
2978
2979 if (rq->avg_idle > max)
2980 rq->avg_idle = max;
2981
2982 rq->idle_stamp = 0;
2983 }
2984#endif
2985}
2986
2987static void
2988ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
2989 struct rq_flags *rf)
2990{
2991 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
2992
2993 lockdep_assert_held(&rq->lock);
2994
2995 if (p->sched_contributes_to_load)
2996 rq->nr_uninterruptible--;
2997
2998#ifdef CONFIG_SMP
2999 if (wake_flags & WF_MIGRATED)
3000 en_flags |= ENQUEUE_MIGRATED;
3001 else
3002#endif
3003 if (p->in_iowait) {
3004 delayacct_blkio_end(p);
3005 atomic_dec(&task_rq(p)->nr_iowait);
3006 }
3007
3008 activate_task(rq, p, en_flags);
3009 ttwu_do_wakeup(rq, p, wake_flags, rf);
3010}
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037static int ttwu_runnable(struct task_struct *p, int wake_flags)
3038{
3039 struct rq_flags rf;
3040 struct rq *rq;
3041 int ret = 0;
3042
3043 rq = __task_rq_lock(p, &rf);
3044 if (task_on_rq_queued(p)) {
3045
3046 update_rq_clock(rq);
3047 ttwu_do_wakeup(rq, p, wake_flags, &rf);
3048 ret = 1;
3049 }
3050 __task_rq_unlock(rq, &rf);
3051
3052 return ret;
3053}
3054
3055#ifdef CONFIG_SMP
3056void sched_ttwu_pending(void *arg)
3057{
3058 struct llist_node *llist = arg;
3059 struct rq *rq = this_rq();
3060 struct task_struct *p, *t;
3061 struct rq_flags rf;
3062
3063 if (!llist)
3064 return;
3065
3066
3067
3068
3069
3070
3071 WRITE_ONCE(rq->ttwu_pending, 0);
3072
3073 rq_lock_irqsave(rq, &rf);
3074 update_rq_clock(rq);
3075
3076 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3077 if (WARN_ON_ONCE(p->on_cpu))
3078 smp_cond_load_acquire(&p->on_cpu, !VAL);
3079
3080 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3081 set_task_cpu(p, cpu_of(rq));
3082
3083 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3084 }
3085
3086 rq_unlock_irqrestore(rq, &rf);
3087}
3088
3089void send_call_function_single_ipi(int cpu)
3090{
3091 struct rq *rq = cpu_rq(cpu);
3092
3093 if (!set_nr_if_polling(rq->idle))
3094 arch_send_call_function_single_ipi(cpu);
3095 else
3096 trace_sched_wake_idle_without_ipi(cpu);
3097}
3098
3099
3100
3101
3102
3103
3104
3105static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3106{
3107 struct rq *rq = cpu_rq(cpu);
3108
3109 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3110
3111 WRITE_ONCE(rq->ttwu_pending, 1);
3112 __smp_call_single_queue(cpu, &p->wake_entry.llist);
3113}
3114
3115void wake_up_if_idle(int cpu)
3116{
3117 struct rq *rq = cpu_rq(cpu);
3118 struct rq_flags rf;
3119
3120 rcu_read_lock();
3121
3122 if (!is_idle_task(rcu_dereference(rq->curr)))
3123 goto out;
3124
3125 if (set_nr_if_polling(rq->idle)) {
3126 trace_sched_wake_idle_without_ipi(cpu);
3127 } else {
3128 rq_lock_irqsave(rq, &rf);
3129 if (is_idle_task(rq->curr))
3130 smp_send_reschedule(cpu);
3131
3132 rq_unlock_irqrestore(rq, &rf);
3133 }
3134
3135out:
3136 rcu_read_unlock();
3137}
3138
3139bool cpus_share_cache(int this_cpu, int that_cpu)
3140{
3141 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3142}
3143
3144static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3145{
3146
3147
3148
3149
3150 if (!cpu_active(cpu))
3151 return false;
3152
3153
3154
3155
3156
3157 if (!cpus_share_cache(smp_processor_id(), cpu))
3158 return true;
3159
3160
3161
3162
3163
3164
3165
3166 if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3167 return true;
3168
3169 return false;
3170}
3171
3172static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3173{
3174 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3175 if (WARN_ON_ONCE(cpu == smp_processor_id()))
3176 return false;
3177
3178 sched_clock_cpu(cpu);
3179 __ttwu_queue_wakelist(p, cpu, wake_flags);
3180 return true;
3181 }
3182
3183 return false;
3184}
3185
3186#else
3187
3188static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3189{
3190 return false;
3191}
3192
3193#endif
3194
3195static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3196{
3197 struct rq *rq = cpu_rq(cpu);
3198 struct rq_flags rf;
3199
3200 if (ttwu_queue_wakelist(p, cpu, wake_flags))
3201 return;
3202
3203 rq_lock(rq, &rf);
3204 update_rq_clock(rq);
3205 ttwu_do_activate(rq, p, wake_flags, &rf);
3206 rq_unlock(rq, &rf);
3207}
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
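/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Conceptually does: if (@p->state & @state) @p->state = TASK_RUNNING.
 *
 * Takes ->pi_lock (and, when needed, the task's rq->lock), selects a CPU
 * and enqueues @p, either locally or via the remote wakelist.  The memory
 * barriers in here pair with the ones on the schedule() side so that a
 * condition written before the wakeup is visible to the woken task.
 *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *	   %false otherwise.
 */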
3329static int
3330try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3331{
3332 unsigned long flags;
3333 int cpu, success = 0;
3334
3335 preempt_disable();
3336 if (p == current) {
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348 if (!(p->state & state))
3349 goto out;
3350
3351 success = 1;
3352 trace_sched_waking(p);
3353 p->state = TASK_RUNNING;
3354 trace_sched_wakeup(p);
3355 goto out;
3356 }
3357
3358
3359
3360
3361
3362
3363
3364 raw_spin_lock_irqsave(&p->pi_lock, flags);
3365 smp_mb__after_spinlock();
3366 if (!(p->state & state))
3367 goto unlock;
3368
3369 trace_sched_waking(p);
3370
3371
3372 success = 1;
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396 smp_rmb();
3397 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3398 goto unlock;
3399
3400#ifdef CONFIG_SMP
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424 smp_acquire__after_ctrl_dep();
3425
3426
3427
3428
3429
3430
3431
3432 p->state = TASK_WAKING;
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453 if (smp_load_acquire(&p->on_cpu) &&
3454 ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3455 goto unlock;
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
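	/*
	 * Wait until the remote CPU is done switching @p out (p->on_cpu
	 * becomes 0); the acquire pairs with the smp_store_release() in
	 * finish_task().
	 */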
3466 smp_cond_load_acquire(&p->on_cpu, !VAL);
3467
3468 cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
3469 if (task_cpu(p) != cpu) {
3470 if (p->in_iowait) {
3471 delayacct_blkio_end(p);
3472 atomic_dec(&task_rq(p)->nr_iowait);
3473 }
3474
3475 wake_flags |= WF_MIGRATED;
3476 psi_ttwu_dequeue(p);
3477 set_task_cpu(p, cpu);
3478 }
3479#else
3480 cpu = task_cpu(p);
3481#endif
3482
3483 ttwu_queue(p, cpu, wake_flags);
3484unlock:
3485 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3486out:
3487 if (success)
3488 ttwu_stat(p, task_cpu(p), wake_flags);
3489 preempt_enable();
3490
3491 return success;
3492}
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
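/*
 * Invoke @func on @p while making sure @p cannot change runqueues under
 * us: either with ->pi_lock and rq->lock held when @p is runnable, or
 * after verifying that it is fully blocked (!p->on_rq).
 *
 * Returns whatever @func returned, or false if it was not safe to call.
 */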
3512bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3513{
3514 struct rq_flags rf;
3515 bool ret = false;
3516 struct rq *rq;
3517
3518 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3519 if (p->on_rq) {
3520 rq = __task_rq_lock(p, &rf);
3521 if (task_rq(p) == rq)
3522 ret = func(p, arg);
3523 rq_unlock(rq, &rf);
3524 } else {
3525 switch (p->state) {
3526 case TASK_RUNNING:
3527 case TASK_WAKING:
3528 break;
3529 default:
3530 smp_rmb();
3531 if (!p->on_rq)
3532 ret = func(p, arg);
3533 }
3534 }
3535 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3536 return ret;
3537}
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
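/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of
 * runnable processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 */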
3550int wake_up_process(struct task_struct *p)
3551{
3552 return try_to_wake_up(p, TASK_NORMAL, 0);
3553}
3554EXPORT_SYMBOL(wake_up_process);
3555
3556int wake_up_state(struct task_struct *p, unsigned int state)
3557{
3558 return try_to_wake_up(p, state, 0);
3559}
3560
3561
3562
3563
3564
3565
3566
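/*
 * Perform scheduler related setup for a newly forked process @p.
 * @p is forked by current; this initializes its scheduling entity,
 * deadline and RT state to a clean slate.  The rest of the setup is
 * done in sched_fork().
 */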
3567static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3568{
3569 p->on_rq = 0;
3570
3571 p->se.on_rq = 0;
3572 p->se.exec_start = 0;
3573 p->se.sum_exec_runtime = 0;
3574 p->se.prev_sum_exec_runtime = 0;
3575 p->se.nr_migrations = 0;
3576 p->se.vruntime = 0;
3577 INIT_LIST_HEAD(&p->se.group_node);
3578
3579#ifdef CONFIG_FAIR_GROUP_SCHED
3580 p->se.cfs_rq = NULL;
3581#endif
3582
3583#ifdef CONFIG_SCHEDSTATS
3584
3585 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
3586#endif
3587
3588 RB_CLEAR_NODE(&p->dl.rb_node);
3589 init_dl_task_timer(&p->dl);
3590 init_dl_inactive_task_timer(&p->dl);
3591 __dl_clear_params(p);
3592
3593 INIT_LIST_HEAD(&p->rt.run_list);
3594 p->rt.timeout = 0;
3595 p->rt.time_slice = sched_rr_timeslice;
3596 p->rt.on_rq = 0;
3597 p->rt.on_list = 0;
3598
3599#ifdef CONFIG_PREEMPT_NOTIFIERS
3600 INIT_HLIST_HEAD(&p->preempt_notifiers);
3601#endif
3602
3603#ifdef CONFIG_COMPACTION
3604 p->capture_control = NULL;
3605#endif
3606 init_numa_balancing(clone_flags, p);
3607#ifdef CONFIG_SMP
3608 p->wake_entry.u_flags = CSD_TYPE_TTWU;
3609 p->migration_pending = NULL;
3610#endif
3611}
3612
3613DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3614
3615#ifdef CONFIG_NUMA_BALANCING
3616
3617void set_numabalancing_state(bool enabled)
3618{
3619 if (enabled)
3620 static_branch_enable(&sched_numa_balancing);
3621 else
3622 static_branch_disable(&sched_numa_balancing);
3623}
3624
3625#ifdef CONFIG_PROC_SYSCTL
3626int sysctl_numa_balancing(struct ctl_table *table, int write,
3627 void *buffer, size_t *lenp, loff_t *ppos)
3628{
3629 struct ctl_table t;
3630 int err;
3631 int state = static_branch_likely(&sched_numa_balancing);
3632
3633 if (write && !capable(CAP_SYS_ADMIN))
3634 return -EPERM;
3635
3636 t = *table;
3637 t.data = &state;
3638 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3639 if (err < 0)
3640 return err;
3641 if (write)
3642 set_numabalancing_state(state);
3643 return err;
3644}
3645#endif
3646#endif
3647
3648#ifdef CONFIG_SCHEDSTATS
3649
3650DEFINE_STATIC_KEY_FALSE(sched_schedstats);
3651static bool __initdata __sched_schedstats = false;
3652
3653static void set_schedstats(bool enabled)
3654{
3655 if (enabled)
3656 static_branch_enable(&sched_schedstats);
3657 else
3658 static_branch_disable(&sched_schedstats);
3659}
3660
3661void force_schedstat_enabled(void)
3662{
3663 if (!schedstat_enabled()) {
3664 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
3665 static_branch_enable(&sched_schedstats);
3666 }
3667}
3668
3669static int __init setup_schedstats(char *str)
3670{
3671 int ret = 0;
3672 if (!str)
3673 goto out;
3674
3675
3676
3677
3678
3679
3680 if (!strcmp(str, "enable")) {
3681 __sched_schedstats = true;
3682 ret = 1;
3683 } else if (!strcmp(str, "disable")) {
3684 __sched_schedstats = false;
3685 ret = 1;
3686 }
3687out:
3688 if (!ret)
3689 pr_warn("Unable to parse schedstats=\n");
3690
3691 return ret;
3692}
3693__setup("schedstats=", setup_schedstats);
3694
3695static void __init init_schedstats(void)
3696{
3697 set_schedstats(__sched_schedstats);
3698}
3699
3700#ifdef CONFIG_PROC_SYSCTL
3701int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3702 size_t *lenp, loff_t *ppos)
3703{
3704 struct ctl_table t;
3705 int err;
3706 int state = static_branch_likely(&sched_schedstats);
3707
3708 if (write && !capable(CAP_SYS_ADMIN))
3709 return -EPERM;
3710
3711 t = *table;
3712 t.data = &state;
3713 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3714 if (err < 0)
3715 return err;
3716 if (write)
3717 set_schedstats(state);
3718 return err;
3719}
3720#endif
3721#else
3722static inline void init_schedstats(void) {}
3723#endif
3724
3725
3726
3727
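/*
 * fork()/clone()-time setup: initialize the scheduling state of the
 * child and pick its scheduling class.
 */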
3728int sched_fork(unsigned long clone_flags, struct task_struct *p)
3729{
3730 unsigned long flags;
3731
3732 __sched_fork(clone_flags, p);
3733
3734
3735
3736
3737
3738 p->state = TASK_NEW;
3739
3740
3741
3742
3743 p->prio = current->normal_prio;
3744
3745 uclamp_fork(p);
3746
3747
3748
3749
3750 if (unlikely(p->sched_reset_on_fork)) {
3751 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3752 p->policy = SCHED_NORMAL;
3753 p->static_prio = NICE_TO_PRIO(0);
3754 p->rt_priority = 0;
3755 } else if (PRIO_TO_NICE(p->static_prio) < 0)
3756 p->static_prio = NICE_TO_PRIO(0);
3757
3758 p->prio = p->normal_prio = __normal_prio(p);
3759 set_load_weight(p, false);
3760
3761
3762
3763
3764
3765 p->sched_reset_on_fork = 0;
3766 }
3767
3768 if (dl_prio(p->prio))
3769 return -EAGAIN;
3770 else if (rt_prio(p->prio))
3771 p->sched_class = &rt_sched_class;
3772 else
3773 p->sched_class = &fair_sched_class;
3774
3775 init_entity_runnable_average(&p->se);
3776
3777
3778
3779
3780
3781
3782
3783
3784 raw_spin_lock_irqsave(&p->pi_lock, flags);
3785 rseq_migrate(p);
3786
3787
3788
3789
3790 __set_task_cpu(p, smp_processor_id());
3791 if (p->sched_class->task_fork)
3792 p->sched_class->task_fork(p);
3793 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3794
3795#ifdef CONFIG_SCHED_INFO
3796 if (likely(sched_info_on()))
3797 memset(&p->sched_info, 0, sizeof(p->sched_info));
3798#endif
3799#if defined(CONFIG_SMP)
3800 p->on_cpu = 0;
3801#endif
3802 init_task_preempt_count(p);
3803#ifdef CONFIG_SMP
3804 plist_node_init(&p->pushable_tasks, MAX_PRIO);
3805 RB_CLEAR_NODE(&p->pushable_dl_tasks);
3806#endif
3807 return 0;
3808}
3809
3810void sched_post_fork(struct task_struct *p)
3811{
3812 uclamp_post_fork(p);
3813}
3814
3815unsigned long to_ratio(u64 period, u64 runtime)
3816{
3817 if (runtime == RUNTIME_INF)
3818 return BW_UNIT;
3819
3820
3821
3822
3823
3824
3825 if (period == 0)
3826 return 0;
3827
3828 return div64_u64(runtime << BW_SHIFT, period);
3829}
3830
3831
3832
3833
3834
3835
3836
3837
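/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * Does the initial per-task scheduler accounting, then puts the task on
 * the runqueue and wakes it.
 */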
3838void wake_up_new_task(struct task_struct *p)
3839{
3840 struct rq_flags rf;
3841 struct rq *rq;
3842
3843 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3844 p->state = TASK_RUNNING;
3845#ifdef CONFIG_SMP
3846
3847
3848
3849
3850
3851
3852
3853
3854 p->recent_used_cpu = task_cpu(p);
3855 rseq_migrate(p);
3856 __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
3857#endif
3858 rq = __task_rq_lock(p, &rf);
3859 update_rq_clock(rq);
3860 post_init_entity_util_avg(p);
3861
3862 activate_task(rq, p, ENQUEUE_NOCLOCK);
3863 trace_sched_wakeup_new(p);
3864 check_preempt_curr(rq, p, WF_FORK);
3865#ifdef CONFIG_SMP
3866 if (p->sched_class->task_woken) {
3867
3868
3869
3870
3871 rq_unpin_lock(rq, &rf);
3872 p->sched_class->task_woken(rq, p);
3873 rq_repin_lock(rq, &rf);
3874 }
3875#endif
3876 task_rq_unlock(rq, p, &rf);
3877}
3878
3879#ifdef CONFIG_PREEMPT_NOTIFIERS
3880
3881static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
3882
3883void preempt_notifier_inc(void)
3884{
3885 static_branch_inc(&preempt_notifier_key);
3886}
3887EXPORT_SYMBOL_GPL(preempt_notifier_inc);
3888
3889void preempt_notifier_dec(void)
3890{
3891 static_branch_dec(&preempt_notifier_key);
3892}
3893EXPORT_SYMBOL_GPL(preempt_notifier_dec);
3894
3895
3896
3897
3898
3899void preempt_notifier_register(struct preempt_notifier *notifier)
3900{
3901 if (!static_branch_unlikely(&preempt_notifier_key))
3902 WARN(1, "registering preempt_notifier while notifiers disabled\n");
3903
3904	hlist_add_head(&notifier->link, &current->preempt_notifiers);
3905}
3906EXPORT_SYMBOL_GPL(preempt_notifier_register);
3907
3908
3909
3910
3911
3912
3913
3914void preempt_notifier_unregister(struct preempt_notifier *notifier)
3915{
3916	hlist_del(&notifier->link);
3917}
3918EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3919
3920static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
3921{
3922 struct preempt_notifier *notifier;
3923
3924 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3925 notifier->ops->sched_in(notifier, raw_smp_processor_id());
3926}
3927
3928static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3929{
3930 if (static_branch_unlikely(&preempt_notifier_key))
3931 __fire_sched_in_preempt_notifiers(curr);
3932}
3933
3934static void
3935__fire_sched_out_preempt_notifiers(struct task_struct *curr,
3936 struct task_struct *next)
3937{
3938 struct preempt_notifier *notifier;
3939
3940 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3941 notifier->ops->sched_out(notifier, next);
3942}
3943
3944static __always_inline void
3945fire_sched_out_preempt_notifiers(struct task_struct *curr,
3946 struct task_struct *next)
3947{
3948 if (static_branch_unlikely(&preempt_notifier_key))
3949 __fire_sched_out_preempt_notifiers(curr, next);
3950}
3951
3952#else
3953
3954static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3955{
3956}
3957
3958static inline void
3959fire_sched_out_preempt_notifiers(struct task_struct *curr,
3960 struct task_struct *next)
3961{
3962}
3963
3964#endif
3965
3966static inline void prepare_task(struct task_struct *next)
3967{
3968#ifdef CONFIG_SMP
3969
3970
3971
3972
3973
3974
3975 WRITE_ONCE(next->on_cpu, 1);
3976#endif
3977}
3978
3979static inline void finish_task(struct task_struct *prev)
3980{
3981#ifdef CONFIG_SMP
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993 smp_store_release(&prev->on_cpu, 0);
3994#endif
3995}
3996
3997#ifdef CONFIG_SMP
3998
3999static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4000{
4001 void (*func)(struct rq *rq);
4002 struct callback_head *next;
4003
4004 lockdep_assert_held(&rq->lock);
4005
4006 while (head) {
4007 func = (void (*)(struct rq *))head->func;
4008 next = head->next;
4009 head->next = NULL;
4010 head = next;
4011
4012 func(rq);
4013 }
4014}
4015
4016static void balance_push(struct rq *rq);
4017
4018struct callback_head balance_push_callback = {
4019 .next = NULL,
4020 .func = (void (*)(struct callback_head *))balance_push,
4021};
4022
4023static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4024{
4025 struct callback_head *head = rq->balance_callback;
4026
4027 lockdep_assert_held(&rq->lock);
4028 if (head)
4029 rq->balance_callback = NULL;
4030
4031 return head;
4032}
4033
4034static void __balance_callbacks(struct rq *rq)
4035{
4036 do_balance_callbacks(rq, splice_balance_callbacks(rq));
4037}
4038
4039static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4040{
4041 unsigned long flags;
4042
4043 if (unlikely(head)) {
4044 raw_spin_lock_irqsave(&rq->lock, flags);
4045 do_balance_callbacks(rq, head);
4046 raw_spin_unlock_irqrestore(&rq->lock, flags);
4047 }
4048}
4049
4050#else
4051
4052static inline void __balance_callbacks(struct rq *rq)
4053{
4054}
4055
4056static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4057{
4058 return NULL;
4059}
4060
4061static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4062{
4063}
4064
4065#endif
4066
4067static inline void
4068prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4069{
4070
4071
4072
4073
4074
4075
4076 rq_unpin_lock(rq, rf);
4077 spin_release(&rq->lock.dep_map, _THIS_IP_);
4078#ifdef CONFIG_DEBUG_SPINLOCK
4079
4080 rq->lock.owner = next;
4081#endif
4082}
4083
4084static inline void finish_lock_switch(struct rq *rq)
4085{
4086
4087
4088
4089
4090
4091 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
4092 __balance_callbacks(rq);
4093 raw_spin_unlock_irq(&rq->lock);
4094}
4095
4096
4097
4098
4099
4100#ifndef prepare_arch_switch
4101# define prepare_arch_switch(next) do { } while (0)
4102#endif
4103
4104#ifndef finish_arch_post_lock_switch
4105# define finish_arch_post_lock_switch() do { } while (0)
4106#endif
4107
4108static inline void kmap_local_sched_out(void)
4109{
4110#ifdef CONFIG_KMAP_LOCAL
4111 if (unlikely(current->kmap_ctrl.idx))
4112 __kmap_local_sched_out();
4113#endif
4114}
4115
4116static inline void kmap_local_sched_in(void)
4117{
4118#ifdef CONFIG_KMAP_LOCAL
4119 if (unlikely(current->kmap_ctrl.idx))
4120 __kmap_local_sched_in();
4121#endif
4122}
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
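/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off.  It must be
 * paired with a subsequent finish_task_switch() after the context
 * switch; it sets up locking and calls the architecture specific hooks.
 */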
4137static inline void
4138prepare_task_switch(struct rq *rq, struct task_struct *prev,
4139 struct task_struct *next)
4140{
4141 kcov_prepare_switch(prev);
4142 sched_info_switch(rq, prev, next);
4143 perf_event_task_sched_out(prev, next);
4144 rseq_preempt(prev);
4145 fire_sched_out_preempt_notifiers(prev, next);
4146 kmap_local_sched_out();
4147 prepare_task(next);
4148 prepare_arch_switch(next);
4149}
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
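/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * Called after the context switch with the rq lock still held and
 * interrupts disabled; finishes accounting for @prev, drops the lock,
 * releases the mm reference kept across the switch and reaps @prev if
 * it was TASK_DEAD.
 *
 * Return: this CPU's runqueue, so the caller continues with a valid rq.
 */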
4170static struct rq *finish_task_switch(struct task_struct *prev)
4171 __releases(rq->lock)
4172{
4173 struct rq *rq = this_rq();
4174 struct mm_struct *mm = rq->prev_mm;
4175 long prev_state;
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4189 "corrupted preempt_count: %s/%d/0x%x\n",
4190 current->comm, current->pid, preempt_count()))
4191 preempt_count_set(FORK_PREEMPT_COUNT);
4192
4193 rq->prev_mm = NULL;
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206 prev_state = prev->state;
4207 vtime_task_switch(prev);
4208 perf_event_task_sched_in(prev, current);
4209 finish_task(prev);
4210 finish_lock_switch(rq);
4211 finish_arch_post_lock_switch();
4212 kcov_finish_switch(current);
4213
4214
4215
4216
4217
4218
4219
4220 kmap_local_sched_in();
4221
4222 fire_sched_in_preempt_notifiers(current);
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235 if (mm) {
4236 membarrier_mm_sync_core_before_usermode(mm);
4237 mmdrop(mm);
4238 }
4239 if (unlikely(prev_state == TASK_DEAD)) {
4240 if (prev->sched_class->task_dead)
4241 prev->sched_class->task_dead(prev);
4242
4243
4244
4245
4246
4247 kprobe_flush_task(prev);
4248
4249
4250 put_task_stack(prev);
4251
4252 put_task_struct_rcu_user(prev);
4253 }
4254
4255 tick_nohz_task_switch();
4256 return rq;
4257}
4258
4259
4260
4261
4262
4263asmlinkage __visible void schedule_tail(struct task_struct *prev)
4264 __releases(rq->lock)
4265{
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275 finish_task_switch(prev);
4276 preempt_enable();
4277
4278 if (current->set_child_tid)
4279 put_user(task_pid_vnr(current), current->set_child_tid);
4280
4281 calculate_sigpending();
4282}
4283
4284
4285
4286
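/*
 * context_switch - switch to the new MM and the new thread's register state.
 */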
4287static __always_inline struct rq *
4288context_switch(struct rq *rq, struct task_struct *prev,
4289 struct task_struct *next, struct rq_flags *rf)
4290{
4291 prepare_task_switch(rq, prev, next);
4292
4293
4294
4295
4296
4297
4298 arch_start_context_switch(prev);
4299
4300
4301
4302
4303
4304
4305
4306
4307 if (!next->mm) {
4308 enter_lazy_tlb(prev->active_mm, next);
4309
4310 next->active_mm = prev->active_mm;
4311 if (prev->mm)
4312 mmgrab(prev->active_mm);
4313 else
4314 prev->active_mm = NULL;
4315 } else {
4316 membarrier_switch_mm(rq, prev->active_mm, next->mm);
4317
4318
4319
4320
4321
4322
4323
4324
4325 switch_mm_irqs_off(prev->active_mm, next->mm, next);
4326
4327 if (!prev->mm) {
4328
4329 rq->prev_mm = prev->active_mm;
4330 prev->active_mm = NULL;
4331 }
4332 }
4333
4334 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4335
4336 prepare_lock_switch(rq, next, rf);
4337
4338
4339 switch_to(prev, next, prev);
4340 barrier();
4341
4342 return finish_task_switch(prev);
4343}
4344
4345
4346
4347
4348
4349
4350
4351unsigned long nr_running(void)
4352{
4353 unsigned long i, sum = 0;
4354
4355 for_each_online_cpu(i)
4356 sum += cpu_rq(i)->nr_running;
4357
4358 return sum;
4359}
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374bool single_task_running(void)
4375{
4376 return raw_rq()->nr_running == 1;
4377}
4378EXPORT_SYMBOL(single_task_running);
4379
4380unsigned long long nr_context_switches(void)
4381{
4382 int i;
4383 unsigned long long sum = 0;
4384
4385 for_each_possible_cpu(i)
4386 sum += cpu_rq(i)->nr_switches;
4387
4388 return sum;
4389}
4390
4391
4392
4393
4394
4395
4396
4397
4398unsigned long nr_iowait_cpu(int cpu)
4399{
4400 return atomic_read(&cpu_rq(cpu)->nr_iowait);
4401}
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
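/*
 * Sum of nr_iowait over all possible CPUs.  Both the per-CPU counters
 * and this sum are instantaneous snapshots that can change under the
 * reader; treat them as a load heuristic, not an exact count.
 */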
4433unsigned long nr_iowait(void)
4434{
4435 unsigned long i, sum = 0;
4436
4437 for_each_possible_cpu(i)
4438 sum += nr_iowait_cpu(i);
4439
4440 return sum;
4441}
4442
4443#ifdef CONFIG_SMP
4444
4445
4446
4447
4448
4449void sched_exec(void)
4450{
4451 struct task_struct *p = current;
4452 unsigned long flags;
4453 int dest_cpu;
4454
4455 raw_spin_lock_irqsave(&p->pi_lock, flags);
4456 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
4457 if (dest_cpu == smp_processor_id())
4458 goto unlock;
4459
4460 if (likely(cpu_active(dest_cpu))) {
4461 struct migration_arg arg = { p, dest_cpu };
4462
4463 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4464 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4465 return;
4466 }
4467unlock:
4468 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4469}
4470
4471#endif
4472
4473DEFINE_PER_CPU(struct kernel_stat, kstat);
4474DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4475
4476EXPORT_PER_CPU_SYMBOL(kstat);
4477EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4478
4479
4480
4481
4482
4483
4484
4485static inline void prefetch_curr_exec_start(struct task_struct *p)
4486{
4487#ifdef CONFIG_FAIR_GROUP_SCHED
4488 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
4489#else
4490 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
4491#endif
4492 prefetch(curr);
4493 prefetch(&curr->exec_start);
4494}
4495
4496
4497
4498
4499
4500
4501unsigned long long task_sched_runtime(struct task_struct *p)
4502{
4503 struct rq_flags rf;
4504 struct rq *rq;
4505 u64 ns;
4506
4507#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519 if (!p->on_cpu || !task_on_rq_queued(p))
4520 return p->se.sum_exec_runtime;
4521#endif
4522
4523 rq = task_rq_lock(p, &rf);
4524
4525
4526
4527
4528
4529 if (task_current(rq, p) && task_on_rq_queued(p)) {
4530 prefetch_curr_exec_start(p);
4531 update_rq_clock(rq);
4532 p->sched_class->update_curr(rq);
4533 }
4534 ns = p->se.sum_exec_runtime;
4535 task_rq_unlock(rq, p, &rf);
4536
4537 return ns;
4538}
4539
4540#ifdef CONFIG_SCHED_DEBUG
4541static u64 cpu_resched_latency(struct rq *rq)
4542{
4543 int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
4544 u64 resched_latency, now = rq_clock(rq);
4545 static bool warned_once;
4546
4547 if (sysctl_resched_latency_warn_once && warned_once)
4548 return 0;
4549
4550 if (!need_resched() || !latency_warn_ms)
4551 return 0;
4552
4553 if (system_state == SYSTEM_BOOTING)
4554 return 0;
4555
4556 if (!rq->last_seen_need_resched_ns) {
4557 rq->last_seen_need_resched_ns = now;
4558 rq->ticks_without_resched = 0;
4559 return 0;
4560 }
4561
4562 rq->ticks_without_resched++;
4563 resched_latency = now - rq->last_seen_need_resched_ns;
4564 if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
4565 return 0;
4566
4567 warned_once = true;
4568
4569 return resched_latency;
4570}
4571
4572static int __init setup_resched_latency_warn_ms(char *str)
4573{
4574 long val;
4575
4576	if (kstrtol(str, 0, &val)) {
4577 pr_warn("Unable to set resched_latency_warn_ms\n");
4578 return 1;
4579 }
4580
4581 sysctl_resched_latency_warn_ms = val;
4582 return 1;
4583}
4584__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
4585#else
4586static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
4587#endif
4588
4589
4590
4591
4592
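/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */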
4593void scheduler_tick(void)
4594{
4595 int cpu = smp_processor_id();
4596 struct rq *rq = cpu_rq(cpu);
4597 struct task_struct *curr = rq->curr;
4598 struct rq_flags rf;
4599 unsigned long thermal_pressure;
4600	u64 resched_latency = 0;
4601
4602 arch_scale_freq_tick();
4603 sched_clock_tick();
4604
4605 rq_lock(rq, &rf);
4606
4607 update_rq_clock(rq);
4608 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4609 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
4610 curr->sched_class->task_tick(rq, curr, 0);
4611 if (sched_feat(LATENCY_WARN))
4612 resched_latency = cpu_resched_latency(rq);
4613 calc_global_load_tick(rq);
4614
4615 rq_unlock(rq, &rf);
4616
4617 if (sched_feat(LATENCY_WARN) && resched_latency)
4618 resched_latency_warn(cpu, resched_latency);
4619
4620 perf_event_task_tick();
4621
4622#ifdef CONFIG_SMP
4623 rq->idle_balance = idle_cpu(cpu);
4624 trigger_load_balance(rq);
4625#endif
4626}
4627
4628#ifdef CONFIG_NO_HZ_FULL
4629
4630struct tick_work {
4631 int cpu;
4632 atomic_t state;
4633 struct delayed_work work;
4634};
4635
4636#define TICK_SCHED_REMOTE_OFFLINE 0
4637#define TICK_SCHED_REMOTE_OFFLINING 1
4638#define TICK_SCHED_REMOTE_RUNNING 2
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663static struct tick_work __percpu *tick_work_cpu;
4664
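/*
 * Remote tick for nohz_full CPUs: a delayed work, re-queued roughly once
 * per second, that runs task_tick() and load accounting on behalf of a
 * CPU whose own tick is stopped.
 */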
4665static void sched_tick_remote(struct work_struct *work)
4666{
4667 struct delayed_work *dwork = to_delayed_work(work);
4668 struct tick_work *twork = container_of(dwork, struct tick_work, work);
4669 int cpu = twork->cpu;
4670 struct rq *rq = cpu_rq(cpu);
4671 struct task_struct *curr;
4672 struct rq_flags rf;
4673 u64 delta;
4674 int os;
4675
4676
4677
4678
4679
4680
4681
4682
4683 if (!tick_nohz_tick_stopped_cpu(cpu))
4684 goto out_requeue;
4685
4686 rq_lock_irq(rq, &rf);
4687 curr = rq->curr;
4688 if (cpu_is_offline(cpu))
4689 goto out_unlock;
4690
4691 update_rq_clock(rq);
4692
4693 if (!is_idle_task(curr)) {
4694
4695
4696
4697
4698 delta = rq_clock_task(rq) - curr->se.exec_start;
4699 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4700 }
4701 curr->sched_class->task_tick(rq, curr, 0);
4702
4703 calc_load_nohz_remote(rq);
4704out_unlock:
4705 rq_unlock_irq(rq, &rf);
4706out_requeue:
4707
4708
4709
4710
4711
4712
4713
4714 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
4715 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
4716 if (os == TICK_SCHED_REMOTE_RUNNING)
4717 queue_delayed_work(system_unbound_wq, dwork, HZ);
4718}
4719
4720static void sched_tick_start(int cpu)
4721{
4722 int os;
4723 struct tick_work *twork;
4724
4725 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4726 return;
4727
4728 WARN_ON_ONCE(!tick_work_cpu);
4729
4730 twork = per_cpu_ptr(tick_work_cpu, cpu);
4731 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4732 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4733 if (os == TICK_SCHED_REMOTE_OFFLINE) {
4734 twork->cpu = cpu;
4735 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4736 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4737 }
4738}
4739
4740#ifdef CONFIG_HOTPLUG_CPU
4741static void sched_tick_stop(int cpu)
4742{
4743 struct tick_work *twork;
4744 int os;
4745
4746 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
4747 return;
4748
4749 WARN_ON_ONCE(!tick_work_cpu);
4750
4751 twork = per_cpu_ptr(tick_work_cpu, cpu);
4752
4753 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
4754 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
4755
4756}
4757#endif
4758
4759int __init sched_tick_offload_init(void)
4760{
4761 tick_work_cpu = alloc_percpu(struct tick_work);
4762 BUG_ON(!tick_work_cpu);
4763 return 0;
4764}
4765
4766#else
4767static inline void sched_tick_start(int cpu) { }
4768static inline void sched_tick_stop(int cpu) { }
4769#endif
4770
4771#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
4772 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
4773
4774
4775
4776
4777static inline void preempt_latency_start(int val)
4778{
4779 if (preempt_count() == val) {
4780 unsigned long ip = get_lock_parent_ip();
4781#ifdef CONFIG_DEBUG_PREEMPT
4782 current->preempt_disable_ip = ip;
4783#endif
4784 trace_preempt_off(CALLER_ADDR0, ip);
4785 }
4786}
4787
4788void preempt_count_add(int val)
4789{
4790#ifdef CONFIG_DEBUG_PREEMPT
4791
4792
4793
4794 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4795 return;
4796#endif
4797 __preempt_count_add(val);
4798#ifdef CONFIG_DEBUG_PREEMPT
4799
4800
4801
4802 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4803 PREEMPT_MASK - 10);
4804#endif
4805 preempt_latency_start(val);
4806}
4807EXPORT_SYMBOL(preempt_count_add);
4808NOKPROBE_SYMBOL(preempt_count_add);
4809
4810
4811
4812
4813
4814static inline void preempt_latency_stop(int val)
4815{
4816 if (preempt_count() == val)
4817 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
4818}
4819
4820void preempt_count_sub(int val)
4821{
4822#ifdef CONFIG_DEBUG_PREEMPT
4823
4824
4825
4826 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4827 return;
4828
4829
4830
4831 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4832 !(preempt_count() & PREEMPT_MASK)))
4833 return;
4834#endif
4835
4836 preempt_latency_stop(val);
4837 __preempt_count_sub(val);
4838}
4839EXPORT_SYMBOL(preempt_count_sub);
4840NOKPROBE_SYMBOL(preempt_count_sub);
4841
4842#else
4843static inline void preempt_latency_start(int val) { }
4844static inline void preempt_latency_stop(int val) { }
4845#endif
4846
4847static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
4848{
4849#ifdef CONFIG_DEBUG_PREEMPT
4850 return p->preempt_disable_ip;
4851#else
4852 return 0;
4853#endif
4854}
4855
4856
4857
4858
4859static noinline void __schedule_bug(struct task_struct *prev)
4860{
4861
4862 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
4863
4864 if (oops_in_progress)
4865 return;
4866
4867 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4868 prev->comm, prev->pid, preempt_count());
4869
4870 debug_show_held_locks(prev);
4871 print_modules();
4872 if (irqs_disabled())
4873 print_irqtrace_events(prev);
4874 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
4875 && in_atomic_preempt_off()) {
4876 pr_err("Preemption disabled at:");
4877 print_ip_sym(KERN_ERR, preempt_disable_ip);
4878 }
4879 if (panic_on_warn)
4880 panic("scheduling while atomic\n");
4881
4882 dump_stack();
4883 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4884}
4885
4886
4887
4888
4889static inline void schedule_debug(struct task_struct *prev, bool preempt)
4890{
4891#ifdef CONFIG_SCHED_STACK_END_CHECK
4892 if (task_stack_end_corrupted(prev))
4893 panic("corrupted stack end detected inside scheduler\n");
4894
4895 if (task_scs_end_corrupted(prev))
4896 panic("corrupted shadow stack detected inside scheduler\n");
4897#endif
4898
4899#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4900 if (!preempt && prev->state && prev->non_block_count) {
4901 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4902 prev->comm, prev->pid, prev->non_block_count);
4903 dump_stack();
4904 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4905 }
4906#endif
4907
4908 if (unlikely(in_atomic_preempt_off())) {
4909 __schedule_bug(prev);
4910 preempt_count_set(PREEMPT_DISABLED);
4911 }
4912 rcu_sleep_check();
4913 SCHED_WARN_ON(ct_state() == CONTEXT_USER);
4914
4915 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4916
4917 schedstat_inc(this_rq()->sched_count);
4918}
4919
4920static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4921 struct rq_flags *rf)
4922{
4923#ifdef CONFIG_SMP
4924 const struct sched_class *class;
4925
4926
4927
4928
4929
4930
4931
4932
4933 for_class_range(class, prev->sched_class, &idle_sched_class) {
4934 if (class->balance(rq, prev, rf))
4935 break;
4936 }
4937#endif
4938
4939 put_prev_task(rq, prev);
4940}
4941
4942
4943
4944
4945static inline struct task_struct *
4946pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
4947{
4948 const struct sched_class *class;
4949 struct task_struct *p;
4950
4951
4952
4953
4954
4955
4956
4957 if (likely(prev->sched_class <= &fair_sched_class &&
4958 rq->nr_running == rq->cfs.h_nr_running)) {
4959
4960 p = pick_next_task_fair(rq, prev, rf);
4961 if (unlikely(p == RETRY_TASK))
4962 goto restart;
4963
4964
4965 if (!p) {
4966 put_prev_task(rq, prev);
4967 p = pick_next_task_idle(rq);
4968 }
4969
4970 return p;
4971 }
4972
4973restart:
4974 put_prev_task_balance(rq, prev, rf);
4975
4976 for_each_class(class) {
4977 p = class->pick_next_task(rq);
4978 if (p)
4979 return p;
4980 }
4981
4982
4983 BUG();
4984}
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
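/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *  1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *  2. TIF_NEED_RESCHED flag checked on interrupt and userspace return
 *     paths, and by preempt_enable() / cond_resched() in kernel space.
 *
 *  3. Wakeups don't really cause entry into schedule(); they set
 *     TIF_NEED_RESCHED and schedule() runs at the next opportunity.
 *
 * WARNING: must be called with preemption disabled!
 */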
5025static void __sched notrace __schedule(bool preempt)
5026{
5027 struct task_struct *prev, *next;
5028 unsigned long *switch_count;
5029 unsigned long prev_state;
5030 struct rq_flags rf;
5031 struct rq *rq;
5032 int cpu;
5033
5034 cpu = smp_processor_id();
5035 rq = cpu_rq(cpu);
5036 prev = rq->curr;
5037
5038 schedule_debug(prev, preempt);
5039
5040 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
5041 hrtick_clear(rq);
5042
5043 local_irq_disable();
5044 rcu_note_context_switch(preempt);
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061 rq_lock(rq, &rf);
5062 smp_mb__after_spinlock();
5063
5064
5065 rq->clock_update_flags <<= 1;
5066 update_rq_clock(rq);
5067
5068 switch_count = &prev->nivcsw;
5069
5070
5071
5072
5073
5074
5075
5076
5077 prev_state = prev->state;
5078 if (!preempt && prev_state) {
5079 if (signal_pending_state(prev_state, prev)) {
5080 prev->state = TASK_RUNNING;
5081 } else {
5082 prev->sched_contributes_to_load =
5083 (prev_state & TASK_UNINTERRUPTIBLE) &&
5084 !(prev_state & TASK_NOLOAD) &&
5085 !(prev->flags & PF_FROZEN);
5086
5087 if (prev->sched_contributes_to_load)
5088 rq->nr_uninterruptible++;
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
5102
5103 if (prev->in_iowait) {
5104 atomic_inc(&rq->nr_iowait);
5105 delayacct_blkio_start();
5106 }
5107 }
5108 switch_count = &prev->nvcsw;
5109 }
5110
5111 next = pick_next_task(rq, prev, &rf);
5112 clear_tsk_need_resched(prev);
5113 clear_preempt_need_resched();
5114#ifdef CONFIG_SCHED_DEBUG
5115 rq->last_seen_need_resched_ns = 0;
5116#endif
5117
5118 if (likely(prev != next)) {
5119 rq->nr_switches++;
5120
5121
5122
5123
5124 RCU_INIT_POINTER(rq->curr, next);
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139 ++*switch_count;
5140
5141 migrate_disable_switch(rq, prev);
5142 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
5143
5144 trace_sched_switch(preempt, prev, next);
5145
5146
5147 rq = context_switch(rq, prev, next, &rf);
5148 } else {
5149 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
5150
5151 rq_unpin_lock(rq, &rf);
5152 __balance_callbacks(rq);
5153 raw_spin_unlock_irq(&rq->lock);
5154 }
5155}
5156
5157void __noreturn do_task_dead(void)
5158{
5159
5160 set_special_state(TASK_DEAD);
5161
5162
5163 current->flags |= PF_NOFREEZE;
5164
5165 __schedule(false);
5166 BUG();
5167
5168
5169 for (;;)
5170 cpu_relax();
5171}
5172
5173static inline void sched_submit_work(struct task_struct *tsk)
5174{
5175 unsigned int task_flags;
5176
5177 if (!tsk->state)
5178 return;
5179
5180 task_flags = tsk->flags;
5181
5182
5183
5184
5185
5186
5187
5188
5189 if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5190 preempt_disable();
5191 if (task_flags & PF_WQ_WORKER)
5192 wq_worker_sleeping(tsk);
5193 else
5194 io_wq_worker_sleeping(tsk);
5195 preempt_enable_no_resched();
5196 }
5197
5198 if (tsk_is_pi_blocked(tsk))
5199 return;
5200
5201
5202
5203
5204
5205 if (blk_needs_flush_plug(tsk))
5206 blk_schedule_flush_plug(tsk);
5207}
5208
5209static void sched_update_worker(struct task_struct *tsk)
5210{
5211 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5212 if (tsk->flags & PF_WQ_WORKER)
5213 wq_worker_running(tsk);
5214 else
5215 io_wq_worker_running(tsk);
5216 }
5217}
5218
5219asmlinkage __visible void __sched schedule(void)
5220{
5221 struct task_struct *tsk = current;
5222
5223 sched_submit_work(tsk);
5224 do {
5225 preempt_disable();
5226 __schedule(false);
5227 sched_preempt_enable_no_resched();
5228 } while (need_resched());
5229 sched_update_worker(tsk);
5230}
5231EXPORT_SYMBOL(schedule);
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243void __sched schedule_idle(void)
5244{
5245
5246
5247
5248
5249
5250
5251
5252 WARN_ON_ONCE(current->state);
5253 do {
5254 __schedule(false);
5255 } while (need_resched());
5256}
5257
5258#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
5259asmlinkage __visible void __sched schedule_user(void)
5260{
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271 enum ctx_state prev_state = exception_enter();
5272 schedule();
5273 exception_exit(prev_state);
5274}
5275#endif
5276
5277
5278
5279
5280
5281
5282void __sched schedule_preempt_disabled(void)
5283{
5284 sched_preempt_enable_no_resched();
5285 schedule();
5286 preempt_disable();
5287}
5288
5289static void __sched notrace preempt_schedule_common(void)
5290{
5291 do {
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305 preempt_disable_notrace();
5306 preempt_latency_start(1);
5307 __schedule(true);
5308 preempt_latency_stop(1);
5309 preempt_enable_no_resched_notrace();
5310
5311
5312
5313
5314
5315 } while (need_resched());
5316}
5317
5318#ifdef CONFIG_PREEMPTION
5319
5320
5321
5322
5323asmlinkage __visible void __sched notrace preempt_schedule(void)
5324{
5325
5326
5327
5328
5329 if (likely(!preemptible()))
5330 return;
5331
5332 preempt_schedule_common();
5333}
5334NOKPROBE_SYMBOL(preempt_schedule);
5335EXPORT_SYMBOL(preempt_schedule);
5336
5337#ifdef CONFIG_PREEMPT_DYNAMIC
5338DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
5339EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
5340#endif
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
5358{
5359 enum ctx_state prev_ctx;
5360
5361 if (likely(!preemptible()))
5362 return;
5363
5364 do {
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378 preempt_disable_notrace();
5379 preempt_latency_start(1);
5380
5381
5382
5383
5384
5385 prev_ctx = exception_enter();
5386 __schedule(true);
5387 exception_exit(prev_ctx);
5388
5389 preempt_latency_stop(1);
5390 preempt_enable_no_resched_notrace();
5391 } while (need_resched());
5392}
5393EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
5394
5395#ifdef CONFIG_PREEMPT_DYNAMIC
5396DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5397EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
5398#endif
5399
5400#endif
5401
5402#ifdef CONFIG_PREEMPT_DYNAMIC
5403
5404#include <linux/entry-common.h>
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436enum {
5437 preempt_dynamic_none = 0,
5438 preempt_dynamic_voluntary,
5439 preempt_dynamic_full,
5440};
5441
5442int preempt_dynamic_mode = preempt_dynamic_full;
5443
5444int sched_dynamic_mode(const char *str)
5445{
5446 if (!strcmp(str, "none"))
5447 return preempt_dynamic_none;
5448
5449 if (!strcmp(str, "voluntary"))
5450 return preempt_dynamic_voluntary;
5451
5452 if (!strcmp(str, "full"))
5453 return preempt_dynamic_full;
5454
5455 return -EINVAL;
5456}
5457
5458void sched_dynamic_update(int mode)
5459{
5460
5461
5462
5463
5464 static_call_update(cond_resched, __cond_resched);
5465 static_call_update(might_resched, __cond_resched);
5466 static_call_update(preempt_schedule, __preempt_schedule_func);
5467 static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5468 static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
5469
5470 switch (mode) {
5471 case preempt_dynamic_none:
5472 static_call_update(cond_resched, __cond_resched);
5473 static_call_update(might_resched, (void *)&__static_call_return0);
5474 static_call_update(preempt_schedule, NULL);
5475 static_call_update(preempt_schedule_notrace, NULL);
5476 static_call_update(irqentry_exit_cond_resched, NULL);
5477 pr_info("Dynamic Preempt: none\n");
5478 break;
5479
5480 case preempt_dynamic_voluntary:
5481 static_call_update(cond_resched, __cond_resched);
5482 static_call_update(might_resched, __cond_resched);
5483 static_call_update(preempt_schedule, NULL);
5484 static_call_update(preempt_schedule_notrace, NULL);
5485 static_call_update(irqentry_exit_cond_resched, NULL);
5486 pr_info("Dynamic Preempt: voluntary\n");
5487 break;
5488
5489 case preempt_dynamic_full:
5490 static_call_update(cond_resched, (void *)&__static_call_return0);
5491 static_call_update(might_resched, (void *)&__static_call_return0);
5492 static_call_update(preempt_schedule, __preempt_schedule_func);
5493 static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
5494 static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
5495 pr_info("Dynamic Preempt: full\n");
5496 break;
5497 }
5498
5499 preempt_dynamic_mode = mode;
5500}
5501
5502static int __init setup_preempt_mode(char *str)
5503{
5504 int mode = sched_dynamic_mode(str);
5505 if (mode < 0) {
5506 pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
5507 return 1;
5508 }
5509
5510 sched_dynamic_update(mode);
5511 return 0;
5512}
5513__setup("preempt=", setup_preempt_mode);
5514
5515#endif
5516
5517
5518
5519
5520
5521
5522
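/*
 * Entry point to schedule() for kernel preemption off of IRQ context;
 * called and returns with IRQs disabled, which protects against
 * recursive entry from another interrupt.
 */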
5523asmlinkage __visible void __sched preempt_schedule_irq(void)
5524{
5525 enum ctx_state prev_state;
5526
5527
5528 BUG_ON(preempt_count() || !irqs_disabled());
5529
5530 prev_state = exception_enter();
5531
5532 do {
5533 preempt_disable();
5534 local_irq_enable();
5535 __schedule(true);
5536 local_irq_disable();
5537 sched_preempt_enable_no_resched();
5538 } while (need_resched());
5539
5540 exception_exit(prev_state);
5541}
5542
5543int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
5544 void *key)
5545{
5546 WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
5547 return try_to_wake_up(curr->private, mode, wake_flags);
5548}
5549EXPORT_SYMBOL(default_wake_function);
5550
5551#ifdef CONFIG_RT_MUTEXES
5552
5553static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
5554{
5555 if (pi_task)
5556 prio = min(prio, pi_task->prio);
5557
5558 return prio;
5559}
5560
5561static inline int rt_effective_prio(struct task_struct *p, int prio)
5562{
5563 struct task_struct *pi_task = rt_mutex_get_top_task(p);
5564
5565 return __rt_effective_prio(pi_task, prio);
5566}
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
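/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task.  It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */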
5579void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
5580{
5581 int prio, oldprio, queued, running, queue_flag =
5582 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
5583 const struct sched_class *prev_class;
5584 struct rq_flags rf;
5585 struct rq *rq;
5586
5587
5588 prio = __rt_effective_prio(pi_task, p->normal_prio);
5589
5590
5591
5592
5593 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
5594 return;
5595
5596 rq = __task_rq_lock(p, &rf);
5597 update_rq_clock(rq);
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608 p->pi_top_task = pi_task;
5609
5610
5611
5612
5613 if (prio == p->prio && !dl_prio(prio))
5614 goto out_unlock;
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628 if (unlikely(p == rq->idle)) {
5629 WARN_ON(p != rq->curr);
5630 WARN_ON(p->pi_blocked_on);
5631 goto out_unlock;
5632 }
5633
5634 trace_sched_pi_setprio(p, pi_task);
5635 oldprio = p->prio;
5636
5637 if (oldprio == prio)
5638 queue_flag &= ~DEQUEUE_MOVE;
5639
5640 prev_class = p->sched_class;
5641 queued = task_on_rq_queued(p);
5642 running = task_current(rq, p);
5643 if (queued)
5644 dequeue_task(rq, p, queue_flag);
5645 if (running)
5646 put_prev_task(rq, p);
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657 if (dl_prio(prio)) {
5658 if (!dl_prio(p->normal_prio) ||
5659 (pi_task && dl_prio(pi_task->prio) &&
5660 dl_entity_preempt(&pi_task->dl, &p->dl))) {
5661 p->dl.pi_se = pi_task->dl.pi_se;
5662 queue_flag |= ENQUEUE_REPLENISH;
5663 } else {
5664 p->dl.pi_se = &p->dl;
5665 }
5666 p->sched_class = &dl_sched_class;
5667 } else if (rt_prio(prio)) {
5668 if (dl_prio(oldprio))
5669 p->dl.pi_se = &p->dl;
5670 if (oldprio < prio)
5671 queue_flag |= ENQUEUE_HEAD;
5672 p->sched_class = &rt_sched_class;
5673 } else {
5674 if (dl_prio(oldprio))
5675 p->dl.pi_se = &p->dl;
5676 if (rt_prio(oldprio))
5677 p->rt.timeout = 0;
5678 p->sched_class = &fair_sched_class;
5679 }
5680
5681 p->prio = prio;
5682
5683 if (queued)
5684 enqueue_task(rq, p, queue_flag);
5685 if (running)
5686 set_next_task(rq, p);
5687
5688 check_class_changed(rq, p, prev_class, oldprio);
5689out_unlock:
5690
5691 preempt_disable();
5692
5693 rq_unpin_lock(rq, &rf);
5694 __balance_callbacks(rq);
5695 raw_spin_unlock(&rq->lock);
5696
5697 preempt_enable();
5698}
5699#else
5700static inline int rt_effective_prio(struct task_struct *p, int prio)
5701{
5702 return prio;
5703}
5704#endif
5705
5706void set_user_nice(struct task_struct *p, long nice)
5707{
5708 bool queued, running;
5709 int old_prio;
5710 struct rq_flags rf;
5711 struct rq *rq;
5712
5713 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5714 return;
5715
5716
5717
5718
5719 rq = task_rq_lock(p, &rf);
5720 update_rq_clock(rq);
5721
5722
5723
5724
5725
5726
5727
5728 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
5729 p->static_prio = NICE_TO_PRIO(nice);
5730 goto out_unlock;
5731 }
5732 queued = task_on_rq_queued(p);
5733 running = task_current(rq, p);
5734 if (queued)
5735 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
5736 if (running)
5737 put_prev_task(rq, p);
5738
5739 p->static_prio = NICE_TO_PRIO(nice);
5740 set_load_weight(p, true);
5741 old_prio = p->prio;
5742 p->prio = effective_prio(p);
5743
5744 if (queued)
5745 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5746 if (running)
5747 set_next_task(rq, p);
5748
5749
5750
5751
5752
5753 p->sched_class->prio_changed(rq, p, old_prio);
5754
5755out_unlock:
5756 task_rq_unlock(rq, p, &rf);
5757}
5758EXPORT_SYMBOL(set_user_nice);
5759
5760
5761
5762
5763
5764
5765int can_nice(const struct task_struct *p, const int nice)
5766{
5767
5768 int nice_rlim = nice_to_rlimit(nice);
5769
5770 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
5771 capable(CAP_SYS_NICE));
5772}
5773
5774#ifdef __ARCH_WANT_SYS_NICE
5775
5776
5777
5778
5779
5780
5781
5782
5783SYSCALL_DEFINE1(nice, int, increment)
5784{
5785 long nice, retval;
5786
5787
5788
5789
5790
5791
5792 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
5793 nice = task_nice(current) + increment;
5794
5795 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
5796 if (increment < 0 && !can_nice(current, nice))
5797 return -EPERM;
5798
5799 retval = security_task_setnice(current, nice);
5800 if (retval)
5801 return retval;
5802
5803 set_user_nice(current, nice);
5804 return 0;
5805}
5806
5807#endif
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
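/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: p->prio - MAX_RT_PRIO: negative values for RT and deadline
 * tasks, nice value + 20 (i.e. 0..39) for normal tasks.
 */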
5821int task_prio(const struct task_struct *p)
5822{
5823 return p->prio - MAX_RT_PRIO;
5824}
5825
5826
5827
5828
5829
5830
5831
5832int idle_cpu(int cpu)
5833{
5834 struct rq *rq = cpu_rq(cpu);
5835
5836 if (rq->curr != rq->idle)
5837 return 0;
5838
5839 if (rq->nr_running)
5840 return 0;
5841
5842#ifdef CONFIG_SMP
5843 if (rq->ttwu_pending)
5844 return 0;
5845#endif
5846
5847 return 1;
5848}
5849
5850
5851
5852
5853
5854
5855
5856int available_idle_cpu(int cpu)
5857{
5858 if (!idle_cpu(cpu))
5859 return 0;
5860
5861 if (vcpu_is_preempted(cpu))
5862 return 0;
5863
5864 return 1;
5865}
5866
5867
5868
5869
5870
5871
5872
5873struct task_struct *idle_task(int cpu)
5874{
5875 return cpu_rq(cpu)->idle;
5876}
5877
5878#ifdef CONFIG_SMP
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
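/*
 * effective_cpu_util - estimate the utilization of @cpu
 *
 * Combines the CFS (@util_cfs), RT, DL and IRQ utilization into a single
 * value clamped to @max.  @type selects the flavour: FREQUENCY_UTIL
 * applies uclamp and adds the DL bandwidth requirement, ENERGY_UTIL adds
 * the DL running utilization instead.
 */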
5899unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
5900 unsigned long max, enum cpu_util_type type,
5901 struct task_struct *p)
5902{
5903 unsigned long dl_util, util, irq;
5904 struct rq *rq = cpu_rq(cpu);
5905
5906 if (!uclamp_is_used() &&
5907 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
5908 return max;
5909 }
5910
5911
5912
5913
5914
5915
5916 irq = cpu_util_irq(rq);
5917 if (unlikely(irq >= max))
5918 return max;
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932 util = util_cfs + cpu_util_rt(rq);
5933 if (type == FREQUENCY_UTIL)
5934 util = uclamp_rq_util_with(rq, util, p);
5935
5936 dl_util = cpu_util_dl(rq);
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947 if (util + dl_util >= max)
5948 return max;
5949
5950
5951
5952
5953
5954 if (type == ENERGY_UTIL)
5955 util += dl_util;
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966 util = scale_irq_capacity(util, irq, max);
5967 util += irq;
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979 if (type == FREQUENCY_UTIL)
5980 util += cpu_bw_dl(rq);
5981
5982 return min(max, util);
5983}
5984
5985unsigned long sched_cpu_util(int cpu, unsigned long max)
5986{
5987 return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
5988 ENERGY_UTIL, NULL);
5989}
5990#endif
5991
5992
5993
5994
5995
5996
5997
5998static struct task_struct *find_process_by_pid(pid_t pid)
5999{
6000 return pid ? find_task_by_vpid(pid) : current;
6001}
6002
6003
6004
6005
6006
6007#define SETPARAM_POLICY -1
6008
6009static void __setscheduler_params(struct task_struct *p,
6010 const struct sched_attr *attr)
6011{
6012 int policy = attr->sched_policy;
6013
6014 if (policy == SETPARAM_POLICY)
6015 policy = p->policy;
6016
6017 p->policy = policy;
6018
6019 if (dl_policy(policy))
6020 __setparam_dl(p, attr);
6021 else if (fair_policy(policy))
6022 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
6023
6024
6025
6026
6027
6028
6029 p->rt_priority = attr->sched_priority;
6030 p->normal_prio = normal_prio(p);
6031 set_load_weight(p, true);
6032}
6033
6034
6035static void __setscheduler(struct rq *rq, struct task_struct *p,
6036 const struct sched_attr *attr, bool keep_boost)
6037{
6038
6039
6040
6041
6042 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
6043 return;
6044
6045 __setscheduler_params(p, attr);
6046
6047
6048
6049
6050
6051 p->prio = normal_prio(p);
6052 if (keep_boost)
6053 p->prio = rt_effective_prio(p, p->prio);
6054
6055 if (dl_prio(p->prio))
6056 p->sched_class = &dl_sched_class;
6057 else if (rt_prio(p->prio))
6058 p->sched_class = &rt_sched_class;
6059 else
6060 p->sched_class = &fair_sched_class;
6061}
6062
6063
6064
6065
6066static bool check_same_owner(struct task_struct *p)
6067{
6068 const struct cred *cred = current_cred(), *pcred;
6069 bool match;
6070
6071 rcu_read_lock();
6072 pcred = __task_cred(p);
6073 match = (uid_eq(cred->euid, pcred->euid) ||
6074 uid_eq(cred->euid, pcred->uid));
6075 rcu_read_unlock();
6076 return match;
6077}
6078
6079static int __sched_setscheduler(struct task_struct *p,
6080 const struct sched_attr *attr,
6081 bool user, bool pi)
6082{
6083 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
6084 MAX_RT_PRIO - 1 - attr->sched_priority;
6085 int retval, oldprio, oldpolicy = -1, queued, running;
6086 int new_effective_prio, policy = attr->sched_policy;
6087 const struct sched_class *prev_class;
6088 struct callback_head *head;
6089 struct rq_flags rf;
6090 int reset_on_fork;
6091 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6092 struct rq *rq;
6093
6094
6095 BUG_ON(pi && in_interrupt());
6096recheck:
6097
6098 if (policy < 0) {
6099 reset_on_fork = p->sched_reset_on_fork;
6100 policy = oldpolicy = p->policy;
6101 } else {
6102 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
6103
6104 if (!valid_policy(policy))
6105 return -EINVAL;
6106 }
6107
6108 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
6109 return -EINVAL;
6110
6111
6112
6113
6114
6115
6116 if (attr->sched_priority > MAX_RT_PRIO-1)
6117 return -EINVAL;
6118 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
6119 (rt_policy(policy) != (attr->sched_priority != 0)))
6120 return -EINVAL;
6121
6122
6123
6124
6125 if (user && !capable(CAP_SYS_NICE)) {
6126 if (fair_policy(policy)) {
6127 if (attr->sched_nice < task_nice(p) &&
6128 !can_nice(p, attr->sched_nice))
6129 return -EPERM;
6130 }
6131
6132 if (rt_policy(policy)) {
6133 unsigned long rlim_rtprio =
6134 task_rlimit(p, RLIMIT_RTPRIO);
6135
6136
6137 if (policy != p->policy && !rlim_rtprio)
6138 return -EPERM;
6139
6140
6141 if (attr->sched_priority > p->rt_priority &&
6142 attr->sched_priority > rlim_rtprio)
6143 return -EPERM;
6144 }
6145
6146
6147
6148
6149
6150
6151
6152 if (dl_policy(policy))
6153 return -EPERM;
6154
6155
6156
6157
6158
6159 if (task_has_idle_policy(p) && !idle_policy(policy)) {
6160 if (!can_nice(p, task_nice(p)))
6161 return -EPERM;
6162 }
6163
6164
6165 if (!check_same_owner(p))
6166 return -EPERM;
6167
6168
6169 if (p->sched_reset_on_fork && !reset_on_fork)
6170 return -EPERM;
6171 }
6172
6173 if (user) {
6174 if (attr->sched_flags & SCHED_FLAG_SUGOV)
6175 return -EINVAL;
6176
6177 retval = security_task_setscheduler(p);
6178 if (retval)
6179 return retval;
6180 }
6181
6182
6183 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
6184 retval = uclamp_validate(p, attr);
6185 if (retval)
6186 return retval;
6187 }
6188
6189 if (pi)
6190 cpuset_read_lock();
6191
6192
6193
6194
6195
6196
6197
6198
6199 rq = task_rq_lock(p, &rf);
6200 update_rq_clock(rq);
6201
6202
6203
6204
6205 if (p == rq->stop) {
6206 retval = -EINVAL;
6207 goto unlock;
6208 }
6209
6210
6211
6212
6213
6214 if (unlikely(policy == p->policy)) {
6215 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
6216 goto change;
6217 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
6218 goto change;
6219 if (dl_policy(policy) && dl_param_changed(p, attr))
6220 goto change;
6221 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
6222 goto change;
6223
6224 p->sched_reset_on_fork = reset_on_fork;
6225 retval = 0;
6226 goto unlock;
6227 }
6228change:
6229
6230 if (user) {
6231#ifdef CONFIG_RT_GROUP_SCHED
6232
6233
6234
6235
6236 if (rt_bandwidth_enabled() && rt_policy(policy) &&
6237 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
6238 !task_group_is_autogroup(task_group(p))) {
6239 retval = -EPERM;
6240 goto unlock;
6241 }
6242#endif
6243#ifdef CONFIG_SMP
6244 if (dl_bandwidth_enabled() && dl_policy(policy) &&
6245 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
6246 cpumask_t *span = rq->rd->span;
6247
6248
6249
6250
6251
6252
6253 if (!cpumask_subset(span, p->cpus_ptr) ||
6254 rq->rd->dl_bw.bw == 0) {
6255 retval = -EPERM;
6256 goto unlock;
6257 }
6258 }
6259#endif
6260 }
6261
6262
6263 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6264 policy = oldpolicy = -1;
6265 task_rq_unlock(rq, p, &rf);
6266 if (pi)
6267 cpuset_read_unlock();
6268 goto recheck;
6269 }
6270
6271
6272
6273
6274
6275
6276 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
6277 retval = -EBUSY;
6278 goto unlock;
6279 }
6280
6281 p->sched_reset_on_fork = reset_on_fork;
6282 oldprio = p->prio;
6283
6284 if (pi) {
6285
6286
6287
6288
6289
6290
6291
6292 new_effective_prio = rt_effective_prio(p, newprio);
6293 if (new_effective_prio == oldprio)
6294 queue_flags &= ~DEQUEUE_MOVE;
6295 }
6296
6297 queued = task_on_rq_queued(p);
6298 running = task_current(rq, p);
6299 if (queued)
6300 dequeue_task(rq, p, queue_flags);
6301 if (running)
6302 put_prev_task(rq, p);
6303
6304 prev_class = p->sched_class;
6305
6306 __setscheduler(rq, p, attr, pi);
6307 __setscheduler_uclamp(p, attr);
6308
6309 if (queued) {
6310
6311
6312
6313
6314 if (oldprio < p->prio)
6315 queue_flags |= ENQUEUE_HEAD;
6316
6317 enqueue_task(rq, p, queue_flags);
6318 }
6319 if (running)
6320 set_next_task(rq, p);
6321
6322 check_class_changed(rq, p, prev_class, oldprio);
6323
6324
6325 preempt_disable();
6326 head = splice_balance_callbacks(rq);
6327 task_rq_unlock(rq, p, &rf);
6328
6329 if (pi) {
6330 cpuset_read_unlock();
6331 rt_mutex_adjust_pi(p);
6332 }
6333
6334
6335 balance_callbacks(rq, head);
6336 preempt_enable();
6337
6338 return 0;
6339
6340unlock:
6341 task_rq_unlock(rq, p, &rf);
6342 if (pi)
6343 cpuset_read_unlock();
6344 return retval;
6345}
6346
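/*
 * Map the legacy sched_param interface onto sched_attr and forward to
 * __sched_setscheduler(). A SCHED_RESET_ON_FORK bit OR'ed into @policy is
 * peeled off here and carried in attr.sched_flags instead; the nice value
 * is seeded from the task's current static priority.
 */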
6347static int _sched_setscheduler(struct task_struct *p, int policy,
6348 const struct sched_param *param, bool check)
6349{
6350 struct sched_attr attr = {
6351 .sched_policy = policy,
6352 .sched_priority = param->sched_priority,
6353 .sched_nice = PRIO_TO_NICE(p->static_prio),
6354 };
6355
6356
6357 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
6358 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6359 policy &= ~SCHED_RESET_ON_FORK;
6360 attr.sched_policy = policy;
6361 }
6362
6363 return __sched_setscheduler(p, &attr, check, true);
6364}
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
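/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread
 * @p: the task in question
 * @policy: new policy
 * @param: structure containing the new RT priority
 *
 * Applies the same permission and security checks as the syscall path.
 *
 * Return: 0 on success, a negative error code otherwise.
 */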
6377int sched_setscheduler(struct task_struct *p, int policy,
6378 const struct sched_param *param)
6379{
6380 return _sched_setscheduler(p, policy, param, true);
6381}
6382
6383int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
6384{
6385 return __sched_setscheduler(p, attr, true, true);
6386}
6387
6388int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
6389{
6390 return __sched_setscheduler(p, attr, false, true);
6391}
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
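/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority
 * of a thread from kernel context
 * @p: the task in question
 * @policy: new policy
 * @param: structure containing the new RT priority
 *
 * Like sched_setscheduler(), but without the capability and security checks,
 * so it is only suitable for the kernel's own threads.
 *
 * Return: 0 on success, a negative error code otherwise.
 */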
6406int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6407 const struct sched_param *param)
6408{
6409 return _sched_setscheduler(p, policy, param, false);
6410}
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
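/*
 * Interface for kernel users that need FIFO scheduling but should not pick a
 * specific priority themselves: sched_set_fifo() uses the middle of the RT
 * range (MAX_RT_PRIO / 2), sched_set_fifo_low() the lowest RT priority, and
 * sched_set_normal() returns a task to SCHED_NORMAL with a given nice value.
 * Fine tuning of RT priorities is left to the system administrator.
 */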
6430void sched_set_fifo(struct task_struct *p)
6431{
6432 struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
6433 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6434}
6435EXPORT_SYMBOL_GPL(sched_set_fifo);
6436
6437
6438
6439
6440void sched_set_fifo_low(struct task_struct *p)
6441{
6442 struct sched_param sp = { .sched_priority = 1 };
6443 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6444}
6445EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6446
6447void sched_set_normal(struct task_struct *p, int nice)
6448{
6449 struct sched_attr attr = {
6450 .sched_policy = SCHED_NORMAL,
6451 .sched_nice = nice,
6452 };
6453 WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6454}
6455EXPORT_SYMBOL_GPL(sched_set_normal);
6456
6457static int
6458do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6459{
6460 struct sched_param lparam;
6461 struct task_struct *p;
6462 int retval;
6463
6464 if (!param || pid < 0)
6465 return -EINVAL;
6466 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6467 return -EFAULT;
6468
6469 rcu_read_lock();
6470 retval = -ESRCH;
6471 p = find_process_by_pid(pid);
6472 if (likely(p))
6473 get_task_struct(p);
6474 rcu_read_unlock();
6475
6476 if (likely(p)) {
6477 retval = sched_setscheduler(p, policy, &lparam);
6478 put_task_struct(p);
6479 }
6480
6481 return retval;
6482}
6483
6484
6485
6486
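/*
 * Copy a sched_attr structure from user space, coping with ABI growth:
 * a zero size means the original VER0 layout, smaller structures are
 * zero-extended, and larger ones are only accepted when the extra tail is
 * zero (copy_struct_from_user() returns -E2BIG otherwise). The nice value
 * is clamped to the valid [MIN_NICE, MAX_NICE] range.
 */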
6487static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
6488{
6489 u32 size;
6490 int ret;
6491
6492
6493 memset(attr, 0, sizeof(*attr));
6494
6495 ret = get_user(size, &uattr->size);
6496 if (ret)
6497 return ret;
6498
6499
6500 if (!size)
6501 size = SCHED_ATTR_SIZE_VER0;
6502 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
6503 goto err_size;
6504
6505 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6506 if (ret) {
6507 if (ret == -E2BIG)
6508 goto err_size;
6509 return ret;
6510 }
6511
6512 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
6513 size < SCHED_ATTR_SIZE_VER1)
6514 return -EINVAL;
6515
6516
6517
6518
6519
6520 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
6521
6522 return 0;
6523
6524err_size:
6525 put_user(sizeof(*attr), &uattr->size);
6526 return -E2BIG;
6527}
6528
6529
6530
6531
6532
6533
6534
6535
6536
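/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question
 * @policy: new policy
 * @param: structure containing the new RT priority
 *
 * Return: 0 on success, a negative error code on failure.
 */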
6537SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
6538{
6539 if (policy < 0)
6540 return -EINVAL;
6541
6542 return do_sched_setscheduler(pid, policy, param);
6543}
6544
6545
6546
6547
6548
6549
6550
6551
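/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question
 * @param: structure containing the new RT priority
 *
 * The scheduling policy is left unchanged (SETPARAM_POLICY).
 *
 * Return: 0 on success, a negative error code on failure.
 */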
6552SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6553{
6554 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
6555}
6556
6557
6558
6559
6560
6561
6562
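/**
 * sys_sched_setattr - same as sys_sched_setscheduler(), but with the extended
 * sched_attr interface
 * @pid: the pid in question
 * @uattr: structure containing the extended parameters
 * @flags: reserved for future extension, must be 0
 */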
6563SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
6564 unsigned int, flags)
6565{
6566 struct sched_attr attr;
6567 struct task_struct *p;
6568 int retval;
6569
6570 if (!uattr || pid < 0 || flags)
6571 return -EINVAL;
6572
6573 retval = sched_copy_attr(uattr, &attr);
6574 if (retval)
6575 return retval;
6576
6577 if ((int)attr.sched_policy < 0)
6578 return -EINVAL;
6579 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
6580 attr.sched_policy = SETPARAM_POLICY;
6581
6582 rcu_read_lock();
6583 retval = -ESRCH;
6584 p = find_process_by_pid(pid);
6585 if (likely(p))
6586 get_task_struct(p);
6587 rcu_read_unlock();
6588
6589 if (likely(p)) {
6590 retval = sched_setattr(p, &attr);
6591 put_task_struct(p);
6592 }
6593
6594 return retval;
6595}
6596
6597
6598
6599
6600
6601
6602
6603
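/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question
 *
 * Return: the policy of @pid, with SCHED_RESET_ON_FORK OR'ed in when the
 * flag is set, or a negative error code on failure.
 */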
6604SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6605{
6606 struct task_struct *p;
6607 int retval;
6608
6609 if (pid < 0)
6610 return -EINVAL;
6611
6612 retval = -ESRCH;
6613 rcu_read_lock();
6614 p = find_process_by_pid(pid);
6615 if (p) {
6616 retval = security_task_getscheduler(p);
6617 if (!retval)
6618 retval = p->policy
6619 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6620 }
6621 rcu_read_unlock();
6622 return retval;
6623}
6624
6625
6626
6627
6628
6629
6630
6631
6632
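/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question
 * @param: structure containing the RT priority
 *
 * Return: 0 on success and the priority in @param (0 for non-RT policies),
 * or a negative error code on failure.
 */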
6633SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6634{
6635 struct sched_param lp = { .sched_priority = 0 };
6636 struct task_struct *p;
6637 int retval;
6638
6639 if (!param || pid < 0)
6640 return -EINVAL;
6641
6642 rcu_read_lock();
6643 p = find_process_by_pid(pid);
6644 retval = -ESRCH;
6645 if (!p)
6646 goto out_unlock;
6647
6648 retval = security_task_getscheduler(p);
6649 if (retval)
6650 goto out_unlock;
6651
6652 if (task_has_rt_policy(p))
6653 lp.sched_priority = p->rt_priority;
6654 rcu_read_unlock();
6655
6656
6657
6658
6659 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6660
6661 return retval;
6662
6663out_unlock:
6664 rcu_read_unlock();
6665 return retval;
6666}
6667
6668
6669
6670
6671
6672
6673
6674
6675
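/*
 * Copy the kernel sched_attr structure to user space, truncated to the size
 * the caller passed in so that binaries built against an older, smaller
 * sched_attr keep working; kattr->size reports how many bytes were copied.
 */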
6676static int
6677sched_attr_copy_to_user(struct sched_attr __user *uattr,
6678 struct sched_attr *kattr,
6679 unsigned int usize)
6680{
6681 unsigned int ksize = sizeof(*kattr);
6682
6683 if (!access_ok(uattr, usize))
6684 return -EFAULT;
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699 kattr->size = min(usize, ksize);
6700
6701 if (copy_to_user(uattr, kattr, kattr->size))
6702 return -EFAULT;
6703
6704 return 0;
6705}
6706
6707
6708
6709
6710
6711
6712
6713
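/**
 * sys_sched_getattr - similar to sched_getparam(), but with sched_attr
 * @pid: the pid in question
 * @uattr: structure containing the extended parameters
 * @usize: size of @uattr as known to user space
 * @flags: reserved for future extension, must be 0
 */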
6714SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
6715 unsigned int, usize, unsigned int, flags)
6716{
6717 struct sched_attr kattr = { };
6718 struct task_struct *p;
6719 int retval;
6720
6721 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
6722 usize < SCHED_ATTR_SIZE_VER0 || flags)
6723 return -EINVAL;
6724
6725 rcu_read_lock();
6726 p = find_process_by_pid(pid);
6727 retval = -ESRCH;
6728 if (!p)
6729 goto out_unlock;
6730
6731 retval = security_task_getscheduler(p);
6732 if (retval)
6733 goto out_unlock;
6734
6735 kattr.sched_policy = p->policy;
6736 if (p->sched_reset_on_fork)
6737 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6738 if (task_has_dl_policy(p))
6739 __getparam_dl(p, &kattr);
6740 else if (task_has_rt_policy(p))
6741 kattr.sched_priority = p->rt_priority;
6742 else
6743 kattr.sched_nice = task_nice(p);
6744
6745#ifdef CONFIG_UCLAMP_TASK
6746
6747
6748
6749
6750
6751 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
6752 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
6753#endif
6754
6755 rcu_read_unlock();
6756
6757 return sched_attr_copy_to_user(uattr, &kattr, usize);
6758
6759out_unlock:
6760 rcu_read_unlock();
6761 return retval;
6762}
6763
6764long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6765{
6766 cpumask_var_t cpus_allowed, new_mask;
6767 struct task_struct *p;
6768 int retval;
6769
6770 rcu_read_lock();
6771
6772 p = find_process_by_pid(pid);
6773 if (!p) {
6774 rcu_read_unlock();
6775 return -ESRCH;
6776 }
6777
6778
6779 get_task_struct(p);
6780 rcu_read_unlock();
6781
6782 if (p->flags & PF_NO_SETAFFINITY) {
6783 retval = -EINVAL;
6784 goto out_put_task;
6785 }
6786 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6787 retval = -ENOMEM;
6788 goto out_put_task;
6789 }
6790 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6791 retval = -ENOMEM;
6792 goto out_free_cpus_allowed;
6793 }
6794 retval = -EPERM;
6795 if (!check_same_owner(p)) {
6796 rcu_read_lock();
6797 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
6798 rcu_read_unlock();
6799 goto out_free_new_mask;
6800 }
6801 rcu_read_unlock();
6802 }
6803
6804 retval = security_task_setscheduler(p);
6805 if (retval)
6806 goto out_free_new_mask;
6807
6808
6809 cpuset_cpus_allowed(p, cpus_allowed);
6810 cpumask_and(new_mask, in_mask, cpus_allowed);
6811
6812
6813
6814
6815
6816
6817
6818#ifdef CONFIG_SMP
6819 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
6820 rcu_read_lock();
6821 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
6822 retval = -EBUSY;
6823 rcu_read_unlock();
6824 goto out_free_new_mask;
6825 }
6826 rcu_read_unlock();
6827 }
6828#endif
6829again:
6830 retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
6831
6832 if (!retval) {
6833 cpuset_cpus_allowed(p, cpus_allowed);
6834 if (!cpumask_subset(new_mask, cpus_allowed)) {
6835
6836
6837
6838
6839
6840 cpumask_copy(new_mask, cpus_allowed);
6841 goto again;
6842 }
6843 }
6844out_free_new_mask:
6845 free_cpumask_var(new_mask);
6846out_free_cpus_allowed:
6847 free_cpumask_var(cpus_allowed);
6848out_put_task:
6849 put_task_struct(p);
6850 return retval;
6851}
6852
6853static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6854 struct cpumask *new_mask)
6855{
6856 if (len < cpumask_size())
6857 cpumask_clear(new_mask);
6858 else if (len > cpumask_size())
6859 len = cpumask_size();
6860
6861 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6862}
6863
6864
6865
6866
6867
6868
6869
6870
6871
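/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success, a negative error code on failure.
 */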
6872SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6873 unsigned long __user *, user_mask_ptr)
6874{
6875 cpumask_var_t new_mask;
6876 int retval;
6877
6878 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6879 return -ENOMEM;
6880
6881 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6882 if (retval == 0)
6883 retval = sched_setaffinity(pid, new_mask);
6884 free_cpumask_var(new_mask);
6885 return retval;
6886}
6887
6888long sched_getaffinity(pid_t pid, struct cpumask *mask)
6889{
6890 struct task_struct *p;
6891 unsigned long flags;
6892 int retval;
6893
6894 rcu_read_lock();
6895
6896 retval = -ESRCH;
6897 p = find_process_by_pid(pid);
6898 if (!p)
6899 goto out_unlock;
6900
6901 retval = security_task_getscheduler(p);
6902 if (retval)
6903 goto out_unlock;
6904
6905 raw_spin_lock_irqsave(&p->pi_lock, flags);
6906 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6907 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6908
6909out_unlock:
6910 rcu_read_unlock();
6911
6912 return retval;
6913}
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
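/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * Return: the size (in bytes) of the mask copied to user space on success,
 * or a negative error code on failure.
 */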
6924SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6925 unsigned long __user *, user_mask_ptr)
6926{
6927 int ret;
6928 cpumask_var_t mask;
6929
6930 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
6931 return -EINVAL;
6932 if (len & (sizeof(unsigned long)-1))
6933 return -EINVAL;
6934
6935 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6936 return -ENOMEM;
6937
6938 ret = sched_getaffinity(pid, mask);
6939 if (ret == 0) {
6940 unsigned int retlen = min(len, cpumask_size());
6941
6942 if (copy_to_user(user_mask_ptr, mask, retlen))
6943 ret = -EFAULT;
6944 else
6945 ret = retlen;
6946 }
6947 free_cpumask_var(mask);
6948
6949 return ret;
6950}
6951
6952static void do_sched_yield(void)
6953{
6954 struct rq_flags rf;
6955 struct rq *rq;
6956
6957 rq = this_rq_lock_irq(&rf);
6958
6959 schedstat_inc(rq->yld_count);
6960 current->sched_class->yield_task(rq);
6961
6962 preempt_disable();
6963 rq_unlock_irq(rq, &rf);
6964 sched_preempt_enable_no_resched();
6965
6966 schedule();
6967}
6968
6969
6970
6971
6972
6973
6974
6975
6976
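/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no other
 * threads running on this CPU then this function will return immediately.
 *
 * Return: 0.
 */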
6977SYSCALL_DEFINE0(sched_yield)
6978{
6979 do_sched_yield();
6980 return 0;
6981}
6982
6983#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
6984int __sched __cond_resched(void)
6985{
6986 if (should_resched(0)) {
6987 preempt_schedule_common();
6988 return 1;
6989 }
6990#ifndef CONFIG_PREEMPT_RCU
6991 rcu_all_qs();
6992#endif
6993 return 0;
6994}
6995EXPORT_SYMBOL(__cond_resched);
6996#endif
6997
6998#ifdef CONFIG_PREEMPT_DYNAMIC
6999DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
7000EXPORT_STATIC_CALL_TRAMP(cond_resched);
7001
7002DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
7003EXPORT_STATIC_CALL_TRAMP(might_resched);
7004#endif
7005
7006
7007
7008
7009
7010
7011
7012
7013
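/*
 * __cond_resched_lock() - if a reschedule is pending or another waiter
 * contends on @lock, drop the lock (rescheduling or relaxing the CPU as
 * appropriate), then reacquire it. Returns 1 if the lock was dropped.
 */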
7014int __cond_resched_lock(spinlock_t *lock)
7015{
7016 int resched = should_resched(PREEMPT_LOCK_OFFSET);
7017 int ret = 0;
7018
7019 lockdep_assert_held(lock);
7020
7021 if (spin_needbreak(lock) || resched) {
7022 spin_unlock(lock);
7023 if (resched)
7024 preempt_schedule_common();
7025 else
7026 cpu_relax();
7027 ret = 1;
7028 spin_lock(lock);
7029 }
7030 return ret;
7031}
7032EXPORT_SYMBOL(__cond_resched_lock);
7033
7034int __cond_resched_rwlock_read(rwlock_t *lock)
7035{
7036 int resched = should_resched(PREEMPT_LOCK_OFFSET);
7037 int ret = 0;
7038
7039 lockdep_assert_held_read(lock);
7040
7041 if (rwlock_needbreak(lock) || resched) {
7042 read_unlock(lock);
7043 if (resched)
7044 preempt_schedule_common();
7045 else
7046 cpu_relax();
7047 ret = 1;
7048 read_lock(lock);
7049 }
7050 return ret;
7051}
7052EXPORT_SYMBOL(__cond_resched_rwlock_read);
7053
7054int __cond_resched_rwlock_write(rwlock_t *lock)
7055{
7056 int resched = should_resched(PREEMPT_LOCK_OFFSET);
7057 int ret = 0;
7058
7059 lockdep_assert_held_write(lock);
7060
7061 if (rwlock_needbreak(lock) || resched) {
7062 write_unlock(lock);
7063 if (resched)
7064 preempt_schedule_common();
7065 else
7066 cpu_relax();
7067 ret = 1;
7068 write_lock(lock);
7069 }
7070 return ret;
7071}
7072EXPORT_SYMBOL(__cond_resched_rwlock_write);
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
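/**
 * yield - yield the current processor to other threads.
 *
 * This is only a hint to the scheduler: the task is put back to TASK_RUNNING
 * and the CPU is yielded via do_sched_yield(). Code that needs to wait for a
 * specific condition is usually better served by a proper wait primitive
 * than by spinning on yield().
 */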
7096void __sched yield(void)
7097{
7098 set_current_state(TASK_RUNNING);
7099 do_sched_yield();
7100}
7101EXPORT_SYMBOL(yield);
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
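/**
 * yield_to - yield the current processor to another runnable task, or
 * accelerate that task toward the CPU it is queued on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * The caller must ensure @p cannot go away before the checks below are done.
 *
 * Return: >0 if the target task was boosted, 0 if the yield failed, or
 * -ESRCH when there is nothing to yield to.
 */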
7118int __sched yield_to(struct task_struct *p, bool preempt)
7119{
7120 struct task_struct *curr = current;
7121 struct rq *rq, *p_rq;
7122 unsigned long flags;
7123 int yielded = 0;
7124
7125 local_irq_save(flags);
7126 rq = this_rq();
7127
7128again:
7129 p_rq = task_rq(p);
7130
7131
7132
7133
7134 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
7135 yielded = -ESRCH;
7136 goto out_irq;
7137 }
7138
7139 double_rq_lock(rq, p_rq);
7140 if (task_rq(p) != p_rq) {
7141 double_rq_unlock(rq, p_rq);
7142 goto again;
7143 }
7144
7145 if (!curr->sched_class->yield_to_task)
7146 goto out_unlock;
7147
7148 if (curr->sched_class != p->sched_class)
7149 goto out_unlock;
7150
7151 if (task_running(p_rq, p) || p->state)
7152 goto out_unlock;
7153
7154 yielded = curr->sched_class->yield_to_task(rq, p);
7155 if (yielded) {
7156 schedstat_inc(rq->yld_count);
7157
7158
7159
7160
7161 if (preempt && rq != p_rq)
7162 resched_curr(p_rq);
7163 }
7164
7165out_unlock:
7166 double_rq_unlock(rq, p_rq);
7167out_irq:
7168 local_irq_restore(flags);
7169
7170 if (yielded > 0)
7171 schedule();
7172
7173 return yielded;
7174}
7175EXPORT_SYMBOL_GPL(yield_to);
7176
7177int io_schedule_prepare(void)
7178{
7179 int old_iowait = current->in_iowait;
7180
7181 current->in_iowait = 1;
7182 blk_schedule_flush_plug(current);
7183
7184 return old_iowait;
7185}
7186
7187void io_schedule_finish(int token)
7188{
7189 current->in_iowait = token;
7190}
7191
7192
7193
7194
7195
7196long __sched io_schedule_timeout(long timeout)
7197{
7198 int token;
7199 long ret;
7200
7201 token = io_schedule_prepare();
7202 ret = schedule_timeout(timeout);
7203 io_schedule_finish(token);
7204
7205 return ret;
7206}
7207EXPORT_SYMBOL(io_schedule_timeout);
7208
7209void __sched io_schedule(void)
7210{
7211 int token;
7212
7213 token = io_schedule_prepare();
7214 schedule();
7215 io_schedule_finish(token);
7216}
7217EXPORT_SYMBOL(io_schedule);
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
7228{
7229 int ret = -EINVAL;
7230
7231 switch (policy) {
7232 case SCHED_FIFO:
7233 case SCHED_RR:
7234 ret = MAX_RT_PRIO-1;
7235 break;
7236 case SCHED_DEADLINE:
7237 case SCHED_NORMAL:
7238 case SCHED_BATCH:
7239 case SCHED_IDLE:
7240 ret = 0;
7241 break;
7242 }
7243 return ret;
7244}
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
7255{
7256 int ret = -EINVAL;
7257
7258 switch (policy) {
7259 case SCHED_FIFO:
7260 case SCHED_RR:
7261 ret = 1;
7262 break;
7263 case SCHED_DEADLINE:
7264 case SCHED_NORMAL:
7265 case SCHED_BATCH:
7266 case SCHED_IDLE:
7267 ret = 0;
7268 }
7269 return ret;
7270}
7271
7272static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
7273{
7274 struct task_struct *p;
7275 unsigned int time_slice;
7276 struct rq_flags rf;
7277 struct rq *rq;
7278 int retval;
7279
7280 if (pid < 0)
7281 return -EINVAL;
7282
7283 retval = -ESRCH;
7284 rcu_read_lock();
7285 p = find_process_by_pid(pid);
7286 if (!p)
7287 goto out_unlock;
7288
7289 retval = security_task_getscheduler(p);
7290 if (retval)
7291 goto out_unlock;
7292
7293 rq = task_rq_lock(p, &rf);
7294 time_slice = 0;
7295 if (p->sched_class->get_rr_interval)
7296 time_slice = p->sched_class->get_rr_interval(rq, p);
7297 task_rq_unlock(rq, p, &rf);
7298
7299 rcu_read_unlock();
7300 jiffies_to_timespec64(time_slice, t);
7301 return 0;
7302
7303out_unlock:
7304 rcu_read_unlock();
7305 return retval;
7306}
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
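/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * Writes the default timeslice of the given process into @interval; a value
 * of 0 means the timeslice is effectively infinite.
 *
 * Return: 0 on success, a negative error code on failure.
 */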
7319SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
7320 struct __kernel_timespec __user *, interval)
7321{
7322 struct timespec64 t;
7323 int retval = sched_rr_get_interval(pid, &t);
7324
7325 if (retval == 0)
7326 retval = put_timespec64(&t, interval);
7327
7328 return retval;
7329}
7330
7331#ifdef CONFIG_COMPAT_32BIT_TIME
7332SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7333 struct old_timespec32 __user *, interval)
7334{
7335 struct timespec64 t;
7336 int retval = sched_rr_get_interval(pid, &t);
7337
7338 if (retval == 0)
7339 retval = put_old_timespec32(&t, interval);
7340 return retval;
7341}
7342#endif
7343
7344void sched_show_task(struct task_struct *p)
7345{
7346 unsigned long free = 0;
7347 int ppid;
7348
7349 if (!try_get_task_stack(p))
7350 return;
7351
7352 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
7353
7354 if (p->state == TASK_RUNNING)
7355 pr_cont(" running task ");
7356#ifdef CONFIG_DEBUG_STACK_USAGE
7357 free = stack_not_used(p);
7358#endif
7359 ppid = 0;
7360 rcu_read_lock();
7361 if (pid_alive(p))
7362 ppid = task_pid_nr(rcu_dereference(p->real_parent));
7363 rcu_read_unlock();
7364 pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
7365 free, task_pid_nr(p), ppid,
7366 (unsigned long)task_thread_info(p)->flags);
7367
7368 print_worker_info(KERN_INFO, p);
7369 print_stop_info(KERN_INFO, p);
7370 show_stack(p, NULL, KERN_INFO);
7371 put_task_stack(p);
7372}
7373EXPORT_SYMBOL_GPL(sched_show_task);
7374
7375static inline bool
7376state_filter_match(unsigned long state_filter, struct task_struct *p)
7377{
7378
7379 if (!state_filter)
7380 return true;
7381
7382
7383 if (!(p->state & state_filter))
7384 return false;
7385
7386
7387
7388
7389
7390 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
7391 return false;
7392
7393 return true;
7394}
7395
7396
7397void show_state_filter(unsigned long state_filter)
7398{
7399 struct task_struct *g, *p;
7400
7401 rcu_read_lock();
7402 for_each_process_thread(g, p) {
7403
7404
7405
7406
7407
7408
7409
7410 touch_nmi_watchdog();
7411 touch_all_softlockup_watchdogs();
7412 if (state_filter_match(state_filter, p))
7413 sched_show_task(p);
7414 }
7415
7416#ifdef CONFIG_SCHED_DEBUG
7417 if (!state_filter)
7418 sysrq_sched_debug_show();
7419#endif
7420 rcu_read_unlock();
7421
7422
7423
7424 if (!state_filter)
7425 debug_show_all_locks();
7426}
7427
7428
7429
7430
7431
7432
7433
7434
7435
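/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * Marks the task TASK_RUNNING, pins it to @cpu, installs it as rq->idle and
 * rq->curr, and switches it to the idle scheduling class.
 */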
7436void init_idle(struct task_struct *idle, int cpu)
7437{
7438 struct rq *rq = cpu_rq(cpu);
7439 unsigned long flags;
7440
7441 __sched_fork(0, idle);
7442
7443 raw_spin_lock_irqsave(&idle->pi_lock, flags);
7444 raw_spin_lock(&rq->lock);
7445
7446 idle->state = TASK_RUNNING;
7447 idle->se.exec_start = sched_clock();
7448 idle->flags |= PF_IDLE;
7449
7450 scs_task_reset(idle);
7451 kasan_unpoison_task_stack(idle);
7452
7453#ifdef CONFIG_SMP
7454
7455
7456
7457
7458
7459
7460 set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
7461#endif
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472 rcu_read_lock();
7473 __set_task_cpu(idle, cpu);
7474 rcu_read_unlock();
7475
7476 rq->idle = idle;
7477 rcu_assign_pointer(rq->curr, idle);
7478 idle->on_rq = TASK_ON_RQ_QUEUED;
7479#ifdef CONFIG_SMP
7480 idle->on_cpu = 1;
7481#endif
7482 raw_spin_unlock(&rq->lock);
7483 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
7484
7485
7486 init_idle_preempt_count(idle, cpu);
7487
7488
7489
7490
7491 idle->sched_class = &idle_sched_class;
7492 ftrace_graph_init_idle_task(idle, cpu);
7493 vtime_init_idle(idle, cpu);
7494#ifdef CONFIG_SMP
7495 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
7496#endif
7497}
7498
7499#ifdef CONFIG_SMP
7500
7501int cpuset_cpumask_can_shrink(const struct cpumask *cur,
7502 const struct cpumask *trial)
7503{
7504 int ret = 1;
7505
7506 if (!cpumask_weight(cur))
7507 return ret;
7508
7509 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
7510
7511 return ret;
7512}
7513
7514int task_can_attach(struct task_struct *p,
7515 const struct cpumask *cs_cpus_allowed)
7516{
7517 int ret = 0;
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528 if (p->flags & PF_NO_SETAFFINITY) {
7529 ret = -EINVAL;
7530 goto out;
7531 }
7532
7533 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
7534 cs_cpus_allowed))
7535 ret = dl_task_can_attach(p, cs_cpus_allowed);
7536
7537out:
7538 return ret;
7539}
7540
7541bool sched_smp_initialized __read_mostly;
7542
7543#ifdef CONFIG_NUMA_BALANCING
7544
7545int migrate_task_to(struct task_struct *p, int target_cpu)
7546{
7547 struct migration_arg arg = { p, target_cpu };
7548 int curr_cpu = task_cpu(p);
7549
7550 if (curr_cpu == target_cpu)
7551 return 0;
7552
7553 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
7554 return -EINVAL;
7555
7556
7557
7558 trace_sched_move_numa(p, curr_cpu, target_cpu);
7559 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
7560}
7561
7562
7563
7564
7565
7566void sched_setnuma(struct task_struct *p, int nid)
7567{
7568 bool queued, running;
7569 struct rq_flags rf;
7570 struct rq *rq;
7571
7572 rq = task_rq_lock(p, &rf);
7573 queued = task_on_rq_queued(p);
7574 running = task_current(rq, p);
7575
7576 if (queued)
7577 dequeue_task(rq, p, DEQUEUE_SAVE);
7578 if (running)
7579 put_prev_task(rq, p);
7580
7581 p->numa_preferred_nid = nid;
7582
7583 if (queued)
7584 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
7585 if (running)
7586 set_next_task(rq, p);
7587 task_rq_unlock(rq, p, &rf);
7588}
7589#endif
7590
7591#ifdef CONFIG_HOTPLUG_CPU
7592
7593
7594
7595
7596void idle_task_exit(void)
7597{
7598 struct mm_struct *mm = current->active_mm;
7599
7600 BUG_ON(cpu_online(smp_processor_id()));
7601 BUG_ON(current != this_rq()->idle);
7602
7603 if (mm != &init_mm) {
7604 switch_mm(mm, &init_mm, current);
7605 finish_arch_post_lock_switch();
7606 }
7607
7608
7609}
7610
7611static int __balance_push_cpu_stop(void *arg)
7612{
7613 struct task_struct *p = arg;
7614 struct rq *rq = this_rq();
7615 struct rq_flags rf;
7616 int cpu;
7617
7618 raw_spin_lock_irq(&p->pi_lock);
7619 rq_lock(rq, &rf);
7620
7621 update_rq_clock(rq);
7622
7623 if (task_rq(p) == rq && task_on_rq_queued(p)) {
7624 cpu = select_fallback_rq(rq->cpu, p);
7625 rq = __migrate_task(rq, &rf, p, cpu);
7626 }
7627
7628 rq_unlock(rq, &rf);
7629 raw_spin_unlock_irq(&p->pi_lock);
7630
7631 put_task_struct(p);
7632
7633 return 0;
7634}
7635
7636static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
7637
7638
7639
7640
7641
7642
7643
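/*
 * Balance callback installed on an outgoing CPU: once the CPU is marked
 * dying, push the currently running task somewhere else (via the stopper)
 * so that eventually only the idle task, pinned per-CPU kthreads and
 * migrate-disabled tasks remain on the runqueue.
 */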
7644static void balance_push(struct rq *rq)
7645{
7646 struct task_struct *push_task = rq->curr;
7647
7648 lockdep_assert_held(&rq->lock);
7649 SCHED_WARN_ON(rq->cpu != smp_processor_id());
7650
7651
7652
7653
7654 rq->balance_callback = &balance_push_callback;
7655
7656
7657
7658
7659 if (!cpu_dying(rq->cpu))
7660 return;
7661
7662
7663
7664
7665
7666
7667
7668
7669 if (rq->idle == push_task ||
7670 kthread_is_per_cpu(push_task) ||
7671 is_migration_disabled(push_task)) {
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684 if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
7685 rcuwait_active(&rq->hotplug_wait)) {
7686 raw_spin_unlock(&rq->lock);
7687 rcuwait_wake_up(&rq->hotplug_wait);
7688 raw_spin_lock(&rq->lock);
7689 }
7690 return;
7691 }
7692
7693 get_task_struct(push_task);
7694
7695
7696
7697
7698 raw_spin_unlock(&rq->lock);
7699 stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
7700 this_cpu_ptr(&push_work));
7701
7702
7703
7704
7705
7706 raw_spin_lock(&rq->lock);
7707}
7708
7709static void balance_push_set(int cpu, bool on)
7710{
7711 struct rq *rq = cpu_rq(cpu);
7712 struct rq_flags rf;
7713
7714 rq_lock_irqsave(rq, &rf);
7715 if (on) {
7716 WARN_ON_ONCE(rq->balance_callback);
7717 rq->balance_callback = &balance_push_callback;
7718 } else if (rq->balance_callback == &balance_push_callback) {
7719 rq->balance_callback = NULL;
7720 }
7721 rq_unlock_irqrestore(rq, &rf);
7722}
7723
7724
7725
7726
7727
7728
7729
7730static void balance_hotplug_wait(void)
7731{
7732 struct rq *rq = this_rq();
7733
7734 rcuwait_wait_event(&rq->hotplug_wait,
7735 rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
7736 TASK_UNINTERRUPTIBLE);
7737}
7738
7739#else
7740
7741static inline void balance_push(struct rq *rq)
7742{
7743}
7744
7745static inline void balance_push_set(int cpu, bool on)
7746{
7747}
7748
7749static inline void balance_hotplug_wait(void)
7750{
7751}
7752
7753#endif
7754
7755void set_rq_online(struct rq *rq)
7756{
7757 if (!rq->online) {
7758 const struct sched_class *class;
7759
7760 cpumask_set_cpu(rq->cpu, rq->rd->online);
7761 rq->online = 1;
7762
7763 for_each_class(class) {
7764 if (class->rq_online)
7765 class->rq_online(rq);
7766 }
7767 }
7768}
7769
7770void set_rq_offline(struct rq *rq)
7771{
7772 if (rq->online) {
7773 const struct sched_class *class;
7774
7775 for_each_class(class) {
7776 if (class->rq_offline)
7777 class->rq_offline(rq);
7778 }
7779
7780 cpumask_clear_cpu(rq->cpu, rq->rd->online);
7781 rq->online = 0;
7782 }
7783}
7784
7785
7786
7787
7788static int num_cpus_frozen;
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798static void cpuset_cpu_active(void)
7799{
7800 if (cpuhp_tasks_frozen) {
7801
7802
7803
7804
7805
7806
7807 partition_sched_domains(1, NULL, NULL);
7808 if (--num_cpus_frozen)
7809 return;
7810
7811
7812
7813
7814
7815 cpuset_force_rebuild();
7816 }
7817 cpuset_update_active_cpus();
7818}
7819
7820static int cpuset_cpu_inactive(unsigned int cpu)
7821{
7822 if (!cpuhp_tasks_frozen) {
7823 if (dl_cpu_busy(cpu))
7824 return -EBUSY;
7825 cpuset_update_active_cpus();
7826 } else {
7827 num_cpus_frozen++;
7828 partition_sched_domains(1, NULL, NULL);
7829 }
7830 return 0;
7831}
7832
7833int sched_cpu_activate(unsigned int cpu)
7834{
7835 struct rq *rq = cpu_rq(cpu);
7836 struct rq_flags rf;
7837
7838
7839
7840
7841
7842 balance_push_set(cpu, false);
7843
7844#ifdef CONFIG_SCHED_SMT
7845
7846
7847
7848 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
7849 static_branch_inc_cpuslocked(&sched_smt_present);
7850#endif
7851 set_cpu_active(cpu, true);
7852
7853 if (sched_smp_initialized) {
7854 sched_domains_numa_masks_set(cpu);
7855 cpuset_cpu_active();
7856 }
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867 rq_lock_irqsave(rq, &rf);
7868 if (rq->rd) {
7869 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7870 set_rq_online(rq);
7871 }
7872 rq_unlock_irqrestore(rq, &rf);
7873
7874 return 0;
7875}
7876
7877int sched_cpu_deactivate(unsigned int cpu)
7878{
7879 struct rq *rq = cpu_rq(cpu);
7880 struct rq_flags rf;
7881 int ret;
7882
7883
7884
7885
7886
7887 nohz_balance_exit_idle(rq);
7888
7889 set_cpu_active(cpu, false);
7890
7891
7892
7893
7894
7895
7896
7897 balance_push_set(cpu, true);
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909 synchronize_rcu();
7910
7911 rq_lock_irqsave(rq, &rf);
7912 if (rq->rd) {
7913 update_rq_clock(rq);
7914 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7915 set_rq_offline(rq);
7916 }
7917 rq_unlock_irqrestore(rq, &rf);
7918
7919#ifdef CONFIG_SCHED_SMT
7920
7921
7922
7923 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
7924 static_branch_dec_cpuslocked(&sched_smt_present);
7925#endif
7926
7927 if (!sched_smp_initialized)
7928 return 0;
7929
7930 ret = cpuset_cpu_inactive(cpu);
7931 if (ret) {
7932 balance_push_set(cpu, false);
7933 set_cpu_active(cpu, true);
7934 return ret;
7935 }
7936 sched_domains_numa_masks_clear(cpu);
7937 return 0;
7938}
7939
7940static void sched_rq_cpu_starting(unsigned int cpu)
7941{
7942 struct rq *rq = cpu_rq(cpu);
7943
7944 rq->calc_load_update = calc_load_update;
7945 update_max_interval();
7946}
7947
7948int sched_cpu_starting(unsigned int cpu)
7949{
7950 sched_rq_cpu_starting(cpu);
7951 sched_tick_start(cpu);
7952 return 0;
7953}
7954
7955#ifdef CONFIG_HOTPLUG_CPU
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
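/*
 * Invoked late in CPU hot-unplug: wait (via balance_hotplug_wait()) until the
 * outgoing CPU's runqueue holds only a single runnable task, the caller, and
 * no migrate-disabled tasks are left pinned to it.
 */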
7968int sched_cpu_wait_empty(unsigned int cpu)
7969{
7970 balance_hotplug_wait();
7971 return 0;
7972}
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983static void calc_load_migrate(struct rq *rq)
7984{
7985 long delta = calc_load_fold_active(rq, 1);
7986
7987 if (delta)
7988 atomic_long_add(delta, &calc_load_tasks);
7989}
7990
7991static void dump_rq_tasks(struct rq *rq, const char *loglvl)
7992{
7993 struct task_struct *g, *p;
7994 int cpu = cpu_of(rq);
7995
7996 lockdep_assert_held(&rq->lock);
7997
7998 printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
7999 for_each_process_thread(g, p) {
8000 if (task_cpu(p) != cpu)
8001 continue;
8002
8003 if (!task_on_rq_queued(p))
8004 continue;
8005
8006 printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
8007 }
8008}
8009
8010int sched_cpu_dying(unsigned int cpu)
8011{
8012 struct rq *rq = cpu_rq(cpu);
8013 struct rq_flags rf;
8014
8015
8016 sched_tick_stop(cpu);
8017
8018 rq_lock_irqsave(rq, &rf);
8019 if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
8020 WARN(true, "Dying CPU not properly vacated!");
8021 dump_rq_tasks(rq, KERN_WARNING);
8022 }
8023 rq_unlock_irqrestore(rq, &rf);
8024
8025 calc_load_migrate(rq);
8026 update_max_interval();
8027 hrtick_clear(rq);
8028 return 0;
8029}
8030#endif
8031
8032void __init sched_init_smp(void)
8033{
8034 sched_init_numa();
8035
8036
8037
8038
8039
8040
8041 mutex_lock(&sched_domains_mutex);
8042 sched_init_domains(cpu_active_mask);
8043 mutex_unlock(&sched_domains_mutex);
8044
8045
8046 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
8047 BUG();
8048 sched_init_granularity();
8049
8050 init_sched_rt_class();
8051 init_sched_dl_class();
8052
8053 sched_smp_initialized = true;
8054}
8055
8056static int __init migration_init(void)
8057{
8058 sched_cpu_starting(smp_processor_id());
8059 return 0;
8060}
8061early_initcall(migration_init);
8062
8063#else
8064void __init sched_init_smp(void)
8065{
8066 sched_init_granularity();
8067}
8068#endif
8069
8070int in_sched_functions(unsigned long addr)
8071{
8072 return in_lock_functions(addr) ||
8073 (addr >= (unsigned long)__sched_text_start
8074 && addr < (unsigned long)__sched_text_end);
8075}
8076
8077#ifdef CONFIG_CGROUP_SCHED
8078
8079
8080
8081
8082struct task_group root_task_group;
8083LIST_HEAD(task_groups);
8084
8085
8086static struct kmem_cache *task_group_cache __read_mostly;
8087#endif
8088
8089DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
8090DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
8091
8092void __init sched_init(void)
8093{
8094 unsigned long ptr = 0;
8095 int i;
8096
8097
8098 BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
8099 &fair_sched_class + 1 != &rt_sched_class ||
8100 &rt_sched_class + 1 != &dl_sched_class);
8101#ifdef CONFIG_SMP
8102 BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
8103#endif
8104
8105 wait_bit_init();
8106
8107#ifdef CONFIG_FAIR_GROUP_SCHED
8108 ptr += 2 * nr_cpu_ids * sizeof(void **);
8109#endif
8110#ifdef CONFIG_RT_GROUP_SCHED
8111 ptr += 2 * nr_cpu_ids * sizeof(void **);
8112#endif
8113 if (ptr) {
8114 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
8115
8116#ifdef CONFIG_FAIR_GROUP_SCHED
8117 root_task_group.se = (struct sched_entity **)ptr;
8118 ptr += nr_cpu_ids * sizeof(void **);
8119
8120 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8121 ptr += nr_cpu_ids * sizeof(void **);
8122
8123 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8124 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8125#endif
8126#ifdef CONFIG_RT_GROUP_SCHED
8127 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8128 ptr += nr_cpu_ids * sizeof(void **);
8129
8130 root_task_group.rt_rq = (struct rt_rq **)ptr;
8131 ptr += nr_cpu_ids * sizeof(void **);
8132
8133#endif
8134 }
8135#ifdef CONFIG_CPUMASK_OFFSTACK
8136 for_each_possible_cpu(i) {
8137 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
8138 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
8139 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
8140 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
8141 }
8142#endif
8143
8144 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
8145 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
8146
8147#ifdef CONFIG_SMP
8148 init_defrootdomain();
8149#endif
8150
8151#ifdef CONFIG_RT_GROUP_SCHED
8152 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8153 global_rt_period(), global_rt_runtime());
8154#endif
8155
8156#ifdef CONFIG_CGROUP_SCHED
8157 task_group_cache = KMEM_CACHE(task_group, 0);
8158
8159 list_add(&root_task_group.list, &task_groups);
8160 INIT_LIST_HEAD(&root_task_group.children);
8161 INIT_LIST_HEAD(&root_task_group.siblings);
8162 autogroup_init(&init_task);
8163#endif
8164
8165 for_each_possible_cpu(i) {
8166 struct rq *rq;
8167
8168 rq = cpu_rq(i);
8169 raw_spin_lock_init(&rq->lock);
8170 rq->nr_running = 0;
8171 rq->calc_load_active = 0;
8172 rq->calc_load_update = jiffies + LOAD_FREQ;
8173 init_cfs_rq(&rq->cfs);
8174 init_rt_rq(&rq->rt);
8175 init_dl_rq(&rq->dl);
8176#ifdef CONFIG_FAIR_GROUP_SCHED
8177 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8178 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8199#endif
8200
8201 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8202#ifdef CONFIG_RT_GROUP_SCHED
8203 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8204#endif
8205#ifdef CONFIG_SMP
8206 rq->sd = NULL;
8207 rq->rd = NULL;
8208 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
8209 rq->balance_callback = &balance_push_callback;
8210 rq->active_balance = 0;
8211 rq->next_balance = jiffies;
8212 rq->push_cpu = 0;
8213 rq->cpu = i;
8214 rq->online = 0;
8215 rq->idle_stamp = 0;
8216 rq->avg_idle = 2*sysctl_sched_migration_cost;
8217 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
8218
8219 INIT_LIST_HEAD(&rq->cfs_tasks);
8220
8221 rq_attach_root(rq, &def_root_domain);
8222#ifdef CONFIG_NO_HZ_COMMON
8223 rq->last_blocked_load_update_tick = jiffies;
8224 atomic_set(&rq->nohz_flags, 0);
8225
8226 INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
8227#endif
8228#ifdef CONFIG_HOTPLUG_CPU
8229 rcuwait_init(&rq->hotplug_wait);
8230#endif
8231#endif
8232 hrtick_rq_init(rq);
8233 atomic_set(&rq->nr_iowait, 0);
8234 }
8235
8236 set_load_weight(&init_task, false);
8237
8238
8239
8240
8241 mmgrab(&init_mm);
8242 enter_lazy_tlb(&init_mm, current);
8243
8244
8245
8246
8247
8248
8249
8250 init_idle(current, smp_processor_id());
8251
8252 calc_load_update = jiffies + LOAD_FREQ;
8253
8254#ifdef CONFIG_SMP
8255 idle_thread_set_boot_cpu();
8256 balance_push_set(smp_processor_id(), false);
8257#endif
8258 init_sched_fair_class();
8259
8260 init_schedstats();
8261
8262 psi_init();
8263
8264 init_uclamp();
8265
8266 scheduler_running = 1;
8267}
8268
8269#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8270static inline int preempt_count_equals(int preempt_offset)
8271{
8272 int nested = preempt_count() + rcu_preempt_depth();
8273
8274 return (nested == preempt_offset);
8275}
8276
8277void __might_sleep(const char *file, int line, int preempt_offset)
8278{
8279
8280
8281
8282
8283
8284 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
8285 "do not call blocking ops when !TASK_RUNNING; "
8286 "state=%lx set at [<%p>] %pS\n",
8287 current->state,
8288 (void *)current->task_state_change,
8289 (void *)current->task_state_change);
8290
8291 ___might_sleep(file, line, preempt_offset);
8292}
8293EXPORT_SYMBOL(__might_sleep);
8294
8295void ___might_sleep(const char *file, int line, int preempt_offset)
8296{
8297
8298 static unsigned long prev_jiffy;
8299
8300 unsigned long preempt_disable_ip;
8301
8302
8303 rcu_sleep_check();
8304
8305 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
8306 !is_idle_task(current) && !current->non_block_count) ||
8307 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
8308 oops_in_progress)
8309 return;
8310
8311 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8312 return;
8313 prev_jiffy = jiffies;
8314
8315
8316 preempt_disable_ip = get_preempt_disable_ip(current);
8317
8318 printk(KERN_ERR
8319 "BUG: sleeping function called from invalid context at %s:%d\n",
8320 file, line);
8321 printk(KERN_ERR
8322 "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
8323 in_atomic(), irqs_disabled(), current->non_block_count,
8324 current->pid, current->comm);
8325
8326 if (task_stack_end_corrupted(current))
8327 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
8328
8329 debug_show_held_locks(current);
8330 if (irqs_disabled())
8331 print_irqtrace_events(current);
8332 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
8333 && !preempt_count_equals(preempt_offset)) {
8334 pr_err("Preemption disabled at:");
8335 print_ip_sym(KERN_ERR, preempt_disable_ip);
8336 }
8337 dump_stack();
8338 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8339}
8340EXPORT_SYMBOL(___might_sleep);
8341
8342void __cant_sleep(const char *file, int line, int preempt_offset)
8343{
8344 static unsigned long prev_jiffy;
8345
8346 if (irqs_disabled())
8347 return;
8348
8349 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8350 return;
8351
8352 if (preempt_count() > preempt_offset)
8353 return;
8354
8355 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8356 return;
8357 prev_jiffy = jiffies;
8358
8359 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8360 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8361 in_atomic(), irqs_disabled(),
8362 current->pid, current->comm);
8363
8364 debug_show_held_locks(current);
8365 dump_stack();
8366 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8367}
8368EXPORT_SYMBOL_GPL(__cant_sleep);
8369
8370#ifdef CONFIG_SMP
8371void __cant_migrate(const char *file, int line)
8372{
8373 static unsigned long prev_jiffy;
8374
8375 if (irqs_disabled())
8376 return;
8377
8378 if (is_migration_disabled(current))
8379 return;
8380
8381 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8382 return;
8383
8384 if (preempt_count() > 0)
8385 return;
8386
8387 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8388 return;
8389 prev_jiffy = jiffies;
8390
8391 pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
8392 pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
8393 in_atomic(), irqs_disabled(), is_migration_disabled(current),
8394 current->pid, current->comm);
8395
8396 debug_show_held_locks(current);
8397 dump_stack();
8398 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8399}
8400EXPORT_SYMBOL_GPL(__cant_migrate);
8401#endif
8402#endif
8403
8404#ifdef CONFIG_MAGIC_SYSRQ
8405void normalize_rt_tasks(void)
8406{
8407 struct task_struct *g, *p;
8408 struct sched_attr attr = {
8409 .sched_policy = SCHED_NORMAL,
8410 };
8411
8412 read_lock(&tasklist_lock);
8413 for_each_process_thread(g, p) {
8414
8415
8416
8417 if (p->flags & PF_KTHREAD)
8418 continue;
8419
8420 p->se.exec_start = 0;
8421 schedstat_set(p->se.statistics.wait_start, 0);
8422 schedstat_set(p->se.statistics.sleep_start, 0);
8423 schedstat_set(p->se.statistics.block_start, 0);
8424
8425 if (!dl_task(p) && !rt_task(p)) {
8426
8427
8428
8429
8430 if (task_nice(p) < 0)
8431 set_user_nice(p, 0);
8432 continue;
8433 }
8434
8435 __sched_setscheduler(p, &attr, false, false);
8436 }
8437 read_unlock(&tasklist_lock);
8438}
8439
8440#endif
8441
8442#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461struct task_struct *curr_task(int cpu)
8462{
8463 return cpu_curr(cpu);
8464}
8465
8466#endif
8467
8468#ifdef CONFIG_IA64
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484void ia64_set_curr_task(int cpu, struct task_struct *p)
8485{
8486 cpu_curr(cpu) = p;
8487}
8488
8489#endif
8490
8491#ifdef CONFIG_CGROUP_SCHED
8492
8493static DEFINE_SPINLOCK(task_group_lock);
8494
8495static inline void alloc_uclamp_sched_group(struct task_group *tg,
8496 struct task_group *parent)
8497{
8498#ifdef CONFIG_UCLAMP_TASK_GROUP
8499 enum uclamp_id clamp_id;
8500
8501 for_each_clamp_id(clamp_id) {
8502 uclamp_se_set(&tg->uclamp_req[clamp_id],
8503 uclamp_none(clamp_id), false);
8504 tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
8505 }
8506#endif
8507}
8508
8509static void sched_free_group(struct task_group *tg)
8510{
8511 free_fair_sched_group(tg);
8512 free_rt_sched_group(tg);
8513 autogroup_free(tg);
8514 kmem_cache_free(task_group_cache, tg);
8515}
8516
8517
8518struct task_group *sched_create_group(struct task_group *parent)
8519{
8520 struct task_group *tg;
8521
8522 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
8523 if (!tg)
8524 return ERR_PTR(-ENOMEM);
8525
8526 if (!alloc_fair_sched_group(tg, parent))
8527 goto err;
8528
8529 if (!alloc_rt_sched_group(tg, parent))
8530 goto err;
8531
8532 alloc_uclamp_sched_group(tg, parent);
8533
8534 return tg;
8535
8536err:
8537 sched_free_group(tg);
8538 return ERR_PTR(-ENOMEM);
8539}
8540
8541void sched_online_group(struct task_group *tg, struct task_group *parent)
8542{
8543 unsigned long flags;
8544
8545 spin_lock_irqsave(&task_group_lock, flags);
8546 list_add_rcu(&tg->list, &task_groups);
8547
8548
8549 WARN_ON(!parent);
8550
8551 tg->parent = parent;
8552 INIT_LIST_HEAD(&tg->children);
8553 list_add_rcu(&tg->siblings, &parent->children);
8554 spin_unlock_irqrestore(&task_group_lock, flags);
8555
8556 online_fair_sched_group(tg);
8557}
8558
8559
8560static void sched_free_group_rcu(struct rcu_head *rhp)
8561{
8562
8563 sched_free_group(container_of(rhp, struct task_group, rcu));
8564}
8565
8566void sched_destroy_group(struct task_group *tg)
8567{
8568
8569 call_rcu(&tg->rcu, sched_free_group_rcu);
8570}
8571
8572void sched_offline_group(struct task_group *tg)
8573{
8574 unsigned long flags;
8575
8576
8577 unregister_fair_sched_group(tg);
8578
8579 spin_lock_irqsave(&task_group_lock, flags);
8580 list_del_rcu(&tg->list);
8581 list_del_rcu(&tg->siblings);
8582 spin_unlock_irqrestore(&task_group_lock, flags);
8583}
8584
8585static void sched_change_group(struct task_struct *tsk, int type)
8586{
8587 struct task_group *tg;
8588
8589
8590
8591
8592
8593
8594 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
8595 struct task_group, css);
8596 tg = autogroup_task_group(tsk, tg);
8597 tsk->sched_task_group = tg;
8598
8599#ifdef CONFIG_FAIR_GROUP_SCHED
8600 if (tsk->sched_class->task_change_group)
8601 tsk->sched_class->task_change_group(tsk, type);
8602 else
8603#endif
8604 set_task_rq(tsk, task_cpu(tsk));
8605}
8606
8607
8608
8609
8610
8611
8612
8613
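/*
 * Change a task's runqueue/group association when it moves between task
 * groups: dequeue it (and drop it as the running task) if needed, switch the
 * group via sched_change_group(TASK_MOVE_GROUP), then requeue it; a running
 * task is also rescheduled so the new group's parameters take effect
 * immediately.
 */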
8614void sched_move_task(struct task_struct *tsk)
8615{
8616 int queued, running, queue_flags =
8617 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
8618 struct rq_flags rf;
8619 struct rq *rq;
8620
8621 rq = task_rq_lock(tsk, &rf);
8622 update_rq_clock(rq);
8623
8624 running = task_current(rq, tsk);
8625 queued = task_on_rq_queued(tsk);
8626
8627 if (queued)
8628 dequeue_task(rq, tsk, queue_flags);
8629 if (running)
8630 put_prev_task(rq, tsk);
8631
8632 sched_change_group(tsk, TASK_MOVE_GROUP);
8633
8634 if (queued)
8635 enqueue_task(rq, tsk, queue_flags);
8636 if (running) {
8637 set_next_task(rq, tsk);
8638
8639
8640
8641
8642
8643 resched_curr(rq);
8644 }
8645
8646 task_rq_unlock(rq, tsk, &rf);
8647}
8648
8649static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8650{
8651 return css ? container_of(css, struct task_group, css) : NULL;
8652}
8653
8654static struct cgroup_subsys_state *
8655cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8656{
8657 struct task_group *parent = css_tg(parent_css);
8658 struct task_group *tg;
8659
8660 if (!parent) {
8661
8662 return &root_task_group.css;
8663 }
8664
8665 tg = sched_create_group(parent);
8666 if (IS_ERR(tg))
8667 return ERR_PTR(-ENOMEM);
8668
8669 return &tg->css;
8670}
8671
8672
8673static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
8674{
8675 struct task_group *tg = css_tg(css);
8676 struct task_group *parent = css_tg(css->parent);
8677
8678 if (parent)
8679 sched_online_group(tg, parent);
8680
8681#ifdef CONFIG_UCLAMP_TASK_GROUP
8682
8683 cpu_util_update_eff(css);
8684#endif
8685
8686 return 0;
8687}
8688
8689static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8690{
8691 struct task_group *tg = css_tg(css);
8692
8693 sched_offline_group(tg);
8694}
8695
8696static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8697{
8698 struct task_group *tg = css_tg(css);
8699
8700
8701
8702
8703 sched_free_group(tg);
8704}
8705
8706
8707
8708
8709
8710static void cpu_cgroup_fork(struct task_struct *task)
8711{
8712 struct rq_flags rf;
8713 struct rq *rq;
8714
8715 rq = task_rq_lock(task, &rf);
8716
8717 update_rq_clock(rq);
8718 sched_change_group(task, TASK_SET_GROUP);
8719
8720 task_rq_unlock(rq, task, &rf);
8721}
8722
8723static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8724{
8725 struct task_struct *task;
8726 struct cgroup_subsys_state *css;
8727 int ret = 0;
8728
8729 cgroup_taskset_for_each(task, css, tset) {
8730#ifdef CONFIG_RT_GROUP_SCHED
8731 if (!sched_rt_can_attach(css_tg(css), task))
8732 return -EINVAL;
8733#endif
8734
8735
8736
8737
8738 raw_spin_lock_irq(&task->pi_lock);
8739
8740
8741
8742
8743
8744 if (task->state == TASK_NEW)
8745 ret = -EINVAL;
8746 raw_spin_unlock_irq(&task->pi_lock);
8747
8748 if (ret)
8749 break;
8750 }
8751 return ret;
8752}
8753
8754static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8755{
8756 struct task_struct *task;
8757 struct cgroup_subsys_state *css;
8758
8759 cgroup_taskset_for_each(task, css, tset)
8760 sched_move_task(task);
8761}
8762
8763#ifdef CONFIG_UCLAMP_TASK_GROUP
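/*
 * Propagate effective uclamp values down the cgroup hierarchy: each group's
 * effective clamp is its own request capped by the parent's effective value
 * (and the min capped by the max); subtrees whose effective values did not
 * change are skipped, otherwise the group's RUNNABLE tasks are updated.
 */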
8764static void cpu_util_update_eff(struct cgroup_subsys_state *css)
8765{
8766 struct cgroup_subsys_state *top_css = css;
8767 struct uclamp_se *uc_parent = NULL;
8768 struct uclamp_se *uc_se = NULL;
8769 unsigned int eff[UCLAMP_CNT];
8770 enum uclamp_id clamp_id;
8771 unsigned int clamps;
8772
8773 css_for_each_descendant_pre(css, top_css) {
8774 uc_parent = css_tg(css)->parent
8775 ? css_tg(css)->parent->uclamp : NULL;
8776
8777 for_each_clamp_id(clamp_id) {
8778
8779 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
8780
8781 if (uc_parent &&
8782 eff[clamp_id] > uc_parent[clamp_id].value) {
8783 eff[clamp_id] = uc_parent[clamp_id].value;
8784 }
8785 }
8786
8787 eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
8788
8789
8790 clamps = 0x0;
8791 uc_se = css_tg(css)->uclamp;
8792 for_each_clamp_id(clamp_id) {
8793 if (eff[clamp_id] == uc_se[clamp_id].value)
8794 continue;
8795 uc_se[clamp_id].value = eff[clamp_id];
8796 uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
8797 clamps |= (0x1 << clamp_id);
8798 }
8799 if (!clamps) {
8800 css = css_rightmost_descendant(css);
8801 continue;
8802 }
8803
8804
8805 uclamp_update_active_tasks(css, clamps);
8806 }
8807}
8808
8809
8810
8811
8812
8813
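/*
 * The per-cgroup uclamp knobs (handled by cpu_uclamp_min_write() and
 * cpu_uclamp_max_write() below) take a percentage with two decimal places,
 * or the string "max"; the helpers below convert that fixed-point percentage
 * into the [0..SCHED_CAPACITY_SCALE] utilization clamp range.
 */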
8814#define _POW10(exp) ((unsigned int)1e##exp)
8815#define POW10(exp) _POW10(exp)
8816
8817struct uclamp_request {
8818#define UCLAMP_PERCENT_SHIFT 2
8819#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
8820 s64 percent;
8821 u64 util;
8822 int ret;
8823};
8824
8825static inline struct uclamp_request
8826capacity_from_percent(char *buf)
8827{
8828 struct uclamp_request req = {
8829 .percent = UCLAMP_PERCENT_SCALE,
8830 .util = SCHED_CAPACITY_SCALE,
8831 .ret = 0,
8832 };
8833
8834 buf = strim(buf);
8835 if (strcmp(buf, "max")) {
8836 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
8837 &req.percent);
8838 if (req.ret)
8839 return req;
8840 if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
8841 req.ret = -ERANGE;
8842 return req;
8843 }
8844
8845 req.util = req.percent << SCHED_CAPACITY_SHIFT;
8846 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
8847 }
8848
8849 return req;
8850}
8851
8852static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
8853 size_t nbytes, loff_t off,
8854 enum uclamp_id clamp_id)
8855{
8856 struct uclamp_request req;
8857 struct task_group *tg;
8858
8859 req = capacity_from_percent(buf);
8860 if (req.ret)
8861 return req.ret;
8862
8863 static_branch_enable(&sched_uclamp_used);
8864
8865 mutex_lock(&uclamp_mutex);
8866 rcu_read_lock();
8867
8868 tg = css_tg(of_css(of));
8869 if (tg->uclamp_req[clamp_id].value != req.util)
8870 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
8871
8872
8873
8874
8875
8876 tg->uclamp_pct[clamp_id] = req.percent;
8877
8878
8879 cpu_util_update_eff(of_css(of));
8880
8881 rcu_read_unlock();
8882 mutex_unlock(&uclamp_mutex);
8883
8884 return nbytes;
8885}
8886
8887static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
8888 char *buf, size_t nbytes,
8889 loff_t off)
8890{
8891 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
8892}
8893
8894static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
8895 char *buf, size_t nbytes,
8896 loff_t off)
8897{
8898 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
8899}
8900
8901static inline void cpu_uclamp_print(struct seq_file *sf,
8902 enum uclamp_id clamp_id)
8903{
8904 struct task_group *tg;
8905 u64 util_clamp;
8906 u64 percent;
8907 u32 rem;
8908
8909 rcu_read_lock();
8910 tg = css_tg(seq_css(sf));
8911 util_clamp = tg->uclamp_req[clamp_id].value;
8912 rcu_read_unlock();
8913
8914 if (util_clamp == SCHED_CAPACITY_SCALE) {
8915 seq_puts(sf, "max\n");
8916 return;
8917 }
8918
8919 percent = tg->uclamp_pct[clamp_id];
8920 percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
8921 seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
8922}
8923
8924static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
8925{
8926 cpu_uclamp_print(sf, UCLAMP_MIN);
8927 return 0;
8928}
8929
8930static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
8931{
8932 cpu_uclamp_print(sf, UCLAMP_MAX);
8933 return 0;
8934}
8935#endif
8936
8937#ifdef CONFIG_FAIR_GROUP_SCHED
8938static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8939 struct cftype *cftype, u64 shareval)
8940{
8941 if (shareval > scale_load_down(ULONG_MAX))
8942 shareval = MAX_SHARES;
8943 return sched_group_set_shares(css_tg(css), scale_load(shareval));
8944}
8945
8946static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8947 struct cftype *cft)
8948{
8949 struct task_group *tg = css_tg(css);
8950
8951 return (u64) scale_load_down(tg->shares);
8952}
8953
8954#ifdef CONFIG_CFS_BANDWIDTH
8955static DEFINE_MUTEX(cfs_constraints_mutex);
8956
8957const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
8958static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
8959
8960static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
8961
8962static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8963
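/*
 * Update a group's CFS bandwidth limits. Quota and period must be at least
 * min_cfs_quota_period, the period may not exceed max_cfs_quota_period and
 * the quota may not exceed max_cfs_runtime; the new values are validated
 * against the parent hierarchy by __cfs_schedulable() and then pushed out to
 * every online CPU's cfs_rq, unthrottling any that were throttled.
 */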
8964static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8965{
8966 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8967 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8968
8969 if (tg == &root_task_group)
8970 return -EINVAL;
8971
8972
8973
8974
8975
8976
8977 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
8978 return -EINVAL;
8979
8980
8981
8982
8983
8984
8985 if (period > max_cfs_quota_period)
8986 return -EINVAL;
8987
8988
8989
8990
8991 if (quota != RUNTIME_INF && quota > max_cfs_runtime)
8992 return -EINVAL;
8993
8994
8995
8996
8997
8998 get_online_cpus();
8999 mutex_lock(&cfs_constraints_mutex);
9000 ret = __cfs_schedulable(tg, period, quota);
9001 if (ret)
9002 goto out_unlock;
9003
9004 runtime_enabled = quota != RUNTIME_INF;
9005 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9006
9007
9008
9009
9010 if (runtime_enabled && !runtime_was_enabled)
9011 cfs_bandwidth_usage_inc();
9012 raw_spin_lock_irq(&cfs_b->lock);
9013 cfs_b->period = ns_to_ktime(period);
9014 cfs_b->quota = quota;
9015
9016 __refill_cfs_bandwidth_runtime(cfs_b);
9017
9018
9019 if (runtime_enabled)
9020 start_cfs_bandwidth(cfs_b);
9021
9022 raw_spin_unlock_irq(&cfs_b->lock);
9023
9024 for_each_online_cpu(i) {
9025 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9026 struct rq *rq = cfs_rq->rq;
9027 struct rq_flags rf;
9028
9029 rq_lock_irq(rq, &rf);
9030 cfs_rq->runtime_enabled = runtime_enabled;
9031 cfs_rq->runtime_remaining = 0;
9032
9033 if (cfs_rq->throttled)
9034 unthrottle_cfs_rq(cfs_rq);
9035 rq_unlock_irq(rq, &rf);
9036 }
9037 if (runtime_was_enabled && !runtime_enabled)
9038 cfs_bandwidth_usage_dec();
9039out_unlock:
9040 mutex_unlock(&cfs_constraints_mutex);
9041 put_online_cpus();
9042
9043 return ret;
9044}
9045
9046static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9047{
9048 u64 quota, period;
9049
9050 period = ktime_to_ns(tg->cfs_bandwidth.period);
9051 if (cfs_quota_us < 0)
9052 quota = RUNTIME_INF;
9053 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
9054 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9055 else
9056 return -EINVAL;
9057
9058 return tg_set_cfs_bandwidth(tg, period, quota);
9059}
9060
9061static long tg_get_cfs_quota(struct task_group *tg)
9062{
9063 u64 quota_us;
9064
9065 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9066 return -1;
9067
9068 quota_us = tg->cfs_bandwidth.quota;
9069 do_div(quota_us, NSEC_PER_USEC);
9070
9071 return quota_us;
9072}
9073
9074static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9075{
9076 u64 quota, period;
9077
9078 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
9079 return -EINVAL;
9080
9081 period = (u64)cfs_period_us * NSEC_PER_USEC;
9082 quota = tg->cfs_bandwidth.quota;
9083
9084 return tg_set_cfs_bandwidth(tg, period, quota);
9085}
9086
9087static long tg_get_cfs_period(struct task_group *tg)
9088{
9089 u64 cfs_period_us;
9090
9091 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9092 do_div(cfs_period_us, NSEC_PER_USEC);
9093
9094 return cfs_period_us;
9095}
9096
9097static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
9098 struct cftype *cft)
9099{
9100 return tg_get_cfs_quota(css_tg(css));
9101}
9102
9103static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
9104 struct cftype *cftype, s64 cfs_quota_us)
9105{
9106 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
9107}
9108
9109static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
9110 struct cftype *cft)
9111{
9112 return tg_get_cfs_period(css_tg(css));
9113}
9114
9115static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
9116 struct cftype *cftype, u64 cfs_period_us)
9117{
9118 return tg_set_cfs_period(css_tg(css), cfs_period_us);
9119}
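
/*
 * The four handlers above back the legacy (cgroup v1) knobs.  A usage sketch
 * from userspace, assuming the v1 cpu controller is mounted at
 * /sys/fs/cgroup/cpu (the mount point may differ per system):
 *
 *   # limit the group to 25ms of CPU time per 100ms period:
 *   echo 100000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_period_us
 *   echo 25000  > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us
 *
 *   # remove the limit again:
 *   echo -1 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us
 */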
9120
9121struct cfs_schedulable_data {
9122 struct task_group *tg;
9123 u64 period, quota;
9124};
9125
9126/*
9127 * normalize group quota/period to be quota/max_period
9128 * share quota proportionally to each CPU
9129 */
9130static u64 normalize_cfs_quota(struct task_group *tg,
9131 struct cfs_schedulable_data *d)
9132{
9133 u64 quota, period;
9134
9135 if (tg == d->tg) {
9136 period = d->period;
9137 quota = d->quota;
9138 } else {
9139 period = tg_get_cfs_period(tg);
9140 quota = tg_get_cfs_quota(tg);
9141 }
9142
9143 /* note: these should typically be equivalent */
9144 if (quota == RUNTIME_INF || quota == -1)
9145 return RUNTIME_INF;
9146
9147 return to_ratio(period, quota);
9148}
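
/*
 * Example of the normalization above, assuming to_ratio() converts a
 * (period, runtime) pair into a fixed-point ratio using the 20-bit
 * BW_SHIFT from sched.h:
 *
 *   period = 100000 us, quota = 50000 us
 *   to_ratio(100000, 50000) = (50000 << 20) / 100000 = 524288, i.e. 0.5
 *
 * Groups without a limit return RUNTIME_INF before the conversion.
 */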
9149
9150static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9151{
9152 struct cfs_schedulable_data *d = data;
9153 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9154 s64 quota = 0, parent_quota = -1;
9155
9156 if (!tg->parent) {
9157 quota = RUNTIME_INF;
9158 } else {
9159 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9160
9161 quota = normalize_cfs_quota(tg, d);
9162 parent_quota = parent_b->hierarchical_quota;
9163
9164 /*
9165  * Ensure max(child_quota) <= parent_quota.  On cgroup2,
9166  * always take the min.  On the legacy hierarchy, only
9167  * inherit when no limit is explicitly set.
9168  */
9169 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
9170 quota = min(quota, parent_quota);
9171 } else {
9172 if (quota == RUNTIME_INF)
9173 quota = parent_quota;
9174 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9175 return -EINVAL;
9176 }
9177 }
9178 cfs_b->hierarchical_quota = quota;
9179
9180 return 0;
9181}
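
/*
 * Behaviour of the two branches above for a child whose parent has a
 * (normalized) hierarchical_quota of 0.5 CPU:
 *
 *   child quota 0.3  ->  cgroup2: min(0.3, 0.5) = 0.3   legacy: kept at 0.3
 *   child quota 0.8  ->  cgroup2: min(0.8, 0.5) = 0.5   legacy: -EINVAL
 *   child unlimited  ->  legacy: inherits the parent's 0.5
 *
 * The walk starts at the root group, so every ancestor's value is already
 * up to date when its children are visited.
 */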
9182
9183static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9184{
9185 int ret;
9186 struct cfs_schedulable_data data = {
9187 .tg = tg,
9188 .period = period,
9189 .quota = quota,
9190 };
9191
9192 if (quota != RUNTIME_INF) {
9193 do_div(data.period, NSEC_PER_USEC);
9194 do_div(data.quota, NSEC_PER_USEC);
9195 }
9196
9197 rcu_read_lock();
9198 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9199 rcu_read_unlock();
9200
9201 return ret;
9202}
9203
9204static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
9205{
9206 struct task_group *tg = css_tg(seq_css(sf));
9207 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9208
9209 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
9210 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
9211 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
9212
9213 if (schedstat_enabled() && tg != &root_task_group) {
9214 u64 ws = 0;
9215 int i;
9216
9217 for_each_possible_cpu(i)
9218 ws += schedstat_val(tg->se[i]->statistics.wait_sum);
9219
9220 seq_printf(sf, "wait_sum %llu\n", ws);
9221 }
9222
9223 return 0;
9224}
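
/*
 * The v1 cpu.stat file produced above looks roughly like:
 *
 *   nr_periods 1523
 *   nr_throttled 97
 *   throttled_time 183004938     (nanoseconds)
 *   wait_sum 40932184            (only with schedstats enabled, non-root)
 */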
9225#endif /* CONFIG_CFS_BANDWIDTH */
9226#endif /* CONFIG_FAIR_GROUP_SCHED */
9227
9228#ifdef CONFIG_RT_GROUP_SCHED
9229static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
9230 struct cftype *cft, s64 val)
9231{
9232 return sched_group_set_rt_runtime(css_tg(css), val);
9233}
9234
9235static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
9236 struct cftype *cft)
9237{
9238 return sched_group_rt_runtime(css_tg(css));
9239}
9240
9241static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
9242 struct cftype *cftype, u64 rt_period_us)
9243{
9244 return sched_group_set_rt_period(css_tg(css), rt_period_us);
9245}
9246
9247static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
9248 struct cftype *cft)
9249{
9250 return sched_group_rt_period(css_tg(css));
9251}
9252#endif /* CONFIG_RT_GROUP_SCHED */
9253
9254static struct cftype cpu_legacy_files[] = {
9255#ifdef CONFIG_FAIR_GROUP_SCHED
9256 {
9257 .name = "shares",
9258 .read_u64 = cpu_shares_read_u64,
9259 .write_u64 = cpu_shares_write_u64,
9260 },
9261#endif
9262#ifdef CONFIG_CFS_BANDWIDTH
9263 {
9264 .name = "cfs_quota_us",
9265 .read_s64 = cpu_cfs_quota_read_s64,
9266 .write_s64 = cpu_cfs_quota_write_s64,
9267 },
9268 {
9269 .name = "cfs_period_us",
9270 .read_u64 = cpu_cfs_period_read_u64,
9271 .write_u64 = cpu_cfs_period_write_u64,
9272 },
9273 {
9274 .name = "stat",
9275 .seq_show = cpu_cfs_stat_show,
9276 },
9277#endif
9278#ifdef CONFIG_RT_GROUP_SCHED
9279 {
9280 .name = "rt_runtime_us",
9281 .read_s64 = cpu_rt_runtime_read,
9282 .write_s64 = cpu_rt_runtime_write,
9283 },
9284 {
9285 .name = "rt_period_us",
9286 .read_u64 = cpu_rt_period_read_uint,
9287 .write_u64 = cpu_rt_period_write_uint,
9288 },
9289#endif
9290#ifdef CONFIG_UCLAMP_TASK_GROUP
9291 {
9292 .name = "uclamp.min",
9293 .flags = CFTYPE_NOT_ON_ROOT,
9294 .seq_show = cpu_uclamp_min_show,
9295 .write = cpu_uclamp_min_write,
9296 },
9297 {
9298 .name = "uclamp.max",
9299 .flags = CFTYPE_NOT_ON_ROOT,
9300 .seq_show = cpu_uclamp_max_show,
9301 .write = cpu_uclamp_max_write,
9302 },
9303#endif
9304 { }
9305};
9306
9307static int cpu_extra_stat_show(struct seq_file *sf,
9308 struct cgroup_subsys_state *css)
9309{
9310#ifdef CONFIG_CFS_BANDWIDTH
9311 {
9312 struct task_group *tg = css_tg(css);
9313 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9314 u64 throttled_usec;
9315
9316 throttled_usec = cfs_b->throttled_time;
9317 do_div(throttled_usec, NSEC_PER_USEC);
9318
9319 seq_printf(sf, "nr_periods %d\n"
9320 "nr_throttled %d\n"
9321 "throttled_usec %llu\n",
9322 cfs_b->nr_periods, cfs_b->nr_throttled,
9323 throttled_usec);
9324 }
9325#endif
9326 return 0;
9327}
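
/*
 * On the unified (v2) hierarchy these three lines are appended to the
 * generic cpu.stat output, with throttled time converted to microseconds:
 *
 *   nr_periods 1523
 *   nr_throttled 97
 *   throttled_usec 183004
 */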
9328
9329#ifdef CONFIG_FAIR_GROUP_SCHED
9330static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
9331 struct cftype *cft)
9332{
9333 struct task_group *tg = css_tg(css);
9334 u64 weight = scale_load_down(tg->shares);
9335
9336 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
9337}
9338
9339static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
9340 struct cftype *cft, u64 weight)
9341{
9342 /*
9343  * cgroup weight knobs should use the common MIN, DFL and MAX
9344  * values, which are 1, 100 and 10000 respectively.  While it
9345  * loses a bit of range on both ends, it maps pretty well onto
9346  * the default weight range used for nice values and keeps the
9347  * default (100) aligned with a nice-0 weight of 1024.
9348  */
9349 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
9350 return -ERANGE;
9351
9352 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
9353
9354 return sched_group_set_shares(css_tg(css), scale_load(weight));
9355}
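
/*
 * Weight <-> shares conversion used by the two handlers above, keeping the
 * cgroup default weight (100) aligned with the nice-0 load weight (1024):
 *
 *   read:  shares 1024   -> 1024 * 100 / 1024   = weight 100
 *   write: weight 100    -> 100 * 1024 / 100    = shares 1024
 *   write: weight 10000  -> 10000 * 1024 / 100  = shares 102400
 */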
9356
9357static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
9358 struct cftype *cft)
9359{
9360 unsigned long weight = scale_load_down(css_tg(css)->shares);
9361 int last_delta = INT_MAX;
9362 int prio, delta;
9363
9364 /* find the closest nice value to the current weight */
9365 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
9366 delta = abs(sched_prio_to_weight[prio] - weight);
9367 if (delta >= last_delta)
9368 break;
9369 last_delta = delta;
9370 }
9371
9372 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
9373}
9374
9375static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
9376 struct cftype *cft, s64 nice)
9377{
9378 unsigned long weight;
9379 int idx;
9380
9381 if (nice < MIN_NICE || nice > MAX_NICE)
9382 return -ERANGE;
9383
9384 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
9385 idx = array_index_nospec(idx, 40);
9386 weight = sched_prio_to_weight[idx];
9387
9388 return sched_group_set_shares(css_tg(css), scale_load(weight));
9389}
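
/*
 * Example of the nice <-> weight mapping used above, via the
 * sched_prio_to_weight[] table (index = nice + 20):
 *
 *   nice   0 -> idx 20 -> weight 1024
 *   nice  -5 -> idx 15 -> weight 3121
 *   nice  19 -> idx 39 -> weight   15
 *
 * The read side walks the table for the entry closest to the current
 * shares value and converts the winning index back to a nice value.
 */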
9390#endif /* CONFIG_FAIR_GROUP_SCHED */
9391
9392static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
9393 long period, long quota)
9394{
9395 if (quota < 0)
9396 seq_puts(sf, "max");
9397 else
9398 seq_printf(sf, "%ld", quota);
9399
9400 seq_printf(sf, " %ld\n", period);
9401}
9402
9403/* caller should put the current value in *@periodp before calling */
9404static int __maybe_unused cpu_period_quota_parse(char *buf,
9405 u64 *periodp, u64 *quotap)
9406{
9407 char tok[21];
9408
9409 if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
9410 return -EINVAL;
9411
9412 *periodp *= NSEC_PER_USEC;
9413
9414 if (sscanf(tok, "%llu", quotap))
9415 *quotap *= NSEC_PER_USEC;
9416 else if (!strcmp(tok, "max"))
9417 *quotap = RUNTIME_INF;
9418 else
9419 return -EINVAL;
9420
9421 return 0;
9422}
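
/*
 * Accepted cpu.max syntax for the parser above ("$MAX $PERIOD", both in
 * microseconds, the period being optional and defaulting to the
 * caller-supplied value):
 *
 *   "max"           -> quota = RUNTIME_INF, period unchanged
 *   "max 100000"    -> quota = RUNTIME_INF, period = 100ms
 *   "50000 100000"  -> quota = 50ms, period = 100ms
 *   "50000"         -> quota = 50ms, period unchanged
 */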
9423
9424#ifdef CONFIG_CFS_BANDWIDTH
9425static int cpu_max_show(struct seq_file *sf, void *v)
9426{
9427 struct task_group *tg = css_tg(seq_css(sf));
9428
9429 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
9430 return 0;
9431}
9432
9433static ssize_t cpu_max_write(struct kernfs_open_file *of,
9434 char *buf, size_t nbytes, loff_t off)
9435{
9436 struct task_group *tg = css_tg(of_css(of));
9437 u64 period = tg_get_cfs_period(tg);
9438 u64 quota;
9439 int ret;
9440
9441 ret = cpu_period_quota_parse(buf, &period, "a);
9442 if (!ret)
9443 ret = tg_set_cfs_bandwidth(tg, period, quota);
9444 return ret ?: nbytes;
9445}
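
/*
 * Minimal userspace sketch (hypothetical cgroup path) of driving the
 * cpu.max interface handled above; build as an ordinary user program,
 * not as part of the kernel:
 *
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *           // 25ms of runtime every 100ms period for group "mygroup"
 *           FILE *f = fopen("/sys/fs/cgroup/mygroup/cpu.max", "w");
 *
 *           if (!f)
 *                   return 1;
 *           fprintf(f, "25000 100000\n");
 *           return fclose(f) ? 1 : 0;
 *   }
 */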
9446#endif /* CONFIG_CFS_BANDWIDTH */
9447
9448static struct cftype cpu_files[] = {
9449#ifdef CONFIG_FAIR_GROUP_SCHED
9450 {
9451 .name = "weight",
9452 .flags = CFTYPE_NOT_ON_ROOT,
9453 .read_u64 = cpu_weight_read_u64,
9454 .write_u64 = cpu_weight_write_u64,
9455 },
9456 {
9457 .name = "weight.nice",
9458 .flags = CFTYPE_NOT_ON_ROOT,
9459 .read_s64 = cpu_weight_nice_read_s64,
9460 .write_s64 = cpu_weight_nice_write_s64,
9461 },
9462#endif
9463#ifdef CONFIG_CFS_BANDWIDTH
9464 {
9465 .name = "max",
9466 .flags = CFTYPE_NOT_ON_ROOT,
9467 .seq_show = cpu_max_show,
9468 .write = cpu_max_write,
9469 },
9470#endif
9471#ifdef CONFIG_UCLAMP_TASK_GROUP
9472 {
9473 .name = "uclamp.min",
9474 .flags = CFTYPE_NOT_ON_ROOT,
9475 .seq_show = cpu_uclamp_min_show,
9476 .write = cpu_uclamp_min_write,
9477 },
9478 {
9479 .name = "uclamp.max",
9480 .flags = CFTYPE_NOT_ON_ROOT,
9481 .seq_show = cpu_uclamp_max_show,
9482 .write = cpu_uclamp_max_write,
9483 },
9484#endif
9485 { }
9486};
9487
9488struct cgroup_subsys cpu_cgrp_subsys = {
9489 .css_alloc = cpu_cgroup_css_alloc,
9490 .css_online = cpu_cgroup_css_online,
9491 .css_released = cpu_cgroup_css_released,
9492 .css_free = cpu_cgroup_css_free,
9493 .css_extra_stat_show = cpu_extra_stat_show,
9494 .fork = cpu_cgroup_fork,
9495 .can_attach = cpu_cgroup_can_attach,
9496 .attach = cpu_cgroup_attach,
9497 .legacy_cftypes = cpu_legacy_files,
9498 .dfl_cftypes = cpu_files,
9499 .early_init = true,
9500 .threaded = true,
9501};
9502
9503#endif /* CONFIG_CGROUP_SCHED */
9504
9505void dump_cpu_task(int cpu)
9506{
9507 pr_info("Task dump for CPU %d:\n", cpu);
9508 sched_show_task(cpu_curr(cpu));
9509}
9510
9511/*
9512 * Nice levels are multiplicative, with a gentle 10% change for every
9513 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
9514 * nice 1, it will get ~10% less CPU time than another CPU-bound task
9515 * that remained on nice 0.
9516 *
9517 * The "10% effect" is relative and cumulative: from _any_ nice level,
9518 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
9519 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
9520 * If a task goes up by ~10% and another task goes down by ~10% then
9521 * the relative distance between them is ~25%.)
9522 */
9523const int sched_prio_to_weight[40] = {
9524 88761, 71755, 56483, 46273, 36291,
9525 29154, 23254, 18705, 14949, 11916,
9526 9548, 7620, 6100, 4904, 3906,
9527 3121, 2501, 1991, 1586, 1277,
9528 1024, 820, 655, 526, 423,
9529 335, 272, 215, 172, 137,
9530 110, 87, 70, 56, 45,
9531 36, 29, 23, 18, 15,
9532};
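
/*
 * Each step in the table above corresponds to one nice level and scales the
 * weight by roughly 1.25, so a one-level nice change shifts relative CPU
 * time by about 10% either way.  For example, around nice 0:
 *
 *   weight(nice -1) / weight(nice 0) = 1277 / 1024 ~= 1.247
 *   weight(nice  0) / weight(nice 1) = 1024 /  820 ~= 1.249
 */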
9533
9534/*
9535 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
9536 *
9537 * In cases where the weight does not change often, we can use the
9538 * precalculated inverse to speed up arithmetics by turning divisions
9539 * into multiplications:
9540 */
9541const u32 sched_prio_to_wmult[40] = {
9542 48388, 59856, 76040, 92818, 118348,
9543 147320, 184698, 229616, 287308, 360437,
9544 449829, 563644, 704093, 875809, 1099582,
9545 1376151, 1717300, 2157191, 2708050, 3363326,
9546 4194304, 5237765, 6557202, 8165337, 10153587,
9547 12820798, 15790321, 19976592, 24970740, 31350126,
9548 39045157, 49367440, 61356676, 76695844, 95443717,
9549 119304647, 148102320, 186737708, 238609294, 286331153,
9550};
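
/*
 * Sanity check of the inverse table above: each entry approximates
 * 2^32 / sched_prio_to_weight[i], so a division by the weight can be
 * replaced by a multiply and a 32-bit shift.  For example:
 *
 *   nice  0: 2^32 / 1024  = 4194304    (table entry: 4194304)
 *   nice -1: 2^32 / 1277 ~= 3363326    (table entry: 3363326)
 *   nice  1: 2^32 /  820 ~= 5237765    (table entry: 5237765)
 */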
9551
9552void call_trace_sched_update_nr_running(struct rq *rq, int count)
9553{
9554 trace_sched_update_nr_running_tp(rq, count);
9555}
9556