// SPDX-License-Identifier: GPL-2.0
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include "sched.h"

#include <linux/nospec.h>

#include <linux/kcov.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>

#include "../workqueue_internal.h"
#include "../smpboot.h"

#include "pelt.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT
#endif

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB to ensure we then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
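
/*
 * Note: __task_rq_lock() above assumes the caller already holds p->pi_lock
 * (see its lockdep assertion); task_rq_lock() acquires both p->pi_lock and
 * rq->lock itself and is the variant to use from bare context.
 */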

/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compile should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	if (sched_feat(WARN_DOUBLE_CLOCK))
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;
#endif

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}


#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;
	struct rq_flags rf;

	rq_lock(rq, &rf);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}
#endif /* CONFIG_SMP */

static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _old, _val = *_ptr;			\
									\
		for (;;) {						\
			_old = cmpxchg(_ptr, _val, _val | _mask);	\
			if (_old == _val)				\
				break;					\
			_val = _old;					\
		}							\
	_old;								\
})
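
/*
 * Annotation: fetch_or() atomically ORs @mask into *@ptr and returns the
 * value seen *before* the OR, retrying the cmpxchg() until it observes an
 * unchanged word. For example, with *p == 0x1:
 *
 *	old = fetch_or(p, 0x4);		// *p becomes 0x5, old == 0x1
 *
 * This is what lets set_nr_and_not_polling() below set TIF_NEED_RESCHED and
 * test TIF_POLLING_NRFLAG in a single atomic step.
 */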

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * In order to ensure that a pending wakeup will observe our pending
	 * state, even in the failed case, an explicit smp_mb() must be used.
	 */
	smp_mb__before_atomic();
	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
		return false;

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
	return true;
}

/**
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	if (__wake_q_add(head, task))
		get_task_struct(task);
}

/**
 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 * that already hold a reference to @task can call the 'safe' version and trust
 * wake_q to do the right thing depending whether or not the @task is already
 * queued for wakeup.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
	if (!__wake_q_add(head, task))
		put_task_struct(task);
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() executes a full barrier, which pairs with
		 * the queueing in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
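
/*
 * Annotation: a minimal usage sketch of the wake_q API, assuming a caller
 * that picked a waiter @p under a contended lock. Batching lets the wakeup
 * be issued after the lock is dropped:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	spin_lock(&lock);
 *	wake_q_add(&wake_q, p);		// takes a reference on p
 *	spin_unlock(&lock);
 *	wake_up_q(&wake_q);		// wakes p, drops the reference
 */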

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
		cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick: */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there are more than one RR tasks, we need the tick to effect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
	 * forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}
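
/*
 * Annotation: the goto-based loop above is an iterative depth-first walk.
 * "goto down" descends into a child; once a subtree is exhausted, "goto up"
 * jumps back into the list_for_each_entry_rcu() body so iteration resumes
 * at the next sibling. Every group is visited without recursion, i.e. with
 * O(1) stack usage.
 */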

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (task_has_idle_policy(p)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		p->se.runnable_weight = load->weight;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, prio);
	} else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
		p->se.runnable_weight = load->weight;
	}
}

#ifdef CONFIG_UCLAMP_TASK
/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;

/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

#define for_each_clamp_id(clamp_id) \
	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)

static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
{
	return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
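
/*
 * Annotation: a worked example of the bucket math, assuming the default
 * UCLAMP_BUCKETS == 5 and SCHED_CAPACITY_SCALE == 1024, which gives
 * UCLAMP_BUCKET_DELTA == DIV_ROUND_CLOSEST(1024, 5) == 205:
 *
 *	uclamp_bucket_id(300)		== 300 / 205	== 1
 *	uclamp_bucket_base_value(300)	== 205 * 1	== 205
 *
 * i.e. clamp values are quantized into UCLAMP_BUCKETS ranges of width
 * UCLAMP_BUCKET_DELTA each.
 */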

static inline unsigned int uclamp_none(int clamp_id)
{
	if (clamp_id == UCLAMP_MIN)
		return 0;
	return SCHED_CAPACITY_SCALE;
}

static inline void uclamp_se_set(struct uclamp_se *uc_se,
				 unsigned int value, bool user_defined)
{
	uc_se->value = value;
	uc_se->bucket_id = uclamp_bucket_id(value);
	uc_se->user_defined = user_defined;
}

static inline unsigned int
uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
		  unsigned int clamp_value)
{
	/*
	 * Avoid blocked utilization pushing up the frequency when we go
	 * idle (which drops the max-clamp) by retaining the last known
	 * max-clamp.
	 */
	if (clamp_id == UCLAMP_MAX) {
		rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
		return clamp_value;
	}

	return uclamp_none(UCLAMP_MIN);
}

static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
				     unsigned int clamp_value)
{
	/* Reset max-clamp retention only on idle exit */
	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
		return;

	WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
}

static inline
unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
				 unsigned int clamp_value)
{
	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
	int bucket_id = UCLAMP_BUCKETS - 1;

	/*
	 * Since both min and max clamps are max aggregated, find the
	 * top most bucket with tasks in.
	 */
	for ( ; bucket_id >= 0; bucket_id--) {
		if (!bucket[bucket_id].tasks)
			continue;
		return bucket[bucket_id].value;
	}

	/* No tasks -- default clamp values */
	return uclamp_idle_value(rq, clamp_id, clamp_value);
}

/*
 * The effective clamp bucket index of a task depends on, by increasing
 * priority:
 * - the task specific clamp value, when explicitly requested from userspace
 * - the system default clamp value, defined by the sysadmin
 */
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
{
	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
	struct uclamp_se uc_max = uclamp_default[clamp_id];

	/* System default restrictions always apply */
	if (unlikely(uc_req.value > uc_max.value))
		return uc_max;

	return uc_req;
}

unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
{
	struct uclamp_se uc_eff;

	/* Task currently refcounted: use back-annotated (effective) value */
	if (p->uclamp[clamp_id].active)
		return p->uclamp[clamp_id].value;

	uc_eff = uclamp_eff_get(p, clamp_id);

	return uc_eff.value;
}

/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
 * updates the rq's clamp value if required.
 *
 * Tasks can have a task-specific value requested from user-space, track
 * within each bucket the maximum value for tasks refcounted in it.
 * This "local max aggregation" allows to track the exact "requested" value
 * for each bucket when all its RUNNABLE tasks require the same clamp.
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
				    unsigned int clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;

	lockdep_assert_held(&rq->lock);

	/* Update task effective clamp */
	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);

	bucket = &uc_rq->bucket[uc_se->bucket_id];
	bucket->tasks++;
	uc_se->active = true;

	uclamp_idle_reset(rq, clamp_id, uc_se->value);

	/*
	 * Local max aggregation: rq buckets always track the max
	 * "requested" clamp value of its RUNNABLE tasks.
	 */
	if (bucket->tasks == 1 || uc_se->value > bucket->value)
		bucket->value = uc_se->value;

	if (uc_se->value > READ_ONCE(uc_rq->value))
		WRITE_ONCE(uc_rq->value, uc_se->value);
}

/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task reference counting the rq's max
 * active clamp value, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
				    unsigned int clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;
	unsigned int bkt_clamp;
	unsigned int rq_clamp;

	lockdep_assert_held(&rq->lock);

	bucket = &uc_rq->bucket[uc_se->bucket_id];
	SCHED_WARN_ON(!bucket->tasks);
	if (likely(bucket->tasks))
		bucket->tasks--;
	uc_se->active = false;

	/*
	 * Keep "local max aggregation" simple and accept to (possibly)
	 * overboost some RUNNABLE tasks in the same bucket.
	 * The rq clamp bucket value is reset to its base value whenever
	 * there are no more RUNNABLE tasks refcounting it.
	 */
	if (likely(bucket->tasks))
		return;

	rq_clamp = READ_ONCE(uc_rq->value);
	/*
	 * Defensive programming: this should never happen. If it happens,
	 * e.g. due to future modification, warn and fixup the expected value.
	 */
	SCHED_WARN_ON(bucket->value > rq_clamp);
	if (bucket->value >= rq_clamp) {
		bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
		WRITE_ONCE(uc_rq->value, bkt_clamp);
	}
}

static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
	unsigned int clamp_id;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_inc_id(rq, p, clamp_id);

	/* Reset clamp idle holding when there is one RUNNABLE task */
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
	unsigned int clamp_id;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_dec_id(rq, p, clamp_id);
}

int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	int old_min, old_max;
	static DEFINE_MUTEX(mutex);
	int result;

	mutex_lock(&mutex);
	old_min = sysctl_sched_uclamp_util_min;
	old_max = sysctl_sched_uclamp_util_max;

	result = proc_dointvec(table, write, buffer, lenp, ppos);
	if (result)
		goto undo;
	if (!write)
		goto done;

	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
		result = -EINVAL;
		goto undo;
	}

	if (old_min != sysctl_sched_uclamp_util_min) {
		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
			      sysctl_sched_uclamp_util_min, false);
	}
	if (old_max != sysctl_sched_uclamp_util_max) {
		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
			      sysctl_sched_uclamp_util_max, false);
	}

	/*
	 * Updating all the RUNNABLE tasks is expensive, keep it simple and do
	 * just a lazy update at each next enqueue time.
	 */
	goto done;

undo:
	sysctl_sched_uclamp_util_min = old_min;
	sysctl_sched_uclamp_util_max = old_max;
done:
	mutex_unlock(&mutex);

	return result;
}

static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
{
	unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
	unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
		lower_bound = attr->sched_util_min;
	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
		upper_bound = attr->sched_util_max;

	if (lower_bound > upper_bound)
		return -EINVAL;
	if (upper_bound > SCHED_CAPACITY_SCALE)
		return -EINVAL;

	return 0;
}

static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr)
{
	unsigned int clamp_id;

	/*
	 * On scheduling class change, reset to default clamps for tasks
	 * without a task-specific value.
	 */
	for_each_clamp_id(clamp_id) {
		struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
		unsigned int clamp_value = uclamp_none(clamp_id);

		/* Keep using defined clamps across class changes */
		if (uc_se->user_defined)
			continue;

		/* By default, RT tasks always get 100% boost */
		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
			clamp_value = uclamp_none(UCLAMP_MAX);

		uclamp_se_set(uc_se, clamp_value, false);
	}

	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
		return;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
			      attr->sched_util_min, true);
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
			      attr->sched_util_max, true);
	}
}

static void uclamp_fork(struct task_struct *p)
{
	unsigned int clamp_id;

	for_each_clamp_id(clamp_id)
		p->uclamp[clamp_id].active = false;

	if (likely(!p->sched_reset_on_fork))
		return;

	for_each_clamp_id(clamp_id) {
		unsigned int clamp_value = uclamp_none(clamp_id);

		/* By default, RT tasks always get 100% boost */
		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
			clamp_value = uclamp_none(UCLAMP_MAX);

		uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
	}
}

static void __init init_uclamp(void)
{
	struct uclamp_se uc_max = {};
	unsigned int clamp_id;
	int cpu;

	for_each_possible_cpu(cpu) {
		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
		cpu_rq(cpu)->uclamp_flags = 0;
	}

	for_each_clamp_id(clamp_id) {
		uclamp_se_set(&init_task.uclamp_req[clamp_id],
			      uclamp_none(clamp_id), false);
	}

	/* System defaults allow max clamp values for both indexes */
	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
	for_each_clamp_id(clamp_id)
		uclamp_default[clamp_id] = uc_max;
}

#else /* CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
				  const struct sched_attr *attr)
{
	return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE)) {
		sched_info_queued(rq, p);
		psi_enqueue(p, flags & ENQUEUE_WAKEUP);
	}

	uclamp_rq_inc(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & DEQUEUE_SAVE)) {
		sched_info_dequeued(rq, p);
		psi_dequeue(p, flags & DEQUEUE_SLEEP);
	}

	uclamp_rq_dec(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);

	p->on_rq = TASK_ON_RQ_QUEUED;
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}
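
/*
 * Annotation: with the usual MAX_RT_PRIO == 100, the kernel-internal prio
 * values computed above are (lower number == higher priority):
 *
 *	deadline tasks:	-1		(MAX_DL_PRIO - 1)
 *	RT tasks:	0..98		(99 - rt_priority, rt_priority 1..99)
 *	normal tasks:	100..139	(static_prio, i.e. nice -20..19)
 *
 * e.g. an RT task with rt_priority 50 maps to prio 49, and a nice-0 task
 * maps to prio 120.
 */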

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq);
}

#ifdef CONFIG_SMP

static inline bool is_per_cpu_kthread(struct task_struct *p)
{
	if (!(p->flags & PF_KTHREAD))
		return false;

	if (p->nr_cpus_allowed != 1)
		return false;

	return true;
}

/*
 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
 * __set_cpus_allowed_ptr() and select_fallback_rq().
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
		return false;

	if (is_per_cpu_kthread(p))
		return cpu_online(cpu);

	return cpu_active(cpu);
}

/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
				   struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, new_cpu);
	rq_unlock(rq, rf);

	rq = cpu_rq(new_cpu);

	rq_lock(rq, rf);
	BUG_ON(task_cpu(p) != new_cpu);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	/* Affinity changed (again). */
	if (!is_cpu_allowed(p, dest_cpu))
		return rq;

	update_rq_clock(rq);
	rq = move_queued_task(rq, rf, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	struct rq_flags rf;

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_ptr
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;
	}
	rq_unlock(rq, &rf);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_mask, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_curr_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	unsigned int dest_cpu;
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	if (p->flags & PF_KTHREAD) {
		/*
		 * Kernel threads are allowed on online && !active CPUs
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if (check && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (cpumask_equal(p->cpus_ptr, new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	if (p->flags & PF_KTHREAD) {
		/*
		 * For kernel threads that do indeed end up on online &&
		 * !active we want to ensure they are strict per-CPU threads.
		 */
		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
			!cpumask_intersects(new_mask, cpu_active_mask) &&
			p->nr_cpus_allowed != 1);
	}

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
	if (task_running(rq, p) || p->state == TASK_WAKING) {
		struct migration_arg arg = { p, dest_cpu };
		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		return 0;
	} else if (task_on_rq_queued(p)) {
		/*
		 * OK, since we're going to drop the lock immediately
		 * afterwards anyway.
		 */
		rq = move_queued_task(rq, &rf, p, dest_cpu);
	}
out:
	task_rq_unlock(rq, p, &rf);

	return ret;
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!p->on_rq);

	/*
	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
	 * time relying on p->on_rq.
	 */
	WARN_ON_ONCE(p->state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
	/*
	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
	 */
	WARN_ON_ONCE(!cpu_online(new_cpu));
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		rseq_migrate(p);
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}

#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;
		struct rq_flags srf, drf;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		rq_pin_lock(src_rq, &srf);
		rq_pin_lock(dst_rq, &drf);

		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);

		rq_unpin_lock(dst_rq, &drf);
		rq_unpin_lock(src_rq, &srf);

	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous CPU our target instead of where it really is.
		 */
		p->wake_cpu = cpu;
	}
}

struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;
	int ret = -EAGAIN;

	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
		return -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	double_raw_lock(&arg->src_task->pi_lock,
			&arg->dst_task->pi_lock);
	double_rq_lock(src_rq, dst_rq);

	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		goto unlock;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		goto unlock;

	if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
		goto unlock;

	if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
		goto unlock;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	ret = 0;

unlock:
	double_rq_unlock(src_rq, dst_rq);
	raw_spin_unlock(&arg->dst_task->pi_lock);
	raw_spin_unlock(&arg->src_task->pi_lock);

	return ret;
}

/*
 * Cross migrate two tasks
 */
int migrate_swap(struct task_struct *cur, struct task_struct *p,
		int target_cpu, int curr_cpu)
{
	struct migration_swap_arg arg;
	int ret = -EINVAL;

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = curr_cpu,
		.dst_task = p,
		.dst_cpu = target_cpu,
	};

	if (arg.src_cpu == arg.dst_cpu)
		goto out;

	/*
	 * These three tests are all lockless; this is OK since all of them
	 * will be re-checked with proper locks held further down the line.
	 */
	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
		goto out;

	if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
		goto out;

	if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
		goto out;

	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
	return ret;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	int running, queued;
	struct rq_flags rf;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &rf);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		queued = task_on_rq_queued(p);
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &rf);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(queued)) {
			ktime_t to = NSEC_PER_SEC / HZ;

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

/*
 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
 *
 * A few notes on cpu_active vs cpu_online:
 *
 *  - cpu_active must be a subset of cpu_online
 *
 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
 *    see __set_cpus_allowed_ptr(). At this point the newly online
 *    CPU isn't yet part of the sched domains, and balancing will not
 *    see it.
 *
 *  - on CPU-down we clear cpu_active() to mask the sched domains and
 *    avoid the load balancer to place new tasks on the to be removed
 *    CPU. Existing tasks will remain running there and will be taken
 *    off.
 *
 * This means that fallback selection must not select !active CPUs.
 * And can assume that any active CPU must be online. Conversely
 * select_task_rq() below may allow selection of !active CPUs in order
 * to satisfy the above rules.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the CPU is on has been offlined, cpu_to_node()
	 * will return -1. There is no CPU on the node, and we should
	 * select the CPU on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, p->cpus_ptr) {
			if (!is_cpu_allowed(p, dest_cpu))
				continue;

			goto out;
		}

		/* No more Mr. Nice Guy. */
		switch (state) {
		case cpuset:
			if (IS_ENABLED(CONFIG_CPUSETS)) {
				cpuset_cpus_allowed_fallback(p);
				state = possible;
				break;
			}
			/* Fall-through */
		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}
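
/*
 * Annotation: the fallback search above widens progressively: first an
 * active allowed CPU on the same NUMA node, then any allowed CPU, then the
 * cpuset fallback mask, then cpu_possible_mask; only if even that yields
 * nothing do we BUG().
 */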

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1)
		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
	else
		cpu = cpumask_any(p->cpus_ptr);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_ptr
	 * CPU.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!is_cpu_allowed(p, cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}
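
/*
 * Annotation: update_avg() is an exponentially weighted moving average with
 * weight 1/8, i.e. avg' = avg + (sample - avg) / 8. For example, avg == 800
 * and sample == 1600 gives avg' == 900. It is used below for rq->avg_idle.
 */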

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

#else /* CONFIG_SMP */

static inline int __set_cpus_allowed_ptr(struct task_struct *p,
					 const struct cpumask *new_mask, bool check)
{
	return set_cpus_allowed_ptr(p, new_mask);
}

#endif /* CONFIG_SMP */

static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq;

	if (!schedstat_enabled())
		return;

	rq = this_rq();

#ifdef CONFIG_SMP
	if (cpu == rq->cpu) {
		__schedstat_inc(rq->ttwu_local);
		__schedstat_inc(p->se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		__schedstat_inc(p->se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(rq->cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				__schedstat_inc(sd->ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		__schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */

	__schedstat_inc(rq->ttwu_count);
	__schedstat_inc(p->se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		__schedstat_inc(p->se.statistics.nr_wakeups_sync);
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
			   struct rq_flags *rf)
{
	check_preempt_curr(rq, p, wake_flags);
	p->state = TASK_RUNNING;
	trace_sched_wakeup(p);

#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Our task @p is fully woken up and running; so it's safe to
		 * drop the rq->lock, hereafter rq is only used for statistics.
		 */
		rq_unpin_lock(rq, rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, rf);
	}

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->idle_stamp = 0;
	}
#endif
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		 struct rq_flags *rf)
{
	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;

	lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;

	if (wake_flags & WF_MIGRATED)
		en_flags |= ENQUEUE_MIGRATED;
#endif

	activate_task(rq, p, en_flags);
	ttwu_do_wakeup(rq, p, wake_flags, rf);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p, &rf);
	if (task_on_rq_queued(p)) {
		/* check_preempt_curr() may use rq clock */
		update_rq_clock(rq);
		ttwu_do_wakeup(rq, p, wake_flags, &rf);
		ret = 1;
	}
	__task_rq_unlock(rq, &rf);

	return ret;
}

#ifdef CONFIG_SMP
void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p, *t;
	struct rq_flags rf;

	if (!llist)
		return;

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

	llist_for_each_entry_safe(p, t, llist, wake_entry)
		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);

	rq_unlock_irqrestore(rq, &rf);
}

void scheduler_ipi(void)
{
	/*
	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
	 * TIF_NEED_RESCHED remotely (for the first time) will also send
	 * this IPI.
	 */
	preempt_fold_need_resched();

	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);

	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
		if (!set_nr_if_polling(rq->idle))
			smp_send_reschedule(cpu);
		else
			trace_sched_wake_idle_without_ipi(cpu);
	}
}

void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	rcu_read_lock();

	if (!is_idle_task(rcu_dereference(rq->curr)))
		goto out;

	if (set_nr_if_polling(rq->idle)) {
		trace_sched_wake_idle_without_ipi(cpu);
	} else {
		rq_lock_irqsave(rq, &rf);
		if (is_idle_task(rq->curr))
			smp_send_reschedule(cpu);
		/* Else CPU is not idle, do nothing here: */
		rq_unlock_irqrestore(rq, &rf);
	}

out:
	rcu_read_unlock();
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
		ttwu_queue_remote(p, cpu, wake_flags);
		return;
	}
#endif

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	ttwu_do_activate(rq, p, wake_flags, &rf);
	rq_unlock(rq, &rf);
}

/*
 * Notes on Program-Order guarantees on SMP systems.
 *
 *  MIGRATION
 *
 * The basic program-order guarantee on SMP systems is that when a task [t]
 * migrates, all its activity on its old CPU [c0] happens-before any
 * subsequent execution on its new CPU [c1].
 *
 * For migration (of runnable tasks) this is provided by the following means:
 *
 *  A) UNLOCK of the rq(c0)->lock scheduling out task t
 *  B) migration for t is required to synchronize *both* rq(c0)->lock and
 *     rq(c1)->lock (if not at the same time, then in that order).
 *  C) LOCK of the rq(c1)->lock scheduling in task
 *
 * Release/acquire chaining guarantees that B happens after A and C after B.
 *
 *  BLOCKING -- aka. SLEEP + WAKEUP
 *
 * For blocking we (obviously) need to provide the same guarantee as for
 * migration. However the means are completely different as there is no lock
 * chain to provide order. Instead we do:
 *
 *   1) smp_store_release(X->on_cpu, 0)
 *   2) smp_cond_load_acquire(!X->on_cpu)
 *
 * This means that any means of doing remote wakeups must order the CPU doing
 * the wakeup against the CPU the task is going to end up running on. This,
 * however, is already required for the regular Program-Order guarantee
 * above, since the waking CPU is the one issueing the ACQUIRE
 * (smp_cond_load_acquire).
 */

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * If (@state & @p->state) @p->state = TASK_RUNNING.
 *
 * If the task was not queued/runnable, also place it back on a runqueue.
 *
 * Atomic against schedule() which would dequeue a task, also see
 * set_current_state().
 *
 * This function executes a full memory barrier before accessing the task
 * state; see set_current_state().
 *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *	   %false otherwise.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	preempt_disable();
	if (p == current) {
		/*
		 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
		 * == smp_processor_id()'. Together this means we can special
		 * case the whole 'p->on_rq && ttwu_remote()' case below
		 * without taking any locks.
		 *
		 * In particular:
		 *  - we rely on Program-Order guarantees for all the ordering,
		 *  - we're serialized against set_special_state() by virtue of
		 *    it disabling IRQs (this allows not taking ->pi_lock).
		 */
		if (!(p->state & state))
			goto out;

		success = 1;
		cpu = task_cpu(p);
		trace_sched_waking(p);
		p->state = TASK_RUNNING;
		trace_sched_wakeup(p);
		goto out;
	}

	/*
	 * If we are going to wake up a thread waiting for CONDITION we
	 * need to ensure that CONDITION=1 done by the caller can not be
	 * reordered with p->state check below. This pairs with mb() in
	 * set_current_state() the waiting thread does.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	smp_mb__after_spinlock();
	if (!(p->state & state))
		goto unlock;

	trace_sched_waking(p);

	/* We're going to change ->state: */
	success = 1;
	cpu = task_cpu(p);

	/*
	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
	 * in smp_cond_load_acquire() below.
	 *
	 * sched_ttwu_pending()			try_to_wake_up()
	 *   STORE p->on_rq = 1			  LOAD p->state
	 *   UNLOCK rq->lock
	 *
	 * __schedule() (switch to task 'p')
	 *   LOCK rq->lock			  smp_rmb();
	 *   smp_mb__after_spinlock();
	 *   UNLOCK rq->lock
	 *
	 * [task p]
	 *   STORE p->state = UNINTERRUPTIBLE	  LOAD p->on_rq
	 *
	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
	 * __schedule().  See the comment for smp_mb__after_spinlock().
	 */
	smp_rmb();
	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto unlock;

#ifdef CONFIG_SMP
	/*
	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
	 * possible to, falsely, observe p->on_cpu == 0.
	 *
	 * One must be running (->on_cpu == 1) in order to remove oneself
	 * from the runqueue.
	 *
	 * __schedule() (switch to task 'p')	try_to_wake_up()
	 *   STORE p->on_cpu = 1		  LOAD p->on_rq
	 *   UNLOCK rq->lock
	 *
	 * __schedule() (put 'p' to sleep)
	 *   LOCK rq->lock			  smp_rmb();
	 *   smp_mb__after_spinlock();
	 *   STORE p->on_rq = 0			  LOAD p->on_cpu
	 *
	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
	 * __schedule().  See the comment for smp_mb__after_spinlock().
	 */
	smp_rmb();

	/*
	 * If the owning (remote) CPU is still in the middle of schedule() with
	 * this task as prev, wait until its done referencing the task.
	 *
	 * Pairs with the smp_store_release() in finish_task().
	 *
	 * This ensures that tasks getting woken will be fully ordered against
	 * their previous state and preserve Program Order.
	 */
	smp_cond_load_acquire(&p->on_cpu, !VAL);

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->in_iowait) {
		delayacct_blkio_end(p);
		atomic_dec(&task_rq(p)->nr_iowait);
	}

	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		psi_ttwu_dequeue(p);
		set_task_cpu(p, cpu);
	}

#else /* CONFIG_SMP */

	if (p->in_iowait) {
		delayacct_blkio_end(p);
		atomic_dec(&task_rq(p)->nr_iowait);
	}

#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu, wake_flags);
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
	if (success)
		ttwu_stat(p, cpu, wake_flags);
	preempt_enable();

	return success;
}
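
/*
 * Annotation: try_to_wake_up() therefore has three paths: waking current
 * (lockless ->state flip), waking a still-queued task (ttwu_remote() only
 * flips ->state), and waking a fully descheduled task, which gets a CPU
 * re-selected and is re-enqueued via ttwu_queue().
 */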

/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * This function executes a full memory barrier before accessing the task state.
 */
int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);

int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	p->on_rq			= 0;

	p->se.on_rq			= 0;
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.nr_migrations		= 0;
	p->se.vruntime			= 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq			= NULL;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* Even if schedstat is disabled, there should not be garbage */
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	RB_CLEAR_NODE(&p->dl.rb_node);
	init_dl_task_timer(&p->dl);
	init_dl_inactive_task_timer(&p->dl);
	__dl_clear_params(p);

	INIT_LIST_HEAD(&p->rt.run_list);
	p->rt.timeout		= 0;
	p->rt.time_slice	= sched_rr_timeslice;
	p->rt.on_rq		= 0;
	p->rt.on_list		= 0;

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_COMPACTION
	p->capture_control = NULL;
#endif
	init_numa_balancing(clone_flags, p);
}

DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);

#ifdef CONFIG_NUMA_BALANCING

void set_numabalancing_state(bool enabled)
{
	if (enabled)
		static_branch_enable(&sched_numa_balancing);
	else
		static_branch_disable(&sched_numa_balancing);
}

#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = static_branch_likely(&sched_numa_balancing);

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0)
		return err;
	if (write)
		set_numabalancing_state(state);
	return err;
}
#endif
#endif

#ifdef CONFIG_SCHEDSTATS

DEFINE_STATIC_KEY_FALSE(sched_schedstats);
static bool __initdata __sched_schedstats = false;

static void set_schedstats(bool enabled)
{
	if (enabled)
		static_branch_enable(&sched_schedstats);
	else
		static_branch_disable(&sched_schedstats);
}

void force_schedstat_enabled(void)
{
	if (!schedstat_enabled()) {
		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
		static_branch_enable(&sched_schedstats);
	}
}

static int __init setup_schedstats(char *str)
{
	int ret = 0;
	if (!str)
		goto out;

	/*
	 * This code is called before jump labels have been set up, so we can't
	 * change the static branch directly just yet.  Instead set a temporary
	 * variable so init_schedstats() can do it later.
	 */
	if (!strcmp(str, "enable")) {
		__sched_schedstats = true;
		ret = 1;
	} else if (!strcmp(str, "disable")) {
		__sched_schedstats = false;
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("Unable to parse schedstats=\n");

	return ret;
}
__setup("schedstats=", setup_schedstats);

static void __init init_schedstats(void)
{
	set_schedstats(__sched_schedstats);
}

#ifdef CONFIG_PROC_SYSCTL
int sysctl_schedstats(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = static_branch_likely(&sched_schedstats);

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0)
		return err;
	if (write)
		set_schedstats(state);
	return err;
}
#endif /* CONFIG_PROC_SYSCTL */
#else  /* !CONFIG_SCHEDSTATS */
static inline void init_schedstats(void) {}
#endif /* CONFIG_SCHEDSTATS */

/*
 * fork()/clone()-time setup:
 */
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long flags;

	__sched_fork(clone_flags, p);
	/*
	 * We mark the process as NEW here. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_NEW;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	uclamp_fork(p);

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p, false);

		/*
		 * We don't need the reset flag anymore after the fork. It has
		 * fulfilled its duty:
		 */
		p->sched_reset_on_fork = 0;
	}

	if (dl_prio(p->prio))
		return -EAGAIN;
	else if (rt_prio(p->prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	init_entity_runnable_average(&p->se);

	/*
	 * The child is not yet in the pid-hash so no cgroup attach races,
	 * and the cgroup is pinned to this child due to cgroup_fork()
	 * is ran before sched_fork().
	 *
	 * Silence PROVE_RCU.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	/*
	 * We're setting the CPU for the first time, we don't migrate,
	 * so use __set_task_cpu().
	 */
	__set_task_cpu(p, smp_processor_id());
	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#ifdef CONFIG_SCHED_INFO
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
	init_task_preempt_count(p);
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
	RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
	return 0;
}

unsigned long to_ratio(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)
		return BW_UNIT;

	/*
	 * Doing this here saves a lot of checks in all
	 * the calling paths, and returning zero seems
	 * safe for them anyway.
	 */
	if (period == 0)
		return 0;

	return div64_u64(runtime << BW_SHIFT, period);
}
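
/*
 * Annotation: to_ratio() returns runtime/period as a fixed-point fraction
 * scaled by BW_UNIT == 1 << BW_SHIFT. Assuming BW_SHIFT == 20, the RT
 * defaults above (runtime 950000, period 1000000) give
 * (950000 << 20) / 1000000 == 996147, i.e. ~0.95 * BW_UNIT.
 */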

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_ptr can change in the fork path
	 *  - any previously selected CPU might disappear through hotplug
	 *
	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
	 * as we're not fully set-up yet.
	 */
	p->recent_used_cpu = task_cpu(p);
	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
	post_init_entity_util_avg(p);

	activate_task(rq, p, ENQUEUE_NOCLOCK);
	trace_sched_wakeup_new(p);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Nothing relies on rq->lock after this, so its fine to
		 * drop it.
		 */
		rq_unpin_lock(rq, &rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, &rf);
	}
#endif
	task_rq_unlock(rq, p, &rf);
}

#ifdef CONFIG_PREEMPT_NOTIFIERS

static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);

void preempt_notifier_inc(void)
{
	static_branch_inc(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_inc);

void preempt_notifier_dec(void)
{
	static_branch_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_dec);

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	if (!static_branch_unlikely(&preempt_notifier_key))
		WARN(1, "registering preempt_notifier while notifiers disabled\n");

	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is *not* safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	if (static_branch_unlikely(&preempt_notifier_key))
		__fire_sched_in_preempt_notifiers(curr);
}

static void
__fire_sched_out_preempt_notifiers(struct task_struct *curr,
				   struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	if (static_branch_unlikely(&preempt_notifier_key))
		__fire_sched_out_preempt_notifiers(curr, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

static inline void prepare_task(struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * Claim the task as running, we do this before switching to it
	 * such that any running task will have this set.
	 */
	next->on_cpu = 1;
#endif
}

static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 *
	 * In particular, the load of prev->state in finish_task_switch() must
	 * happen before this.
	 *
	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
	 */
	smp_store_release(&prev->on_cpu, 0);
#endif
}

static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
	rq_unpin_lock(rq, rf);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = next;
#endif
}

static inline void finish_lock_switch(struct rq *rq)
{
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
	raw_spin_unlock_irq(&rq->lock);
}

/*
 * NOP if the arch has not defined these:
 */

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif

#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch()	do { } while (0)
#endif

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	kcov_prepare_switch(prev);
	sched_info_switch(rq, prev, next);
	perf_event_task_sched_out(prev, next);
	rseq_preempt(prev);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_task(next);
	prepare_arch_switch(next);
}
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064static struct rq *finish_task_switch(struct task_struct *prev)
3065 __releases(rq->lock)
3066{
3067 struct rq *rq = this_rq();
3068 struct mm_struct *mm = rq->prev_mm;
3069 long prev_state;
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
3083 "corrupted preempt_count: %s/%d/0x%x\n",
3084 current->comm, current->pid, preempt_count()))
3085 preempt_count_set(FORK_PREEMPT_COUNT);
3086
3087 rq->prev_mm = NULL;
3088
	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 *
	 * We must observe prev->state before clearing prev->on_cpu (in
	 * finish_task), otherwise a concurrent wakeup can get prev
	 * running on another CPU and we could race with its RUNNING -> DEAD
	 * transition, failing to schedule it twice.
	 */
3100 prev_state = prev->state;
3101 vtime_task_switch(prev);
3102 perf_event_task_sched_in(prev, current);
3103 finish_task(prev);
3104 finish_lock_switch(rq);
3105 finish_arch_post_lock_switch();
3106 kcov_finish_switch(current);
3107
3108 fire_sched_in_preempt_notifiers(current);
3109
	/*
	 * When switching through a kernel thread, the loop in
	 * membarrier_{private,global}_expedited() may have observed that
	 * kernel thread and not issued an IPI. It is therefore possible to
	 * schedule between user->kernel->user threads without passing through
	 * switch_mm(). Membarrier requires a barrier after storing to
	 * rq->curr, before returning to userspace, so provide them here:
	 *
	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
	 *   provided by mmdrop(),
	 * - a sync_core for SYNC_CORE.
	 */
3121 if (mm) {
3122 membarrier_mm_sync_core_before_usermode(mm);
3123 mmdrop(mm);
3124 }
3125 if (unlikely(prev_state == TASK_DEAD)) {
3126 if (prev->sched_class->task_dead)
3127 prev->sched_class->task_dead(prev);

		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
3133 kprobe_flush_task(prev);

		/* Task is done with its stack. */
3136 put_task_stack(prev);
3137
3138 put_task_struct(prev);
3139 }
3140
3141 tick_nohz_task_switch();
3142 return rq;
3143}
3144
3145#ifdef CONFIG_SMP

/* rq->lock is NOT pinned, so cannot unlock it */
3148static void __balance_callback(struct rq *rq)
3149{
3150 struct callback_head *head, *next;
3151 void (*func)(struct rq *rq);
3152 unsigned long flags;
3153
3154 raw_spin_lock_irqsave(&rq->lock, flags);
3155 head = rq->balance_callback;
3156 rq->balance_callback = NULL;
3157 while (head) {
3158 func = (void (*)(struct rq *))head->func;
3159 next = head->next;
3160 head->next = NULL;
3161 head = next;
3162
3163 func(rq);
3164 }
3165 raw_spin_unlock_irqrestore(&rq->lock, flags);
3166}
3167
3168static inline void balance_callback(struct rq *rq)
3169{
3170 if (unlikely(rq->balance_callback))
3171 __balance_callback(rq);
3172}
3173
3174#else
3175
3176static inline void balance_callback(struct rq *rq)
3177{
3178}
3179
3180#endif
3181
/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
3186asmlinkage __visible void schedule_tail(struct task_struct *prev)
3187 __releases(rq->lock)
3188{
3189 struct rq *rq;
3190
	/*
	 * New tasks start with FORK_PREEMPT_COUNT, see there and
	 * finish_task_switch() for details.
	 *
	 * finish_task_switch() will drop rq->lock and lower the preempt
	 * count, and the preempt_enable() below will end up enabling
	 * preemption (on PREEMPT_COUNT kernels).
	 */
3200 rq = finish_task_switch(prev);
3201 balance_callback(rq);
3202 preempt_enable();
3203
3204 if (current->set_child_tid)
3205 put_user(task_pid_vnr(current), current->set_child_tid);
3206
3207 calculate_sigpending();
3208}
3209
/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
3213static __always_inline struct rq *
3214context_switch(struct rq *rq, struct task_struct *prev,
3215 struct task_struct *next, struct rq_flags *rf)
3216{
3217 struct mm_struct *mm, *oldmm;
3218
3219 prepare_task_switch(rq, prev, next);
3220
3221 mm = next->mm;
3222 oldmm = prev->active_mm;
3223
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
3228 arch_start_context_switch(prev);
3229
	/*
	 * If mm is non-NULL, we pass through switch_mm(). If mm is
	 * NULL, we will pass through mmdrop() in finish_task_switch().
	 * Both of these contain the full memory barrier required by
	 * membarrier after storing to rq->curr, before returning to
	 * user-space.
	 */
3237 if (!mm) {
3238 next->active_mm = oldmm;
3239 mmgrab(oldmm);
3240 enter_lazy_tlb(oldmm, next);
3241 } else
3242 switch_mm_irqs_off(oldmm, mm, next);
3243
3244 if (!prev->mm) {
3245 prev->active_mm = NULL;
3246 rq->prev_mm = oldmm;
3247 }
3248
3249 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3250
3251 prepare_lock_switch(rq, next, rf);
3252
	/* Here we just switch the register state and the stack. */
3254 switch_to(prev, next, prev);
3255 barrier();
3256
3257 return finish_task_switch(prev);
3258}
3259
/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */
3266unsigned long nr_running(void)
3267{
3268 unsigned long i, sum = 0;
3269
3270 for_each_online_cpu(i)
3271 sum += cpu_rq(i)->nr_running;
3272
3273 return sum;
3274}
3275
/*
 * Check if only the current task is running on the CPU.
 *
 * Caution: this function does not check that the caller has disabled
 * preemption, thus the result might have a time-of-check-to-time-of-use
 * race.  The caller is responsible to use it correctly, for example:
 *
 * - from a non-preemptible section (of course)
 *
 * - from a thread that is bound to a single CPU
 *
 * - in a loop with very short iterations (e.g. a polling loop)
 */
3289bool single_task_running(void)
3290{
3291 return raw_rq()->nr_running == 1;
3292}
3293EXPORT_SYMBOL(single_task_running);
3294
3295unsigned long long nr_context_switches(void)
3296{
3297 int i;
3298 unsigned long long sum = 0;
3299
3300 for_each_possible_cpu(i)
3301 sum += cpu_rq(i)->nr_switches;
3302
3303 return sum;
3304}
3305
/*
 * Consumers of these two interfaces, like for example the cpuidle menu
 * governor, are using nonsensical data. Preferring shallow idle state
 * selection for a CPU that has IO-wait which might not even end up
 * running the task when it does become runnable.
 */
3313unsigned long nr_iowait_cpu(int cpu)
3314{
3315 return atomic_read(&cpu_rq(cpu)->nr_iowait);
3316}
3317
/*
 * IO-wait accounting, and how it's mostly bollocks (on SMP).
 *
 * The idea behind IO-wait accounting is to account the idle time that we
 * could have spent running if it were not for IO. That is, if we were to
 * improve the storage performance, we'd have a proportional reduction in
 * IO-wait time.
 *
 * This all works nicely on UP, where, when a task blocks on IO, we account
 * idle time as IO-wait, because if the storage were faster, it could've been
 * running and we'd not be idle.
 *
 * This has been extended to SMP, by doing the same for each CPU. This however
 * is broken.
 *
 * Imagine for instance the case where two tasks block on one CPU, only the one
 * CPU will have IO-wait accounted, while the other has regular idle. Even
 * though, if the storage were faster, both could've run at the same time,
 * utilising both CPUs.
 *
 * This means, that when looking globally, the current IO-wait accounting on
 * SMP is a lower bound, by reason of under accounting.
 *
 * Worse, since the numbers are provided per CPU, they are sometimes
 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
 * associated with any one particular CPU, it can wake to another CPU than it
 * blocked on. This means the per CPU IO-wait number is meaningless.
 *
 * Task CPU affinities can make all that even more 'interesting'.
 */
3348unsigned long nr_iowait(void)
3349{
3350 unsigned long i, sum = 0;
3351
3352 for_each_possible_cpu(i)
3353 sum += nr_iowait_cpu(i);
3354
3355 return sum;
3356}
3357
3358#ifdef CONFIG_SMP
3359
/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
3364void sched_exec(void)
3365{
3366 struct task_struct *p = current;
3367 unsigned long flags;
3368 int dest_cpu;
3369
3370 raw_spin_lock_irqsave(&p->pi_lock, flags);
3371 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
3372 if (dest_cpu == smp_processor_id())
3373 goto unlock;
3374
3375 if (likely(cpu_active(dest_cpu))) {
3376 struct migration_arg arg = { p, dest_cpu };
3377
3378 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3379 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3380 return;
3381 }
3382unlock:
3383 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3384}
3385
3386#endif
3387
3388DEFINE_PER_CPU(struct kernel_stat, kstat);
3389DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3390
3391EXPORT_PER_CPU_SYMBOL(kstat);
3392EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3393
/*
 * The function fair_sched_class.update_curr accesses the struct curr
 * and its field curr->exec_start; when called from task_sched_runtime(),
 * we observe a high rate of cache misses in practice.
 * Prefetching this data results in improved performance.
 */
3400static inline void prefetch_curr_exec_start(struct task_struct *p)
3401{
3402#ifdef CONFIG_FAIR_GROUP_SCHED
3403 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3404#else
3405 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3406#endif
3407 prefetch(curr);
3408 prefetch(&curr->exec_start);
3409}
3410
/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that has not been accounted yet.
 */
3416unsigned long long task_sched_runtime(struct task_struct *p)
3417{
3418 struct rq_flags rf;
3419 struct rq *rq;
3420 u64 ns;
3421
3422#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
	/*
	 * 64-bit doesn't need locks to atomically read a 64-bit value.
	 * So we have an optimization chance when the task's delta_exec is 0.
	 * Reading ->on_cpu is racy, but this is OK.
	 *
	 * If we race with it leaving CPU, we'll take a lock. So we're correct.
	 * If we race with it entering CPU, unaccounted time is 0. This is
	 * indistinguishable from the read occurring a few cycles earlier.
	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
	 * been accounted, so we're correct here as well.
	 */
3434 if (!p->on_cpu || !task_on_rq_queued(p))
3435 return p->se.sum_exec_runtime;
3436#endif
3437
3438 rq = task_rq_lock(p, &rf);
3439
	/*
	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
	 * project cycles that may never be accounted to this
	 * thread, breaking clock_gettime().
	 */
3444 if (task_current(rq, p) && task_on_rq_queued(p)) {
3445 prefetch_curr_exec_start(p);
3446 update_rq_clock(rq);
3447 p->sched_class->update_curr(rq);
3448 }
3449 ns = p->se.sum_exec_runtime;
3450 task_rq_unlock(rq, p, &rf);
3451
3452 return ns;
3453}
3454
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
3459void scheduler_tick(void)
3460{
3461 int cpu = smp_processor_id();
3462 struct rq *rq = cpu_rq(cpu);
3463 struct task_struct *curr = rq->curr;
3464 struct rq_flags rf;
3465
3466 sched_clock_tick();
3467
3468 rq_lock(rq, &rf);
3469
3470 update_rq_clock(rq);
3471 curr->sched_class->task_tick(rq, curr, 0);
3472 calc_global_load_tick(rq);
3473 psi_task_tick(rq);
3474
3475 rq_unlock(rq, &rf);
3476
3477 perf_event_task_tick();
3478
3479#ifdef CONFIG_SMP
3480 rq->idle_balance = idle_cpu(cpu);
3481 trigger_load_balance(rq);
3482#endif
3483}
3484
3485#ifdef CONFIG_NO_HZ_FULL
3486
3487struct tick_work {
3488 int cpu;
3489 struct delayed_work work;
3490};
3491
3492static struct tick_work __percpu *tick_work_cpu;
3493
3494static void sched_tick_remote(struct work_struct *work)
3495{
3496 struct delayed_work *dwork = to_delayed_work(work);
3497 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3498 int cpu = twork->cpu;
3499 struct rq *rq = cpu_rq(cpu);
3500 struct task_struct *curr;
3501 struct rq_flags rf;
3502 u64 delta;
3503
	/*
	 * Handle the tick only if it appears the remote CPU is running in
	 * full dynticks mode. The check is racy by nature, but missing a
	 * tick or having one too many is no big deal because the scheduler
	 * tick updates statistics and checks timeslices in a
	 * time-independent way, regardless of when exactly it is running.
	 */
3511 if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
3512 goto out_requeue;
3513
3514 rq_lock_irq(rq, &rf);
3515 curr = rq->curr;
3516 if (is_idle_task(curr))
3517 goto out_unlock;
3518
3519 update_rq_clock(rq);
3520 delta = rq_clock_task(rq) - curr->se.exec_start;
	/*
	 * Make sure the next tick runs within a reasonable
	 * amount of time.
	 */
3526 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3527 curr->sched_class->task_tick(rq, curr, 0);
3528
3529out_unlock:
3530 rq_unlock_irq(rq, &rf);
3531
3532out_requeue:
	/*
	 * Run the remote tick once per second (1Hz). This arbitrary
	 * frequency is large enough to avoid overload but short enough
	 * to keep scheduler internal stats reasonably up to date.
	 */
3538 queue_delayed_work(system_unbound_wq, dwork, HZ);
3539}
3540
3541static void sched_tick_start(int cpu)
3542{
3543 struct tick_work *twork;
3544
3545 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3546 return;
3547
3548 WARN_ON_ONCE(!tick_work_cpu);
3549
3550 twork = per_cpu_ptr(tick_work_cpu, cpu);
3551 twork->cpu = cpu;
3552 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3553 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3554}
3555
3556#ifdef CONFIG_HOTPLUG_CPU
3557static void sched_tick_stop(int cpu)
3558{
3559 struct tick_work *twork;
3560
3561 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3562 return;
3563
3564 WARN_ON_ONCE(!tick_work_cpu);
3565
3566 twork = per_cpu_ptr(tick_work_cpu, cpu);
3567 cancel_delayed_work_sync(&twork->work);
3568}
3569#endif
3570
3571int __init sched_tick_offload_init(void)
3572{
3573 tick_work_cpu = alloc_percpu(struct tick_work);
3574 BUG_ON(!tick_work_cpu);
3575
3576 return 0;
3577}
3578
3579#else
3580static inline void sched_tick_start(int cpu) { }
3581static inline void sched_tick_stop(int cpu) { }
3582#endif
3583
3584#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3585 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
 * If the value passed in is equal to the current preempt count
 * then we just disabled preemption. Start timing the latency.
 */
3590static inline void preempt_latency_start(int val)
3591{
3592 if (preempt_count() == val) {
3593 unsigned long ip = get_lock_parent_ip();
3594#ifdef CONFIG_DEBUG_PREEMPT
3595 current->preempt_disable_ip = ip;
3596#endif
3597 trace_preempt_off(CALLER_ADDR0, ip);
3598 }
3599}
3600
3601void preempt_count_add(int val)
3602{
3603#ifdef CONFIG_DEBUG_PREEMPT
3604
3605
3606
3607 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3608 return;
3609#endif
3610 __preempt_count_add(val);
3611#ifdef CONFIG_DEBUG_PREEMPT
3612
3613
3614
3615 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3616 PREEMPT_MASK - 10);
3617#endif
3618 preempt_latency_start(val);
3619}
3620EXPORT_SYMBOL(preempt_count_add);
3621NOKPROBE_SYMBOL(preempt_count_add);
3622
/*
 * If the value passed in equals the current preempt count
 * then we just enabled preemption. Stop timing the latency.
 */
3627static inline void preempt_latency_stop(int val)
3628{
3629 if (preempt_count() == val)
3630 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3631}
3632
3633void preempt_count_sub(int val)
3634{
3635#ifdef CONFIG_DEBUG_PREEMPT
3636
3637
3638
3639 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3640 return;
3641
3642
3643
3644 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3645 !(preempt_count() & PREEMPT_MASK)))
3646 return;
3647#endif
3648
3649 preempt_latency_stop(val);
3650 __preempt_count_sub(val);
3651}
3652EXPORT_SYMBOL(preempt_count_sub);
3653NOKPROBE_SYMBOL(preempt_count_sub);
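
/*
 * Worked example of the nesting behaviour above (a sketch;
 * preempt_disable() and preempt_enable() are the usual wrappers, and on
 * debug/tracing kernels they resolve to preempt_count_add()/
 * preempt_count_sub() with val == 1):
 *
 *	preempt_disable();	// count 0 -> 1, preempt_latency_start() fires
 *	preempt_disable();	// count 1 -> 2, no trace event
 *	preempt_enable();	// count 2 -> 1, no trace event
 *	preempt_enable();	// count 1 -> 0, preempt_latency_stop() fires
 *
 * The latency hooks only fire on the outermost transition because they
 * compare preempt_count() against the value being added or removed.
 */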
3654
3655#else
3656static inline void preempt_latency_start(int val) { }
3657static inline void preempt_latency_stop(int val) { }
3658#endif
3659
3660static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
3661{
3662#ifdef CONFIG_DEBUG_PREEMPT
3663 return p->preempt_disable_ip;
3664#else
3665 return 0;
3666#endif
3667}
3668
/*
 * Print scheduling while atomic bug:
 */
3672static noinline void __schedule_bug(struct task_struct *prev)
3673{
3674
3675 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3676
3677 if (oops_in_progress)
3678 return;
3679
3680 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3681 prev->comm, prev->pid, preempt_count());
3682
3683 debug_show_held_locks(prev);
3684 print_modules();
3685 if (irqs_disabled())
3686 print_irqtrace_events(prev);
3687 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3688 && in_atomic_preempt_off()) {
3689 pr_err("Preemption disabled at:");
3690 print_ip_sym(preempt_disable_ip);
3691 pr_cont("\n");
3692 }
3693 if (panic_on_warn)
3694 panic("scheduling while atomic\n");
3695
3696 dump_stack();
3697 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3698}
3699
/*
 * Various schedule()-time debugging checks and statistics:
 */
3703static inline void schedule_debug(struct task_struct *prev)
3704{
3705#ifdef CONFIG_SCHED_STACK_END_CHECK
3706 if (task_stack_end_corrupted(prev))
3707 panic("corrupted stack end detected inside scheduler\n");
3708#endif
3709
3710 if (unlikely(in_atomic_preempt_off())) {
3711 __schedule_bug(prev);
3712 preempt_count_set(PREEMPT_DISABLED);
3713 }
3714 rcu_sleep_check();
3715
3716 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3717
3718 schedstat_inc(this_rq()->sched_count);
3719}
3720
/*
 * Pick up the highest-prio task:
 */
3724static inline struct task_struct *
3725pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
3726{
3727 const struct sched_class *class;
3728 struct task_struct *p;
3729
	/*
	 * Optimization: we know that if all tasks are in the fair class we can
	 * call that function directly, but only if the @prev task wasn't of a
	 * higher scheduling class, because otherwise those lose the
	 * opportunity to pull in more work from other CPUs.
	 */
3736 if (likely((prev->sched_class == &idle_sched_class ||
3737 prev->sched_class == &fair_sched_class) &&
3738 rq->nr_running == rq->cfs.h_nr_running)) {
3739
3740 p = fair_sched_class.pick_next_task(rq, prev, rf);
3741 if (unlikely(p == RETRY_TASK))
3742 goto again;
3743
		/* Assumes fair_sched_class->next == idle_sched_class */
3745 if (unlikely(!p))
3746 p = idle_sched_class.pick_next_task(rq, prev, rf);
3747
3748 return p;
3749 }
3750
3751again:
3752 for_each_class(class) {
3753 p = class->pick_next_task(rq, prev, rf);
3754 if (p) {
3755 if (unlikely(p == RETRY_TASK))
3756 goto again;
3757 return p;
3758 }
3759 }
3760
	/* The idle class should always have a runnable task: */
3762 BUG();
3763}
3764
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
3804static void __sched notrace __schedule(bool preempt)
3805{
3806 struct task_struct *prev, *next;
3807 unsigned long *switch_count;
3808 struct rq_flags rf;
3809 struct rq *rq;
3810 int cpu;
3811
3812 cpu = smp_processor_id();
3813 rq = cpu_rq(cpu);
3814 prev = rq->curr;
3815
3816 schedule_debug(prev);
3817
3818 if (sched_feat(HRTICK))
3819 hrtick_clear(rq);
3820
3821 local_irq_disable();
3822 rcu_note_context_switch(preempt);
3823
	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 *
	 * The membarrier system call requires a full memory barrier
	 * after coming from user-space, before storing to rq->curr.
	 */
3832 rq_lock(rq, &rf);
3833 smp_mb__after_spinlock();
3834
	/* Promote REQ to ACT */
3836 rq->clock_update_flags <<= 1;
3837 update_rq_clock(rq);
3838
3839 switch_count = &prev->nivcsw;
3840 if (!preempt && prev->state) {
3841 if (signal_pending_state(prev->state, prev)) {
3842 prev->state = TASK_RUNNING;
3843 } else {
3844 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3845
3846 if (prev->in_iowait) {
3847 atomic_inc(&rq->nr_iowait);
3848 delayacct_blkio_start();
3849 }
3850 }
3851 switch_count = &prev->nvcsw;
3852 }
3853
3854 next = pick_next_task(rq, prev, &rf);
3855 clear_tsk_need_resched(prev);
3856 clear_preempt_need_resched();
3857
3858 if (likely(prev != next)) {
3859 rq->nr_switches++;
3860 rq->curr = next;
		/*
		 * The membarrier system call requires each architecture
		 * to have a full memory barrier after updating
		 * rq->curr, before returning to user-space.
		 *
		 * Here are the schemes providing that barrier on the
		 * various architectures:
		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
		 *   switch_mm() relies on membarrier_arch_switch_mm() on PowerPC.
		 * - finish_lock_switch() for weakly-ordered
		 *   architectures where spin_unlock is a full barrier,
		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
		 *   is a RELEASE barrier),
		 */
3875 ++*switch_count;
3876
3877 trace_sched_switch(preempt, prev, next);
3878
		/* Also unlocks the rq: */
3880 rq = context_switch(rq, prev, next, &rf);
3881 } else {
3882 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3883 rq_unlock_irq(rq, &rf);
3884 }
3885
3886 balance_callback(rq);
3887}
3888
3889void __noreturn do_task_dead(void)
3890{
3891
3892 set_special_state(TASK_DEAD);
3893
3894
3895 current->flags |= PF_NOFREEZE;
3896
3897 __schedule(false);
3898 BUG();
3899
3900
3901 for (;;)
3902 cpu_relax();
3903}
3904
3905static inline void sched_submit_work(struct task_struct *tsk)
3906{
3907 if (!tsk->state)
3908 return;
3909
	/*
	 * If a worker went to sleep, notify and ask workqueue whether
	 * it wants to wake up a task to maintain concurrency.
	 * As this function is called inside the schedule() context,
	 * we disable preemption to avoid it calling schedule() again
	 * in the possible wakeup of a kworker.
	 */
3917 if (tsk->flags & PF_WQ_WORKER) {
3918 preempt_disable();
3919 wq_worker_sleeping(tsk);
3920 preempt_enable_no_resched();
3921 }
3922
3923 if (tsk_is_pi_blocked(tsk))
3924 return;
3925
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
3930 if (blk_needs_flush_plug(tsk))
3931 blk_schedule_flush_plug(tsk);
3932}
3933
3934static void sched_update_worker(struct task_struct *tsk)
3935{
3936 if (tsk->flags & PF_WQ_WORKER)
3937 wq_worker_running(tsk);
3938}
3939
3940asmlinkage __visible void __sched schedule(void)
3941{
3942 struct task_struct *tsk = current;
3943
3944 sched_submit_work(tsk);
3945 do {
3946 preempt_disable();
3947 __schedule(false);
3948 sched_preempt_enable_no_resched();
3949 } while (need_resched());
3950 sched_update_worker(tsk);
3951}
3952EXPORT_SYMBOL(schedule);
3953
/*
 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
 * state (have scheduled out non-voluntarily) by making sure that all
 * tasks have either left the run queue or have gone into user space.
 * As idle tasks do not do either, they must not ever be preempted
 * (schedule out non-voluntarily).
 *
 * schedule_idle() is similar to schedule_preempt_disabled() except that it
 * never enables preemption because it does not call sched_submit_work().
 */
3964void __sched schedule_idle(void)
3965{
	/*
	 * As this skips calling sched_submit_work(), which the idle task does
	 * regardless because that function is a nop when the task is in a
	 * TASK_RUNNING state, make sure this isn't used someplace that the
	 * current task can be in any other state. Note, idle is always in the
	 * TASK_RUNNING state.
	 */
3973 WARN_ON_ONCE(current->state);
3974 do {
3975 __schedule(false);
3976 } while (need_resched());
3977}
3978
3979#ifdef CONFIG_CONTEXT_TRACKING
3980asmlinkage __visible void __sched schedule_user(void)
3981{
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992 enum ctx_state prev_state = exception_enter();
3993 schedule();
3994 exception_exit(prev_state);
3995}
3996#endif
3997
3998
3999
4000
4001
4002
4003void __sched schedule_preempt_disabled(void)
4004{
4005 sched_preempt_enable_no_resched();
4006 schedule();
4007 preempt_disable();
4008}
4009
4010static void __sched notrace preempt_schedule_common(void)
4011{
4012 do {
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026 preempt_disable_notrace();
4027 preempt_latency_start(1);
4028 __schedule(true);
4029 preempt_latency_stop(1);
4030 preempt_enable_no_resched_notrace();
4031
4032
4033
4034
4035
4036 } while (need_resched());
4037}
4038
4039#ifdef CONFIG_PREEMPT
4040
4041
4042
4043
4044
4045asmlinkage __visible void __sched notrace preempt_schedule(void)
4046{
4047
4048
4049
4050
4051 if (likely(!preemptible()))
4052 return;
4053
4054 preempt_schedule_common();
4055}
4056NOKPROBE_SYMBOL(preempt_schedule);
4057EXPORT_SYMBOL(preempt_schedule);
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
4074{
4075 enum ctx_state prev_ctx;
4076
4077 if (likely(!preemptible()))
4078 return;
4079
4080 do {
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094 preempt_disable_notrace();
4095 preempt_latency_start(1);
4096
4097
4098
4099
4100
4101 prev_ctx = exception_enter();
4102 __schedule(true);
4103 exception_exit(prev_ctx);
4104
4105 preempt_latency_stop(1);
4106 preempt_enable_no_resched_notrace();
4107 } while (need_resched());
4108}
4109EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
4110
4111#endif
4112
4113
4114
4115
4116
4117
4118
4119asmlinkage __visible void __sched preempt_schedule_irq(void)
4120{
4121 enum ctx_state prev_state;
4122
4123
4124 BUG_ON(preempt_count() || !irqs_disabled());
4125
4126 prev_state = exception_enter();
4127
4128 do {
4129 preempt_disable();
4130 local_irq_enable();
4131 __schedule(true);
4132 local_irq_disable();
4133 sched_preempt_enable_no_resched();
4134 } while (need_resched());
4135
4136 exception_exit(prev_state);
4137}
4138
4139int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
4140 void *key)
4141{
4142 return try_to_wake_up(curr->private, mode, wake_flags);
4143}
4144EXPORT_SYMBOL(default_wake_function);
4145
4146#ifdef CONFIG_RT_MUTEXES
4147
4148static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
4149{
4150 if (pi_task)
4151 prio = min(prio, pi_task->prio);
4152
4153 return prio;
4154}
4155
4156static inline int rt_effective_prio(struct task_struct *p, int prio)
4157{
4158 struct task_struct *pi_task = rt_mutex_get_top_task(p);
4159
4160 return __rt_effective_prio(pi_task, prio);
4161}
4162
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic. Call site only calls if the priority of the task changed.
 */
4174void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
4175{
4176 int prio, oldprio, queued, running, queue_flag =
4177 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4178 const struct sched_class *prev_class;
4179 struct rq_flags rf;
4180 struct rq *rq;
4181
4182
4183 prio = __rt_effective_prio(pi_task, p->normal_prio);
4184
4185
4186
4187
4188 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
4189 return;
4190
4191 rq = __task_rq_lock(p, &rf);
4192 update_rq_clock(rq);
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203 p->pi_top_task = pi_task;
4204
4205
4206
4207
4208 if (prio == p->prio && !dl_prio(prio))
4209 goto out_unlock;
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223 if (unlikely(p == rq->idle)) {
4224 WARN_ON(p != rq->curr);
4225 WARN_ON(p->pi_blocked_on);
4226 goto out_unlock;
4227 }
4228
4229 trace_sched_pi_setprio(p, pi_task);
4230 oldprio = p->prio;
4231
4232 if (oldprio == prio)
4233 queue_flag &= ~DEQUEUE_MOVE;
4234
4235 prev_class = p->sched_class;
4236 queued = task_on_rq_queued(p);
4237 running = task_current(rq, p);
4238 if (queued)
4239 dequeue_task(rq, p, queue_flag);
4240 if (running)
4241 put_prev_task(rq, p);
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252 if (dl_prio(prio)) {
4253 if (!dl_prio(p->normal_prio) ||
4254 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
4255 p->dl.dl_boosted = 1;
4256 queue_flag |= ENQUEUE_REPLENISH;
4257 } else
4258 p->dl.dl_boosted = 0;
4259 p->sched_class = &dl_sched_class;
4260 } else if (rt_prio(prio)) {
4261 if (dl_prio(oldprio))
4262 p->dl.dl_boosted = 0;
4263 if (oldprio < prio)
4264 queue_flag |= ENQUEUE_HEAD;
4265 p->sched_class = &rt_sched_class;
4266 } else {
4267 if (dl_prio(oldprio))
4268 p->dl.dl_boosted = 0;
4269 if (rt_prio(oldprio))
4270 p->rt.timeout = 0;
4271 p->sched_class = &fair_sched_class;
4272 }
4273
4274 p->prio = prio;
4275
4276 if (queued)
4277 enqueue_task(rq, p, queue_flag);
4278 if (running)
4279 set_curr_task(rq, p);
4280
4281 check_class_changed(rq, p, prev_class, oldprio);
4282out_unlock:
4283
4284 preempt_disable();
4285 __task_rq_unlock(rq, &rf);
4286
4287 balance_callback(rq);
4288 preempt_enable();
4289}
4290#else
4291static inline int rt_effective_prio(struct task_struct *p, int prio)
4292{
4293 return prio;
4294}
4295#endif
4296
4297void set_user_nice(struct task_struct *p, long nice)
4298{
4299 bool queued, running;
4300 int old_prio, delta;
4301 struct rq_flags rf;
4302 struct rq *rq;
4303
4304 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
4305 return;
4306
4307
4308
4309
4310 rq = task_rq_lock(p, &rf);
4311 update_rq_clock(rq);
4312
4313
4314
4315
4316
4317
4318
4319 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4320 p->static_prio = NICE_TO_PRIO(nice);
4321 goto out_unlock;
4322 }
4323 queued = task_on_rq_queued(p);
4324 running = task_current(rq, p);
4325 if (queued)
4326 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
4327 if (running)
4328 put_prev_task(rq, p);
4329
4330 p->static_prio = NICE_TO_PRIO(nice);
4331 set_load_weight(p, true);
4332 old_prio = p->prio;
4333 p->prio = effective_prio(p);
4334 delta = p->prio - old_prio;
4335
4336 if (queued) {
4337 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4338
4339
4340
4341
4342 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4343 resched_curr(rq);
4344 }
4345 if (running)
4346 set_curr_task(rq, p);
4347out_unlock:
4348 task_rq_unlock(rq, p, &rf);
4349}
4350EXPORT_SYMBOL(set_user_nice);
4351
4352
4353
4354
4355
4356
4357int can_nice(const struct task_struct *p, const int nice)
4358{
4359
4360 int nice_rlim = nice_to_rlimit(nice);
4361
4362 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4363 capable(CAP_SYS_NICE));
4364}
4365
4366#ifdef __ARCH_WANT_SYS_NICE
4367
4368
4369
4370
4371
4372
4373
4374
4375SYSCALL_DEFINE1(nice, int, increment)
4376{
4377 long nice, retval;
4378
4379
4380
4381
4382
4383
4384 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
4385 nice = task_nice(current) + increment;
4386
4387 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
4388 if (increment < 0 && !can_nice(current, nice))
4389 return -EPERM;
4390
4391 retval = security_task_setnice(current, nice);
4392 if (retval)
4393 return retval;
4394
4395 set_user_nice(current, nice);
4396 return 0;
4397}
4398
4399#endif
4400
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc: RT tasks map
 * to [-100, -1] and normal tasks to [0, 39], corresponding to nice
 * values -20 to 19.
 */
4409int task_prio(const struct task_struct *p)
4410{
4411 return p->prio - MAX_RT_PRIO;
4412}
4413
/**
 * idle_cpu - is a given CPU idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
4420int idle_cpu(int cpu)
4421{
4422 struct rq *rq = cpu_rq(cpu);
4423
4424 if (rq->curr != rq->idle)
4425 return 0;
4426
4427 if (rq->nr_running)
4428 return 0;
4429
4430#ifdef CONFIG_SMP
4431 if (!llist_empty(&rq->wake_list))
4432 return 0;
4433#endif
4434
4435 return 1;
4436}
4437
/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
4444int available_idle_cpu(int cpu)
4445{
4446 if (!idle_cpu(cpu))
4447 return 0;
4448
4449 if (vcpu_is_preempted(cpu))
4450 return 0;
4451
4452 return 1;
4453}
4454
4455
4456
4457
4458
4459
4460
4461struct task_struct *idle_task(int cpu)
4462{
4463 return cpu_rq(cpu)->idle;
4464}
4465
4466
4467
4468
4469
4470
4471
4472static struct task_struct *find_process_by_pid(pid_t pid)
4473{
4474 return pid ? find_task_by_vpid(pid) : current;
4475}
4476
4477
4478
4479
4480
4481#define SETPARAM_POLICY -1
4482
4483static void __setscheduler_params(struct task_struct *p,
4484 const struct sched_attr *attr)
4485{
4486 int policy = attr->sched_policy;
4487
4488 if (policy == SETPARAM_POLICY)
4489 policy = p->policy;
4490
4491 p->policy = policy;
4492
4493 if (dl_policy(policy))
4494 __setparam_dl(p, attr);
4495 else if (fair_policy(policy))
4496 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4497
4498
4499
4500
4501
4502
4503 p->rt_priority = attr->sched_priority;
4504 p->normal_prio = normal_prio(p);
4505 set_load_weight(p, true);
4506}
4507
4508
4509static void __setscheduler(struct rq *rq, struct task_struct *p,
4510 const struct sched_attr *attr, bool keep_boost)
4511{
4512
4513
4514
4515
4516 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4517 return;
4518
4519 __setscheduler_params(p, attr);
4520
4521
4522
4523
4524
4525 p->prio = normal_prio(p);
4526 if (keep_boost)
4527 p->prio = rt_effective_prio(p, p->prio);
4528
4529 if (dl_prio(p->prio))
4530 p->sched_class = &dl_sched_class;
4531 else if (rt_prio(p->prio))
4532 p->sched_class = &rt_sched_class;
4533 else
4534 p->sched_class = &fair_sched_class;
4535}
4536
4537
4538
4539
4540static bool check_same_owner(struct task_struct *p)
4541{
4542 const struct cred *cred = current_cred(), *pcred;
4543 bool match;
4544
4545 rcu_read_lock();
4546 pcred = __task_cred(p);
4547 match = (uid_eq(cred->euid, pcred->euid) ||
4548 uid_eq(cred->euid, pcred->uid));
4549 rcu_read_unlock();
4550 return match;
4551}
4552
4553static int __sched_setscheduler(struct task_struct *p,
4554 const struct sched_attr *attr,
4555 bool user, bool pi)
4556{
4557 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4558 MAX_RT_PRIO - 1 - attr->sched_priority;
4559 int retval, oldprio, oldpolicy = -1, queued, running;
4560 int new_effective_prio, policy = attr->sched_policy;
4561 const struct sched_class *prev_class;
4562 struct rq_flags rf;
4563 int reset_on_fork;
4564 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4565 struct rq *rq;
4566
4567
4568 BUG_ON(pi && in_interrupt());
4569recheck:
4570
4571 if (policy < 0) {
4572 reset_on_fork = p->sched_reset_on_fork;
4573 policy = oldpolicy = p->policy;
4574 } else {
4575 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4576
4577 if (!valid_policy(policy))
4578 return -EINVAL;
4579 }
4580
4581 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
4582 return -EINVAL;
4583
4584
4585
4586
4587
4588
4589 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4590 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4591 return -EINVAL;
4592 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4593 (rt_policy(policy) != (attr->sched_priority != 0)))
4594 return -EINVAL;
4595
4596
4597
4598
4599 if (user && !capable(CAP_SYS_NICE)) {
4600 if (fair_policy(policy)) {
4601 if (attr->sched_nice < task_nice(p) &&
4602 !can_nice(p, attr->sched_nice))
4603 return -EPERM;
4604 }
4605
4606 if (rt_policy(policy)) {
4607 unsigned long rlim_rtprio =
4608 task_rlimit(p, RLIMIT_RTPRIO);
4609
4610
4611 if (policy != p->policy && !rlim_rtprio)
4612 return -EPERM;
4613
4614
4615 if (attr->sched_priority > p->rt_priority &&
4616 attr->sched_priority > rlim_rtprio)
4617 return -EPERM;
4618 }
4619
4620
4621
4622
4623
4624
4625
4626 if (dl_policy(policy))
4627 return -EPERM;
4628
4629
4630
4631
4632
4633 if (task_has_idle_policy(p) && !idle_policy(policy)) {
4634 if (!can_nice(p, task_nice(p)))
4635 return -EPERM;
4636 }
4637
4638
4639 if (!check_same_owner(p))
4640 return -EPERM;
4641
4642
4643 if (p->sched_reset_on_fork && !reset_on_fork)
4644 return -EPERM;
4645 }
4646
4647 if (user) {
4648 if (attr->sched_flags & SCHED_FLAG_SUGOV)
4649 return -EINVAL;
4650
4651 retval = security_task_setscheduler(p);
4652 if (retval)
4653 return retval;
4654 }
4655
4656
4657 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
4658 retval = uclamp_validate(p, attr);
4659 if (retval)
4660 return retval;
4661 }
4662
4663
4664
4665
4666
4667
4668
4669
4670 rq = task_rq_lock(p, &rf);
4671 update_rq_clock(rq);
4672
4673
4674
4675
4676 if (p == rq->stop) {
4677 task_rq_unlock(rq, p, &rf);
4678 return -EINVAL;
4679 }
4680
4681
4682
4683
4684
4685 if (unlikely(policy == p->policy)) {
4686 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4687 goto change;
4688 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4689 goto change;
4690 if (dl_policy(policy) && dl_param_changed(p, attr))
4691 goto change;
4692 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
4693 goto change;
4694
4695 p->sched_reset_on_fork = reset_on_fork;
4696 task_rq_unlock(rq, p, &rf);
4697 return 0;
4698 }
4699change:
4700
4701 if (user) {
4702#ifdef CONFIG_RT_GROUP_SCHED
4703
4704
4705
4706
4707 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4708 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4709 !task_group_is_autogroup(task_group(p))) {
4710 task_rq_unlock(rq, p, &rf);
4711 return -EPERM;
4712 }
4713#endif
4714#ifdef CONFIG_SMP
4715 if (dl_bandwidth_enabled() && dl_policy(policy) &&
4716 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
4717 cpumask_t *span = rq->rd->span;
4718
4719
4720
4721
4722
4723
4724 if (!cpumask_subset(span, p->cpus_ptr) ||
4725 rq->rd->dl_bw.bw == 0) {
4726 task_rq_unlock(rq, p, &rf);
4727 return -EPERM;
4728 }
4729 }
4730#endif
4731 }
4732
4733
4734 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4735 policy = oldpolicy = -1;
4736 task_rq_unlock(rq, p, &rf);
4737 goto recheck;
4738 }
4739
4740
4741
4742
4743
4744
4745 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4746 task_rq_unlock(rq, p, &rf);
4747 return -EBUSY;
4748 }
4749
4750 p->sched_reset_on_fork = reset_on_fork;
4751 oldprio = p->prio;
4752
4753 if (pi) {
4754
4755
4756
4757
4758
4759
4760
4761 new_effective_prio = rt_effective_prio(p, newprio);
4762 if (new_effective_prio == oldprio)
4763 queue_flags &= ~DEQUEUE_MOVE;
4764 }
4765
4766 queued = task_on_rq_queued(p);
4767 running = task_current(rq, p);
4768 if (queued)
4769 dequeue_task(rq, p, queue_flags);
4770 if (running)
4771 put_prev_task(rq, p);
4772
4773 prev_class = p->sched_class;
4774
4775 __setscheduler(rq, p, attr, pi);
4776 __setscheduler_uclamp(p, attr);
4777
4778 if (queued) {
4779
4780
4781
4782
4783 if (oldprio < p->prio)
4784 queue_flags |= ENQUEUE_HEAD;
4785
4786 enqueue_task(rq, p, queue_flags);
4787 }
4788 if (running)
4789 set_curr_task(rq, p);
4790
4791 check_class_changed(rq, p, prev_class, oldprio);
4792
4793
4794 preempt_disable();
4795 task_rq_unlock(rq, p, &rf);
4796
4797 if (pi)
4798 rt_mutex_adjust_pi(p);
4799
4800
4801 balance_callback(rq);
4802 preempt_enable();
4803
4804 return 0;
4805}
4806
4807static int _sched_setscheduler(struct task_struct *p, int policy,
4808 const struct sched_param *param, bool check)
4809{
4810 struct sched_attr attr = {
4811 .sched_policy = policy,
4812 .sched_priority = param->sched_priority,
4813 .sched_nice = PRIO_TO_NICE(p->static_prio),
4814 };
4815
4816
4817 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4818 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4819 policy &= ~SCHED_RESET_ON_FORK;
4820 attr.sched_policy = policy;
4821 }
4822
4823 return __sched_setscheduler(p, &attr, check, true);
4824}
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835int sched_setscheduler(struct task_struct *p, int policy,
4836 const struct sched_param *param)
4837{
4838 return _sched_setscheduler(p, policy, param, true);
4839}
4840EXPORT_SYMBOL_GPL(sched_setscheduler);
4841
4842int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4843{
4844 return __sched_setscheduler(p, attr, true, true);
4845}
4846EXPORT_SYMBOL_GPL(sched_setattr);
4847
4848int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
4849{
4850 return __sched_setscheduler(p, attr, false, true);
4851}
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4867 const struct sched_param *param)
4868{
4869 return _sched_setscheduler(p, policy, param, false);
4870}
4871EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4872
4873static int
4874do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4875{
4876 struct sched_param lparam;
4877 struct task_struct *p;
4878 int retval;
4879
4880 if (!param || pid < 0)
4881 return -EINVAL;
4882 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4883 return -EFAULT;
4884
4885 rcu_read_lock();
4886 retval = -ESRCH;
4887 p = find_process_by_pid(pid);
4888 if (p != NULL)
4889 retval = sched_setscheduler(p, policy, &lparam);
4890 rcu_read_unlock();
4891
4892 return retval;
4893}
4894
4895
4896
4897
4898static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
4899{
4900 u32 size;
4901 int ret;
4902
4903 if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
4904 return -EFAULT;
4905
4906
4907 memset(attr, 0, sizeof(*attr));
4908
4909 ret = get_user(size, &uattr->size);
4910 if (ret)
4911 return ret;
4912
4913
4914 if (size > PAGE_SIZE)
4915 goto err_size;
4916
4917
4918 if (!size)
4919 size = SCHED_ATTR_SIZE_VER0;
4920
4921 if (size < SCHED_ATTR_SIZE_VER0)
4922 goto err_size;
4923
4924
4925
4926
4927
4928
4929
4930 if (size > sizeof(*attr)) {
4931 unsigned char __user *addr;
4932 unsigned char __user *end;
4933 unsigned char val;
4934
4935 addr = (void __user *)uattr + sizeof(*attr);
4936 end = (void __user *)uattr + size;
4937
4938 for (; addr < end; addr++) {
4939 ret = get_user(val, addr);
4940 if (ret)
4941 return ret;
4942 if (val)
4943 goto err_size;
4944 }
4945 size = sizeof(*attr);
4946 }
4947
4948 ret = copy_from_user(attr, uattr, size);
4949 if (ret)
4950 return -EFAULT;
4951
4952 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
4953 size < SCHED_ATTR_SIZE_VER1)
4954 return -EINVAL;
4955
4956
4957
4958
4959
4960 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4961
4962 return 0;
4963
4964err_size:
4965 put_user(sizeof(*attr), &uattr->size);
4966 return -E2BIG;
4967}
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
4978{
4979 if (policy < 0)
4980 return -EINVAL;
4981
4982 return do_sched_setscheduler(pid, policy, param);
4983}
4984
4985
4986
4987
4988
4989
4990
4991
4992SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4993{
4994 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4995}
4996
4997
4998
4999
5000
5001
5002
5003SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
5004 unsigned int, flags)
5005{
5006 struct sched_attr attr;
5007 struct task_struct *p;
5008 int retval;
5009
5010 if (!uattr || pid < 0 || flags)
5011 return -EINVAL;
5012
5013 retval = sched_copy_attr(uattr, &attr);
5014 if (retval)
5015 return retval;
5016
5017 if ((int)attr.sched_policy < 0)
5018 return -EINVAL;
5019 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
5020 attr.sched_policy = SETPARAM_POLICY;
5021
5022 rcu_read_lock();
5023 retval = -ESRCH;
5024 p = find_process_by_pid(pid);
5025 if (likely(p))
5026 get_task_struct(p);
5027 rcu_read_unlock();
5028
5029 if (likely(p)) {
5030 retval = sched_setattr(p, &attr);
5031 put_task_struct(p);
5032 }
5033
5034 return retval;
5035}
5036
5037
5038
5039
5040
5041
5042
5043
5044SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5045{
5046 struct task_struct *p;
5047 int retval;
5048
5049 if (pid < 0)
5050 return -EINVAL;
5051
5052 retval = -ESRCH;
5053 rcu_read_lock();
5054 p = find_process_by_pid(pid);
5055 if (p) {
5056 retval = security_task_getscheduler(p);
5057 if (!retval)
5058 retval = p->policy
5059 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5060 }
5061 rcu_read_unlock();
5062 return retval;
5063}
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5074{
5075 struct sched_param lp = { .sched_priority = 0 };
5076 struct task_struct *p;
5077 int retval;
5078
5079 if (!param || pid < 0)
5080 return -EINVAL;
5081
5082 rcu_read_lock();
5083 p = find_process_by_pid(pid);
5084 retval = -ESRCH;
5085 if (!p)
5086 goto out_unlock;
5087
5088 retval = security_task_getscheduler(p);
5089 if (retval)
5090 goto out_unlock;
5091
5092 if (task_has_rt_policy(p))
5093 lp.sched_priority = p->rt_priority;
5094 rcu_read_unlock();
5095
5096
5097
5098
5099 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5100
5101 return retval;
5102
5103out_unlock:
5104 rcu_read_unlock();
5105 return retval;
5106}
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116static int
5117sched_attr_copy_to_user(struct sched_attr __user *uattr,
5118 struct sched_attr *kattr,
5119 unsigned int usize)
5120{
5121 unsigned int ksize = sizeof(*kattr);
5122
5123 if (!access_ok(uattr, usize))
5124 return -EFAULT;
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139 kattr->size = min(usize, ksize);
5140
5141 if (copy_to_user(uattr, kattr, kattr->size))
5142 return -EFAULT;
5143
5144 return 0;
5145}
5146
5147
5148
5149
5150
5151
5152
5153
5154SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
5155 unsigned int, usize, unsigned int, flags)
5156{
5157 struct sched_attr kattr = { };
5158 struct task_struct *p;
5159 int retval;
5160
5161 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
5162 usize < SCHED_ATTR_SIZE_VER0 || flags)
5163 return -EINVAL;
5164
5165 rcu_read_lock();
5166 p = find_process_by_pid(pid);
5167 retval = -ESRCH;
5168 if (!p)
5169 goto out_unlock;
5170
5171 retval = security_task_getscheduler(p);
5172 if (retval)
5173 goto out_unlock;
5174
5175 kattr.sched_policy = p->policy;
5176 if (p->sched_reset_on_fork)
5177 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5178 if (task_has_dl_policy(p))
5179 __getparam_dl(p, &kattr);
5180 else if (task_has_rt_policy(p))
5181 kattr.sched_priority = p->rt_priority;
5182 else
5183 kattr.sched_nice = task_nice(p);
5184
5185#ifdef CONFIG_UCLAMP_TASK
5186 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5187 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5188#endif
5189
5190 rcu_read_unlock();
5191
5192 return sched_attr_copy_to_user(uattr, &kattr, usize);
5193
5194out_unlock:
5195 rcu_read_unlock();
5196 return retval;
5197}
5198
5199long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5200{
5201 cpumask_var_t cpus_allowed, new_mask;
5202 struct task_struct *p;
5203 int retval;
5204
5205 rcu_read_lock();
5206
5207 p = find_process_by_pid(pid);
5208 if (!p) {
5209 rcu_read_unlock();
5210 return -ESRCH;
5211 }
5212
5213
5214 get_task_struct(p);
5215 rcu_read_unlock();
5216
5217 if (p->flags & PF_NO_SETAFFINITY) {
5218 retval = -EINVAL;
5219 goto out_put_task;
5220 }
5221 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5222 retval = -ENOMEM;
5223 goto out_put_task;
5224 }
5225 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5226 retval = -ENOMEM;
5227 goto out_free_cpus_allowed;
5228 }
5229 retval = -EPERM;
5230 if (!check_same_owner(p)) {
5231 rcu_read_lock();
5232 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
5233 rcu_read_unlock();
5234 goto out_free_new_mask;
5235 }
5236 rcu_read_unlock();
5237 }
5238
5239 retval = security_task_setscheduler(p);
5240 if (retval)
5241 goto out_free_new_mask;
5242
5243
5244 cpuset_cpus_allowed(p, cpus_allowed);
5245 cpumask_and(new_mask, in_mask, cpus_allowed);
5246
5247
5248
5249
5250
5251
5252
5253#ifdef CONFIG_SMP
5254 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
5255 rcu_read_lock();
5256 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
5257 retval = -EBUSY;
5258 rcu_read_unlock();
5259 goto out_free_new_mask;
5260 }
5261 rcu_read_unlock();
5262 }
5263#endif
5264again:
5265 retval = __set_cpus_allowed_ptr(p, new_mask, true);
5266
5267 if (!retval) {
5268 cpuset_cpus_allowed(p, cpus_allowed);
5269 if (!cpumask_subset(new_mask, cpus_allowed)) {
5270
5271
5272
5273
5274
5275 cpumask_copy(new_mask, cpus_allowed);
5276 goto again;
5277 }
5278 }
5279out_free_new_mask:
5280 free_cpumask_var(new_mask);
5281out_free_cpus_allowed:
5282 free_cpumask_var(cpus_allowed);
5283out_put_task:
5284 put_task_struct(p);
5285 return retval;
5286}
5287
5288static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5289 struct cpumask *new_mask)
5290{
5291 if (len < cpumask_size())
5292 cpumask_clear(new_mask);
5293 else if (len > cpumask_size())
5294 len = cpumask_size();
5295
5296 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5297}
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5308 unsigned long __user *, user_mask_ptr)
5309{
5310 cpumask_var_t new_mask;
5311 int retval;
5312
5313 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5314 return -ENOMEM;
5315
5316 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5317 if (retval == 0)
5318 retval = sched_setaffinity(pid, new_mask);
5319 free_cpumask_var(new_mask);
5320 return retval;
5321}
5322
5323long sched_getaffinity(pid_t pid, struct cpumask *mask)
5324{
5325 struct task_struct *p;
5326 unsigned long flags;
5327 int retval;
5328
5329 rcu_read_lock();
5330
5331 retval = -ESRCH;
5332 p = find_process_by_pid(pid);
5333 if (!p)
5334 goto out_unlock;
5335
5336 retval = security_task_getscheduler(p);
5337 if (retval)
5338 goto out_unlock;
5339
5340 raw_spin_lock_irqsave(&p->pi_lock, flags);
5341 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
5342 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5343
5344out_unlock:
5345 rcu_read_unlock();
5346
5347 return retval;
5348}
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5360 unsigned long __user *, user_mask_ptr)
5361{
5362 int ret;
5363 cpumask_var_t mask;
5364
5365 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5366 return -EINVAL;
5367 if (len & (sizeof(unsigned long)-1))
5368 return -EINVAL;
5369
5370 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5371 return -ENOMEM;
5372
5373 ret = sched_getaffinity(pid, mask);
5374 if (ret == 0) {
5375 unsigned int retlen = min(len, cpumask_size());
5376
5377 if (copy_to_user(user_mask_ptr, mask, retlen))
5378 ret = -EFAULT;
5379 else
5380 ret = retlen;
5381 }
5382 free_cpumask_var(mask);
5383
5384 return ret;
5385}
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395static void do_sched_yield(void)
5396{
5397 struct rq_flags rf;
5398 struct rq *rq;
5399
5400 rq = this_rq_lock_irq(&rf);
5401
5402 schedstat_inc(rq->yld_count);
5403 current->sched_class->yield_task(rq);
5404
5405
5406
5407
5408
5409 preempt_disable();
5410 rq_unlock(rq, &rf);
5411 sched_preempt_enable_no_resched();
5412
5413 schedule();
5414}
5415
5416SYSCALL_DEFINE0(sched_yield)
5417{
5418 do_sched_yield();
5419 return 0;
5420}
5421
5422#ifndef CONFIG_PREEMPT
5423int __sched _cond_resched(void)
5424{
5425 if (should_resched(0)) {
5426 preempt_schedule_common();
5427 return 1;
5428 }
5429 rcu_all_qs();
5430 return 0;
5431}
5432EXPORT_SYMBOL(_cond_resched);
5433#endif
5434
/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 */
5443int __cond_resched_lock(spinlock_t *lock)
5444{
5445 int resched = should_resched(PREEMPT_LOCK_OFFSET);
5446 int ret = 0;
5447
5448 lockdep_assert_held(lock);
5449
5450 if (spin_needbreak(lock) || resched) {
5451 spin_unlock(lock);
5452 if (resched)
5453 preempt_schedule_common();
5454 else
5455 cpu_relax();
5456 ret = 1;
5457 spin_lock(lock);
5458 }
5459 return ret;
5460}
5461EXPORT_SYMBOL(__cond_resched_lock);
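
/*
 * Typical use of the helper above, via the cond_resched_lock() wrapper.
 * A sketch with made-up names (mylock, mylist, heavy_work); any long
 * lock-holding loop fits this shape:
 *
 *	spin_lock(&mylock);
 *	list_for_each_entry(pos, &mylist, node) {
 *		heavy_work(pos);
 *		cond_resched_lock(&mylock);	// may drop and re-take mylock
 *	}
 *	spin_unlock(&mylock);
 *
 * Callers must not rely on state that another CPU could change while the
 * lock is dropped inside cond_resched_lock().
 */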
5462
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * recently woken up task (ie. it may run again immediately), so using
 * yield() as a progress guarantee is broken. Typical broken usage is:
 *
 *	while (!event)
 *		yield();
 *
 * where one assumes that yield() will let 'the other' process run and make
 * event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
5485void __sched yield(void)
5486{
5487 set_current_state(TASK_RUNNING);
5488 do_sched_yield();
5489}
5490EXPORT_SYMBOL(yield);
5491
/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */
5507int __sched yield_to(struct task_struct *p, bool preempt)
5508{
5509 struct task_struct *curr = current;
5510 struct rq *rq, *p_rq;
5511 unsigned long flags;
5512 int yielded = 0;
5513
5514 local_irq_save(flags);
5515 rq = this_rq();
5516
5517again:
5518 p_rq = task_rq(p);
5519
5520
5521
5522
5523 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5524 yielded = -ESRCH;
5525 goto out_irq;
5526 }
5527
5528 double_rq_lock(rq, p_rq);
5529 if (task_rq(p) != p_rq) {
5530 double_rq_unlock(rq, p_rq);
5531 goto again;
5532 }
5533
5534 if (!curr->sched_class->yield_to_task)
5535 goto out_unlock;
5536
5537 if (curr->sched_class != p->sched_class)
5538 goto out_unlock;
5539
5540 if (task_running(p_rq, p) || p->state)
5541 goto out_unlock;
5542
5543 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5544 if (yielded) {
5545 schedstat_inc(rq->yld_count);
5546
5547
5548
5549
5550 if (preempt && rq != p_rq)
5551 resched_curr(p_rq);
5552 }
5553
5554out_unlock:
5555 double_rq_unlock(rq, p_rq);
5556out_irq:
5557 local_irq_restore(flags);
5558
5559 if (yielded > 0)
5560 schedule();
5561
5562 return yielded;
5563}
5564EXPORT_SYMBOL_GPL(yield_to);
5565
5566int io_schedule_prepare(void)
5567{
5568 int old_iowait = current->in_iowait;
5569
5570 current->in_iowait = 1;
5571 blk_schedule_flush_plug(current);
5572
5573 return old_iowait;
5574}
5575
5576void io_schedule_finish(int token)
5577{
5578 current->in_iowait = token;
5579}
5580
5581
5582
5583
5584
5585long __sched io_schedule_timeout(long timeout)
5586{
5587 int token;
5588 long ret;
5589
5590 token = io_schedule_prepare();
5591 ret = schedule_timeout(timeout);
5592 io_schedule_finish(token);
5593
5594 return ret;
5595}
5596EXPORT_SYMBOL(io_schedule_timeout);
5597
5598void __sched io_schedule(void)
5599{
5600 int token;
5601
5602 token = io_schedule_prepare();
5603 schedule();
5604 io_schedule_finish(token);
5605}
5606EXPORT_SYMBOL(io_schedule);
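
/*
 * The prepare/finish pair above lets sleeping primitives mark arbitrary
 * blocking as IO-wait. A minimal sketch (this is essentially what
 * mutex_lock_io() does; my_mutex is a made-up name):
 *
 *	int token = io_schedule_prepare();	// sets current->in_iowait
 *	mutex_lock(&my_mutex);			// may sleep in schedule()
 *	io_schedule_finish(token);		// restores the old in_iowait
 *
 * Nesting works because the previous value, not a boolean flag, is handed
 * back as the token.
 */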
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5617{
5618 int ret = -EINVAL;
5619
5620 switch (policy) {
5621 case SCHED_FIFO:
5622 case SCHED_RR:
5623 ret = MAX_USER_RT_PRIO-1;
5624 break;
5625 case SCHED_DEADLINE:
5626 case SCHED_NORMAL:
5627 case SCHED_BATCH:
5628 case SCHED_IDLE:
5629 ret = 0;
5630 break;
5631 }
5632 return ret;
5633}
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5644{
5645 int ret = -EINVAL;
5646
5647 switch (policy) {
5648 case SCHED_FIFO:
5649 case SCHED_RR:
5650 ret = 1;
5651 break;
5652 case SCHED_DEADLINE:
5653 case SCHED_NORMAL:
5654 case SCHED_BATCH:
5655 case SCHED_IDLE:
5656 ret = 0;
5657 }
5658 return ret;
5659}
5660
5661static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
5662{
5663 struct task_struct *p;
5664 unsigned int time_slice;
5665 struct rq_flags rf;
5666 struct rq *rq;
5667 int retval;
5668
5669 if (pid < 0)
5670 return -EINVAL;
5671
5672 retval = -ESRCH;
5673 rcu_read_lock();
5674 p = find_process_by_pid(pid);
5675 if (!p)
5676 goto out_unlock;
5677
5678 retval = security_task_getscheduler(p);
5679 if (retval)
5680 goto out_unlock;
5681
5682 rq = task_rq_lock(p, &rf);
5683 time_slice = 0;
5684 if (p->sched_class->get_rr_interval)
5685 time_slice = p->sched_class->get_rr_interval(rq, p);
5686 task_rq_unlock(rq, p, &rf);
5687
5688 rcu_read_unlock();
5689 jiffies_to_timespec64(time_slice, t);
5690 return 0;
5691
5692out_unlock:
5693 rcu_read_unlock();
5694 return retval;
5695}
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5709 struct __kernel_timespec __user *, interval)
5710{
5711 struct timespec64 t;
5712 int retval = sched_rr_get_interval(pid, &t);
5713
5714 if (retval == 0)
5715 retval = put_timespec64(&t, interval);
5716
5717 return retval;
5718}
5719
5720#ifdef CONFIG_COMPAT_32BIT_TIME
5721SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
5722 struct old_timespec32 __user *, interval)
5723{
5724 struct timespec64 t;
5725 int retval = sched_rr_get_interval(pid, &t);
5726
5727 if (retval == 0)
5728 retval = put_old_timespec32(&t, interval);
5729 return retval;
5730}
5731#endif
5732
5733void sched_show_task(struct task_struct *p)
5734{
5735 unsigned long free = 0;
5736 int ppid;
5737
5738 if (!try_get_task_stack(p))
5739 return;
5740
5741 printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
5742
5743 if (p->state == TASK_RUNNING)
5744 printk(KERN_CONT " running task ");
5745#ifdef CONFIG_DEBUG_STACK_USAGE
5746 free = stack_not_used(p);
5747#endif
5748 ppid = 0;
5749 rcu_read_lock();
5750 if (pid_alive(p))
5751 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5752 rcu_read_unlock();
5753 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5754 task_pid_nr(p), ppid,
5755 (unsigned long)task_thread_info(p)->flags);
5756
5757 print_worker_info(KERN_INFO, p);
5758 show_stack(p, NULL);
5759 put_task_stack(p);
5760}
5761EXPORT_SYMBOL_GPL(sched_show_task);
5762
5763static inline bool
5764state_filter_match(unsigned long state_filter, struct task_struct *p)
5765{
5766
5767 if (!state_filter)
5768 return true;
5769
5770
5771 if (!(p->state & state_filter))
5772 return false;
5773
5774
5775
5776
5777
5778 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
5779 return false;
5780
5781 return true;
5782}
5783
5784
5785void show_state_filter(unsigned long state_filter)
5786{
5787 struct task_struct *g, *p;
5788
#if BITS_PER_LONG == 32
	printk(KERN_INFO
		"  task                PC stack   pid father\n");
#else
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
#endif
5796 rcu_read_lock();
5797 for_each_process_thread(g, p) {
5798
5799
5800
5801
5802
5803
5804
5805 touch_nmi_watchdog();
5806 touch_all_softlockup_watchdogs();
5807 if (state_filter_match(state_filter, p))
5808 sched_show_task(p);
5809 }
5810
5811#ifdef CONFIG_SCHED_DEBUG
5812 if (!state_filter)
5813 sysrq_sched_debug_show();
5814#endif
5815 rcu_read_unlock();
5816
5817
5818
5819 if (!state_filter)
5820 debug_show_all_locks();
5821}
5822
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
5831void init_idle(struct task_struct *idle, int cpu)
5832{
5833 struct rq *rq = cpu_rq(cpu);
5834 unsigned long flags;
5835
5836 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5837 raw_spin_lock(&rq->lock);
5838
5839 __sched_fork(0, idle);
5840 idle->state = TASK_RUNNING;
5841 idle->se.exec_start = sched_clock();
5842 idle->flags |= PF_IDLE;
5843
5844 kasan_unpoison_task_stack(idle);
5845
5846#ifdef CONFIG_SMP
5847
5848
5849
5850
5851
5852
5853 set_cpus_allowed_common(idle, cpumask_of(cpu));
5854#endif
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865 rcu_read_lock();
5866 __set_task_cpu(idle, cpu);
5867 rcu_read_unlock();
5868
5869 rq->curr = rq->idle = idle;
5870 idle->on_rq = TASK_ON_RQ_QUEUED;
5871#ifdef CONFIG_SMP
5872 idle->on_cpu = 1;
5873#endif
5874 raw_spin_unlock(&rq->lock);
5875 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5876
5877
5878 init_idle_preempt_count(idle, cpu);
5879
5880
5881
5882
5883 idle->sched_class = &idle_sched_class;
5884 ftrace_graph_init_idle_task(idle, cpu);
5885 vtime_init_idle(idle, cpu);
5886#ifdef CONFIG_SMP
5887 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5888#endif
5889}
5890
5891#ifdef CONFIG_SMP
5892
5893int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5894 const struct cpumask *trial)
5895{
5896 int ret = 1;
5897
5898 if (!cpumask_weight(cur))
5899 return ret;
5900
5901 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
5902
5903 return ret;
5904}
5905
5906int task_can_attach(struct task_struct *p,
5907 const struct cpumask *cs_cpus_allowed)
5908{
5909 int ret = 0;
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920 if (p->flags & PF_NO_SETAFFINITY) {
5921 ret = -EINVAL;
5922 goto out;
5923 }
5924
5925 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5926 cs_cpus_allowed))
5927 ret = dl_task_can_attach(p, cs_cpus_allowed);
5928
5929out:
5930 return ret;
5931}
5932
5933bool sched_smp_initialized __read_mostly;
5934
5935#ifdef CONFIG_NUMA_BALANCING
5936
5937int migrate_task_to(struct task_struct *p, int target_cpu)
5938{
5939 struct migration_arg arg = { p, target_cpu };
5940 int curr_cpu = task_cpu(p);
5941
5942 if (curr_cpu == target_cpu)
5943 return 0;
5944
5945 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
5946 return -EINVAL;
5947
5948
5949
5950 trace_sched_move_numa(p, curr_cpu, target_cpu);
5951 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5952}
5953
5954
5955
5956
5957
5958void sched_setnuma(struct task_struct *p, int nid)
5959{
5960 bool queued, running;
5961 struct rq_flags rf;
5962 struct rq *rq;
5963
5964 rq = task_rq_lock(p, &rf);
5965 queued = task_on_rq_queued(p);
5966 running = task_current(rq, p);
5967
5968 if (queued)
5969 dequeue_task(rq, p, DEQUEUE_SAVE);
5970 if (running)
5971 put_prev_task(rq, p);
5972
5973 p->numa_preferred_nid = nid;
5974
5975 if (queued)
5976 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5977 if (running)
5978 set_curr_task(rq, p);
5979 task_rq_unlock(rq, p, &rf);
5980}
5981#endif
5982
5983#ifdef CONFIG_HOTPLUG_CPU
5984
5985
5986
5987
5988void idle_task_exit(void)
5989{
5990 struct mm_struct *mm = current->active_mm;
5991
5992 BUG_ON(cpu_online(smp_processor_id()));
5993
5994 if (mm != &init_mm) {
5995 switch_mm(mm, &init_mm, current);
5996 current->active_mm = &init_mm;
5997 finish_arch_post_lock_switch();
5998 }
5999 mmdrop(mm);
6000}
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011static void calc_load_migrate(struct rq *rq)
6012{
6013 long delta = calc_load_fold_active(rq, 1);
6014 if (delta)
6015 atomic_long_add(delta, &calc_load_tasks);
6016}
6017
6018static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6019{
6020}
6021
6022static const struct sched_class fake_sched_class = {
6023 .put_prev_task = put_prev_task_fake,
6024};
6025
6026static struct task_struct fake_task = {
6027
6028
6029
6030 .prio = MAX_PRIO + 1,
6031 .sched_class = &fake_sched_class,
6032};
6033
/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
 */
6042static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
6043{
6044 struct rq *rq = dead_rq;
6045 struct task_struct *next, *stop = rq->stop;
6046 struct rq_flags orf = *rf;
6047 int dest_cpu;
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058 rq->stop = NULL;
6059
6060
6061
6062
6063
6064
6065 update_rq_clock(rq);
6066
6067 for (;;) {
6068
6069
6070
6071
6072 if (rq->nr_running == 1)
6073 break;
6074
6075
6076
6077
6078 next = pick_next_task(rq, &fake_task, rf);
6079 BUG_ON(!next);
6080 put_prev_task(rq, next);
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091 rq_unlock(rq, rf);
6092 raw_spin_lock(&next->pi_lock);
6093 rq_relock(rq, rf);
6094
6095
6096
6097
6098
6099
6100 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
6101 raw_spin_unlock(&next->pi_lock);
6102 continue;
6103 }
6104
6105
6106 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
6107 rq = __migrate_task(rq, rf, next, dest_cpu);
6108 if (rq != dead_rq) {
6109 rq_unlock(rq, rf);
6110 rq = dead_rq;
6111 *rf = orf;
6112 rq_relock(rq, rf);
6113 }
6114 raw_spin_unlock(&next->pi_lock);
6115 }
6116
6117 rq->stop = stop;
6118}
6119#endif
6120
6121void set_rq_online(struct rq *rq)
6122{
6123 if (!rq->online) {
6124 const struct sched_class *class;
6125
6126 cpumask_set_cpu(rq->cpu, rq->rd->online);
6127 rq->online = 1;
6128
6129 for_each_class(class) {
6130 if (class->rq_online)
6131 class->rq_online(rq);
6132 }
6133 }
6134}
6135
6136void set_rq_offline(struct rq *rq)
6137{
6138 if (rq->online) {
6139 const struct sched_class *class;
6140
6141 for_each_class(class) {
6142 if (class->rq_offline)
6143 class->rq_offline(rq);
6144 }
6145
6146 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6147 rq->online = 0;
6148 }
6149}
6150
6151
6152
6153
6154static int num_cpus_frozen;
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164static void cpuset_cpu_active(void)
6165{
6166 if (cpuhp_tasks_frozen) {
6167
6168
6169
6170
6171
6172
6173 partition_sched_domains(1, NULL, NULL);
6174 if (--num_cpus_frozen)
6175 return;
6176
6177
6178
6179
6180
6181 cpuset_force_rebuild();
6182 }
6183 cpuset_update_active_cpus();
6184}
6185
6186static int cpuset_cpu_inactive(unsigned int cpu)
6187{
6188 if (!cpuhp_tasks_frozen) {
6189 if (dl_cpu_busy(cpu))
6190 return -EBUSY;
6191 cpuset_update_active_cpus();
6192 } else {
6193 num_cpus_frozen++;
6194 partition_sched_domains(1, NULL, NULL);
6195 }
6196 return 0;
6197}
6198
6199int sched_cpu_activate(unsigned int cpu)
6200{
6201 struct rq *rq = cpu_rq(cpu);
6202 struct rq_flags rf;
6203
6204#ifdef CONFIG_SCHED_SMT
6205
6206
6207
	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
		static_branch_inc_cpuslocked(&sched_smt_present);
#endif
	set_cpu_active(cpu, true);

	if (sched_smp_initialized) {
		sched_domains_numa_masks_set(cpu);
		cpuset_cpu_active();
	}

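	/*
	 * Put the rq online, if not already. This happens:
	 *
	 * 1) In the early boot process, because we build the real domains
	 *    after all CPUs have been brought up.
	 *
	 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
	 *    domains.
	 */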
	rq_lock_irqsave(rq, &rf);
	if (rq->rd) {
		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
		set_rq_online(rq);
	}
	rq_unlock_irqrestore(rq, &rf);

	update_max_interval();

	return 0;
}

int sched_cpu_deactivate(unsigned int cpu)
{
	int ret;

	set_cpu_active(cpu, false);

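	/*
	 * We've cleared cpu_active_mask; wait for all preempt-disabled and
	 * RCU users of this state to go away, such that all new such users
	 * will observe it.
	 */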
	synchronize_rcu();

#ifdef CONFIG_SCHED_SMT
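	/*
	 * When going down, decrement the number of cores with SMT present.
	 */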
	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
		static_branch_dec_cpuslocked(&sched_smt_present);
#endif

	if (!sched_smp_initialized)
		return 0;

	ret = cpuset_cpu_inactive(cpu);
	if (ret) {
		set_cpu_active(cpu, true);
		return ret;
	}
	sched_domains_numa_masks_clear(cpu);
	return 0;
}

static void sched_rq_cpu_starting(unsigned int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	rq->calc_load_update = calc_load_update;
	update_max_interval();
}

int sched_cpu_starting(unsigned int cpu)
{
	sched_rq_cpu_starting(cpu);
	sched_tick_start(cpu);
	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU
int sched_cpu_dying(unsigned int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

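	/* Handle pending wakeups and then migrate everything off */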
	sched_ttwu_pending();
	sched_tick_stop(cpu);

	rq_lock_irqsave(rq, &rf);
	if (rq->rd) {
		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
		set_rq_offline(rq);
	}
	migrate_tasks(rq, &rf);
	BUG_ON(rq->nr_running != 1);
	rq_unlock_irqrestore(rq, &rf);

	calc_load_migrate(rq);
	update_max_interval();
	nohz_balance_exit_idle(rq);
	hrtick_clear(rq);
	return 0;
}
#endif

void __init sched_init_smp(void)
{
	sched_init_numa();

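	/*
	 * There's no userspace yet to cause hotplug operations; hence all the
	 * CPU masks are stable and all blatant races in the below code cannot
	 * happen.
	 */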
	mutex_lock(&sched_domains_mutex);
	sched_init_domains(cpu_active_mask);
	mutex_unlock(&sched_domains_mutex);

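	/* Move init over to a non-isolated CPU */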
	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
		BUG();
	sched_init_granularity();

	init_sched_rt_class();
	init_sched_dl_class();

	sched_smp_initialized = true;
}

static int __init migration_init(void)
{
	sched_cpu_starting(smp_processor_id());
	return 0;
}
early_initcall(migration_init);

#else
void __init sched_init_smp(void)
{
	sched_init_granularity();
}
#endif

int in_sched_functions(unsigned long addr)
{
	return in_lock_functions(addr) ||
		(addr >= (unsigned long)__sched_text_start
		&& addr < (unsigned long)__sched_text_end);
}

#ifdef CONFIG_CGROUP_SCHED
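/*
 * Default task group.
 * Every task in the system belongs to this group at bootup.
 */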
struct task_group root_task_group;
LIST_HEAD(task_groups);

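/* Cacheline-aligned slab cache for task_group */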
static struct kmem_cache *task_group_cache __read_mostly;
#endif

DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);

void __init sched_init(void)
{
	unsigned long alloc_size = 0, ptr;
	int i;

	wait_bit_init();

#ifdef CONFIG_FAIR_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
	if (alloc_size) {
		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#endif
#ifdef CONFIG_RT_GROUP_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#endif
	}
#ifdef CONFIG_CPUMASK_OFFSTACK
	for_each_possible_cpu(i) {
		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
	}
#endif

	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
	init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#endif

#ifdef CONFIG_CGROUP_SCHED
	task_group_cache = KMEM_CACHE(task_group, 0);

	list_add(&root_task_group.list, &task_groups);
	INIT_LIST_HEAD(&root_task_group.children);
	INIT_LIST_HEAD(&root_task_group.siblings);
	autogroup_init(&init_task);
#endif

	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		raw_spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs);
		init_rt_rq(&rq->rt);
		init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
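		/*
		 * How much CPU bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed through the cgroup filesystem,
		 * it gets 100% of the CPU resources in the system. This
		 * overall system CPU resource is divided among the tasks of
		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task or task-group's) weight
		 * (se->load.weight).
		 *
		 * In other words, if root_task_group has 10 tasks (of weight
		 * 1024 each) and two child groups A0 and A1 (of weight 1024
		 * each), then A0's share of the CPU resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */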
		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif

		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
		rq->balance_callback = NULL;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->idle_stamp = 0;
		rq->avg_idle = 2*sysctl_sched_migration_cost;
		rq->max_idle_balance_cost = sysctl_sched_migration_cost;

		INIT_LIST_HEAD(&rq->cfs_tasks);

		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
		rq->last_load_update_tick = jiffies;
		rq->last_blocked_load_update_tick = jiffies;
		atomic_set(&rq->nohz_flags, 0);
#endif
#endif
		hrtick_rq_init(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

	set_load_weight(&init_task, false);

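	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */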
	mmgrab(&init_mm);
	enter_lazy_tlb(&init_mm, current);

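	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */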
	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

#ifdef CONFIG_SMP
	idle_thread_set_boot_cpu();
#endif
	init_sched_fair_class();

	init_schedstats();

	psi_init();

	init_uclamp();

	scheduler_running = 1;
}

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
	int nested = preempt_count() + rcu_preempt_depth();

	return (nested == preempt_offset);
}

void __might_sleep(const char *file, int line, int preempt_offset)
{
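	/*
	 * Blocking primitives will set (and therefore destroy) current->state,
	 * since we will exit with TASK_RUNNING make sure we enter with it,
	 * otherwise we will destroy state.
	 */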
	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
			"do not call blocking ops when !TASK_RUNNING; "
			"state=%lx set at [<%p>] %pS\n",
			current->state,
			(void *)current->task_state_change,
			(void *)current->task_state_change);

	___might_sleep(file, line, preempt_offset);
}
EXPORT_SYMBOL(__might_sleep);

void ___might_sleep(const char *file, int line, int preempt_offset)
{
	/* Ratelimiting timestamp: */
	static unsigned long prev_jiffy;

	unsigned long preempt_disable_ip;

	/* WARN_ON_ONCE() by default, no rate limit required: */
	rcu_sleep_check();

	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
	     !is_idle_task(current)) ||
	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
	    oops_in_progress)
		return;

	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	/* Save this before calling printk(), since that will clobber it: */
	preempt_disable_ip = get_preempt_disable_ip(current);

	printk(KERN_ERR
		"BUG: sleeping function called from invalid context at %s:%d\n",
			file, line);
	printk(KERN_ERR
		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(),
			current->pid, current->comm);

	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
	    && !preempt_count_equals(preempt_offset)) {
		pr_err("Preemption disabled at:");
		print_ip_sym(preempt_disable_ip);
		pr_cont("\n");
	}
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);

void __cant_sleep(const char *file, int line, int preempt_offset)
{
	static unsigned long prev_jiffy;

	if (irqs_disabled())
		return;

	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
		return;

	if (preempt_count() > preempt_offset)
		return;

	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(),
			current->pid, current->comm);

	debug_show_held_locks(current);
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_sleep);
#endif

#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
	struct task_struct *g, *p;
	struct sched_attr attr = {
		.sched_policy = SCHED_NORMAL,
	};

	read_lock(&tasklist_lock);
	for_each_process_thread(g, p) {
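		/*
		 * Only normalize user tasks:
		 */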
		if (p->flags & PF_KTHREAD)
			continue;

		p->se.exec_start = 0;
		schedstat_set(p->se.statistics.wait_start, 0);
		schedstat_set(p->se.statistics.sleep_start, 0);
		schedstat_set(p->se.statistics.block_start, 0);

		if (!dl_task(p) && !rt_task(p)) {
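			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */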
			if (task_nice(p) < 0)
				set_user_nice(p, 0);
			continue;
		}

		__sched_setscheduler(p, &attr, false, false);
	}
	read_unlock(&tasklist_lock);
}

#endif

#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
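/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */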
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}

#endif

#ifdef CONFIG_IA64
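/**
 * ia64_set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a CPU in a non-blocking manner. It must be
 * called with all CPUs synchronized and interrupts disabled; the caller must
 * save the original value of the current task (see curr_task() above) and
 * restore that value before re-enabling interrupts and restarting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */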
void ia64_set_curr_task(int cpu, struct task_struct *p)
{
	cpu_curr(cpu) = p;
}

#endif

#ifdef CONFIG_CGROUP_SCHED
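/* task_group_lock serializes the addition/removal of task groups */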
static DEFINE_SPINLOCK(task_group_lock);

static void sched_free_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	autogroup_free(tg);
	kmem_cache_free(task_group_cache, tg);
}

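/* Allocate runqueue etc for a new task group */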
struct task_group *sched_create_group(struct task_group *parent)
{
	struct task_group *tg;

	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
	if (!tg)
		return ERR_PTR(-ENOMEM);

	if (!alloc_fair_sched_group(tg, parent))
		goto err;

	if (!alloc_rt_sched_group(tg, parent))
		goto err;

	return tg;

err:
	sched_free_group(tg);
	return ERR_PTR(-ENOMEM);
}

void sched_online_group(struct task_group *tg, struct task_group *parent)
{
	unsigned long flags;

	spin_lock_irqsave(&task_group_lock, flags);
	list_add_rcu(&tg->list, &task_groups);

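	/* Root should already exist: */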
	WARN_ON(!parent);

	tg->parent = parent;
	INIT_LIST_HEAD(&tg->children);
	list_add_rcu(&tg->siblings, &parent->children);
	spin_unlock_irqrestore(&task_group_lock, flags);

	online_fair_sched_group(tg);
}

/* RCU callback to free various structures associated with a task group */
static void sched_free_group_rcu(struct rcu_head *rhp)
{
	/* Now it should be safe to free those cfs_rqs: */
	sched_free_group(container_of(rhp, struct task_group, rcu));
}

void sched_destroy_group(struct task_group *tg)
{
	/* Wait for possible concurrent references to cfs_rqs to complete: */
	call_rcu(&tg->rcu, sched_free_group_rcu);
}

void sched_offline_group(struct task_group *tg)
{
	unsigned long flags;

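	/* End participation in shares distribution: */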
	unregister_fair_sched_group(tg);

	spin_lock_irqsave(&task_group_lock, flags);
	list_del_rcu(&tg->list);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);
}

static void sched_change_group(struct task_struct *tsk, int type)
{
	struct task_group *tg;

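	/*
	 * All callers are synchronized by task_rq_lock(); we do not use RCU
	 * which is pointless here. Thus, we pass "true" to task_css_check()
	 * to prevent lockdep warnings.
	 */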
	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
			  struct task_group, css);
	tg = autogroup_task_group(tsk, tg);
	tsk->sched_task_group = tg;

#ifdef CONFIG_FAIR_GROUP_SCHED
	if (tsk->sched_class->task_change_group)
		tsk->sched_class->task_change_group(tsk, type);
	else
#endif
		set_task_rq(tsk, task_cpu(tsk));
}

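/*
 * Change a task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now. This function just updates tsk->sched_task_group and re-enqueues the
 * task on its new group's runqueue, preserving its queued/running state.
 */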
void sched_move_task(struct task_struct *tsk)
{
	int queued, running, queue_flags =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(tsk, &rf);
	update_rq_clock(rq);

	running = task_current(rq, tsk);
	queued = task_on_rq_queued(tsk);

	if (queued)
		dequeue_task(rq, tsk, queue_flags);
	if (running)
		put_prev_task(rq, tsk);

	sched_change_group(tsk, TASK_MOVE_GROUP);

	if (queued)
		enqueue_task(rq, tsk, queue_flags);
	if (running)
		set_curr_task(rq, tsk);

	task_rq_unlock(rq, tsk, &rf);
}

static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct task_group, css) : NULL;
}

static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct task_group *parent = css_tg(parent_css);
	struct task_group *tg;

	if (!parent) {
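		/* This is early initialization for the top cgroup */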
		return &root_task_group.css;
	}

	tg = sched_create_group(parent);
	if (IS_ERR(tg))
		return ERR_PTR(-ENOMEM);

	return &tg->css;
}

static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
	struct task_group *tg = css_tg(css);
	struct task_group *parent = css_tg(css->parent);

	if (parent)
		sched_online_group(tg, parent);
	return 0;
}

static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
	struct task_group *tg = css_tg(css);

	sched_offline_group(tg);
}

static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct task_group *tg = css_tg(css);

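	/*
	 * Relies on the RCU grace period between css_released() and this.
	 */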
	sched_free_group(tg);
}

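/*
 * This is called before wake_up_new_task(), therefore we really only
 * have to set its group bits, all the other stuff does not apply.
 */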
static void cpu_cgroup_fork(struct task_struct *task)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(task, &rf);

	update_rq_clock(rq);
	sched_change_group(task, TASK_SET_GROUP);

	task_rq_unlock(rq, task, &rf);
}

static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	int ret = 0;

	cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
		if (!sched_rt_can_attach(css_tg(css), task))
			return -EINVAL;
#else
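		/* We don't support RT-tasks being in separate groups */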
		if (task->sched_class != &fair_sched_class)
			return -EINVAL;
#endif
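		/*
		 * Serialize against wake_up_new_task() such that if it's
		 * running, we're sure to observe its full state.
		 */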
		raw_spin_lock_irq(&task->pi_lock);
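		/*
		 * Avoid calling sched_move_task() before wake_up_new_task()
		 * has happened. This would lead to problems with PELT, due to
		 * move wanting to detach+attach while we're not attached yet.
		 */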
		if (task->state == TASK_NEW)
			ret = -EINVAL;
		raw_spin_unlock_irq(&task->pi_lock);

		if (ret)
			break;
	}
	return ret;
}

static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		sched_move_task(task);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
	if (shareval > scale_load_down(ULONG_MAX))
		shareval = MAX_SHARES;
	return sched_group_set_shares(css_tg(css), scale_load(shareval));
}

static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return (u64) scale_load_down(tg->shares);
}

#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
	int i, ret = 0, runtime_enabled, runtime_was_enabled;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	if (tg == &root_task_group)
		return -EINVAL;

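	/*
	 * Ensure we have at least some amount of bandwidth every period.
	 * This is to prevent reaching a state of large arrears when
	 * throttled via entity_tick(), resulting in prolonged exit
	 * starvation.
	 */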
	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

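	/*
	 * Likewise, bound things on the other side by preventing insane quota
	 * periods. This also allows us to normalize in computing quota
	 * feasibility.
	 */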
	if (period > max_cfs_quota_period)
		return -EINVAL;

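	/*
	 * Prevent race between setting of cfs_rq->runtime_enabled and
	 * unthrottle_offline_cfs_rqs().
	 */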
	get_online_cpus();
	mutex_lock(&cfs_constraints_mutex);
	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		goto out_unlock;

	runtime_enabled = quota != RUNTIME_INF;
	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;

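	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards.
	 */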
	if (runtime_enabled && !runtime_was_enabled)
		cfs_bandwidth_usage_inc();
	raw_spin_lock_irq(&cfs_b->lock);
	cfs_b->period = ns_to_ktime(period);
	cfs_b->quota = quota;

	__refill_cfs_bandwidth_runtime(cfs_b);

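	/* Restart the period timer (if active) to handle new period expiry: */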
	if (runtime_enabled)
		start_cfs_bandwidth(cfs_b);

	raw_spin_unlock_irq(&cfs_b->lock);

	for_each_online_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
		struct rq *rq = cfs_rq->rq;
		struct rq_flags rf;

		rq_lock_irq(rq, &rf);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
		rq_unlock_irq(rq, &rf);
	}
	if (runtime_was_enabled && !runtime_enabled)
		cfs_bandwidth_usage_dec();
out_unlock:
	mutex_unlock(&cfs_constraints_mutex);
	put_online_cpus();

	return ret;
}

static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period;

	period = ktime_to_ns(tg->cfs_bandwidth.period);
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
	else
		return -EINVAL;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

static long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
		return -1;

	quota_us = tg->cfs_bandwidth.quota;
	do_div(quota_us, NSEC_PER_USEC);

	return quota_us;
}

static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
	u64 quota, period;

	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	period = (u64)cfs_period_us * NSEC_PER_USEC;
	quota = tg->cfs_bandwidth.quota;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

static long tg_get_cfs_period(struct task_group *tg)
{
	u64 cfs_period_us;

	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
	do_div(cfs_period_us, NSEC_PER_USEC);

	return cfs_period_us;
}

static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	return tg_get_cfs_quota(css_tg(css));
}

static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, s64 cfs_quota_us)
{
	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}

static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return tg_get_cfs_period(css_tg(css));
}

static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 cfs_period_us)
{
	return tg_set_cfs_period(css_tg(css), cfs_period_us);
}

struct cfs_schedulable_data {
	struct task_group *tg;
	u64 period, quota;
};

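/*
 * Normalize a group's quota/period pair to a comparable utilization ratio;
 * an infinite quota (no limit) stays RUNTIME_INF.
 */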
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

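	/* Note: these should typically be equivalent */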
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

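		/*
		 * Ensure max(child_quota) <= parent_quota.  On cgroup2,
		 * always take the min.  On legacy hierarchies, only inherit
		 * when no limit is set:
		 */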
		if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
			quota = min(quota, parent_quota);
		} else {
			if (quota == RUNTIME_INF)
				quota = parent_quota;
			else if (parent_quota != RUNTIME_INF && quota > parent_quota)
				return -EINVAL;
		}
	}
	cfs_b->hierarchical_quota = quota;

	return 0;
}

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	if (schedstat_enabled() && tg != &root_task_group) {
		u64 ws = 0;
		int i;

		for_each_possible_cpu(i)
			ws += schedstat_val(tg->se[i]->statistics.wait_sum);

		seq_printf(sf, "wait_sum %llu\n", ws);
	}

	return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	return sched_group_set_rt_runtime(css_tg(css), val);
}

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return sched_group_rt_runtime(css_tg(css));
}

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	return sched_group_set_rt_period(css_tg(css), rt_period_us);
}

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */

static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "cfs_quota_us",
		.read_s64 = cpu_cfs_quota_read_s64,
		.write_s64 = cpu_cfs_quota_write_s64,
	},
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_cfs_period_read_u64,
		.write_u64 = cpu_cfs_period_write_u64,
	},
	{
		.name = "stat",
		.seq_show = cpu_cfs_stat_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
	{ }	/* Terminate */
};

static int cpu_extra_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	{
		struct task_group *tg = css_tg(css);
		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
		u64 throttled_usec;

		throttled_usec = cfs_b->throttled_time;
		do_div(throttled_usec, NSEC_PER_USEC);

		seq_printf(sf, "nr_periods %d\n"
			   "nr_throttled %d\n"
			   "throttled_usec %llu\n",
			   cfs_b->nr_periods, cfs_b->nr_throttled,
			   throttled_usec);
	}
#endif
	return 0;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct task_group *tg = css_tg(css);
	u64 weight = scale_load_down(tg->shares);

	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
}

static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 weight)
{
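	/*
	 * cgroup weight knobs should use the common MIN, DFL and MAX
	 * values which are 1, 100 and 10000 respectively.  While it loses
	 * a bit of range on both ends, it maps pretty well onto the shares
	 * value used by the scheduler and the round-trip conversions preserve
	 * the original value over the entire range.
	 */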
	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
		return -ERANGE;

	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}

static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
				    struct cftype *cft)
{
	unsigned long weight = scale_load_down(css_tg(css)->shares);
	int last_delta = INT_MAX;
	int prio, delta;

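	/* Find the closest nice value to the current weight: */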
	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
		delta = abs(sched_prio_to_weight[prio] - weight);
		if (delta >= last_delta)
			break;
		last_delta = delta;
	}

	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
}

static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft, s64 nice)
{
	unsigned long weight;
	int idx;

	if (nice < MIN_NICE || nice > MAX_NICE)
		return -ERANGE;

	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
	idx = array_index_nospec(idx, 40);
	weight = sched_prio_to_weight[idx];

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}
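
/*
 * Worked example of the round-trip conversions above (added for
 * illustration, not part of the original source): CGROUP_WEIGHT_DFL is 100
 * and the default share is 1024, so writing cpu.weight = 200 stores
 * shares = DIV_ROUND_CLOSEST_ULL(200 * 1024, 100) = 2048, and reading it
 * back yields DIV_ROUND_CLOSEST_ULL(2048 * 100, 1024) = 200 again.
 * Likewise, cpu.weight.nice = -10 looks up sched_prio_to_weight[10] = 9548.
 */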
#endif

static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
						  long period, long quota)
{
	if (quota < 0)
		seq_puts(sf, "max");
	else
		seq_printf(sf, "%ld", quota);

	seq_printf(sf, " %ld\n", period);
}

static int __maybe_unused cpu_period_quota_parse(char *buf,
						 u64 *periodp, u64 *quotap)
{
	char tok[21];	/* U64_MAX */

	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
		return -EINVAL;

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap))
		*quotap *= NSEC_PER_USEC;
	else if (!strcmp(tok, "max"))
		*quotap = RUNTIME_INF;
	else
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_CFS_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));

	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
	return 0;
}

static ssize_t cpu_max_write(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	struct task_group *tg = css_tg(of_css(of));
	u64 period = tg_get_cfs_period(tg);
	u64 quota;
	int ret;

	ret = cpu_period_quota_parse(buf, &period, &quota);
	if (!ret)
		ret = tg_set_cfs_bandwidth(tg, period, quota);
	return ret ?: nbytes;
}
#endif
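
/*
 * Usage example (added for illustration, not part of the original source):
 * on cgroup v2 the "cpu.max" file takes "$QUOTA $PERIOD" in microseconds,
 * quota first, with "max" meaning no limit. Writing "50000 100000" therefore
 * allows 50ms of CPU time per 100ms period (half of one CPU), and
 * "max 100000" removes the limit while keeping the period:
 *
 *	echo "50000 100000" > /sys/fs/cgroup/<group>/cpu.max
 *	echo "max 100000"   > /sys/fs/cgroup/<group>/cpu.max
 */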

static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = cpu_weight_read_u64,
		.write_u64 = cpu_weight_write_u64,
	},
	{
		.name = "weight.nice",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = cpu_weight_nice_read_s64,
		.write_s64 = cpu_weight_nice_write_s64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_max_show,
		.write = cpu_max_write,
	},
#endif
	{ }	/* Terminate */
};

struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc	= cpu_cgroup_css_alloc,
	.css_online	= cpu_cgroup_css_online,
	.css_released	= cpu_cgroup_css_released,
	.css_free	= cpu_cgroup_css_free,
	.css_extra_stat_show	= cpu_extra_stat_show,
	.fork		= cpu_cgroup_fork,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.legacy_cftypes	= cpu_legacy_files,
	.dfl_cftypes	= cpu_files,
	.early_init	= true,
	.threaded	= true,
};

#endif /* CONFIG_CGROUP_SCHED */

void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

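/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */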
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};
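
/*
 * Worked example (added for illustration): each step down the table divides
 * the weight by ~1.25, anchored at 1024 for nice 0. So nice 1 is
 * 1024 / 1.25 ~= 820 and nice -1 is 1024 * 1.25 ~= 1277, matching the
 * entries on either side of the nice-0 row above.
 */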
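/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */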
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
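
/*
 * Consistency check by way of example (added for illustration):
 * 2^32 / sched_prio_to_weight[20] = 4294967296 / 1024 = 4194304, which is
 * exactly the nice-0 entry of sched_prio_to_wmult[] above, so a multiply
 * by the inverse followed by a 32-bit right shift stands in for a division
 * by the weight.
 */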

#undef CREATE_TRACE_POINTS