// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#undef CREATE_TRACE_POINTS

#include "sched.h"

#include <linux/nospec.h>

#include <linux/kcov.h>
#include <linux/scs.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>

#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../smpboot.h"

#include "pelt.h"
#include "smp.h"

/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#ifdef CONFIG_SCHED_DEBUG

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT

__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
#endif

const_debug unsigned int sysctl_sched_nr_migrate = 32;

unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;
86
87#ifdef CONFIG_SCHED_CORE
88
89DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
90
91
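/*
 * For core scheduling, task priorities are compared on a single scale where
 * a lower value means a more important task: the stop class first, then
 * deadline tasks, then RT tasks ordered by p->prio, then all fair tasks in
 * one band, and the idle class last.  __task_prio() maps a task onto that
 * scale and prio_less() below compares two tasks with it, breaking ties
 * inside the deadline and fair bands by deadline and by vruntime
 * (cfs_prio_less()) respectively.
 */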
92static inline int __task_prio(struct task_struct *p)
93{
94 if (p->sched_class == &stop_sched_class)
95 return -2;
96
97 if (rt_prio(p->prio))
98 return p->prio;
99
100 if (p->sched_class == &idle_sched_class)
101 return MAX_RT_PRIO + NICE_WIDTH;
102
103 return MAX_RT_PRIO + MAX_NICE;
104}
105
106
107
108
109
110
111
112
113
114static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
115{
116
117 int pa = __task_prio(a), pb = __task_prio(b);
118
119 if (-pa < -pb)
120 return true;
121
122 if (-pb < -pa)
123 return false;
124
125 if (pa == -1)
126 return !dl_time_before(a->dl.deadline, b->dl.deadline);
127
128 if (pa == MAX_RT_PRIO + MAX_NICE)
129 return cfs_prio_less(a, b, in_fi);
130
131 return false;
132}
133
134static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
135{
136 if (a->core_cookie < b->core_cookie)
137 return true;
138
139 if (a->core_cookie > b->core_cookie)
140 return false;
141
142
143 if (prio_less(b, a, task_rq(a)->core->core_forceidle))
144 return true;
145
146 return false;
147}
148
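/*
 * rq->core_tree is an rb-tree of this runqueue's cookie'd tasks, ordered by
 * core_cookie and, within one cookie, by prio_less().  sched_core_enqueue()
 * and sched_core_dequeue() keep it up to date; sched_core_find() looks up
 * the best task for a given cookie (falling back to the idle task when no
 * match exists) and sched_core_next() iterates the matches in order.
 */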
149#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
150
151static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
152{
153 return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
154}
155
156static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
157{
158 const struct task_struct *p = __node_2_sc(node);
159 unsigned long cookie = (unsigned long)key;
160
161 if (cookie < p->core_cookie)
162 return -1;
163
164 if (cookie > p->core_cookie)
165 return 1;
166
167 return 0;
168}
169
170void sched_core_enqueue(struct rq *rq, struct task_struct *p)
171{
172 rq->core->core_task_seq++;
173
174 if (!p->core_cookie)
175 return;
176
177 rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
178}
179
180void sched_core_dequeue(struct rq *rq, struct task_struct *p)
181{
182 rq->core->core_task_seq++;
183
184 if (!sched_core_enqueued(p))
185 return;
186
187 rb_erase(&p->core_node, &rq->core_tree);
188 RB_CLEAR_NODE(&p->core_node);
189}
190
191
192
193
194static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
195{
196 struct rb_node *node;
197
198 node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
199
200
201
202 if (!node)
203 return idle_sched_class.pick_task(rq);
204
205 return __node_2_sc(node);
206}
207
208static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
209{
210 struct rb_node *node = &p->core_node;
211
212 node = rb_next(node);
213 if (!node)
214 return NULL;
215
216 p = container_of(node, struct task_struct, core_node);
217 if (p->core_cookie != cookie)
218 return NULL;
219
220 return p;
221}
222
235
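/*
 * sched_core_get() and sched_core_put() below reference-count the users of
 * core scheduling.  The 0 -> 1 transition takes sched_core_mutex, enables
 * the __sched_core_enabled static key and flips rq->core_enabled everywhere
 * via __sched_core_flip(); the last put undoes all of this.  The final
 * decrement is pushed out to a workqueue, so sched_core_put() itself never
 * blocks on the mutex and can (presumably) be used from contexts that must
 * not sleep.
 */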
236static DEFINE_MUTEX(sched_core_mutex);
237static atomic_t sched_core_count;
238static struct cpumask sched_core_mask;
239
240static void sched_core_lock(int cpu, unsigned long *flags)
241{
242 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
243 int t, i = 0;
244
245 local_irq_save(*flags);
246 for_each_cpu(t, smt_mask)
247 raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
248}
249
250static void sched_core_unlock(int cpu, unsigned long *flags)
251{
252 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
253 int t;
254
255 for_each_cpu(t, smt_mask)
256 raw_spin_unlock(&cpu_rq(t)->__lock);
257 local_irq_restore(*flags);
258}
259
260static void __sched_core_flip(bool enabled)
261{
262 unsigned long flags;
263 int cpu, t;
264
265 cpus_read_lock();
266
267
268
269
270 cpumask_copy(&sched_core_mask, cpu_online_mask);
271 for_each_cpu(cpu, &sched_core_mask) {
272 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
273
274 sched_core_lock(cpu, &flags);
275
276 for_each_cpu(t, smt_mask)
277 cpu_rq(t)->core_enabled = enabled;
278
279 sched_core_unlock(cpu, &flags);
280
281 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
282 }
283
284
285
286
287 cpumask_copy(&sched_core_mask, cpu_possible_mask);
288 cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
289
290 for_each_cpu(cpu, &sched_core_mask)
291 cpu_rq(cpu)->core_enabled = enabled;
292
293 cpus_read_unlock();
294}
295
296static void sched_core_assert_empty(void)
297{
298 int cpu;
299
300 for_each_possible_cpu(cpu)
301 WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
302}
303
304static void __sched_core_enable(void)
305{
306 static_branch_enable(&__sched_core_enabled);
307
308
309
310
311 synchronize_rcu();
312 __sched_core_flip(true);
313 sched_core_assert_empty();
314}
315
316static void __sched_core_disable(void)
317{
318 sched_core_assert_empty();
319 __sched_core_flip(false);
320 static_branch_disable(&__sched_core_enabled);
321}
322
323void sched_core_get(void)
324{
325 if (atomic_inc_not_zero(&sched_core_count))
326 return;
327
328 mutex_lock(&sched_core_mutex);
329 if (!atomic_read(&sched_core_count))
330 __sched_core_enable();
331
332 smp_mb__before_atomic();
333 atomic_inc(&sched_core_count);
334 mutex_unlock(&sched_core_mutex);
335}
336
337static void __sched_core_put(struct work_struct *work)
338{
339 if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
340 __sched_core_disable();
341 mutex_unlock(&sched_core_mutex);
342 }
343}
344
345void sched_core_put(void)
346{
347 static DECLARE_WORK(_work, __sched_core_put);
348
349
350
351
352
353
354
355
356 if (!atomic_add_unless(&sched_core_count, -1, 1))
357 schedule_work(&_work);
358}
359
360#else
361
362static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
363static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
364
365#endif
366
367
368
369
370
371int sysctl_sched_rt_runtime = 950000;
372
466
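/*
 * With core scheduling enabled the SMT siblings of a core share one rq
 * lock: __rq_lockp(rq) may point at another CPU's rq->__lock, and that
 * pointer can change underneath us while core scheduling is switched on or
 * off.  The helpers below therefore take the lock, re-check __rq_lockp()
 * and retry if it changed in the meantime.
 */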
467void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
468{
469 raw_spinlock_t *lock;
470
471
472 preempt_disable();
473 if (sched_core_disabled()) {
474 raw_spin_lock_nested(&rq->__lock, subclass);
475
476 preempt_enable_no_resched();
477 return;
478 }
479
480 for (;;) {
481 lock = __rq_lockp(rq);
482 raw_spin_lock_nested(lock, subclass);
483 if (likely(lock == __rq_lockp(rq))) {
484
485 preempt_enable_no_resched();
486 return;
487 }
488 raw_spin_unlock(lock);
489 }
490}
491
492bool raw_spin_rq_trylock(struct rq *rq)
493{
494 raw_spinlock_t *lock;
495 bool ret;
496
497
498 preempt_disable();
499 if (sched_core_disabled()) {
500 ret = raw_spin_trylock(&rq->__lock);
501 preempt_enable();
502 return ret;
503 }
504
505 for (;;) {
506 lock = __rq_lockp(rq);
507 ret = raw_spin_trylock(lock);
508 if (!ret || (likely(lock == __rq_lockp(rq)))) {
509 preempt_enable();
510 return ret;
511 }
512 raw_spin_unlock(lock);
513 }
514}
515
516void raw_spin_rq_unlock(struct rq *rq)
517{
518 raw_spin_unlock(rq_lockp(rq));
519}
520
521#ifdef CONFIG_SMP
522
523
524
525void double_rq_lock(struct rq *rq1, struct rq *rq2)
526{
527 lockdep_assert_irqs_disabled();
528
529 if (rq_order_less(rq2, rq1))
530 swap(rq1, rq2);
531
532 raw_spin_rq_lock(rq1);
533 if (__rq_lockp(rq1) == __rq_lockp(rq2))
534 return;
535
536 raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
537}
538#endif
539
540
541
542
543struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
544 __acquires(rq->lock)
545{
546 struct rq *rq;
547
548 lockdep_assert_held(&p->pi_lock);
549
550 for (;;) {
551 rq = task_rq(p);
552 raw_spin_rq_lock(rq);
553 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
554 rq_pin_lock(rq, rf);
555 return rq;
556 }
557 raw_spin_rq_unlock(rq);
558
559 while (unlikely(task_on_rq_migrating(p)))
560 cpu_relax();
561 }
562}
563
564
565
566
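/*
 * task_rq_lock() acquires p->pi_lock and the lock of the task's runqueue,
 * retrying until the task is seen on a stable rq and is not in the middle
 * of a migration; __task_rq_lock() above does the same for callers that
 * already hold p->pi_lock.
 */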
567struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
568 __acquires(p->pi_lock)
569 __acquires(rq->lock)
570{
571 struct rq *rq;
572
573 for (;;) {
574 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
575 rq = task_rq(p);
576 raw_spin_rq_lock(rq);
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
595 rq_pin_lock(rq, rf);
596 return rq;
597 }
598 raw_spin_rq_unlock(rq);
599 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
600
601 while (unlikely(task_on_rq_migrating(p)))
602 cpu_relax();
603 }
604}
605
609
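/*
 * rq->clock_task advances like rq->clock except that, depending on the
 * configuration, it excludes time spent in hard IRQ context
 * (CONFIG_IRQ_TIME_ACCOUNTING) and time stolen by the hypervisor
 * (CONFIG_PARAVIRT_TIME_ACCOUNTING), so task runtime accounting only sees
 * time the task could actually have been running.
 */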
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}
663
664void update_rq_clock(struct rq *rq)
665{
666 s64 delta;
667
668 lockdep_assert_rq_held(rq);
669
670 if (rq->clock_update_flags & RQCF_ACT_SKIP)
671 return;
672
673#ifdef CONFIG_SCHED_DEBUG
674 if (sched_feat(WARN_DOUBLE_CLOCK))
675 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
676 rq->clock_update_flags |= RQCF_UPDATED;
677#endif
678
679 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
680 if (delta < 0)
681 return;
682 rq->clock += delta;
683 update_rq_clock_task(rq, delta);
684}
685
686#ifdef CONFIG_SCHED_HRTICK
687
688
689
690
691static void hrtick_clear(struct rq *rq)
692{
693 if (hrtimer_active(&rq->hrtick_timer))
694 hrtimer_cancel(&rq->hrtick_timer);
695}
696
697
698
699
700
701static enum hrtimer_restart hrtick(struct hrtimer *timer)
702{
703 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
704 struct rq_flags rf;
705
706 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
707
708 rq_lock(rq, &rf);
709 update_rq_clock(rq);
710 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
711 rq_unlock(rq, &rf);
712
713 return HRTIMER_NORESTART;
714}
715
716#ifdef CONFIG_SMP
717
718static void __hrtick_restart(struct rq *rq)
719{
720 struct hrtimer *timer = &rq->hrtick_timer;
721 ktime_t time = rq->hrtick_time;
722
723 hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
724}
725
726
727
728
729static void __hrtick_start(void *arg)
730{
731 struct rq *rq = arg;
732 struct rq_flags rf;
733
734 rq_lock(rq, &rf);
735 __hrtick_restart(rq);
736 rq_unlock(rq, &rf);
737}
738
739
740
741
742
743
744void hrtick_start(struct rq *rq, u64 delay)
745{
746 struct hrtimer *timer = &rq->hrtick_timer;
747 s64 delta;
748
749
750
751
752
753 delta = max_t(s64, delay, 10000LL);
754 rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
755
756 if (rq == this_rq())
757 __hrtick_restart(rq);
758 else
759 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
760}
761
762#else
763
764
765
766
767
768void hrtick_start(struct rq *rq, u64 delay)
769{
770
771
772
773
774 delay = max_t(u64, delay, 10000LL);
775 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
776 HRTIMER_MODE_REL_PINNED_HARD);
777}
778
779#endif
780
781static void hrtick_rq_init(struct rq *rq)
782{
783#ifdef CONFIG_SMP
784 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
785#endif
786 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
787 rq->hrtick_timer.function = hrtick;
788}
789#else
790static inline void hrtick_clear(struct rq *rq)
791{
792}
793
794static inline void hrtick_rq_init(struct rq *rq)
795{
796}
797#endif
798
799
800
801
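/*
 * fetch_or() atomically ORs @mask into *@ptr via a cmpxchg() retry loop and
 * returns the value that was there before.  It is what lets
 * set_nr_and_not_polling() set TIF_NEED_RESCHED while simultaneously
 * observing TIF_POLLING_NRFLAG, roughly (illustrative only):
 *
 *	old = fetch_or(&ti->flags, _TIF_NEED_RESCHED);
 *	if (!(old & _TIF_POLLING_NRFLAG))
 *		smp_send_reschedule(cpu);
 */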
802#define fetch_or(ptr, mask) \
803 ({ \
804 typeof(ptr) _ptr = (ptr); \
805 typeof(mask) _mask = (mask); \
806 typeof(*_ptr) _old, _val = *_ptr; \
807 \
808 for (;;) { \
809 _old = cmpxchg(_ptr, _val, _val | _mask); \
810 if (_old == _val) \
811 break; \
812 _val = _old; \
813 } \
814 _old; \
815})
816
817#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
818
819
820
821
822
823static bool set_nr_and_not_polling(struct task_struct *p)
824{
825 struct thread_info *ti = task_thread_info(p);
826 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
827}
828
829
830
831
832
833
834
835static bool set_nr_if_polling(struct task_struct *p)
836{
837 struct thread_info *ti = task_thread_info(p);
838 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
839
840 for (;;) {
841 if (!(val & _TIF_POLLING_NRFLAG))
842 return false;
843 if (val & _TIF_NEED_RESCHED)
844 return true;
845 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
846 if (old == val)
847 break;
848 val = old;
849 }
850 return true;
851}
852
853#else
854static bool set_nr_and_not_polling(struct task_struct *p)
855{
856 set_tsk_need_resched(p);
857 return true;
858}
859
860#ifdef CONFIG_SMP
861static bool set_nr_if_polling(struct task_struct *p)
862{
863 return false;
864}
865#endif
866#endif
867
868static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
869{
870 struct wake_q_node *node = &task->wake_q;
871
872
873
874
875
876
877
878
879
880 smp_mb__before_atomic();
881 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
882 return false;
883
884
885
886
887 *head->lastp = node;
888 head->lastp = &node->next;
889 return true;
890}
891
903
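/*
 * wake_q_add() queues @task on @head for a later wakeup by wake_up_q() and
 * takes a reference on the task.  A task can only ever be on one wake_q at
 * a time; if it is already queued the add is a no-op and no extra reference
 * is taken.  The typical pattern is to collect wakeups while holding a lock
 * and to issue them once the lock has been dropped, roughly:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	spin_lock(&some_lock);
 *	wake_q_add(&wake_q, task);
 *	spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);
 *
 * (some_lock is purely illustrative.)
 */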
904void wake_q_add(struct wake_q_head *head, struct task_struct *task)
905{
906 if (__wake_q_add(head, task))
907 get_task_struct(task);
908}
909
927void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
928{
929 if (!__wake_q_add(head, task))
930 put_task_struct(task);
931}
932
933void wake_up_q(struct wake_q_head *head)
934{
935 struct wake_q_node *node = head->first;
936
937 while (node != WAKE_Q_TAIL) {
938 struct task_struct *task;
939
940 task = container_of(node, struct task_struct, wake_q);
941
942 node = node->next;
943 task->wake_q.next = NULL;
944
945
946
947
948
949 wake_up_process(task);
950 put_task_struct(task);
951 }
952}
953
960
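/*
 * resched_curr() marks the task currently running on @rq as needing to
 * reschedule.  On the local CPU it also sets the preempt-need-resched bit;
 * for a remote CPU it sends a reschedule IPI, unless that CPU is polling
 * TIF_NEED_RESCHED in its idle loop and will notice the flag by itself.
 */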
961void resched_curr(struct rq *rq)
962{
963 struct task_struct *curr = rq->curr;
964 int cpu;
965
966 lockdep_assert_rq_held(rq);
967
968 if (test_tsk_need_resched(curr))
969 return;
970
971 cpu = cpu_of(rq);
972
973 if (cpu == smp_processor_id()) {
974 set_tsk_need_resched(curr);
975 set_preempt_need_resched();
976 return;
977 }
978
979 if (set_nr_and_not_polling(curr))
980 smp_send_reschedule(cpu);
981 else
982 trace_sched_wake_idle_without_ipi(cpu);
983}
984
985void resched_cpu(int cpu)
986{
987 struct rq *rq = cpu_rq(cpu);
988 unsigned long flags;
989
990 raw_spin_rq_lock_irqsave(rq, flags);
991 if (cpu_online(cpu) || cpu == smp_processor_id())
992 resched_curr(rq);
993 raw_spin_rq_unlock_irqrestore(rq, flags);
994}
995
996#ifdef CONFIG_SMP
997#ifdef CONFIG_NO_HZ_COMMON
1005
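/*
 * get_nohz_timer_target() picks a CPU on which to queue a timer: the
 * current CPU if it is a busy housekeeping CPU, otherwise the nearest busy
 * housekeeping CPU found in the scheduler domains, falling back to any
 * housekeeping CPU.  The idea is to avoid waking idle (or nohz_full) CPUs
 * just to service timers.
 */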
1006int get_nohz_timer_target(void)
1007{
1008 int i, cpu = smp_processor_id(), default_cpu = -1;
1009 struct sched_domain *sd;
1010
1011 if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1012 if (!idle_cpu(cpu))
1013 return cpu;
1014 default_cpu = cpu;
1015 }
1016
1017 rcu_read_lock();
1018 for_each_domain(cpu, sd) {
1019 for_each_cpu_and(i, sched_domain_span(sd),
1020 housekeeping_cpumask(HK_FLAG_TIMER)) {
1021 if (cpu == i)
1022 continue;
1023
1024 if (!idle_cpu(i)) {
1025 cpu = i;
1026 goto unlock;
1027 }
1028 }
1029 }
1030
1031 if (default_cpu == -1)
1032 default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1033 cpu = default_cpu;
1034unlock:
1035 rcu_read_unlock();
1036 return cpu;
1037}
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049static void wake_up_idle_cpu(int cpu)
1050{
1051 struct rq *rq = cpu_rq(cpu);
1052
1053 if (cpu == smp_processor_id())
1054 return;
1055
1056 if (set_nr_and_not_polling(rq->idle))
1057 smp_send_reschedule(cpu);
1058 else
1059 trace_sched_wake_idle_without_ipi(cpu);
1060}
1061
1062static bool wake_up_full_nohz_cpu(int cpu)
1063{
1064
1065
1066
1067
1068
1069
1070 if (cpu_is_offline(cpu))
1071 return true;
1072 if (tick_nohz_full_cpu(cpu)) {
1073 if (cpu != smp_processor_id() ||
1074 tick_nohz_tick_stopped())
1075 tick_nohz_full_kick_cpu(cpu);
1076 return true;
1077 }
1078
1079 return false;
1080}
1081
1082
1083
1084
1085
1086
1087void wake_up_nohz_cpu(int cpu)
1088{
1089 if (!wake_up_full_nohz_cpu(cpu))
1090 wake_up_idle_cpu(cpu);
1091}
1092
1093static void nohz_csd_func(void *info)
1094{
1095 struct rq *rq = info;
1096 int cpu = cpu_of(rq);
1097 unsigned int flags;
1098
1099
1100
1101
1102 flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
1103 WARN_ON(!(flags & NOHZ_KICK_MASK));
1104
1105 rq->idle_balance = idle_cpu(cpu);
1106 if (rq->idle_balance && !need_resched()) {
1107 rq->nohz_idle_balance = flags;
1108 raise_softirq_irqoff(SCHED_SOFTIRQ);
1109 }
1110}
1111
1112#endif
1113
1114#ifdef CONFIG_NO_HZ_FULL
1115bool sched_can_stop_tick(struct rq *rq)
1116{
1117 int fifo_nr_running;
1118
1119
1120 if (rq->dl.dl_nr_running)
1121 return false;
1122
1123
1124
1125
1126
1127 if (rq->rt.rr_nr_running) {
1128 if (rq->rt.rr_nr_running == 1)
1129 return true;
1130 else
1131 return false;
1132 }
1133
1134
1135
1136
1137
1138 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1139 if (fifo_nr_running)
1140 return true;
1141
1142
1143
1144
1145
1146
1147 if (rq->nr_running > 1)
1148 return false;
1149
1150 return true;
1151}
1152#endif
1153#endif
1154
1155#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1156 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1162
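/*
 * walk_tg_tree_from() iterates the task_group hierarchy rooted at @from,
 * invoking @down when a group is first visited and @up when it is left for
 * the last time; a non-zero return value from either visitor aborts the
 * walk.  The children lists are RCU protected, so callers hold the RCU read
 * lock.  E.g. (illustrative, my_up_visitor being hypothetical):
 *
 *	rcu_read_lock();
 *	ret = walk_tg_tree_from(&root_task_group, tg_nop, my_up_visitor, data);
 *	rcu_read_unlock();
 */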
1163int walk_tg_tree_from(struct task_group *from,
1164 tg_visitor down, tg_visitor up, void *data)
1165{
1166 struct task_group *parent, *child;
1167 int ret;
1168
1169 parent = from;
1170
1171down:
1172 ret = (*down)(parent, data);
1173 if (ret)
1174 goto out;
1175 list_for_each_entry_rcu(child, &parent->children, siblings) {
1176 parent = child;
1177 goto down;
1178
1179up:
1180 continue;
1181 }
1182 ret = (*up)(parent, data);
1183 if (ret || parent == from)
1184 goto out;
1185
1186 child = parent;
1187 parent = parent->parent;
1188 if (parent)
1189 goto up;
1190out:
1191 return ret;
1192}
1193
1194int tg_nop(struct task_group *tg, void *data)
1195{
1196 return 0;
1197}
1198#endif
1199
1200static void set_load_weight(struct task_struct *p, bool update_load)
1201{
1202 int prio = p->static_prio - MAX_RT_PRIO;
1203 struct load_weight *load = &p->se.load;
1204
1205
1206
1207
1208 if (task_has_idle_policy(p)) {
1209 load->weight = scale_load(WEIGHT_IDLEPRIO);
1210 load->inv_weight = WMULT_IDLEPRIO;
1211 return;
1212 }
1213
1214
1215
1216
1217
1218 if (update_load && p->sched_class == &fair_sched_class) {
1219 reweight_task(p, prio);
1220 } else {
1221 load->weight = scale_load(sched_prio_to_weight[prio]);
1222 load->inv_weight = sched_prio_to_wmult[prio];
1223 }
1224}
1225
1226#ifdef CONFIG_UCLAMP_TASK
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237static DEFINE_MUTEX(uclamp_mutex);
1238
1239
1240unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1241
1242
1243unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1261
1262
1263static struct uclamp_se uclamp_default[UCLAMP_CNT];
1282
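/*
 * sched_uclamp_used is a static key that remains false until utilization
 * clamping is actually requested (through the sysctls above or a task's
 * sched attributes).  Until then, uclamp_rq_inc()/uclamp_rq_dec() below
 * bail out early, so the enqueue/dequeue fast paths pay almost nothing for
 * an unused feature.
 */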
1283DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1284
1285
1286#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1287
1288#define for_each_clamp_id(clamp_id) \
1289 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1290
1291static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1292{
1293 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
1294}
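/*
 * For example, with SCHED_CAPACITY_SCALE == 1024 and the default
 * UCLAMP_BUCKETS of 5, UCLAMP_BUCKET_DELTA is DIV_ROUND_CLOSEST(1024, 5) ==
 * 205: a clamp value of 512 falls into bucket 512 / 205 == 2, and the
 * maximum value 1024 lands in the last bucket, 4.
 */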
1295
1296static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1297{
1298 if (clamp_id == UCLAMP_MIN)
1299 return 0;
1300 return SCHED_CAPACITY_SCALE;
1301}
1302
1303static inline void uclamp_se_set(struct uclamp_se *uc_se,
1304 unsigned int value, bool user_defined)
1305{
1306 uc_se->value = value;
1307 uc_se->bucket_id = uclamp_bucket_id(value);
1308 uc_se->user_defined = user_defined;
1309}
1310
1311static inline unsigned int
1312uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
1313 unsigned int clamp_value)
1314{
1315
1316
1317
1318
1319
1320 if (clamp_id == UCLAMP_MAX) {
1321 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1322 return clamp_value;
1323 }
1324
1325 return uclamp_none(UCLAMP_MIN);
1326}
1327
1328static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1329 unsigned int clamp_value)
1330{
1331
1332 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1333 return;
1334
1335 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1336}
1337
1338static inline
1339unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
1340 unsigned int clamp_value)
1341{
1342 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1343 int bucket_id = UCLAMP_BUCKETS - 1;
1344
1345
1346
1347
1348
1349 for ( ; bucket_id >= 0; bucket_id--) {
1350 if (!bucket[bucket_id].tasks)
1351 continue;
1352 return bucket[bucket_id].value;
1353 }
1354
1355
1356 return uclamp_idle_value(rq, clamp_id, clamp_value);
1357}
1358
1359static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1360{
1361 unsigned int default_util_min;
1362 struct uclamp_se *uc_se;
1363
1364 lockdep_assert_held(&p->pi_lock);
1365
1366 uc_se = &p->uclamp_req[UCLAMP_MIN];
1367
1368
1369 if (uc_se->user_defined)
1370 return;
1371
1372 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1373 uclamp_se_set(uc_se, default_util_min, false);
1374}
1375
1376static void uclamp_update_util_min_rt_default(struct task_struct *p)
1377{
1378 struct rq_flags rf;
1379 struct rq *rq;
1380
1381 if (!rt_task(p))
1382 return;
1383
1384
1385 rq = task_rq_lock(p, &rf);
1386 __uclamp_update_util_min_rt_default(p);
1387 task_rq_unlock(rq, p, &rf);
1388}
1389
1390static void uclamp_sync_util_min_rt_default(void)
1391{
1392 struct task_struct *g, *p;
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407 read_lock(&tasklist_lock);
1408 smp_mb__after_spinlock();
1409 read_unlock(&tasklist_lock);
1410
1411 rcu_read_lock();
1412 for_each_process_thread(g, p)
1413 uclamp_update_util_min_rt_default(p);
1414 rcu_read_unlock();
1415}
1416
1417static inline struct uclamp_se
1418uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1419{
1420
1421 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1422#ifdef CONFIG_UCLAMP_TASK_GROUP
1423 unsigned int tg_min, tg_max, value;
1424
1425
1426
1427
1428
1429 if (task_group_is_autogroup(task_group(p)))
1430 return uc_req;
1431 if (task_group(p) == &root_task_group)
1432 return uc_req;
1433
1434 tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1435 tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1436 value = uc_req.value;
1437 value = clamp(value, tg_min, tg_max);
1438 uclamp_se_set(&uc_req, value, false);
1439#endif
1440
1441 return uc_req;
1442}
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452static inline struct uclamp_se
1453uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1454{
1455 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1456 struct uclamp_se uc_max = uclamp_default[clamp_id];
1457
1458
1459 if (unlikely(uc_req.value > uc_max.value))
1460 return uc_max;
1461
1462 return uc_req;
1463}
1464
1465unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1466{
1467 struct uclamp_se uc_eff;
1468
1469
1470 if (p->uclamp[clamp_id].active)
1471 return (unsigned long)p->uclamp[clamp_id].value;
1472
1473 uc_eff = uclamp_eff_get(p, clamp_id);
1474
1475 return (unsigned long)uc_eff.value;
1476}
1477
1487
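/*
 * Per-rq clamp bookkeeping: for each clamp index the rq keeps an array of
 * buckets counting how many runnable tasks request a clamp value in that
 * bucket's range.  uclamp_rq_inc_id() adds a task to its bucket and may
 * raise the rq-wide clamp value; uclamp_rq_dec_id() removes it and, when
 * the bucket empties, recomputes the rq-wide value from the highest
 * remaining non-empty bucket (or the idle default if none is left).
 */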
1488static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1489 enum uclamp_id clamp_id)
1490{
1491 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1492 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1493 struct uclamp_bucket *bucket;
1494
1495 lockdep_assert_rq_held(rq);
1496
1497
1498 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1499
1500 bucket = &uc_rq->bucket[uc_se->bucket_id];
1501 bucket->tasks++;
1502 uc_se->active = true;
1503
1504 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1505
1506
1507
1508
1509
1510 if (bucket->tasks == 1 || uc_se->value > bucket->value)
1511 bucket->value = uc_se->value;
1512
1513 if (uc_se->value > READ_ONCE(uc_rq->value))
1514 WRITE_ONCE(uc_rq->value, uc_se->value);
1515}
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1527 enum uclamp_id clamp_id)
1528{
1529 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1530 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1531 struct uclamp_bucket *bucket;
1532 unsigned int bkt_clamp;
1533 unsigned int rq_clamp;
1534
1535 lockdep_assert_rq_held(rq);
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560 if (unlikely(!uc_se->active))
1561 return;
1562
1563 bucket = &uc_rq->bucket[uc_se->bucket_id];
1564
1565 SCHED_WARN_ON(!bucket->tasks);
1566 if (likely(bucket->tasks))
1567 bucket->tasks--;
1568
1569 uc_se->active = false;
1570
1571
1572
1573
1574
1575
1576
1577 if (likely(bucket->tasks))
1578 return;
1579
1580 rq_clamp = READ_ONCE(uc_rq->value);
1581
1582
1583
1584
1585 SCHED_WARN_ON(bucket->value > rq_clamp);
1586 if (bucket->value >= rq_clamp) {
1587 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1588 WRITE_ONCE(uc_rq->value, bkt_clamp);
1589 }
1590}
1591
1592static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1593{
1594 enum uclamp_id clamp_id;
1595
1596
1597
1598
1599
1600
1601
1602 if (!static_branch_unlikely(&sched_uclamp_used))
1603 return;
1604
1605 if (unlikely(!p->sched_class->uclamp_enabled))
1606 return;
1607
1608 for_each_clamp_id(clamp_id)
1609 uclamp_rq_inc_id(rq, p, clamp_id);
1610
1611
1612 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1613 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1614}
1615
1616static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1617{
1618 enum uclamp_id clamp_id;
1619
1620
1621
1622
1623
1624
1625
1626 if (!static_branch_unlikely(&sched_uclamp_used))
1627 return;
1628
1629 if (unlikely(!p->sched_class->uclamp_enabled))
1630 return;
1631
1632 for_each_clamp_id(clamp_id)
1633 uclamp_rq_dec_id(rq, p, clamp_id);
1634}
1635
1636static inline void
1637uclamp_update_active(struct task_struct *p)
1638{
1639 enum uclamp_id clamp_id;
1640 struct rq_flags rf;
1641 struct rq *rq;
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651 rq = task_rq_lock(p, &rf);
1652
1653
1654
1655
1656
1657
1658
1659 for_each_clamp_id(clamp_id) {
1660 if (p->uclamp[clamp_id].active) {
1661 uclamp_rq_dec_id(rq, p, clamp_id);
1662 uclamp_rq_inc_id(rq, p, clamp_id);
1663 }
1664 }
1665
1666 task_rq_unlock(rq, p, &rf);
1667}
1668
1669#ifdef CONFIG_UCLAMP_TASK_GROUP
1670static inline void
1671uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1672{
1673 struct css_task_iter it;
1674 struct task_struct *p;
1675
1676 css_task_iter_start(css, 0, &it);
1677 while ((p = css_task_iter_next(&it)))
1678 uclamp_update_active(p);
1679 css_task_iter_end(&it);
1680}
1681
1682static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1683static void uclamp_update_root_tg(void)
1684{
1685 struct task_group *tg = &root_task_group;
1686
1687 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1688 sysctl_sched_uclamp_util_min, false);
1689 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1690 sysctl_sched_uclamp_util_max, false);
1691
1692 rcu_read_lock();
1693 cpu_util_update_eff(&root_task_group.css);
1694 rcu_read_unlock();
1695}
1696#else
1697static void uclamp_update_root_tg(void) { }
1698#endif
1699
1700int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1701 void *buffer, size_t *lenp, loff_t *ppos)
1702{
1703 bool update_root_tg = false;
1704 int old_min, old_max, old_min_rt;
1705 int result;
1706
1707 mutex_lock(&uclamp_mutex);
1708 old_min = sysctl_sched_uclamp_util_min;
1709 old_max = sysctl_sched_uclamp_util_max;
1710 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1711
1712 result = proc_dointvec(table, write, buffer, lenp, ppos);
1713 if (result)
1714 goto undo;
1715 if (!write)
1716 goto done;
1717
1718 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1719 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1720 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1721
1722 result = -EINVAL;
1723 goto undo;
1724 }
1725
1726 if (old_min != sysctl_sched_uclamp_util_min) {
1727 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1728 sysctl_sched_uclamp_util_min, false);
1729 update_root_tg = true;
1730 }
1731 if (old_max != sysctl_sched_uclamp_util_max) {
1732 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1733 sysctl_sched_uclamp_util_max, false);
1734 update_root_tg = true;
1735 }
1736
1737 if (update_root_tg) {
1738 static_branch_enable(&sched_uclamp_used);
1739 uclamp_update_root_tg();
1740 }
1741
1742 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1743 static_branch_enable(&sched_uclamp_used);
1744 uclamp_sync_util_min_rt_default();
1745 }
1746
1747
1748
1749
1750
1751
1752
1753 goto done;
1754
1755undo:
1756 sysctl_sched_uclamp_util_min = old_min;
1757 sysctl_sched_uclamp_util_max = old_max;
1758 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1759done:
1760 mutex_unlock(&uclamp_mutex);
1761
1762 return result;
1763}
1764
1765static int uclamp_validate(struct task_struct *p,
1766 const struct sched_attr *attr)
1767{
1768 int util_min = p->uclamp_req[UCLAMP_MIN].value;
1769 int util_max = p->uclamp_req[UCLAMP_MAX].value;
1770
1771 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1772 util_min = attr->sched_util_min;
1773
1774 if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1775 return -EINVAL;
1776 }
1777
1778 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1779 util_max = attr->sched_util_max;
1780
1781 if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1782 return -EINVAL;
1783 }
1784
1785 if (util_min != -1 && util_max != -1 && util_min > util_max)
1786 return -EINVAL;
1787
1788
1789
1790
1791
1792
1793
1794
1795 static_branch_enable(&sched_uclamp_used);
1796
1797 return 0;
1798}
1799
1800static bool uclamp_reset(const struct sched_attr *attr,
1801 enum uclamp_id clamp_id,
1802 struct uclamp_se *uc_se)
1803{
1804
1805 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1806 !uc_se->user_defined)
1807 return true;
1808
1809
1810 if (clamp_id == UCLAMP_MIN &&
1811 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1812 attr->sched_util_min == -1) {
1813 return true;
1814 }
1815
1816 if (clamp_id == UCLAMP_MAX &&
1817 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1818 attr->sched_util_max == -1) {
1819 return true;
1820 }
1821
1822 return false;
1823}
1824
1825static void __setscheduler_uclamp(struct task_struct *p,
1826 const struct sched_attr *attr)
1827{
1828 enum uclamp_id clamp_id;
1829
1830 for_each_clamp_id(clamp_id) {
1831 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1832 unsigned int value;
1833
1834 if (!uclamp_reset(attr, clamp_id, uc_se))
1835 continue;
1836
1837
1838
1839
1840
1841 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1842 value = sysctl_sched_uclamp_util_min_rt_default;
1843 else
1844 value = uclamp_none(clamp_id);
1845
1846 uclamp_se_set(uc_se, value, false);
1847
1848 }
1849
1850 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1851 return;
1852
1853 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1854 attr->sched_util_min != -1) {
1855 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1856 attr->sched_util_min, true);
1857 }
1858
1859 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1860 attr->sched_util_max != -1) {
1861 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1862 attr->sched_util_max, true);
1863 }
1864}
1865
1866static void uclamp_fork(struct task_struct *p)
1867{
1868 enum uclamp_id clamp_id;
1869
1870
1871
1872
1873
1874 for_each_clamp_id(clamp_id)
1875 p->uclamp[clamp_id].active = false;
1876
1877 if (likely(!p->sched_reset_on_fork))
1878 return;
1879
1880 for_each_clamp_id(clamp_id) {
1881 uclamp_se_set(&p->uclamp_req[clamp_id],
1882 uclamp_none(clamp_id), false);
1883 }
1884}
1885
1886static void uclamp_post_fork(struct task_struct *p)
1887{
1888 uclamp_update_util_min_rt_default(p);
1889}
1890
1891static void __init init_uclamp_rq(struct rq *rq)
1892{
1893 enum uclamp_id clamp_id;
1894 struct uclamp_rq *uc_rq = rq->uclamp;
1895
1896 for_each_clamp_id(clamp_id) {
1897 uc_rq[clamp_id] = (struct uclamp_rq) {
1898 .value = uclamp_none(clamp_id)
1899 };
1900 }
1901
1902 rq->uclamp_flags = 0;
1903}
1904
1905static void __init init_uclamp(void)
1906{
1907 struct uclamp_se uc_max = {};
1908 enum uclamp_id clamp_id;
1909 int cpu;
1910
1911 for_each_possible_cpu(cpu)
1912 init_uclamp_rq(cpu_rq(cpu));
1913
1914 for_each_clamp_id(clamp_id) {
1915 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1916 uclamp_none(clamp_id), false);
1917 }
1918
1919
1920 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1921 for_each_clamp_id(clamp_id) {
1922 uclamp_default[clamp_id] = uc_max;
1923#ifdef CONFIG_UCLAMP_TASK_GROUP
1924 root_task_group.uclamp_req[clamp_id] = uc_max;
1925 root_task_group.uclamp[clamp_id] = uc_max;
1926#endif
1927 }
1928}
1929
1930#else
1931static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1932static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1933static inline int uclamp_validate(struct task_struct *p,
1934 const struct sched_attr *attr)
1935{
1936 return -EOPNOTSUPP;
1937}
1938static void __setscheduler_uclamp(struct task_struct *p,
1939 const struct sched_attr *attr) { }
1940static inline void uclamp_fork(struct task_struct *p) { }
1941static inline void uclamp_post_fork(struct task_struct *p) { }
1942static inline void init_uclamp(void) { }
1943#endif
1944
1945bool sched_task_on_rq(struct task_struct *p)
1946{
1947 return task_on_rq_queued(p);
1948}
1949
1950static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1951{
1952 if (!(flags & ENQUEUE_NOCLOCK))
1953 update_rq_clock(rq);
1954
1955 if (!(flags & ENQUEUE_RESTORE)) {
1956 sched_info_enqueue(rq, p);
1957 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1958 }
1959
1960 uclamp_rq_inc(rq, p);
1961 p->sched_class->enqueue_task(rq, p, flags);
1962
1963 if (sched_core_enabled(rq))
1964 sched_core_enqueue(rq, p);
1965}
1966
1967static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1968{
1969 if (sched_core_enabled(rq))
1970 sched_core_dequeue(rq, p);
1971
1972 if (!(flags & DEQUEUE_NOCLOCK))
1973 update_rq_clock(rq);
1974
1975 if (!(flags & DEQUEUE_SAVE)) {
1976 sched_info_dequeue(rq, p);
1977 psi_dequeue(p, flags & DEQUEUE_SLEEP);
1978 }
1979
1980 uclamp_rq_dec(rq, p);
1981 p->sched_class->dequeue_task(rq, p, flags);
1982}
1983
1984void activate_task(struct rq *rq, struct task_struct *p, int flags)
1985{
1986 enqueue_task(rq, p, flags);
1987
1988 p->on_rq = TASK_ON_RQ_QUEUED;
1989}
1990
1991void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1992{
1993 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1994
1995 dequeue_task(rq, p, flags);
1996}
1997
1998static inline int __normal_prio(int policy, int rt_prio, int nice)
1999{
2000 int prio;
2001
2002 if (dl_policy(policy))
2003 prio = MAX_DL_PRIO - 1;
2004 else if (rt_policy(policy))
2005 prio = MAX_RT_PRIO - 1 - rt_prio;
2006 else
2007 prio = NICE_TO_PRIO(nice);
2008
2009 return prio;
2010}
2011
2012
2013
2014
2015
2016
2017
2018
2019static inline int normal_prio(struct task_struct *p)
2020{
2021 return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
2022}
2023
2024
2025
2026
2027
2028
2029
2030
2031static int effective_prio(struct task_struct *p)
2032{
2033 p->normal_prio = normal_prio(p);
2034
2035
2036
2037
2038
2039 if (!rt_prio(p->prio))
2040 return p->normal_prio;
2041 return p->prio;
2042}
2043
2044
2045
2046
2047
2048
2049
2050inline int task_curr(const struct task_struct *p)
2051{
2052 return cpu_curr(task_cpu(p)) == p;
2053}
2054
2055
2056
2057
2058
2059
2060
2061
2062static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2063 const struct sched_class *prev_class,
2064 int oldprio)
2065{
2066 if (prev_class != p->sched_class) {
2067 if (prev_class->switched_from)
2068 prev_class->switched_from(rq, p);
2069
2070 p->sched_class->switched_to(rq, p);
2071 } else if (oldprio != p->prio || dl_task(p))
2072 p->sched_class->prio_changed(rq, p, oldprio);
2073}
2074
2075void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2076{
2077 if (p->sched_class == rq->curr->sched_class)
2078 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2079 else if (p->sched_class > rq->curr->sched_class)
2080 resched_curr(rq);
2081
2082
2083
2084
2085
2086 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
2087 rq_clock_skip_update(rq);
2088}
2089
2090#ifdef CONFIG_SMP
2091
2092static void
2093__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
2094
2095static int __set_cpus_allowed_ptr(struct task_struct *p,
2096 const struct cpumask *new_mask,
2097 u32 flags);
2098
2099static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
2100{
2101 if (likely(!p->migration_disabled))
2102 return;
2103
2104 if (p->cpus_ptr != &p->cpus_mask)
2105 return;
2106
2107
2108
2109
2110 __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
2111}
2112
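/*
 * migrate_disable() pins the calling task to its current CPU; the pin is a
 * per-task counter and nests.  migrate_enable() drops it and restores the
 * task's full affinity mask if it had been narrowed while migration was
 * disabled.  Roughly:
 *
 *	migrate_disable();
 *	... per-CPU state can be used without the task being migrated ...
 *	migrate_enable();
 */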
2113void migrate_disable(void)
2114{
2115 struct task_struct *p = current;
2116
2117 if (p->migration_disabled) {
2118 p->migration_disabled++;
2119 return;
2120 }
2121
2122 preempt_disable();
2123 this_rq()->nr_pinned++;
2124 p->migration_disabled = 1;
2125 preempt_enable();
2126}
2127EXPORT_SYMBOL_GPL(migrate_disable);
2128
2129void migrate_enable(void)
2130{
2131 struct task_struct *p = current;
2132
2133 if (p->migration_disabled > 1) {
2134 p->migration_disabled--;
2135 return;
2136 }
2137
2138
2139
2140
2141
2142 preempt_disable();
2143 if (p->cpus_ptr != &p->cpus_mask)
2144 __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
2145
2146
2147
2148
2149
2150 barrier();
2151 p->migration_disabled = 0;
2152 this_rq()->nr_pinned--;
2153 preempt_enable();
2154}
2155EXPORT_SYMBOL_GPL(migrate_enable);
2156
2157static inline bool rq_has_pinned_tasks(struct rq *rq)
2158{
2159 return rq->nr_pinned;
2160}
2161
2162
2163
2164
2165
2166static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2167{
2168
2169 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2170 return false;
2171
2172
2173 if (is_migration_disabled(p))
2174 return cpu_online(cpu);
2175
2176
2177 if (!(p->flags & PF_KTHREAD))
2178 return cpu_active(cpu);
2179
2180
2181 if (kthread_is_per_cpu(p))
2182 return cpu_online(cpu);
2183
2184
2185 if (cpu_dying(cpu))
2186 return false;
2187
2188
2189 return cpu_online(cpu);
2190}
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
2212 struct task_struct *p, int new_cpu)
2213{
2214 lockdep_assert_rq_held(rq);
2215
2216 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2217 set_task_cpu(p, new_cpu);
2218 rq_unlock(rq, rf);
2219
2220 rq = cpu_rq(new_cpu);
2221
2222 rq_lock(rq, rf);
2223 BUG_ON(task_cpu(p) != new_cpu);
2224 activate_task(rq, p, 0);
2225 check_preempt_curr(rq, p, 0);
2226
2227 return rq;
2228}
2229
2230struct migration_arg {
2231 struct task_struct *task;
2232 int dest_cpu;
2233 struct set_affinity_pending *pending;
2234};
2235
2236
2237
2238
2239
2240struct set_affinity_pending {
2241 refcount_t refs;
2242 unsigned int stop_pending;
2243 struct completion done;
2244 struct cpu_stop_work stop_work;
2245 struct migration_arg arg;
2246};
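/*
 * When an affinity change cannot be completed immediately (the task is
 * running, or has migration disabled), the waiter installs a
 * set_affinity_pending on the task and dispatches the CPU stopper;
 * migration_cpu_stop() then moves the task off the now-disallowed CPU,
 * re-issuing itself on the task's new CPU if it moved in the meantime, and
 * finally completes ->done, which affine_move_task() waits on.
 */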
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
2258 struct task_struct *p, int dest_cpu)
2259{
2260
2261 if (!is_cpu_allowed(p, dest_cpu))
2262 return rq;
2263
2264 update_rq_clock(rq);
2265 rq = move_queued_task(rq, rf, p, dest_cpu);
2266
2267 return rq;
2268}
2269
2270
2271
2272
2273
2274
2275static int migration_cpu_stop(void *data)
2276{
2277 struct migration_arg *arg = data;
2278 struct set_affinity_pending *pending = arg->pending;
2279 struct task_struct *p = arg->task;
2280 struct rq *rq = this_rq();
2281 bool complete = false;
2282 struct rq_flags rf;
2283
2284
2285
2286
2287
2288 local_irq_save(rf.flags);
2289
2290
2291
2292
2293
2294 flush_smp_call_function_from_idle();
2295
2296 raw_spin_lock(&p->pi_lock);
2297 rq_lock(rq, &rf);
2298
2299
2300
2301
2302
2303 WARN_ON_ONCE(pending && pending != p->migration_pending);
2304
2305
2306
2307
2308
2309
2310 if (task_rq(p) == rq) {
2311 if (is_migration_disabled(p))
2312 goto out;
2313
2314 if (pending) {
2315 p->migration_pending = NULL;
2316 complete = true;
2317
2318 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2319 goto out;
2320 }
2321
2322 if (task_on_rq_queued(p))
2323 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2324 else
2325 p->wake_cpu = arg->dest_cpu;
2326
2327
2328
2329
2330
2331
2332
2333
2334 } else if (pending) {
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349 if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2350 p->migration_pending = NULL;
2351 complete = true;
2352 goto out;
2353 }
2354
2355
2356
2357
2358
2359
2360 WARN_ON_ONCE(!pending->stop_pending);
2361 task_rq_unlock(rq, p, &rf);
2362 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2363 &pending->arg, &pending->stop_work);
2364 return 0;
2365 }
2366out:
2367 if (pending)
2368 pending->stop_pending = false;
2369 task_rq_unlock(rq, p, &rf);
2370
2371 if (complete)
2372 complete_all(&pending->done);
2373
2374 return 0;
2375}
2376
2377int push_cpu_stop(void *arg)
2378{
2379 struct rq *lowest_rq = NULL, *rq = this_rq();
2380 struct task_struct *p = arg;
2381
2382 raw_spin_lock_irq(&p->pi_lock);
2383 raw_spin_rq_lock(rq);
2384
2385 if (task_rq(p) != rq)
2386 goto out_unlock;
2387
2388 if (is_migration_disabled(p)) {
2389 p->migration_flags |= MDF_PUSH;
2390 goto out_unlock;
2391 }
2392
2393 p->migration_flags &= ~MDF_PUSH;
2394
2395 if (p->sched_class->find_lock_rq)
2396 lowest_rq = p->sched_class->find_lock_rq(p, rq);
2397
2398 if (!lowest_rq)
2399 goto out_unlock;
2400
2401
2402 if (task_rq(p) == rq) {
2403 deactivate_task(rq, p, 0);
2404 set_task_cpu(p, lowest_rq->cpu);
2405 activate_task(lowest_rq, p, 0);
2406 resched_curr(lowest_rq);
2407 }
2408
2409 double_unlock_balance(rq, lowest_rq);
2410
2411out_unlock:
2412 rq->push_busy = false;
2413 raw_spin_rq_unlock(rq);
2414 raw_spin_unlock_irq(&p->pi_lock);
2415
2416 put_task_struct(p);
2417 return 0;
2418}
2419
2420
2421
2422
2423
2424void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2425{
2426 if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2427 p->cpus_ptr = new_mask;
2428 return;
2429 }
2430
2431 cpumask_copy(&p->cpus_mask, new_mask);
2432 p->nr_cpus_allowed = cpumask_weight(new_mask);
2433}
2434
2435static void
2436__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2437{
2438 struct rq *rq = task_rq(p);
2439 bool queued, running;
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453 if (flags & SCA_MIGRATE_DISABLE)
2454 SCHED_WARN_ON(!p->on_cpu);
2455 else
2456 lockdep_assert_held(&p->pi_lock);
2457
2458 queued = task_on_rq_queued(p);
2459 running = task_current(rq, p);
2460
2461 if (queued) {
2462
2463
2464
2465
2466 lockdep_assert_rq_held(rq);
2467 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2468 }
2469 if (running)
2470 put_prev_task(rq, p);
2471
2472 p->sched_class->set_cpus_allowed(p, new_mask, flags);
2473
2474 if (queued)
2475 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2476 if (running)
2477 set_next_task(rq, p);
2478}
2479
2480void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2481{
2482 __do_set_cpus_allowed(p, new_mask, 0);
2483}
2484
2560
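/*
 * affine_move_task() handles a task that may have to move after an affinity
 * change.  If the task already sits on an allowed CPU, any pending request
 * is completed (and, for a migrate_enable() restore, a previously requested
 * push may be handed to the stopper).  Otherwise a set_affinity_pending is
 * installed (or an existing one reused) and the migration is delegated to
 * migration_cpu_stop(); the caller then sleeps on ->done, except in the
 * SCA_MIGRATE_ENABLE case, which must not block.
 */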
2561static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2562 int dest_cpu, unsigned int flags)
2563{
2564 struct set_affinity_pending my_pending = { }, *pending = NULL;
2565 bool stop_pending, complete = false;
2566
2567
2568 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2569 struct task_struct *push_task = NULL;
2570
2571 if ((flags & SCA_MIGRATE_ENABLE) &&
2572 (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2573 rq->push_busy = true;
2574 push_task = get_task_struct(p);
2575 }
2576
2577
2578
2579
2580
2581 pending = p->migration_pending;
2582 if (pending && !pending->stop_pending) {
2583 p->migration_pending = NULL;
2584 complete = true;
2585 }
2586
2587 task_rq_unlock(rq, p, rf);
2588
2589 if (push_task) {
2590 stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2591 p, &rq->push_work);
2592 }
2593
2594 if (complete)
2595 complete_all(&pending->done);
2596
2597 return 0;
2598 }
2599
2600 if (!(flags & SCA_MIGRATE_ENABLE)) {
2601
2602 if (!p->migration_pending) {
2603
2604 refcount_set(&my_pending.refs, 1);
2605 init_completion(&my_pending.done);
2606 my_pending.arg = (struct migration_arg) {
2607 .task = p,
2608 .dest_cpu = dest_cpu,
2609 .pending = &my_pending,
2610 };
2611
2612 p->migration_pending = &my_pending;
2613 } else {
2614 pending = p->migration_pending;
2615 refcount_inc(&pending->refs);
2616
2617
2618
2619
2620
2621
2622
2623
2624 pending->arg.dest_cpu = dest_cpu;
2625 }
2626 }
2627 pending = p->migration_pending;
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640 if (WARN_ON_ONCE(!pending)) {
2641 task_rq_unlock(rq, p, rf);
2642 return -EINVAL;
2643 }
2644
2645 if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
2646
2647
2648
2649
2650
2651 stop_pending = pending->stop_pending;
2652 if (!stop_pending)
2653 pending->stop_pending = true;
2654
2655 if (flags & SCA_MIGRATE_ENABLE)
2656 p->migration_flags &= ~MDF_PUSH;
2657
2658 task_rq_unlock(rq, p, rf);
2659
2660 if (!stop_pending) {
2661 stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2662 &pending->arg, &pending->stop_work);
2663 }
2664
2665 if (flags & SCA_MIGRATE_ENABLE)
2666 return 0;
2667 } else {
2668
2669 if (!is_migration_disabled(p)) {
2670 if (task_on_rq_queued(p))
2671 rq = move_queued_task(rq, rf, p, dest_cpu);
2672
2673 if (!pending->stop_pending) {
2674 p->migration_pending = NULL;
2675 complete = true;
2676 }
2677 }
2678 task_rq_unlock(rq, p, rf);
2679
2680 if (complete)
2681 complete_all(&pending->done);
2682 }
2683
2684 wait_for_completion(&pending->done);
2685
2686 if (refcount_dec_and_test(&pending->refs))
2687 wake_up_var(&pending->refs);
2688
2689
2690
2691
2692
2693 wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2694
2695
2696 WARN_ON_ONCE(my_pending.stop_pending);
2697
2698 return 0;
2699}
2709
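/*
 * Change a task's CPU affinity to @new_mask.  If the task is running on, or
 * queued on, a CPU the new mask excludes, it is migrated to an allowed CPU
 * (waiting for that migration where required).  Kernel threads and
 * migration-disabled tasks are validated against cpu_online_mask rather
 * than cpu_active_mask.
 */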
2710static int __set_cpus_allowed_ptr(struct task_struct *p,
2711 const struct cpumask *new_mask,
2712 u32 flags)
2713{
2714 const struct cpumask *cpu_valid_mask = cpu_active_mask;
2715 unsigned int dest_cpu;
2716 struct rq_flags rf;
2717 struct rq *rq;
2718 int ret = 0;
2719
2720 rq = task_rq_lock(p, &rf);
2721 update_rq_clock(rq);
2722
2723 if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734 cpu_valid_mask = cpu_online_mask;
2735 }
2736
2737
2738
2739
2740
2741 if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2742 ret = -EINVAL;
2743 goto out;
2744 }
2745
2746 if (!(flags & SCA_MIGRATE_ENABLE)) {
2747 if (cpumask_equal(&p->cpus_mask, new_mask))
2748 goto out;
2749
2750 if (WARN_ON_ONCE(p == current &&
2751 is_migration_disabled(p) &&
2752 !cpumask_test_cpu(task_cpu(p), new_mask))) {
2753 ret = -EBUSY;
2754 goto out;
2755 }
2756 }
2757
2758
2759
2760
2761
2762
2763 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2764 if (dest_cpu >= nr_cpu_ids) {
2765 ret = -EINVAL;
2766 goto out;
2767 }
2768
2769 __do_set_cpus_allowed(p, new_mask, flags);
2770
2771 return affine_move_task(rq, p, &rf, dest_cpu, flags);
2772
2773out:
2774 task_rq_unlock(rq, p, &rf);
2775
2776 return ret;
2777}
2778
2779int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2780{
2781 return __set_cpus_allowed_ptr(p, new_mask, 0);
2782}
2783EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2784
2785void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2786{
2787#ifdef CONFIG_SCHED_DEBUG
2788 unsigned int state = READ_ONCE(p->__state);
2789
2790
2791
2792
2793
2794 WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
2795
2796
2797
2798
2799
2800
2801 WARN_ON_ONCE(state == TASK_RUNNING &&
2802 p->sched_class == &fair_sched_class &&
2803 (p->on_rq && !task_on_rq_migrating(p)));
2804
2805#ifdef CONFIG_LOCKDEP
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2817 lockdep_is_held(__rq_lockp(task_rq(p)))));
2818#endif
2819
2820
2821
2822 WARN_ON_ONCE(!cpu_online(new_cpu));
2823
2824 WARN_ON_ONCE(is_migration_disabled(p));
2825#endif
2826
2827 trace_sched_migrate_task(p, new_cpu);
2828
2829 if (task_cpu(p) != new_cpu) {
2830 if (p->sched_class->migrate_task_rq)
2831 p->sched_class->migrate_task_rq(p, new_cpu);
2832 p->se.nr_migrations++;
2833 rseq_migrate(p);
2834 perf_event_task_migrate(p);
2835 }
2836
2837 __set_task_cpu(p, new_cpu);
2838}
2839
2840#ifdef CONFIG_NUMA_BALANCING
2841static void __migrate_swap_task(struct task_struct *p, int cpu)
2842{
2843 if (task_on_rq_queued(p)) {
2844 struct rq *src_rq, *dst_rq;
2845 struct rq_flags srf, drf;
2846
2847 src_rq = task_rq(p);
2848 dst_rq = cpu_rq(cpu);
2849
2850 rq_pin_lock(src_rq, &srf);
2851 rq_pin_lock(dst_rq, &drf);
2852
2853 deactivate_task(src_rq, p, 0);
2854 set_task_cpu(p, cpu);
2855 activate_task(dst_rq, p, 0);
2856 check_preempt_curr(dst_rq, p, 0);
2857
2858 rq_unpin_lock(dst_rq, &drf);
2859 rq_unpin_lock(src_rq, &srf);
2860
2861 } else {
2862
2863
2864
2865
2866
2867 p->wake_cpu = cpu;
2868 }
2869}
2870
2871struct migration_swap_arg {
2872 struct task_struct *src_task, *dst_task;
2873 int src_cpu, dst_cpu;
2874};
2875
2876static int migrate_swap_stop(void *data)
2877{
2878 struct migration_swap_arg *arg = data;
2879 struct rq *src_rq, *dst_rq;
2880 int ret = -EAGAIN;
2881
2882 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
2883 return -EAGAIN;
2884
2885 src_rq = cpu_rq(arg->src_cpu);
2886 dst_rq = cpu_rq(arg->dst_cpu);
2887
2888 double_raw_lock(&arg->src_task->pi_lock,
2889 &arg->dst_task->pi_lock);
2890 double_rq_lock(src_rq, dst_rq);
2891
2892 if (task_cpu(arg->dst_task) != arg->dst_cpu)
2893 goto unlock;
2894
2895 if (task_cpu(arg->src_task) != arg->src_cpu)
2896 goto unlock;
2897
2898 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
2899 goto unlock;
2900
2901 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
2902 goto unlock;
2903
2904 __migrate_swap_task(arg->src_task, arg->dst_cpu);
2905 __migrate_swap_task(arg->dst_task, arg->src_cpu);
2906
2907 ret = 0;
2908
2909unlock:
2910 double_rq_unlock(src_rq, dst_rq);
2911 raw_spin_unlock(&arg->dst_task->pi_lock);
2912 raw_spin_unlock(&arg->src_task->pi_lock);
2913
2914 return ret;
2915}
2916
2917
2918
2919
2920int migrate_swap(struct task_struct *cur, struct task_struct *p,
2921 int target_cpu, int curr_cpu)
2922{
2923 struct migration_swap_arg arg;
2924 int ret = -EINVAL;
2925
2926 arg = (struct migration_swap_arg){
2927 .src_task = cur,
2928 .src_cpu = curr_cpu,
2929 .dst_task = p,
2930 .dst_cpu = target_cpu,
2931 };
2932
2933 if (arg.src_cpu == arg.dst_cpu)
2934 goto out;
2935
2936
2937
2938
2939
2940 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
2941 goto out;
2942
2943 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
2944 goto out;
2945
2946 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
2947 goto out;
2948
2949 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2950 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2951
2952out:
2953 return ret;
2954}
2955#endif
2956
2972
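/*
 * wait_task_inactive() waits until @p is no longer running on any CPU.  If
 * @match_state is non-zero the wait is abandoned (returning 0) as soon as
 * the task's state stops matching it; otherwise the return value is the
 * task's voluntary context-switch count with LONG_MIN or'ed in (so it is
 * never 0), which callers can compare to detect whether the task ran in
 * between.
 */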
2973unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
2974{
2975 int running, queued;
2976 struct rq_flags rf;
2977 unsigned long ncsw;
2978 struct rq *rq;
2979
2980 for (;;) {
2981
2982
2983
2984
2985
2986
2987 rq = task_rq(p);
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000 while (task_running(rq, p)) {
3001 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3002 return 0;
3003 cpu_relax();
3004 }
3005
3006
3007
3008
3009
3010
3011 rq = task_rq_lock(p, &rf);
3012 trace_sched_wait_task(p);
3013 running = task_running(rq, p);
3014 queued = task_on_rq_queued(p);
3015 ncsw = 0;
3016 if (!match_state || READ_ONCE(p->__state) == match_state)
3017 ncsw = p->nvcsw | LONG_MIN;
3018 task_rq_unlock(rq, p, &rf);
3019
3020
3021
3022
3023 if (unlikely(!ncsw))
3024 break;
3025
3026
3027
3028
3029
3030
3031
3032 if (unlikely(running)) {
3033 cpu_relax();
3034 continue;
3035 }
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046 if (unlikely(queued)) {
3047 ktime_t to = NSEC_PER_SEC / HZ;
3048
3049 set_current_state(TASK_UNINTERRUPTIBLE);
3050 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
3051 continue;
3052 }
3053
3054
3055
3056
3057
3058
3059 break;
3060 }
3061
3062 return ncsw;
3063}
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078void kick_process(struct task_struct *p)
3079{
3080 int cpu;
3081
3082 preempt_disable();
3083 cpu = task_cpu(p);
3084 if ((cpu != smp_processor_id()) && task_curr(p))
3085 smp_send_reschedule(cpu);
3086 preempt_enable();
3087}
3088EXPORT_SYMBOL_GPL(kick_process);
3089
3111
3112static int select_fallback_rq(int cpu, struct task_struct *p)
3113{
3114 int nid = cpu_to_node(cpu);
3115 const struct cpumask *nodemask = NULL;
3116 enum { cpuset, possible, fail } state = cpuset;
3117 int dest_cpu;
3118
3119
3120
3121
3122
3123
3124 if (nid != -1) {
3125 nodemask = cpumask_of_node(nid);
3126
3127
3128 for_each_cpu(dest_cpu, nodemask) {
3129 if (!cpu_active(dest_cpu))
3130 continue;
3131 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
3132 return dest_cpu;
3133 }
3134 }
3135
3136 for (;;) {
3137
3138 for_each_cpu(dest_cpu, p->cpus_ptr) {
3139 if (!is_cpu_allowed(p, dest_cpu))
3140 continue;
3141
3142 goto out;
3143 }
3144
3145
3146 switch (state) {
3147 case cpuset:
3148 if (IS_ENABLED(CONFIG_CPUSETS)) {
3149 cpuset_cpus_allowed_fallback(p);
3150 state = possible;
3151 break;
3152 }
3153 fallthrough;
3154 case possible:
3155
3156
3157
3158
3159
3160
3161 do_set_cpus_allowed(p, cpu_possible_mask);
3162 state = fail;
3163 break;
3164
3165 case fail:
3166 BUG();
3167 break;
3168 }
3169 }
3170
3171out:
3172 if (state != cpuset) {
3173
3174
3175
3176
3177
3178 if (p->mm && printk_ratelimit()) {
3179 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3180 task_pid_nr(p), p->comm, cpu);
3181 }
3182 }
3183
3184 return dest_cpu;
3185}
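/*
 * Summary (added): the fallback order above is
 *   1) an active CPU on the task's NUMA node that is still in cpus_ptr,
 *   2) any CPU allowed by is_cpu_allowed(),
 *   3) the cpuset fallback mask,
 *   4) cpu_possible_mask as the last resort,
 * and only if even that yields nothing do we BUG(). A ratelimited message
 * is printed whenever the task had to be moved outside its original
 * affinity mask.
 */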
3186
3187
3188
3189
3190static inline
3191int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3192{
3193 lockdep_assert_held(&p->pi_lock);
3194
3195 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3196 cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3197 else
3198 cpu = cpumask_any(p->cpus_ptr);
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210 if (unlikely(!is_cpu_allowed(p, cpu)))
3211 cpu = select_fallback_rq(task_cpu(p), p);
3212
3213 return cpu;
3214}
3215
3216void sched_set_stop_task(int cpu, struct task_struct *stop)
3217{
3218 static struct lock_class_key stop_pi_lock;
3219 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3220 struct task_struct *old_stop = cpu_rq(cpu)->stop;
3221
3222 if (stop) {
3223
3224
3225
3226
3227
3228
3229
3230
3231 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3232
3233 stop->sched_class = &stop_sched_class;
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247 lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3248 }
3249
3250 cpu_rq(cpu)->stop = stop;
3251
3252 if (old_stop) {
3253
3254
3255
3256
3257 old_stop->sched_class = &rt_sched_class;
3258 }
3259}
3260
3261#else
3262
3263static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3264 const struct cpumask *new_mask,
3265 u32 flags)
3266{
3267 return set_cpus_allowed_ptr(p, new_mask);
3268}
3269
3270static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3271
3272static inline bool rq_has_pinned_tasks(struct rq *rq)
3273{
3274 return false;
3275}
3276
3277#endif
3278
3279static void
3280ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3281{
3282 struct rq *rq;
3283
3284 if (!schedstat_enabled())
3285 return;
3286
3287 rq = this_rq();
3288
3289#ifdef CONFIG_SMP
3290 if (cpu == rq->cpu) {
3291 __schedstat_inc(rq->ttwu_local);
3292 __schedstat_inc(p->se.statistics.nr_wakeups_local);
3293 } else {
3294 struct sched_domain *sd;
3295
3296 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
3297 rcu_read_lock();
3298 for_each_domain(rq->cpu, sd) {
3299 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3300 __schedstat_inc(sd->ttwu_wake_remote);
3301 break;
3302 }
3303 }
3304 rcu_read_unlock();
3305 }
3306
3307 if (wake_flags & WF_MIGRATED)
3308 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
3309#endif
3310
3311 __schedstat_inc(rq->ttwu_count);
3312 __schedstat_inc(p->se.statistics.nr_wakeups);
3313
3314 if (wake_flags & WF_SYNC)
3315 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
3316}
3317
3318
3319
3320
3321static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3322 struct rq_flags *rf)
3323{
3324 check_preempt_curr(rq, p, wake_flags);
3325 WRITE_ONCE(p->__state, TASK_RUNNING);
3326 trace_sched_wakeup(p);
3327
3328#ifdef CONFIG_SMP
3329 if (p->sched_class->task_woken) {
3330
3331
3332
3333
3334 rq_unpin_lock(rq, rf);
3335 p->sched_class->task_woken(rq, p);
3336 rq_repin_lock(rq, rf);
3337 }
3338
3339 if (rq->idle_stamp) {
3340 u64 delta = rq_clock(rq) - rq->idle_stamp;
3341 u64 max = 2*rq->max_idle_balance_cost;
3342
3343 update_avg(&rq->avg_idle, delta);
3344
3345 if (rq->avg_idle > max)
3346 rq->avg_idle = max;
3347
3348 rq->wake_stamp = jiffies;
3349 rq->wake_avg_idle = rq->avg_idle / 2;
3350
3351 rq->idle_stamp = 0;
3352 }
3353#endif
3354}
3355
3356static void
3357ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3358 struct rq_flags *rf)
3359{
3360 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3361
3362 lockdep_assert_rq_held(rq);
3363
3364 if (p->sched_contributes_to_load)
3365 rq->nr_uninterruptible--;
3366
3367#ifdef CONFIG_SMP
3368 if (wake_flags & WF_MIGRATED)
3369 en_flags |= ENQUEUE_MIGRATED;
3370 else
3371#endif
3372 if (p->in_iowait) {
3373 delayacct_blkio_end(p);
3374 atomic_dec(&task_rq(p)->nr_iowait);
3375 }
3376
3377 activate_task(rq, p, en_flags);
3378 ttwu_do_wakeup(rq, p, wake_flags, rf);
3379}
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406static int ttwu_runnable(struct task_struct *p, int wake_flags)
3407{
3408 struct rq_flags rf;
3409 struct rq *rq;
3410 int ret = 0;
3411
3412 rq = __task_rq_lock(p, &rf);
3413 if (task_on_rq_queued(p)) {
3414
3415 update_rq_clock(rq);
3416 ttwu_do_wakeup(rq, p, wake_flags, &rf);
3417 ret = 1;
3418 }
3419 __task_rq_unlock(rq, &rf);
3420
3421 return ret;
3422}
3423
3424#ifdef CONFIG_SMP
3425void sched_ttwu_pending(void *arg)
3426{
3427 struct llist_node *llist = arg;
3428 struct rq *rq = this_rq();
3429 struct task_struct *p, *t;
3430 struct rq_flags rf;
3431
3432 if (!llist)
3433 return;
3434
3435
3436
3437
3438
3439
3440 WRITE_ONCE(rq->ttwu_pending, 0);
3441
3442 rq_lock_irqsave(rq, &rf);
3443 update_rq_clock(rq);
3444
3445 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3446 if (WARN_ON_ONCE(p->on_cpu))
3447 smp_cond_load_acquire(&p->on_cpu, !VAL);
3448
3449 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3450 set_task_cpu(p, cpu_of(rq));
3451
3452 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3453 }
3454
3455 rq_unlock_irqrestore(rq, &rf);
3456}
3457
3458void send_call_function_single_ipi(int cpu)
3459{
3460 struct rq *rq = cpu_rq(cpu);
3461
3462 if (!set_nr_if_polling(rq->idle))
3463 arch_send_call_function_single_ipi(cpu);
3464 else
3465 trace_sched_wake_idle_without_ipi(cpu);
3466}
3467
3468
3469
3470
3471
3472
3473
3474static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3475{
3476 struct rq *rq = cpu_rq(cpu);
3477
3478 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3479
3480 WRITE_ONCE(rq->ttwu_pending, 1);
3481 __smp_call_single_queue(cpu, &p->wake_entry.llist);
3482}
3483
3484void wake_up_if_idle(int cpu)
3485{
3486 struct rq *rq = cpu_rq(cpu);
3487 struct rq_flags rf;
3488
3489 rcu_read_lock();
3490
3491 if (!is_idle_task(rcu_dereference(rq->curr)))
3492 goto out;
3493
3494 if (set_nr_if_polling(rq->idle)) {
3495 trace_sched_wake_idle_without_ipi(cpu);
3496 } else {
3497 rq_lock_irqsave(rq, &rf);
3498 if (is_idle_task(rq->curr))
3499 smp_send_reschedule(cpu);
3500
3501 rq_unlock_irqrestore(rq, &rf);
3502 }
3503
3504out:
3505 rcu_read_unlock();
3506}
3507
3508bool cpus_share_cache(int this_cpu, int that_cpu)
3509{
3510 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3511}
3512
3513static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3514{
3515
3516
3517
3518
3519 if (!cpu_active(cpu))
3520 return false;
3521
3522
3523
3524
3525
3526 if (!cpus_share_cache(smp_processor_id(), cpu))
3527 return true;
3528
3529
3530
3531
3532
3533
3534
3535 if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3536 return true;
3537
3538 return false;
3539}
3540
3541static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3542{
3543 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3544 if (WARN_ON_ONCE(cpu == smp_processor_id()))
3545 return false;
3546
3547 sched_clock_cpu(cpu);
3548 __ttwu_queue_wakelist(p, cpu, wake_flags);
3549 return true;
3550 }
3551
3552 return false;
3553}
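/*
 * Added note: the wakelist path above trades taking the remote rq lock for
 * an IPI. It is gated by the TTWU_QUEUE sched_feat, and ttwu_queue_cond()
 * only accepts CPUs that do not share a cache with us, or a target rq that
 * is about to go idle (WF_ON_CPU with at most one runnable task), since
 * grabbing the remote lock directly is cheaper when the cache is shared.
 */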
3554
3555#else
3556
3557static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3558{
3559 return false;
3560}
3561
3562#endif
3563
3564static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3565{
3566 struct rq *rq = cpu_rq(cpu);
3567 struct rq_flags rf;
3568
3569 if (ttwu_queue_wakelist(p, cpu, wake_flags))
3570 return;
3571
3572 rq_lock(rq, &rf);
3573 update_rq_clock(rq);
3574 ttwu_do_activate(rq, p, wake_flags, &rf);
3575 rq_unlock(rq, &rf);
3576}
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698static int
3699try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3700{
3701 unsigned long flags;
3702 int cpu, success = 0;
3703
3704 preempt_disable();
3705 if (p == current) {
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717 if (!(READ_ONCE(p->__state) & state))
3718 goto out;
3719
3720 success = 1;
3721 trace_sched_waking(p);
3722 WRITE_ONCE(p->__state, TASK_RUNNING);
3723 trace_sched_wakeup(p);
3724 goto out;
3725 }
3726
3727
3728
3729
3730
3731
3732
3733 raw_spin_lock_irqsave(&p->pi_lock, flags);
3734 smp_mb__after_spinlock();
3735 if (!(READ_ONCE(p->__state) & state))
3736 goto unlock;
3737
3738 trace_sched_waking(p);
3739
3740
3741 success = 1;
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765 smp_rmb();
3766 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3767 goto unlock;
3768
3769#ifdef CONFIG_SMP
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793 smp_acquire__after_ctrl_dep();
3794
3795
3796
3797
3798
3799
3800
3801 WRITE_ONCE(p->__state, TASK_WAKING);
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822 if (smp_load_acquire(&p->on_cpu) &&
3823 ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3824 goto unlock;
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835 smp_cond_load_acquire(&p->on_cpu, !VAL);
3836
3837 cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
3838 if (task_cpu(p) != cpu) {
3839 if (p->in_iowait) {
3840 delayacct_blkio_end(p);
3841 atomic_dec(&task_rq(p)->nr_iowait);
3842 }
3843
3844 wake_flags |= WF_MIGRATED;
3845 psi_ttwu_dequeue(p);
3846 set_task_cpu(p, cpu);
3847 }
3848#else
3849 cpu = task_cpu(p);
3850#endif
3851
3852 ttwu_queue(p, cpu, wake_flags);
3853unlock:
3854 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3855out:
3856 if (success)
3857 ttwu_stat(p, task_cpu(p), wake_flags);
3858 preempt_enable();
3859
3860 return success;
3861}
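/*
 * Added summary of the paths above:
 *  - p == current: just fix up p->__state under preempt_disable().
 *  - p still queued on a rq: ttwu_runnable() re-checks under the rq lock
 *    and simply marks it TASK_RUNNING again.
 *  - p still on_cpu: optionally queue the wakeup on the remote CPU's wake
 *    list (WF_ON_CPU) instead of spinning on ->on_cpu here.
 *  - otherwise: wait for ->on_cpu to drop, pick a CPU with
 *    select_task_rq(), migrate if needed, and enqueue via ttwu_queue().
 */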
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3882{
3883 struct rq_flags rf;
3884 bool ret = false;
3885 struct rq *rq;
3886
3887 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3888 if (p->on_rq) {
3889 rq = __task_rq_lock(p, &rf);
3890 if (task_rq(p) == rq)
3891 ret = func(p, arg);
3892 rq_unlock(rq, &rf);
3893 } else {
3894 switch (READ_ONCE(p->__state)) {
3895 case TASK_RUNNING:
3896 case TASK_WAKING:
3897 break;
3898 default:
3899 smp_rmb();
3900 if (!p->on_rq)
3901 ret = func(p, arg);
3902 }
3903 }
3904 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3905 return ret;
3906}
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919int wake_up_process(struct task_struct *p)
3920{
3921 return try_to_wake_up(p, TASK_NORMAL, 0);
3922}
3923EXPORT_SYMBOL(wake_up_process);
3924
3925int wake_up_state(struct task_struct *p, unsigned int state)
3926{
3927 return try_to_wake_up(p, state, 0);
3928}
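/*
 * Illustrative example (added, 'consumer_task' is a hypothetical task
 * pointer): a producer waking a kthread that sleeps in TASK_INTERRUPTIBLE
 * would simply use:
 *
 *	wake_up_process(consumer_task);
 *
 * while a caller that must only wake a task sleeping in one specific state
 * (and leave it alone otherwise) uses the narrower helper:
 *
 *	wake_up_state(p, TASK_INTERRUPTIBLE);
 *
 * Both return 1 if the task was actually woken and 0 if it was already
 * running.
 */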
3929
3930
3931
3932
3933
3934
3935
3936static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3937{
3938 p->on_rq = 0;
3939
3940 p->se.on_rq = 0;
3941 p->se.exec_start = 0;
3942 p->se.sum_exec_runtime = 0;
3943 p->se.prev_sum_exec_runtime = 0;
3944 p->se.nr_migrations = 0;
3945 p->se.vruntime = 0;
3946 INIT_LIST_HEAD(&p->se.group_node);
3947
3948#ifdef CONFIG_FAIR_GROUP_SCHED
3949 p->se.cfs_rq = NULL;
3950#endif
3951
3952#ifdef CONFIG_SCHEDSTATS
3953
3954 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
3955#endif
3956
3957 RB_CLEAR_NODE(&p->dl.rb_node);
3958 init_dl_task_timer(&p->dl);
3959 init_dl_inactive_task_timer(&p->dl);
3960 __dl_clear_params(p);
3961
3962 INIT_LIST_HEAD(&p->rt.run_list);
3963 p->rt.timeout = 0;
3964 p->rt.time_slice = sched_rr_timeslice;
3965 p->rt.on_rq = 0;
3966 p->rt.on_list = 0;
3967
3968#ifdef CONFIG_PREEMPT_NOTIFIERS
3969 INIT_HLIST_HEAD(&p->preempt_notifiers);
3970#endif
3971
3972#ifdef CONFIG_COMPACTION
3973 p->capture_control = NULL;
3974#endif
3975 init_numa_balancing(clone_flags, p);
3976#ifdef CONFIG_SMP
3977 p->wake_entry.u_flags = CSD_TYPE_TTWU;
3978 p->migration_pending = NULL;
3979#endif
3980}
3981
3982DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3983
3984#ifdef CONFIG_NUMA_BALANCING
3985
3986void set_numabalancing_state(bool enabled)
3987{
3988 if (enabled)
3989 static_branch_enable(&sched_numa_balancing);
3990 else
3991 static_branch_disable(&sched_numa_balancing);
3992}
3993
3994#ifdef CONFIG_PROC_SYSCTL
3995int sysctl_numa_balancing(struct ctl_table *table, int write,
3996 void *buffer, size_t *lenp, loff_t *ppos)
3997{
3998 struct ctl_table t;
3999 int err;
4000 int state = static_branch_likely(&sched_numa_balancing);
4001
4002 if (write && !capable(CAP_SYS_ADMIN))
4003 return -EPERM;
4004
4005 t = *table;
4006 t.data = &state;
4007 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4008 if (err < 0)
4009 return err;
4010 if (write)
4011 set_numabalancing_state(state);
4012 return err;
4013}
4014#endif
4015#endif
4016
4017#ifdef CONFIG_SCHEDSTATS
4018
4019DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4020
4021static void set_schedstats(bool enabled)
4022{
4023 if (enabled)
4024 static_branch_enable(&sched_schedstats);
4025 else
4026 static_branch_disable(&sched_schedstats);
4027}
4028
4029void force_schedstat_enabled(void)
4030{
4031 if (!schedstat_enabled()) {
4032 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4033 static_branch_enable(&sched_schedstats);
4034 }
4035}
4036
4037static int __init setup_schedstats(char *str)
4038{
4039 int ret = 0;
4040 if (!str)
4041 goto out;
4042
4043 if (!strcmp(str, "enable")) {
4044 set_schedstats(true);
4045 ret = 1;
4046 } else if (!strcmp(str, "disable")) {
4047 set_schedstats(false);
4048 ret = 1;
4049 }
4050out:
4051 if (!ret)
4052 pr_warn("Unable to parse schedstats=\n");
4053
4054 return ret;
4055}
4056__setup("schedstats=", setup_schedstats);
4057
4058#ifdef CONFIG_PROC_SYSCTL
4059int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4060 size_t *lenp, loff_t *ppos)
4061{
4062 struct ctl_table t;
4063 int err;
4064 int state = static_branch_likely(&sched_schedstats);
4065
4066 if (write && !capable(CAP_SYS_ADMIN))
4067 return -EPERM;
4068
4069 t = *table;
4070 t.data = &state;
4071 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4072 if (err < 0)
4073 return err;
4074 if (write)
4075 set_schedstats(state);
4076 return err;
4077}
4078#endif
4079#endif
4080
4081
4082
4083
4084int sched_fork(unsigned long clone_flags, struct task_struct *p)
4085{
4086 unsigned long flags;
4087
4088 __sched_fork(clone_flags, p);
4089
4090
4091
4092
4093
4094 p->__state = TASK_NEW;
4095
4096
4097
4098
4099 p->prio = current->normal_prio;
4100
4101 uclamp_fork(p);
4102
4103
4104
4105
4106 if (unlikely(p->sched_reset_on_fork)) {
4107 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4108 p->policy = SCHED_NORMAL;
4109 p->static_prio = NICE_TO_PRIO(0);
4110 p->rt_priority = 0;
4111 } else if (PRIO_TO_NICE(p->static_prio) < 0)
4112 p->static_prio = NICE_TO_PRIO(0);
4113
4114 p->prio = p->normal_prio = p->static_prio;
4115 set_load_weight(p, false);
4116
4117
4118
4119
4120
4121 p->sched_reset_on_fork = 0;
4122 }
4123
4124 if (dl_prio(p->prio))
4125 return -EAGAIN;
4126 else if (rt_prio(p->prio))
4127 p->sched_class = &rt_sched_class;
4128 else
4129 p->sched_class = &fair_sched_class;
4130
4131 init_entity_runnable_average(&p->se);
4132
4133
4134
4135
4136
4137
4138
4139
4140 raw_spin_lock_irqsave(&p->pi_lock, flags);
4141 rseq_migrate(p);
4142
4143
4144
4145
4146 __set_task_cpu(p, smp_processor_id());
4147 if (p->sched_class->task_fork)
4148 p->sched_class->task_fork(p);
4149 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4150
4151#ifdef CONFIG_SCHED_INFO
4152 if (likely(sched_info_on()))
4153 memset(&p->sched_info, 0, sizeof(p->sched_info));
4154#endif
4155#if defined(CONFIG_SMP)
4156 p->on_cpu = 0;
4157#endif
4158 init_task_preempt_count(p);
4159#ifdef CONFIG_SMP
4160 plist_node_init(&p->pushable_tasks, MAX_PRIO);
4161 RB_CLEAR_NODE(&p->pushable_dl_tasks);
4162#endif
4163 return 0;
4164}
4165
4166void sched_post_fork(struct task_struct *p)
4167{
4168 uclamp_post_fork(p);
4169}
4170
4171unsigned long to_ratio(u64 period, u64 runtime)
4172{
4173 if (runtime == RUNTIME_INF)
4174 return BW_UNIT;
4175
4176
4177
4178
4179
4180
4181 if (period == 0)
4182 return 0;
4183
4184 return div64_u64(runtime << BW_SHIFT, period);
4185}
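/*
 * Worked example (added): to_ratio() returns runtime/period scaled by
 * BW_UNIT (1 << BW_SHIFT). Assuming BW_SHIFT == 20, a 10ms runtime within a
 * 100ms period gives
 *
 *	to_ratio(100 * NSEC_PER_MSEC, 10 * NSEC_PER_MSEC) == 104857
 *
 * which is ~10% of BW_UNIT (1048576), i.e. 10% of the CPU bandwidth.
 */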
4186
4187
4188
4189
4190
4191
4192
4193
4194void wake_up_new_task(struct task_struct *p)
4195{
4196 struct rq_flags rf;
4197 struct rq *rq;
4198
4199 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4200 WRITE_ONCE(p->__state, TASK_RUNNING);
4201#ifdef CONFIG_SMP
4202
4203
4204
4205
4206
4207
4208
4209
4210 p->recent_used_cpu = task_cpu(p);
4211 rseq_migrate(p);
4212 __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4213#endif
4214 rq = __task_rq_lock(p, &rf);
4215 update_rq_clock(rq);
4216 post_init_entity_util_avg(p);
4217
4218 activate_task(rq, p, ENQUEUE_NOCLOCK);
4219 trace_sched_wakeup_new(p);
4220 check_preempt_curr(rq, p, WF_FORK);
4221#ifdef CONFIG_SMP
4222 if (p->sched_class->task_woken) {
4223
4224
4225
4226
4227 rq_unpin_lock(rq, &rf);
4228 p->sched_class->task_woken(rq, p);
4229 rq_repin_lock(rq, &rf);
4230 }
4231#endif
4232 task_rq_unlock(rq, p, &rf);
4233}
4234
4235#ifdef CONFIG_PREEMPT_NOTIFIERS
4236
4237static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4238
4239void preempt_notifier_inc(void)
4240{
4241 static_branch_inc(&preempt_notifier_key);
4242}
4243EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4244
4245void preempt_notifier_dec(void)
4246{
4247 static_branch_dec(&preempt_notifier_key);
4248}
4249EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4250
4251
4252
4253
4254
4255void preempt_notifier_register(struct preempt_notifier *notifier)
4256{
4257 if (!static_branch_unlikely(&preempt_notifier_key))
4258 WARN(1, "registering preempt_notifier while notifiers disabled\n");
4259
4260 hlist_add_head(&notifier->link, &current->preempt_notifiers);
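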
4261}
4262EXPORT_SYMBOL_GPL(preempt_notifier_register);
4263
4264
4265
4266
4267
4268
4269
4270void preempt_notifier_unregister(struct preempt_notifier *notifier)
4271{
4272 hlist_del(&notifier->link);
4273}
4274EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
4275
4276static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4277{
4278 struct preempt_notifier *notifier;
4279
4280 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4281 notifier->ops->sched_in(notifier, raw_smp_processor_id());
4282}
4283
4284static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4285{
4286 if (static_branch_unlikely(&preempt_notifier_key))
4287 __fire_sched_in_preempt_notifiers(curr);
4288}
4289
4290static void
4291__fire_sched_out_preempt_notifiers(struct task_struct *curr,
4292 struct task_struct *next)
4293{
4294 struct preempt_notifier *notifier;
4295
4296 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4297 notifier->ops->sched_out(notifier, next);
4298}
4299
4300static __always_inline void
4301fire_sched_out_preempt_notifiers(struct task_struct *curr,
4302 struct task_struct *next)
4303{
4304 if (static_branch_unlikely(&preempt_notifier_key))
4305 __fire_sched_out_preempt_notifiers(curr, next);
4306}
4307
4308#else
4309
4310static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4311{
4312}
4313
4314static inline void
4315fire_sched_out_preempt_notifiers(struct task_struct *curr,
4316 struct task_struct *next)
4317{
4318}
4319
4320#endif
4321
4322static inline void prepare_task(struct task_struct *next)
4323{
4324#ifdef CONFIG_SMP
4325
4326
4327
4328
4329
4330
4331 WRITE_ONCE(next->on_cpu, 1);
4332#endif
4333}
4334
4335static inline void finish_task(struct task_struct *prev)
4336{
4337#ifdef CONFIG_SMP
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349 smp_store_release(&prev->on_cpu, 0);
4350#endif
4351}
4352
4353#ifdef CONFIG_SMP
4354
4355static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4356{
4357 void (*func)(struct rq *rq);
4358 struct callback_head *next;
4359
4360 lockdep_assert_rq_held(rq);
4361
4362 while (head) {
4363 func = (void (*)(struct rq *))head->func;
4364 next = head->next;
4365 head->next = NULL;
4366 head = next;
4367
4368 func(rq);
4369 }
4370}
4371
4372static void balance_push(struct rq *rq);
4373
4374struct callback_head balance_push_callback = {
4375 .next = NULL,
4376 .func = (void (*)(struct callback_head *))balance_push,
4377};
4378
4379static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4380{
4381 struct callback_head *head = rq->balance_callback;
4382
4383 lockdep_assert_rq_held(rq);
4384 if (head)
4385 rq->balance_callback = NULL;
4386
4387 return head;
4388}
4389
4390static void __balance_callbacks(struct rq *rq)
4391{
4392 do_balance_callbacks(rq, splice_balance_callbacks(rq));
4393}
4394
4395static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4396{
4397 unsigned long flags;
4398
4399 if (unlikely(head)) {
4400 raw_spin_rq_lock_irqsave(rq, flags);
4401 do_balance_callbacks(rq, head);
4402 raw_spin_rq_unlock_irqrestore(rq, flags);
4403 }
4404}
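/*
 * Added note: balance callbacks (push/pull of RT and DL tasks, sched-core
 * balancing, balance_push on hotplug) cannot run in the middle of a context
 * switch, so they are queued on rq->balance_callback and spliced/run either
 * from finish_lock_switch() via __balance_callbacks() or, for callers that
 * have already dropped the rq lock, via balance_callbacks() above.
 */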
4405
4406#else
4407
4408static inline void __balance_callbacks(struct rq *rq)
4409{
4410}
4411
4412static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4413{
4414 return NULL;
4415}
4416
4417static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4418{
4419}
4420
4421#endif
4422
4423static inline void
4424prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4425{
4426
4427
4428
4429
4430
4431
4432 rq_unpin_lock(rq, rf);
4433 spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
4434#ifdef CONFIG_DEBUG_SPINLOCK
4435
4436 rq_lockp(rq)->owner = next;
4437#endif
4438}
4439
4440static inline void finish_lock_switch(struct rq *rq)
4441{
4442
4443
4444
4445
4446
4447 spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
4448 __balance_callbacks(rq);
4449 raw_spin_rq_unlock_irq(rq);
4450}
4451
4452
4453
4454
4455
4456#ifndef prepare_arch_switch
4457# define prepare_arch_switch(next) do { } while (0)
4458#endif
4459
4460#ifndef finish_arch_post_lock_switch
4461# define finish_arch_post_lock_switch() do { } while (0)
4462#endif
4463
4464static inline void kmap_local_sched_out(void)
4465{
4466#ifdef CONFIG_KMAP_LOCAL
4467 if (unlikely(current->kmap_ctrl.idx))
4468 __kmap_local_sched_out();
4469#endif
4470}
4471
4472static inline void kmap_local_sched_in(void)
4473{
4474#ifdef CONFIG_KMAP_LOCAL
4475 if (unlikely(current->kmap_ctrl.idx))
4476 __kmap_local_sched_in();
4477#endif
4478}
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493static inline void
4494prepare_task_switch(struct rq *rq, struct task_struct *prev,
4495 struct task_struct *next)
4496{
4497 kcov_prepare_switch(prev);
4498 sched_info_switch(rq, prev, next);
4499 perf_event_task_sched_out(prev, next);
4500 rseq_preempt(prev);
4501 fire_sched_out_preempt_notifiers(prev, next);
4502 kmap_local_sched_out();
4503 prepare_task(next);
4504 prepare_arch_switch(next);
4505}
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526static struct rq *finish_task_switch(struct task_struct *prev)
4527 __releases(rq->lock)
4528{
4529 struct rq *rq = this_rq();
4530 struct mm_struct *mm = rq->prev_mm;
4531 long prev_state;
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4545 "corrupted preempt_count: %s/%d/0x%x\n",
4546 current->comm, current->pid, preempt_count()))
4547 preempt_count_set(FORK_PREEMPT_COUNT);
4548
4549 rq->prev_mm = NULL;
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562 prev_state = READ_ONCE(prev->__state);
4563 vtime_task_switch(prev);
4564 perf_event_task_sched_in(prev, current);
4565 finish_task(prev);
4566 tick_nohz_task_switch();
4567 finish_lock_switch(rq);
4568 finish_arch_post_lock_switch();
4569 kcov_finish_switch(current);
4570
4571
4572
4573
4574
4575
4576
4577 kmap_local_sched_in();
4578
4579 fire_sched_in_preempt_notifiers(current);
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592 if (mm) {
4593 membarrier_mm_sync_core_before_usermode(mm);
4594 mmdrop(mm);
4595 }
4596 if (unlikely(prev_state == TASK_DEAD)) {
4597 if (prev->sched_class->task_dead)
4598 prev->sched_class->task_dead(prev);
4599
4600
4601
4602
4603
4604 kprobe_flush_task(prev);
4605
4606
4607 put_task_stack(prev);
4608
4609 put_task_struct_rcu_user(prev);
4610 }
4611
4612 return rq;
4613}
4614
4615
4616
4617
4618
4619asmlinkage __visible void schedule_tail(struct task_struct *prev)
4620 __releases(rq->lock)
4621{
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631 finish_task_switch(prev);
4632 preempt_enable();
4633
4634 if (current->set_child_tid)
4635 put_user(task_pid_vnr(current), current->set_child_tid);
4636
4637 calculate_sigpending();
4638}
4639
4640
4641
4642
4643static __always_inline struct rq *
4644context_switch(struct rq *rq, struct task_struct *prev,
4645 struct task_struct *next, struct rq_flags *rf)
4646{
4647 prepare_task_switch(rq, prev, next);
4648
4649
4650
4651
4652
4653
4654 arch_start_context_switch(prev);
4655
4656
4657
4658
4659
4660
4661
4662
4663 if (!next->mm) {
4664 enter_lazy_tlb(prev->active_mm, next);
4665
4666 next->active_mm = prev->active_mm;
4667 if (prev->mm)
4668 mmgrab(prev->active_mm);
4669 else
4670 prev->active_mm = NULL;
4671 } else {
4672 membarrier_switch_mm(rq, prev->active_mm, next->mm);
4673
4674
4675
4676
4677
4678
4679
4680
4681 switch_mm_irqs_off(prev->active_mm, next->mm, next);
4682
4683 if (!prev->mm) {
4684
4685 rq->prev_mm = prev->active_mm;
4686 prev->active_mm = NULL;
4687 }
4688 }
4689
4690 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4691
4692 prepare_lock_switch(rq, next, rf);
4693
4694
4695 switch_to(prev, next, prev);
4696 barrier();
4697
4698 return finish_task_switch(prev);
4699}
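/*
 * Added note on the mm handling above: kernel threads have no mm of their
 * own and lazily "borrow" prev->active_mm, taking a reference with
 * mmgrab(); the matching mmdrop() happens in finish_task_switch() once we
 * are guaranteed not to touch the borrowed mm any more.
 */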
4700
4701
4702
4703
4704
4705
4706
4707unsigned int nr_running(void)
4708{
4709 unsigned int i, sum = 0;
4710
4711 for_each_online_cpu(i)
4712 sum += cpu_rq(i)->nr_running;
4713
4714 return sum;
4715}
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730bool single_task_running(void)
4731{
4732 return raw_rq()->nr_running == 1;
4733}
4734EXPORT_SYMBOL(single_task_running);
4735
4736unsigned long long nr_context_switches(void)
4737{
4738 int i;
4739 unsigned long long sum = 0;
4740
4741 for_each_possible_cpu(i)
4742 sum += cpu_rq(i)->nr_switches;
4743
4744 return sum;
4745}
4746
4747
4748
4749
4750
4751
4752
4753
4754unsigned int nr_iowait_cpu(int cpu)
4755{
4756 return atomic_read(&cpu_rq(cpu)->nr_iowait);
4757}
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789unsigned int nr_iowait(void)
4790{
4791 unsigned int i, sum = 0;
4792
4793 for_each_possible_cpu(i)
4794 sum += nr_iowait_cpu(i);
4795
4796 return sum;
4797}
4798
4799#ifdef CONFIG_SMP
4800
4801
4802
4803
4804
4805void sched_exec(void)
4806{
4807 struct task_struct *p = current;
4808 unsigned long flags;
4809 int dest_cpu;
4810
4811 raw_spin_lock_irqsave(&p->pi_lock, flags);
4812 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
4813 if (dest_cpu == smp_processor_id())
4814 goto unlock;
4815
4816 if (likely(cpu_active(dest_cpu))) {
4817 struct migration_arg arg = { p, dest_cpu };
4818
4819 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4820 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4821 return;
4822 }
4823unlock:
4824 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4825}
4826
4827#endif
4828
4829DEFINE_PER_CPU(struct kernel_stat, kstat);
4830DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4831
4832EXPORT_PER_CPU_SYMBOL(kstat);
4833EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4834
4835
4836
4837
4838
4839
4840
4841static inline void prefetch_curr_exec_start(struct task_struct *p)
4842{
4843#ifdef CONFIG_FAIR_GROUP_SCHED
4844 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
4845#else
4846 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
4847#endif
4848 prefetch(curr);
4849 prefetch(&curr->exec_start);
4850}
4851
4852
4853
4854
4855
4856
4857unsigned long long task_sched_runtime(struct task_struct *p)
4858{
4859 struct rq_flags rf;
4860 struct rq *rq;
4861 u64 ns;
4862
4863#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875 if (!p->on_cpu || !task_on_rq_queued(p))
4876 return p->se.sum_exec_runtime;
4877#endif
4878
4879 rq = task_rq_lock(p, &rf);
4880
4881
4882
4883
4884
4885 if (task_current(rq, p) && task_on_rq_queued(p)) {
4886 prefetch_curr_exec_start(p);
4887 update_rq_clock(rq);
4888 p->sched_class->update_curr(rq);
4889 }
4890 ns = p->se.sum_exec_runtime;
4891 task_rq_unlock(rq, p, &rf);
4892
4893 return ns;
4894}
4895
4896#ifdef CONFIG_SCHED_DEBUG
4897static u64 cpu_resched_latency(struct rq *rq)
4898{
4899 int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
4900 u64 resched_latency, now = rq_clock(rq);
4901 static bool warned_once;
4902
4903 if (sysctl_resched_latency_warn_once && warned_once)
4904 return 0;
4905
4906 if (!need_resched() || !latency_warn_ms)
4907 return 0;
4908
4909 if (system_state == SYSTEM_BOOTING)
4910 return 0;
4911
4912 if (!rq->last_seen_need_resched_ns) {
4913 rq->last_seen_need_resched_ns = now;
4914 rq->ticks_without_resched = 0;
4915 return 0;
4916 }
4917
4918 rq->ticks_without_resched++;
4919 resched_latency = now - rq->last_seen_need_resched_ns;
4920 if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
4921 return 0;
4922
4923 warned_once = true;
4924
4925 return resched_latency;
4926}
4927
4928static int __init setup_resched_latency_warn_ms(char *str)
4929{
4930 long val;
4931
4932 if ((kstrtol(str, 0, &val))) {
4933 pr_warn("Unable to set resched_latency_warn_ms\n");
4934 return 1;
4935 }
4936
4937 sysctl_resched_latency_warn_ms = val;
4938 return 1;
4939}
4940__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
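/*
 * Usage example (added): the warning above fires when a CPU has had
 * need_resched() set for longer than sysctl_resched_latency_warn_ms
 * (default 100ms) and the LATENCY_WARN sched_feat is enabled. The
 * threshold can also be set at boot, e.g.
 *
 *	resched_latency_warn_ms=50
 */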
4941#else
4942static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
4943#endif
4944
4945
4946
4947
4948
4949void scheduler_tick(void)
4950{
4951 int cpu = smp_processor_id();
4952 struct rq *rq = cpu_rq(cpu);
4953 struct task_struct *curr = rq->curr;
4954 struct rq_flags rf;
4955 unsigned long thermal_pressure;
4956 u64 resched_latency;
4957
4958 arch_scale_freq_tick();
4959 sched_clock_tick();
4960
4961 rq_lock(rq, &rf);
4962
4963 update_rq_clock(rq);
4964 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4965 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
4966 curr->sched_class->task_tick(rq, curr, 0);
4967 if (sched_feat(LATENCY_WARN))
4968 resched_latency = cpu_resched_latency(rq);
4969 calc_global_load_tick(rq);
4970
4971 rq_unlock(rq, &rf);
4972
4973 if (sched_feat(LATENCY_WARN) && resched_latency)
4974 resched_latency_warn(cpu, resched_latency);
4975
4976 perf_event_task_tick();
4977
4978#ifdef CONFIG_SMP
4979 rq->idle_balance = idle_cpu(cpu);
4980 trigger_load_balance(rq);
4981#endif
4982}
4983
4984#ifdef CONFIG_NO_HZ_FULL
4985
4986struct tick_work {
4987 int cpu;
4988 atomic_t state;
4989 struct delayed_work work;
4990};
4991
4992#define TICK_SCHED_REMOTE_OFFLINE 0
4993#define TICK_SCHED_REMOTE_OFFLINING 1
4994#define TICK_SCHED_REMOTE_RUNNING 2
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019static struct tick_work __percpu *tick_work_cpu;
5020
5021static void sched_tick_remote(struct work_struct *work)
5022{
5023 struct delayed_work *dwork = to_delayed_work(work);
5024 struct tick_work *twork = container_of(dwork, struct tick_work, work);
5025 int cpu = twork->cpu;
5026 struct rq *rq = cpu_rq(cpu);
5027 struct task_struct *curr;
5028 struct rq_flags rf;
5029 u64 delta;
5030 int os;
5031
5032
5033
5034
5035
5036
5037
5038
5039 if (!tick_nohz_tick_stopped_cpu(cpu))
5040 goto out_requeue;
5041
5042 rq_lock_irq(rq, &rf);
5043 curr = rq->curr;
5044 if (cpu_is_offline(cpu))
5045 goto out_unlock;
5046
5047 update_rq_clock(rq);
5048
5049 if (!is_idle_task(curr)) {
5050
5051
5052
5053
5054 delta = rq_clock_task(rq) - curr->se.exec_start;
5055 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
5056 }
5057 curr->sched_class->task_tick(rq, curr, 0);
5058
5059 calc_load_nohz_remote(rq);
5060out_unlock:
5061 rq_unlock_irq(rq, &rf);
5062out_requeue:
5063
5064
5065
5066
5067
5068
5069
5070 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
5071 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
5072 if (os == TICK_SCHED_REMOTE_RUNNING)
5073 queue_delayed_work(system_unbound_wq, dwork, HZ);
5074}
5075
5076static void sched_tick_start(int cpu)
5077{
5078 int os;
5079 struct tick_work *twork;
5080
5081 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5082 return;
5083
5084 WARN_ON_ONCE(!tick_work_cpu);
5085
5086 twork = per_cpu_ptr(tick_work_cpu, cpu);
5087 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
5088 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
5089 if (os == TICK_SCHED_REMOTE_OFFLINE) {
5090 twork->cpu = cpu;
5091 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
5092 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
5093 }
5094}
5095
5096#ifdef CONFIG_HOTPLUG_CPU
5097static void sched_tick_stop(int cpu)
5098{
5099 struct tick_work *twork;
5100 int os;
5101
5102 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5103 return;
5104
5105 WARN_ON_ONCE(!tick_work_cpu);
5106
5107 twork = per_cpu_ptr(tick_work_cpu, cpu);
5108
5109 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
5110 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
5111
5112}
5113#endif
5114
5115int __init sched_tick_offload_init(void)
5116{
5117 tick_work_cpu = alloc_percpu(struct tick_work);
5118 BUG_ON(!tick_work_cpu);
5119 return 0;
5120}
5121
5122#else
5123static inline void sched_tick_start(int cpu) { }
5124static inline void sched_tick_stop(int cpu) { }
5125#endif
5126
5127#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
5128 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
5129
5130
5131
5132
5133static inline void preempt_latency_start(int val)
5134{
5135 if (preempt_count() == val) {
5136 unsigned long ip = get_lock_parent_ip();
5137#ifdef CONFIG_DEBUG_PREEMPT
5138 current->preempt_disable_ip = ip;
5139#endif
5140 trace_preempt_off(CALLER_ADDR0, ip);
5141 }
5142}
5143
5144void preempt_count_add(int val)
5145{
5146#ifdef CONFIG_DEBUG_PREEMPT
5147
5148
5149
5150 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5151 return;
5152#endif
5153 __preempt_count_add(val);
5154#ifdef CONFIG_DEBUG_PREEMPT
5155
5156
5157
5158 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5159 PREEMPT_MASK - 10);
5160#endif
5161 preempt_latency_start(val);
5162}
5163EXPORT_SYMBOL(preempt_count_add);
5164NOKPROBE_SYMBOL(preempt_count_add);
5165
5166
5167
5168
5169
5170static inline void preempt_latency_stop(int val)
5171{
5172 if (preempt_count() == val)
5173 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
5174}
5175
5176void preempt_count_sub(int val)
5177{
5178#ifdef CONFIG_DEBUG_PREEMPT
5179
5180
5181
5182 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5183 return;
5184
5185
5186
5187 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5188 !(preempt_count() & PREEMPT_MASK)))
5189 return;
5190#endif
5191
5192 preempt_latency_stop(val);
5193 __preempt_count_sub(val);
5194}
5195EXPORT_SYMBOL(preempt_count_sub);
5196NOKPROBE_SYMBOL(preempt_count_sub);
5197
5198#else
5199static inline void preempt_latency_start(int val) { }
5200static inline void preempt_latency_stop(int val) { }
5201#endif
5202
5203static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
5204{
5205#ifdef CONFIG_DEBUG_PREEMPT
5206 return p->preempt_disable_ip;
5207#else
5208 return 0;
5209#endif
5210}
5211
5212
5213
5214
5215static noinline void __schedule_bug(struct task_struct *prev)
5216{
5217
5218 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
5219
5220 if (oops_in_progress)
5221 return;
5222
5223 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5224 prev->comm, prev->pid, preempt_count());
5225
5226 debug_show_held_locks(prev);
5227 print_modules();
5228 if (irqs_disabled())
5229 print_irqtrace_events(prev);
5230 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
5231 && in_atomic_preempt_off()) {
5232 pr_err("Preemption disabled at:");
5233 print_ip_sym(KERN_ERR, preempt_disable_ip);
5234 }
5235 if (panic_on_warn)
5236 panic("scheduling while atomic\n");
5237
5238 dump_stack();
5239 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5240}
5241
5242
5243
5244
5245static inline void schedule_debug(struct task_struct *prev, bool preempt)
5246{
5247#ifdef CONFIG_SCHED_STACK_END_CHECK
5248 if (task_stack_end_corrupted(prev))
5249 panic("corrupted stack end detected inside scheduler\n");
5250
5251 if (task_scs_end_corrupted(prev))
5252 panic("corrupted shadow stack detected inside scheduler\n");
5253#endif
5254
5255#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5256 if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
5257 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5258 prev->comm, prev->pid, prev->non_block_count);
5259 dump_stack();
5260 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5261 }
5262#endif
5263
5264 if (unlikely(in_atomic_preempt_off())) {
5265 __schedule_bug(prev);
5266 preempt_count_set(PREEMPT_DISABLED);
5267 }
5268 rcu_sleep_check();
5269 SCHED_WARN_ON(ct_state() == CONTEXT_USER);
5270
5271 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5272
5273 schedstat_inc(this_rq()->sched_count);
5274}
5275
5276static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5277 struct rq_flags *rf)
5278{
5279#ifdef CONFIG_SMP
5280 const struct sched_class *class;
5281
5282
5283
5284
5285
5286
5287
5288
5289 for_class_range(class, prev->sched_class, &idle_sched_class) {
5290 if (class->balance(rq, prev, rf))
5291 break;
5292 }
5293#endif
5294
5295 put_prev_task(rq, prev);
5296}
5297
5298
5299
5300
5301static inline struct task_struct *
5302__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5303{
5304 const struct sched_class *class;
5305 struct task_struct *p;
5306
5307
5308
5309
5310
5311
5312
5313 if (likely(prev->sched_class <= &fair_sched_class &&
5314 rq->nr_running == rq->cfs.h_nr_running)) {
5315
5316 p = pick_next_task_fair(rq, prev, rf);
5317 if (unlikely(p == RETRY_TASK))
5318 goto restart;
5319
5320
5321 if (!p) {
5322 put_prev_task(rq, prev);
5323 p = pick_next_task_idle(rq);
5324 }
5325
5326 return p;
5327 }
5328
5329restart:
5330 put_prev_task_balance(rq, prev, rf);
5331
5332 for_each_class(class) {
5333 p = class->pick_next_task(rq);
5334 if (p)
5335 return p;
5336 }
5337
5338
5339 BUG();
5340}
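/*
 * Added note: the "likely" branch above is the common-case optimization.
 * When the previous task is in the fair or idle class and every runnable
 * task on this rq is in the fair class, pick_next_task_fair() is called
 * directly and the per-class iteration is skipped. RETRY_TASK means the
 * fair class dropped the rq lock and higher-priority tasks may have
 * appeared, so we fall back to the full restart path.
 */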
5341
5342#ifdef CONFIG_SCHED_CORE
5343static inline bool is_task_rq_idle(struct task_struct *t)
5344{
5345 return (task_rq(t)->idle == t);
5346}
5347
5348static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
5349{
5350 return is_task_rq_idle(a) || (a->core_cookie == cookie);
5351}
5352
5353static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
5354{
5355 if (is_task_rq_idle(a) || is_task_rq_idle(b))
5356 return true;
5357
5358 return a->core_cookie == b->core_cookie;
5359}
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369static struct task_struct *
5370pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
5371{
5372 struct task_struct *class_pick, *cookie_pick;
5373 unsigned long cookie = rq->core->core_cookie;
5374
5375 class_pick = class->pick_task(rq);
5376 if (!class_pick)
5377 return NULL;
5378
5379 if (!cookie) {
5380
5381
5382
5383
5384 if (max && class_pick->core_cookie &&
5385 prio_less(class_pick, max, in_fi))
5386 return idle_sched_class.pick_task(rq);
5387
5388 return class_pick;
5389 }
5390
5391
5392
5393
5394 if (cookie_equals(class_pick, cookie))
5395 return class_pick;
5396
5397 cookie_pick = sched_core_find(rq, cookie);
5398
5399
5400
5401
5402
5403
5404 if (prio_less(cookie_pick, class_pick, in_fi) &&
5405 (!max || prio_less(max, class_pick, in_fi)))
5406 return class_pick;
5407
5408 return cookie_pick;
5409}
5410
5411extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
5412
5413static struct task_struct *
5414pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5415{
5416 struct task_struct *next, *max = NULL;
5417 const struct sched_class *class;
5418 const struct cpumask *smt_mask;
5419 bool fi_before = false;
5420 int i, j, cpu, occ = 0;
5421 bool need_sync;
5422
5423 if (!sched_core_enabled(rq))
5424 return __pick_next_task(rq, prev, rf);
5425
5426 cpu = cpu_of(rq);
5427
5428
5429 if (cpu_is_offline(cpu)) {
5430
5431
5432
5433
5434
5435 rq->core_pick = NULL;
5436 return __pick_next_task(rq, prev, rf);
5437 }
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448 if (rq->core->core_pick_seq == rq->core->core_task_seq &&
5449 rq->core->core_pick_seq != rq->core_sched_seq &&
5450 rq->core_pick) {
5451 WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
5452
5453 next = rq->core_pick;
5454 if (next != prev) {
5455 put_prev_task(rq, prev);
5456 set_next_task(rq, next);
5457 }
5458
5459 rq->core_pick = NULL;
5460 return next;
5461 }
5462
5463 put_prev_task_balance(rq, prev, rf);
5464
5465 smt_mask = cpu_smt_mask(cpu);
5466 need_sync = !!rq->core->core_cookie;
5467
5468
5469 rq->core->core_cookie = 0UL;
5470 if (rq->core->core_forceidle) {
5471 need_sync = true;
5472 fi_before = true;
5473 rq->core->core_forceidle = false;
5474 }
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486 rq->core->core_task_seq++;
5487
5488
5489
5490
5491
5492 if (!need_sync) {
5493 for_each_class(class) {
5494 next = class->pick_task(rq);
5495 if (next)
5496 break;
5497 }
5498
5499 if (!next->core_cookie) {
5500 rq->core_pick = NULL;
5501
5502
5503
5504
5505 WARN_ON_ONCE(fi_before);
5506 task_vruntime_update(rq, next, false);
5507 goto done;
5508 }
5509 }
5510
5511 for_each_cpu(i, smt_mask) {
5512 struct rq *rq_i = cpu_rq(i);
5513
5514 rq_i->core_pick = NULL;
5515
5516 if (i != cpu)
5517 update_rq_clock(rq_i);
5518 }
5519
5520
5521
5522
5523
5524 for_each_class(class) {
5525again:
5526 for_each_cpu_wrap(i, smt_mask, cpu) {
5527 struct rq *rq_i = cpu_rq(i);
5528 struct task_struct *p;
5529
5530 if (rq_i->core_pick)
5531 continue;
5532
5533
5534
5535
5536
5537
5538
5539 p = pick_task(rq_i, class, max, fi_before);
5540 if (!p)
5541 continue;
5542
5543 if (!is_task_rq_idle(p))
5544 occ++;
5545
5546 rq_i->core_pick = p;
5547 if (rq_i->idle == p && rq_i->nr_running) {
5548 rq->core->core_forceidle = true;
5549 if (!fi_before)
5550 rq->core->core_forceidle_seq++;
5551 }
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563 if (!max || !cookie_match(max, p)) {
5564 struct task_struct *old_max = max;
5565
5566 rq->core->core_cookie = p->core_cookie;
5567 max = p;
5568
5569 if (old_max) {
5570 rq->core->core_forceidle = false;
5571 for_each_cpu(j, smt_mask) {
5572 if (j == i)
5573 continue;
5574
5575 cpu_rq(j)->core_pick = NULL;
5576 }
5577 occ = 1;
5578 goto again;
5579 }
5580 }
5581 }
5582 }
5583
5584 rq->core->core_pick_seq = rq->core->core_task_seq;
5585 next = rq->core_pick;
5586 rq->core_sched_seq = rq->core->core_pick_seq;
5587
5588
5589 WARN_ON_ONCE(!next);
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599 for_each_cpu(i, smt_mask) {
5600 struct rq *rq_i = cpu_rq(i);
5601
5602
5603
5604
5605
5606
5607
5608
5609 if (!rq_i->core_pick)
5610 continue;
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620 if (!(fi_before && rq->core->core_forceidle))
5621 task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
5622
5623 rq_i->core_pick->core_occupation = occ;
5624
5625 if (i == cpu) {
5626 rq_i->core_pick = NULL;
5627 continue;
5628 }
5629
5630
5631 WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
5632
5633 if (rq_i->curr == rq_i->core_pick) {
5634 rq_i->core_pick = NULL;
5635 continue;
5636 }
5637
5638 resched_curr(rq_i);
5639 }
5640
5641done:
5642 set_next_task(rq, next);
5643 return next;
5644}
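/*
 * Added summary of the core-scheduling selection above: one sibling (this
 * CPU) picks for the whole SMT core. The highest-priority runnable task
 * defines the core cookie; every other sibling either gets a task with a
 * matching cookie or is forced idle, and siblings whose selection changed
 * are kicked with resched_curr() so they adopt rq->core_pick on their next
 * pass through __schedule().
 */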
5645
5646static bool try_steal_cookie(int this, int that)
5647{
5648 struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
5649 struct task_struct *p;
5650 unsigned long cookie;
5651 bool success = false;
5652
5653 local_irq_disable();
5654 double_rq_lock(dst, src);
5655
5656 cookie = dst->core->core_cookie;
5657 if (!cookie)
5658 goto unlock;
5659
5660 if (dst->curr != dst->idle)
5661 goto unlock;
5662
5663 p = sched_core_find(src, cookie);
5664 if (p == src->idle)
5665 goto unlock;
5666
5667 do {
5668 if (p == src->core_pick || p == src->curr)
5669 goto next;
5670
5671 if (!cpumask_test_cpu(this, &p->cpus_mask))
5672 goto next;
5673
5674 if (p->core_occupation > dst->idle->core_occupation)
5675 goto next;
5676
5677 p->on_rq = TASK_ON_RQ_MIGRATING;
5678 deactivate_task(src, p, 0);
5679 set_task_cpu(p, this);
5680 activate_task(dst, p, 0);
5681 p->on_rq = TASK_ON_RQ_QUEUED;
5682
5683 resched_curr(dst);
5684
5685 success = true;
5686 break;
5687
5688next:
5689 p = sched_core_next(p, cookie);
5690 } while (p);
5691
5692unlock:
5693 double_rq_unlock(dst, src);
5694 local_irq_enable();
5695
5696 return success;
5697}
5698
5699static bool steal_cookie_task(int cpu, struct sched_domain *sd)
5700{
5701 int i;
5702
5703 for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
5704 if (i == cpu)
5705 continue;
5706
5707 if (need_resched())
5708 break;
5709
5710 if (try_steal_cookie(cpu, i))
5711 return true;
5712 }
5713
5714 return false;
5715}
5716
5717static void sched_core_balance(struct rq *rq)
5718{
5719 struct sched_domain *sd;
5720 int cpu = cpu_of(rq);
5721
5722 preempt_disable();
5723 rcu_read_lock();
5724 raw_spin_rq_unlock_irq(rq);
5725 for_each_domain(cpu, sd) {
5726 if (need_resched())
5727 break;
5728
5729 if (steal_cookie_task(cpu, sd))
5730 break;
5731 }
5732 raw_spin_rq_lock_irq(rq);
5733 rcu_read_unlock();
5734 preempt_enable();
5735}
5736
5737static DEFINE_PER_CPU(struct callback_head, core_balance_head);
5738
5739void queue_core_balance(struct rq *rq)
5740{
5741 if (!sched_core_enabled(rq))
5742 return;
5743
5744 if (!rq->core->core_cookie)
5745 return;
5746
5747 if (!rq->nr_running)
5748 return;
5749
5750 queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
5751}
5752
5753static void sched_core_cpu_starting(unsigned int cpu)
5754{
5755 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
5756 struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
5757 unsigned long flags;
5758 int t;
5759
5760 sched_core_lock(cpu, &flags);
5761
5762 WARN_ON_ONCE(rq->core != rq);
5763
5764
5765 if (cpumask_weight(smt_mask) == 1)
5766 goto unlock;
5767
5768
5769 for_each_cpu(t, smt_mask) {
5770 if (t == cpu)
5771 continue;
5772 rq = cpu_rq(t);
5773 if (rq->core == rq) {
5774 core_rq = rq;
5775 break;
5776 }
5777 }
5778
5779 if (WARN_ON_ONCE(!core_rq))
5780 goto unlock;
5781
5782
5783 for_each_cpu(t, smt_mask) {
5784 rq = cpu_rq(t);
5785
5786 if (t == cpu)
5787 rq->core = core_rq;
5788
5789 WARN_ON_ONCE(rq->core != core_rq);
5790 }
5791
5792unlock:
5793 sched_core_unlock(cpu, &flags);
5794}
5795
5796static void sched_core_cpu_deactivate(unsigned int cpu)
5797{
5798 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
5799 struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
5800 unsigned long flags;
5801 int t;
5802
5803 sched_core_lock(cpu, &flags);
5804
5805
5806 if (cpumask_weight(smt_mask) == 1) {
5807 WARN_ON_ONCE(rq->core != rq);
5808 goto unlock;
5809 }
5810
5811
5812 if (rq->core != rq)
5813 goto unlock;
5814
5815
5816 for_each_cpu(t, smt_mask) {
5817 if (t == cpu)
5818 continue;
5819 core_rq = cpu_rq(t);
5820 break;
5821 }
5822
5823 if (WARN_ON_ONCE(!core_rq))
5824 goto unlock;
5825
5826
5827 core_rq->core_task_seq = rq->core_task_seq;
5828 core_rq->core_pick_seq = rq->core_pick_seq;
5829 core_rq->core_cookie = rq->core_cookie;
5830 core_rq->core_forceidle = rq->core_forceidle;
5831 core_rq->core_forceidle_seq = rq->core_forceidle_seq;
5832
5833
5834 for_each_cpu(t, smt_mask) {
5835 rq = cpu_rq(t);
5836 rq->core = core_rq;
5837 }
5838
5839unlock:
5840 sched_core_unlock(cpu, &flags);
5841}
5842
5843static inline void sched_core_cpu_dying(unsigned int cpu)
5844{
5845 struct rq *rq = cpu_rq(cpu);
5846
5847 if (rq->core != rq)
5848 rq->core = rq;
5849}
5850
5851#else
5852
5853static inline void sched_core_cpu_starting(unsigned int cpu) {}
5854static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
5855static inline void sched_core_cpu_dying(unsigned int cpu) {}
5856
5857static struct task_struct *
5858pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5859{
5860 return __pick_next_task(rq, prev, rf);
5861}
5862
5863#endif
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904static void __sched notrace __schedule(bool preempt)
5905{
5906 struct task_struct *prev, *next;
5907 unsigned long *switch_count;
5908 unsigned long prev_state;
5909 struct rq_flags rf;
5910 struct rq *rq;
5911 int cpu;
5912
5913 cpu = smp_processor_id();
5914 rq = cpu_rq(cpu);
5915 prev = rq->curr;
5916
5917 schedule_debug(prev, preempt);
5918
5919 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
5920 hrtick_clear(rq);
5921
5922 local_irq_disable();
5923 rcu_note_context_switch(preempt);
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940 rq_lock(rq, &rf);
5941 smp_mb__after_spinlock();
5942
5943
5944 rq->clock_update_flags <<= 1;
5945 update_rq_clock(rq);
5946
5947 switch_count = &prev->nivcsw;
5948
5949
5950
5951
5952
5953
5954
5955
5956 prev_state = READ_ONCE(prev->__state);
5957 if (!preempt && prev_state) {
5958 if (signal_pending_state(prev_state, prev)) {
5959 WRITE_ONCE(prev->__state, TASK_RUNNING);
5960 } else {
5961 prev->sched_contributes_to_load =
5962 (prev_state & TASK_UNINTERRUPTIBLE) &&
5963 !(prev_state & TASK_NOLOAD) &&
5964 !(prev->flags & PF_FROZEN);
5965
5966 if (prev->sched_contributes_to_load)
5967 rq->nr_uninterruptible++;
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
5981
5982 if (prev->in_iowait) {
5983 atomic_inc(&rq->nr_iowait);
5984 delayacct_blkio_start();
5985 }
5986 }
5987 switch_count = &prev->nvcsw;
5988 }
5989
5990 next = pick_next_task(rq, prev, &rf);
5991 clear_tsk_need_resched(prev);
5992 clear_preempt_need_resched();
5993#ifdef CONFIG_SCHED_DEBUG
5994 rq->last_seen_need_resched_ns = 0;
5995#endif
5996
5997 if (likely(prev != next)) {
5998 rq->nr_switches++;
5999
6000
6001
6002
6003 RCU_INIT_POINTER(rq->curr, next);
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018 ++*switch_count;
6019
6020 migrate_disable_switch(rq, prev);
6021 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
6022
6023 trace_sched_switch(preempt, prev, next);
6024
6025
6026 rq = context_switch(rq, prev, next, &rf);
6027 } else {
6028 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
6029
6030 rq_unpin_lock(rq, &rf);
6031 __balance_callbacks(rq);
6032 raw_spin_rq_unlock_irq(rq);
6033 }
6034}
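/*
 * Added note: __schedule() is entered with preemption disabled and returns
 * the same way; IRQs are re-enabled via finish_lock_switch() on the switch
 * path. The 'preempt' argument only changes how a non-running
 * prev->__state is treated: on a preemption the task stays on the
 * runqueue, on a voluntary schedule() it is deactivated (a real sleep),
 * unless a pending signal turns it back into TASK_RUNNING.
 */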
6035
6036void __noreturn do_task_dead(void)
6037{
6038
6039 set_special_state(TASK_DEAD);
6040
6041
6042 current->flags |= PF_NOFREEZE;
6043
6044 __schedule(false);
6045 BUG();
6046
6047
6048 for (;;)
6049 cpu_relax();
6050}
6051
6052static inline void sched_submit_work(struct task_struct *tsk)
6053{
6054 unsigned int task_flags;
6055
6056 if (task_is_running(tsk))
6057 return;
6058
6059 task_flags = tsk->flags;
6060
6061
6062
6063
6064
6065
6066
6067
6068 if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6069 preempt_disable();
6070 if (task_flags & PF_WQ_WORKER)
6071 wq_worker_sleeping(tsk);
6072 else
6073 io_wq_worker_sleeping(tsk);
6074 preempt_enable_no_resched();
6075 }
6076
6077 if (tsk_is_pi_blocked(tsk))
6078 return;
6079
6080
6081
6082
6083
6084 if (blk_needs_flush_plug(tsk))
6085 blk_schedule_flush_plug(tsk);
6086}
6087
6088static void sched_update_worker(struct task_struct *tsk)
6089{
6090 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6091 if (tsk->flags & PF_WQ_WORKER)
6092 wq_worker_running(tsk);
6093 else
6094 io_wq_worker_running(tsk);
6095 }
6096}
6097
6098asmlinkage __visible void __sched schedule(void)
6099{
6100 struct task_struct *tsk = current;
6101
6102 sched_submit_work(tsk);
6103 do {
6104 preempt_disable();
6105 __schedule(false);
6106 sched_preempt_enable_no_resched();
6107 } while (need_resched());
6108 sched_update_worker(tsk);
6109}
6110EXPORT_SYMBOL(schedule);
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122void __sched schedule_idle(void)
6123{
6124
6125
6126
6127
6128
6129
6130
6131 WARN_ON_ONCE(current->__state);
6132 do {
6133 __schedule(false);
6134 } while (need_resched());
6135}
6136
6137#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
6138asmlinkage __visible void __sched schedule_user(void)
6139{
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150 enum ctx_state prev_state = exception_enter();
6151 schedule();
6152 exception_exit(prev_state);
6153}
6154#endif
6155
6156
6157
6158
6159
6160
6161void __sched schedule_preempt_disabled(void)
6162{
6163 sched_preempt_enable_no_resched();
6164 schedule();
6165 preempt_disable();
6166}
6167
6168static void __sched notrace preempt_schedule_common(void)
6169{
6170 do {
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184 preempt_disable_notrace();
6185 preempt_latency_start(1);
6186 __schedule(true);
6187 preempt_latency_stop(1);
6188 preempt_enable_no_resched_notrace();
6189
6190
6191
6192
6193
6194 } while (need_resched());
6195}
6196
6197#ifdef CONFIG_PREEMPTION
6198
6199
6200
6201
6202asmlinkage __visible void __sched notrace preempt_schedule(void)
6203{
6204
6205
6206
6207
6208 if (likely(!preemptible()))
6209 return;
6210
6211 preempt_schedule_common();
6212}
6213NOKPROBE_SYMBOL(preempt_schedule);
6214EXPORT_SYMBOL(preempt_schedule);
6215
6216#ifdef CONFIG_PREEMPT_DYNAMIC
6217DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
6218EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
6219#endif
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
6237{
6238 enum ctx_state prev_ctx;
6239
6240 if (likely(!preemptible()))
6241 return;
6242
6243 do {
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257 preempt_disable_notrace();
6258 preempt_latency_start(1);
6259
6260
6261
6262
6263
6264 prev_ctx = exception_enter();
6265 __schedule(true);
6266 exception_exit(prev_ctx);
6267
6268 preempt_latency_stop(1);
6269 preempt_enable_no_resched_notrace();
6270 } while (need_resched());
6271}
6272EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
6273
6274#ifdef CONFIG_PREEMPT_DYNAMIC
6275DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6276EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
6277#endif
6278
6279#endif
6280
6281#ifdef CONFIG_PREEMPT_DYNAMIC
6282
6283#include <linux/entry-common.h>
6284
6285
6286/*
6287 * SC:cond_resched
6288 * SC:might_resched
6289 * SC:preempt_schedule
6290 * SC:preempt_schedule_notrace
6291 * SC:irqentry_exit_cond_resched
6292 *
6293 *
6294 * NONE:
6295 *   cond_resched               <- __cond_resched
6296 *   might_resched              <- RET0
6297 *   preempt_schedule           <- NOP
6298 *   preempt_schedule_notrace   <- NOP
6299 *   irqentry_exit_cond_resched <- NOP
6300 *
6301 * VOLUNTARY:
6302 *   cond_resched               <- __cond_resched
6303 *   might_resched              <- __cond_resched
6304 *   preempt_schedule           <- NOP
6305 *   preempt_schedule_notrace   <- NOP
6306 *   irqentry_exit_cond_resched <- NOP
6307 *
6308 * FULL:
6309 *   cond_resched               <- RET0
6310 *   might_resched              <- RET0
6311 *   preempt_schedule           <- preempt_schedule
6312 *   preempt_schedule_notrace   <- preempt_schedule_notrace
6313 *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
6314 */
6315enum {
6316 preempt_dynamic_none = 0,
6317 preempt_dynamic_voluntary,
6318 preempt_dynamic_full,
6319};
6320
6321int preempt_dynamic_mode = preempt_dynamic_full;
6322
6323int sched_dynamic_mode(const char *str)
6324{
6325 if (!strcmp(str, "none"))
6326 return preempt_dynamic_none;
6327
6328 if (!strcmp(str, "voluntary"))
6329 return preempt_dynamic_voluntary;
6330
6331 if (!strcmp(str, "full"))
6332 return preempt_dynamic_full;
6333
6334 return -EINVAL;
6335}
6336
6337void sched_dynamic_update(int mode)
6338{
6339 /*
6340 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
6341 * an invalid intermediate state: reset to the 'full' defaults first.
6342 */
6343 static_call_update(cond_resched, __cond_resched);
6344 static_call_update(might_resched, __cond_resched);
6345 static_call_update(preempt_schedule, __preempt_schedule_func);
6346 static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6347 static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
6348
6349 switch (mode) {
6350 case preempt_dynamic_none:
6351 static_call_update(cond_resched, __cond_resched);
6352 static_call_update(might_resched, (void *)&__static_call_return0);
6353 static_call_update(preempt_schedule, NULL);
6354 static_call_update(preempt_schedule_notrace, NULL);
6355 static_call_update(irqentry_exit_cond_resched, NULL);
6356 pr_info("Dynamic Preempt: none\n");
6357 break;
6358
6359 case preempt_dynamic_voluntary:
6360 static_call_update(cond_resched, __cond_resched);
6361 static_call_update(might_resched, __cond_resched);
6362 static_call_update(preempt_schedule, NULL);
6363 static_call_update(preempt_schedule_notrace, NULL);
6364 static_call_update(irqentry_exit_cond_resched, NULL);
6365 pr_info("Dynamic Preempt: voluntary\n");
6366 break;
6367
6368 case preempt_dynamic_full:
6369 static_call_update(cond_resched, (void *)&__static_call_return0);
6370 static_call_update(might_resched, (void *)&__static_call_return0);
6371 static_call_update(preempt_schedule, __preempt_schedule_func);
6372 static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6373 static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
6374 pr_info("Dynamic Preempt: full\n");
6375 break;
6376 }
6377
6378 preempt_dynamic_mode = mode;
6379}
6380
6381static int __init setup_preempt_mode(char *str)
6382{
6383 int mode = sched_dynamic_mode(str);
6384 if (mode < 0) {
6385 pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
6386 return 1;
6387 }
6388
6389 sched_dynamic_update(mode);
6390 return 0;
6391}
6392__setup("preempt=", setup_preempt_mode);
6393
6394#endif
6395
6396/*
6397 * This is the entry point to schedule() from kernel preemption
6398 * off of irq context.
6399 * Note that this is called and returns with IRQs disabled; this
6400 * protects us against recursive calls from irq context.
6401 */
6402asmlinkage __visible void __sched preempt_schedule_irq(void)
6403{
6404 enum ctx_state prev_state;
6405
6406 /* Catch callers which need to be fixed */
6407 BUG_ON(preempt_count() || !irqs_disabled());
6408
6409 prev_state = exception_enter();
6410
6411 do {
6412 preempt_disable();
6413 local_irq_enable();
6414 __schedule(true);
6415 local_irq_disable();
6416 sched_preempt_enable_no_resched();
6417 } while (need_resched());
6418
6419 exception_exit(prev_state);
6420}
6421
6422int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
6423 void *key)
6424{
6425 WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
6426 return try_to_wake_up(curr->private, mode, wake_flags);
6427}
6428EXPORT_SYMBOL(default_wake_function);
6429
6430static void __setscheduler_prio(struct task_struct *p, int prio)
6431{
6432 if (dl_prio(prio))
6433 p->sched_class = &dl_sched_class;
6434 else if (rt_prio(prio))
6435 p->sched_class = &rt_sched_class;
6436 else
6437 p->sched_class = &fair_sched_class;
6438
6439 p->prio = prio;
6440}
6441
6442#ifdef CONFIG_RT_MUTEXES
6443
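/*
 * Return the effective priority of @prio under priority inheritance:
 * the numerically smaller (i.e. higher) of @prio and the priority of
 * the top pi-waiter donor task @pi_task, if there is one.
 */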
6444static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
6445{
6446 if (pi_task)
6447 prio = min(prio, pi_task->prio);
6448
6449 return prio;
6450}
6451
6452static inline int rt_effective_prio(struct task_struct *p, int prio)
6453{
6454 struct task_struct *pi_task = rt_mutex_get_top_task(p);
6455
6456 return __rt_effective_prio(pi_task, prio);
6457}
6458
6459/*
6460 * rt_mutex_setprio - set the current priority of a task
6461 * @p: task to boost
6462 * @pi_task: donor task
6463 *
6464 * This function changes the 'effective' priority of a task. It does
6465 * not touch ->normal_prio like __setscheduler().
6466 *
6467 * Used by the rt_mutex code to implement priority inheritance
6468 * logic. Call site only calls if the priority of the task changed.
6469 */
6470void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
6471{
6472 int prio, oldprio, queued, running, queue_flag =
6473 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6474 const struct sched_class *prev_class;
6475 struct rq_flags rf;
6476 struct rq *rq;
6477
6478
6479 prio = __rt_effective_prio(pi_task, p->normal_prio);
6480
6481
6482
6483
6484 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
6485 return;
6486
6487 rq = __task_rq_lock(p, &rf);
6488 update_rq_clock(rq);
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499 p->pi_top_task = pi_task;
6500
6501
6502
6503
6504 if (prio == p->prio && !dl_prio(prio))
6505 goto out_unlock;
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519 if (unlikely(p == rq->idle)) {
6520 WARN_ON(p != rq->curr);
6521 WARN_ON(p->pi_blocked_on);
6522 goto out_unlock;
6523 }
6524
6525 trace_sched_pi_setprio(p, pi_task);
6526 oldprio = p->prio;
6527
6528 if (oldprio == prio)
6529 queue_flag &= ~DEQUEUE_MOVE;
6530
6531 prev_class = p->sched_class;
6532 queued = task_on_rq_queued(p);
6533 running = task_current(rq, p);
6534 if (queued)
6535 dequeue_task(rq, p, queue_flag);
6536 if (running)
6537 put_prev_task(rq, p);
6538
6539 /*
6540 * Boosting conditions are:
6541 * 1. -rt task is running and holds mutex A
6542 *      --> -dl task blocks on mutex A
6543 *
6544 * 2. -dl task is running and holds mutex A
6545 *      --> -dl task blocks on mutex A and could preempt the
6546 *          running task
6547 */
6548 if (dl_prio(prio)) {
6549 if (!dl_prio(p->normal_prio) ||
6550 (pi_task && dl_prio(pi_task->prio) &&
6551 dl_entity_preempt(&pi_task->dl, &p->dl))) {
6552 p->dl.pi_se = pi_task->dl.pi_se;
6553 queue_flag |= ENQUEUE_REPLENISH;
6554 } else {
6555 p->dl.pi_se = &p->dl;
6556 }
6557 } else if (rt_prio(prio)) {
6558 if (dl_prio(oldprio))
6559 p->dl.pi_se = &p->dl;
6560 if (oldprio < prio)
6561 queue_flag |= ENQUEUE_HEAD;
6562 } else {
6563 if (dl_prio(oldprio))
6564 p->dl.pi_se = &p->dl;
6565 if (rt_prio(oldprio))
6566 p->rt.timeout = 0;
6567 }
6568
6569 __setscheduler_prio(p, prio);
6570
6571 if (queued)
6572 enqueue_task(rq, p, queue_flag);
6573 if (running)
6574 set_next_task(rq, p);
6575
6576 check_class_changed(rq, p, prev_class, oldprio);
6577out_unlock:
6578
6579 preempt_disable();
6580
6581 rq_unpin_lock(rq, &rf);
6582 __balance_callbacks(rq);
6583 raw_spin_rq_unlock(rq);
6584
6585 preempt_enable();
6586}
6587#else
6588static inline int rt_effective_prio(struct task_struct *p, int prio)
6589{
6590 return prio;
6591}
6592#endif
6593
6594void set_user_nice(struct task_struct *p, long nice)
6595{
6596 bool queued, running;
6597 int old_prio;
6598 struct rq_flags rf;
6599 struct rq *rq;
6600
6601 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
6602 return;
6603 /*
6604 * We have to be careful, if called from sys_setpriority(),
6605 * the task might be in the middle of scheduling on another CPU.
6606 */
6607 rq = task_rq_lock(p, &rf);
6608 update_rq_clock(rq);
6609
6610 /*
6611 * The RT priorities are set via sched_setscheduler(), but we still
6612 * allow the 'normal' nice value to be set - but as expected
6613 * it won't have any effect on scheduling until the task is
6614 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
6615 */
6616 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
6617 p->static_prio = NICE_TO_PRIO(nice);
6618 goto out_unlock;
6619 }
6620 queued = task_on_rq_queued(p);
6621 running = task_current(rq, p);
6622 if (queued)
6623 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
6624 if (running)
6625 put_prev_task(rq, p);
6626
6627 p->static_prio = NICE_TO_PRIO(nice);
6628 set_load_weight(p, true);
6629 old_prio = p->prio;
6630 p->prio = effective_prio(p);
6631
6632 if (queued)
6633 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
6634 if (running)
6635 set_next_task(rq, p);
6636
6637 /*
6638 * If the task increased its priority or is running and
6639 * lowered its priority, then reschedule its CPU:
6640 */
6641 p->sched_class->prio_changed(rq, p, old_prio);
6642
6643out_unlock:
6644 task_rq_unlock(rq, p, &rf);
6645}
6646EXPORT_SYMBOL(set_user_nice);
6647
6648/*
6649 * can_nice - check if a task can reduce its nice value
6650 * @p: task
6651 * @nice: nice value
6652 */
6653int can_nice(const struct task_struct *p, const int nice)
6654{
6655 /* Convert nice value [19,-20] to rlimit style value [1,40]: */
6656 int nice_rlim = nice_to_rlimit(nice);
6657
6658 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6659 capable(CAP_SYS_NICE));
6660}
6661
6662#ifdef __ARCH_WANT_SYS_NICE
6663
6664/*
6665 * sys_nice - change the priority of the current process.
6666 * @increment: priority increment
6667 *
6668 * sys_setpriority is a more generic, but much slower function that
6669 * does similar things.
6670 */
6671SYSCALL_DEFINE1(nice, int, increment)
6672{
6673 long nice, retval;
6674
6675
6676
6677
6678
6679
6680 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
6681 nice = task_nice(current) + increment;
6682
6683 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
6684 if (increment < 0 && !can_nice(current, nice))
6685 return -EPERM;
6686
6687 retval = security_task_setnice(current, nice);
6688 if (retval)
6689 return retval;
6690
6691 set_user_nice(current, nice);
6692 return 0;
6693}
6694
6695#endif
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709int task_prio(const struct task_struct *p)
6710{
6711 return p->prio - MAX_RT_PRIO;
6712}
6713
6714/**
6715 * idle_cpu - is a given CPU idle currently?
6716 * @cpu: the processor in question.
6717 *
6718 * Return: 1 if the CPU is currently idle. 0 otherwise.
6719 */
6720int idle_cpu(int cpu)
6721{
6722 struct rq *rq = cpu_rq(cpu);
6723
6724 if (rq->curr != rq->idle)
6725 return 0;
6726
6727 if (rq->nr_running)
6728 return 0;
6729
6730#ifdef CONFIG_SMP
6731 if (rq->ttwu_pending)
6732 return 0;
6733#endif
6734
6735 return 1;
6736}
6737
6738/**
6739 * available_idle_cpu - is a given CPU idle for enqueuing work.
6740 * @cpu: the CPU in question.
6741 *
6742 * Return: 1 if the CPU is currently idle. 0 otherwise.
6743 */
6744int available_idle_cpu(int cpu)
6745{
6746 if (!idle_cpu(cpu))
6747 return 0;
6748
6749 if (vcpu_is_preempted(cpu))
6750 return 0;
6751
6752 return 1;
6753}
6754
6755/**
6756 * idle_task - return the idle task for a given CPU.
6757 * @cpu: the processor in question.
6758 *
6759 * Return: The idle task for the CPU @cpu.
6760 */
6761struct task_struct *idle_task(int cpu)
6762{
6763 return cpu_rq(cpu)->idle;
6764}
6765
6766#ifdef CONFIG_SMP
6767
6768/*
6769 * Compute an effective utilization for the given CPU, to be used for
6770 * frequency selection given the linear relation: f = u * f_max.
6771 *
6772 * The scheduler tracks the following metrics:
6773 *
6774 *   cpu_util_{cfs,rt,dl,irq}()
6775 *   cpu_bw_dl()
6776 *
6777 * Where the cfs,rt,dl util numbers are tracked with the same metric and
6778 * synchronized windows and are thus directly comparable.
6779 *
6780 * The cfs,rt,dl utilization are the running times measured with
6781 * rq->clock_task, which excludes IRQ and steal time; those are accrued
6782 * in the irq utilization instead.
6783 *
6784 * The DL bandwidth number is not a measured metric but is computed from the
6785 * task model parameters and gives the minimal utilization needed to meet deadlines.
6786 */
6787unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
6788 unsigned long max, enum cpu_util_type type,
6789 struct task_struct *p)
6790{
6791 unsigned long dl_util, util, irq;
6792 struct rq *rq = cpu_rq(cpu);
6793
6794 if (!uclamp_is_used() &&
6795 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
6796 return max;
6797 }
6798
6799
6800
6801
6802
6803
6804 irq = cpu_util_irq(rq);
6805 if (unlikely(irq >= max))
6806 return max;
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820 util = util_cfs + cpu_util_rt(rq);
6821 if (type == FREQUENCY_UTIL)
6822 util = uclamp_rq_util_with(rq, util, p);
6823
6824 dl_util = cpu_util_dl(rq);
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835 if (util + dl_util >= max)
6836 return max;
6837
6838
6839
6840
6841
6842 if (type == ENERGY_UTIL)
6843 util += dl_util;
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854 util = scale_irq_capacity(util, irq, max);
6855 util += irq;
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867 if (type == FREQUENCY_UTIL)
6868 util += cpu_bw_dl(rq);
6869
6870 return min(max, util);
6871}
6872
6873unsigned long sched_cpu_util(int cpu, unsigned long max)
6874{
6875 return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
6876 ENERGY_UTIL, NULL);
6877}
6878#endif
6879
6880/**
6881 * find_process_by_pid - find a process with a matching PID value.
6882 * @pid: the pid in question.
6883 *
6884 * The task of @pid, if found. %NULL otherwise.
6885 */
6886static struct task_struct *find_process_by_pid(pid_t pid)
6887{
6888 return pid ? find_task_by_vpid(pid) : current;
6889}
6890
6891/*
6892 * sched_setparam() passes in -1 for its policy, to let the functions
6893 * it calls know not to change it.
6894 */
6895#define SETPARAM_POLICY -1
6896
6897static void __setscheduler_params(struct task_struct *p,
6898 const struct sched_attr *attr)
6899{
6900 int policy = attr->sched_policy;
6901
6902 if (policy == SETPARAM_POLICY)
6903 policy = p->policy;
6904
6905 p->policy = policy;
6906
6907 if (dl_policy(policy))
6908 __setparam_dl(p, attr);
6909 else if (fair_policy(policy))
6910 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
6911
6912
6913
6914
6915
6916
6917 p->rt_priority = attr->sched_priority;
6918 p->normal_prio = normal_prio(p);
6919 set_load_weight(p, true);
6920}
6921
6922/*
6923 * Check the target process has a UID that matches the current process's:
6924 */
6925static bool check_same_owner(struct task_struct *p)
6926{
6927 const struct cred *cred = current_cred(), *pcred;
6928 bool match;
6929
6930 rcu_read_lock();
6931 pcred = __task_cred(p);
6932 match = (uid_eq(cred->euid, pcred->euid) ||
6933 uid_eq(cred->euid, pcred->uid));
6934 rcu_read_unlock();
6935 return match;
6936}
6937
6938static int __sched_setscheduler(struct task_struct *p,
6939 const struct sched_attr *attr,
6940 bool user, bool pi)
6941{
6942 int oldpolicy = -1, policy = attr->sched_policy;
6943 int retval, oldprio, newprio, queued, running;
6944 const struct sched_class *prev_class;
6945 struct callback_head *head;
6946 struct rq_flags rf;
6947 int reset_on_fork;
6948 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6949 struct rq *rq;
6950
6951
6952 BUG_ON(pi && in_interrupt());
6953recheck:
6954
6955 if (policy < 0) {
6956 reset_on_fork = p->sched_reset_on_fork;
6957 policy = oldpolicy = p->policy;
6958 } else {
6959 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
6960
6961 if (!valid_policy(policy))
6962 return -EINVAL;
6963 }
6964
6965 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
6966 return -EINVAL;
6967
6968 /*
6969 * Valid priorities for SCHED_FIFO and SCHED_RR are
6970 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
6971 * SCHED_BATCH and SCHED_IDLE is 0.
6972 */
6973 if (attr->sched_priority > MAX_RT_PRIO-1)
6974 return -EINVAL;
6975 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
6976 (rt_policy(policy) != (attr->sched_priority != 0)))
6977 return -EINVAL;
6978
6979
6980
6981
6982 if (user && !capable(CAP_SYS_NICE)) {
6983 if (fair_policy(policy)) {
6984 if (attr->sched_nice < task_nice(p) &&
6985 !can_nice(p, attr->sched_nice))
6986 return -EPERM;
6987 }
6988
6989 if (rt_policy(policy)) {
6990 unsigned long rlim_rtprio =
6991 task_rlimit(p, RLIMIT_RTPRIO);
6992
6993
6994 if (policy != p->policy && !rlim_rtprio)
6995 return -EPERM;
6996
6997
6998 if (attr->sched_priority > p->rt_priority &&
6999 attr->sched_priority > rlim_rtprio)
7000 return -EPERM;
7001 }
7002
7003
7004
7005
7006
7007
7008
7009 if (dl_policy(policy))
7010 return -EPERM;
7011
7012
7013
7014
7015
7016 if (task_has_idle_policy(p) && !idle_policy(policy)) {
7017 if (!can_nice(p, task_nice(p)))
7018 return -EPERM;
7019 }
7020
7021
7022 if (!check_same_owner(p))
7023 return -EPERM;
7024
7025
7026 if (p->sched_reset_on_fork && !reset_on_fork)
7027 return -EPERM;
7028 }
7029
7030 if (user) {
7031 if (attr->sched_flags & SCHED_FLAG_SUGOV)
7032 return -EINVAL;
7033
7034 retval = security_task_setscheduler(p);
7035 if (retval)
7036 return retval;
7037 }
7038
7039
7040 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
7041 retval = uclamp_validate(p, attr);
7042 if (retval)
7043 return retval;
7044 }
7045
7046 if (pi)
7047 cpuset_read_lock();
7048
7049
7050
7051
7052
7053
7054
7055
7056 rq = task_rq_lock(p, &rf);
7057 update_rq_clock(rq);
7058
7059
7060
7061
7062 if (p == rq->stop) {
7063 retval = -EINVAL;
7064 goto unlock;
7065 }
7066
7067
7068
7069
7070
7071 if (unlikely(policy == p->policy)) {
7072 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
7073 goto change;
7074 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
7075 goto change;
7076 if (dl_policy(policy) && dl_param_changed(p, attr))
7077 goto change;
7078 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
7079 goto change;
7080
7081 p->sched_reset_on_fork = reset_on_fork;
7082 retval = 0;
7083 goto unlock;
7084 }
7085change:
7086
7087 if (user) {
7088#ifdef CONFIG_RT_GROUP_SCHED
7089
7090
7091
7092
7093 if (rt_bandwidth_enabled() && rt_policy(policy) &&
7094 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
7095 !task_group_is_autogroup(task_group(p))) {
7096 retval = -EPERM;
7097 goto unlock;
7098 }
7099#endif
7100#ifdef CONFIG_SMP
7101 if (dl_bandwidth_enabled() && dl_policy(policy) &&
7102 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
7103 cpumask_t *span = rq->rd->span;
7104
7105
7106
7107
7108
7109
7110 if (!cpumask_subset(span, p->cpus_ptr) ||
7111 rq->rd->dl_bw.bw == 0) {
7112 retval = -EPERM;
7113 goto unlock;
7114 }
7115 }
7116#endif
7117 }
7118
7119
7120 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
7121 policy = oldpolicy = -1;
7122 task_rq_unlock(rq, p, &rf);
7123 if (pi)
7124 cpuset_read_unlock();
7125 goto recheck;
7126 }
7127
7128
7129
7130
7131
7132
7133 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
7134 retval = -EBUSY;
7135 goto unlock;
7136 }
7137
7138 p->sched_reset_on_fork = reset_on_fork;
7139 oldprio = p->prio;
7140
7141 newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
7142 if (pi) {
7143
7144
7145
7146
7147
7148
7149
7150 newprio = rt_effective_prio(p, newprio);
7151 if (newprio == oldprio)
7152 queue_flags &= ~DEQUEUE_MOVE;
7153 }
7154
7155 queued = task_on_rq_queued(p);
7156 running = task_current(rq, p);
7157 if (queued)
7158 dequeue_task(rq, p, queue_flags);
7159 if (running)
7160 put_prev_task(rq, p);
7161
7162 prev_class = p->sched_class;
7163
7164 if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
7165 __setscheduler_params(p, attr);
7166 __setscheduler_prio(p, newprio);
7167 }
7168 __setscheduler_uclamp(p, attr);
7169
7170 if (queued) {
7171
7172
7173
7174
7175 if (oldprio < p->prio)
7176 queue_flags |= ENQUEUE_HEAD;
7177
7178 enqueue_task(rq, p, queue_flags);
7179 }
7180 if (running)
7181 set_next_task(rq, p);
7182
7183 check_class_changed(rq, p, prev_class, oldprio);
7184
7185
7186 preempt_disable();
7187 head = splice_balance_callbacks(rq);
7188 task_rq_unlock(rq, p, &rf);
7189
7190 if (pi) {
7191 cpuset_read_unlock();
7192 rt_mutex_adjust_pi(p);
7193 }
7194
7195
7196 balance_callbacks(rq, head);
7197 preempt_enable();
7198
7199 return 0;
7200
7201unlock:
7202 task_rq_unlock(rq, p, &rf);
7203 if (pi)
7204 cpuset_read_unlock();
7205 return retval;
7206}
7207
7208static int _sched_setscheduler(struct task_struct *p, int policy,
7209 const struct sched_param *param, bool check)
7210{
7211 struct sched_attr attr = {
7212 .sched_policy = policy,
7213 .sched_priority = param->sched_priority,
7214 .sched_nice = PRIO_TO_NICE(p->static_prio),
7215 };
7216
7217
7218 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
7219 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7220 policy &= ~SCHED_RESET_ON_FORK;
7221 attr.sched_policy = policy;
7222 }
7223
7224 return __sched_setscheduler(p, &attr, check, true);
7225}
7226
7227/**
7228 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
7229 * @p: the task in question.
7230 * @policy: new policy.
7231 * @param: structure containing the new RT priority.
7232 *
7233 * Use sched_set_fifo(), read its comment.
7234 *
7235 * Return: 0 on success. An error code otherwise.
7236 * NOTE that the task may be already dead.
7237 */
7238int sched_setscheduler(struct task_struct *p, int policy,
7239 const struct sched_param *param)
7240{
7241 return _sched_setscheduler(p, policy, param, true);
7242}
7243
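/*
 * sched_setattr() applies an extended scheduling attribute set with the
 * full permission and security checks; sched_setattr_nocheck() below
 * skips those checks and is intended for in-kernel callers.
 */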
7244int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
7245{
7246 return __sched_setscheduler(p, attr, true, true);
7247}
7248
7249int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
7250{
7251 return __sched_setscheduler(p, attr, false, true);
7252}
7253EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
7254
7255/**
7256 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
7257 * @p: the task in question.
7258 * @policy: new policy.
7259 * @param: structure containing the new RT priority.
7260 *
7261 * Just like sched_setscheduler, only don't bother checking if the
7262 * current context has permission.  For example, this is needed in
7263 * stop_machine(): we create temporary high priority worker threads,
7264 * but our caller might not have that capability.
7265 *
7266 * Return: 0 on success. An error code otherwise.
7267 */
7268int sched_setscheduler_nocheck(struct task_struct *p, int policy,
7269 const struct sched_param *param)
7270{
7271 return _sched_setscheduler(p, policy, param, false);
7272}
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286/*
7287 * SCHED_FIFO is deliberately restricted to privileged users and composes
7288 * poorly with other static-priority workloads, so every FIFO task the
7289 * kernel creates for itself runs at a single mid-range priority,
7290 * MAX_RT_PRIO / 2; anything more elaborate is up to the administrator.
7291 */
7292void sched_set_fifo(struct task_struct *p)
7293{
7294 struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
7295 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7296}
7297EXPORT_SYMBOL_GPL(sched_set_fifo);
7298
7299
7300
7301
7302void sched_set_fifo_low(struct task_struct *p)
7303{
7304 struct sched_param sp = { .sched_priority = 1 };
7305 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7306}
7307EXPORT_SYMBOL_GPL(sched_set_fifo_low);
7308
7309void sched_set_normal(struct task_struct *p, int nice)
7310{
7311 struct sched_attr attr = {
7312 .sched_policy = SCHED_NORMAL,
7313 .sched_nice = nice,
7314 };
7315 WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
7316}
7317EXPORT_SYMBOL_GPL(sched_set_normal);
7318
7319static int
7320do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
7321{
7322 struct sched_param lparam;
7323 struct task_struct *p;
7324 int retval;
7325
7326 if (!param || pid < 0)
7327 return -EINVAL;
7328 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
7329 return -EFAULT;
7330
7331 rcu_read_lock();
7332 retval = -ESRCH;
7333 p = find_process_by_pid(pid);
7334 if (likely(p))
7335 get_task_struct(p);
7336 rcu_read_unlock();
7337
7338 if (likely(p)) {
7339 retval = sched_setscheduler(p, policy, &lparam);
7340 put_task_struct(p);
7341 }
7342
7343 return retval;
7344}
7345
7346
7347
7348
7349static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
7350{
7351 u32 size;
7352 int ret;
7353
7354
7355 memset(attr, 0, sizeof(*attr));
7356
7357 ret = get_user(size, &uattr->size);
7358 if (ret)
7359 return ret;
7360
7361
7362 if (!size)
7363 size = SCHED_ATTR_SIZE_VER0;
7364 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
7365 goto err_size;
7366
7367 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
7368 if (ret) {
7369 if (ret == -E2BIG)
7370 goto err_size;
7371 return ret;
7372 }
7373
7374 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
7375 size < SCHED_ATTR_SIZE_VER1)
7376 return -EINVAL;
7377
7378
7379
7380
7381
7382 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
7383
7384 return 0;
7385
7386err_size:
7387 put_user(sizeof(*attr), &uattr->size);
7388 return -E2BIG;
7389}
7390
7391/**
7392 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
7393 * @pid: the pid in question.
7394 * @policy: new policy.
7395 * @param: structure containing the new RT priority.
7396 *
7397 * Return: 0 on success. An error code otherwise.
7398 */
7399SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
7400{
7401 if (policy < 0)
7402 return -EINVAL;
7403
7404 return do_sched_setscheduler(pid, policy, param);
7405}
7406
7407/**
7408 * sys_sched_setparam - set/change the RT priority of a thread
7409 * @pid: the pid in question.
7410 * @param: structure containing the new RT priority.
7411 *
7412 * Return: 0 on success. An error code otherwise.
7413 */
7414SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
7415{
7416 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
7417}
7418
7419/**
7420 * sys_sched_setattr - same as above, but with extended sched_attr
7421 * @pid: the pid in question.
7422 * @uattr: structure containing the extended parameters.
7423 * @flags: for future extension.
7424 */
7425SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
7426 unsigned int, flags)
7427{
7428 struct sched_attr attr;
7429 struct task_struct *p;
7430 int retval;
7431
7432 if (!uattr || pid < 0 || flags)
7433 return -EINVAL;
7434
7435 retval = sched_copy_attr(uattr, &attr);
7436 if (retval)
7437 return retval;
7438
7439 if ((int)attr.sched_policy < 0)
7440 return -EINVAL;
7441 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
7442 attr.sched_policy = SETPARAM_POLICY;
7443
7444 rcu_read_lock();
7445 retval = -ESRCH;
7446 p = find_process_by_pid(pid);
7447 if (likely(p))
7448 get_task_struct(p);
7449 rcu_read_unlock();
7450
7451 if (likely(p)) {
7452 retval = sched_setattr(p, &attr);
7453 put_task_struct(p);
7454 }
7455
7456 return retval;
7457}
7458
7459/**
7460 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
7461 * @pid: the pid in question.
7462 *
7463 * Return: On success, the policy of the thread. Otherwise, a negative error
7464 * code.
7465 */
7466SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
7467{
7468 struct task_struct *p;
7469 int retval;
7470
7471 if (pid < 0)
7472 return -EINVAL;
7473
7474 retval = -ESRCH;
7475 rcu_read_lock();
7476 p = find_process_by_pid(pid);
7477 if (p) {
7478 retval = security_task_getscheduler(p);
7479 if (!retval)
7480 retval = p->policy
7481 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
7482 }
7483 rcu_read_unlock();
7484 return retval;
7485}
7486
7487/**
7488 * sys_sched_getparam - get the RT priority of a thread
7489 * @pid: the pid in question.
7490 * @param: structure containing the RT priority.
7491 *
7492 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
7493 * code.
7494 */
7495SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
7496{
7497 struct sched_param lp = { .sched_priority = 0 };
7498 struct task_struct *p;
7499 int retval;
7500
7501 if (!param || pid < 0)
7502 return -EINVAL;
7503
7504 rcu_read_lock();
7505 p = find_process_by_pid(pid);
7506 retval = -ESRCH;
7507 if (!p)
7508 goto out_unlock;
7509
7510 retval = security_task_getscheduler(p);
7511 if (retval)
7512 goto out_unlock;
7513
7514 if (task_has_rt_policy(p))
7515 lp.sched_priority = p->rt_priority;
7516 rcu_read_unlock();
7517
7518
7519
7520
7521 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
7522
7523 return retval;
7524
7525out_unlock:
7526 rcu_read_unlock();
7527 return retval;
7528}
7529
7530/*
7531 * Copy the kernel size attribute structure (which might be larger
7532 * than what user-space knows about) to user-space.
7533 *
7534 * Note that all cases are valid: the user-space buffer can be larger or
7535 * smaller than the kernel-size buffer. The usual case is that both
7536 * have the same size.
7537 */
7538static int
7539sched_attr_copy_to_user(struct sched_attr __user *uattr,
7540 struct sched_attr *kattr,
7541 unsigned int usize)
7542{
7543 unsigned int ksize = sizeof(*kattr);
7544
7545 if (!access_ok(uattr, usize))
7546 return -EFAULT;
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561 kattr->size = min(usize, ksize);
7562
7563 if (copy_to_user(uattr, kattr, kattr->size))
7564 return -EFAULT;
7565
7566 return 0;
7567}
7568
7569/**
7570 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
7571 * @pid: the pid in question.
7572 * @uattr: structure containing the extended parameters.
7573 * @usize: sizeof(attr) for fwd/bwd compatibility.
7574 * @flags: for future extension.
7575 */
7576SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
7577 unsigned int, usize, unsigned int, flags)
7578{
7579 struct sched_attr kattr = { };
7580 struct task_struct *p;
7581 int retval;
7582
7583 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
7584 usize < SCHED_ATTR_SIZE_VER0 || flags)
7585 return -EINVAL;
7586
7587 rcu_read_lock();
7588 p = find_process_by_pid(pid);
7589 retval = -ESRCH;
7590 if (!p)
7591 goto out_unlock;
7592
7593 retval = security_task_getscheduler(p);
7594 if (retval)
7595 goto out_unlock;
7596
7597 kattr.sched_policy = p->policy;
7598 if (p->sched_reset_on_fork)
7599 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7600 if (task_has_dl_policy(p))
7601 __getparam_dl(p, &kattr);
7602 else if (task_has_rt_policy(p))
7603 kattr.sched_priority = p->rt_priority;
7604 else
7605 kattr.sched_nice = task_nice(p);
7606
7607#ifdef CONFIG_UCLAMP_TASK
7608
7609
7610
7611
7612
7613 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
7614 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
7615#endif
7616
7617 rcu_read_unlock();
7618
7619 return sched_attr_copy_to_user(uattr, &kattr, usize);
7620
7621out_unlock:
7622 rcu_read_unlock();
7623 return retval;
7624}
7625
7626long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
7627{
7628 cpumask_var_t cpus_allowed, new_mask;
7629 struct task_struct *p;
7630 int retval;
7631
7632 rcu_read_lock();
7633
7634 p = find_process_by_pid(pid);
7635 if (!p) {
7636 rcu_read_unlock();
7637 return -ESRCH;
7638 }
7639
7640 /* Prevent p going away */
7641 get_task_struct(p);
7642 rcu_read_unlock();
7643
7644 if (p->flags & PF_NO_SETAFFINITY) {
7645 retval = -EINVAL;
7646 goto out_put_task;
7647 }
7648 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
7649 retval = -ENOMEM;
7650 goto out_put_task;
7651 }
7652 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
7653 retval = -ENOMEM;
7654 goto out_free_cpus_allowed;
7655 }
7656 retval = -EPERM;
7657 if (!check_same_owner(p)) {
7658 rcu_read_lock();
7659 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
7660 rcu_read_unlock();
7661 goto out_free_new_mask;
7662 }
7663 rcu_read_unlock();
7664 }
7665
7666 retval = security_task_setscheduler(p);
7667 if (retval)
7668 goto out_free_new_mask;
7669
7670
7671 cpuset_cpus_allowed(p, cpus_allowed);
7672 cpumask_and(new_mask, in_mask, cpus_allowed);
7673
7674
7675
7676
7677
7678
7679
7680#ifdef CONFIG_SMP
7681 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
7682 rcu_read_lock();
7683 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
7684 retval = -EBUSY;
7685 rcu_read_unlock();
7686 goto out_free_new_mask;
7687 }
7688 rcu_read_unlock();
7689 }
7690#endif
7691again:
7692 retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
7693
7694 if (!retval) {
7695 cpuset_cpus_allowed(p, cpus_allowed);
7696 if (!cpumask_subset(new_mask, cpus_allowed)) {
7697 /*
7698 * We must have raced with a concurrent cpuset update.
7699 * Just reset the cpumask to the cpuset's cpus_allowed
7700 * and retry the migration.
7701 */
7702 cpumask_copy(new_mask, cpus_allowed);
7703 goto again;
7704 }
7705 }
7706out_free_new_mask:
7707 free_cpumask_var(new_mask);
7708out_free_cpus_allowed:
7709 free_cpumask_var(cpus_allowed);
7710out_put_task:
7711 put_task_struct(p);
7712 return retval;
7713}
7714
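/*
 * Copy a user-space CPU mask of @len bytes into @new_mask: the
 * destination is cleared first when the user buffer is shorter than the
 * kernel cpumask, and the copy is truncated when it is longer.
 */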
7715static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
7716 struct cpumask *new_mask)
7717{
7718 if (len < cpumask_size())
7719 cpumask_clear(new_mask);
7720 else if (len > cpumask_size())
7721 len = cpumask_size();
7722
7723 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
7724}
7725
7726/**
7727 * sys_sched_setaffinity - set the CPU affinity of a process
7728 * @pid: pid of the process
7729 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
7730 * @user_mask_ptr: user-space pointer to the new CPU mask
7731 *
7732 * Return: 0 on success. An error code otherwise.
7733 */
7734SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
7735 unsigned long __user *, user_mask_ptr)
7736{
7737 cpumask_var_t new_mask;
7738 int retval;
7739
7740 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
7741 return -ENOMEM;
7742
7743 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
7744 if (retval == 0)
7745 retval = sched_setaffinity(pid, new_mask);
7746 free_cpumask_var(new_mask);
7747 return retval;
7748}
7749
7750long sched_getaffinity(pid_t pid, struct cpumask *mask)
7751{
7752 struct task_struct *p;
7753 unsigned long flags;
7754 int retval;
7755
7756 rcu_read_lock();
7757
7758 retval = -ESRCH;
7759 p = find_process_by_pid(pid);
7760 if (!p)
7761 goto out_unlock;
7762
7763 retval = security_task_getscheduler(p);
7764 if (retval)
7765 goto out_unlock;
7766
7767 raw_spin_lock_irqsave(&p->pi_lock, flags);
7768 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
7769 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
7770
7771out_unlock:
7772 rcu_read_unlock();
7773
7774 return retval;
7775}
7776
7777/**
7778 * sys_sched_getaffinity - get the CPU affinity of a process
7779 * @pid: pid of the process
7780 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
7781 * @user_mask_ptr: user-space pointer to hold the current CPU mask
7782 *
7783 * Return: size of CPU mask copied to user_mask_ptr on success. An
7784 * error code otherwise.
7785 */
7786SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
7787 unsigned long __user *, user_mask_ptr)
7788{
7789 int ret;
7790 cpumask_var_t mask;
7791
7792 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
7793 return -EINVAL;
7794 if (len & (sizeof(unsigned long)-1))
7795 return -EINVAL;
7796
7797 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
7798 return -ENOMEM;
7799
7800 ret = sched_getaffinity(pid, mask);
7801 if (ret == 0) {
7802 unsigned int retlen = min(len, cpumask_size());
7803
7804 if (copy_to_user(user_mask_ptr, mask, retlen))
7805 ret = -EFAULT;
7806 else
7807 ret = retlen;
7808 }
7809 free_cpumask_var(mask);
7810
7811 return ret;
7812}
7813
7814static void do_sched_yield(void)
7815{
7816 struct rq_flags rf;
7817 struct rq *rq;
7818
7819 rq = this_rq_lock_irq(&rf);
7820
7821 schedstat_inc(rq->yld_count);
7822 current->sched_class->yield_task(rq);
7823
7824 preempt_disable();
7825 rq_unlock_irq(rq, &rf);
7826 sched_preempt_enable_no_resched();
7827
7828 schedule();
7829}
7830
7831/**
7832 * sys_sched_yield - yield the current processor to other threads.
7833 *
7834 * This function yields the current CPU to other tasks. If there are no
7835 * other threads running on this CPU then this function will return.
7836 *
7837 * Return: 0.
7838 */
7839SYSCALL_DEFINE0(sched_yield)
7840{
7841 do_sched_yield();
7842 return 0;
7843}
7844
7845#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
7846int __sched __cond_resched(void)
7847{
7848 if (should_resched(0)) {
7849 preempt_schedule_common();
7850 return 1;
7851 }
7852#ifndef CONFIG_PREEMPT_RCU
7853 rcu_all_qs();
7854#endif
7855 return 0;
7856}
7857EXPORT_SYMBOL(__cond_resched);
7858#endif
7859
7860#ifdef CONFIG_PREEMPT_DYNAMIC
7861DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
7862EXPORT_STATIC_CALL_TRAMP(cond_resched);
7863
7864DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
7865EXPORT_STATIC_CALL_TRAMP(might_resched);
7866#endif
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876int __cond_resched_lock(spinlock_t *lock)
7877{
7878 int resched = should_resched(PREEMPT_LOCK_OFFSET);
7879 int ret = 0;
7880
7881 lockdep_assert_held(lock);
7882
7883 if (spin_needbreak(lock) || resched) {
7884 spin_unlock(lock);
7885 if (resched)
7886 preempt_schedule_common();
7887 else
7888 cpu_relax();
7889 ret = 1;
7890 spin_lock(lock);
7891 }
7892 return ret;
7893}
7894EXPORT_SYMBOL(__cond_resched_lock);
7895
7896int __cond_resched_rwlock_read(rwlock_t *lock)
7897{
7898 int resched = should_resched(PREEMPT_LOCK_OFFSET);
7899 int ret = 0;
7900
7901 lockdep_assert_held_read(lock);
7902
7903 if (rwlock_needbreak(lock) || resched) {
7904 read_unlock(lock);
7905 if (resched)
7906 preempt_schedule_common();
7907 else
7908 cpu_relax();
7909 ret = 1;
7910 read_lock(lock);
7911 }
7912 return ret;
7913}
7914EXPORT_SYMBOL(__cond_resched_rwlock_read);
7915
7916int __cond_resched_rwlock_write(rwlock_t *lock)
7917{
7918 int resched = should_resched(PREEMPT_LOCK_OFFSET);
7919 int ret = 0;
7920
7921 lockdep_assert_held_write(lock);
7922
7923 if (rwlock_needbreak(lock) || resched) {
7924 write_unlock(lock);
7925 if (resched)
7926 preempt_schedule_common();
7927 else
7928 cpu_relax();
7929 ret = 1;
7930 write_lock(lock);
7931 }
7932 return ret;
7933}
7934EXPORT_SYMBOL(__cond_resched_rwlock_write);
7935
7936/**
7937 * yield - yield the current processor to other threads.
7938 *
7939 * Do not ever use this function, there's a 99% chance you're doing it wrong.
7940 *
7941 * The scheduler is at all times free to pick the calling task as the most
7942 * eligible task to run, if removing the yield() call from your code breaks
7943 * it, it's already broken.
7944 *
7945 * Typical broken usage is:
7946 *
7947 * while (!event)
7948 *	yield();
7949 *
7950 * where one assumes that yield() will let 'the other' process run that will
7951 * make event true. If the current task is a SCHED_FIFO task that will never
7952 * happen. Never use yield() as a progress guarantee!!
7953 *
7954 * If you want to use yield() to wait for something, use wait_event().
7955 * If you want to use yield() to be 'nice' for others, use cond_resched().
7956 * If you still want to use yield(), do not!
7957 */
7958void __sched yield(void)
7959{
7960 set_current_state(TASK_RUNNING);
7961 do_sched_yield();
7962}
7963EXPORT_SYMBOL(yield);
7964
7965/**
7966 * yield_to - yield the current processor to another thread in
7967 * your thread group, or accelerate that thread toward the
7968 * processor it's on.
7969 * @p: target task
7970 * @preempt: whether task preemption is allowed or not
7971 *
7972 * It's the caller's job to ensure that the target task struct
7973 * can't go away on us before we can do any checks.
7974 *
7975 * Return:
7976 *	true (>0) if we indeed boosted the target task.
7977 *	false (0) if we failed to boost the target.
7978 *	-ESRCH if there's no task to yield to.
7979 */
7980int __sched yield_to(struct task_struct *p, bool preempt)
7981{
7982 struct task_struct *curr = current;
7983 struct rq *rq, *p_rq;
7984 unsigned long flags;
7985 int yielded = 0;
7986
7987 local_irq_save(flags);
7988 rq = this_rq();
7989
7990again:
7991 p_rq = task_rq(p);
7992
7993
7994
7995
7996 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
7997 yielded = -ESRCH;
7998 goto out_irq;
7999 }
8000
8001 double_rq_lock(rq, p_rq);
8002 if (task_rq(p) != p_rq) {
8003 double_rq_unlock(rq, p_rq);
8004 goto again;
8005 }
8006
8007 if (!curr->sched_class->yield_to_task)
8008 goto out_unlock;
8009
8010 if (curr->sched_class != p->sched_class)
8011 goto out_unlock;
8012
8013 if (task_running(p_rq, p) || !task_is_running(p))
8014 goto out_unlock;
8015
8016 yielded = curr->sched_class->yield_to_task(rq, p);
8017 if (yielded) {
8018 schedstat_inc(rq->yld_count);
8019
8020
8021
8022
8023 if (preempt && rq != p_rq)
8024 resched_curr(p_rq);
8025 }
8026
8027out_unlock:
8028 double_rq_unlock(rq, p_rq);
8029out_irq:
8030 local_irq_restore(flags);
8031
8032 if (yielded > 0)
8033 schedule();
8034
8035 return yielded;
8036}
8037EXPORT_SYMBOL_GPL(yield_to);
8038
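/*
 * io_schedule_prepare()/io_schedule_finish() bracket code that is about
 * to sleep waiting for IO: the task is flagged as in_iowait (for iowait
 * accounting) and any plugged block requests are flushed; the previous
 * in_iowait value is returned so it can be restored afterwards.
 */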
8039int io_schedule_prepare(void)
8040{
8041 int old_iowait = current->in_iowait;
8042
8043 current->in_iowait = 1;
8044 blk_schedule_flush_plug(current);
8045
8046 return old_iowait;
8047}
8048
8049void io_schedule_finish(int token)
8050{
8051 current->in_iowait = token;
8052}
8053
8054
8055
8056
8057
8058long __sched io_schedule_timeout(long timeout)
8059{
8060 int token;
8061 long ret;
8062
8063 token = io_schedule_prepare();
8064 ret = schedule_timeout(timeout);
8065 io_schedule_finish(token);
8066
8067 return ret;
8068}
8069EXPORT_SYMBOL(io_schedule_timeout);
8070
8071void __sched io_schedule(void)
8072{
8073 int token;
8074
8075 token = io_schedule_prepare();
8076 schedule();
8077 io_schedule_finish(token);
8078}
8079EXPORT_SYMBOL(io_schedule);
8080
8081/**
8082 * sys_sched_get_priority_max - return maximum RT priority.
8083 * @policy: scheduling class.
8084 *
8085 * Return: On success, this syscall returns the maximum
8086 * rt_priority that can be used by a given scheduling class.
8087 * On failure, a negative error code is returned.
8088 */
8089SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
8090{
8091 int ret = -EINVAL;
8092
8093 switch (policy) {
8094 case SCHED_FIFO:
8095 case SCHED_RR:
8096 ret = MAX_RT_PRIO-1;
8097 break;
8098 case SCHED_DEADLINE:
8099 case SCHED_NORMAL:
8100 case SCHED_BATCH:
8101 case SCHED_IDLE:
8102 ret = 0;
8103 break;
8104 }
8105 return ret;
8106}
8107
8108/**
8109 * sys_sched_get_priority_min - return minimum RT priority.
8110 * @policy: scheduling class.
8111 *
8112 * Return: On success, this syscall returns the minimum
8113 * rt_priority that can be used by a given scheduling class.
8114 * On failure, a negative error code is returned.
8115 */
8116SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
8117{
8118 int ret = -EINVAL;
8119
8120 switch (policy) {
8121 case SCHED_FIFO:
8122 case SCHED_RR:
8123 ret = 1;
8124 break;
8125 case SCHED_DEADLINE:
8126 case SCHED_NORMAL:
8127 case SCHED_BATCH:
8128 case SCHED_IDLE:
8129 ret = 0;
8130 }
8131 return ret;
8132}
8133
8134static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
8135{
8136 struct task_struct *p;
8137 unsigned int time_slice;
8138 struct rq_flags rf;
8139 struct rq *rq;
8140 int retval;
8141
8142 if (pid < 0)
8143 return -EINVAL;
8144
8145 retval = -ESRCH;
8146 rcu_read_lock();
8147 p = find_process_by_pid(pid);
8148 if (!p)
8149 goto out_unlock;
8150
8151 retval = security_task_getscheduler(p);
8152 if (retval)
8153 goto out_unlock;
8154
8155 rq = task_rq_lock(p, &rf);
8156 time_slice = 0;
8157 if (p->sched_class->get_rr_interval)
8158 time_slice = p->sched_class->get_rr_interval(rq, p);
8159 task_rq_unlock(rq, p, &rf);
8160
8161 rcu_read_unlock();
8162 jiffies_to_timespec64(time_slice, t);
8163 return 0;
8164
8165out_unlock:
8166 rcu_read_unlock();
8167 return retval;
8168}
8169
8170/**
8171 * sys_sched_rr_get_interval - return the default timeslice of a process.
8172 * @pid: pid of the process.
8173 * @interval: userspace pointer to the timeslice value.
8174 *
8175 * This syscall writes the default timeslice value of a given process
8176 * into the user-space timespec buffer. A value of '0' means infinity.
8177 *
8178 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
8179 * an error code.
8180 */
8181SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
8182 struct __kernel_timespec __user *, interval)
8183{
8184 struct timespec64 t;
8185 int retval = sched_rr_get_interval(pid, &t);
8186
8187 if (retval == 0)
8188 retval = put_timespec64(&t, interval);
8189
8190 return retval;
8191}
8192
8193#ifdef CONFIG_COMPAT_32BIT_TIME
8194SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
8195 struct old_timespec32 __user *, interval)
8196{
8197 struct timespec64 t;
8198 int retval = sched_rr_get_interval(pid, &t);
8199
8200 if (retval == 0)
8201 retval = put_old_timespec32(&t, interval);
8202 return retval;
8203}
8204#endif
8205
8206void sched_show_task(struct task_struct *p)
8207{
8208 unsigned long free = 0;
8209 int ppid;
8210
8211 if (!try_get_task_stack(p))
8212 return;
8213
8214 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
8215
8216 if (task_is_running(p))
8217 pr_cont(" running task ");
8218#ifdef CONFIG_DEBUG_STACK_USAGE
8219 free = stack_not_used(p);
8220#endif
8221 ppid = 0;
8222 rcu_read_lock();
8223 if (pid_alive(p))
8224 ppid = task_pid_nr(rcu_dereference(p->real_parent));
8225 rcu_read_unlock();
8226 pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
8227 free, task_pid_nr(p), ppid,
8228 (unsigned long)task_thread_info(p)->flags);
8229
8230 print_worker_info(KERN_INFO, p);
8231 print_stop_info(KERN_INFO, p);
8232 show_stack(p, NULL, KERN_INFO);
8233 put_task_stack(p);
8234}
8235EXPORT_SYMBOL_GPL(sched_show_task);
8236
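/*
 * Decide whether @p's state matches the show-state filter mask; used by
 * show_state_filter() below when dumping tasks from sysrq.
 */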
8237static inline bool
8238state_filter_match(unsigned long state_filter, struct task_struct *p)
8239{
8240 unsigned int state = READ_ONCE(p->__state);
8241
8242
8243 if (!state_filter)
8244 return true;
8245
8246
8247 if (!(state & state_filter))
8248 return false;
8249
8250
8251
8252
8253
8254 if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
8255 return false;
8256
8257 return true;
8258}
8259
8260
8261void show_state_filter(unsigned int state_filter)
8262{
8263 struct task_struct *g, *p;
8264
8265 rcu_read_lock();
8266 for_each_process_thread(g, p) {
8267
8268
8269
8270
8271
8272
8273
8274 touch_nmi_watchdog();
8275 touch_all_softlockup_watchdogs();
8276 if (state_filter_match(state_filter, p))
8277 sched_show_task(p);
8278 }
8279
8280#ifdef CONFIG_SCHED_DEBUG
8281 if (!state_filter)
8282 sysrq_sched_debug_show();
8283#endif
8284 rcu_read_unlock();
8285
8286
8287
8288 if (!state_filter)
8289 debug_show_all_locks();
8290}
8291
8292/**
8293 * init_idle - set up an idle thread for a given CPU
8294 * @idle: task in question
8295 * @cpu: CPU the idle task belongs to
8296 *
8297 * NOTE: this function does not set the idle thread's NEED_RESCHED
8298 * flag, to make booting more robust.
8299 */
8300void __init init_idle(struct task_struct *idle, int cpu)
8301{
8302 struct rq *rq = cpu_rq(cpu);
8303 unsigned long flags;
8304
8305 __sched_fork(0, idle);
8306
8307
8308
8309
8310
8311
8312
8313 set_kthread_struct(idle);
8314
8315 raw_spin_lock_irqsave(&idle->pi_lock, flags);
8316 raw_spin_rq_lock(rq);
8317
8318 idle->__state = TASK_RUNNING;
8319 idle->se.exec_start = sched_clock();
8320
8321
8322
8323
8324 idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
8325 kthread_set_per_cpu(idle, cpu);
8326
8327 scs_task_reset(idle);
8328 kasan_unpoison_task_stack(idle);
8329
8330#ifdef CONFIG_SMP
8331
8332
8333
8334
8335
8336
8337 set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
8338#endif
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349 rcu_read_lock();
8350 __set_task_cpu(idle, cpu);
8351 rcu_read_unlock();
8352
8353 rq->idle = idle;
8354 rcu_assign_pointer(rq->curr, idle);
8355 idle->on_rq = TASK_ON_RQ_QUEUED;
8356#ifdef CONFIG_SMP
8357 idle->on_cpu = 1;
8358#endif
8359 raw_spin_rq_unlock(rq);
8360 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
8361
8362
8363 init_idle_preempt_count(idle, cpu);
8364
8365 /*
8366 * The idle tasks have their own, simple scheduling class:
8367 */
8368 idle->sched_class = &idle_sched_class;
8369 ftrace_graph_init_idle_task(idle, cpu);
8370 vtime_init_idle(idle, cpu);
8371#ifdef CONFIG_SMP
8372 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
8373#endif
8374}
8375
8376#ifdef CONFIG_SMP
8377
8378int cpuset_cpumask_can_shrink(const struct cpumask *cur,
8379 const struct cpumask *trial)
8380{
8381 int ret = 1;
8382
8383 if (!cpumask_weight(cur))
8384 return ret;
8385
8386 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
8387
8388 return ret;
8389}
8390
8391int task_can_attach(struct task_struct *p,
8392 const struct cpumask *cs_cpus_allowed)
8393{
8394 int ret = 0;
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405 if (p->flags & PF_NO_SETAFFINITY) {
8406 ret = -EINVAL;
8407 goto out;
8408 }
8409
8410 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
8411 cs_cpus_allowed))
8412 ret = dl_task_can_attach(p, cs_cpus_allowed);
8413
8414out:
8415 return ret;
8416}
8417
8418bool sched_smp_initialized __read_mostly;
8419
8420#ifdef CONFIG_NUMA_BALANCING
8421
8422int migrate_task_to(struct task_struct *p, int target_cpu)
8423{
8424 struct migration_arg arg = { p, target_cpu };
8425 int curr_cpu = task_cpu(p);
8426
8427 if (curr_cpu == target_cpu)
8428 return 0;
8429
8430 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
8431 return -EINVAL;
8432
8433
8434
8435 trace_sched_move_numa(p, curr_cpu, target_cpu);
8436 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
8437}
8438
8439
8440
8441
8442
8443void sched_setnuma(struct task_struct *p, int nid)
8444{
8445 bool queued, running;
8446 struct rq_flags rf;
8447 struct rq *rq;
8448
8449 rq = task_rq_lock(p, &rf);
8450 queued = task_on_rq_queued(p);
8451 running = task_current(rq, p);
8452
8453 if (queued)
8454 dequeue_task(rq, p, DEQUEUE_SAVE);
8455 if (running)
8456 put_prev_task(rq, p);
8457
8458 p->numa_preferred_nid = nid;
8459
8460 if (queued)
8461 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
8462 if (running)
8463 set_next_task(rq, p);
8464 task_rq_unlock(rq, p, &rf);
8465}
8466#endif
8467
8468#ifdef CONFIG_HOTPLUG_CPU
8469/*
8470 * Ensure that the idle task is using init_mm right before its CPU goes
8471 * offline.
8472 */
8473void idle_task_exit(void)
8474{
8475 struct mm_struct *mm = current->active_mm;
8476
8477 BUG_ON(cpu_online(smp_processor_id()));
8478 BUG_ON(current != this_rq()->idle);
8479
8480 if (mm != &init_mm) {
8481 switch_mm(mm, &init_mm, current);
8482 finish_arch_post_lock_switch();
8483 }
8484
8485
8486}
8487
8488static int __balance_push_cpu_stop(void *arg)
8489{
8490 struct task_struct *p = arg;
8491 struct rq *rq = this_rq();
8492 struct rq_flags rf;
8493 int cpu;
8494
8495 raw_spin_lock_irq(&p->pi_lock);
8496 rq_lock(rq, &rf);
8497
8498 update_rq_clock(rq);
8499
8500 if (task_rq(p) == rq && task_on_rq_queued(p)) {
8501 cpu = select_fallback_rq(rq->cpu, p);
8502 rq = __migrate_task(rq, &rf, p, cpu);
8503 }
8504
8505 rq_unlock(rq, &rf);
8506 raw_spin_unlock_irq(&p->pi_lock);
8507
8508 put_task_struct(p);
8509
8510 return 0;
8511}
8512
8513static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
8514
8515
8516
8517
8518
8519
8520
8521static void balance_push(struct rq *rq)
8522{
8523 struct task_struct *push_task = rq->curr;
8524
8525 lockdep_assert_rq_held(rq);
8526 SCHED_WARN_ON(rq->cpu != smp_processor_id());
8527
8528
8529
8530
8531 rq->balance_callback = &balance_push_callback;
8532
8533
8534
8535
8536 if (!cpu_dying(rq->cpu))
8537 return;
8538
8539
8540
8541
8542
8543 if (kthread_is_per_cpu(push_task) ||
8544 is_migration_disabled(push_task)) {
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557 if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
8558 rcuwait_active(&rq->hotplug_wait)) {
8559 raw_spin_rq_unlock(rq);
8560 rcuwait_wake_up(&rq->hotplug_wait);
8561 raw_spin_rq_lock(rq);
8562 }
8563 return;
8564 }
8565
8566 get_task_struct(push_task);
8567
8568
8569
8570
8571 raw_spin_rq_unlock(rq);
8572 stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
8573 this_cpu_ptr(&push_work));
8574
8575
8576
8577
8578
8579 raw_spin_rq_lock(rq);
8580}
8581
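/*
 * Install (on) or remove (off) the balance_push callback on @cpu's
 * runqueue. It is armed while the CPU is going down so that tasks get
 * pushed away, and cleared again when the CPU comes back up.
 */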
8582static void balance_push_set(int cpu, bool on)
8583{
8584 struct rq *rq = cpu_rq(cpu);
8585 struct rq_flags rf;
8586
8587 rq_lock_irqsave(rq, &rf);
8588 if (on) {
8589 WARN_ON_ONCE(rq->balance_callback);
8590 rq->balance_callback = &balance_push_callback;
8591 } else if (rq->balance_callback == &balance_push_callback) {
8592 rq->balance_callback = NULL;
8593 }
8594 rq_unlock_irqrestore(rq, &rf);
8595}
8596
8597
8598
8599
8600
8601
8602
8603static void balance_hotplug_wait(void)
8604{
8605 struct rq *rq = this_rq();
8606
8607 rcuwait_wait_event(&rq->hotplug_wait,
8608 rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
8609 TASK_UNINTERRUPTIBLE);
8610}
8611
8612#else
8613
8614static inline void balance_push(struct rq *rq)
8615{
8616}
8617
8618static inline void balance_push_set(int cpu, bool on)
8619{
8620}
8621
8622static inline void balance_hotplug_wait(void)
8623{
8624}
8625
8626#endif
8627
8628void set_rq_online(struct rq *rq)
8629{
8630 if (!rq->online) {
8631 const struct sched_class *class;
8632
8633 cpumask_set_cpu(rq->cpu, rq->rd->online);
8634 rq->online = 1;
8635
8636 for_each_class(class) {
8637 if (class->rq_online)
8638 class->rq_online(rq);
8639 }
8640 }
8641}
8642
8643void set_rq_offline(struct rq *rq)
8644{
8645 if (rq->online) {
8646 const struct sched_class *class;
8647
8648 for_each_class(class) {
8649 if (class->rq_offline)
8650 class->rq_offline(rq);
8651 }
8652
8653 cpumask_clear_cpu(rq->cpu, rq->rd->online);
8654 rq->online = 0;
8655 }
8656}
8657
8658/*
8659 * used to mark begin/end of suspend/resume:
8660 */
8661static int num_cpus_frozen;
8662
8663/*
8664 * Update cpusets according to cpu_active mask.  If cpusets are
8665 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
8666 * around partition_sched_domains().
8667 *
8668 * If we come here as part of a suspend/resume, don't touch cpusets because we
8669 * want to restore it back to its original state upon resume anyway.
8670 */
8671static void cpuset_cpu_active(void)
8672{
8673 if (cpuhp_tasks_frozen) {
8674
8675
8676
8677
8678
8679
8680 partition_sched_domains(1, NULL, NULL);
8681 if (--num_cpus_frozen)
8682 return;
8683
8684
8685
8686
8687
8688 cpuset_force_rebuild();
8689 }
8690 cpuset_update_active_cpus();
8691}
8692
8693static int cpuset_cpu_inactive(unsigned int cpu)
8694{
8695 if (!cpuhp_tasks_frozen) {
8696 if (dl_cpu_busy(cpu))
8697 return -EBUSY;
8698 cpuset_update_active_cpus();
8699 } else {
8700 num_cpus_frozen++;
8701 partition_sched_domains(1, NULL, NULL);
8702 }
8703 return 0;
8704}
8705
8706int sched_cpu_activate(unsigned int cpu)
8707{
8708 struct rq *rq = cpu_rq(cpu);
8709 struct rq_flags rf;
8710
8711
8712
8713
8714
8715 balance_push_set(cpu, false);
8716
8717#ifdef CONFIG_SCHED_SMT
8718
8719
8720
8721 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
8722 static_branch_inc_cpuslocked(&sched_smt_present);
8723#endif
8724 set_cpu_active(cpu, true);
8725
8726 if (sched_smp_initialized) {
8727 sched_domains_numa_masks_set(cpu);
8728 cpuset_cpu_active();
8729 }
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740 rq_lock_irqsave(rq, &rf);
8741 if (rq->rd) {
8742 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
8743 set_rq_online(rq);
8744 }
8745 rq_unlock_irqrestore(rq, &rf);
8746
8747 return 0;
8748}
8749
8750int sched_cpu_deactivate(unsigned int cpu)
8751{
8752 struct rq *rq = cpu_rq(cpu);
8753 struct rq_flags rf;
8754 int ret;
8755
8756
8757
8758
8759
8760 nohz_balance_exit_idle(rq);
8761
8762 set_cpu_active(cpu, false);
8763
8764
8765
8766
8767
8768
8769
8770 balance_push_set(cpu, true);
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782 synchronize_rcu();
8783
8784 rq_lock_irqsave(rq, &rf);
8785 if (rq->rd) {
8786 update_rq_clock(rq);
8787 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
8788 set_rq_offline(rq);
8789 }
8790 rq_unlock_irqrestore(rq, &rf);
8791
8792#ifdef CONFIG_SCHED_SMT
8793
8794
8795
8796 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
8797 static_branch_dec_cpuslocked(&sched_smt_present);
8798
8799 sched_core_cpu_deactivate(cpu);
8800#endif
8801
8802 if (!sched_smp_initialized)
8803 return 0;
8804
8805 ret = cpuset_cpu_inactive(cpu);
8806 if (ret) {
8807 balance_push_set(cpu, false);
8808 set_cpu_active(cpu, true);
8809 return ret;
8810 }
8811 sched_domains_numa_masks_clear(cpu);
8812 return 0;
8813}
8814
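/*
 * Re-arm the runqueue's load-average update window and refresh the
 * maximum load-balance interval when a CPU starts up.
 */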
8815static void sched_rq_cpu_starting(unsigned int cpu)
8816{
8817 struct rq *rq = cpu_rq(cpu);
8818
8819 rq->calc_load_update = calc_load_update;
8820 update_max_interval();
8821}
8822
8823int sched_cpu_starting(unsigned int cpu)
8824{
8825 sched_core_cpu_starting(cpu);
8826 sched_rq_cpu_starting(cpu);
8827 sched_tick_start(cpu);
8828 return 0;
8829}
8830
8831#ifdef CONFIG_HOTPLUG_CPU
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844int sched_cpu_wait_empty(unsigned int cpu)
8845{
8846 balance_hotplug_wait();
8847 return 0;
8848}
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859static void calc_load_migrate(struct rq *rq)
8860{
8861 long delta = calc_load_fold_active(rq, 1);
8862
8863 if (delta)
8864 atomic_long_add(delta, &calc_load_tasks);
8865}
8866
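/*
 * Print every task still enqueued on @rq; called from sched_cpu_dying()
 * when the outgoing CPU was not properly vacated.
 */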
8867static void dump_rq_tasks(struct rq *rq, const char *loglvl)
8868{
8869 struct task_struct *g, *p;
8870 int cpu = cpu_of(rq);
8871
8872 lockdep_assert_rq_held(rq);
8873
8874 printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
8875 for_each_process_thread(g, p) {
8876 if (task_cpu(p) != cpu)
8877 continue;
8878
8879 if (!task_on_rq_queued(p))
8880 continue;
8881
8882 printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
8883 }
8884}
8885
8886int sched_cpu_dying(unsigned int cpu)
8887{
8888 struct rq *rq = cpu_rq(cpu);
8889 struct rq_flags rf;
8890
8891
8892 sched_tick_stop(cpu);
8893
8894 rq_lock_irqsave(rq, &rf);
8895 if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
8896 WARN(true, "Dying CPU not properly vacated!");
8897 dump_rq_tasks(rq, KERN_WARNING);
8898 }
8899 rq_unlock_irqrestore(rq, &rf);
8900
8901 calc_load_migrate(rq);
8902 update_max_interval();
8903 hrtick_clear(rq);
8904 sched_core_cpu_dying(cpu);
8905 return 0;
8906}
8907#endif
8908
8909void __init sched_init_smp(void)
8910{
8911 sched_init_numa();
8912
8913 /*
8914 * There's no userspace yet to cause hotplug operations; hence all the
8915 * CPU masks are stable and all blatant races in the below code cannot
8916 * happen.
8917 */
8918 mutex_lock(&sched_domains_mutex);
8919 sched_init_domains(cpu_active_mask);
8920 mutex_unlock(&sched_domains_mutex);
8921
8922 /* Move init over to a non-isolated CPU */
8923 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
8924 BUG();
8925 current->flags &= ~PF_NO_SETAFFINITY;
8926 sched_init_granularity();
8927
8928 init_sched_rt_class();
8929 init_sched_dl_class();
8930
8931 sched_smp_initialized = true;
8932}
8933
8934static int __init migration_init(void)
8935{
8936 sched_cpu_starting(smp_processor_id());
8937 return 0;
8938}
8939early_initcall(migration_init);
8940
8941#else
8942void __init sched_init_smp(void)
8943{
8944 sched_init_granularity();
8945}
8946#endif
8947
8948int in_sched_functions(unsigned long addr)
8949{
8950 return in_lock_functions(addr) ||
8951 (addr >= (unsigned long)__sched_text_start
8952 && addr < (unsigned long)__sched_text_end);
8953}
8954
8955#ifdef CONFIG_CGROUP_SCHED
8956/*
8957 * Default task group.
8958 * Every task in system belongs to this group at bootup.
8959 */
8960struct task_group root_task_group;
8961LIST_HEAD(task_groups);
8962
8963/* Cacheline aligned slab cache for task_group */
8964static struct kmem_cache *task_group_cache __read_mostly;
8965#endif
8966
8967DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
8968DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
8969
8970void __init sched_init(void)
8971{
8972 unsigned long ptr = 0;
8973 int i;
8974
8975 /* Make sure the linker didn't screw up */
8976 BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
8977 &fair_sched_class + 1 != &rt_sched_class ||
8978 &rt_sched_class + 1 != &dl_sched_class);
8979#ifdef CONFIG_SMP
8980 BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
8981#endif
8982
8983 wait_bit_init();
8984
8985#ifdef CONFIG_FAIR_GROUP_SCHED
8986 ptr += 2 * nr_cpu_ids * sizeof(void **);
8987#endif
8988#ifdef CONFIG_RT_GROUP_SCHED
8989 ptr += 2 * nr_cpu_ids * sizeof(void **);
8990#endif
8991 if (ptr) {
8992 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
8993
8994#ifdef CONFIG_FAIR_GROUP_SCHED
8995 root_task_group.se = (struct sched_entity **)ptr;
8996 ptr += nr_cpu_ids * sizeof(void **);
8997
8998 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8999 ptr += nr_cpu_ids * sizeof(void **);
9000
9001 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
9002 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
9003#endif
9004#ifdef CONFIG_RT_GROUP_SCHED
9005 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9006 ptr += nr_cpu_ids * sizeof(void **);
9007
9008 root_task_group.rt_rq = (struct rt_rq **)ptr;
9009 ptr += nr_cpu_ids * sizeof(void **);
9010
9011#endif
9012 }
9013#ifdef CONFIG_CPUMASK_OFFSTACK
9014 for_each_possible_cpu(i) {
9015 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
9016 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9017 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
9018 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9019 }
9020#endif
9021
9022 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
9023 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
9024
9025#ifdef CONFIG_SMP
9026 init_defrootdomain();
9027#endif
9028
9029#ifdef CONFIG_RT_GROUP_SCHED
9030 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9031 global_rt_period(), global_rt_runtime());
9032#endif
9033
9034#ifdef CONFIG_CGROUP_SCHED
9035 task_group_cache = KMEM_CACHE(task_group, 0);
9036
9037 list_add(&root_task_group.list, &task_groups);
9038 INIT_LIST_HEAD(&root_task_group.children);
9039 INIT_LIST_HEAD(&root_task_group.siblings);
9040 autogroup_init(&init_task);
9041#endif
9042
9043 for_each_possible_cpu(i) {
9044 struct rq *rq;
9045
9046 rq = cpu_rq(i);
9047 raw_spin_lock_init(&rq->__lock);
9048 rq->nr_running = 0;
9049 rq->calc_load_active = 0;
9050 rq->calc_load_update = jiffies + LOAD_FREQ;
9051 init_cfs_rq(&rq->cfs);
9052 init_rt_rq(&rq->rt);
9053 init_dl_rq(&rq->dl);
9054#ifdef CONFIG_FAIR_GROUP_SCHED
9055 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
9056 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
9057 /*
9058 * How much CPU bandwidth does root_task_group get?
9059 *
9060 * In case of task-groups formed through the cgroup filesystem, it
9061 * gets 100% of the CPU resources in the system. This overall
9062 * system CPU resource is divided among the tasks of
9063 * root_task_group and its child task-groups in a fair manner,
9064 * based on each entity's (task or task-group's) weight
9065 * (se->load.weight).
9066 *
9067 * In other words, if root_task_group has 10 tasks of weight
9068 * 1024 and two child groups A0 and A1 (of weight 1024 each),
9069 * then A0's share of the CPU resource is:
9070 *
9071 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) * 100
9072 *
9073 * We achieve this by letting root_task_group's tasks sit
9074 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
9075 */
9076 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
9077#endif
9078
9079 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
9080#ifdef CONFIG_RT_GROUP_SCHED
9081 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
9082#endif
9083#ifdef CONFIG_SMP
9084 rq->sd = NULL;
9085 rq->rd = NULL;
9086 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
9087 rq->balance_callback = &balance_push_callback;
9088 rq->active_balance = 0;
9089 rq->next_balance = jiffies;
9090 rq->push_cpu = 0;
9091 rq->cpu = i;
9092 rq->online = 0;
9093 rq->idle_stamp = 0;
9094 rq->avg_idle = 2*sysctl_sched_migration_cost;
9095 rq->wake_stamp = jiffies;
9096 rq->wake_avg_idle = rq->avg_idle;
9097 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
9098
9099 INIT_LIST_HEAD(&rq->cfs_tasks);
9100
9101 rq_attach_root(rq, &def_root_domain);
9102#ifdef CONFIG_NO_HZ_COMMON
9103 rq->last_blocked_load_update_tick = jiffies;
9104 atomic_set(&rq->nohz_flags, 0);
9105
9106 INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
9107#endif
9108#ifdef CONFIG_HOTPLUG_CPU
9109 rcuwait_init(&rq->hotplug_wait);
9110#endif
9111#endif
9112 hrtick_rq_init(rq);
9113 atomic_set(&rq->nr_iowait, 0);
9114
9115#ifdef CONFIG_SCHED_CORE
9116 rq->core = rq;
9117 rq->core_pick = NULL;
9118 rq->core_enabled = 0;
9119 rq->core_tree = RB_ROOT;
9120 rq->core_forceidle = false;
9121
9122 rq->core_cookie = 0UL;
9123#endif
9124 }
9125
9126 set_load_weight(&init_task, false);
9127
9128 /*
9129 * The boot idle thread does lazy MMU switching as well:
9130 */
9131 mmgrab(&init_mm);
9132 enter_lazy_tlb(&init_mm, current);
9133
9134 /*
9135 * Make us the idle thread. Technically, schedule() should not be
9136 * called from this thread, however somewhere below it might be,
9137 * but because we are the idle thread, we just pick up running again
9138 * when this runqueue becomes "idle".
9139 */
9140 init_idle(current, smp_processor_id());
9141
9142 calc_load_update = jiffies + LOAD_FREQ;
9143
9144#ifdef CONFIG_SMP
9145 idle_thread_set_boot_cpu();
9146 balance_push_set(smp_processor_id(), false);
9147#endif
9148 init_sched_fair_class();
9149
9150 psi_init();
9151
9152 init_uclamp();
9153
9154 scheduler_running = 1;
9155}
9156
9157#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9158static inline int preempt_count_equals(int preempt_offset)
9159{
9160 int nested = preempt_count() + rcu_preempt_depth();
9161
9162 return (nested == preempt_offset);
9163}
9164
9165void __might_sleep(const char *file, int line, int preempt_offset)
9166{
9167 unsigned int state = get_current_state();
9168
9169 /*
9170 * Blocking primitives will set (and therefore destroy) current->state,
9171 * since we will exit with TASK_RUNNING make sure we enter with it.
9172 */
9173 WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
9174 "do not call blocking ops when !TASK_RUNNING; "
9175 "state=%x set at [<%p>] %pS\n", state,
9176 (void *)current->task_state_change,
9177 (void *)current->task_state_change);
9178
9179 ___might_sleep(file, line, preempt_offset);
9180}
9181EXPORT_SYMBOL(__might_sleep);
9182
9183void ___might_sleep(const char *file, int line, int preempt_offset)
9184{
9185
9186 static unsigned long prev_jiffy;
9187
9188 unsigned long preempt_disable_ip;
9189
9190
9191 rcu_sleep_check();
9192
9193 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
9194 !is_idle_task(current) && !current->non_block_count) ||
9195 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
9196 oops_in_progress)
9197 return;
9198
9199 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9200 return;
9201 prev_jiffy = jiffies;
9202
9203
9204 preempt_disable_ip = get_preempt_disable_ip(current);
9205
9206 printk(KERN_ERR
9207 "BUG: sleeping function called from invalid context at %s:%d\n",
9208 file, line);
9209 printk(KERN_ERR
9210 "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
9211 in_atomic(), irqs_disabled(), current->non_block_count,
9212 current->pid, current->comm);
9213
9214 if (task_stack_end_corrupted(current))
9215 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
9216
9217 debug_show_held_locks(current);
9218 if (irqs_disabled())
9219 print_irqtrace_events(current);
9220 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
9221 && !preempt_count_equals(preempt_offset)) {
9222 pr_err("Preemption disabled at:");
9223 print_ip_sym(KERN_ERR, preempt_disable_ip);
9224 }
9225 dump_stack();
9226 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9227}
9228EXPORT_SYMBOL(___might_sleep);
9229
9230void __cant_sleep(const char *file, int line, int preempt_offset)
9231{
9232 static unsigned long prev_jiffy;
9233
9234 if (irqs_disabled())
9235 return;
9236
9237 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9238 return;
9239
9240 if (preempt_count() > preempt_offset)
9241 return;
9242
9243 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9244 return;
9245 prev_jiffy = jiffies;
9246
9247 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
9248 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
9249 in_atomic(), irqs_disabled(),
9250 current->pid, current->comm);
9251
9252 debug_show_held_locks(current);
9253 dump_stack();
9254 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9255}
9256EXPORT_SYMBOL_GPL(__cant_sleep);
9257
9258#ifdef CONFIG_SMP
9259void __cant_migrate(const char *file, int line)
9260{
9261 static unsigned long prev_jiffy;
9262
9263 if (irqs_disabled())
9264 return;
9265
9266 if (is_migration_disabled(current))
9267 return;
9268
9269 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9270 return;
9271
9272 if (preempt_count() > 0)
9273 return;
9274
9275 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9276 return;
9277 prev_jiffy = jiffies;
9278
9279 pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
9280 pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
9281 in_atomic(), irqs_disabled(), is_migration_disabled(current),
9282 current->pid, current->comm);
9283
9284 debug_show_held_locks(current);
9285 dump_stack();
9286 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9287}
9288EXPORT_SYMBOL_GPL(__cant_migrate);
9289#endif
9290#endif
9291
9292#ifdef CONFIG_MAGIC_SYSRQ
9293void normalize_rt_tasks(void)
9294{
9295 struct task_struct *g, *p;
9296 struct sched_attr attr = {
9297 .sched_policy = SCHED_NORMAL,
9298 };
9299
9300 read_lock(&tasklist_lock);
9301 for_each_process_thread(g, p) {
 /*
  * Only normalize user tasks:
  */
9305 if (p->flags & PF_KTHREAD)
9306 continue;
9307
9308 p->se.exec_start = 0;
9309 schedstat_set(p->se.statistics.wait_start, 0);
9310 schedstat_set(p->se.statistics.sleep_start, 0);
9311 schedstat_set(p->se.statistics.block_start, 0);
9312
9313 if (!dl_task(p) && !rt_task(p)) {
 /*
  * Renice negative nice level userspace
  * tasks back to 0:
  */
9318 if (task_nice(p) < 0)
9319 set_user_nice(p, 0);
9320 continue;
9321 }
9322
9323 __sched_setscheduler(p, &attr, false, false);
9324 }
9325 read_unlock(&tasklist_lock);
9326}
9327
9328#endif
9329
9330#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
9349struct task_struct *curr_task(int cpu)
9350{
9351 return cpu_curr(cpu);
9352}
9353
9354#endif
9355
9356#ifdef CONFIG_IA64
/**
 * ia64_set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a CPU in a non-blocking manner. This function
 * must be called with all CPUs synchronized and interrupts disabled; the
 * caller must save the original value of the current task (see curr_task()
 * above) and restore that value before re-enabling interrupts and restarting
 * the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
9372void ia64_set_curr_task(int cpu, struct task_struct *p)
9373{
9374 cpu_curr(cpu) = p;
9375}
9376
9377#endif
9378
9379#ifdef CONFIG_CGROUP_SCHED
9380
9381static DEFINE_SPINLOCK(task_group_lock);
9382
9383static inline void alloc_uclamp_sched_group(struct task_group *tg,
9384 struct task_group *parent)
9385{
9386#ifdef CONFIG_UCLAMP_TASK_GROUP
9387 enum uclamp_id clamp_id;
9388
9389 for_each_clamp_id(clamp_id) {
9390 uclamp_se_set(&tg->uclamp_req[clamp_id],
9391 uclamp_none(clamp_id), false);
9392 tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
9393 }
9394#endif
9395}
9396
9397static void sched_free_group(struct task_group *tg)
9398{
9399 free_fair_sched_group(tg);
9400 free_rt_sched_group(tg);
9401 autogroup_free(tg);
9402 kmem_cache_free(task_group_cache, tg);
9403}
9404
/* Allocate runqueue etc for a new task group: */
9406struct task_group *sched_create_group(struct task_group *parent)
9407{
9408 struct task_group *tg;
9409
9410 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
9411 if (!tg)
9412 return ERR_PTR(-ENOMEM);
9413
9414 if (!alloc_fair_sched_group(tg, parent))
9415 goto err;
9416
9417 if (!alloc_rt_sched_group(tg, parent))
9418 goto err;
9419
9420 alloc_uclamp_sched_group(tg, parent);
9421
9422 return tg;
9423
9424err:
9425 sched_free_group(tg);
9426 return ERR_PTR(-ENOMEM);
9427}
9428
9429void sched_online_group(struct task_group *tg, struct task_group *parent)
9430{
9431 unsigned long flags;
9432
9433 spin_lock_irqsave(&task_group_lock, flags);
9434 list_add_rcu(&tg->list, &task_groups);
9435
 /* Root should already exist: */
9437 WARN_ON(!parent);
9438
9439 tg->parent = parent;
9440 INIT_LIST_HEAD(&tg->children);
9441 list_add_rcu(&tg->siblings, &parent->children);
9442 spin_unlock_irqrestore(&task_group_lock, flags);
9443
9444 online_fair_sched_group(tg);
9445}
9446
/* RCU callback to free various structures associated with a task group: */
9448static void sched_free_group_rcu(struct rcu_head *rhp)
9449{
 /* Now it should be safe to free those cfs_rqs: */
9451 sched_free_group(container_of(rhp, struct task_group, rcu));
9452}
9453
9454void sched_destroy_group(struct task_group *tg)
9455{
 /* Wait for possible concurrent references to cfs_rqs to complete: */
9457 call_rcu(&tg->rcu, sched_free_group_rcu);
9458}
9459
9460void sched_offline_group(struct task_group *tg)
9461{
9462 unsigned long flags;
9463
 /* End participation in shares distribution: */
9465 unregister_fair_sched_group(tg);
9466
9467 spin_lock_irqsave(&task_group_lock, flags);
9468 list_del_rcu(&tg->list);
9469 list_del_rcu(&tg->siblings);
9470 spin_unlock_irqrestore(&task_group_lock, flags);
9471}
9472
9473static void sched_change_group(struct task_struct *tsk, int type)
9474{
9475 struct task_group *tg;
9476
 /*
  * All callers are synchronized by task_rq_lock(); we do not use RCU
  * which is pointless here. Thus, we pass "true" to task_css_check()
  * to prevent lockdep warnings.
  */
9482 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
9483 struct task_group, css);
9484 tg = autogroup_task_group(tsk, tg);
9485 tsk->sched_task_group = tg;
9486
9487#ifdef CONFIG_FAIR_GROUP_SCHED
9488 if (tsk->sched_class->task_change_group)
9489 tsk->sched_class->task_change_group(tsk, type);
9490 else
9491#endif
9492 set_task_rq(tsk, task_cpu(tsk));
9493}
9494
/*
 * Change task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now. This function just updates tsk->sched_task_group with the group
 * pointers.
 */
9502void sched_move_task(struct task_struct *tsk)
9503{
9504 int queued, running, queue_flags =
9505 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
9506 struct rq_flags rf;
9507 struct rq *rq;
9508
9509 rq = task_rq_lock(tsk, &rf);
9510 update_rq_clock(rq);
9511
9512 running = task_current(rq, tsk);
9513 queued = task_on_rq_queued(tsk);
9514
9515 if (queued)
9516 dequeue_task(rq, tsk, queue_flags);
9517 if (running)
9518 put_prev_task(rq, tsk);
9519
9520 sched_change_group(tsk, TASK_MOVE_GROUP);
9521
9522 if (queued)
9523 enqueue_task(rq, tsk, queue_flags);
9524 if (running) {
9525 set_next_task(rq, tsk);
 /*
  * After changing group, the running task may have joined a
  * throttled one but it's still the running task. Trigger a
  * resched to make sure that task can still run.
  */
9531 resched_curr(rq);
9532 }
9533
9534 task_rq_unlock(rq, tsk, &rf);
9535}
9536
9537static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
9538{
9539 return css ? container_of(css, struct task_group, css) : NULL;
9540}
9541
9542static struct cgroup_subsys_state *
9543cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
9544{
9545 struct task_group *parent = css_tg(parent_css);
9546 struct task_group *tg;
9547
9548 if (!parent) {
 /* This is early initialization for the top cgroup: */
9550 return &root_task_group.css;
9551 }
9552
9553 tg = sched_create_group(parent);
9554 if (IS_ERR(tg))
9555 return ERR_PTR(-ENOMEM);
9556
9557 return &tg->css;
9558}
9559
/* Expose the task group only after completing cgroup initialization: */
9561static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
9562{
9563 struct task_group *tg = css_tg(css);
9564 struct task_group *parent = css_tg(css->parent);
9565
9566 if (parent)
9567 sched_online_group(tg, parent);
9568
9569#ifdef CONFIG_UCLAMP_TASK_GROUP
 /* Propagate the effective uclamp value for the new group: */
9571 mutex_lock(&uclamp_mutex);
9572 rcu_read_lock();
9573 cpu_util_update_eff(css);
9574 rcu_read_unlock();
9575 mutex_unlock(&uclamp_mutex);
9576#endif
9577
9578 return 0;
9579}
9580
9581static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
9582{
9583 struct task_group *tg = css_tg(css);
9584
9585 sched_offline_group(tg);
9586}
9587
9588static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
9589{
9590 struct task_group *tg = css_tg(css);
9591
 /*
  * Relies on the RCU grace period between css_released() and this.
  */
9595 sched_free_group(tg);
9596}
9597
/*
 * This is called before wake_up_new_task(), therefore we really only
 * have to set its group bits, all the other stuff does not apply.
 */
9602static void cpu_cgroup_fork(struct task_struct *task)
9603{
9604 struct rq_flags rf;
9605 struct rq *rq;
9606
9607 rq = task_rq_lock(task, &rf);
9608
9609 update_rq_clock(rq);
9610 sched_change_group(task, TASK_SET_GROUP);
9611
9612 task_rq_unlock(rq, task, &rf);
9613}
9614
9615static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
9616{
9617 struct task_struct *task;
9618 struct cgroup_subsys_state *css;
9619 int ret = 0;
9620
9621 cgroup_taskset_for_each(task, css, tset) {
9622#ifdef CONFIG_RT_GROUP_SCHED
9623 if (!sched_rt_can_attach(css_tg(css), task))
9624 return -EINVAL;
9625#endif
9626
 /*
  * Serialize against wake_up_new_task() such that if it's
  * running, we're sure to observe its full state.
  */
9630 raw_spin_lock_irq(&task->pi_lock);
9631
 /*
  * Avoid calling sched_move_task() before wake_up_new_task()
  * has happened. This would lead to problems with PELT, due to
  * move wanting to detach+attach while we're not attached yet.
  */
9636 if (READ_ONCE(task->__state) == TASK_NEW)
9637 ret = -EINVAL;
9638 raw_spin_unlock_irq(&task->pi_lock);
9639
9640 if (ret)
9641 break;
9642 }
9643 return ret;
9644}
9645
9646static void cpu_cgroup_attach(struct cgroup_taskset *tset)
9647{
9648 struct task_struct *task;
9649 struct cgroup_subsys_state *css;
9650
9651 cgroup_taskset_for_each(task, css, tset)
9652 sched_move_task(task);
9653}
9654
9655#ifdef CONFIG_UCLAMP_TASK_GROUP
9656static void cpu_util_update_eff(struct cgroup_subsys_state *css)
9657{
9658 struct cgroup_subsys_state *top_css = css;
9659 struct uclamp_se *uc_parent = NULL;
9660 struct uclamp_se *uc_se = NULL;
9661 unsigned int eff[UCLAMP_CNT];
9662 enum uclamp_id clamp_id;
9663 unsigned int clamps;
9664
9665 lockdep_assert_held(&uclamp_mutex);
9666 SCHED_WARN_ON(!rcu_read_lock_held());
9667
9668 css_for_each_descendant_pre(css, top_css) {
9669 uc_parent = css_tg(css)->parent
9670 ? css_tg(css)->parent->uclamp : NULL;
9671
9672 for_each_clamp_id(clamp_id) {
9673
9674 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
9675
9676 if (uc_parent &&
9677 eff[clamp_id] > uc_parent[clamp_id].value) {
9678 eff[clamp_id] = uc_parent[clamp_id].value;
9679 }
9680 }
9681
9682 eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
9683
9684
9685 clamps = 0x0;
9686 uc_se = css_tg(css)->uclamp;
9687 for_each_clamp_id(clamp_id) {
9688 if (eff[clamp_id] == uc_se[clamp_id].value)
9689 continue;
9690 uc_se[clamp_id].value = eff[clamp_id];
9691 uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
9692 clamps |= (0x1 << clamp_id);
9693 }
9694 if (!clamps) {
9695 css = css_rightmost_descendant(css);
9696 continue;
9697 }
9698
9699
9700 uclamp_update_active_tasks(css);
9701 }
9702}
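
/*
 * The net effect is that a group's effective clamp never exceeds its
 * parent's: if a parent's effective uclamp.max is 50% (util 512) and a
 * child requests 80% (util ~819), the child's effective uclamp.max is
 * capped at 512; uclamp.min is additionally capped by the group's own
 * effective uclamp.max. Subtrees whose effective values did not change
 * are skipped via css_rightmost_descendant().
 */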
9703
/*
 * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
 * C expression. Since there is no way to convert a macro argument (N) into a
 * character constant, use two levels of macros.
 */
9709#define _POW10(exp) ((unsigned int)1e##exp)
9710#define POW10(exp) _POW10(exp)
9711
9712struct uclamp_request {
9713#define UCLAMP_PERCENT_SHIFT 2
9714#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
9715 s64 percent;
9716 u64 util;
9717 int ret;
9718};
9719
9720static inline struct uclamp_request
9721capacity_from_percent(char *buf)
9722{
9723 struct uclamp_request req = {
9724 .percent = UCLAMP_PERCENT_SCALE,
9725 .util = SCHED_CAPACITY_SCALE,
9726 .ret = 0,
9727 };
9728
9729 buf = strim(buf);
9730 if (strcmp(buf, "max")) {
9731 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
9732 &req.percent);
9733 if (req.ret)
9734 return req;
9735 if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
9736 req.ret = -ERANGE;
9737 return req;
9738 }
9739
9740 req.util = req.percent << SCHED_CAPACITY_SHIFT;
9741 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
9742 }
9743
9744 return req;
9745}
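
/*
 * With UCLAMP_PERCENT_SHIFT == 2, a write of "50" is parsed by
 * cgroup_parse_float() into percent == 5000 (50.00%), which the code above
 * converts to util == DIV_ROUND_CLOSEST_ULL(5000 << SCHED_CAPACITY_SHIFT,
 * 10000) == 512, i.e. half of SCHED_CAPACITY_SCALE. The literal "max"
 * keeps the defaults of 100% and SCHED_CAPACITY_SCALE.
 */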
9746
9747static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
9748 size_t nbytes, loff_t off,
9749 enum uclamp_id clamp_id)
9750{
9751 struct uclamp_request req;
9752 struct task_group *tg;
9753
9754 req = capacity_from_percent(buf);
9755 if (req.ret)
9756 return req.ret;
9757
9758 static_branch_enable(&sched_uclamp_used);
9759
9760 mutex_lock(&uclamp_mutex);
9761 rcu_read_lock();
9762
9763 tg = css_tg(of_css(of));
9764 if (tg->uclamp_req[clamp_id].value != req.util)
9765 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
9766
 /*
  * Because of the way the kernel stores the clamp internally, the
  * requested percentage may not be exactly representable; record the
  * value written by user-space so it can be reported back verbatim
  * when the attribute is read.
  */
9771 tg->uclamp_pct[clamp_id] = req.percent;
9772
 /* Update effective clamps to track the most restrictive clamp value: */
9774 cpu_util_update_eff(of_css(of));
9775
9776 rcu_read_unlock();
9777 mutex_unlock(&uclamp_mutex);
9778
9779 return nbytes;
9780}
9781
9782static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
9783 char *buf, size_t nbytes,
9784 loff_t off)
9785{
9786 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
9787}
9788
9789static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
9790 char *buf, size_t nbytes,
9791 loff_t off)
9792{
9793 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
9794}
9795
9796static inline void cpu_uclamp_print(struct seq_file *sf,
9797 enum uclamp_id clamp_id)
9798{
9799 struct task_group *tg;
9800 u64 util_clamp;
9801 u64 percent;
9802 u32 rem;
9803
9804 rcu_read_lock();
9805 tg = css_tg(seq_css(sf));
9806 util_clamp = tg->uclamp_req[clamp_id].value;
9807 rcu_read_unlock();
9808
9809 if (util_clamp == SCHED_CAPACITY_SCALE) {
9810 seq_puts(sf, "max\n");
9811 return;
9812 }
9813
9814 percent = tg->uclamp_pct[clamp_id];
9815 percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
9816 seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
9817}
9818
9819static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
9820{
9821 cpu_uclamp_print(sf, UCLAMP_MIN);
9822 return 0;
9823}
9824
9825static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
9826{
9827 cpu_uclamp_print(sf, UCLAMP_MAX);
9828 return 0;
9829}
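
/*
 * These back the cpu.uclamp.min and cpu.uclamp.max files (present in both
 * the legacy and default cftype tables below). The value is written and
 * read back as a percentage with two decimal places, e.g.:
 *
 *	echo "75.5" > cpu.uclamp.min	(read back as "75.50")
 *
 * which capacity_from_percent() translates into a utilization clamp of
 * about 773 out of SCHED_CAPACITY_SCALE (1024).
 */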
9830#endif
9831
9832#ifdef CONFIG_FAIR_GROUP_SCHED
9833static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
9834 struct cftype *cftype, u64 shareval)
9835{
9836 if (shareval > scale_load_down(ULONG_MAX))
9837 shareval = MAX_SHARES;
9838 return sched_group_set_shares(css_tg(css), scale_load(shareval));
9839}
9840
9841static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
9842 struct cftype *cft)
9843{
9844 struct task_group *tg = css_tg(css);
9845
9846 return (u64) scale_load_down(tg->shares);
9847}
9848
9849#ifdef CONFIG_CFS_BANDWIDTH
9850static DEFINE_MUTEX(cfs_constraints_mutex);
9851
9852const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
9853static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
9854
9855static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
9856
9857static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9858
9859static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
9860 u64 burst)
9861{
9862 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9863 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9864
9865 if (tg == &root_task_group)
9866 return -EINVAL;
9867
 /*
  * Ensure we have some amount of bandwidth every period: both quota and
  * period must be at least min_cfs_quota_period. This prevents reaching
  * a state of large arrears when throttled, which would result in
  * prolonged exit starvation.
  */
9873 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9874 return -EINVAL;
9875
 /*
  * Likewise, bound things on the other side by preventing insane quota
  * periods. This also allows us to normalize in computing quota
  * feasibility.
  */
9881 if (period > max_cfs_quota_period)
9882 return -EINVAL;
9883
 /* Bound quota to defend quota against overflow during bandwidth shift: */
9887 if (quota != RUNTIME_INF && quota > max_cfs_runtime)
9888 return -EINVAL;
9889
9890 if (quota != RUNTIME_INF && (burst > quota ||
9891 burst + quota > max_cfs_runtime))
9892 return -EINVAL;
9893
 /*
  * Prevent a race between setting of cfs_rq->runtime_enabled and
  * unthrottle_offline_cfs_rqs().
  */
9898 get_online_cpus();
9899 mutex_lock(&cfs_constraints_mutex);
9900 ret = __cfs_schedulable(tg, period, quota);
9901 if (ret)
9902 goto out_unlock;
9903
9904 runtime_enabled = quota != RUNTIME_INF;
9905 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9906
 /*
  * If we need to toggle cfs_bandwidth_used, off->on must occur
  * before making related changes, and on->off must occur afterwards.
  */
9910 if (runtime_enabled && !runtime_was_enabled)
9911 cfs_bandwidth_usage_inc();
9912 raw_spin_lock_irq(&cfs_b->lock);
9913 cfs_b->period = ns_to_ktime(period);
9914 cfs_b->quota = quota;
9915 cfs_b->burst = burst;
9916
9917 __refill_cfs_bandwidth_runtime(cfs_b);
9918
9919
9920 if (runtime_enabled)
9921 start_cfs_bandwidth(cfs_b);
9922
9923 raw_spin_unlock_irq(&cfs_b->lock);
9924
9925 for_each_online_cpu(i) {
9926 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9927 struct rq *rq = cfs_rq->rq;
9928 struct rq_flags rf;
9929
9930 rq_lock_irq(rq, &rf);
9931 cfs_rq->runtime_enabled = runtime_enabled;
9932 cfs_rq->runtime_remaining = 0;
9933
9934 if (cfs_rq->throttled)
9935 unthrottle_cfs_rq(cfs_rq);
9936 rq_unlock_irq(rq, &rf);
9937 }
9938 if (runtime_was_enabled && !runtime_enabled)
9939 cfs_bandwidth_usage_dec();
9940out_unlock:
9941 mutex_unlock(&cfs_constraints_mutex);
9942 put_online_cpus();
9943
9944 return ret;
9945}
9946
9947static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9948{
9949 u64 quota, period, burst;
9950
9951 period = ktime_to_ns(tg->cfs_bandwidth.period);
9952 burst = tg->cfs_bandwidth.burst;
9953 if (cfs_quota_us < 0)
9954 quota = RUNTIME_INF;
9955 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
9956 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9957 else
9958 return -EINVAL;
9959
9960 return tg_set_cfs_bandwidth(tg, period, quota, burst);
9961}
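
/*
 * For example, with the default 100ms period, writing 50000 to
 * cpu.cfs_quota_us limits the group to 50ms of CPU time per period
 * (about half of one CPU), while any negative value maps to RUNTIME_INF
 * and removes the bandwidth limit entirely.
 */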
9962
9963static long tg_get_cfs_quota(struct task_group *tg)
9964{
9965 u64 quota_us;
9966
9967 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9968 return -1;
9969
9970 quota_us = tg->cfs_bandwidth.quota;
9971 do_div(quota_us, NSEC_PER_USEC);
9972
9973 return quota_us;
9974}
9975
9976static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9977{
9978 u64 quota, period, burst;
9979
9980 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
9981 return -EINVAL;
9982
9983 period = (u64)cfs_period_us * NSEC_PER_USEC;
9984 quota = tg->cfs_bandwidth.quota;
9985 burst = tg->cfs_bandwidth.burst;
9986
9987 return tg_set_cfs_bandwidth(tg, period, quota, burst);
9988}
9989
9990static long tg_get_cfs_period(struct task_group *tg)
9991{
9992 u64 cfs_period_us;
9993
9994 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9995 do_div(cfs_period_us, NSEC_PER_USEC);
9996
9997 return cfs_period_us;
9998}
9999
10000static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
10001{
10002 u64 quota, period, burst;
10003
10004 if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
10005 return -EINVAL;
10006
10007 burst = (u64)cfs_burst_us * NSEC_PER_USEC;
10008 period = ktime_to_ns(tg->cfs_bandwidth.period);
10009 quota = tg->cfs_bandwidth.quota;
10010
10011 return tg_set_cfs_bandwidth(tg, period, quota, burst);
10012}
10013
10014static long tg_get_cfs_burst(struct task_group *tg)
10015{
10016 u64 burst_us;
10017
10018 burst_us = tg->cfs_bandwidth.burst;
10019 do_div(burst_us, NSEC_PER_USEC);
10020
10021 return burst_us;
10022}
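
/*
 * The burst value lets a group carry over unused quota, capped at
 * quota + burst in any one period: e.g. quota = 50000 and burst = 25000
 * allow up to 75ms of runtime in a single 100ms period, provided earlier
 * periods were under-used, while the long-term average stays at the quota.
 */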
10023
10024static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
10025 struct cftype *cft)
10026{
10027 return tg_get_cfs_quota(css_tg(css));
10028}
10029
10030static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
10031 struct cftype *cftype, s64 cfs_quota_us)
10032{
10033 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
10034}
10035
10036static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
10037 struct cftype *cft)
10038{
10039 return tg_get_cfs_period(css_tg(css));
10040}
10041
10042static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
10043 struct cftype *cftype, u64 cfs_period_us)
10044{
10045 return tg_set_cfs_period(css_tg(css), cfs_period_us);
10046}
10047
10048static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
10049 struct cftype *cft)
10050{
10051 return tg_get_cfs_burst(css_tg(css));
10052}
10053
10054static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
10055 struct cftype *cftype, u64 cfs_burst_us)
10056{
10057 return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
10058}
10059
10060struct cfs_schedulable_data {
10061 struct task_group *tg;
10062 u64 period, quota;
10063};
10064
/*
 * Normalize group quota/period to be quota/max_period: the resulting ratio
 * is period-independent, so groups configured with different periods can be
 * compared against each other.
 */
10069static u64 normalize_cfs_quota(struct task_group *tg,
10070 struct cfs_schedulable_data *d)
10071{
10072 u64 quota, period;
10073
10074 if (tg == d->tg) {
10075 period = d->period;
10076 quota = d->quota;
10077 } else {
10078 period = tg_get_cfs_period(tg);
10079 quota = tg_get_cfs_quota(tg);
10080 }
10081
10082
10083 if (quota == RUNTIME_INF || quota == -1)
10084 return RUNTIME_INF;
10085
10086 return to_ratio(period, quota);
10087}
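
/*
 * For example, quota = 50ms with period = 100ms and quota = 500ms with
 * period = 1s both normalize to the same "half a CPU" ratio, which is what
 * lets tg_cfs_schedulable_down() compare parent and child groups that use
 * different periods.
 */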
10088
10089static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
10090{
10091 struct cfs_schedulable_data *d = data;
10092 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10093 s64 quota = 0, parent_quota = -1;
10094
10095 if (!tg->parent) {
10096 quota = RUNTIME_INF;
10097 } else {
10098 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
10099
10100 quota = normalize_cfs_quota(tg, d);
10101 parent_quota = parent_b->hierarchical_quota;
10102
 /*
  * Ensure max(child_quota) <= parent_quota. On cgroup2,
  * always take the min. On cgroup1, only inherit when no
  * limit is set:
  */
10108 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
10109 quota = min(quota, parent_quota);
10110 } else {
10111 if (quota == RUNTIME_INF)
10112 quota = parent_quota;
10113 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
10114 return -EINVAL;
10115 }
10116 }
10117 cfs_b->hierarchical_quota = quota;
10118
10119 return 0;
10120}
10121
10122static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
10123{
10124 int ret;
10125 struct cfs_schedulable_data data = {
10126 .tg = tg,
10127 .period = period,
10128 .quota = quota,
10129 };
10130
10131 if (quota != RUNTIME_INF) {
10132 do_div(data.period, NSEC_PER_USEC);
10133 do_div(data.quota, NSEC_PER_USEC);
10134 }
10135
10136 rcu_read_lock();
10137 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
10138 rcu_read_unlock();
10139
10140 return ret;
10141}
10142
10143static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
10144{
10145 struct task_group *tg = css_tg(seq_css(sf));
10146 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10147
10148 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
10149 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
10150 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
10151
10152 if (schedstat_enabled() && tg != &root_task_group) {
10153 u64 ws = 0;
10154 int i;
10155
10156 for_each_possible_cpu(i)
10157 ws += schedstat_val(tg->se[i]->statistics.wait_sum);
10158
10159 seq_printf(sf, "wait_sum %llu\n", ws);
10160 }
10161
10162 return 0;
10163}
10164#endif
10165#endif
10166
10167#ifdef CONFIG_RT_GROUP_SCHED
10168static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
10169 struct cftype *cft, s64 val)
10170{
10171 return sched_group_set_rt_runtime(css_tg(css), val);
10172}
10173
10174static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
10175 struct cftype *cft)
10176{
10177 return sched_group_rt_runtime(css_tg(css));
10178}
10179
10180static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
10181 struct cftype *cftype, u64 rt_period_us)
10182{
10183 return sched_group_set_rt_period(css_tg(css), rt_period_us);
10184}
10185
10186static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
10187 struct cftype *cft)
10188{
10189 return sched_group_rt_period(css_tg(css));
10190}
10191#endif
10192
10193static struct cftype cpu_legacy_files[] = {
10194#ifdef CONFIG_FAIR_GROUP_SCHED
10195 {
10196 .name = "shares",
10197 .read_u64 = cpu_shares_read_u64,
10198 .write_u64 = cpu_shares_write_u64,
10199 },
10200#endif
10201#ifdef CONFIG_CFS_BANDWIDTH
10202 {
10203 .name = "cfs_quota_us",
10204 .read_s64 = cpu_cfs_quota_read_s64,
10205 .write_s64 = cpu_cfs_quota_write_s64,
10206 },
10207 {
10208 .name = "cfs_period_us",
10209 .read_u64 = cpu_cfs_period_read_u64,
10210 .write_u64 = cpu_cfs_period_write_u64,
10211 },
10212 {
10213 .name = "cfs_burst_us",
10214 .read_u64 = cpu_cfs_burst_read_u64,
10215 .write_u64 = cpu_cfs_burst_write_u64,
10216 },
10217 {
10218 .name = "stat",
10219 .seq_show = cpu_cfs_stat_show,
10220 },
10221#endif
10222#ifdef CONFIG_RT_GROUP_SCHED
10223 {
10224 .name = "rt_runtime_us",
10225 .read_s64 = cpu_rt_runtime_read,
10226 .write_s64 = cpu_rt_runtime_write,
10227 },
10228 {
10229 .name = "rt_period_us",
10230 .read_u64 = cpu_rt_period_read_uint,
10231 .write_u64 = cpu_rt_period_write_uint,
10232 },
10233#endif
10234#ifdef CONFIG_UCLAMP_TASK_GROUP
10235 {
10236 .name = "uclamp.min",
10237 .flags = CFTYPE_NOT_ON_ROOT,
10238 .seq_show = cpu_uclamp_min_show,
10239 .write = cpu_uclamp_min_write,
10240 },
10241 {
10242 .name = "uclamp.max",
10243 .flags = CFTYPE_NOT_ON_ROOT,
10244 .seq_show = cpu_uclamp_max_show,
10245 .write = cpu_uclamp_max_write,
10246 },
10247#endif
10248 { }
10249};
10250
10251static int cpu_extra_stat_show(struct seq_file *sf,
10252 struct cgroup_subsys_state *css)
10253{
10254#ifdef CONFIG_CFS_BANDWIDTH
10255 {
10256 struct task_group *tg = css_tg(css);
10257 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10258 u64 throttled_usec;
10259
10260 throttled_usec = cfs_b->throttled_time;
10261 do_div(throttled_usec, NSEC_PER_USEC);
10262
10263 seq_printf(sf, "nr_periods %d\n"
10264 "nr_throttled %d\n"
10265 "throttled_usec %llu\n",
10266 cfs_b->nr_periods, cfs_b->nr_throttled,
10267 throttled_usec);
10268 }
10269#endif
10270 return 0;
10271}
10272
10273#ifdef CONFIG_FAIR_GROUP_SCHED
10274static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
10275 struct cftype *cft)
10276{
10277 struct task_group *tg = css_tg(css);
10278 u64 weight = scale_load_down(tg->shares);
10279
10280 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
10281}
10282
10283static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
10284 struct cftype *cft, u64 weight)
10285{
 /*
  * cgroup weight knobs should use the common MIN, DFL and MAX
  * values which are 1, 100 and 10000 respectively. While it loses
  * a bit of range on both ends, it maps pretty well onto the shares
  * value used by the scheduler and the round-trip conversions preserve
  * the original value over the entire range.
  */
10293 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
10294 return -ERANGE;
10295
10296 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
10297
10298 return sched_group_set_shares(css_tg(css), scale_load(weight));
10299}
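
/*
 * Worked example: the default cgroup v2 weight of 100 maps to the default
 * shares value of 1024 (100 * 1024 / 100), the maximum weight of 10000
 * maps to 102400, and the minimum weight of 1 rounds to 10;
 * cpu_weight_read_u64() reverses the scaling on the way back out.
 */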
10300
10301static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
10302 struct cftype *cft)
10303{
10304 unsigned long weight = scale_load_down(css_tg(css)->shares);
10305 int last_delta = INT_MAX;
10306 int prio, delta;
10307
 /* Find the nice value whose table weight is closest to the current shares: */
10309 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
10310 delta = abs(sched_prio_to_weight[prio] - weight);
10311 if (delta >= last_delta)
10312 break;
10313 last_delta = delta;
10314 }
10315
10316 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
10317}
10318
10319static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
10320 struct cftype *cft, s64 nice)
10321{
10322 unsigned long weight;
10323 int idx;
10324
10325 if (nice < MIN_NICE || nice > MAX_NICE)
10326 return -ERANGE;
10327
10328 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
10329 idx = array_index_nospec(idx, 40);
10330 weight = sched_prio_to_weight[idx];
10331
10332 return sched_group_set_shares(css_tg(css), scale_load(weight));
10333}
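
/*
 * cpu.weight.nice accepts the familiar nice range [-20, 19]: nice 0 maps to
 * sched_prio_to_weight[20] == 1024, nice -20 to 88761 and nice 19 to 15.
 * Reading the file reports the nice level whose table weight is closest to
 * the group's current shares.
 */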
10334#endif
10335
10336static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
10337 long period, long quota)
10338{
10339 if (quota < 0)
10340 seq_puts(sf, "max");
10341 else
10342 seq_printf(sf, "%ld", quota);
10343
10344 seq_printf(sf, " %ld\n", period);
10345}
10346
10347
10348static int __maybe_unused cpu_period_quota_parse(char *buf,
10349 u64 *periodp, u64 *quotap)
10350{
10351 char tok[21];
10352
10353 if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
10354 return -EINVAL;
10355
10356 *periodp *= NSEC_PER_USEC;
10357
10358 if (sscanf(tok, "%llu", quotap))
10359 *quotap *= NSEC_PER_USEC;
10360 else if (!strcmp(tok, "max"))
10361 *quotap = RUNTIME_INF;
10362 else
10363 return -EINVAL;
10364
10365 return 0;
10366}
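
/*
 * This parses the cgroup v2 "cpu.max" format, "$QUOTA [$PERIOD]", with both
 * values in microseconds and $QUOTA optionally the literal "max":
 *
 *	echo "50000 100000" > cpu.max	(50ms of runtime every 100ms)
 *	echo "max" > cpu.max		(no limit, period left unchanged)
 *
 * When the period is omitted, *periodp keeps the value the caller loaded
 * into it, which is why cpu_max_write() pre-loads the current period.
 */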
10367
10368#ifdef CONFIG_CFS_BANDWIDTH
10369static int cpu_max_show(struct seq_file *sf, void *v)
10370{
10371 struct task_group *tg = css_tg(seq_css(sf));
10372
10373 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
10374 return 0;
10375}
10376
10377static ssize_t cpu_max_write(struct kernfs_open_file *of,
10378 char *buf, size_t nbytes, loff_t off)
10379{
10380 struct task_group *tg = css_tg(of_css(of));
10381 u64 period = tg_get_cfs_period(tg);
10382 u64 burst = tg_get_cfs_burst(tg);
10383 u64 quota;
10384 int ret;
10385
10386 ret = cpu_period_quota_parse(buf, &period, "a);
10387 if (!ret)
10388 ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
10389 return ret ?: nbytes;
10390}
10391#endif
10392
10393static struct cftype cpu_files[] = {
10394#ifdef CONFIG_FAIR_GROUP_SCHED
10395 {
10396 .name = "weight",
10397 .flags = CFTYPE_NOT_ON_ROOT,
10398 .read_u64 = cpu_weight_read_u64,
10399 .write_u64 = cpu_weight_write_u64,
10400 },
10401 {
10402 .name = "weight.nice",
10403 .flags = CFTYPE_NOT_ON_ROOT,
10404 .read_s64 = cpu_weight_nice_read_s64,
10405 .write_s64 = cpu_weight_nice_write_s64,
10406 },
10407#endif
10408#ifdef CONFIG_CFS_BANDWIDTH
10409 {
10410 .name = "max",
10411 .flags = CFTYPE_NOT_ON_ROOT,
10412 .seq_show = cpu_max_show,
10413 .write = cpu_max_write,
10414 },
10415 {
10416 .name = "max.burst",
10417 .flags = CFTYPE_NOT_ON_ROOT,
10418 .read_u64 = cpu_cfs_burst_read_u64,
10419 .write_u64 = cpu_cfs_burst_write_u64,
10420 },
10421#endif
10422#ifdef CONFIG_UCLAMP_TASK_GROUP
10423 {
10424 .name = "uclamp.min",
10425 .flags = CFTYPE_NOT_ON_ROOT,
10426 .seq_show = cpu_uclamp_min_show,
10427 .write = cpu_uclamp_min_write,
10428 },
10429 {
10430 .name = "uclamp.max",
10431 .flags = CFTYPE_NOT_ON_ROOT,
10432 .seq_show = cpu_uclamp_max_show,
10433 .write = cpu_uclamp_max_write,
10434 },
10435#endif
10436 { }
10437};
10438
10439struct cgroup_subsys cpu_cgrp_subsys = {
10440 .css_alloc = cpu_cgroup_css_alloc,
10441 .css_online = cpu_cgroup_css_online,
10442 .css_released = cpu_cgroup_css_released,
10443 .css_free = cpu_cgroup_css_free,
10444 .css_extra_stat_show = cpu_extra_stat_show,
10445 .fork = cpu_cgroup_fork,
10446 .can_attach = cpu_cgroup_can_attach,
10447 .attach = cpu_cgroup_attach,
10448 .legacy_cftypes = cpu_legacy_files,
10449 .dfl_cftypes = cpu_files,
10450 .early_init = true,
10451 .threaded = true,
10452};
10453
10454#endif
10455
10456void dump_cpu_task(int cpu)
10457{
10458 pr_info("Task dump for CPU %d:\n", cpu);
10459 sched_show_task(cpu_curr(cpu));
10460}
10461
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
10474const int sched_prio_to_weight[40] = {
10475 88761, 71755, 56483, 46273, 36291,
10476 29154, 23254, 18705, 14949, 11916,
10477 9548, 7620, 6100, 4904, 3906,
10478 3121, 2501, 1991, 1586, 1277,
10479 1024, 820, 655, 526, 423,
10480 335, 272, 215, 172, 137,
10481 110, 87, 70, 56, 45,
10482 36, 29, 23, 18, 15,
10483};
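
/*
 * Each step in the table is a factor of roughly 1.25: going from nice 0 to
 * nice 1 drops the weight from 1024 to ~820 (1024 / 1.25), while nice -1
 * gets ~1277 (1024 * 1.25), which is where the "about 10% relative CPU
 * share per nice level" rule of thumb comes from.
 */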
10484
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated.
 *
 * In cases where the weight does not change often, we can use the
 * pre-calculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
10492const u32 sched_prio_to_wmult[40] = {
10493 48388, 59856, 76040, 92818, 118348,
10494 147320, 184698, 229616, 287308, 360437,
10495 449829, 563644, 704093, 875809, 1099582,
10496 1376151, 1717300, 2157191, 2708050, 3363326,
10497 4194304, 5237765, 6557202, 8165337, 10153587,
10498 12820798, 15790321, 19976592, 24970740, 31350126,
10499 39045157, 49367440, 61356676, 76695844, 95443717,
10500 119304647, 148102320, 186737708, 238609294, 286331153,
10501};
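
/*
 * Sanity check on the relationship between the two tables: for nice 0,
 * 2^32 / 1024 == 4194304, which is exactly sched_prio_to_wmult[20]. Because
 * weight * wmult ~= 2^32 for every entry, the fair class can replace a
 * division by the weight with a multiply and shift (see __calc_delta()).
 */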
10502
10503void call_trace_sched_update_nr_running(struct rq *rq, int count)
10504{
10505 trace_sched_update_nr_running_tp(rq, count);
10506}
10507