// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#undef CREATE_TRACE_POINTS

#include "sched.h"

#include <linux/nospec.h>

#include <linux/kcov.h>
#include <linux/scs.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>

#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../smpboot.h"

#include "pelt.h"
#include "smp.h"

/*
 * Export tracepoints that act as a bare tracehook (i.e. have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#ifdef CONFIG_SCHED_DEBUG
/*
 * Debugging: various feature bits.
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT

/*
 * Print a warning if need_resched is set for the given duration (if
 * LATENCY_WARN is enabled).
 *
 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
 * per boot.
 */
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
#endif /* CONFIG_SCHED_DEBUG */

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
static inline int __task_prio(struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class) /* trumps deadline */
		return -2;

	if (rt_prio(p->prio)) /* includes deadline */
		return p->prio; /* [-1, 99] */

	if (p->sched_class == &idle_sched_class)
		return MAX_RT_PRIO + NICE_WIDTH; /* 140 */

	return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
}

/*
 * Compare two tasks' scheduling priority across classes, using the
 * __task_prio() ranking above (lower kernel prio means more important).
 */
static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
{
	int pa = __task_prio(a), pb = __task_prio(b);

	if (-pa < -pb)
		return true;

	if (-pb < -pa)
		return false;

	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
		return !dl_time_before(a->dl.deadline, b->dl.deadline);

	if (pa == MAX_RT_PRIO + MAX_NICE)	/* fair */
		return cfs_prio_less(a, b, in_fi);

	return false;
}

static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
{
	if (a->core_cookie < b->core_cookie)
		return true;

	if (a->core_cookie > b->core_cookie)
		return false;

	/* flip prio, so high prio is leftmost */
	if (prio_less(b, a, task_rq(a)->core->core_forceidle))
		return true;

	return false;
}

#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)

static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
{
	return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
}

static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
{
	const struct task_struct *p = __node_2_sc(node);
	unsigned long cookie = (unsigned long)key;

	if (cookie < p->core_cookie)
		return -1;

	if (cookie > p->core_cookie)
		return 1;

	return 0;
}

void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->core->core_task_seq++;

	if (!p->core_cookie)
		return;

	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

void sched_core_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->core->core_task_seq++;

	if (!sched_core_enqueued(p))
		return;

	rb_erase(&p->core_node, &rq->core_tree);
	RB_CLEAR_NODE(&p->core_node);
}

/*
 * Find left-most (aka, highest priority) task matching @cookie.
 */
static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
{
	struct rb_node *node;

	node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
	/*
	 * The idle task always matches any cookie!
	 */
	if (!node)
		return idle_sched_class.pick_task(rq);

	return __node_2_sc(node);
}

static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
	struct rb_node *node = &p->core_node;

	node = rb_next(node);
	if (!node)
		return NULL;

	p = container_of(node, struct task_struct, core_node);
	if (p->core_cookie != cookie)
		return NULL;

	return p;
}

/*
 * Magic required such that:
 *
 *	raw_spin_rq_lock(rq);
 *	...
 *	raw_spin_rq_unlock(rq);
 *
 * ends up locking and unlocking the _same_ lock, and all CPUs
 * always agree on what rq has what lock.
 */

static DEFINE_MUTEX(sched_core_mutex);
static atomic_t sched_core_count;
static struct cpumask sched_core_mask;

static void sched_core_lock(int cpu, unsigned long *flags)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t, i = 0;

	local_irq_save(*flags);
	for_each_cpu(t, smt_mask)
		raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
}

static void sched_core_unlock(int cpu, unsigned long *flags)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t;

	for_each_cpu(t, smt_mask)
		raw_spin_unlock(&cpu_rq(t)->__lock);
	local_irq_restore(*flags);
}

static void __sched_core_flip(bool enabled)
{
	unsigned long flags;
	int cpu, t;

	cpus_read_lock();

	/*
	 * Toggle the online cores, one by one.
	 */
	cpumask_copy(&sched_core_mask, cpu_online_mask);
	for_each_cpu(cpu, &sched_core_mask) {
		const struct cpumask *smt_mask = cpu_smt_mask(cpu);

		sched_core_lock(cpu, &flags);

		for_each_cpu(t, smt_mask)
			cpu_rq(t)->core_enabled = enabled;

		sched_core_unlock(cpu, &flags);

		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
	}

	/*
	 * Toggle the offline CPUs.
	 */
	cpumask_copy(&sched_core_mask, cpu_possible_mask);
	cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);

	for_each_cpu(cpu, &sched_core_mask)
		cpu_rq(cpu)->core_enabled = enabled;

	cpus_read_unlock();
}

static void sched_core_assert_empty(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
}

static void __sched_core_enable(void)
{
	static_branch_enable(&__sched_core_enabled);
	/*
	 * Ensure all previous instances of raw_spin_rq_*lock() have finished
	 * and future ones will observe !sched_core_disabled().
	 */
	synchronize_rcu();
	__sched_core_flip(true);
	sched_core_assert_empty();
}

static void __sched_core_disable(void)
{
	sched_core_assert_empty();
	__sched_core_flip(false);
	static_branch_disable(&__sched_core_enabled);
}

void sched_core_get(void)
{
	if (atomic_inc_not_zero(&sched_core_count))
		return;

	mutex_lock(&sched_core_mutex);
	if (!atomic_read(&sched_core_count))
		__sched_core_enable();

	smp_mb__before_atomic(); /* before atomic_inc() */
	atomic_inc(&sched_core_count);
	mutex_unlock(&sched_core_mutex);
}

static void __sched_core_put(struct work_struct *work)
{
	if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
		__sched_core_disable();
		mutex_unlock(&sched_core_mutex);
	}
}

void sched_core_put(void)
{
	static DECLARE_WORK(_work, __sched_core_put);

	/*
	 * "There can be only one"
	 *
	 * Either this is the last one, or we don't actually need to do any
	 * 'work'. If it is the last *again*, we rely on
	 * WORK_STRUCT_PENDING_BIT.
	 */
	if (!atomic_add_unless(&sched_core_count, -1, 1))
		schedule_work(&_work);
}

#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }

#endif /* CONFIG_SCHED_CORE */

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * Serialization rules:
 *
 * Lock order:
 *
 *   p->pi_lock
 *     rq->lock
 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 *
 *  rq1->lock
 *    rq2->lock  where: rq1 < rq2
 *
 * Regular state:
 *
 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
 * local CPU's rq->lock, it optionally removes the task from the runqueue and
 * always looks at the local rq data structures to find the most eligible task
 * to run next.
 *
 * Task enqueue is also under rq->lock, possibly taken from another CPU.
 *
 * Special state:
 *
 * System-calls and anything external will use task_rq_lock() which acquires
 * both p->pi_lock and rq->lock. As a consequence the state they change is
 * stable while holding either lock.
 *
 * p->__state is changed locklessly using set_current_state() and friends, or
 * by try_to_wake_up(), which uses p->pi_lock to serialize against concurrent
 * wakeups.
 *
 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
 *
 *   is set by activate_task() and cleared by deactivate_task(), under
 *   rq->lock. Non-zero indicates the task is runnable, the special
 *   ON_RQ_MIGRATING state is used for migration without holding both
 *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
 *
 * p->on_cpu <- { 0, 1 }:
 *
 *   is set by prepare_task() and cleared by finish_task() such that it will
 *   be set before p is scheduled-in and cleared after p is scheduled-out,
 *   both under rq->lock. Non-zero indicates the task is running on its CPU.
 *
 * task_cpu(p) is changed by set_task_cpu(); a blocked task's CPU assignment
 * is not required to be valid, which simplifies CPU hotplug.
 */
void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
	raw_spinlock_t *lock;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		raw_spin_lock_nested(&rq->__lock, subclass);
		/* preempt_count *MUST* be > 1 */
		preempt_enable_no_resched();
		return;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		raw_spin_lock_nested(lock, subclass);
		if (likely(lock == __rq_lockp(rq))) {
			/* preempt_count *MUST* be > 1 */
			preempt_enable_no_resched();
			return;
		}
		raw_spin_unlock(lock);
	}
}

bool raw_spin_rq_trylock(struct rq *rq)
{
	raw_spinlock_t *lock;
	bool ret;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		ret = raw_spin_trylock(&rq->__lock);
		preempt_enable();
		return ret;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		ret = raw_spin_trylock(lock);
		if (!ret || (likely(lock == __rq_lockp(rq)))) {
			preempt_enable();
			return ret;
		}
		raw_spin_unlock(lock);
	}
}

void raw_spin_rq_unlock(struct rq *rq)
{
	raw_spin_unlock(rq_lockp(rq));
}

#ifdef CONFIG_SMP
/*
 * double_rq_lock - safely lock two runqueues
 */
void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
	lockdep_assert_irqs_disabled();

	if (rq_order_less(rq2, rq1))
		swap(rq1, rq2);

	raw_spin_rq_lock(rq1);
	if (__rq_lockp(rq1) == __rq_lockp(rq2))
		return;

	raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
}
#endif

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB to ensure we then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * RQ-clock updating methods:
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_rq_held(rq);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	if (sched_feat(WARN_DOUBLE_CLOCK))
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;
#endif

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = rq->hrtick_time;

	hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;
	struct rq_flags rf;

	rq_lock(rq, &rf);
	__hrtick_restart(rq);
	rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and IRQs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);

	if (rq == this_rq())
		__hrtick_restart(rq);
	else
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and IRQs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED_HARD);
}

#endif /* CONFIG_SMP */

static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
	INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _old, _val = *_ptr;			\
									\
		for (;;) {						\
			_old = cmpxchg(_ptr, _val, _val | _mask);	\
			if (_old == _val)				\
				break;					\
			_val = _old;					\
		}							\
	_old;								\
})
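
/*
 * Illustrative sketch (not part of the original source): fetch_or() atomically
 * performs *ptr |= mask and returns the value that was there *before* the OR,
 * so a caller can set a flag and learn what was already set in one step:
 *
 *	unsigned long flags = 0;
 *	unsigned long old = fetch_or(&flags, 0x1);	// old == 0x0, flags == 0x1
 *	old = fetch_or(&flags, 0x2);			// old == 0x1, flags == 0x3
 *
 * set_nr_and_not_polling() below uses exactly this to set TIF_NEED_RESCHED
 * while simultaneously sampling TIF_POLLING_NRFLAG.
 */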

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * In order to ensure that a pending wakeup will observe our pending
	 * state, even in the failed case, an explicit smp_mb() must be used.
	 */
	smp_mb__before_atomic();
	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
		return false;

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
	return true;
}

/**
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	if (__wake_q_add(head, task))
		get_task_struct(task);
}

/**
 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 * that already hold a reference to @task can call the 'safe' version and trust
 * wake_q to do the right thing depending whether or not the @task is already
 * queued for wakeup.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
	if (!__wake_q_add(head, task))
		put_task_struct(task);
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);

		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() executes a full barrier, which pairs with
		 * the queueing in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
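
/*
 * Typical usage pattern (illustrative, not from this file): batch the wakeups
 * discovered while holding a hot lock, then issue them after the lock is
 * dropped so the woken tasks don't immediately contend on it:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	spin_lock(&some_lock);
 *	...
 *	wake_q_add(&wake_q, some_task);	// takes a task reference
 *	spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);		// wakes and drops the references
 */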

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_rq_held(rq);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_rq_unlock_irqrestore(rq, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id(), default_cpu = -1;
	struct sched_domain *sd;
	const struct cpumask *hk_mask;

	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
		if (!idle_cpu(cpu))
			return cpu;
		default_cpu = cpu;
	}

	hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (default_cpu == -1)
		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
	cpu = default_cpu;
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static void nohz_csd_func(void *info)
{
	struct rq *rq = info;
	int cpu = cpu_of(rq);
	unsigned int flags;

	/*
	 * Release the rq::nohz_csd.
	 */
	flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
	WARN_ON(!(flags & NOHZ_KICK_MASK));

	rq->idle_balance = idle_cpu(cpu);
	if (rq->idle_balance && !need_resched()) {
		rq->nohz_idle_balance = flags;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there are more than one RR tasks, we need the tick to affect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
	 * forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent. An example traversal
 * order is sketched after this function.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}
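
/*
 * Illustrative sketch (not from the original source): for a hierarchy
 * root -> {A, B} with A -> {A1}, walk_tg_tree_from(root, down, up, data)
 * invokes the visitors as:
 *
 *	down(root), down(A), down(A1), up(A1), up(A), down(B), up(B), up(root)
 *
 * i.e. a pre-order @down and post-order @up over the task_group tree, with
 * the nested loop re-entered via the 'up' label after each child subtree.
 */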

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (task_has_idle_policy(p)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight:
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, prio);
	} else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
	}
}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Serializes updates of utilization clamp values.
 *
 * The (slow-path) user-space triggers utilization clamp value updates which
 * can require updates on (fast-path) scheduler's data structures used to
 * support enqueue/dequeue operations.
 * While the per-CPU rq lock protects fast-path update operations, user-space
 * requests are serialized using a mutex to reduce the risk of conflicting
 * updates or API abuses.
 */
static DEFINE_MUTEX(uclamp_mutex);

/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;

/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
 * By default RT tasks run at the maximum performance point/capacity of the
 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
 * SCHED_CAPACITY_SCALE.
 *
 * This knob allows admins to change the default behavior when uclamp is being
 * used. In battery powered devices, particularly, running at the maximum
 * capacity and frequency will increase energy consumption and shorten the
 * battery life.
 *
 * This knob only affects RT tasks whose uclamp_se->user_defined == false.
 *
 * This knob will not override the system default sched_util_clamp_min defined
 * above.
 */
unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;

/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/*
 * This static key is used to reduce the uclamp overhead in the fast path. It
 * primarily disables the call to uclamp_rq_{inc, dec}() in
 * enqueue_task()/dequeue_task().
 *
 * As soon as userspace modifies any of the uclamp knobs or defines a task
 * group's clamp values, the static key is enabled, since we have an actual
 * user that relies on uclamp functionality. It is never disabled again.
 */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);

/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

#define for_each_clamp_id(clamp_id) \
	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)

static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
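
/*
 * Worked example (illustrative): with SCHED_CAPACITY_SCALE == 1024 and the
 * default UCLAMP_BUCKETS == 5, UCLAMP_BUCKET_DELTA is DIV_ROUND_CLOSEST(1024, 5)
 * == 205, so a clamp value of 300 maps to bucket 300 / 205 == 1. The min_t()
 * guards the top end when the division rounds down: with UCLAMP_BUCKETS == 20
 * the delta would be 51 and 1024 / 51 == 20, one past the last bucket, so the
 * result is clamped to UCLAMP_BUCKETS - 1.
 */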

static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
	if (clamp_id == UCLAMP_MIN)
		return 0;
	return SCHED_CAPACITY_SCALE;
}

static inline void uclamp_se_set(struct uclamp_se *uc_se,
				 unsigned int value, bool user_defined)
{
	uc_se->value = value;
	uc_se->bucket_id = uclamp_bucket_id(value);
	uc_se->user_defined = user_defined;
}

static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
		  unsigned int clamp_value)
{
	/*
	 * Avoid blocked utilization pushing up the frequency when we go
	 * idle (which drops the max-clamp) by retaining the last known
	 * max-clamp.
	 */
	if (clamp_id == UCLAMP_MAX) {
		rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
		return clamp_value;
	}

	return uclamp_none(UCLAMP_MIN);
}

static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
				     unsigned int clamp_value)
{
	/* Reset max-clamp retention only on idle exit */
	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
		return;

	WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
}

static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
				   unsigned int clamp_value)
{
	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
	int bucket_id = UCLAMP_BUCKETS - 1;

	/*
	 * Since both min and max clamps are max aggregated, find the
	 * top most bucket with tasks in.
	 */
	for ( ; bucket_id >= 0; bucket_id--) {
		if (!bucket[bucket_id].tasks)
			continue;
		return bucket[bucket_id].value;
	}

	/* No tasks -- default clamp values */
	return uclamp_idle_value(rq, clamp_id, clamp_value);
}

static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
	unsigned int default_util_min;
	struct uclamp_se *uc_se;

	lockdep_assert_held(&p->pi_lock);

	uc_se = &p->uclamp_req[UCLAMP_MIN];

	/* Only sync if user didn't override the default */
	if (uc_se->user_defined)
		return;

	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
	uclamp_se_set(uc_se, default_util_min, false);
}

static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	if (!rt_task(p))
		return;

	/* Protect updates to p->uclamp_* */
	rq = task_rq_lock(p, &rf);
	__uclamp_update_util_min_rt_default(p);
	task_rq_unlock(rq, p, &rf);
}

static void uclamp_sync_util_min_rt_default(void)
{
	struct task_struct *g, *p;

	/*
	 * The tasklist_lock dance below ensures we either observe any
	 * concurrently forked task (which will run through sched_post_fork()
	 * and pick up the new default itself), or for_each_process_thread()
	 * below observes the new task; either way no RT task misses the
	 * updated sysctl_sched_uclamp_util_min_rt_default value.
	 */
	read_lock(&tasklist_lock);
	smp_mb__after_spinlock();
	read_unlock(&tasklist_lock);

	rcu_read_lock();
	for_each_process_thread(g, p)
		uclamp_update_util_min_rt_default(p);
	rcu_read_unlock();
}

static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
	/* Copy by value as we could modify it */
	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
	unsigned int tg_min, tg_max, value;

	/*
	 * Tasks in autogroups or root task group will be
	 * restricted by system defaults.
	 */
	if (task_group_is_autogroup(task_group(p)))
		return uc_req;
	if (task_group(p) == &root_task_group)
		return uc_req;

	tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
	tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
	value = uc_req.value;
	value = clamp(value, tg_min, tg_max);
	uclamp_se_set(&uc_req, value, false);
#endif

	return uc_req;
}

/*
 * The effective clamp bucket index of a task depends on, by increasing
 * priority:
 * - the task specific clamp value, when explicitly requested from userspace
 * - the task group effective clamp value, for tasks not either in the root
 *   group or in an autogroup
 * - the system default clamp value, defined by the sysadmin
 */
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
	struct uclamp_se uc_max = uclamp_default[clamp_id];

	/* System default restrictions always apply */
	if (unlikely(uc_req.value > uc_max.value))
		return uc_max;

	return uc_req;
}

unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_eff;

	/* Task currently refcounted: use back-annotated (effective) value */
	if (p->uclamp[clamp_id].active)
		return (unsigned long)p->uclamp[clamp_id].value;

	uc_eff = uclamp_eff_get(p, clamp_id);

	return (unsigned long)uc_eff.value;
}

/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
 * updates the rq's clamp value if required.
 *
 * Tasks can have a task-specific value requested from user-space, track
 * within each bucket the maximum value for tasks refcounted in it.
 * This "local max aggregation" allows to track the exact "requested" value
 * for each bucket when all its RUNNABLE tasks require the same clamp.
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;

	lockdep_assert_rq_held(rq);

	/* Update task effective clamp */
	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);

	bucket = &uc_rq->bucket[uc_se->bucket_id];
	bucket->tasks++;
	uc_se->active = true;

	uclamp_idle_reset(rq, clamp_id, uc_se->value);

	/*
	 * Local max aggregation: rq buckets always track the max
	 * "requested" clamp value of its RUNNABLE tasks.
	 */
	if (bucket->tasks == 1 || uc_se->value > bucket->value)
		bucket->value = uc_se->value;

	if (uc_se->value > READ_ONCE(uc_rq->value))
		WRITE_ONCE(uc_rq->value, uc_se->value);
}

/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task defining the current max clamp
 * bucket, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;
	unsigned int bkt_clamp;
	unsigned int rq_clamp;

	lockdep_assert_rq_held(rq);

	/*
	 * If sched_uclamp_used was enabled after task @p was enqueued,
	 * we could end up with an unbalanced call to uclamp_rq_dec_id().
	 *
	 * In this case the uc_se->active flag is false since no uclamp
	 * accounting was performed at enqueue time, and we can just return
	 * here. Such an "unbalanced" dequeue is safe: buckets are only
	 * modified for tasks that were accounted at enqueue.
	 */
	if (unlikely(!uc_se->active))
		return;

	bucket = &uc_rq->bucket[uc_se->bucket_id];

	SCHED_WARN_ON(!bucket->tasks);
	if (likely(bucket->tasks))
		bucket->tasks--;

	uc_se->active = false;

	/*
	 * Keep "local max aggregation" simple and accept to (possibly)
	 * overboost some RUNNABLE tasks in the same bucket.
	 * The rq clamp bucket value is reset to its base value whenever
	 * there are no more RUNNABLE tasks refcounting it.
	 */
	if (likely(bucket->tasks))
		return;

	rq_clamp = READ_ONCE(uc_rq->value);
	/*
	 * Defensive programming: this should never happen. If it happens,
	 * e.g. due to future modification, warn and fix up the expected value.
	 */
	SCHED_WARN_ON(bucket->value > rq_clamp);
	if (bucket->value >= rq_clamp) {
		bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
		WRITE_ONCE(uc_rq->value, bkt_clamp);
	}
}
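
/*
 * Illustrative scenario (not from the original source): two RUNNABLE tasks
 * with requested UCLAMP_MIN values 300 and 512 land in buckets 1 and 2
 * respectively (with UCLAMP_BUCKET_DELTA == 205). Each uclamp_rq_inc_id()
 * bumps its bucket's ->tasks count and max-aggregates rq->uclamp[].value up
 * to 512. When the 512 task is dequeued its bucket empties, and
 * uclamp_rq_dec_id() recomputes the rq clamp from the highest non-empty
 * bucket, falling back to the 300 task's bucket value.
 */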

static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_inc_id(rq, p, clamp_id);

	/* Reset clamp idle holding when there is one RUNNABLE task */
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_dec_id(rq, p, clamp_id);
}

static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
				      enum uclamp_id clamp_id)
{
	if (!p->uclamp[clamp_id].active)
		return;

	uclamp_rq_dec_id(rq, p, clamp_id);
	uclamp_rq_inc_id(rq, p, clamp_id);

	/*
	 * Make sure to clear the idle flag if we've transiently reached 0
	 * active tasks on rq.
	 */
	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void
uclamp_update_active(struct task_struct *p)
{
	enum uclamp_id clamp_id;
	struct rq_flags rf;
	struct rq *rq;

	/*
	 * Lock the task and the rq where the task is (or was) queued.
	 *
	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
	 * price to pay to safely serialize util_{min,max} updates with
	 * enqueues, dequeues and migration operations.
	 * This is the same locking schema used by __set_cpus_allowed_ptr().
	 */
	rq = task_rq_lock(p, &rf);

	/*
	 * Setting the clamp bucket is serialized by task_rq_lock().
	 * If the task is not yet RUNNABLE and its task_struct is not
	 * affecting a valid clamp bucket, the next time it's enqueued,
	 * it will already see the updated clamp bucket value.
	 */
	for_each_clamp_id(clamp_id)
		uclamp_rq_reinc_id(rq, p, clamp_id);

	task_rq_unlock(rq, p, &rf);
}

#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *p;

	css_task_iter_start(css, 0, &it);
	while ((p = css_task_iter_next(&it)))
		uclamp_update_active(p);
	css_task_iter_end(&it);
}

static void cpu_util_update_eff(struct cgroup_subsys_state *css);
static void uclamp_update_root_tg(void)
{
	struct task_group *tg = &root_task_group;

	uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
		      sysctl_sched_uclamp_util_min, false);
	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
		      sysctl_sched_uclamp_util_max, false);

	rcu_read_lock();
	cpu_util_update_eff(&root_task_group.css);
	rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
#endif

int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	bool update_root_tg = false;
	int old_min, old_max, old_min_rt;
	int result;

	mutex_lock(&uclamp_mutex);
	old_min = sysctl_sched_uclamp_util_min;
	old_max = sysctl_sched_uclamp_util_max;
	old_min_rt = sysctl_sched_uclamp_util_min_rt_default;

	result = proc_dointvec(table, write, buffer, lenp, ppos);
	if (result)
		goto undo;
	if (!write)
		goto done;

	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE	||
	    sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {

		result = -EINVAL;
		goto undo;
	}

	if (old_min != sysctl_sched_uclamp_util_min) {
		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
			      sysctl_sched_uclamp_util_min, false);
		update_root_tg = true;
	}
	if (old_max != sysctl_sched_uclamp_util_max) {
		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
			      sysctl_sched_uclamp_util_max, false);
		update_root_tg = true;
	}

	if (update_root_tg) {
		static_branch_enable(&sched_uclamp_used);
		uclamp_update_root_tg();
	}

	if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
		static_branch_enable(&sched_uclamp_used);
		uclamp_sync_util_min_rt_default();
	}

	/*
	 * We update all RUNNABLE tasks only when task groups are in use;
	 * otherwise, keep it simple and do just a lazy update at each next
	 * task enqueue time.
	 */
	goto done;

undo:
	sysctl_sched_uclamp_util_min = old_min;
	sysctl_sched_uclamp_util_max = old_max;
	sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
done:
	mutex_unlock(&uclamp_mutex);

	return result;
}

static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
{
	int util_min = p->uclamp_req[UCLAMP_MIN].value;
	int util_max = p->uclamp_req[UCLAMP_MAX].value;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
		util_min = attr->sched_util_min;

		if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
		util_max = attr->sched_util_max;

		if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	if (util_min != -1 && util_max != -1 && util_min > util_max)
		return -EINVAL;

	/*
	 * We have valid uclamp attributes; make sure uclamp is enabled.
	 *
	 * We need to do that here, because enabling static branches is a
	 * blocking operation which obviously cannot be done while holding
	 * scheduler locks.
	 */
	static_branch_enable(&sched_uclamp_used);

	return 0;
}

static bool uclamp_reset(const struct sched_attr *attr,
			 enum uclamp_id clamp_id,
			 struct uclamp_se *uc_se)
{
	/* Reset on sched class change for a non user-defined clamp value. */
	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
	    !uc_se->user_defined)
		return true;

	/* Reset on sched_util_{min,max} == -1. */
	if (clamp_id == UCLAMP_MIN &&
	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
	    attr->sched_util_min == -1) {
		return true;
	}

	if (clamp_id == UCLAMP_MAX &&
	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
	    attr->sched_util_max == -1) {
		return true;
	}

	return false;
}

static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr)
{
	enum uclamp_id clamp_id;

	for_each_clamp_id(clamp_id) {
		struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
		unsigned int value;

		if (!uclamp_reset(attr, clamp_id, uc_se))
			continue;

		/*
		 * RT by default have a 100% boost value that could be modified
		 * at runtime.
		 */
		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
			value = sysctl_sched_uclamp_util_min_rt_default;
		else
			value = uclamp_none(clamp_id);

		uclamp_se_set(uc_se, value, false);

	}

	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
		return;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
	    attr->sched_util_min != -1) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
			      attr->sched_util_min, true);
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
	    attr->sched_util_max != -1) {
		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
			      attr->sched_util_max, true);
	}
}

static void uclamp_fork(struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * We don't need to hold task_rq_lock() when updating p->uclamp_* here
	 * as the task is still at its early fork stages.
	 */
	for_each_clamp_id(clamp_id)
		p->uclamp[clamp_id].active = false;

	if (likely(!p->sched_reset_on_fork))
		return;

	for_each_clamp_id(clamp_id) {
		uclamp_se_set(&p->uclamp_req[clamp_id],
			      uclamp_none(clamp_id), false);
	}
}

static void uclamp_post_fork(struct task_struct *p)
{
	uclamp_update_util_min_rt_default(p);
}

static void __init init_uclamp_rq(struct rq *rq)
{
	enum uclamp_id clamp_id;
	struct uclamp_rq *uc_rq = rq->uclamp;

	for_each_clamp_id(clamp_id) {
		uc_rq[clamp_id] = (struct uclamp_rq) {
			.value = uclamp_none(clamp_id)
		};
	}

	rq->uclamp_flags = 0;
}

static void __init init_uclamp(void)
{
	struct uclamp_se uc_max = {};
	enum uclamp_id clamp_id;
	int cpu;

	for_each_possible_cpu(cpu)
		init_uclamp_rq(cpu_rq(cpu));

	for_each_clamp_id(clamp_id) {
		uclamp_se_set(&init_task.uclamp_req[clamp_id],
			      uclamp_none(clamp_id), false);
	}

	/* System defaults allow max clamp values for both indexes */
	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
	for_each_clamp_id(clamp_id) {
		uclamp_default[clamp_id] = uc_max;
#ifdef CONFIG_UCLAMP_TASK_GROUP
		root_task_group.uclamp_req[clamp_id] = uc_max;
		root_task_group.uclamp[clamp_id] = uc_max;
#endif
	}
}

#else /* CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
				  const struct sched_attr *attr)
{
	return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */

bool sched_task_on_rq(struct task_struct *p)
{
	return task_on_rq_queued(p);
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE)) {
		sched_info_enqueue(rq, p);
		psi_enqueue(p, flags & ENQUEUE_WAKEUP);
	}

	uclamp_rq_inc(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);

	if (sched_core_enabled(rq))
		sched_core_enqueue(rq, p);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (sched_core_enabled(rq))
		sched_core_dequeue(rq, p);

	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & DEQUEUE_SAVE)) {
		sched_info_dequeue(rq, p);
		psi_dequeue(p, flags & DEQUEUE_SLEEP);
	}

	uclamp_rq_dec(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	enqueue_task(rq, p, flags);

	p->on_rq = TASK_ON_RQ_QUEUED;
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

	dequeue_task(rq, p, flags);
}

static inline int __normal_prio(int policy, int rt_prio, int nice)
{
	int prio;

	if (dl_policy(policy))
		prio = MAX_DL_PRIO - 1;
	else if (rt_policy(policy))
		prio = MAX_RT_PRIO - 1 - rt_prio;
	else
		prio = NICE_TO_PRIO(nice);

	return prio;
}
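
/*
 * Worked example (illustrative): NICE_TO_PRIO(nice) is 120 + nice, so nice 0
 * maps to prio 120, nice -20 to 100 and nice 19 to 139. An RT task with
 * rt_priority 99 maps to prio MAX_RT_PRIO - 1 - 99 == 0 (the highest RT
 * level), and any deadline task gets MAX_DL_PRIO - 1 == -1, below all of them.
 */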

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->sched_class == rq->curr->sched_class)
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	else if (p->sched_class > rq->curr->sched_class)
		resched_curr(rq);

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq);
}

#ifdef CONFIG_SMP

static void
__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);

static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask,
				  u32 flags);

static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
	if (likely(!p->migration_disabled))
		return;

	if (p->cpus_ptr != &p->cpus_mask)
		return;

	/*
	 * Violates locking rules! see comment in __do_set_cpus_allowed().
	 */
	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
}

void migrate_disable(void)
{
	struct task_struct *p = current;

	if (p->migration_disabled) {
		p->migration_disabled++;
		return;
	}

	preempt_disable();
	this_rq()->nr_pinned++;
	p->migration_disabled = 1;
	preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);

void migrate_enable(void)
{
	struct task_struct *p = current;

	if (p->migration_disabled > 1) {
		p->migration_disabled--;
		return;
	}

	/*
	 * Ensure stop_task runs either before or after this, and that
	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
	 */
	preempt_disable();
	if (p->cpus_ptr != &p->cpus_mask)
		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
	/*
	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
	 * regular cpus_mask, otherwise things that race (eg.
	 * select_fallback_rq) get confused.
	 */
	barrier();
	p->migration_disabled = 0;
	this_rq()->nr_pinned--;
	preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);

static inline bool rq_has_pinned_tasks(struct rq *rq)
{
	return rq->nr_pinned;
}

/*
 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
 * __set_cpus_allowed_ptr() and select_fallback_rq().
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
	/* When not in the task's cpumask, no point in looking further. */
	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
		return false;

	/* migrate_disabled() must be allowed to finish. */
	if (is_migration_disabled(p))
		return cpu_online(cpu);

	/* Non kernel threads are not allowed during either online or offline. */
	if (!(p->flags & PF_KTHREAD))
		return cpu_active(cpu) && task_cpu_possible(cpu, p);

	/* KTHREAD_IS_PER_CPU is always allowed. */
	if (kthread_is_per_cpu(p))
		return cpu_online(cpu);

	/* Regular kernel threads don't get to stay during offline. */
	if (cpu_dying(cpu))
		return false;

	/* But are allowed during online. */
	return cpu_online(cpu);
}

/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
				   struct task_struct *p, int new_cpu)
{
	lockdep_assert_rq_held(rq);

	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, new_cpu);
	rq_unlock(rq, rf);

	rq = cpu_rq(new_cpu);

	rq_lock(rq, rf);
	BUG_ON(task_cpu(p) != new_cpu);
	activate_task(rq, p, 0);
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct		*task;
	int				dest_cpu;
	struct set_affinity_pending	*pending;
};

/*
 * @refs: number of wait_for_completion()
 * @stop_pending: is @stop_work in use
 */
struct set_affinity_pending {
	refcount_t		refs;
	unsigned int		stop_pending;
	struct completion	done;
	struct cpu_stop_work	stop_work;
	struct migration_arg	arg;
};

/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	/* Affinity changed (again). */
	if (!is_cpu_allowed(p, dest_cpu))
		return rq;

	update_rq_clock(rq);
	rq = move_queued_task(rq, rf, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct set_affinity_pending *pending = arg->pending;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	bool complete = false;
	struct rq_flags rf;

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_save(rf.flags);
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_ptr
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	flush_smp_call_function_from_idle();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);

	/*
	 * If we were passed a pending, then ->stop_pending was set, thus
	 * p->migration_pending must have remained stable.
	 */
	WARN_ON_ONCE(pending && pending != p->migration_pending);

	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (is_migration_disabled(p))
			goto out;

		if (pending) {
			p->migration_pending = NULL;
			complete = true;

			if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
				goto out;
		}

		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;

		/*
		 * XXX __migrate_task() can fail, at which point we might end
		 * up running on a dodgy CPU, AFAICT this can only happen
		 * during CPU hotplug, at which point we'll get pushed out
		 * anyway, so it's probably not worth worrying about.
		 */
	} else if (pending) {
		/*
		 * This happens when we get migrated between migrate_enable()'s
		 * preempt_enable() and scheduling the stopper task. At that
		 * point we're a regular task again and not current anymore.
		 *
		 * A !PREEMPT kernel has a giant hole here, which makes it far
		 * more likely.
		 */

		/*
		 * The task moved before the stopper got to run. We're holding
		 * ->pi_lock, so the allowed mask is stable - if it got
		 * somewhere allowed, we're done.
		 */
		if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
			p->migration_pending = NULL;
			complete = true;
			goto out;
		}

		/*
		 * When migrate_enable() hits a rq mis-match we can't reliably
		 * determine is_migration_disabled() and so have to chase after
		 * it.
		 */
		WARN_ON_ONCE(!pending->stop_pending);
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
				    &pending->arg, &pending->stop_work);
		return 0;
	}
out:
	if (pending)
		pending->stop_pending = false;
	task_rq_unlock(rq, p, &rf);

	if (complete)
		complete_all(&pending->done);

	return 0;
}

int push_cpu_stop(void *arg)
{
	struct rq *lowest_rq = NULL, *rq = this_rq();
	struct task_struct *p = arg;

	raw_spin_lock_irq(&p->pi_lock);
	raw_spin_rq_lock(rq);

	if (task_rq(p) != rq)
		goto out_unlock;

	if (is_migration_disabled(p)) {
		p->migration_flags |= MDF_PUSH;
		goto out_unlock;
	}

	p->migration_flags &= ~MDF_PUSH;

	if (p->sched_class->find_lock_rq)
		lowest_rq = p->sched_class->find_lock_rq(p, rq);

	if (!lowest_rq)
		goto out_unlock;

	/* XXX: task may have moved while find_lock_rq() dropped locks */
	if (task_rq(p) == rq) {
		deactivate_task(rq, p, 0);
		set_task_cpu(p, lowest_rq->cpu);
		activate_task(lowest_rq, p, 0);
		resched_curr(lowest_rq);
	}

	double_unlock_balance(rq, lowest_rq);

out_unlock:
	rq->push_busy = false;
	raw_spin_rq_unlock(rq);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
	if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
		p->cpus_ptr = new_mask;
		return;
	}

	cpumask_copy(&p->cpus_mask, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

static void
__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	/*
	 * This here violates the locking rules for affinity, since we're only
	 * supposed to change these variables while holding both rq->lock and
	 * p->pi_lock.
	 *
	 * HOWEVER, it magically works, because ttwu() is the only code that
	 * accesses these variables under p->pi_lock and only does so after
	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
	 * before finish_task().
	 */
	if (flags & SCA_MIGRATE_DISABLE)
		SCHED_WARN_ON(!p->on_cpu);
	else
		lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_rq_held(rq);
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask, flags);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	__do_set_cpus_allowed(p, new_mask, 0);
}

int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
		      int node)
{
	if (!src->user_cpus_ptr)
		return 0;

	dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
	if (!dst->user_cpus_ptr)
		return -ENOMEM;

	cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
	return 0;
}

static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
{
	struct cpumask *user_mask = NULL;

	swap(p->user_cpus_ptr, user_mask);

	return user_mask;
}

void release_user_cpus_ptr(struct task_struct *p)
{
	kfree(clear_user_cpus_ptr(p));
}

/*
 * This function is wildly self concurrent; here be dragons.
 *
 * When given a valid mask, __set_cpus_allowed_ptr() must block until the
 * designated task is enqueued on an allowed CPU. If that task is currently
 * running, we have to kick it out using the CPU stopper.
 *
 * Migrate-Disable complicates matters. Consider:
 *
 *     Initial conditions: P0->cpus_mask = [0, 1]
 *
 *     P0@CPU0                  P1
 *
 *     migrate_disable();
 *     <preempted>
 *                              set_cpus_allowed_ptr(P0, [1]);
 *
 * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
 * This leads to the following scheme:
 *
 *     P0@CPU0                  P1
 *
 *     migrate_disable();
 *     <preempted>
 *                              set_cpus_allowed_ptr(P0, [1]);
 *                                <blocks>
 *     <resumes>
 *     migrate_enable();
 *       __set_cpus_allowed_ptr();
 *       <wakes local stopper>
 *                         `--> <woken on migration completion>
 *
 * There may also be several concurrent set_cpus_allowed_ptr() calls targeting
 * the same task. p->migration_pending is the shared state that lets all of
 * them block on the same completion and lets the last-issued destination win.
 */
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
			    int dest_cpu, unsigned int flags)
{
	struct set_affinity_pending my_pending = { }, *pending = NULL;
	bool stop_pending, complete = false;

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
		struct task_struct *push_task = NULL;

		if ((flags & SCA_MIGRATE_ENABLE) &&
		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
			rq->push_busy = true;
			push_task = get_task_struct(p);
		}

		/*
		 * If there are pending waiters, but no pending stop_work,
		 * then complete now.
		 */
		pending = p->migration_pending;
		if (pending && !pending->stop_pending) {
			p->migration_pending = NULL;
			complete = true;
		}

		task_rq_unlock(rq, p, rf);

		if (push_task) {
			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
					    p, &rq->push_work);
		}

		if (complete)
			complete_all(&pending->done);

		return 0;
	}

	if (!(flags & SCA_MIGRATE_ENABLE)) {
		/* serialized by p->pi_lock */
		if (!p->migration_pending) {
			/* Install the request */
			refcount_set(&my_pending.refs, 1);
			init_completion(&my_pending.done);
			my_pending.arg = (struct migration_arg) {
				.task = p,
				.dest_cpu = dest_cpu,
				.pending = &my_pending,
			};

			p->migration_pending = &my_pending;
		} else {
			pending = p->migration_pending;
			refcount_inc(&pending->refs);
			/*
			 * Affinity has changed, but we've already installed a
			 * pending. migration_cpu_stop() *must* see this, else
			 * we risk a completion of the pending despite having a
			 * task on a disallowed CPU.
			 *
			 * Serialized by p->pi_lock, so this is safe.
			 */
			pending->arg.dest_cpu = dest_cpu;
		}
	}
	pending = p->migration_pending;
	/*
	 * - !MIGRATE_ENABLE:
	 *   we'll have installed a pending if there wasn't one already.
	 *
	 * - MIGRATE_ENABLE:
	 *   we're here because the current CPU isn't matching anymore,
	 *   the only way that can happen is because of a concurrent
	 *   set_cpus_allowed_ptr() call, which should then still be
	 *   pending completion.
	 *
	 * Either way, we really should have a @pending here.
	 */
	if (WARN_ON_ONCE(!pending)) {
		task_rq_unlock(rq, p, rf);
		return -EINVAL;
	}

	if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
		/*
		 * MIGRATE_ENABLE gets here because 'p == current', but for
		 * anything else we cannot do is_migration_disabled(), punt
		 * and have the stopper function handle it all race-free.
		 */
		stop_pending = pending->stop_pending;
		if (!stop_pending)
			pending->stop_pending = true;

		if (flags & SCA_MIGRATE_ENABLE)
			p->migration_flags &= ~MDF_PUSH;

		task_rq_unlock(rq, p, rf);

		if (!stop_pending) {
			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
					    &pending->arg, &pending->stop_work);
		}

		if (flags & SCA_MIGRATE_ENABLE)
			return 0;
	} else {

		if (!is_migration_disabled(p)) {
			if (task_on_rq_queued(p))
				rq = move_queued_task(rq, rf, p, dest_cpu);

			if (!pending->stop_pending) {
				p->migration_pending = NULL;
				complete = true;
			}
		}
		task_rq_unlock(rq, p, rf);

		if (complete)
			complete_all(&pending->done);
	}

	wait_for_completion(&pending->done);

	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs); /* No UaF, just an address */

	/*
	 * Block the original owner of &pending until all subsequent callers
	 * have seen the completion and decremented the refcount
	 */
	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));

	WARN_ON_ONCE(my_pending.stop_pending);

	return 0;
}

/*
 * Called with both p->pi_lock and rq->lock held; drops both before returning.
 */
static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
					 const struct cpumask *new_mask,
					 u32 flags,
					 struct rq *rq,
					 struct rq_flags *rf)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	bool kthread = p->flags & PF_KTHREAD;
	struct cpumask *user_mask = NULL;
	unsigned int dest_cpu;
	int ret = 0;

	update_rq_clock(rq);

	if (kthread || is_migration_disabled(p)) {
		/*
		 * Kernel threads are allowed on online && !active CPUs,
		 * however, during cpu-hot-unplug, even these might get pushed
		 * away if not KTHREAD_IS_PER_CPU.
		 *
		 * Specifically, migration_disabled() tasks must not fail the
		 * cpumask_any_and_distribute() pick below, esp. so on
		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (!(flags & SCA_MIGRATE_ENABLE)) {
		if (cpumask_equal(&p->cpus_mask, new_mask))
			goto out;

		if (WARN_ON_ONCE(p == current &&
				 is_migration_disabled(p) &&
				 !cpumask_test_cpu(task_cpu(p), new_mask))) {
			ret = -EBUSY;
			goto out;
		}
	}

	/*
	 * Picking a ~random cpu helps in cases where we are changing affinity
	 * for groups of tasks (ie. cpuset), so that load balancing is not
	 * immediately required to distribute the tasks within their new mask.
	 */
	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
	if (dest_cpu >= nr_cpu_ids) {
		ret = -EINVAL;
		goto out;
	}

	__do_set_cpus_allowed(p, new_mask, flags);

	if (flags & SCA_USER)
		user_mask = clear_user_cpus_ptr(p);

	ret = affine_move_task(rq, p, rf, dest_cpu, flags);

	kfree(user_mask);

	return ret;

out:
	task_rq_unlock(rq, p, rf);

	return ret;
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, u32 flags)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);
	return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

/*
 * Change a given task's CPU affinity to the intersection of its current
 * affinity mask and @subset_mask, writing the resulting mask to @new_mask
 * and pointing @p->user_cpus_ptr to a copy of the old mask.
 * If the resulting mask is empty, it will be converted to -EINVAL;
 * we don't allow a task to run on no CPUs at all.
 */
static int restrict_cpus_allowed_ptr(struct task_struct *p,
				     struct cpumask *new_mask,
				     const struct cpumask *subset_mask)
{
	struct cpumask *user_mask = NULL;
	struct rq_flags rf;
	struct rq *rq;
	int err;

	if (!p->user_cpus_ptr) {
		user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
		if (!user_mask)
			return -ENOMEM;
	}

	rq = task_rq_lock(p, &rf);

	/*
	 * Forcefully restricting the affinity of a deadline task is
	 * likely to cause problems, so fail and noisily override the
	 * mask entirely.
	 */
	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
		err = -EPERM;
		goto err_unlock;
	}

	if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
		err = -EINVAL;
		goto err_unlock;
	}

	/*
	 * We're about to butcher the task affinity, so keep track of what
	 * the user asked for in case we're able to restore it later on.
	 */
	if (user_mask) {
		cpumask_copy(user_mask, p->cpus_ptr);
		p->user_cpus_ptr = user_mask;
	}

	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);

err_unlock:
	task_rq_unlock(rq, p, &rf);
	kfree(user_mask);
	return err;
}

/*
 * Restrict the CPU affinity of task @p so that it is a subset of
 * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
 * old affinity mask. If the resulting mask is empty, we warn and walk
 * up the cpuset hierarchy until we find a suitable mask.
 */
void force_compatible_cpus_allowed_ptr(struct task_struct *p)
{
	cpumask_var_t new_mask;
	const struct cpumask *override_mask = task_cpu_possible_mask(p);

	alloc_cpumask_var(&new_mask, GFP_KERNEL);

	/*
	 * __migrate_task() can fail silently in the face of concurrent
	 * offlining of the chosen destination CPU, so take the hotplug
	 * lock to ensure that the migration succeeds.
	 */
	cpus_read_lock();
	if (!cpumask_available(new_mask))
		goto out_set_mask;

	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
		goto out_free_mask;

	/*
	 * We failed to find a valid subset of the affinity mask for the
	 * task, so override it based on its cpuset hierarchy.
	 */
	cpuset_cpus_allowed(p, new_mask);
	override_mask = new_mask;

out_set_mask:
	if (printk_ratelimit()) {
		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
				task_pid_nr(p), p->comm,
				cpumask_pr_args(override_mask));
	}

	WARN_ON(set_cpus_allowed_ptr(p, override_mask));
out_free_mask:
	cpus_read_unlock();
	free_cpumask_var(new_mask);
}

static int
__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);

/*
 * Restore the affinity of a task @p which was previously restricted by a
 * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
 * @p->user_cpus_ptr.
 *
 * It is the caller's responsibility to serialise this with any calls to
 * force_compatible_cpus_allowed_ptr(@p).
 */
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
	struct cpumask *user_mask = p->user_cpus_ptr;
	unsigned long flags;

	/*
	 * Try to restore the old affinity mask. If this fails, then
	 * we free the mask explicitly to avoid it being inherited across
	 * a subsequent fork().
	 */
	if (!user_mask || !__sched_setaffinity(p, user_mask))
		return;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	user_mask = clear_user_cpus_ptr(p);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	kfree(user_mask);
}

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	unsigned int state = READ_ONCE(p->__state);

	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);

	/*
	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
	 * time relying on p->on_rq.
	 */
	WARN_ON_ONCE(state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
	/*
	 * Clearly, migrating tasks to offline CPUs is a fairly daft idea.
	 */
	WARN_ON_ONCE(!cpu_online(new_cpu));

	WARN_ON_ONCE(is_migration_disabled(p));
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		rseq_migrate(p);
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}
3043
3044#ifdef CONFIG_NUMA_BALANCING
3045static void __migrate_swap_task(struct task_struct *p, int cpu)
3046{
3047 if (task_on_rq_queued(p)) {
3048 struct rq *src_rq, *dst_rq;
3049 struct rq_flags srf, drf;
3050
3051 src_rq = task_rq(p);
3052 dst_rq = cpu_rq(cpu);
3053
3054 rq_pin_lock(src_rq, &srf);
3055 rq_pin_lock(dst_rq, &drf);
3056
3057 deactivate_task(src_rq, p, 0);
3058 set_task_cpu(p, cpu);
3059 activate_task(dst_rq, p, 0);
3060 check_preempt_curr(dst_rq, p, 0);
3061
3062 rq_unpin_lock(dst_rq, &drf);
3063 rq_unpin_lock(src_rq, &srf);
3064
3065 } else {
3066 /*
3067  * Task isn't running anymore; make it appear like we migrated
3068  * it before it went to sleep. This means on wakeup we make the
3069  * previous CPU our target instead of where it really is.
3070  */
3071 p->wake_cpu = cpu;
3072 }
3073}
3074
3075struct migration_swap_arg {
3076 struct task_struct *src_task, *dst_task;
3077 int src_cpu, dst_cpu;
3078};
3079
3080static int migrate_swap_stop(void *data)
3081{
3082 struct migration_swap_arg *arg = data;
3083 struct rq *src_rq, *dst_rq;
3084 int ret = -EAGAIN;
3085
3086 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
3087 return -EAGAIN;
3088
3089 src_rq = cpu_rq(arg->src_cpu);
3090 dst_rq = cpu_rq(arg->dst_cpu);
3091
3092 double_raw_lock(&arg->src_task->pi_lock,
3093 &arg->dst_task->pi_lock);
3094 double_rq_lock(src_rq, dst_rq);
3095
3096 if (task_cpu(arg->dst_task) != arg->dst_cpu)
3097 goto unlock;
3098
3099 if (task_cpu(arg->src_task) != arg->src_cpu)
3100 goto unlock;
3101
3102 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
3103 goto unlock;
3104
3105 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
3106 goto unlock;
3107
3108 __migrate_swap_task(arg->src_task, arg->dst_cpu);
3109 __migrate_swap_task(arg->dst_task, arg->src_cpu);
3110
3111 ret = 0;
3112
3113unlock:
3114 double_rq_unlock(src_rq, dst_rq);
3115 raw_spin_unlock(&arg->dst_task->pi_lock);
3116 raw_spin_unlock(&arg->src_task->pi_lock);
3117
3118 return ret;
3119}
3120
3121/*
3122 * Cross migrate two tasks
3123 */
3124int migrate_swap(struct task_struct *cur, struct task_struct *p,
3125 int target_cpu, int curr_cpu)
3126{
3127 struct migration_swap_arg arg;
3128 int ret = -EINVAL;
3129
3130 arg = (struct migration_swap_arg){
3131 .src_task = cur,
3132 .src_cpu = curr_cpu,
3133 .dst_task = p,
3134 .dst_cpu = target_cpu,
3135 };
3136
3137 if (arg.src_cpu == arg.dst_cpu)
3138 goto out;
3139
3140 /*
3141  * These three tests are all lockless; this is OK since all of them
3142  * will be re-checked with proper locks held afterwards.
3143  */
3144 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
3145 goto out;
3146
3147 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
3148 goto out;
3149
3150 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
3151 goto out;
3152
3153 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
3154 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
3155
3156out:
3157 return ret;
3158}
3159#endif
3160
3161/*
3162 * wait_task_inactive - wait for a thread to unschedule.
3163 *
3164 * If @match_state is nonzero, it's the @p->state value just checked and
3165 * not expected to change.  If it changes, i.e. @p might have woken up,
3166 * then return zero.  When we succeed in waiting for @p to be off its CPU,
3167 * we return a positive number (its total switch count).  If a second call
3168 * a short while later returns the same number, the caller can be sure that
3169 * @p has remained unscheduled the whole time.
3170 *
3171 * The caller must ensure that the task *will* unschedule sometime soon,
3172 * else this function might spin for a very long time. If the caller has
3173 * a need to wait for a specific thread to actually get off the CPU and
3174 * stay there, it needs to be careful about the CPU and its state.
3175 */
3176
3177unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
3178{
3179 int running, queued;
3180 struct rq_flags rf;
3181 unsigned long ncsw;
3182 struct rq *rq;
3183
3184 for (;;) {
3185 /*
3186  * We do the initial early heuristics without holding
3187  * any task-queue locks at all. We'll only try to get
3188  * the runqueue lock when things look like they will
3189  * work out!
3190  */
3191 rq = task_rq(p);
3192
3193 /*
3194  * If the task is actively running on another CPU
3195  * still, just relax and busy-wait without holding
3196  * any locks.
3197  *
3198  * NOTE! Since we don't hold any locks, it's not
3199  * even sure that "rq" stays as the right runqueue!
3200  * But we don't care, since "task_running()" will
3201  * return false if the runqueue has changed and p
3202  * is actually now running somewhere else!
3203  */
3204 while (task_running(rq, p)) {
3205 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3206 return 0;
3207 cpu_relax();
3208 }
3209
3210 /*
3211  * Ok, time to look more closely! We need the rq
3212  * lock now, to be *sure*. If we're wrong, we'll
3213  * just go back and repeat.
3214  */
3215 rq = task_rq_lock(p, &rf);
3216 trace_sched_wait_task(p);
3217 running = task_running(rq, p);
3218 queued = task_on_rq_queued(p);
3219 ncsw = 0;
3220 if (!match_state || READ_ONCE(p->__state) == match_state)
3221 ncsw = p->nvcsw | LONG_MIN;
3222 task_rq_unlock(rq, p, &rf);
3223
3224 /*
3225  * If it changed from the expected state, bail out now.
3226  */
3227 if (unlikely(!ncsw))
3228 break;
3229
3230 /*
3231  * Was it really running after all now that we
3232  * checked with the proper locks actually held?
3233  *
3234  * Oops. Go back and try again..
3235  */
3236 if (unlikely(running)) {
3237 cpu_relax();
3238 continue;
3239 }
3240
3241 /*
3242  * It's not enough that it's not actively running,
3243  * it must be off the runqueue _entirely_, and not
3244  * preempted!
3245  *
3246  * So if it was still runnable (but just not actively
3247  * running right now), it's preempted, and we should
3248  * yield - it could be a while.
3249  */
3250 if (unlikely(queued)) {
3251 ktime_t to = NSEC_PER_SEC / HZ;
3252
3253 set_current_state(TASK_UNINTERRUPTIBLE);
3254 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
3255 continue;
3256 }
3257
3258 /*
3259  * Ahh, all good. It wasn't running, and it wasn't
3260  * runnable, which means that it will never become
3261  * running in the future either. We're all done!
3262  */
3263 break;
3264 }
3265
3266 return ncsw;
3267}
3268
3269/***
3270 * kick_process - kick a running thread to enter/exit the kernel
3271 * @p: the to-be-kicked thread
3272 *
3273 * Cause a process which is running on another CPU to enter
3274 * kernel-mode, without any delay. (to get signals handled.)
3275 *
3276 * NOTE: this function doesn't have to take the runqueue lock,
3277 * because all it wants to ensure is that the remote task enters
3278 * the kernel. If the IPI races and the task has been migrated
3279 * to another CPU then no harm is done and the purpose has been
3280 * achieved as well.
3281 */
3282void kick_process(struct task_struct *p)
3283{
3284 int cpu;
3285
3286 preempt_disable();
3287 cpu = task_cpu(p);
3288 if ((cpu != smp_processor_id()) && task_curr(p))
3289 smp_send_reschedule(cpu);
3290 preempt_enable();
3291}
3292EXPORT_SYMBOL_GPL(kick_process);
3293
3294/*
3295 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
3296 *
3297 * A few notes on cpu_active vs cpu_online:
3298 *
3299 *  - cpu_active must be a subset of cpu_online
3300 *
3301 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
3302 *    see __set_cpus_allowed_ptr(). At this point the newly online
3303 *    CPU isn't yet part of the sched domains, and balancing will not
3304 *    see it.
3305 *
3306 *  - on CPU-down we clear cpu_active() to mask the sched domains and
3307 *    avoid the load balancer to place new tasks on the to be removed
3308 *    CPU. Existing tasks will remain running there and will be taken
3309 *    off.
3310 *
3311 * This means that fallback selection must not select !active CPUs.
3312 * And can assume that any active CPU must be online. Conversely
3313 * select_task_rq() below may allow selection of !active CPUs in order
3314 * to satisfy the above rules.
3315 */
3316static int select_fallback_rq(int cpu, struct task_struct *p)
3317{
3318 int nid = cpu_to_node(cpu);
3319 const struct cpumask *nodemask = NULL;
3320 enum { cpuset, possible, fail } state = cpuset;
3321 int dest_cpu;
3322
3323 /*
3324  * If the node that the CPU is on has been offlined, cpu_to_node()
3325  * will return -1. There is no CPU on the node, and we should
3326  * select the CPU on the other node.
3327  */
3328 if (nid != -1) {
3329 nodemask = cpumask_of_node(nid);
3330
3331 /* Look for allowed, online CPU in same node. */
3332 for_each_cpu(dest_cpu, nodemask) {
3333 if (is_cpu_allowed(p, dest_cpu))
3334 return dest_cpu;
3335 }
3336 }
3337
3338 for (;;) {
3339 /* Any allowed, online CPU? */
3340 for_each_cpu(dest_cpu, p->cpus_ptr) {
3341 if (!is_cpu_allowed(p, dest_cpu))
3342 continue;
3343
3344 goto out;
3345 }
3346
3347 /* No more Mr. Nice Guy. */
3348 switch (state) {
3349 case cpuset:
3350 if (cpuset_cpus_allowed_fallback(p)) {
3351 state = possible;
3352 break;
3353 }
3354 fallthrough;
3355 case possible:
3356 /*
3357  * XXX When called from select_task_rq() we only
3358  * hold p->pi_lock and again violate locking order.
3359  *
3360  * More yuck to audit.
3361  */
3362 do_set_cpus_allowed(p, task_cpu_possible_mask(p));
3363 state = fail;
3364 break;
3365 case fail:
3366 BUG();
3367 break;
3368 }
3369 }
3370
3371out:
3372 if (state != cpuset) {
3373 /*
3374  * Don't tell them about moving exiting tasks or
3375  * kernel threads (both mm NULL), since they never
3376  * leave kernel.
3377  */
3378 if (p->mm && printk_ratelimit()) {
3379 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3380 task_pid_nr(p), p->comm, cpu);
3381 }
3382 }
3383
3384 return dest_cpu;
3385}
3386
3387/*
3388 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
3389 */
3390static inline
3391int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3392{
3393 lockdep_assert_held(&p->pi_lock);
3394
3395 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3396 cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3397 else
3398 cpu = cpumask_any(p->cpus_ptr);
3399
3400 /*
3401  * In order not to call set_task_cpu() on a blocking task we need
3402  * to rely on ttwu() to place the task on a valid ->cpus_ptr
3403  * CPU.
3404  *
3405  * Since this is common to all placement strategies, this lives here.
3406  *
3407  * [ this allows ->select_task() to simply return task_cpu(p) and
3408  *   not worry about this generic constraint ]
3409  */
3410 if (unlikely(!is_cpu_allowed(p, cpu)))
3411 cpu = select_fallback_rq(task_cpu(p), p);
3412
3413 return cpu;
3414}
3415
3416void sched_set_stop_task(int cpu, struct task_struct *stop)
3417{
3418 static struct lock_class_key stop_pi_lock;
3419 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3420 struct task_struct *old_stop = cpu_rq(cpu)->stop;
3421
3422 if (stop) {
3423 /*
3424  * Make it appear like a SCHED_FIFO task, its something
3425  * userspace knows about and won't get confused about.
3426  *
3427  * Also, it will make PI more or less work without too
3428  * much confusion -- but then, stop work should not
3429  * rely on PI working anyway.
3430  */
3431 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3432
3433 stop->sched_class = &stop_sched_class;
3434 /*
3435  * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3436  * adjust the effective priority of a task. As a result,
3437  * rt_mutex_setprio() can trigger (RT) balance callbacks and can
3438  * end up taking rq locks while ->pi_lock is still held.
3439  *
3440  * The stop task itself never participates in PI, but lockdep
3441  * cannot know that and would report recursive locking on
3442  * ->pi_lock; give the stop task's pi_lock its own lock class to
3443  * avoid that false positive.
3444  */
3445
3446
3447 lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3448 }
3449
3450 cpu_rq(cpu)->stop = stop;
3451
3452 if (old_stop) {
3453 /*
3454  * Reset it back to a normal scheduling class so that
3455  * it can die in pieces.
3456  */
3457 old_stop->sched_class = &rt_sched_class;
3458 }
3459}
3460
3461#else
3462
3463static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3464 const struct cpumask *new_mask,
3465 u32 flags)
3466{
3467 return set_cpus_allowed_ptr(p, new_mask);
3468}
3469
3470static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3471
3472static inline bool rq_has_pinned_tasks(struct rq *rq)
3473{
3474 return false;
3475}
3476
3477#endif
3478
3479static void
3480ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3481{
3482 struct rq *rq;
3483
3484 if (!schedstat_enabled())
3485 return;
3486
3487 rq = this_rq();
3488
3489#ifdef CONFIG_SMP
3490 if (cpu == rq->cpu) {
3491 __schedstat_inc(rq->ttwu_local);
3492 __schedstat_inc(p->se.statistics.nr_wakeups_local);
3493 } else {
3494 struct sched_domain *sd;
3495
3496 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
3497 rcu_read_lock();
3498 for_each_domain(rq->cpu, sd) {
3499 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3500 __schedstat_inc(sd->ttwu_wake_remote);
3501 break;
3502 }
3503 }
3504 rcu_read_unlock();
3505 }
3506
3507 if (wake_flags & WF_MIGRATED)
3508 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
3509#endif
3510
3511 __schedstat_inc(rq->ttwu_count);
3512 __schedstat_inc(p->se.statistics.nr_wakeups);
3513
3514 if (wake_flags & WF_SYNC)
3515 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
3516}
3517
3518/*
3519 * Mark the task runnable and perform wakeup-preemption.
3520 */
3521static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3522 struct rq_flags *rf)
3523{
3524 check_preempt_curr(rq, p, wake_flags);
3525 WRITE_ONCE(p->__state, TASK_RUNNING);
3526 trace_sched_wakeup(p);
3527
3528#ifdef CONFIG_SMP
3529 if (p->sched_class->task_woken) {
3530 /*
3531  * Our task @p is fully woken up and running; so it's safe to
3532  * drop the rq->lock, hereafter rq is only used for statistics.
3533  */
3534 rq_unpin_lock(rq, rf);
3535 p->sched_class->task_woken(rq, p);
3536 rq_repin_lock(rq, rf);
3537 }
3538
3539 if (rq->idle_stamp) {
3540 u64 delta = rq_clock(rq) - rq->idle_stamp;
3541 u64 max = 2*rq->max_idle_balance_cost;
3542
3543 update_avg(&rq->avg_idle, delta);
3544
3545 if (rq->avg_idle > max)
3546 rq->avg_idle = max;
3547
3548 rq->wake_stamp = jiffies;
3549 rq->wake_avg_idle = rq->avg_idle / 2;
3550
3551 rq->idle_stamp = 0;
3552 }
3553#endif
3554}
3555
3556static void
3557ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3558 struct rq_flags *rf)
3559{
3560 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3561
3562 lockdep_assert_rq_held(rq);
3563
3564 if (p->sched_contributes_to_load)
3565 rq->nr_uninterruptible--;
3566
3567#ifdef CONFIG_SMP
3568 if (wake_flags & WF_MIGRATED)
3569 en_flags |= ENQUEUE_MIGRATED;
3570 else
3571#endif
3572 if (p->in_iowait) {
3573 delayacct_blkio_end(p);
3574 atomic_dec(&task_rq(p)->nr_iowait);
3575 }
3576
3577 activate_task(rq, p, en_flags);
3578 ttwu_do_wakeup(rq, p, wake_flags, rf);
3579}
3580
3581
3582
3583/*
3584 * Consider @p being inside a wait loop:
3585 *
3586 *   for (;;) {
3587 *      set_current_state(TASK_UNINTERRUPTIBLE);
3588 *
3589 *      if (CONDITION)
3590 *         break;
3591 *
3592 *      schedule();
3593 *   }
3594 *   __set_current_state(TASK_RUNNING);
3595 *
3596 * between set_current_state() and schedule(). In this case @p is still
3597 * runnable, so all that needs doing is change p->state back to TASK_RUNNING
3598 * in a concurrent wakeup.
3599 *
3600 * Returns 1 when the wakeup was handled here because @p was still on its
3601 * runqueue, 0 otherwise (in which case the caller has to do a full
3602 * activation).
3603 */
3604
3605
3606static int ttwu_runnable(struct task_struct *p, int wake_flags)
3607{
3608 struct rq_flags rf;
3609 struct rq *rq;
3610 int ret = 0;
3611
3612 rq = __task_rq_lock(p, &rf);
3613 if (task_on_rq_queued(p)) {
3614
3615 update_rq_clock(rq);
3616 ttwu_do_wakeup(rq, p, wake_flags, &rf);
3617 ret = 1;
3618 }
3619 __task_rq_unlock(rq, &rf);
3620
3621 return ret;
3622}
3623
3624#ifdef CONFIG_SMP
3625void sched_ttwu_pending(void *arg)
3626{
3627 struct llist_node *llist = arg;
3628 struct rq *rq = this_rq();
3629 struct task_struct *p, *t;
3630 struct rq_flags rf;
3631
3632 if (!llist)
3633 return;
3634
3635
3636
3637
3638
3639
3640 WRITE_ONCE(rq->ttwu_pending, 0);
3641
3642 rq_lock_irqsave(rq, &rf);
3643 update_rq_clock(rq);
3644
3645 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3646 if (WARN_ON_ONCE(p->on_cpu))
3647 smp_cond_load_acquire(&p->on_cpu, !VAL);
3648
3649 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3650 set_task_cpu(p, cpu_of(rq));
3651
3652 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3653 }
3654
3655 rq_unlock_irqrestore(rq, &rf);
3656}
3657
3658void send_call_function_single_ipi(int cpu)
3659{
3660 struct rq *rq = cpu_rq(cpu);
3661
3662 if (!set_nr_if_polling(rq->idle))
3663 arch_send_call_function_single_ipi(cpu);
3664 else
3665 trace_sched_wake_idle_without_ipi(cpu);
3666}
3667
3668
3669
3670
3671
3672
3673
3674static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3675{
3676 struct rq *rq = cpu_rq(cpu);
3677
3678 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3679
3680 WRITE_ONCE(rq->ttwu_pending, 1);
3681 __smp_call_single_queue(cpu, &p->wake_entry.llist);
3682}
3683
3684void wake_up_if_idle(int cpu)
3685{
3686 struct rq *rq = cpu_rq(cpu);
3687 struct rq_flags rf;
3688
3689 rcu_read_lock();
3690
3691 if (!is_idle_task(rcu_dereference(rq->curr)))
3692 goto out;
3693
3694 if (set_nr_if_polling(rq->idle)) {
3695 trace_sched_wake_idle_without_ipi(cpu);
3696 } else {
3697 rq_lock_irqsave(rq, &rf);
3698 if (is_idle_task(rq->curr))
3699 smp_send_reschedule(cpu);
3700
3701 rq_unlock_irqrestore(rq, &rf);
3702 }
3703
3704out:
3705 rcu_read_unlock();
3706}
3707
3708bool cpus_share_cache(int this_cpu, int that_cpu)
3709{
3710 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3711}
3712
3713static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3714{
3715 /*
3716  * Do not complicate things with the async wake_list while the CPU is
3717  * in hotplug state.
3718  */
3719 if (!cpu_active(cpu))
3720 return false;
3721
3722 /*
3723  * If the CPU does not share cache, then queue the task on the
3724  * remote rqs wakelist to avoid accessing remote data.
3725  */
3726 if (!cpus_share_cache(smp_processor_id(), cpu))
3727 return true;
3728
3729 /*
3730  * If the task is descheduling and the only running task on the
3731  * CPU then use the wakelist to offload the task activation to
3732  * the soon-to-be-idle CPU as the current CPU is likely busy.
3733  * nr_running is checked to avoid unnecessary task stacking.
3734  */
3735 if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3736 return true;
3737
3738 return false;
3739}
3740
3741static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3742{
3743 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3744 if (WARN_ON_ONCE(cpu == smp_processor_id()))
3745 return false;
3746
3747 sched_clock_cpu(cpu);
3748 __ttwu_queue_wakelist(p, cpu, wake_flags);
3749 return true;
3750 }
3751
3752 return false;
3753}
3754
3755#else
3756
3757static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3758{
3759 return false;
3760}
3761
3762#endif
3763
3764static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3765{
3766 struct rq *rq = cpu_rq(cpu);
3767 struct rq_flags rf;
3768
3769 if (ttwu_queue_wakelist(p, cpu, wake_flags))
3770 return;
3771
3772 rq_lock(rq, &rf);
3773 update_rq_clock(rq);
3774 ttwu_do_activate(rq, p, wake_flags, &rf);
3775 rq_unlock(rq, &rf);
3776}
3777
3778/*
3779 * Invoked from try_to_wake_up() to check whether the task can be woken up.
3780 *
3781 * The caller holds p::pi_lock if p != current or has preemption
3782 * disabled when p == current.
3783 *
3784 * The rules of PREEMPT_RT saved_state:
3785 *
3786 *   The related locking code always holds p::pi_lock when updating
3787 *   p::saved_state, which means the code is fully serialized in both cases.
3788 *
3789 *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
3790 *   bits set. This allows to distinguish all wakeup scenarios.
3791 */
3792static __always_inline
3793bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
3794{
3795 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
3796 WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
3797 state != TASK_RTLOCK_WAIT);
3798 }
3799
3800 if (READ_ONCE(p->__state) & state) {
3801 *success = 1;
3802 return true;
3803 }
3804
3805#ifdef CONFIG_PREEMPT_RT
3806 /*
3807  * Saved state preserves the task state across blocking on
3808  * an RT lock.  If the state matches, set p::saved_state to
3809  * TASK_RUNNING, but do not wake the task because it waits
3810  * for a lock wakeup. Also indicate success because from
3811  * the regular waker's point of view this has succeeded.
3812  *
3813  * After acquiring the lock the task will restore p::__state
3814  * from p::saved_state which ensures that the regular
3815  * wakeup is not lost. The restore will also set
3816  * p::saved_state to TASK_RUNNING so any further tests will
3817  * not result in false positives vs. @success
3818  */
3819 if (p->saved_state & state) {
3820 p->saved_state = TASK_RUNNING;
3821 *success = 1;
3822 }
3823#endif
3824 return false;
3825}
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910/**
3911 * try_to_wake_up - wake up a thread
3912 * @p: the thread to be awakened
3913 * @state: the mask of task states that can be woken
3914 * @wake_flags: wake modifier flags (WF_*)
3915 *
3916 * Conceptually does:
3917 *
3918 *   If (@state & @p->state) @p->state = TASK_RUNNING.
3919 *
3920 * If the task was not queued/runnable, also place it back on a runqueue.
3921 *
3922 * This function is atomic against schedule() which would dequeue the task.
3923 *
3924 * It issues a full memory barrier before accessing @p->state, see the
3925 * comment with set_current_state().
3926 *
3927 * Uses p->pi_lock to serialize against concurrent wake-ups.
3928 *
3929 * Relies on p->pi_lock stabilizing:
3930 *  - p->sched_class
3931 *  - p->cpus_ptr
3932 *  - p->sched_task_group
3933 * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3934 *
3935 * Tries really hard to only take one task_rq(p)->lock for performance.
3936 * Takes rq->lock in:
3937 *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
3938 *  - ttwu_queue()       -- new rq, for enqueue of the task;
3939 *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3940 *
3941 * As a consequence we race really badly with just about everything. See the
3942 * many memory barriers and their comments for details.
3943 *
3944 * Return: %true if @p->state changes (an actual wakeup was done),
3945 *         %false otherwise.
3946 */
3947static int
3948try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3949{
3950 unsigned long flags;
3951 int cpu, success = 0;
3952
3953 preempt_disable();
3954 if (p == current) {
3955 /*
3956  * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3957  * == smp_processor_id()'. In other words, the task is certainly
3958  * runnable: no enqueue or migration is needed, so all that
3959  * remains is to flip p->__state back to TASK_RUNNING.
3960  *
3961  * We cannot race against a concurrent schedule() of @p here,
3962  * because @p is current and preemption is disabled; taking
3963  * p->pi_lock and the rq lock can therefore be skipped.
3964  */
3965
3966 if (!ttwu_state_match(p, state, &success))
3967 goto out;
3968
3969 trace_sched_waking(p);
3970 WRITE_ONCE(p->__state, TASK_RUNNING);
3971 trace_sched_wakeup(p);
3972 goto out;
3973 }
3974
3975 /*
3976  * If we are going to wake up a thread waiting for CONDITION we
3977  * need to ensure that CONDITION=1 done by the caller can not be
3978  * reordered with p->state check below. This pairs with smp_store_mb()
3979  * in set_current_state() that the waiting thread does.
3980  */
3981 raw_spin_lock_irqsave(&p->pi_lock, flags);
3982 smp_mb__after_spinlock();
3983 if (!ttwu_state_match(p, state, &success))
3984 goto unlock;
3985
3986 trace_sched_waking(p);
3987
3988 /*
3989  * Ensure we load p->on_rq _after_ p->state, otherwise it would
3990  * be possible to, falsely, observe p->on_rq == 0 and get stuck
3991  * in smp_cond_load_acquire() below.
3992  *
3993  * sched_ttwu_pending()			try_to_wake_up()
3994  *   STORE p->on_rq = 1			  LOAD p->state
3995  *   UNLOCK rq->lock
3996  *
3997  * __schedule() (switch to task 'p')
3998  *   LOCK rq->lock			  smp_rmb();
3999  *   smp_mb__after_spinlock();
4000  *   UNLOCK rq->lock
4001  *
4002  * [task p]
4003  *   STORE p->state = UNINTERRUPTIBLE	  LOAD p->on_rq
4004  *
4005  * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4006  * __schedule(). See the comment for smp_mb__after_spinlock().
4007  */
4008
4009
4010 smp_rmb();
4011 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
4012 goto unlock;
4013
4014#ifdef CONFIG_SMP
4015 /*
4016  * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
4017  * possible to, falsely, observe p->on_cpu == 0.
4018  *
4019  * One must be running (->on_cpu == 1) in order to remove oneself
4020  * from the runqueue.
4021  *
4022  * __schedule() (switch to task 'p')	try_to_wake_up()
4023  *   STORE p->on_cpu = 1		  LOAD p->on_rq
4024  *   UNLOCK rq->lock
4025  *
4026  * __schedule() (put 'p' to sleep)
4027  *   LOCK rq->lock			  smp_rmb();
4028  *   smp_mb__after_spinlock();
4029  *   STORE p->on_rq = 0		  LOAD p->on_cpu
4030  *
4031  * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4032  * __schedule().
4033  *
4034  * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
4035  * schedule()'s deactivate_task() has 'happened' and p will no longer
4036  * care about its own p->state. See the comment in __schedule().
4037  */
4038 smp_acquire__after_ctrl_dep();
4039
4040 /*
4041  * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
4042  * == 0), which means we need to do an enqueue, change p->state to
4043  * TASK_WAKING such that we can unlock p->pi_lock before doing the
4044  * enqueue, such as ttwu_queue_wakelist().
4045  */
4046 WRITE_ONCE(p->__state, TASK_WAKING);
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058 /*
4059  * If the owning (remote) CPU is still in the middle of schedule()
4060  * with this task as prev, consider queueing p on that CPU's
4061  * wake_list and sending an IPI rather than spinning on p->on_cpu,
4062  * so the waker can make forward progress.
4063  *
4064  * smp_load_acquire() on p->on_cpu also orders the task_cpu(p) load
4065  * after it; task_cpu(p) is stable while ->on_cpu is set.
4066  */
4067 if (smp_load_acquire(&p->on_cpu) &&
4068 ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
4069 goto unlock;
4070
4071 /*
4072  * If the owning (remote) CPU is still in the middle of schedule() with
4073  * this task as prev, wait until it's done referencing the task.
4074  *
4075  * Pairs with the smp_store_release() in finish_task().
4076  *
4077  * This ensures that tasks getting woken will be fully ordered against
4078  * their previous state and preserve Program Order.
4079  */
4080 smp_cond_load_acquire(&p->on_cpu, !VAL);
4081
4082 cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
4083 if (task_cpu(p) != cpu) {
4084 if (p->in_iowait) {
4085 delayacct_blkio_end(p);
4086 atomic_dec(&task_rq(p)->nr_iowait);
4087 }
4088
4089 wake_flags |= WF_MIGRATED;
4090 psi_ttwu_dequeue(p);
4091 set_task_cpu(p, cpu);
4092 }
4093#else
4094 cpu = task_cpu(p);
4095#endif
4096
4097 ttwu_queue(p, cpu, wake_flags);
4098unlock:
4099 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4100out:
4101 if (success)
4102 ttwu_stat(p, task_cpu(p), wake_flags);
4103 preempt_enable();
4104
4105 return success;
4106}
4107
4108/**
4109 * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
4110 * @p: Process for which the function is to be invoked, can be @current.
4111 * @func: Function to invoke.
4112 * @arg: Argument to function.
4113 *
4114 * If the specified task can be quickly locked into a definite state
4115 * (either sleeping or on a given runqueue), arrange to keep it in that
4116 * state while invoking @func(@arg).  This function can use ->on_rq and
4117 * task_curr() to work out what the state is, if required.  Given that
4118 * @func can be invoked with a runqueue lock held, it had better be quite
4119 * lightweight.
4120 *
4121 * Returns:
4122 *   @false if the task slipped out from under the locks.
4123 *   @true if the task was locked onto a runqueue or is sleeping.
4124 *         However, @func can override this by returning @false.
4125 */
4126bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
4127{
4128 struct rq_flags rf;
4129 bool ret = false;
4130 struct rq *rq;
4131
4132 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4133 if (p->on_rq) {
4134 rq = __task_rq_lock(p, &rf);
4135 if (task_rq(p) == rq)
4136 ret = func(p, arg);
4137 rq_unlock(rq, &rf);
4138 } else {
4139 switch (READ_ONCE(p->__state)) {
4140 case TASK_RUNNING:
4141 case TASK_WAKING:
4142 break;
4143 default:
4144 smp_rmb();
4145 if (!p->on_rq)
4146 ret = func(p, arg);
4147 }
4148 }
4149 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
4150 return ret;
4151}
4152
4153/**
4154 * wake_up_process - Wake up a specific process
4155 * @p: The process to be woken up.
4156 *
4157 * Attempt to wake up the nominated process and move it to the set of runnable
4158 * processes.
4159 *
4160 * Return: 1 if the process was woken up, 0 if it was already running.
4161 *
4162 * This function executes a full memory barrier before accessing the task state.
4163 */
4164int wake_up_process(struct task_struct *p)
4165{
4166 return try_to_wake_up(p, TASK_NORMAL, 0);
4167}
4168EXPORT_SYMBOL(wake_up_process);
4169
4170int wake_up_state(struct task_struct *p, unsigned int state)
4171{
4172 return try_to_wake_up(p, state, 0);
4173}
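
/*
 * Illustrative sketch (not part of the original file): the canonical
 * wait/wake pattern the two helpers above pair with. 'my_event' and
 * 'my_task' are hypothetical names used only for this example.
 *
 *   // waiter:
 *   for (;;) {
 *           set_current_state(TASK_INTERRUPTIBLE);
 *           if (READ_ONCE(my_event))
 *                   break;
 *           schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 *   // waker:
 *   WRITE_ONCE(my_event, 1);      // publish the condition first
 *   wake_up_process(my_task);     // then wake; implies a full barrier
 *
 * The full memory barrier try_to_wake_up() executes before reading
 * p->__state pairs with set_current_state() in the waiter, so the store
 * to my_event cannot be reordered past the state check.
 */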
4174
4175/*
4176 * Perform scheduler related setup for a newly forked process p.
4177 * p is forked by current.
4178 *
4179 * __sched_fork() is basic setup used by init_idle() too:
4180 */
4181static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
4182{
4183 p->on_rq = 0;
4184
4185 p->se.on_rq = 0;
4186 p->se.exec_start = 0;
4187 p->se.sum_exec_runtime = 0;
4188 p->se.prev_sum_exec_runtime = 0;
4189 p->se.nr_migrations = 0;
4190 p->se.vruntime = 0;
4191 INIT_LIST_HEAD(&p->se.group_node);
4192
4193#ifdef CONFIG_FAIR_GROUP_SCHED
4194 p->se.cfs_rq = NULL;
4195#endif
4196
4197#ifdef CONFIG_SCHEDSTATS
4198 /* Even if schedstat is disabled, there should not be garbage */
4199 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
4200#endif
4201
4202 RB_CLEAR_NODE(&p->dl.rb_node);
4203 init_dl_task_timer(&p->dl);
4204 init_dl_inactive_task_timer(&p->dl);
4205 __dl_clear_params(p);
4206
4207 INIT_LIST_HEAD(&p->rt.run_list);
4208 p->rt.timeout = 0;
4209 p->rt.time_slice = sched_rr_timeslice;
4210 p->rt.on_rq = 0;
4211 p->rt.on_list = 0;
4212
4213#ifdef CONFIG_PREEMPT_NOTIFIERS
4214 INIT_HLIST_HEAD(&p->preempt_notifiers);
4215#endif
4216
4217#ifdef CONFIG_COMPACTION
4218 p->capture_control = NULL;
4219#endif
4220 init_numa_balancing(clone_flags, p);
4221#ifdef CONFIG_SMP
4222 p->wake_entry.u_flags = CSD_TYPE_TTWU;
4223 p->migration_pending = NULL;
4224#endif
4225}
4226
4227DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
4228
4229#ifdef CONFIG_NUMA_BALANCING
4230
4231void set_numabalancing_state(bool enabled)
4232{
4233 if (enabled)
4234 static_branch_enable(&sched_numa_balancing);
4235 else
4236 static_branch_disable(&sched_numa_balancing);
4237}
4238
4239#ifdef CONFIG_PROC_SYSCTL
4240int sysctl_numa_balancing(struct ctl_table *table, int write,
4241 void *buffer, size_t *lenp, loff_t *ppos)
4242{
4243 struct ctl_table t;
4244 int err;
4245 int state = static_branch_likely(&sched_numa_balancing);
4246
4247 if (write && !capable(CAP_SYS_ADMIN))
4248 return -EPERM;
4249
4250 t = *table;
4251 t.data = &state;
4252 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4253 if (err < 0)
4254 return err;
4255 if (write)
4256 set_numabalancing_state(state);
4257 return err;
4258}
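
/*
 * Usage sketch (an aside, not from the original source): this handler
 * backs /proc/sys/kernel/numa_balancing, so the feature can be toggled
 * at runtime with e.g.:
 *
 *   # sysctl kernel.numa_balancing=1
 */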
4259#endif
4260#endif
4261
4262#ifdef CONFIG_SCHEDSTATS
4263
4264DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4265
4266static void set_schedstats(bool enabled)
4267{
4268 if (enabled)
4269 static_branch_enable(&sched_schedstats);
4270 else
4271 static_branch_disable(&sched_schedstats);
4272}
4273
4274void force_schedstat_enabled(void)
4275{
4276 if (!schedstat_enabled()) {
4277 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4278 static_branch_enable(&sched_schedstats);
4279 }
4280}
4281
4282static int __init setup_schedstats(char *str)
4283{
4284 int ret = 0;
4285 if (!str)
4286 goto out;
4287
4288 if (!strcmp(str, "enable")) {
4289 set_schedstats(true);
4290 ret = 1;
4291 } else if (!strcmp(str, "disable")) {
4292 set_schedstats(false);
4293 ret = 1;
4294 }
4295out:
4296 if (!ret)
4297 pr_warn("Unable to parse schedstats=\n");
4298
4299 return ret;
4300}
4301__setup("schedstats=", setup_schedstats);
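
/*
 * Usage sketch (an aside, not from the original source): schedstats can
 * be enabled at boot with "schedstats=enable" on the kernel command line,
 * or flipped at runtime through the kernel.sched_schedstats sysctl
 * handled below.
 */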
4302
4303#ifdef CONFIG_PROC_SYSCTL
4304int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4305 size_t *lenp, loff_t *ppos)
4306{
4307 struct ctl_table t;
4308 int err;
4309 int state = static_branch_likely(&sched_schedstats);
4310
4311 if (write && !capable(CAP_SYS_ADMIN))
4312 return -EPERM;
4313
4314 t = *table;
4315 t.data = &state;
4316 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4317 if (err < 0)
4318 return err;
4319 if (write)
4320 set_schedstats(state);
4321 return err;
4322}
4323#endif
4324#endif
4325
4326/*
4327 * fork()/clone()-time setup:
4328 */
4329int sched_fork(unsigned long clone_flags, struct task_struct *p)
4330{
4331 unsigned long flags;
4332
4333 __sched_fork(clone_flags, p);
4334
4335 /*
4336  * We mark the process as NEW here: nobody can run it yet, and no
4337  * signal or other external event can wake it up and enqueue it.
4338  */
4339 p->__state = TASK_NEW;
4340
4341 /*
4342  * Make sure we do not leak PI boosting priority to the child.
4343  */
4344 p->prio = current->normal_prio;
4345
4346 uclamp_fork(p);
4347
4348 /*
4349  * Revert to default priority/policy on fork if requested.
4350  */
4351 if (unlikely(p->sched_reset_on_fork)) {
4352 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4353 p->policy = SCHED_NORMAL;
4354 p->static_prio = NICE_TO_PRIO(0);
4355 p->rt_priority = 0;
4356 } else if (PRIO_TO_NICE(p->static_prio) < 0)
4357 p->static_prio = NICE_TO_PRIO(0);
4358
4359 p->prio = p->normal_prio = p->static_prio;
4360 set_load_weight(p, false);
4361
4362 /*
4363  * We don't need the reset flag anymore after the fork. It has
4364  * fulfilled its duty:
4365  */
4366 p->sched_reset_on_fork = 0;
4367 }
4368
4369 if (dl_prio(p->prio))
4370 return -EAGAIN;
4371 else if (rt_prio(p->prio))
4372 p->sched_class = &rt_sched_class;
4373 else
4374 p->sched_class = &fair_sched_class;
4375
4376 init_entity_runnable_average(&p->se);
4377
4378 /*
4379  * The child is not yet in the pid-hash so no cgroup attach races,
4380  * and the cgroup is pinned to this child due to cgroup_fork()
4381  * is ran before sched_fork().
4382  *
4383  * Silence PROVE_RCU.
4384  */
4385 raw_spin_lock_irqsave(&p->pi_lock, flags);
4386 rseq_migrate(p);
4387
4388 /*
4389  * First CPU assignment, not a migration: use __set_task_cpu().
4390  */
4391 __set_task_cpu(p, smp_processor_id());
4392 if (p->sched_class->task_fork)
4393 p->sched_class->task_fork(p);
4394 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4395
4396#ifdef CONFIG_SCHED_INFO
4397 if (likely(sched_info_on()))
4398 memset(&p->sched_info, 0, sizeof(p->sched_info));
4399#endif
4400#if defined(CONFIG_SMP)
4401 p->on_cpu = 0;
4402#endif
4403 init_task_preempt_count(p);
4404#ifdef CONFIG_SMP
4405 plist_node_init(&p->pushable_tasks, MAX_PRIO);
4406 RB_CLEAR_NODE(&p->pushable_dl_tasks);
4407#endif
4408 return 0;
4409}
4410
4411void sched_post_fork(struct task_struct *p)
4412{
4413 uclamp_post_fork(p);
4414}
4415
4416unsigned long to_ratio(u64 period, u64 runtime)
4417{
4418 if (runtime == RUNTIME_INF)
4419 return BW_UNIT;
4420
4421
4422
4423
4424
4425
4426 if (period == 0)
4427 return 0;
4428
4429 return div64_u64(runtime << BW_SHIFT, period);
4430}
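
/*
 * Worked example (an aside, not from the original source): with
 * BW_SHIFT = 20 the result is a fixed-point fraction scaled by
 * BW_UNIT = 2^20 = 1048576. For a bandwidth of runtime = 10ms out of
 * period = 100ms:
 *
 *   to_ratio(100 * NSEC_PER_MSEC, 10 * NSEC_PER_MSEC)
 *     = (10000000 << 20) / 100000000
 *     = 104857		(~0.1 * BW_UNIT)
 */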
4431
4432/*
4433 * wake_up_new_task - wake up a newly created task for the first time.
4434 *
4435 * This function will do some initial scheduler statistics housekeeping
4436 * that must be done for every newly created context, then puts the task
4437 * on the runqueue and wakes it.
4438 */
4439void wake_up_new_task(struct task_struct *p)
4440{
4441 struct rq_flags rf;
4442 struct rq *rq;
4443
4444 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4445 WRITE_ONCE(p->__state, TASK_RUNNING);
4446#ifdef CONFIG_SMP
4447 /*
4448  * Fork balancing, do it here and not earlier because:
4449  *  - cpus_ptr can change in the fork path
4450  *  - any previously selected CPU might disappear through hotplug
4451  *
4452  * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
4453  * as we're not fully set-up yet.
4454  */
4455 p->recent_used_cpu = task_cpu(p);
4456 rseq_migrate(p);
4457 __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4458#endif
4459 rq = __task_rq_lock(p, &rf);
4460 update_rq_clock(rq);
4461 post_init_entity_util_avg(p);
4462
4463 activate_task(rq, p, ENQUEUE_NOCLOCK);
4464 trace_sched_wakeup_new(p);
4465 check_preempt_curr(rq, p, WF_FORK);
4466#ifdef CONFIG_SMP
4467 if (p->sched_class->task_woken) {
4468 /*
4469  * Nothing relies on rq->lock after this, so it's fine to
4470  * drop it.
4471  */
4472 rq_unpin_lock(rq, &rf);
4473 p->sched_class->task_woken(rq, p);
4474 rq_repin_lock(rq, &rf);
4475 }
4476#endif
4477 task_rq_unlock(rq, p, &rf);
4478}
4479
4480#ifdef CONFIG_PREEMPT_NOTIFIERS
4481
4482static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4483
4484void preempt_notifier_inc(void)
4485{
4486 static_branch_inc(&preempt_notifier_key);
4487}
4488EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4489
4490void preempt_notifier_dec(void)
4491{
4492 static_branch_dec(&preempt_notifier_key);
4493}
4494EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4495
4496/**
4497 * preempt_notifier_register - tell me when current is being preempted & rescheduled
4498 * @notifier: notifier struct to register
4499 */
4500void preempt_notifier_register(struct preempt_notifier *notifier)
4501{
4502 if (!static_branch_unlikely(&preempt_notifier_key))
4503 WARN(1, "registering preempt_notifier while notifiers disabled\n");
4504
4505 hlist_add_head(&notifier->link, &current->preempt_notifiers);
4506}
4507EXPORT_SYMBOL_GPL(preempt_notifier_register);
4508
4509/**
4510 * preempt_notifier_unregister - no longer interested in preemption notifications
4511 * @notifier: notifier struct to unregister
4512 *
4513 * This is *not* safe to call from within a preemption notifier.
4514 */
4515void preempt_notifier_unregister(struct preempt_notifier *notifier)
4516{
4517 hlist_del(&notifier->link);
4518}
4519EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
4520
4521static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4522{
4523 struct preempt_notifier *notifier;
4524
4525 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4526 notifier->ops->sched_in(notifier, raw_smp_processor_id());
4527}
4528
4529static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4530{
4531 if (static_branch_unlikely(&preempt_notifier_key))
4532 __fire_sched_in_preempt_notifiers(curr);
4533}
4534
4535static void
4536__fire_sched_out_preempt_notifiers(struct task_struct *curr,
4537 struct task_struct *next)
4538{
4539 struct preempt_notifier *notifier;
4540
4541 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4542 notifier->ops->sched_out(notifier, next);
4543}
4544
4545static __always_inline void
4546fire_sched_out_preempt_notifiers(struct task_struct *curr,
4547 struct task_struct *next)
4548{
4549 if (static_branch_unlikely(&preempt_notifier_key))
4550 __fire_sched_out_preempt_notifiers(curr, next);
4551}
4552
4553#else
4554
4555static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4556{
4557}
4558
4559static inline void
4560fire_sched_out_preempt_notifiers(struct task_struct *curr,
4561 struct task_struct *next)
4562{
4563}
4564
4565#endif
4566
4567static inline void prepare_task(struct task_struct *next)
4568{
4569#ifdef CONFIG_SMP
4570 /*
4571  * Claim the task as running, we do this before switching to it
4572  * such that any running task will have this set.
4573  *
4574  * See the ttwu() WF_ON_CPU case and its ordering comment.
4575  */
4576 WRITE_ONCE(next->on_cpu, 1);
4577#endif
4578}
4579
4580static inline void finish_task(struct task_struct *prev)
4581{
4582#ifdef CONFIG_SMP
4583 /*
4584  * This must be the very last reference to @prev from this CPU. After
4585  * p->on_cpu is cleared, the task can be moved to a different CPU. We
4586  * must ensure this doesn't happen until the switch is completely
4587  * finished.
4588  *
4589  * In particular, the load of prev->state in finish_task_switch() must
4590  * happen before this.
4591  *
4592  * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
4593  */
4594 smp_store_release(&prev->on_cpu, 0);
4595#endif
4596}
4597
4598#ifdef CONFIG_SMP
4599
4600static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4601{
4602 void (*func)(struct rq *rq);
4603 struct callback_head *next;
4604
4605 lockdep_assert_rq_held(rq);
4606
4607 while (head) {
4608 func = (void (*)(struct rq *))head->func;
4609 next = head->next;
4610 head->next = NULL;
4611 head = next;
4612
4613 func(rq);
4614 }
4615}
4616
4617static void balance_push(struct rq *rq);
4618
4619struct callback_head balance_push_callback = {
4620 .next = NULL,
4621 .func = (void (*)(struct callback_head *))balance_push,
4622};
4623
4624static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4625{
4626 struct callback_head *head = rq->balance_callback;
4627
4628 lockdep_assert_rq_held(rq);
4629 if (head)
4630 rq->balance_callback = NULL;
4631
4632 return head;
4633}
4634
4635static void __balance_callbacks(struct rq *rq)
4636{
4637 do_balance_callbacks(rq, splice_balance_callbacks(rq));
4638}
4639
4640static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4641{
4642 unsigned long flags;
4643
4644 if (unlikely(head)) {
4645 raw_spin_rq_lock_irqsave(rq, flags);
4646 do_balance_callbacks(rq, head);
4647 raw_spin_rq_unlock_irqrestore(rq, flags);
4648 }
4649}
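
/*
 * Usage sketch (assembled from this file, not upstream text): a balance
 * callback is queued while holding the rq lock and runs once it is safe
 * to retake rq locks, e.g.:
 *
 *   rq_lock(rq, &rf);
 *   ...
 *   queue_balance_callback(rq, &per_cpu(some_head, rq->cpu), some_func);
 *   rq_unlock(rq, &rf);
 *
 * ('some_head' and 'some_func' are hypothetical.) __balance_callbacks()
 * runs the queued work right before the rq lock is dropped, while
 * splice_balance_callbacks() + balance_callbacks() lets a caller detach
 * the list under the lock and run it after unlocking.
 */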
4650
4651#else
4652
4653static inline void __balance_callbacks(struct rq *rq)
4654{
4655}
4656
4657static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4658{
4659 return NULL;
4660}
4661
4662static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4663{
4664}
4665
4666#endif
4667
4668static inline void
4669prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4670{
4671 /*
4672  * Since the runqueue lock will be released by the next
4673  * task (which is an invalid locking op but in the case
4674  * of the scheduler it's an obvious special-case), so we
4675  * do an early lockdep release here:
4676  */
4677 rq_unpin_lock(rq, rf);
4678 spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
4679#ifdef CONFIG_DEBUG_SPINLOCK
4680
4681 rq_lockp(rq)->owner = next;
4682#endif
4683}
4684
4685static inline void finish_lock_switch(struct rq *rq)
4686{
4687 /*
4688  * If we are tracking spinlock dependencies then we have to
4689  * fix up the runqueue lock - which gets 'carried over' from
4690  * prev into current:
4691  */
4692 spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
4693 __balance_callbacks(rq);
4694 raw_spin_rq_unlock_irq(rq);
4695}
4696
4697
4698/*
4699 * NOP if the arch has not defined these:
4700 */
4701#ifndef prepare_arch_switch
4702# define prepare_arch_switch(next) do { } while (0)
4703#endif
4704
4705#ifndef finish_arch_post_lock_switch
4706# define finish_arch_post_lock_switch() do { } while (0)
4707#endif
4708
4709static inline void kmap_local_sched_out(void)
4710{
4711#ifdef CONFIG_KMAP_LOCAL
4712 if (unlikely(current->kmap_ctrl.idx))
4713 __kmap_local_sched_out();
4714#endif
4715}
4716
4717static inline void kmap_local_sched_in(void)
4718{
4719#ifdef CONFIG_KMAP_LOCAL
4720 if (unlikely(current->kmap_ctrl.idx))
4721 __kmap_local_sched_in();
4722#endif
4723}
4724
4725/**
4726 * prepare_task_switch - prepare to switch tasks
4727 * @rq: the runqueue preparing to switch
4728 * @prev: the current task that is being switched out
4729 * @next: the task we are going to switch to.
4730 *
4731 * This is called with the rq lock held and interrupts off. It must
4732 * be paired with a subsequent finish_task_switch after the context
4733 * switch.
4734 *
4735 * prepare_task_switch sets up locking and calls architecture specific
4736 * hooks.
4737 */
4738static inline void
4739prepare_task_switch(struct rq *rq, struct task_struct *prev,
4740 struct task_struct *next)
4741{
4742 kcov_prepare_switch(prev);
4743 sched_info_switch(rq, prev, next);
4744 perf_event_task_sched_out(prev, next);
4745 rseq_preempt(prev);
4746 fire_sched_out_preempt_notifiers(prev, next);
4747 kmap_local_sched_out();
4748 prepare_task(next);
4749 prepare_arch_switch(next);
4750}
4751
4752/**
4753 * finish_task_switch - clean up after a task-switch
4754 * @prev: the thread we just switched away from.
4755 *
4756 * finish_task_switch must be called after the context switch, paired
4757 * with a prepare_task_switch call before the context switch.
4758 * finish_task_switch will reconcile locking set up by prepare_task_switch,
4759 * and do any other architecture-specific cleanup actions.
4760 *
4761 * Note that we may have delayed dropping an mm in context_switch(). If
4762 * so, we finish that here outside of the runqueue lock. (Doing it
4763 * with the lock held can cause deadlocks; see schedule() for
4764 * details.)
4765 *
4766 * The context switch have flipped the stack from under us and restored the
4767 * local variables which were saved when this task called schedule() in the
4768 * past. prev == current is still correct but we need to recalculate this_rq
4769 * because prev may have moved to another CPU.
4770 */
4771static struct rq *finish_task_switch(struct task_struct *prev)
4772 __releases(rq->lock)
4773{
4774 struct rq *rq = this_rq();
4775 struct mm_struct *mm = rq->prev_mm;
4776 long prev_state;
4777
4778 /*
4779  * The previous task will have left us with a preempt_count of 2
4780  * because it left us after:
4781  *
4782  *	schedule()
4783  *	  preempt_disable();			// 1
4784  *	  __schedule()
4785  *	    raw_spin_lock_irq(&rq->lock)	// 2
4786  *
4787  * Also, see FORK_PREEMPT_COUNT.
4788  */
4789 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4790 "corrupted preempt_count: %s/%d/0x%x\n",
4791 current->comm, current->pid, preempt_count()))
4792 preempt_count_set(FORK_PREEMPT_COUNT);
4793
4794 rq->prev_mm = NULL;
4795
4796 /*
4797  * A task struct has one reference for the use as "current".
4798  * If a task dies, then it sets TASK_DEAD in tsk->state and calls
4799  * schedule one last time. The schedule call will never return, and
4800  * the scheduled task must drop that reference.
4801  *
4802  * We must observe prev->state before clearing prev->on_cpu (in
4803  * finish_task), otherwise a concurrent wakeup can get prev
4804  * running on another CPU and we could race with its RUNNING -> DEAD
4805  * transition, resulting in a double drop.
4806  */
4807 prev_state = READ_ONCE(prev->__state);
4808 vtime_task_switch(prev);
4809 perf_event_task_sched_in(prev, current);
4810 finish_task(prev);
4811 tick_nohz_task_switch();
4812 finish_lock_switch(rq);
4813 finish_arch_post_lock_switch();
4814 kcov_finish_switch(current);
4815
4816 /*
4817  * kmap_local_sched_out() is invoked with rq::lock held and interrupts
4818  * disabled. There is no requirement for that, but the sched out code
4819  * does not have an interrupt enabled section. Restoring the maps on
4820  * sched in does not require interrupts being disabled either.
4821  */
4822 kmap_local_sched_in();
4823
4824 fire_sched_in_preempt_notifiers(current);
4825
4826 /*
4827  * When switching through a kernel thread, the loop in
4828  * membarrier_{private,global}_expedited() may have observed that
4829  * kernel thread and not issued an IPI. It is therefore possible to
4830  * schedule between user->kernel->user threads without passing though
4831  * switch_mm(). Membarrier requires a barrier after storing to
4832  * rq->curr, before returning to userspace, so provide them here:
4833  *
4834  * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED,
4835  *   implicitly provided by mmdrop(); - a sync_core for SYNC_CORE.
4836  */
4837 if (mm) {
4838 membarrier_mm_sync_core_before_usermode(mm);
4839 mmdrop(mm);
4840 }
4841 if (unlikely(prev_state == TASK_DEAD)) {
4842 if (prev->sched_class->task_dead)
4843 prev->sched_class->task_dead(prev);
4844
4845 /*
4846  * Remove function-return probe instances associated with this
4847  * task and put them back on the free list.
4848  */
4849 kprobe_flush_task(prev);
4850
4851 /* Task is done with its stack. */
4852 put_task_stack(prev);
4853
4854 put_task_struct_rcu_user(prev);
4855 }
4856
4857 return rq;
4858}
4859
4860/**
4861 * schedule_tail - first thing a freshly forked thread must call.
4862 * @prev: the thread we just switched away from.
4863 */
4864asmlinkage __visible void schedule_tail(struct task_struct *prev)
4865 __releases(rq->lock)
4866{
4867 /*
4868  * New tasks start with FORK_PREEMPT_COUNT, see there and
4869  * finish_task_switch() for details.
4870  *
4871  * finish_task_switch() will drop rq->lock() and lower preempt_count
4872  * and the preempt_enable() will end up enabling preemption (on
4873  * PREEMPT_COUNT kernels).
4874  */
4875
4876 finish_task_switch(prev);
4877 preempt_enable();
4878
4879 if (current->set_child_tid)
4880 put_user(task_pid_vnr(current), current->set_child_tid);
4881
4882 calculate_sigpending();
4883}
4884
4885/*
4886 * context_switch - switch to the new MM and the new thread's register state.
4887 */
4888static __always_inline struct rq *
4889context_switch(struct rq *rq, struct task_struct *prev,
4890 struct task_struct *next, struct rq_flags *rf)
4891{
4892 prepare_task_switch(rq, prev, next);
4893
4894 /*
4895  * For paravirt, this is coupled with an exit in switch_to to
4896  * combine the page table reload and the switch backend into
4897  * one hypercall.
4898  */
4899 arch_start_context_switch(prev);
4900
4901 /*
4902  * kernel -> kernel   lazy + transfer active
4903  *   user -> kernel   lazy + mmgrab() active
4904  *
4905  * kernel ->   user   switch + mmdrop() active
4906  *   user ->   user   switch
4907  */
4908 if (!next->mm) {
4909 enter_lazy_tlb(prev->active_mm, next);
4910
4911 next->active_mm = prev->active_mm;
4912 if (prev->mm)
4913 mmgrab(prev->active_mm);
4914 else
4915 prev->active_mm = NULL;
4916 } else {
4917 membarrier_switch_mm(rq, prev->active_mm, next->mm);
4918 /*
4919  * sys_membarrier() requires an smp_mb() between setting
4920  * rq->curr / membarrier_switch_mm() and returning to userspace.
4921  *
4922  * The below provides this either through switch_mm(), or in
4923  * case 'prev->active_mm == next->mm' through
4924  * finish_task_switch()'s mmdrop().
4925  */
4926 switch_mm_irqs_off(prev->active_mm, next->mm, next);
4927
4928 if (!prev->mm) {
4929 /* will mmdrop() in finish_task_switch(). */
4930 rq->prev_mm = prev->active_mm;
4931 prev->active_mm = NULL;
4932 }
4933 }
4934
4935 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4936
4937 prepare_lock_switch(rq, next, rf);
4938
4939 /* Here we just switch the register state and the stack. */
4940 switch_to(prev, next, prev);
4941 barrier();
4942
4943 return finish_task_switch(prev);
4944}
4945
4946/*
4947 * nr_running and nr_context_switches:
4948 *
4949 * externally visible scheduler statistics: current number of runnable
4950 * threads, total number of context switches performed since bootup.
4951 */
4952unsigned int nr_running(void)
4953{
4954 unsigned int i, sum = 0;
4955
4956 for_each_online_cpu(i)
4957 sum += cpu_rq(i)->nr_running;
4958
4959 return sum;
4960}
4961
4962/*
4963 * Check if only the current task is running on the CPU.
4964 *
4965 * Caution: this function does not check that the caller has disabled
4966 * preemption, thus the result might have a time-of-check-to-time-of-use
4967 * race.  The caller is responsible to use it correctly, for example:
4968 *
4969 * - from a non-preemptible section (of course)
4970 *
4971 * - from a thread that is bound to a single CPU
4972 *
4973 * - in a loop with very short iterations (e.g. a polling loop)
4974 */
4975bool single_task_running(void)
4976{
4977 return raw_rq()->nr_running == 1;
4978}
4979EXPORT_SYMBOL(single_task_running);
4980
4981unsigned long long nr_context_switches(void)
4982{
4983 int i;
4984 unsigned long long sum = 0;
4985
4986 for_each_possible_cpu(i)
4987 sum += cpu_rq(i)->nr_switches;
4988
4989 return sum;
4990}
4991
4992/*
4993 * Consumers of these two interfaces, like for example the cpuidle menu
4994 * governor, are using nonsensical data. Preferring shallow idle state
4995 * selection for a CPU that has IO-wait which might not even end up
4996 * running the task when it does become runnable.
4997 */
4998
4999unsigned int nr_iowait_cpu(int cpu)
5000{
5001 return atomic_read(&cpu_rq(cpu)->nr_iowait);
5002}
5003
5004/*
5005 * IO-wait accounting, and how it's mostly bollocks (on SMP).
5006 *
5007 * The idea behind IO-wait account is to account the idle time that we could
5008 * be spending running if it weren't for IO. That is, if we were to improve the
5009 * storage performance, we'd have a proportional reduction in IO-wait time.
5010 *
5011 * This all works nicely on UP, where, when a task blocks on IO, we account
5012 * the idle time as IO-wait, because if the storage were faster, it could've
5013 * been running and we'd not be idle.
5014 *
5015 * This has been extended to SMP, by doing the same for each CPU. This however
5016 * is broken.
5017 *
5018 * Imagine for instance the case where two tasks block on one CPU, only the one
5019 * CPU will have IO-wait accounted, while the other has regular idle. Even
5020 * though, if the storage were faster, both could've ran at the same time,
5021 * utilising both CPUs.
5022 *
5023 * This means, that when looking globally, the current IO-wait accounting on
5024 * SMP is a lower bound, by reason of under accounting.
5025 *
5026 * Worse, since the numbers are provided per CPU, they are sometimes
5027 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
5028 * associated with any one particular CPU, it can wake to another CPU than it
5029 * blocked on. This means the per CPU IO-wait number is meaningless.
5030 *
5031 * Task CPU affinities can make all that even more 'interesting'.
5032 */
5033
5034unsigned int nr_iowait(void)
5035{
5036 unsigned int i, sum = 0;
5037
5038 for_each_possible_cpu(i)
5039 sum += nr_iowait_cpu(i);
5040
5041 return sum;
5042}
5043
5044#ifdef CONFIG_SMP
5045
5046/*
5047 * sched_exec - execve() is a valuable balancing opportunity, because at
5048 * this point the task has the smallest effective memory and cache footprint.
5049 */
5050void sched_exec(void)
5051{
5052 struct task_struct *p = current;
5053 unsigned long flags;
5054 int dest_cpu;
5055
5056 raw_spin_lock_irqsave(&p->pi_lock, flags);
5057 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
5058 if (dest_cpu == smp_processor_id())
5059 goto unlock;
5060
5061 if (likely(cpu_active(dest_cpu))) {
5062 struct migration_arg arg = { p, dest_cpu };
5063
5064 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5065 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
5066 return;
5067 }
5068unlock:
5069 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5070}
5071
5072#endif
5073
5074DEFINE_PER_CPU(struct kernel_stat, kstat);
5075DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
5076
5077EXPORT_PER_CPU_SYMBOL(kstat);
5078EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
5079
5080/*
5081 * The function fair_sched_class.update_curr accesses the struct curr
5082 * and its field exec_start; when called from task_sched_runtime(),
5083 * we observe a high rate of cache misses in practice.
5084 * Prefetching this data results in improved performance.
5085 */
5086static inline void prefetch_curr_exec_start(struct task_struct *p)
5087{
5088#ifdef CONFIG_FAIR_GROUP_SCHED
5089 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
5090#else
5091 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
5092#endif
5093 prefetch(curr);
5094 prefetch(&curr->exec_start);
5095}
5096
5097/*
5098 * Return accounted runtime for the task.
5099 * In case the task is currently running, return the runtime plus current's
5100 * pending runtime that have not been accounted yet.
5101 */
5102unsigned long long task_sched_runtime(struct task_struct *p)
5103{
5104 struct rq_flags rf;
5105 struct rq *rq;
5106 u64 ns;
5107
5108#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120 if (!p->on_cpu || !task_on_rq_queued(p))
5121 return p->se.sum_exec_runtime;
5122#endif
5123
5124 rq = task_rq_lock(p, &rf);
5125
5126
5127
5128
5129
5130 if (task_current(rq, p) && task_on_rq_queued(p)) {
5131 prefetch_curr_exec_start(p);
5132 update_rq_clock(rq);
5133 p->sched_class->update_curr(rq);
5134 }
5135 ns = p->se.sum_exec_runtime;
5136 task_rq_unlock(rq, p, &rf);
5137
5138 return ns;
5139}
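
/*
 * Illustrative userspace counterpart (an assumption, not part of this
 * file): the value computed above feeds the POSIX per-thread CPU clock:
 *
 *   clockid_t cid;
 *   struct timespec ts;
 *
 *   pthread_getcpuclockid(thread, &cid);
 *   clock_gettime(cid, &ts);	// roughly task_sched_runtime() in ns
 */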
5140
5141#ifdef CONFIG_SCHED_DEBUG
5142static u64 cpu_resched_latency(struct rq *rq)
5143{
5144 int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
5145 u64 resched_latency, now = rq_clock(rq);
5146 static bool warned_once;
5147
5148 if (sysctl_resched_latency_warn_once && warned_once)
5149 return 0;
5150
5151 if (!need_resched() || !latency_warn_ms)
5152 return 0;
5153
5154 if (system_state == SYSTEM_BOOTING)
5155 return 0;
5156
5157 if (!rq->last_seen_need_resched_ns) {
5158 rq->last_seen_need_resched_ns = now;
5159 rq->ticks_without_resched = 0;
5160 return 0;
5161 }
5162
5163 rq->ticks_without_resched++;
5164 resched_latency = now - rq->last_seen_need_resched_ns;
5165 if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
5166 return 0;
5167
5168 warned_once = true;
5169
5170 return resched_latency;
5171}
5172
5173static int __init setup_resched_latency_warn_ms(char *str)
5174{
5175 long val;
5176
5177 if (kstrtol(str, 0, &val)) {
5178 pr_warn("Unable to set resched_latency_warn_ms\n");
5179 return 1;
5180 }
5181
5182 sysctl_resched_latency_warn_ms = val;
5183 return 1;
5184}
5185__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
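
/*
 * Usage sketch (an aside, not from the original source): together with
 * the LATENCY_WARN scheduler feature, booting with e.g.
 * "resched_latency_warn_ms=50" warns whenever a CPU has had
 * need_resched set for more than 50ms without rescheduling.
 */
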
5186#else
5187static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
5188#endif
5189
5190/*
5191 * This function gets called by the timer code, with HZ frequency.
5192 * We call it with interrupts disabled.
5193 */
5194void scheduler_tick(void)
5195{
5196 int cpu = smp_processor_id();
5197 struct rq *rq = cpu_rq(cpu);
5198 struct task_struct *curr = rq->curr;
5199 struct rq_flags rf;
5200 unsigned long thermal_pressure;
5201 u64 resched_latency;
5202
5203 arch_scale_freq_tick();
5204 sched_clock_tick();
5205
5206 rq_lock(rq, &rf);
5207
5208 update_rq_clock(rq);
5209 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
5210 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
5211 curr->sched_class->task_tick(rq, curr, 0);
5212 if (sched_feat(LATENCY_WARN))
5213 resched_latency = cpu_resched_latency(rq);
5214 calc_global_load_tick(rq);
5215
5216 rq_unlock(rq, &rf);
5217
5218 if (sched_feat(LATENCY_WARN) && resched_latency)
5219 resched_latency_warn(cpu, resched_latency);
5220
5221 perf_event_task_tick();
5222
5223#ifdef CONFIG_SMP
5224 rq->idle_balance = idle_cpu(cpu);
5225 trigger_load_balance(rq);
5226#endif
5227}
5228
5229#ifdef CONFIG_NO_HZ_FULL
5230
5231struct tick_work {
5232 int cpu;
5233 atomic_t state;
5234 struct delayed_work work;
5235};
5236
5237#define TICK_SCHED_REMOTE_OFFLINE 0
5238#define TICK_SCHED_REMOTE_OFFLINING 1
5239#define TICK_SCHED_REMOTE_RUNNING 2
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254/*
5255 * ->state transitions (anything else triggers WARN_ON_ONCE()):
5256 *
5257 *   OFFLINE   -> RUNNING    sched_tick_start()
5258 *   RUNNING   -> OFFLINING  sched_tick_stop()
5259 *   OFFLINING -> OFFLINE    sched_tick_remote() observing the offlining
5260 *                           state and stepping it down by one
5261 *
5262 * sched_tick_remote() re-queues itself only while the state is RUNNING.
5263 */
5264static struct tick_work __percpu *tick_work_cpu;
5265
5266static void sched_tick_remote(struct work_struct *work)
5267{
5268 struct delayed_work *dwork = to_delayed_work(work);
5269 struct tick_work *twork = container_of(dwork, struct tick_work, work);
5270 int cpu = twork->cpu;
5271 struct rq *rq = cpu_rq(cpu);
5272 struct task_struct *curr;
5273 struct rq_flags rf;
5274 u64 delta;
5275 int os;
5276
5277 /*
5278  * Handle the tick only if it appears the remote CPU is running in full
5279  * dynticks mode. The check is racy by nature, but missing a tick or
5280  * having one too much is no big deal because the scheduler tick updates
5281  * statistics and checks timeslices in a time-independent way, regardless
5282  * of when exactly it is running.
5283  */
5284 if (!tick_nohz_tick_stopped_cpu(cpu))
5285 goto out_requeue;
5286
5287 rq_lock_irq(rq, &rf);
5288 curr = rq->curr;
5289 if (cpu_is_offline(cpu))
5290 goto out_unlock;
5291
5292 update_rq_clock(rq);
5293
5294 if (!is_idle_task(curr)) {
5295 /*
5296  * Make sure the next tick runs within a reasonable
5297  * amount of time.
5298  */
5299 delta = rq_clock_task(rq) - curr->se.exec_start;
5300 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
5301 }
5302 curr->sched_class->task_tick(rq, curr, 0);
5303
5304 calc_load_nohz_remote(rq);
5305out_unlock:
5306 rq_unlock_irq(rq, &rf);
5307out_requeue:
5308 /*
5309  * Run the remote tick once per second (1Hz). This arbitrary
5310  * frequency is large enough to avoid overload but short enough
5311  * to keep scheduler internal stats reasonably up to date.  But
5312  * first update state to reflect hotplug activity if required.
5313  */
5314
5315 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
5316 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
5317 if (os == TICK_SCHED_REMOTE_RUNNING)
5318 queue_delayed_work(system_unbound_wq, dwork, HZ);
5319}
5320
5321static void sched_tick_start(int cpu)
5322{
5323 int os;
5324 struct tick_work *twork;
5325
5326 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5327 return;
5328
5329 WARN_ON_ONCE(!tick_work_cpu);
5330
5331 twork = per_cpu_ptr(tick_work_cpu, cpu);
5332 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
5333 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
5334 if (os == TICK_SCHED_REMOTE_OFFLINE) {
5335 twork->cpu = cpu;
5336 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
5337 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
5338 }
5339}
5340
5341#ifdef CONFIG_HOTPLUG_CPU
5342static void sched_tick_stop(int cpu)
5343{
5344 struct tick_work *twork;
5345 int os;
5346
5347 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5348 return;
5349
5350 WARN_ON_ONCE(!tick_work_cpu);
5351
5352 twork = per_cpu_ptr(tick_work_cpu, cpu);
5353
5354 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
5355 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
5356
5357}
5358#endif
5359
5360int __init sched_tick_offload_init(void)
5361{
5362 tick_work_cpu = alloc_percpu(struct tick_work);
5363 BUG_ON(!tick_work_cpu);
5364 return 0;
5365}
5366
5367#else
5368static inline void sched_tick_start(int cpu) { }
5369static inline void sched_tick_stop(int cpu) { }
5370#endif
5371
5372#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
5373 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
5374/*
5375 * If the value passed in is equal to the current preempt count
5376 * then we just disabled preemption. Start timing the latency.
5377 */
5378static inline void preempt_latency_start(int val)
5379{
5380 if (preempt_count() == val) {
5381 unsigned long ip = get_lock_parent_ip();
5382#ifdef CONFIG_DEBUG_PREEMPT
5383 current->preempt_disable_ip = ip;
5384#endif
5385 trace_preempt_off(CALLER_ADDR0, ip);
5386 }
5387}
5388
5389void preempt_count_add(int val)
5390{
5391#ifdef CONFIG_DEBUG_PREEMPT
5392 /*
5393  * Underflow?
5394  */
5395 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5396 return;
5397#endif
5398 __preempt_count_add(val);
5399#ifdef CONFIG_DEBUG_PREEMPT
5400 /*
5401  * Spinlock count overflowing soon?
5402  */
5403 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5404 PREEMPT_MASK - 10);
5405#endif
5406 preempt_latency_start(val);
5407}
5408EXPORT_SYMBOL(preempt_count_add);
5409NOKPROBE_SYMBOL(preempt_count_add);
5410
5411/*
5412 * If the value passed in equals to the current preempt count
5413 * then we just enabled preemption. Stop timing the latency.
5414 */
5415static inline void preempt_latency_stop(int val)
5416{
5417 if (preempt_count() == val)
5418 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
5419}
5420
5421void preempt_count_sub(int val)
5422{
5423#ifdef CONFIG_DEBUG_PREEMPT
5424 /*
5425  * Underflow?
5426  */
5427 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5428 return;
5429 /*
5430  * Is the spinlock portion underflowing?
5431  */
5432 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5433 !(preempt_count() & PREEMPT_MASK)))
5434 return;
5435#endif
5436
5437 preempt_latency_stop(val);
5438 __preempt_count_sub(val);
5439}
5440EXPORT_SYMBOL(preempt_count_sub);
5441NOKPROBE_SYMBOL(preempt_count_sub);
5442
5443#else
5444static inline void preempt_latency_start(int val) { }
5445static inline void preempt_latency_stop(int val) { }
5446#endif
5447
5448static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
5449{
5450#ifdef CONFIG_DEBUG_PREEMPT
5451 return p->preempt_disable_ip;
5452#else
5453 return 0;
5454#endif
5455}
5456
5457/*
5458 * Print scheduling while atomic bug:
5459 */
5460static noinline void __schedule_bug(struct task_struct *prev)
5461{
5462 /* Save this before calling printk(), since that will clobber it */
5463 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
5464
5465 if (oops_in_progress)
5466 return;
5467
5468 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5469 prev->comm, prev->pid, preempt_count());
5470
5471 debug_show_held_locks(prev);
5472 print_modules();
5473 if (irqs_disabled())
5474 print_irqtrace_events(prev);
5475 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
5476 && in_atomic_preempt_off()) {
5477 pr_err("Preemption disabled at:");
5478 print_ip_sym(KERN_ERR, preempt_disable_ip);
5479 }
5480 if (panic_on_warn)
5481 panic("scheduling while atomic\n");
5482
5483 dump_stack();
5484 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5485}
5486
5487/*
5488 * Various schedule()-time debugging checks and statistics:
5489 */
5490static inline void schedule_debug(struct task_struct *prev, bool preempt)
5491{
5492#ifdef CONFIG_SCHED_STACK_END_CHECK
5493 if (task_stack_end_corrupted(prev))
5494 panic("corrupted stack end detected inside scheduler\n");
5495
5496 if (task_scs_end_corrupted(prev))
5497 panic("corrupted shadow stack detected inside scheduler\n");
5498#endif
5499
5500#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5501 if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
5502 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5503 prev->comm, prev->pid, prev->non_block_count);
5504 dump_stack();
5505 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5506 }
5507#endif
5508
5509 if (unlikely(in_atomic_preempt_off())) {
5510 __schedule_bug(prev);
5511 preempt_count_set(PREEMPT_DISABLED);
5512 }
5513 rcu_sleep_check();
5514 SCHED_WARN_ON(ct_state() == CONTEXT_USER);
5515
5516 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5517
5518 schedstat_inc(this_rq()->sched_count);
5519}
5520
5521static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5522 struct rq_flags *rf)
5523{
5524#ifdef CONFIG_SMP
5525 const struct sched_class *class;
5526 /*
5527  * We must do the balancing pass before put_prev_task(), such
5528  * that when we release the rq->lock the task is in the same
5529  * state as before we took rq->lock.
5530  *
5531  * We can terminate the balance pass as soon as we know there is
5532  * a runnable task of @class priority or higher.
5533  */
5534 for_class_range(class, prev->sched_class, &idle_sched_class) {
5535 if (class->balance(rq, prev, rf))
5536 break;
5537 }
5538#endif
5539
5540 put_prev_task(rq, prev);
5541}
5542
5543/*
5544 * Pick up the highest-prio task:
5545 */
5546static inline struct task_struct *
5547__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5548{
5549 const struct sched_class *class;
5550 struct task_struct *p;
5551
5552 /*
5553  * Optimization: we know that if all tasks are in the fair class we can
5554  * call that function directly, but only if the @prev task wasn't of a
5555  * higher scheduling class, because otherwise those lose the
5556  * opportunity to pull in more work from other CPUs.
5557  */
5558 if (likely(prev->sched_class <= &fair_sched_class &&
5559 rq->nr_running == rq->cfs.h_nr_running)) {
5560
5561 p = pick_next_task_fair(rq, prev, rf);
5562 if (unlikely(p == RETRY_TASK))
5563 goto restart;
5564
5565 /* Assume the next prioritized class is idle_sched_class */
5566 if (!p) {
5567 put_prev_task(rq, prev);
5568 p = pick_next_task_idle(rq);
5569 }
5570
5571 return p;
5572 }
5573
5574restart:
5575 put_prev_task_balance(rq, prev, rf);
5576
5577 for_each_class(class) {
5578 p = class->pick_next_task(rq);
5579 if (p)
5580 return p;
5581 }
5582
5583 /* The idle class should always have a runnable task: */
5584 BUG();
5585}
5586
5587#ifdef CONFIG_SCHED_CORE
5588static inline bool is_task_rq_idle(struct task_struct *t)
5589{
5590 return (task_rq(t)->idle == t);
5591}
5592
5593static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
5594{
5595 return is_task_rq_idle(a) || (a->core_cookie == cookie);
5596}
5597
5598static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
5599{
5600 if (is_task_rq_idle(a) || is_task_rq_idle(b))
5601 return true;
5602
5603 return a->core_cookie == b->core_cookie;
5604}
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614static struct task_struct *
5615pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
5616{
5617 struct task_struct *class_pick, *cookie_pick;
5618 unsigned long cookie = rq->core->core_cookie;
5619
5620 class_pick = class->pick_task(rq);
5621 if (!class_pick)
5622 return NULL;
5623
5624 if (!cookie) {
5625 /*
5626  * If class_pick is tagged, return it only if it has higher
5627  * priority than max.
5628  */
5629 if (max && class_pick->core_cookie &&
5630 prio_less(class_pick, max, in_fi))
5631 return idle_sched_class.pick_task(rq);
5632
5633 return class_pick;
5634 }
5635
5636 /*
5637  * If class_pick is idle or matches cookie, return early.
5638  */
5639 if (cookie_equals(class_pick, cookie))
5640 return class_pick;
5641
5642 cookie_pick = sched_core_find(rq, cookie);
5643
5644 /*
5645  * If class > max && class > cookie, it is the highest priority task on
5646  * the core (so far) and it must be selected, otherwise we must go with
5647  * the cookie pick in order to satisfy the constraint.
5648  */
5649 if (prio_less(cookie_pick, class_pick, in_fi) &&
5650 (!max || prio_less(max, class_pick, in_fi)))
5651 return class_pick;
5652
5653 return cookie_pick;
5654}
5655
5656extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
5657
5658static struct task_struct *
5659pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5660{
5661 struct task_struct *next, *max = NULL;
5662 const struct sched_class *class;
5663 const struct cpumask *smt_mask;
5664 bool fi_before = false;
5665 int i, j, cpu, occ = 0;
5666 bool need_sync;
5667
5668 if (!sched_core_enabled(rq))
5669 return __pick_next_task(rq, prev, rf);
5670
5671 cpu = cpu_of(rq);
5672
5673
5674 if (cpu_is_offline(cpu)) {
5675
5676
5677
5678
5679
5680 rq->core_pick = NULL;
5681 return __pick_next_task(rq, prev, rf);
5682 }
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693 if (rq->core->core_pick_seq == rq->core->core_task_seq &&
5694 rq->core->core_pick_seq != rq->core_sched_seq &&
5695 rq->core_pick) {
5696 WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
5697
5698 next = rq->core_pick;
5699 if (next != prev) {
5700 put_prev_task(rq, prev);
5701 set_next_task(rq, next);
5702 }
5703
5704 rq->core_pick = NULL;
5705 return next;
5706 }
5707
5708 put_prev_task_balance(rq, prev, rf);
5709
5710 smt_mask = cpu_smt_mask(cpu);
5711 need_sync = !!rq->core->core_cookie;
5712
5713
5714 rq->core->core_cookie = 0UL;
5715 if (rq->core->core_forceidle) {
5716 need_sync = true;
5717 fi_before = true;
5718 rq->core->core_forceidle = false;
5719 }
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731 rq->core->core_task_seq++;
5732
5733
5734
5735
5736
5737 if (!need_sync) {
5738 for_each_class(class) {
5739 next = class->pick_task(rq);
5740 if (next)
5741 break;
5742 }
5743
5744 if (!next->core_cookie) {
5745 rq->core_pick = NULL;
5746
5747
5748
5749
5750 WARN_ON_ONCE(fi_before);
5751 task_vruntime_update(rq, next, false);
5752 goto done;
5753 }
5754 }
5755
5756 for_each_cpu(i, smt_mask) {
5757 struct rq *rq_i = cpu_rq(i);
5758
5759 rq_i->core_pick = NULL;
5760
5761 if (i != cpu)
5762 update_rq_clock(rq_i);
5763 }
5764
5765
5766
5767
5768
5769 for_each_class(class) {
5770again:
5771 for_each_cpu_wrap(i, smt_mask, cpu) {
5772 struct rq *rq_i = cpu_rq(i);
5773 struct task_struct *p;
5774
5775 if (rq_i->core_pick)
5776 continue;
5777
5778
5779
5780
5781
5782
5783
5784 p = pick_task(rq_i, class, max, fi_before);
5785 if (!p)
5786 continue;
5787
5788 if (!is_task_rq_idle(p))
5789 occ++;
5790
5791 rq_i->core_pick = p;
5792 if (rq_i->idle == p && rq_i->nr_running) {
5793 rq->core->core_forceidle = true;
5794 if (!fi_before)
5795 rq->core->core_forceidle_seq++;
5796 }
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808 if (!max || !cookie_match(max, p)) {
5809 struct task_struct *old_max = max;
5810
5811 rq->core->core_cookie = p->core_cookie;
5812 max = p;
5813
5814 if (old_max) {
5815 rq->core->core_forceidle = false;
5816 for_each_cpu(j, smt_mask) {
5817 if (j == i)
5818 continue;
5819
5820 cpu_rq(j)->core_pick = NULL;
5821 }
5822 occ = 1;
5823 goto again;
5824 }
5825 }
5826 }
5827 }
5828
5829 rq->core->core_pick_seq = rq->core->core_task_seq;
5830 next = rq->core_pick;
5831 rq->core_sched_seq = rq->core->core_pick_seq;
5832
5833
5834 WARN_ON_ONCE(!next);
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844 for_each_cpu(i, smt_mask) {
5845 struct rq *rq_i = cpu_rq(i);
5846
5847
5848
5849
5850
5851
5852
5853
5854 if (!rq_i->core_pick)
5855 continue;
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865 if (!(fi_before && rq->core->core_forceidle))
5866 task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
5867
5868 rq_i->core_pick->core_occupation = occ;
5869
5870 if (i == cpu) {
5871 rq_i->core_pick = NULL;
5872 continue;
5873 }
5874
5875
5876 WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
5877
5878 if (rq_i->curr == rq_i->core_pick) {
5879 rq_i->core_pick = NULL;
5880 continue;
5881 }
5882
5883 resched_curr(rq_i);
5884 }
5885
5886done:
5887 set_next_task(rq, next);
5888 return next;
5889}
5890
5891static bool try_steal_cookie(int this, int that)
5892{
5893 struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
5894 struct task_struct *p;
5895 unsigned long cookie;
5896 bool success = false;
5897
5898 local_irq_disable();
5899 double_rq_lock(dst, src);
5900
5901 cookie = dst->core->core_cookie;
5902 if (!cookie)
5903 goto unlock;
5904
5905 if (dst->curr != dst->idle)
5906 goto unlock;
5907
5908 p = sched_core_find(src, cookie);
5909 if (p == src->idle)
5910 goto unlock;
5911
5912 do {
5913 if (p == src->core_pick || p == src->curr)
5914 goto next;
5915
5916 if (!cpumask_test_cpu(this, &p->cpus_mask))
5917 goto next;
5918
5919 if (p->core_occupation > dst->idle->core_occupation)
5920 goto next;
5921
5922 deactivate_task(src, p, 0);
5923 set_task_cpu(p, this);
5924 activate_task(dst, p, 0);
5925
5926 resched_curr(dst);
5927
5928 success = true;
5929 break;
5930
5931next:
5932 p = sched_core_next(p, cookie);
5933 } while (p);
5934
5935unlock:
5936 double_rq_unlock(dst, src);
5937 local_irq_enable();
5938
5939 return success;
5940}
5941
5942static bool steal_cookie_task(int cpu, struct sched_domain *sd)
5943{
5944 int i;
5945
5946 for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
5947 if (i == cpu)
5948 continue;
5949
5950 if (need_resched())
5951 break;
5952
5953 if (try_steal_cookie(cpu, i))
5954 return true;
5955 }
5956
5957 return false;
5958}
5959
5960static void sched_core_balance(struct rq *rq)
5961{
5962 struct sched_domain *sd;
5963 int cpu = cpu_of(rq);
5964
5965 preempt_disable();
5966 rcu_read_lock();
5967 raw_spin_rq_unlock_irq(rq);
5968 for_each_domain(cpu, sd) {
5969 if (need_resched())
5970 break;
5971
5972 if (steal_cookie_task(cpu, sd))
5973 break;
5974 }
5975 raw_spin_rq_lock_irq(rq);
5976 rcu_read_unlock();
5977 preempt_enable();
5978}
5979
5980static DEFINE_PER_CPU(struct callback_head, core_balance_head);
5981
5982void queue_core_balance(struct rq *rq)
5983{
5984 if (!sched_core_enabled(rq))
5985 return;
5986
5987 if (!rq->core->core_cookie)
5988 return;
5989
5990 if (!rq->nr_running)
5991 return;
5992
5993 queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
5994}
5995
5996static void sched_core_cpu_starting(unsigned int cpu)
5997{
5998 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
5999 struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6000 unsigned long flags;
6001 int t;
6002
6003 sched_core_lock(cpu, &flags);
6004
6005 WARN_ON_ONCE(rq->core != rq);
6006
6007
6008 if (cpumask_weight(smt_mask) == 1)
6009 goto unlock;
6010
6011
6012 for_each_cpu(t, smt_mask) {
6013 if (t == cpu)
6014 continue;
6015 rq = cpu_rq(t);
6016 if (rq->core == rq) {
6017 core_rq = rq;
6018 break;
6019 }
6020 }
6021
6022 if (WARN_ON_ONCE(!core_rq))
6023 goto unlock;
6024
6025
6026 for_each_cpu(t, smt_mask) {
6027 rq = cpu_rq(t);
6028
6029 if (t == cpu)
6030 rq->core = core_rq;
6031
6032 WARN_ON_ONCE(rq->core != core_rq);
6033 }
6034
6035unlock:
6036 sched_core_unlock(cpu, &flags);
6037}
6038
6039static void sched_core_cpu_deactivate(unsigned int cpu)
6040{
6041 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6042 struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6043 unsigned long flags;
6044 int t;
6045
6046 sched_core_lock(cpu, &flags);
6047
6048
6049 if (cpumask_weight(smt_mask) == 1) {
6050 WARN_ON_ONCE(rq->core != rq);
6051 goto unlock;
6052 }
6053
6054
6055 if (rq->core != rq)
6056 goto unlock;
6057
6058
6059 for_each_cpu(t, smt_mask) {
6060 if (t == cpu)
6061 continue;
6062 core_rq = cpu_rq(t);
6063 break;
6064 }
6065
6066 if (WARN_ON_ONCE(!core_rq))
6067 goto unlock;
6068
6069
6070 core_rq->core_task_seq = rq->core_task_seq;
6071 core_rq->core_pick_seq = rq->core_pick_seq;
6072 core_rq->core_cookie = rq->core_cookie;
6073 core_rq->core_forceidle = rq->core_forceidle;
6074 core_rq->core_forceidle_seq = rq->core_forceidle_seq;
6075
6076
6077 for_each_cpu(t, smt_mask) {
6078 rq = cpu_rq(t);
6079 rq->core = core_rq;
6080 }
6081
6082unlock:
6083 sched_core_unlock(cpu, &flags);
6084}
6085
6086static inline void sched_core_cpu_dying(unsigned int cpu)
6087{
6088 struct rq *rq = cpu_rq(cpu);
6089
6090 if (rq->core != rq)
6091 rq->core = rq;
6092}
6093
6094#else
6095
6096static inline void sched_core_cpu_starting(unsigned int cpu) {}
6097static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
6098static inline void sched_core_cpu_dying(unsigned int cpu) {}
6099
6100static struct task_struct *
6101pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6102{
6103 return __pick_next_task(rq, prev, rf);
6104}
6105
6106#endif
6107
6108/*
6109 * Constants for the sched_mode argument of __schedule().
6110 *
6111 * The mode argument allows RT enabled kernels to differentiate a
6112 * preemption from blocking on an 'sleeping' spin/rwlock. Note that
6113 * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
6114 * optimize the AND operation out and just check for zero.
6115 */
6116#define SM_NONE 0x0
6117#define SM_PREEMPT 0x1
6118#define SM_RTLOCK_WAIT 0x2
6119
6120#ifndef CONFIG_PREEMPT_RT
6121# define SM_MASK_PREEMPT (~0U)
6122#else
6123# define SM_MASK_PREEMPT SM_PREEMPT
6124#endif
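
/*
 * Sketch of how the modes are used (collected from this file): schedule()
 * and do_task_dead() pass SM_NONE, the preemption paths pass SM_PREEMPT,
 * and blocking on a 'sleeping' spin/rwlock on PREEMPT_RT passes
 * SM_RTLOCK_WAIT. On PREEMPT_RT the SM_MASK_PREEMPT test thus treats an
 * rtlock wait like a voluntary sleep (the task keeps its blocked state),
 * while on !PREEMPT_RT the mask is ~0U, the AND compiles away and the
 * test degenerates to "sched_mode != 0".
 */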
6125
6126/*
6127 * __schedule() is the main scheduler function.
6128 *
6129 * The main means of driving the scheduler and thus entering this function are:
6130 *
6131 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
6132 *
6133 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
6134 *      paths. For example, see arch/x86/entry_64.S.
6135 *
6136 *      To drive preemption between tasks, the scheduler sets the flag in timer
6137 *      interrupt handler scheduler_tick().
6138 *
6139 *   3. Wakeups don't really cause entry into schedule(). They add a
6140 *      task to the run-queue and that's it.
6141 *
6142 *      Now, if the new task added to the run-queue preempts the current
6143 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
6144 *      called on the nearest possible occasion:
6145 *
6146 *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
6147 *
6148 *         - in syscall or exception context, at the next outmost
6149 *           preempt_enable(). (this might be as soon as the wake_up()'s
6150 *           spin_unlock()!)
6151 *
6152 *         - in IRQ context, return from interrupt-handler to
6153 *           preemptible context
6154 *
6155 *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
6156 *         then at the next:
6157 *
6158 *          - cond_resched() call
6159 *          - explicit schedule() call
6160 *          - return from syscall or exception to user-space
6161 *          - return from interrupt-handler to user-space
6162 *
6163 * WARNING: must be called with preemption disabled!
6164 */
6165static void __sched notrace __schedule(unsigned int sched_mode)
6166{
6167 struct task_struct *prev, *next;
6168 unsigned long *switch_count;
6169 unsigned long prev_state;
6170 struct rq_flags rf;
6171 struct rq *rq;
6172 int cpu;
6173
6174 cpu = smp_processor_id();
6175 rq = cpu_rq(cpu);
6176 prev = rq->curr;
6177
6178 schedule_debug(prev, !!sched_mode);
6179
6180 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
6181 hrtick_clear(rq);
6182
6183 local_irq_disable();
6184 rcu_note_context_switch(!!sched_mode);
6185
6186 /*
6187  * Make sure that signal_pending_state()->signal_pending() below
6188  * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
6189  * done by the caller to avoid the race with signal_wake_up():
6190  *
6191  * __set_current_state(@state)		signal_wake_up()
6192  * schedule()				  set_tsk_thread_flag(p, TIF_SIGPENDING)
6193  *					  wake_up_state(p, state)
6194  *   LOCK rq->lock			    LOCK p->pi_state
6195  *   smp_mb__after_spinlock()		    smp_mb__after_spinlock()
6196  *     if (signal_pending_state())	    if (p->state & @state)
6197  *
6198  * Also, the membarrier system call requires a full memory barrier
6199  * after coming from user-space, before storing to rq->curr.
6200  */
6201 rq_lock(rq, &rf);
6202 smp_mb__after_spinlock();
6203
6204 /* Promote REQ to ACT */
6205 rq->clock_update_flags <<= 1;
6206 update_rq_clock(rq);
6207
6208 switch_count = &prev->nivcsw;
6209
6210
6211
6212
6213 /*
6214  * We must load prev->state once (task_struct::state is volatile), such
6215  * that we form a control dependency vs deactivate_task() below.
6216  */
6217 prev_state = READ_ONCE(prev->__state);
6218 if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
6219 if (signal_pending_state(prev_state, prev)) {
6220 WRITE_ONCE(prev->__state, TASK_RUNNING);
6221 } else {
6222 prev->sched_contributes_to_load =
6223 (prev_state & TASK_UNINTERRUPTIBLE) &&
6224 !(prev_state & TASK_NOLOAD) &&
6225 !(prev->flags & PF_FROZEN);
6226
6227 if (prev->sched_contributes_to_load)
6228 rq->nr_uninterruptible++;
6229
   /*
    * __schedule()                       ttwu()
    *   prev_state = prev->state;          if (p->on_rq && ...)
    *   if (prev_state)                      goto out;
    *     p->on_rq = 0;                    smp_acquire__after_ctrl_dep();
    *                                      p->state = TASK_WAKING
    *
    * Where __schedule() and ttwu() have matching control dependencies.
    *
    * After this, schedule() must not care about p->state any more.
    */
6241 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
6242
6243 if (prev->in_iowait) {
6244 atomic_inc(&rq->nr_iowait);
6245 delayacct_blkio_start();
6246 }
6247 }
6248 switch_count = &prev->nvcsw;
6249 }
6250
6251 next = pick_next_task(rq, prev, &rf);
6252 clear_tsk_need_resched(prev);
6253 clear_preempt_need_resched();
6254#ifdef CONFIG_SCHED_DEBUG
6255 rq->last_seen_need_resched_ns = 0;
6256#endif
6257
6258 if (likely(prev != next)) {
6259 rq->nr_switches++;
  /*
   * RCU users of rcu_dereference(rq->curr) may not see
   * changes to task_struct made by pick_next_task().
   */
6264 RCU_INIT_POINTER(rq->curr, next);
  /*
   * The membarrier system call requires each architecture
   * to have a full memory barrier after updating
   * rq->curr, before returning to user-space.
   *
   * Here are the schemes providing that barrier on the
   * various architectures:
   * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
   *   switch_mm() relies on membarrier_arch_switch_mm() on PowerPC.
   * - finish_lock_switch() for weakly-ordered
   *   architectures where spin_unlock is a full barrier,
   * - switch_to() for arm64 (weakly-ordered, spin_unlock
   *   is a RELEASE barrier),
   */
6279 ++*switch_count;
6280
6281 migrate_disable_switch(rq, prev);
6282 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
6283
6284 trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
6285
6286
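  /* Also unlocks the rq: */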
6287 rq = context_switch(rq, prev, next, &rf);
6288 } else {
6289 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
6290
6291 rq_unpin_lock(rq, &rf);
6292 __balance_callbacks(rq);
6293 raw_spin_rq_unlock_irq(rq);
6294 }
6295}
6296
6297void __noreturn do_task_dead(void)
6298{
6299
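 /* Causes final put_task_struct in finish_task_switch(): */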
6300 set_special_state(TASK_DEAD);
6301
6302
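 /* Tell freezer to ignore us: */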
6303 current->flags |= PF_NOFREEZE;
6304
6305 __schedule(SM_NONE);
6306 BUG();
6307
6308
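 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */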
6309 for (;;)
6310 cpu_relax();
6311}
6312
6313static inline void sched_submit_work(struct task_struct *tsk)
6314{
6315 unsigned int task_flags;
6316
6317 if (task_is_running(tsk))
6318 return;
6319
6320 task_flags = tsk->flags;

 /*
  * If a worker goes to sleep, notify and ask workqueue whether it
  * wants to wake up a task to maintain concurrency.
  * As this function is called inside the schedule() context,
  * we disable preemption to avoid it calling schedule() again
  * in the possible wakeup of a kworker and because wq_worker_sleeping()
  * requires it.
  */
6329 if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6330 preempt_disable();
6331 if (task_flags & PF_WQ_WORKER)
6332 wq_worker_sleeping(tsk);
6333 else
6334 io_wq_worker_sleeping(tsk);
6335 preempt_enable_no_resched();
6336 }
6337
6338 if (tsk_is_pi_blocked(tsk))
6339 return;
6340
 /*
  * If we are going to sleep and we have plugged IO queued,
  * make sure to submit it to avoid deadlocks.
  */
6345 if (blk_needs_flush_plug(tsk))
6346 blk_schedule_flush_plug(tsk);
6347}
6348
6349static void sched_update_worker(struct task_struct *tsk)
6350{
6351 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6352 if (tsk->flags & PF_WQ_WORKER)
6353 wq_worker_running(tsk);
6354 else
6355 io_wq_worker_running(tsk);
6356 }
6357}
6358
6359asmlinkage __visible void __sched schedule(void)
6360{
6361 struct task_struct *tsk = current;
6362
6363 sched_submit_work(tsk);
6364 do {
6365 preempt_disable();
6366 __schedule(SM_NONE);
6367 sched_preempt_enable_no_resched();
6368 } while (need_resched());
6369 sched_update_worker(tsk);
6370}
6371EXPORT_SYMBOL(schedule);
6372
/*
 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
 * state (have scheduled out non-voluntarily) by making sure that all
 * tasks have either left the run queue or have gone into user space.
 * As idle tasks do not do either, they must not ever be preempted
 * (schedule out non-voluntarily).
 *
 * schedule_idle() is similar to schedule_preempt_disabled() except that it
 * never enables preemption because it does not call sched_submit_work().
 */
6383void __sched schedule_idle(void)
6384{
 /*
  * As this skips calling sched_submit_work(), which the idle task does
  * regardless because that function is a nop when the task is in a
  * TASK_RUNNING state, make sure this isn't used someplace that the
  * current task can be in any other state. Note, idle is always in the
  * TASK_RUNNING state.
  */
6392 WARN_ON_ONCE(current->__state);
6393 do {
6394 __schedule(SM_NONE);
6395 } while (need_resched());
6396}
6397
6398#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
6399asmlinkage __visible void __sched schedule_user(void)
6400{
 /*
  * If we come here after a random call to set_need_resched(),
  * or we have been woken up remotely but the IPI has not yet arrived,
  * we haven't yet exited the RCU idle mode. Do it here manually until
  * we find a better solution.
  */
6411 enum ctx_state prev_state = exception_enter();
6412 schedule();
6413 exception_exit(prev_state);
6414}
6415#endif
6416
/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */
6422void __sched schedule_preempt_disabled(void)
6423{
6424 sched_preempt_enable_no_resched();
6425 schedule();
6426 preempt_disable();
6427}
6428
6429#ifdef CONFIG_PREEMPT_RT
6430void __sched notrace schedule_rtlock(void)
6431{
6432 do {
6433 preempt_disable();
6434 __schedule(SM_RTLOCK_WAIT);
6435 sched_preempt_enable_no_resched();
6436 } while (need_resched());
6437}
6438NOKPROBE_SYMBOL(schedule_rtlock);
6439#endif
6440
6441static void __sched notrace preempt_schedule_common(void)
6442{
6443 do {
  /*
   * Because the function tracer can trace preempt_count_sub()
   * and it also uses preempt_enable/disable_notrace(), if
   * NEED_RESCHED is set, the preempt_enable_notrace() called
   * by the function tracer will call this function again and
   * cause infinite recursion.
   *
   * Preemption must be disabled here before the function
   * tracer can trace. Break up preemption between the
   * function tracer and the scheduler to prevent infinite
   * recursion.
   */
6457 preempt_disable_notrace();
6458 preempt_latency_start(1);
6459 __schedule(SM_PREEMPT);
6460 preempt_latency_stop(1);
6461 preempt_enable_no_resched_notrace();
6462
  /*
   * Check again in case we missed a preemption opportunity
   * between schedule and now.
   */
6467 } while (need_resched());
6468}
6469
6470#ifdef CONFIG_PREEMPTION
/*
 * This is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable.
 */
6475asmlinkage __visible void __sched notrace preempt_schedule(void)
6476{
 /*
  * If there is a non-zero preempt_count or interrupts are disabled,
  * we do not want to preempt the current task. Just return.
  */
6481 if (likely(!preemptible()))
6482 return;
6483
6484 preempt_schedule_common();
6485}
6486NOKPROBE_SYMBOL(preempt_schedule);
6487EXPORT_SYMBOL(preempt_schedule);
6488
6489#ifdef CONFIG_PREEMPT_DYNAMIC
6490DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
6491EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
6492#endif
6493
/**
 * preempt_schedule_notrace - preempt_schedule called by tracing
 *
 * The tracing infrastructure uses preempt_enable_notrace to prevent
 * recursion and tracing preempt enabling caused by the tracing
 * infrastructure itself. But as tracing can happen in areas coming
 * from userspace or just about to enter userspace, a preempt enable
 * can occur before user_exit() is called. This will cause the scheduler
 * to be called when the system is still in usermode.
 *
 * To prevent this, the preempt_enable_notrace will use this function
 * instead of preempt_schedule() to exit user context if needed before
 * calling the scheduler.
 */
6509asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
6510{
6511 enum ctx_state prev_ctx;
6512
6513 if (likely(!preemptible()))
6514 return;
6515
6516 do {
  /*
   * Because the function tracer can trace preempt_count_sub()
   * and it also uses preempt_enable/disable_notrace(), if
   * NEED_RESCHED is set, the preempt_enable_notrace() called
   * by the function tracer will call this function again and
   * cause infinite recursion.
   *
   * Preemption must be disabled here before the function
   * tracer can trace. Break up preemption between the
   * function tracer and the scheduler to prevent infinite
   * recursion.
   */
6530 preempt_disable_notrace();
6531 preempt_latency_start(1);
6532
  /*
   * Needs preempt disabled in case user_exit() is traced
   * and the tracer calls preempt_enable_notrace() causing
   * the preemption check to work correctly.
   */
6537 prev_ctx = exception_enter();
6538 __schedule(SM_PREEMPT);
6539 exception_exit(prev_ctx);
6540
6541 preempt_latency_stop(1);
6542 preempt_enable_no_resched_notrace();
6543 } while (need_resched());
6544}
6545EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
6546
6547#ifdef CONFIG_PREEMPT_DYNAMIC
6548DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6549EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
6550#endif
6551
6552#endif
6553
6554#ifdef CONFIG_PREEMPT_DYNAMIC
6555
6556#include <linux/entry-common.h>
6557
/*
 * SC:cond_resched
 * SC:might_resched
 * SC:preempt_schedule
 * SC:preempt_schedule_notrace
 * SC:irqentry_exit_cond_resched
 *
 *
 * NONE:
 *   cond_resched               <- __cond_resched
 *   might_resched              <- RET0
 *   preempt_schedule           <- NOP
 *   preempt_schedule_notrace   <- NOP
 *   irqentry_exit_cond_resched <- NOP
 *
 * VOLUNTARY:
 *   cond_resched               <- __cond_resched
 *   might_resched              <- __cond_resched
 *   preempt_schedule           <- NOP
 *   preempt_schedule_notrace   <- NOP
 *   irqentry_exit_cond_resched <- NOP
 *
 * FULL:
 *   cond_resched               <- RET0
 *   might_resched              <- RET0
 *   preempt_schedule           <- preempt_schedule
 *   preempt_schedule_notrace   <- preempt_schedule_notrace
 *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
 */
6588enum {
6589 preempt_dynamic_none = 0,
6590 preempt_dynamic_voluntary,
6591 preempt_dynamic_full,
6592};
6593
6594int preempt_dynamic_mode = preempt_dynamic_full;
6595
6596int sched_dynamic_mode(const char *str)
6597{
6598 if (!strcmp(str, "none"))
6599 return preempt_dynamic_none;
6600
6601 if (!strcmp(str, "voluntary"))
6602 return preempt_dynamic_voluntary;
6603
6604 if (!strcmp(str, "full"))
6605 return preempt_dynamic_full;
6606
6607 return -EINVAL;
6608}
6609
6610void sched_dynamic_update(int mode)
6611{
 /*
  * Avoid {NULL,func},{func,NULL} transitions: first switch every
  * static call to its full-preemption target, then downgrade the
  * individual calls according to the requested mode below.
  */
6616 static_call_update(cond_resched, __cond_resched);
6617 static_call_update(might_resched, __cond_resched);
6618 static_call_update(preempt_schedule, __preempt_schedule_func);
6619 static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6620 static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
6621
6622 switch (mode) {
6623 case preempt_dynamic_none:
6624 static_call_update(cond_resched, __cond_resched);
6625 static_call_update(might_resched, (void *)&__static_call_return0);
6626 static_call_update(preempt_schedule, NULL);
6627 static_call_update(preempt_schedule_notrace, NULL);
6628 static_call_update(irqentry_exit_cond_resched, NULL);
6629 pr_info("Dynamic Preempt: none\n");
6630 break;
6631
6632 case preempt_dynamic_voluntary:
6633 static_call_update(cond_resched, __cond_resched);
6634 static_call_update(might_resched, __cond_resched);
6635 static_call_update(preempt_schedule, NULL);
6636 static_call_update(preempt_schedule_notrace, NULL);
6637 static_call_update(irqentry_exit_cond_resched, NULL);
6638 pr_info("Dynamic Preempt: voluntary\n");
6639 break;
6640
6641 case preempt_dynamic_full:
6642 static_call_update(cond_resched, (void *)&__static_call_return0);
6643 static_call_update(might_resched, (void *)&__static_call_return0);
6644 static_call_update(preempt_schedule, __preempt_schedule_func);
6645 static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6646 static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
6647 pr_info("Dynamic Preempt: full\n");
6648 break;
6649 }
6650
6651 preempt_dynamic_mode = mode;
6652}
6653
6654static int __init setup_preempt_mode(char *str)
6655{
6656 int mode = sched_dynamic_mode(str);
 if (mode < 0) {
  pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
  return 0;
 }

 sched_dynamic_update(mode);
 /* __setup() handlers return 1 when the boot option was handled: */
 return 1;
6664}
6665__setup("preempt=", setup_preempt_mode);
6666
6667#endif
6668
/*
 * This is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
6675asmlinkage __visible void __sched preempt_schedule_irq(void)
6676{
6677 enum ctx_state prev_state;
6678
6679
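 /* Catch callers which need to be fixed */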
6680 BUG_ON(preempt_count() || !irqs_disabled());
6681
6682 prev_state = exception_enter();
6683
6684 do {
6685 preempt_disable();
6686 local_irq_enable();
6687 __schedule(SM_PREEMPT);
6688 local_irq_disable();
6689 sched_preempt_enable_no_resched();
6690 } while (need_resched());
6691
6692 exception_exit(prev_state);
6693}
6694
6695int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
6696 void *key)
6697{
6698 WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
6699 return try_to_wake_up(curr->private, mode, wake_flags);
6700}
6701EXPORT_SYMBOL(default_wake_function);
6702
6703static void __setscheduler_prio(struct task_struct *p, int prio)
6704{
6705 if (dl_prio(prio))
6706 p->sched_class = &dl_sched_class;
6707 else if (rt_prio(prio))
6708 p->sched_class = &rt_sched_class;
6709 else
6710 p->sched_class = &fair_sched_class;
6711
6712 p->prio = prio;
6713}
6714
6715#ifdef CONFIG_RT_MUTEXES
6716
6717static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
6718{
6719 if (pi_task)
6720 prio = min(prio, pi_task->prio);
6721
6722 return prio;
6723}
6724
6725static inline int rt_effective_prio(struct task_struct *p, int prio)
6726{
6727 struct task_struct *pi_task = rt_mutex_get_top_task(p);
6728
6729 return __rt_effective_prio(pi_task, prio);
6730}
6731
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic. Call site only calls if the priority of the task changed.
 */
6743void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
6744{
6745 int prio, oldprio, queued, running, queue_flag =
6746 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6747 const struct sched_class *prev_class;
6748 struct rq_flags rf;
6749 struct rq *rq;
6750
6751
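 /* XXX used to be waiter->prio, not waiter->task->prio */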
6752 prio = __rt_effective_prio(pi_task, p->normal_prio);
6753
6754
6755
6756
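 /*
  * If nothing changed; bail early.
  */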
6757 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
6758 return;
6759
6760 rq = __task_rq_lock(p, &rf);
6761 update_rq_clock(rq);
6762
 /*
  * Set under pi_lock && rq->lock, such that the value can be used under
  * either lock.
  *
  * Note that there is loads of tricky to make this pointer cache work
  * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
  * ensure a task is de-boosted (pi_task is set to NULL) before the
  * task is allowed to run again (and can exit). This ensures the pointer
  * points to a blocked task -- which guarantees the task is present.
  */
6772 p->pi_top_task = pi_task;
6773
 /*
  * For FIFO/RR we only need to set prio, if that matches we're done.
  */
6777 if (prio == p->prio && !dl_prio(prio))
6778 goto out_unlock;
6779
 /*
  * Idle task boosting is a nono in general. There is one
  * exception, when PREEMPT_RT and NOHZ is active:
  *
  * The idle task calls get_next_timer_interrupt() and holds
  * the timer wheel base->lock on the CPU and another CPU wants
  * to access the timer (probably to cancel it). In that situation
  * boosting the (running) idle task is harmless: it is never
  * enqueued anywhere its priority matters, so only sanity check
  * the state below and bail out.
  */
6792 if (unlikely(p == rq->idle)) {
6793 WARN_ON(p != rq->curr);
6794 WARN_ON(p->pi_blocked_on);
6795 goto out_unlock;
6796 }
6797
6798 trace_sched_pi_setprio(p, pi_task);
6799 oldprio = p->prio;
6800
6801 if (oldprio == prio)
6802 queue_flag &= ~DEQUEUE_MOVE;
6803
6804 prev_class = p->sched_class;
6805 queued = task_on_rq_queued(p);
6806 running = task_current(rq, p);
6807 if (queued)
6808 dequeue_task(rq, p, queue_flag);
6809 if (running)
6810 put_prev_task(rq, p);
6811
 /*
  * Boosting conditions are:
  * 1. -rt task is running and holds mutex A
  *      --> -dl task blocks on mutex A
  *
  * 2. -dl task is running and holds mutex A
  *      --> -dl task blocks on mutex A and could preempt the
  *          running task
  */
6821 if (dl_prio(prio)) {
6822 if (!dl_prio(p->normal_prio) ||
6823 (pi_task && dl_prio(pi_task->prio) &&
6824 dl_entity_preempt(&pi_task->dl, &p->dl))) {
6825 p->dl.pi_se = pi_task->dl.pi_se;
6826 queue_flag |= ENQUEUE_REPLENISH;
6827 } else {
6828 p->dl.pi_se = &p->dl;
6829 }
6830 } else if (rt_prio(prio)) {
6831 if (dl_prio(oldprio))
6832 p->dl.pi_se = &p->dl;
6833 if (oldprio < prio)
6834 queue_flag |= ENQUEUE_HEAD;
6835 } else {
6836 if (dl_prio(oldprio))
6837 p->dl.pi_se = &p->dl;
6838 if (rt_prio(oldprio))
6839 p->rt.timeout = 0;
6840 }
6841
6842 __setscheduler_prio(p, prio);
6843
6844 if (queued)
6845 enqueue_task(rq, p, queue_flag);
6846 if (running)
6847 set_next_task(rq, p);
6848
6849 check_class_changed(rq, p, prev_class, oldprio);
6850out_unlock:
6851
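 /* Avoid rq from going away on us: */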
6852 preempt_disable();
6853
6854 rq_unpin_lock(rq, &rf);
6855 __balance_callbacks(rq);
6856 raw_spin_rq_unlock(rq);
6857
6858 preempt_enable();
6859}
6860#else
6861static inline int rt_effective_prio(struct task_struct *p, int prio)
6862{
6863 return prio;
6864}
6865#endif
6866
6867void set_user_nice(struct task_struct *p, long nice)
6868{
6869 bool queued, running;
6870 int old_prio;
6871 struct rq_flags rf;
6872 struct rq *rq;
6873
6874 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
6875 return;
6876
 /*
  * We have to be careful, if called from sys_setpriority(),
  * the task might be in the middle of scheduling on another CPU.
  */
6880 rq = task_rq_lock(p, &rf);
6881 update_rq_clock(rq);
6882
 /*
  * The RT priorities are set via sched_setscheduler(), but we still
  * allow the 'normal' nice value to be set - but as expected
  * it won't have any effect on scheduling until the task is
  * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
  */
6889 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
6890 p->static_prio = NICE_TO_PRIO(nice);
6891 goto out_unlock;
6892 }
6893 queued = task_on_rq_queued(p);
6894 running = task_current(rq, p);
6895 if (queued)
6896 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
6897 if (running)
6898 put_prev_task(rq, p);
6899
6900 p->static_prio = NICE_TO_PRIO(nice);
6901 set_load_weight(p, true);
6902 old_prio = p->prio;
6903 p->prio = effective_prio(p);
6904
6905 if (queued)
6906 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
6907 if (running)
6908 set_next_task(rq, p);
6909
 /*
  * If the task increased its priority or is running and
  * lowered its priority, then reschedule its CPU:
  */
6914 p->sched_class->prio_changed(rq, p, old_prio);
6915
6916out_unlock:
6917 task_rq_unlock(rq, p, &rf);
6918}
6919EXPORT_SYMBOL(set_user_nice);
6920
/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */
6926int can_nice(const struct task_struct *p, const int nice)
6927{
 /* Convert nice value [19,-20] to rlimit style value [1,40]: */
6929 int nice_rlim = nice_to_rlimit(nice);
6930
6931 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6932 capable(CAP_SYS_NICE));
6933}
6934
6935#ifdef __ARCH_WANT_SYS_NICE
6936
/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
6944SYSCALL_DEFINE1(nice, int, increment)
6945{
6946 long nice, retval;
6947
 /*
  * Setpriority might change our priority at the same moment.
  * We don't have to worry. Conceptually one call occurs first
  * and we have a single winner.
  */
6953 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
6954 nice = task_nice(current) + increment;
6955
6956 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
6957 if (increment < 0 && !can_nice(current, nice))
6958 return -EPERM;
6959
6960 retval = security_task_setnice(current, nice);
6961 if (retval)
6962 return retval;
6963
6964 set_user_nice(current, nice);
6965 return 0;
6966}
6967
6968#endif
6969
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 *
 * sched policy         return value   kernel prio    user prio/nice
 *
 * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
 * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
 * deadline                     -101             -1           0
 */
6982int task_prio(const struct task_struct *p)
6983{
6984 return p->prio - MAX_RT_PRIO;
6985}
6986
/**
 * idle_cpu - is a given CPU idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
6993int idle_cpu(int cpu)
6994{
6995 struct rq *rq = cpu_rq(cpu);
6996
6997 if (rq->curr != rq->idle)
6998 return 0;
6999
7000 if (rq->nr_running)
7001 return 0;
7002
7003#ifdef CONFIG_SMP
7004 if (rq->ttwu_pending)
7005 return 0;
7006#endif
7007
7008 return 1;
7009}
7010
/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
7017int available_idle_cpu(int cpu)
7018{
7019 if (!idle_cpu(cpu))
7020 return 0;
7021
7022 if (vcpu_is_preempted(cpu))
7023 return 0;
7024
7025 return 1;
7026}
7027
/**
 * idle_task - return the idle task for a given CPU.
 * @cpu: the processor in question.
 *
 * Return: The idle task for the CPU @cpu.
 */
7034struct task_struct *idle_task(int cpu)
7035{
7036 return cpu_rq(cpu)->idle;
7037}
7038
7039#ifdef CONFIG_SMP
/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * Where the cfs,rt and dl util numbers are tracked with the same metric and
 * synchronized windows and are thus directly comparable.
 *
 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
 * which excludes things like IRQ and steal-time. These latter are then accrued
 * in the irq utilization.
 *
 * The DL bandwidth number otoh is not a measurement but a value computed based
 * on the task model parameters and gives the minimal utilization required to
 * meet deadlines.
 */
7060unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
7061 unsigned long max, enum cpu_util_type type,
7062 struct task_struct *p)
7063{
7064 unsigned long dl_util, util, irq;
7065 struct rq *rq = cpu_rq(cpu);
7066
7067 if (!uclamp_is_used() &&
7068 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
7069 return max;
7070 }
7071
 /*
  * Early check to see if IRQ/steal time saturates the CPU, can be
  * because of inaccuracies in how we track these -- see
  * update_irq_load_avg().
  */
7077 irq = cpu_util_irq(rq);
7078 if (unlikely(irq >= max))
7079 return max;
7080
 /*
  * Because the time spent on RT/DL tasks is visible as 'lost' time to
  * CFS tasks and we use the same metric to track the effective
  * utilization (PELT windows are synchronized) we can directly add them
  * to obtain the CPU's actual utilization.
  *
  * CFS and RT utilization can be boosted or capped, depending on
  * utilization clamp constraints requested by currently RUNNABLE
  * tasks.
  * When there are no CFS RUNNABLE tasks, clamps are released and
  * frequency will be gracefully reduced with the utilization decay.
  */
7093 util = util_cfs + cpu_util_rt(rq);
7094 if (type == FREQUENCY_UTIL)
7095 util = uclamp_rq_util_with(rq, util, p);
7096
7097 dl_util = cpu_util_dl(rq);
7098
 /*
  * For frequency selection we do not make cpu_util_dl() a permanent part
  * of this sum because we want to use cpu_bw_dl() later on, but we need
  * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
  * that we select f_max when there is no idle time.
  *
  * NOTE: numerical errors or stop class might cause us to not quite hit
  * saturation when we should -- something for later.
  */
7108 if (util + dl_util >= max)
7109 return max;
7110
 /*
  * OTOH, for energy computation we need the estimated running time, so
  * include util_dl and ignore dl_bw.
  */
7115 if (type == ENERGY_UTIL)
7116 util += dl_util;
7117
 /*
  * There is still idle time; further improve the number by using the
  * irq metric. Because IRQ/steal time is hidden from the task clock we
  * need to scale the task numbers:
  *
  *              max - irq
  *   U' = irq + --------- * U
  *                 max
  */
7127 util = scale_irq_capacity(util, irq, max);
7128 util += irq;
7129
 /*
  * Bandwidth required by DEADLINE must always be granted while, for
  * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
  * to gracefully reduce the frequency when no tasks show up for longer
  * periods of time.
  *
  * Ideally we would like to set bw_dl as min/guaranteed freq and util +
  * bw_dl as requested freq. However, cpufreq is not yet ready for such
  * an interface. So, we only do the latter for now.
  */
7140 if (type == FREQUENCY_UTIL)
7141 util += cpu_bw_dl(rq);
7142
7143 return min(max, util);
7144}
7145
7146unsigned long sched_cpu_util(int cpu, unsigned long max)
7147{
7148 return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
7149 ENERGY_UTIL, NULL);
7150}
7151#endif
7152
/**
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question.
 *
 * The task of @pid, if found. %NULL otherwise.
 */
7159static struct task_struct *find_process_by_pid(pid_t pid)
7160{
7161 return pid ? find_task_by_vpid(pid) : current;
7162}
7163
/*
 * sched_setparam() passes in -1 for its policy, to let the functions
 * it calls know not to change it.
 */
7168#define SETPARAM_POLICY -1
7169
7170static void __setscheduler_params(struct task_struct *p,
7171 const struct sched_attr *attr)
7172{
7173 int policy = attr->sched_policy;
7174
7175 if (policy == SETPARAM_POLICY)
7176 policy = p->policy;
7177
7178 p->policy = policy;
7179
7180 if (dl_policy(policy))
7181 __setparam_dl(p, attr);
7182 else if (fair_policy(policy))
7183 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
7184
 /*
  * __sched_setscheduler() ensures attr->sched_priority == 0 when
  * !rt_policy. Always setting this ensures that things like
  * getparam()/getattr() don't report silly values for !rt tasks.
  */
7190 p->rt_priority = attr->sched_priority;
7191 p->normal_prio = normal_prio(p);
7192 set_load_weight(p, true);
7193}
7194
/*
 * Check the target process has a UID that matches the current process's:
 */
7198static bool check_same_owner(struct task_struct *p)
7199{
7200 const struct cred *cred = current_cred(), *pcred;
7201 bool match;
7202
7203 rcu_read_lock();
7204 pcred = __task_cred(p);
7205 match = (uid_eq(cred->euid, pcred->euid) ||
7206 uid_eq(cred->euid, pcred->uid));
7207 rcu_read_unlock();
7208 return match;
7209}
7210
7211static int __sched_setscheduler(struct task_struct *p,
7212 const struct sched_attr *attr,
7213 bool user, bool pi)
7214{
7215 int oldpolicy = -1, policy = attr->sched_policy;
7216 int retval, oldprio, newprio, queued, running;
7217 const struct sched_class *prev_class;
7218 struct callback_head *head;
7219 struct rq_flags rf;
7220 int reset_on_fork;
7221 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
7222 struct rq *rq;
7223
7224
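 /* The pi code expects interrupts enabled */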
7225 BUG_ON(pi && in_interrupt());
7226recheck:
7227
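 /* Double check policy once rq lock held: */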
7228 if (policy < 0) {
7229 reset_on_fork = p->sched_reset_on_fork;
7230 policy = oldpolicy = p->policy;
7231 } else {
7232 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
7233
7234 if (!valid_policy(policy))
7235 return -EINVAL;
7236 }
7237
7238 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
7239 return -EINVAL;
7240
 /*
  * Valid priorities for SCHED_FIFO and SCHED_RR are
  * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
  * SCHED_BATCH and SCHED_IDLE is 0.
  */
7246 if (attr->sched_priority > MAX_RT_PRIO-1)
7247 return -EINVAL;
7248 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
7249 (rt_policy(policy) != (attr->sched_priority != 0)))
7250 return -EINVAL;
7251
 /*
  * Allow unprivileged RT tasks to decrease priority:
  */
7255 if (user && !capable(CAP_SYS_NICE)) {
7256 if (fair_policy(policy)) {
7257 if (attr->sched_nice < task_nice(p) &&
7258 !can_nice(p, attr->sched_nice))
7259 return -EPERM;
7260 }
7261
7262 if (rt_policy(policy)) {
7263 unsigned long rlim_rtprio =
7264 task_rlimit(p, RLIMIT_RTPRIO);
7265
7266
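   /* Can't set/change the rt policy: */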
7267 if (policy != p->policy && !rlim_rtprio)
7268 return -EPERM;
7269
7270
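   /* Can't increase priority: */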
7271 if (attr->sched_priority > p->rt_priority &&
7272 attr->sched_priority > rlim_rtprio)
7273 return -EPERM;
7274 }
7275
  /*
   * Can't set/change SCHED_DEADLINE policy at all for now
   * (safest behavior); in the future we would like to allow
   * unprivileged DL tasks to increase their relative deadline
   * or reduce their runtime (both ways reducing utilization)
   */
7282 if (dl_policy(policy))
7283 return -EPERM;
7284
  /*
   * Treat SCHED_IDLE as nice 20. Only allow a switch to
   * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
   */
7289 if (task_has_idle_policy(p) && !idle_policy(policy)) {
7290 if (!can_nice(p, task_nice(p)))
7291 return -EPERM;
7292 }
7293
7294
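  /* Can't change other user's priorities: */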
7295 if (!check_same_owner(p))
7296 return -EPERM;
7297
7298
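  /* Normal users shall not reset the sched_reset_on_fork flag: */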
7299 if (p->sched_reset_on_fork && !reset_on_fork)
7300 return -EPERM;
7301 }
7302
7303 if (user) {
7304 if (attr->sched_flags & SCHED_FLAG_SUGOV)
7305 return -EINVAL;
7306
7307 retval = security_task_setscheduler(p);
7308 if (retval)
7309 return retval;
7310 }
7311
7312
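 /* Update task specific "requested" clamps */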
7313 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
7314 retval = uclamp_validate(p, attr);
7315 if (retval)
7316 return retval;
7317 }
7318
7319 if (pi)
7320 cpuset_read_lock();
7321
 /*
  * Make sure no PI-waiters arrive (or leave) while we are
  * changing the priority of the task:
  *
  * To be able to change p->policy safely, the appropriate
  * runqueue lock must be held.
  */
7329 rq = task_rq_lock(p, &rf);
7330 update_rq_clock(rq);
7331
 /*
  * Changing the policy of the stop threads is a very bad idea:
  */
7335 if (p == rq->stop) {
7336 retval = -EINVAL;
7337 goto unlock;
7338 }
7339
 /*
  * If not changing anything there's no need to proceed further,
  * but store a possible modification of reset_on_fork.
  */
7344 if (unlikely(policy == p->policy)) {
7345 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
7346 goto change;
7347 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
7348 goto change;
7349 if (dl_policy(policy) && dl_param_changed(p, attr))
7350 goto change;
7351 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
7352 goto change;
7353
7354 p->sched_reset_on_fork = reset_on_fork;
7355 retval = 0;
7356 goto unlock;
7357 }
7358change:
7359
7360 if (user) {
7361#ifdef CONFIG_RT_GROUP_SCHED
  /*
   * Do not allow realtime tasks into groups that have no runtime
   * assigned.
   */
7366 if (rt_bandwidth_enabled() && rt_policy(policy) &&
7367 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
7368 !task_group_is_autogroup(task_group(p))) {
7369 retval = -EPERM;
7370 goto unlock;
7371 }
7372#endif
7373#ifdef CONFIG_SMP
7374 if (dl_bandwidth_enabled() && dl_policy(policy) &&
7375 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
7376 cpumask_t *span = rq->rd->span;
7377
   /*
    * Don't allow tasks with an affinity mask smaller
    * than the entire root_domain to become SCHED_DEADLINE.
    * We will also fail if there's no bandwidth available.
    */
7383 if (!cpumask_subset(span, p->cpus_ptr) ||
7384 rq->rd->dl_bw.bw == 0) {
7385 retval = -EPERM;
7386 goto unlock;
7387 }
7388 }
7389#endif
7390 }
7391
7392
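 /* Re-check policy now with rq lock held: */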
7393 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
7394 policy = oldpolicy = -1;
7395 task_rq_unlock(rq, p, &rf);
7396 if (pi)
7397 cpuset_read_unlock();
7398 goto recheck;
7399 }
7400
 /*
  * If setscheduling to SCHED_DEADLINE (or changing the parameters
  * of a SCHED_DEADLINE task) we need to check if enough bandwidth
  * is available.
  */
7406 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
7407 retval = -EBUSY;
7408 goto unlock;
7409 }
7410
7411 p->sched_reset_on_fork = reset_on_fork;
7412 oldprio = p->prio;
7413
7414 newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
7415 if (pi) {
  /*
   * Take priority boosted tasks into account. If the new
   * effective priority is unchanged, we just store the new
   * normal parameters and do not touch the scheduler class and
   * the runqueue. This will be done when the task deboosts
   * itself.
   */
7423 newprio = rt_effective_prio(p, newprio);
7424 if (newprio == oldprio)
7425 queue_flags &= ~DEQUEUE_MOVE;
7426 }
7427
7428 queued = task_on_rq_queued(p);
7429 running = task_current(rq, p);
7430 if (queued)
7431 dequeue_task(rq, p, queue_flags);
7432 if (running)
7433 put_prev_task(rq, p);
7434
7435 prev_class = p->sched_class;
7436
7437 if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
7438 __setscheduler_params(p, attr);
7439 __setscheduler_prio(p, newprio);
7440 }
7441 __setscheduler_uclamp(p, attr);
7442
7443 if (queued) {
  /*
   * We enqueue to tail when the priority of a task is
   * increased (user space view).
   */
7448 if (oldprio < p->prio)
7449 queue_flags |= ENQUEUE_HEAD;
7450
7451 enqueue_task(rq, p, queue_flags);
7452 }
7453 if (running)
7454 set_next_task(rq, p);
7455
7456 check_class_changed(rq, p, prev_class, oldprio);
7457
 /* Avoid rq from going away on us: */
7459 preempt_disable();
7460 head = splice_balance_callbacks(rq);
7461 task_rq_unlock(rq, p, &rf);
7462
7463 if (pi) {
7464 cpuset_read_unlock();
7465 rt_mutex_adjust_pi(p);
7466 }
7467
 /* Run balance callbacks after we've adjusted the PI chain: */
7469 balance_callbacks(rq, head);
7470 preempt_enable();
7471
7472 return 0;
7473
7474unlock:
7475 task_rq_unlock(rq, p, &rf);
7476 if (pi)
7477 cpuset_read_unlock();
7478 return retval;
7479}
7480
7481static int _sched_setscheduler(struct task_struct *p, int policy,
7482 const struct sched_param *param, bool check)
7483{
7484 struct sched_attr attr = {
7485 .sched_policy = policy,
7486 .sched_priority = param->sched_priority,
7487 .sched_nice = PRIO_TO_NICE(p->static_prio),
7488 };
7489
7490
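 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */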
7491 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
7492 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7493 policy &= ~SCHED_RESET_ON_FORK;
7494 attr.sched_policy = policy;
7495 }
7496
7497 return __sched_setscheduler(p, &attr, check, true);
7498}
7499
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Use sched_set_fifo(), read its comment.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */
7511int sched_setscheduler(struct task_struct *p, int policy,
7512 const struct sched_param *param)
7513{
7514 return _sched_setscheduler(p, policy, param, true);
7515}
7516
7517int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
7518{
7519 return __sched_setscheduler(p, attr, true, true);
7520}
7521
7522int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
7523{
7524 return __sched_setscheduler(p, attr, false, true);
7525}
7526EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
7527
/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission. For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * and the caller is already expected to be privileged.
 *
 * Return: 0 on success. An error code otherwise.
 */
7541int sched_setscheduler_nocheck(struct task_struct *p, int policy,
7542 const struct sched_param *param)
7543{
7544 return _sched_setscheduler(p, policy, param, false);
7545}
7546
/*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
 * incapable of resource management, which is the one thing an OS really should
 * be doing.
 *
 * This is of course the reason it is limited to privileged users only.
 *
 * Worse still; it is fundamentally impossible to compose static priority
 * workloads. You cannot take two correctly working static prio workloads
 * and smash them together and still expect them to work.
 *
 * For this reason 'all' FIFO tasks the kernel creates are basically at:
 *
 *   MAX_RT_PRIO / 2
 *
 * The administrator _MUST_ configure the system, the kernel simply doesn't
 * know enough information to make a sensible choice.
 */
7565void sched_set_fifo(struct task_struct *p)
7566{
7567 struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
7568 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7569}
7570EXPORT_SYMBOL_GPL(sched_set_fifo);
7571
/*
 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
 */
7575void sched_set_fifo_low(struct task_struct *p)
7576{
7577 struct sched_param sp = { .sched_priority = 1 };
7578 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7579}
7580EXPORT_SYMBOL_GPL(sched_set_fifo_low);
7581
7582void sched_set_normal(struct task_struct *p, int nice)
7583{
7584 struct sched_attr attr = {
7585 .sched_policy = SCHED_NORMAL,
7586 .sched_nice = nice,
7587 };
7588 WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
7589}
7590EXPORT_SYMBOL_GPL(sched_set_normal);
7591
7592static int
7593do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
7594{
7595 struct sched_param lparam;
7596 struct task_struct *p;
7597 int retval;
7598
7599 if (!param || pid < 0)
7600 return -EINVAL;
7601 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
7602 return -EFAULT;
7603
7604 rcu_read_lock();
7605 retval = -ESRCH;
7606 p = find_process_by_pid(pid);
7607 if (likely(p))
7608 get_task_struct(p);
7609 rcu_read_unlock();
7610
7611 if (likely(p)) {
7612 retval = sched_setscheduler(p, policy, &lparam);
7613 put_task_struct(p);
7614 }
7615
7616 return retval;
7617}
7618
/*
 * Mimics kernel/events/core.c perf_copy_attr().
 */
7622static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
7623{
7624 u32 size;
7625 int ret;
7626
7627
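 /* Zero the full structure, so that a short copy will be nice: */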
7628 memset(attr, 0, sizeof(*attr));
7629
7630 ret = get_user(size, &uattr->size);
7631 if (ret)
7632 return ret;
7633
7634
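 /* ABI compatibility quirk: */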
7635 if (!size)
7636 size = SCHED_ATTR_SIZE_VER0;
7637 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
7638 goto err_size;
7639
7640 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
7641 if (ret) {
7642 if (ret == -E2BIG)
7643 goto err_size;
7644 return ret;
7645 }
7646
7647 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
7648 size < SCHED_ATTR_SIZE_VER1)
7649 return -EINVAL;
7650
 /*
  * XXX: Do we want to be lenient like existing syscalls; or do we want
  * to be strict and return an error on out-of-bounds values?
  */
7655 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
7656
7657 return 0;
7658
7659err_size:
7660 put_user(sizeof(*attr), &uattr->size);
7661 return -E2BIG;
7662}
7663
7664static void get_params(struct task_struct *p, struct sched_attr *attr)
7665{
7666 if (task_has_dl_policy(p))
7667 __getparam_dl(p, attr);
7668 else if (task_has_rt_policy(p))
7669 attr->sched_priority = p->rt_priority;
7670 else
7671 attr->sched_nice = task_nice(p);
7672}
7673
/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
7682SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
7683{
7684 if (policy < 0)
7685 return -EINVAL;
7686
7687 return do_sched_setscheduler(pid, policy, param);
7688}
7689
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
7697SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
7698{
7699 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
7700}
7701
/**
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension.
 */
7708SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
7709 unsigned int, flags)
7710{
7711 struct sched_attr attr;
7712 struct task_struct *p;
7713 int retval;
7714
7715 if (!uattr || pid < 0 || flags)
7716 return -EINVAL;
7717
7718 retval = sched_copy_attr(uattr, &attr);
7719 if (retval)
7720 return retval;
7721
7722 if ((int)attr.sched_policy < 0)
7723 return -EINVAL;
7724 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
7725 attr.sched_policy = SETPARAM_POLICY;
7726
7727 rcu_read_lock();
7728 retval = -ESRCH;
7729 p = find_process_by_pid(pid);
7730 if (likely(p))
7731 get_task_struct(p);
7732 rcu_read_unlock();
7733
7734 if (likely(p)) {
7735 if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
7736 get_params(p, &attr);
7737 retval = sched_setattr(p, &attr);
7738 put_task_struct(p);
7739 }
7740
7741 return retval;
7742}
7743
/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative error
 * code.
 */
7751SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
7752{
7753 struct task_struct *p;
7754 int retval;
7755
7756 if (pid < 0)
7757 return -EINVAL;
7758
7759 retval = -ESRCH;
7760 rcu_read_lock();
7761 p = find_process_by_pid(pid);
7762 if (p) {
7763 retval = security_task_getscheduler(p);
7764 if (!retval)
7765 retval = p->policy
7766 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
7767 }
7768 rcu_read_unlock();
7769 return retval;
7770}
7771
/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an
 * error code.
 */
7780SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
7781{
7782 struct sched_param lp = { .sched_priority = 0 };
7783 struct task_struct *p;
7784 int retval;
7785
7786 if (!param || pid < 0)
7787 return -EINVAL;
7788
7789 rcu_read_lock();
7790 p = find_process_by_pid(pid);
7791 retval = -ESRCH;
7792 if (!p)
7793 goto out_unlock;
7794
7795 retval = security_task_getscheduler(p);
7796 if (retval)
7797 goto out_unlock;
7798
7799 if (task_has_rt_policy(p))
7800 lp.sched_priority = p->rt_priority;
7801 rcu_read_unlock();
7802
 /*
  * This one might sleep, we cannot do it with a spinlock held ...
  */
7806 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
7807
7808 return retval;
7809
7810out_unlock:
7811 rcu_read_unlock();
7812 return retval;
7813}
7814
/*
 * Copy the kernel size attribute structure (which might be larger
 * than what user-space knows about) to user-space.
 *
 * Note that all cases are valid: user-space buffer can be larger or
 * smaller than the kernel-space buffer. The usual case is that both
 * have the same size.
 */
7823static int
7824sched_attr_copy_to_user(struct sched_attr __user *uattr,
7825 struct sched_attr *kattr,
7826 unsigned int usize)
7827{
7828 unsigned int ksize = sizeof(*kattr);
7829
7830 if (!access_ok(uattr, usize))
7831 return -EFAULT;
7832
 /*
  * sched_getattr() ABI forwards and backwards compatibility:
  *
  * If usize == ksize then we just copy everything to user-space and all is good.
  *
  * If usize < ksize then we only copy as much as user-space has space for,
  * this keeps ABI compatibility as well. We skip the rest.
  *
  * If usize > ksize then user-space is using a newer version of the ABI,
  * which part the kernel doesn't know about. Just ignore it - tooling can
  * detect the kernel's knowledge of attributes from the attr->size value
  * which is set to ksize in this case.
  */
7846 kattr->size = min(usize, ksize);
7847
7848 if (copy_to_user(uattr, kattr, kattr->size))
7849 return -EFAULT;
7850
7851 return 0;
7852}
7853
/**
 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @usize: sizeof(attr) for fwd/bwd comp.
 * @flags: for future extension.
 */
7861SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
7862 unsigned int, usize, unsigned int, flags)
7863{
7864 struct sched_attr kattr = { };
7865 struct task_struct *p;
7866 int retval;
7867
7868 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
7869 usize < SCHED_ATTR_SIZE_VER0 || flags)
7870 return -EINVAL;
7871
7872 rcu_read_lock();
7873 p = find_process_by_pid(pid);
7874 retval = -ESRCH;
7875 if (!p)
7876 goto out_unlock;
7877
7878 retval = security_task_getscheduler(p);
7879 if (retval)
7880 goto out_unlock;
7881
7882 kattr.sched_policy = p->policy;
7883 if (p->sched_reset_on_fork)
7884 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7885 get_params(p, &kattr);
7886 kattr.sched_flags &= SCHED_FLAG_ALL;
7887
7888#ifdef CONFIG_UCLAMP_TASK
 /*
  * This could race with another potential updater, but this is fine
  * because it'll correctly read the old or the new value. We don't need
  * to guarantee who wins the race as long as it doesn't return garbage.
  */
7894 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
7895 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
7896#endif
7897
7898 rcu_read_unlock();
7899
7900 return sched_attr_copy_to_user(uattr, &kattr, usize);
7901
7902out_unlock:
7903 rcu_read_unlock();
7904 return retval;
7905}
7906
7907#ifdef CONFIG_SMP
7908int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
7909{
7910 int ret = 0;
7911
 /*
  * If the task isn't a deadline task or admission control is
  * disabled then we don't care about affinity changes.
  */
7916 if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
7917 return 0;
7918
 /*
  * Since bandwidth control happens on root_domain basis,
  * if admission test is enabled, we only admit -deadline
  * tasks allowed to run on all the CPUs in the task's
  * root_domain.
  */
7925 rcu_read_lock();
7926 if (!cpumask_subset(task_rq(p)->rd->span, mask))
7927 ret = -EBUSY;
7928 rcu_read_unlock();
7929 return ret;
7930}
7931#endif
7932
7933static int
7934__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
7935{
7936 int retval;
7937 cpumask_var_t cpus_allowed, new_mask;
7938
7939 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
7940 return -ENOMEM;
7941
7942 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
7943 retval = -ENOMEM;
7944 goto out_free_cpus_allowed;
7945 }
7946
7947 cpuset_cpus_allowed(p, cpus_allowed);
7948 cpumask_and(new_mask, mask, cpus_allowed);
7949
7950 retval = dl_task_check_affinity(p, new_mask);
7951 if (retval)
7952 goto out_free_new_mask;
7953again:
7954 retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
7955 if (retval)
7956 goto out_free_new_mask;
7957
7958 cpuset_cpus_allowed(p, cpus_allowed);
7959 if (!cpumask_subset(new_mask, cpus_allowed)) {
  /*
   * We must have raced with a concurrent cpuset update.
   * Just reset the cpumask to the cpuset's cpus_allowed.
   */
7964 cpumask_copy(new_mask, cpus_allowed);
7965 goto again;
7966 }
7967
7968out_free_new_mask:
7969 free_cpumask_var(new_mask);
7970out_free_cpus_allowed:
7971 free_cpumask_var(cpus_allowed);
7972 return retval;
7973}
7974
7975long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
7976{
7977 struct task_struct *p;
7978 int retval;
7979
7980 rcu_read_lock();
7981
7982 p = find_process_by_pid(pid);
7983 if (!p) {
7984 rcu_read_unlock();
7985 return -ESRCH;
7986 }
7987
7988
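 /* Prevent p going away */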
7989 get_task_struct(p);
7990 rcu_read_unlock();
7991
7992 if (p->flags & PF_NO_SETAFFINITY) {
7993 retval = -EINVAL;
7994 goto out_put_task;
7995 }
7996
7997 if (!check_same_owner(p)) {
7998 rcu_read_lock();
7999 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
8000 rcu_read_unlock();
8001 retval = -EPERM;
8002 goto out_put_task;
8003 }
8004 rcu_read_unlock();
8005 }
8006
8007 retval = security_task_setscheduler(p);
8008 if (retval)
8009 goto out_put_task;
8010
8011 retval = __sched_setaffinity(p, in_mask);
8012out_put_task:
8013 put_task_struct(p);
8014 return retval;
8015}
8016
8017static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
8018 struct cpumask *new_mask)
8019{
8020 if (len < cpumask_size())
8021 cpumask_clear(new_mask);
8022 else if (len > cpumask_size())
8023 len = cpumask_size();
8024
8025 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
8026}
8027
/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success. An error code otherwise.
 */
8036SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
8037 unsigned long __user *, user_mask_ptr)
8038{
8039 cpumask_var_t new_mask;
8040 int retval;
8041
8042 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
8043 return -ENOMEM;
8044
8045 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
8046 if (retval == 0)
8047 retval = sched_setaffinity(pid, new_mask);
8048 free_cpumask_var(new_mask);
8049 return retval;
8050}
8051
8052long sched_getaffinity(pid_t pid, struct cpumask *mask)
8053{
8054 struct task_struct *p;
8055 unsigned long flags;
8056 int retval;
8057
8058 rcu_read_lock();
8059
8060 retval = -ESRCH;
8061 p = find_process_by_pid(pid);
8062 if (!p)
8063 goto out_unlock;
8064
8065 retval = security_task_getscheduler(p);
8066 if (retval)
8067 goto out_unlock;
8068
8069 raw_spin_lock_irqsave(&p->pi_lock, flags);
8070 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
8071 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
8072
8073out_unlock:
8074 rcu_read_unlock();
8075
8076 return retval;
8077}
8078
/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */
8088SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
8089 unsigned long __user *, user_mask_ptr)
8090{
8091 int ret;
8092 cpumask_var_t mask;
8093
8094 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
8095 return -EINVAL;
8096 if (len & (sizeof(unsigned long)-1))
8097 return -EINVAL;
8098
8099 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
8100 return -ENOMEM;
8101
8102 ret = sched_getaffinity(pid, mask);
8103 if (ret == 0) {
8104 unsigned int retlen = min(len, cpumask_size());
8105
8106 if (copy_to_user(user_mask_ptr, mask, retlen))
8107 ret = -EFAULT;
8108 else
8109 ret = retlen;
8110 }
8111 free_cpumask_var(mask);
8112
8113 return ret;
8114}
8115
8116static void do_sched_yield(void)
8117{
8118 struct rq_flags rf;
8119 struct rq *rq;
8120
8121 rq = this_rq_lock_irq(&rf);
8122
8123 schedstat_inc(rq->yld_count);
8124 current->sched_class->yield_task(rq);

 /*
  * Since we are going to call schedule() anyway, there's
  * no need to preempt or enable interrupts:
  */
8126 preempt_disable();
8127 rq_unlock_irq(rq, &rf);
8128 sched_preempt_enable_no_resched();
8129
8130 schedule();
8131}
8132
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU, then this function will return.
 *
 * Return: 0.
 */
8141SYSCALL_DEFINE0(sched_yield)
8142{
8143 do_sched_yield();
8144 return 0;
8145}
8146
8147#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
8148int __sched __cond_resched(void)
8149{
8150 if (should_resched(0)) {
8151 preempt_schedule_common();
8152 return 1;
8153 }

 /*
  * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
  * whether the current CPU is in an RCU read-side critical section,
  * so the tick can report quiescent states even for CPUs looping
  * in kernel context.  In contrast, in non-preemptible kernels,
  * RCU readers leave no in-memory hints, which means that CPU-bound
  * processes executing in kernel context might never report an
  * RCU quiescent state.  Therefore, the following code causes
  * cond_resched() to report a quiescent state, but only when RCU
  * is in urgent need of one.
  */
8165#ifndef CONFIG_PREEMPT_RCU
8166 rcu_all_qs();
8167#endif
8168 return 0;
8169}
8170EXPORT_SYMBOL(__cond_resched);
8171#endif
8172
8173#ifdef CONFIG_PREEMPT_DYNAMIC
8174DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
8175EXPORT_STATIC_CALL_TRAMP(cond_resched);
8176
8177DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
8178EXPORT_STATIC_CALL_TRAMP(might_resched);
8179#endif
8180
/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPTION. We do strange
 * low-level operations here to prevent schedule() from being called twice
 * (once via spin_unlock(), once by hand).
 */
8189int __cond_resched_lock(spinlock_t *lock)
8190{
8191 int resched = should_resched(PREEMPT_LOCK_OFFSET);
8192 int ret = 0;
8193
8194 lockdep_assert_held(lock);
8195
8196 if (spin_needbreak(lock) || resched) {
8197 spin_unlock(lock);
8198 if (resched)
8199 preempt_schedule_common();
8200 else
8201 cpu_relax();
8202 ret = 1;
8203 spin_lock(lock);
8204 }
8205 return ret;
8206}
8207EXPORT_SYMBOL(__cond_resched_lock);
8208
8209int __cond_resched_rwlock_read(rwlock_t *lock)
8210{
8211 int resched = should_resched(PREEMPT_LOCK_OFFSET);
8212 int ret = 0;
8213
8214 lockdep_assert_held_read(lock);
8215
8216 if (rwlock_needbreak(lock) || resched) {
8217 read_unlock(lock);
8218 if (resched)
8219 preempt_schedule_common();
8220 else
8221 cpu_relax();
8222 ret = 1;
8223 read_lock(lock);
8224 }
8225 return ret;
8226}
8227EXPORT_SYMBOL(__cond_resched_rwlock_read);
8228
8229int __cond_resched_rwlock_write(rwlock_t *lock)
8230{
8231 int resched = should_resched(PREEMPT_LOCK_OFFSET);
8232 int ret = 0;
8233
8234 lockdep_assert_held_write(lock);
8235
8236 if (rwlock_needbreak(lock) || resched) {
8237 write_unlock(lock);
8238 if (resched)
8239 preempt_schedule_common();
8240 else
8241 cpu_relax();
8242 ret = 1;
8243 write_lock(lock);
8244 }
8245 return ret;
8246}
8247EXPORT_SYMBOL(__cond_resched_rwlock_write);
8248
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run, if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 * wait_for_completion(&event)
 * while (!event)
 *	yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
8271void __sched yield(void)
8272{
8273 set_current_state(TASK_RUNNING);
8274 do_sched_yield();
8275}
8276EXPORT_SYMBOL(yield);
8277
/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */
8293int __sched yield_to(struct task_struct *p, bool preempt)
8294{
8295 struct task_struct *curr = current;
8296 struct rq *rq, *p_rq;
8297 unsigned long flags;
8298 int yielded = 0;
8299
8300 local_irq_save(flags);
8301 rq = this_rq();
8302
8303again:
8304 p_rq = task_rq(p);
 /*
  * If we're the only runnable task on the rq and target rq also
  * has only one task, there's absolutely no point in yielding.
  */
8309 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
8310 yielded = -ESRCH;
8311 goto out_irq;
8312 }
8313
8314 double_rq_lock(rq, p_rq);
8315 if (task_rq(p) != p_rq) {
8316 double_rq_unlock(rq, p_rq);
8317 goto again;
8318 }
8319
8320 if (!curr->sched_class->yield_to_task)
8321 goto out_unlock;
8322
8323 if (curr->sched_class != p->sched_class)
8324 goto out_unlock;
8325
8326 if (task_running(p_rq, p) || !task_is_running(p))
8327 goto out_unlock;
8328
8329 yielded = curr->sched_class->yield_to_task(rq, p);
8330 if (yielded) {
8331 schedstat_inc(rq->yld_count);
  /*
   * Make p's CPU reschedule; pick_next_entity takes care of
   * fairness.
   */
8336 if (preempt && rq != p_rq)
8337 resched_curr(p_rq);
8338 }
8339
8340out_unlock:
8341 double_rq_unlock(rq, p_rq);
8342out_irq:
8343 local_irq_restore(flags);
8344
8345 if (yielded > 0)
8346 schedule();
8347
8348 return yielded;
8349}
8350EXPORT_SYMBOL_GPL(yield_to);
8351
8352int io_schedule_prepare(void)
8353{
8354 int old_iowait = current->in_iowait;
8355
8356 current->in_iowait = 1;
8357 blk_schedule_flush_plug(current);
8358
8359 return old_iowait;
8360}
8361
8362void io_schedule_finish(int token)
8363{
8364 current->in_iowait = token;
8365}
8366
/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */
8371long __sched io_schedule_timeout(long timeout)
8372{
8373 int token;
8374 long ret;
8375
8376 token = io_schedule_prepare();
8377 ret = schedule_timeout(timeout);
8378 io_schedule_finish(token);
8379
8380 return ret;
8381}
8382EXPORT_SYMBOL(io_schedule_timeout);
8383
8384void __sched io_schedule(void)
8385{
8386 int token;
8387
8388 token = io_schedule_prepare();
8389 schedule();
8390 io_schedule_finish(token);
8391}
8392EXPORT_SYMBOL(io_schedule);
8393
/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
8402SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
8403{
8404 int ret = -EINVAL;
8405
8406 switch (policy) {
8407 case SCHED_FIFO:
8408 case SCHED_RR:
8409 ret = MAX_RT_PRIO-1;
8410 break;
8411 case SCHED_DEADLINE:
8412 case SCHED_NORMAL:
8413 case SCHED_BATCH:
8414 case SCHED_IDLE:
8415 ret = 0;
8416 break;
8417 }
8418 return ret;
8419}
8420
/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
8429SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
8430{
8431 int ret = -EINVAL;
8432
8433 switch (policy) {
8434 case SCHED_FIFO:
8435 case SCHED_RR:
8436 ret = 1;
8437 break;
8438 case SCHED_DEADLINE:
8439 case SCHED_NORMAL:
8440 case SCHED_BATCH:
8441 case SCHED_IDLE:
8442 ret = 0;
8443 }
8444 return ret;
8445}
8446
8447static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
8448{
8449 struct task_struct *p;
8450 unsigned int time_slice;
8451 struct rq_flags rf;
8452 struct rq *rq;
8453 int retval;
8454
8455 if (pid < 0)
8456 return -EINVAL;
8457
8458 retval = -ESRCH;
8459 rcu_read_lock();
8460 p = find_process_by_pid(pid);
8461 if (!p)
8462 goto out_unlock;
8463
8464 retval = security_task_getscheduler(p);
8465 if (retval)
8466 goto out_unlock;
8467
8468 rq = task_rq_lock(p, &rf);
8469 time_slice = 0;
8470 if (p->sched_class->get_rr_interval)
8471 time_slice = p->sched_class->get_rr_interval(rq, p);
8472 task_rq_unlock(rq, p, &rf);
8473
8474 rcu_read_unlock();
8475 jiffies_to_timespec64(time_slice, t);
8476 return 0;
8477
8478out_unlock:
8479 rcu_read_unlock();
8480 return retval;
8481}
8482
/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
 * an error code.
 */
8494SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
8495 struct __kernel_timespec __user *, interval)
8496{
8497 struct timespec64 t;
8498 int retval = sched_rr_get_interval(pid, &t);
8499
8500 if (retval == 0)
8501 retval = put_timespec64(&t, interval);
8502
8503 return retval;
8504}
8505
8506#ifdef CONFIG_COMPAT_32BIT_TIME
8507SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
8508 struct old_timespec32 __user *, interval)
8509{
8510 struct timespec64 t;
8511 int retval = sched_rr_get_interval(pid, &t);
8512
8513 if (retval == 0)
8514 retval = put_old_timespec32(&t, interval);
8515 return retval;
8516}
8517#endif
8518
8519void sched_show_task(struct task_struct *p)
8520{
8521 unsigned long free = 0;
8522 int ppid;
8523
8524 if (!try_get_task_stack(p))
8525 return;
8526
8527 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
8528
8529 if (task_is_running(p))
8530 pr_cont(" running task ");
8531#ifdef CONFIG_DEBUG_STACK_USAGE
8532 free = stack_not_used(p);
8533#endif
8534 ppid = 0;
8535 rcu_read_lock();
8536 if (pid_alive(p))
8537 ppid = task_pid_nr(rcu_dereference(p->real_parent));
8538 rcu_read_unlock();
8539 pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
8540 free, task_pid_nr(p), ppid,
8541 (unsigned long)task_thread_info(p)->flags);
8542
8543 print_worker_info(KERN_INFO, p);
8544 print_stop_info(KERN_INFO, p);
8545 show_stack(p, NULL, KERN_INFO);
8546 put_task_stack(p);
8547}
8548EXPORT_SYMBOL_GPL(sched_show_task);
8549
8550static inline bool
8551state_filter_match(unsigned long state_filter, struct task_struct *p)
8552{
8553 unsigned int state = READ_ONCE(p->__state);
8554
8555
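 /* no filter, everything matches */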
8556 if (!state_filter)
8557 return true;
8558
8559
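 /* filter, but doesn't match */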
8560 if (!(state & state_filter))
8561 return false;
8562
 /*
  * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
  * TASK_KILLABLE).
  */
8567 if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
8568 return false;
8569
8570 return true;
8571}
8572
8573
8574void show_state_filter(unsigned int state_filter)
8575{
8576 struct task_struct *g, *p;
8577
8578 rcu_read_lock();
8579 for_each_process_thread(g, p) {
  /*
   * reset the NMI-timeout, listing all files on a slow
   * console might take a lot of time:
   * Also, reset softlockup watchdogs on all CPUs, because
   * another CPU might be blocked waiting for us to process
   * an IPI.
   */
8587 touch_nmi_watchdog();
8588 touch_all_softlockup_watchdogs();
8589 if (state_filter_match(state_filter, p))
8590 sched_show_task(p);
8591 }
8592
8593#ifdef CONFIG_SCHED_DEBUG
8594 if (!state_filter)
8595 sysrq_sched_debug_show();
8596#endif
8597 rcu_read_unlock();
8598
8599
 /*
  * Only show locks if all tasks are dumped:
  */
8601 if (!state_filter)
8602 debug_show_all_locks();
8603}
8604
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
8613void __init init_idle(struct task_struct *idle, int cpu)
8614{
8615 struct rq *rq = cpu_rq(cpu);
8616 unsigned long flags;
8617
8618 __sched_fork(0, idle);
8619
 /*
  * The idle task doesn't need the kthread struct to function, but it
  * is dressed up as a per-CPU kthread and thus needs to play the part
  * if we want to avoid special-casing it in code that deals with per-CPU
  * kthreads.
  */
8626 set_kthread_struct(idle);
8627
8628 raw_spin_lock_irqsave(&idle->pi_lock, flags);
8629 raw_spin_rq_lock(rq);
8630
8631 idle->__state = TASK_RUNNING;
8632 idle->se.exec_start = sched_clock();
8633
 /*
  * PF_KTHREAD should already be set at this point; regardless, make it
  * look like a proper per-CPU kthread.
  */
8637 idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
8638 kthread_set_per_cpu(idle, cpu);
8639
8640 scs_task_reset(idle);
8641 kasan_unpoison_task_stack(idle);
8642
8643#ifdef CONFIG_SMP
 /*
  * It's possible that init_idle() gets called multiple times on a task,
  * in that case do_set_cpus_allowed() will not do the right thing.
  *
  * And since this is boot we can forgo the serialization.
  */
8650 set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
8651#endif
8652
 /*
  * We're having a chicken and egg problem, even though we are
  * holding rq->lock, the CPU isn't yet set to this CPU so the
  * lockdep check in task_group() will fail.
  *
  * Similar case to sched_fork(). / Alternatively we could
  * use task_rq_lock() here and obtain the other rq->lock.
  *
  * Silence PROVE_RCU
  */
8662 rcu_read_lock();
8663 __set_task_cpu(idle, cpu);
8664 rcu_read_unlock();
8665
8666 rq->idle = idle;
8667 rcu_assign_pointer(rq->curr, idle);
8668 idle->on_rq = TASK_ON_RQ_QUEUED;
8669#ifdef CONFIG_SMP
8670 idle->on_cpu = 1;
8671#endif
8672 raw_spin_rq_unlock(rq);
8673 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
8674
 /* Set the preempt count _outside_ the spinlocks! */
8676 init_idle_preempt_count(idle, cpu);
8677
 /*
  * The idle tasks have their own, simple scheduling class:
  */
8681 idle->sched_class = &idle_sched_class;
8682 ftrace_graph_init_idle_task(idle, cpu);
8683 vtime_init_idle(idle, cpu);
8684#ifdef CONFIG_SMP
8685 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
8686#endif
8687}
8688
8689#ifdef CONFIG_SMP
8690
8691int cpuset_cpumask_can_shrink(const struct cpumask *cur,
8692 const struct cpumask *trial)
8693{
8694 int ret = 1;
8695
8696 if (!cpumask_weight(cur))
8697 return ret;
8698
8699 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
8700
8701 return ret;
8702}
8703
8704int task_can_attach(struct task_struct *p,
8705 const struct cpumask *cs_cpus_allowed)
8706{
8707 int ret = 0;
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718 if (p->flags & PF_NO_SETAFFINITY) {
8719 ret = -EINVAL;
8720 goto out;
8721 }
8722
8723 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
8724 cs_cpus_allowed))
8725 ret = dl_task_can_attach(p, cs_cpus_allowed);
8726
8727out:
8728 return ret;
8729}
8730
8731bool sched_smp_initialized __read_mostly;
8732
8733#ifdef CONFIG_NUMA_BALANCING
8734
8735int migrate_task_to(struct task_struct *p, int target_cpu)
8736{
8737 struct migration_arg arg = { p, target_cpu };
8738 int curr_cpu = task_cpu(p);
8739
8740 if (curr_cpu == target_cpu)
8741 return 0;
8742
8743 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
8744 return -EINVAL;
8745
 /* TODO: This is not properly updating schedstats */
8747
8748 trace_sched_move_numa(p, curr_cpu, target_cpu);
8749 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
8750}
8751
/*
 * Requeue a task on a given node and accurately track the number of NUMA
 * tasks on the runqueues
 */
8756void sched_setnuma(struct task_struct *p, int nid)
8757{
8758 bool queued, running;
8759 struct rq_flags rf;
8760 struct rq *rq;
8761
8762 rq = task_rq_lock(p, &rf);
8763 queued = task_on_rq_queued(p);
8764 running = task_current(rq, p);
8765
8766 if (queued)
8767 dequeue_task(rq, p, DEQUEUE_SAVE);
8768 if (running)
8769 put_prev_task(rq, p);
8770
8771 p->numa_preferred_nid = nid;
8772
8773 if (queued)
8774 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
8775 if (running)
8776 set_next_task(rq, p);
8777 task_rq_unlock(rq, p, &rf);
8778}
8779#endif
8780
8781#ifdef CONFIG_HOTPLUG_CPU
/*
 * Ensure that the idle task is using init_mm right before its CPU goes
 * offline.
 */
8786void idle_task_exit(void)
8787{
8788 struct mm_struct *mm = current->active_mm;
8789
8790 BUG_ON(cpu_online(smp_processor_id()));
8791 BUG_ON(current != this_rq()->idle);
8792
8793 if (mm != &init_mm) {
8794 switch_mm(mm, &init_mm, current);
8795 finish_arch_post_lock_switch();
8796 }
8797
8798 scs_task_reset(current);
 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
8800}
8801
8802static int __balance_push_cpu_stop(void *arg)
8803{
8804 struct task_struct *p = arg;
8805 struct rq *rq = this_rq();
8806 struct rq_flags rf;
8807 int cpu;
8808
8809 raw_spin_lock_irq(&p->pi_lock);
8810 rq_lock(rq, &rf);
8811
8812 update_rq_clock(rq);
8813
8814 if (task_rq(p) == rq && task_on_rq_queued(p)) {
8815 cpu = select_fallback_rq(rq->cpu, p);
8816 rq = __migrate_task(rq, &rf, p, cpu);
8817 }
8818
8819 rq_unlock(rq, &rf);
8820 raw_spin_unlock_irq(&p->pi_lock);
8821
8822 put_task_struct(p);
8823
8824 return 0;
8825}
8826
8827static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
8828
/*
 * Ensure we only run per-cpu kthreads once the CPU goes !active.
 *
 * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
 * effective when the hotplug motion is down.
 */
8835static void balance_push(struct rq *rq)
8836{
8837 struct task_struct *push_task = rq->curr;
8838
8839 lockdep_assert_rq_held(rq);
8840
 /*
  * Ensure the thing is persistent until balance_push_set(.on = false);
  */
8844 rq->balance_callback = &balance_push_callback;
8845
 /*
  * Only active while going offline and when invoked on the outgoing
  * CPU.
  */
8850 if (!cpu_dying(rq->cpu) || rq != this_rq())
8851 return;
8852
 /*
  * Both the cpu-hotplug and stop task are in this case and are
  * required to complete the hotplug process.
  */
8857 if (kthread_is_per_cpu(push_task) ||
8858 is_migration_disabled(push_task)) {
8859
  /*
   * If this is the idle task on the outgoing CPU try to wake
   * up the hotplug control thread which might wait for the
   * last task to vanish. The rcuwait_active() check is
   * accurate here because the waiter is pinned on this CPU
   * and can't obviously be running in parallel.
   *
   * On RT kernels this also has to check whether there are
   * pinned and scheduled out tasks on the runqueue. They
   * need to leave the migrate disabled section first.
   */
8871 if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
8872 rcuwait_active(&rq->hotplug_wait)) {
8873 raw_spin_rq_unlock(rq);
8874 rcuwait_wake_up(&rq->hotplug_wait);
8875 raw_spin_rq_lock(rq);
8876 }
8877 return;
8878 }
8879
8880 get_task_struct(push_task);
 /*
  * Temporarily drop rq->lock such that we can wake-up the stop task.
  * Both preemption and IRQs are still disabled.
  */
8885 raw_spin_rq_unlock(rq);
8886 stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
8887 this_cpu_ptr(&push_work));
 /*
  * At this point need_resched() is true and we'll take the loop in
  * schedule(). The next pick is obviously going to be the stop task
  * which kthread_is_per_cpu() and will push this task away.
  */
8893 raw_spin_rq_lock(rq);
8894}
8895
8896static void balance_push_set(int cpu, bool on)
8897{
8898 struct rq *rq = cpu_rq(cpu);
8899 struct rq_flags rf;
8900
8901 rq_lock_irqsave(rq, &rf);
8902 if (on) {
8903 WARN_ON_ONCE(rq->balance_callback);
8904 rq->balance_callback = &balance_push_callback;
8905 } else if (rq->balance_callback == &balance_push_callback) {
8906 rq->balance_callback = NULL;
8907 }
8908 rq_unlock_irqrestore(rq, &rf);
8909}
8910
/*
 * Invoked from a CPUs hotplug control thread after the CPU has been marked
 * inactive. All tasks which are not per CPU kernel threads are either
 * pushed off this CPU now via balance_push() or placed on a different CPU
 * during wakeup. Wait until the CPU is quiescent.
 */
8917static void balance_hotplug_wait(void)
8918{
8919 struct rq *rq = this_rq();
8920
8921 rcuwait_wait_event(&rq->hotplug_wait,
8922 rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
8923 TASK_UNINTERRUPTIBLE);
8924}
8925
8926#else
8927
8928static inline void balance_push(struct rq *rq)
8929{
8930}
8931
8932static inline void balance_push_set(int cpu, bool on)
8933{
8934}
8935
8936static inline void balance_hotplug_wait(void)
8937{
8938}
8939
8940#endif
8941
8942void set_rq_online(struct rq *rq)
8943{
8944 if (!rq->online) {
8945 const struct sched_class *class;
8946
8947 cpumask_set_cpu(rq->cpu, rq->rd->online);
8948 rq->online = 1;
8949
8950 for_each_class(class) {
8951 if (class->rq_online)
8952 class->rq_online(rq);
8953 }
8954 }
8955}
8956
8957void set_rq_offline(struct rq *rq)
8958{
8959 if (rq->online) {
8960 const struct sched_class *class;
8961
8962 for_each_class(class) {
8963 if (class->rq_offline)
8964 class->rq_offline(rq);
8965 }
8966
8967 cpumask_clear_cpu(rq->cpu, rq->rd->online);
8968 rq->online = 0;
8969 }
8970}
8971
/*
 * used to mark begin/end of suspend/resume:
 */
8975static int num_cpus_frozen;
8976
/*
 * Update cpusets according to cpu_active mask.  If cpusets are
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
 * around partition_sched_domains().
 *
 * If we come here as part of a suspend/resume, don't touch cpusets because we
 * want to restore it back to its original state upon resume anyway.
 */
8985static void cpuset_cpu_active(void)
8986{
8987 if (cpuhp_tasks_frozen) {
  /*
   * num_cpus_frozen tracks how many CPUs are involved in suspend
   * resume sequence. As long as this is not the last online
   * operation in the resume sequence, just build a single sched
   * domain, ignoring cpusets.
   */
8994 partition_sched_domains(1, NULL, NULL);
8995 if (--num_cpus_frozen)
8996 return;
8997
  /*
   * This is the last CPU online operation. So fall through and
   * restore the original sched domains by considering the
   * cpuset configurations.
   */
9002 cpuset_force_rebuild();
9003 }
9004 cpuset_update_active_cpus();
9005}
9006
9007static int cpuset_cpu_inactive(unsigned int cpu)
9008{
9009 if (!cpuhp_tasks_frozen) {
9010 if (dl_cpu_busy(cpu))
9011 return -EBUSY;
9012 cpuset_update_active_cpus();
9013 } else {
9014 num_cpus_frozen++;
9015 partition_sched_domains(1, NULL, NULL);
9016 }
9017 return 0;
9018}
9019
9020int sched_cpu_activate(unsigned int cpu)
9021{
9022 struct rq *rq = cpu_rq(cpu);
9023 struct rq_flags rf;
9024
 /*
  * Clear the balance_push callback and prepare to schedule
  * regular tasks.
  */
9029 balance_push_set(cpu, false);
9030
9031#ifdef CONFIG_SCHED_SMT
 /*
  * When going up, increment the number of cores with SMT present.
  */
9035 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9036 static_branch_inc_cpuslocked(&sched_smt_present);
9037#endif
9038 set_cpu_active(cpu, true);
9039
9040 if (sched_smp_initialized) {
9041 sched_domains_numa_masks_set(cpu);
9042 cpuset_cpu_active();
9043 }
9044
 /*
  * Put the rq online, if not already. This happens:
  *
  * 1) In the early boot process, because we build the real domains
  *    after all CPUs have been brought up.
  *
  * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
  *    domains.
  */
9054 rq_lock_irqsave(rq, &rf);
9055 if (rq->rd) {
9056 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9057 set_rq_online(rq);
9058 }
9059 rq_unlock_irqrestore(rq, &rf);
9060
9061 return 0;
9062}
9063
9064int sched_cpu_deactivate(unsigned int cpu)
9065{
9066 struct rq *rq = cpu_rq(cpu);
9067 struct rq_flags rf;
9068 int ret;
9069
 /*
  * Remove CPU from nohz.idle_cpus_mask to prevent participating in
  * load balancing when not active
  */
9074 nohz_balance_exit_idle(rq);
9075
9076 set_cpu_active(cpu, false);
9077
 /*
  * From this point forward, this CPU will refuse to run any task that
  * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
  * push those tasks away until this gets cleared, see
  * sched_cpu_dying().
  */
9084 balance_push_set(cpu, true);
9085
 /*
  * We've cleared cpu_active_mask / set balance_push, wait for all
  * preempt-disabled and RCU users of this state to go away such that
  * all new such users will observe it.
  *
  * Specifically, we rely on ttwu to no longer target this CPU, see
  * ttwu_queue_cond() and is_cpu_allowed().
  *
  * Do sync before park smpboot threads to take care the rcu boost case.
  */
9096 synchronize_rcu();
9097
9098 rq_lock_irqsave(rq, &rf);
9099 if (rq->rd) {
9100 update_rq_clock(rq);
9101 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9102 set_rq_offline(rq);
9103 }
9104 rq_unlock_irqrestore(rq, &rf);
9105
9106#ifdef CONFIG_SCHED_SMT
 /*
  * When going down, decrement the number of cores with SMT present.
  */
9110 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9111 static_branch_dec_cpuslocked(&sched_smt_present);
9112
9113 sched_core_cpu_deactivate(cpu);
9114#endif
9115
9116 if (!sched_smp_initialized)
9117 return 0;
9118
9119 ret = cpuset_cpu_inactive(cpu);
9120 if (ret) {
9121 balance_push_set(cpu, false);
9122 set_cpu_active(cpu, true);
9123 return ret;
9124 }
9125 sched_domains_numa_masks_clear(cpu);
9126 return 0;
9127}
9128
9129static void sched_rq_cpu_starting(unsigned int cpu)
9130{
9131 struct rq *rq = cpu_rq(cpu);
9132
9133 rq->calc_load_update = calc_load_update;
9134 update_max_interval();
9135}
9136
9137int sched_cpu_starting(unsigned int cpu)
9138{
9139 sched_core_cpu_starting(cpu);
9140 sched_rq_cpu_starting(cpu);
9141 sched_tick_start(cpu);
9142 return 0;
9143}
9144
9145#ifdef CONFIG_HOTPLUG_CPU

/*
 * Invoked immediately before the stopper thread is invoked to bring the
 * CPU down completely. At this point all per CPU kthreads except the
 * hotplug thread (current) and the stopper thread (inactive) have been
 * either parked or have been unbound from the outgoing CPU. Ensure that
 * any of those which might be on the way out are gone.
 *
 * If after this point a bound task is being woken on this CPU then the
 * responsible hotplug callback has failed to do it's job.
 * sched_cpu_dying() will catch it with the appropriate fireworks.
 */
9158int sched_cpu_wait_empty(unsigned int cpu)
9159{
9160 balance_hotplug_wait();
9161 return 0;
9162}
9163
/*
 * Since this CPU is going 'away' for a while, fold any nr_active delta we
 * might have. Called from the CPU stopper task after ensuring that the
 * stopper is the last running task on the CPU, so nr_active count is
 * stable. We need to take the teardown thread which is calling this into
 * account, so we hand in adjust = 1 to the load calculation.
 *
 * Also see the comment "Global load-average calculations".
 */
9173static void calc_load_migrate(struct rq *rq)
9174{
9175 long delta = calc_load_fold_active(rq, 1);
9176
9177 if (delta)
9178 atomic_long_add(delta, &calc_load_tasks);
9179}
9180
9181static void dump_rq_tasks(struct rq *rq, const char *loglvl)
9182{
9183 struct task_struct *g, *p;
9184 int cpu = cpu_of(rq);
9185
9186 lockdep_assert_rq_held(rq);
9187
9188 printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
9189 for_each_process_thread(g, p) {
9190 if (task_cpu(p) != cpu)
9191 continue;
9192
9193 if (!task_on_rq_queued(p))
9194 continue;
9195
9196 printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
9197 }
9198}
9199
9200int sched_cpu_dying(unsigned int cpu)
9201{
9202 struct rq *rq = cpu_rq(cpu);
9203 struct rq_flags rf;
9204
9205
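 /* Handle pending wakeups and then migrate everything off */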
9206 sched_tick_stop(cpu);
9207
9208 rq_lock_irqsave(rq, &rf);
9209 if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
9210 WARN(true, "Dying CPU not properly vacated!");
9211 dump_rq_tasks(rq, KERN_WARNING);
9212 }
9213 rq_unlock_irqrestore(rq, &rf);
9214
9215 calc_load_migrate(rq);
9216 update_max_interval();
9217 hrtick_clear(rq);
9218 sched_core_cpu_dying(cpu);
9219 return 0;
9220}
9221#endif
9222
9223void __init sched_init_smp(void)
9224{
9225 sched_init_numa();
9226
 /*
  * There's no userspace yet to cause hotplug operations; hence all the
  * CPU masks are stable and all blatant races in the below code cannot
  * happen.
  */
9232 mutex_lock(&sched_domains_mutex);
9233 sched_init_domains(cpu_active_mask);
9234 mutex_unlock(&sched_domains_mutex);
9235
9236
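 /* Move init over to a non-isolated CPU */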
9237 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
9238 BUG();
9239 current->flags &= ~PF_NO_SETAFFINITY;
9240 sched_init_granularity();
9241
9242 init_sched_rt_class();
9243 init_sched_dl_class();
9244
9245 sched_smp_initialized = true;
9246}
9247
9248static int __init migration_init(void)
9249{
9250 sched_cpu_starting(smp_processor_id());
9251 return 0;
9252}
9253early_initcall(migration_init);
9254
9255#else
9256void __init sched_init_smp(void)
9257{
9258 sched_init_granularity();
9259}
9260#endif
9261
9262int in_sched_functions(unsigned long addr)
9263{
9264 return in_lock_functions(addr) ||
9265 (addr >= (unsigned long)__sched_text_start
9266 && addr < (unsigned long)__sched_text_end);
9267}
9268
9269#ifdef CONFIG_CGROUP_SCHED
/*
 * Default task group.
 * Every task in system belongs to this group at bootup.
 */
9274struct task_group root_task_group;
9275LIST_HEAD(task_groups);
9276
/* Cacheline aligned slab cache for task_group */
9278static struct kmem_cache *task_group_cache __read_mostly;
9279#endif
9280
9281DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
9282DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
9283
9284void __init sched_init(void)
9285{
9286 unsigned long ptr = 0;
9287 int i;
9288
9289
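 /* Make sure the linker didn't screw up */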
9290 BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
9291 &fair_sched_class + 1 != &rt_sched_class ||
9292 &rt_sched_class + 1 != &dl_sched_class);
9293#ifdef CONFIG_SMP
9294 BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
9295#endif
9296
9297 wait_bit_init();
9298
9299#ifdef CONFIG_FAIR_GROUP_SCHED
9300 ptr += 2 * nr_cpu_ids * sizeof(void **);
9301#endif
9302#ifdef CONFIG_RT_GROUP_SCHED
9303 ptr += 2 * nr_cpu_ids * sizeof(void **);
9304#endif
9305 if (ptr) {
9306 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
9307
9308#ifdef CONFIG_FAIR_GROUP_SCHED
9309 root_task_group.se = (struct sched_entity **)ptr;
9310 ptr += nr_cpu_ids * sizeof(void **);
9311
9312 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9313 ptr += nr_cpu_ids * sizeof(void **);
9314
9315 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
9316 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
9317#endif
9318#ifdef CONFIG_RT_GROUP_SCHED
9319 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9320 ptr += nr_cpu_ids * sizeof(void **);
9321
9322 root_task_group.rt_rq = (struct rt_rq **)ptr;
9323 ptr += nr_cpu_ids * sizeof(void **);
9324
9325#endif
9326 }
9327#ifdef CONFIG_CPUMASK_OFFSTACK
9328 for_each_possible_cpu(i) {
9329 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
9330 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9331 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
9332 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9333 }
9334#endif
9335
9336 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
9337 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
9338
9339#ifdef CONFIG_SMP
9340 init_defrootdomain();
9341#endif
9342
9343#ifdef CONFIG_RT_GROUP_SCHED
9344 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9345 global_rt_period(), global_rt_runtime());
9346#endif
9347
9348#ifdef CONFIG_CGROUP_SCHED
9349 task_group_cache = KMEM_CACHE(task_group, 0);
9350
9351 list_add(&root_task_group.list, &task_groups);
9352 INIT_LIST_HEAD(&root_task_group.children);
9353 INIT_LIST_HEAD(&root_task_group.siblings);
9354 autogroup_init(&init_task);
9355#endif
9356
9357 for_each_possible_cpu(i) {
9358 struct rq *rq;
9359
9360 rq = cpu_rq(i);
9361 raw_spin_lock_init(&rq->__lock);
9362 rq->nr_running = 0;
9363 rq->calc_load_active = 0;
9364 rq->calc_load_update = jiffies + LOAD_FREQ;
9365 init_cfs_rq(&rq->cfs);
9366 init_rt_rq(&rq->rt);
9367 init_dl_rq(&rq->dl);
9368#ifdef CONFIG_FAIR_GROUP_SCHED
9369 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
9370 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
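		/*
		 * The root CFS runqueue is attached directly to the CPU's
		 * runqueue: root_task_group has no sched_entity of its own,
		 * so its per-CPU cfs_rq simply covers the whole CPU.
		 */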
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif

		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
		rq->balance_callback = &balance_push_callback;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->idle_stamp = 0;
		rq->avg_idle = 2*sysctl_sched_migration_cost;
		rq->wake_stamp = jiffies;
		rq->wake_avg_idle = rq->avg_idle;
		rq->max_idle_balance_cost = sysctl_sched_migration_cost;

		INIT_LIST_HEAD(&rq->cfs_tasks);

		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
		rq->last_blocked_load_update_tick = jiffies;
		atomic_set(&rq->nohz_flags, 0);

		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
#endif
#ifdef CONFIG_HOTPLUG_CPU
		rcuwait_init(&rq->hotplug_wait);
#endif
#endif
		hrtick_rq_init(rq);
		atomic_set(&rq->nr_iowait, 0);

#ifdef CONFIG_SCHED_CORE
		rq->core = rq;
		rq->core_pick = NULL;
		rq->core_enabled = 0;
		rq->core_tree = RB_ROOT;
		rq->core_forceidle = false;

		rq->core_cookie = 0UL;
#endif
	}

	set_load_weight(&init_task, false);

	mmgrab(&init_mm);
	enter_lazy_tlb(&init_mm, current);

	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

#ifdef CONFIG_SMP
	idle_thread_set_boot_cpu();
	balance_push_set(smp_processor_id(), false);
#endif
	init_sched_fair_class();

	psi_init();

	init_uclamp();

	scheduler_running = 1;
}

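/*
 * Sleep-in-atomic debugging: __might_sleep() and friends complain when a
 * blocking primitive is reached from a context that must not sleep
 * (preemption disabled, IRQs disabled, or an unexpected task state).
 */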
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
	int nested = preempt_count() + rcu_preempt_depth();

	return (nested == preempt_offset);
}

void __might_sleep(const char *file, int line, int preempt_offset)
{
	unsigned int state = get_current_state();

	WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
			"do not call blocking ops when !TASK_RUNNING; "
			"state=%x set at [<%p>] %pS\n", state,
			(void *)current->task_state_change,
			(void *)current->task_state_change);

	___might_sleep(file, line, preempt_offset);
}
EXPORT_SYMBOL(__might_sleep);

void ___might_sleep(const char *file, int line, int preempt_offset)
{
	static unsigned long prev_jiffy;
	unsigned long preempt_disable_ip;

	rcu_sleep_check();

	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
	     !is_idle_task(current) && !current->non_block_count) ||
	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
	    oops_in_progress)
		return;

	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	preempt_disable_ip = get_preempt_disable_ip(current);

	printk(KERN_ERR
		"BUG: sleeping function called from invalid context at %s:%d\n",
			file, line);
	printk(KERN_ERR
		"in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(), current->non_block_count,
			current->pid, current->comm);

	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
	    && !preempt_count_equals(preempt_offset)) {
		pr_err("Preemption disabled at:");
		print_ip_sym(KERN_ERR, preempt_disable_ip);
	}
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);

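/*
 * The inverse check: warn when code that is documented as requiring an
 * atomic (non-sleeping) context is reached with preemption enabled.
 */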
void __cant_sleep(const char *file, int line, int preempt_offset)
{
	static unsigned long prev_jiffy;

	if (irqs_disabled())
		return;

	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
		return;

	if (preempt_count() > preempt_offset)
		return;

	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(),
			current->pid, current->comm);

	debug_show_held_locks(current);
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_sleep);

#ifdef CONFIG_SMP
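/*
 * Like __cant_sleep(), but for sections that must not migrate between
 * CPUs: complain unless migration is disabled or preemption is off.
 */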
void __cant_migrate(const char *file, int line)
{
	static unsigned long prev_jiffy;

	if (irqs_disabled())
		return;

	if (is_migration_disabled(current))
		return;

	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
		return;

	if (preempt_count() > 0)
		return;

	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
	pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
	       in_atomic(), irqs_disabled(), is_migration_disabled(current),
	       current->pid, current->comm);

	debug_show_held_locks(current);
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_migrate);
#endif
#endif

#ifdef CONFIG_MAGIC_SYSRQ
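/*
 * Invoked via the SysRq 'n' key: reset every user task that runs with a
 * real-time or deadline policy back to SCHED_NORMAL, and clear negative
 * nice values, so a runaway RT workload can be recovered from.
 */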
void normalize_rt_tasks(void)
{
	struct task_struct *g, *p;
	struct sched_attr attr = {
		.sched_policy = SCHED_NORMAL,
	};

	read_lock(&tasklist_lock);
	for_each_process_thread(g, p) {
		if (p->flags & PF_KTHREAD)
			continue;

		p->se.exec_start = 0;
		schedstat_set(p->se.statistics.wait_start, 0);
		schedstat_set(p->se.statistics.sleep_start, 0);
		schedstat_set(p->se.statistics.block_start, 0);

		if (!dl_task(p) && !rt_task(p)) {
			if (task_nice(p) < 0)
				set_user_nice(p, 0);
			continue;
		}

		__sched_setscheduler(p, &attr, false, false);
	}
	read_unlock(&tasklist_lock);
}

#endif

#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)

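/*
 * curr_task - return the current task for a given CPU.
 *
 * Only safe to call from stopped-machine contexts such as kdb, where the
 * remote CPU cannot change its current task underneath the caller.
 */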
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}

#endif

#ifdef CONFIG_IA64

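/*
 * ia64_set_curr_task - overwrite the per-CPU current task pointer.
 *
 * Again only valid while the whole system is stopped; historically used
 * by the IA64 kdb support to redirect a CPU to a different task.
 */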
void ia64_set_curr_task(int cpu, struct task_struct *p)
{
	cpu_curr(cpu) = p;
}

#endif

#ifdef CONFIG_CGROUP_SCHED

static DEFINE_SPINLOCK(task_group_lock);

static inline void alloc_uclamp_sched_group(struct task_group *tg,
					    struct task_group *parent)
{
#ifdef CONFIG_UCLAMP_TASK_GROUP
	enum uclamp_id clamp_id;

	for_each_clamp_id(clamp_id) {
		uclamp_se_set(&tg->uclamp_req[clamp_id],
			      uclamp_none(clamp_id), false);
		tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
	}
#endif
}

static void sched_free_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	autogroup_free(tg);
	kmem_cache_free(task_group_cache, tg);
}

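/* Allocate runqueue structures for a new task group, as a child of @parent. */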
struct task_group *sched_create_group(struct task_group *parent)
{
	struct task_group *tg;

	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
	if (!tg)
		return ERR_PTR(-ENOMEM);

	if (!alloc_fair_sched_group(tg, parent))
		goto err;

	if (!alloc_rt_sched_group(tg, parent))
		goto err;

	alloc_uclamp_sched_group(tg, parent);

	return tg;

err:
	sched_free_group(tg);
	return ERR_PTR(-ENOMEM);
}

void sched_online_group(struct task_group *tg, struct task_group *parent)
{
	unsigned long flags;

	spin_lock_irqsave(&task_group_lock, flags);
	list_add_rcu(&tg->list, &task_groups);

	WARN_ON(!parent);

	tg->parent = parent;
	INIT_LIST_HEAD(&tg->children);
	list_add_rcu(&tg->siblings, &parent->children);
	spin_unlock_irqrestore(&task_group_lock, flags);

	online_fair_sched_group(tg);
}

static void sched_free_group_rcu(struct rcu_head *rhp)
{
	sched_free_group(container_of(rhp, struct task_group, rcu));
}

void sched_destroy_group(struct task_group *tg)
{
	call_rcu(&tg->rcu, sched_free_group_rcu);
}

void sched_offline_group(struct task_group *tg)
{
	unsigned long flags;

	unregister_fair_sched_group(tg);

	spin_lock_irqsave(&task_group_lock, flags);
	list_del_rcu(&tg->list);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);
}

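/*
 * Point @tsk's group pointers at the task group of its (new) cgroup.
 * Callers hold the runqueue lock, which also serializes against cgroup
 * migration of the task.
 */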
static void sched_change_group(struct task_struct *tsk, int type)
{
	struct task_group *tg;

	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
			  struct task_group, css);
	tg = autogroup_task_group(tsk, tg);
	tsk->sched_task_group = tg;

#ifdef CONFIG_FAIR_GROUP_SCHED
	if (tsk->sched_class->task_change_group)
		tsk->sched_class->task_change_group(tsk, type);
	else
#endif
		set_task_rq(tsk, task_cpu(tsk));
}

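/*
 * Move the calling task to its current cgroup's task group: dequeue it if
 * it is running or queued, switch the group pointers, then requeue it so
 * its load is accounted in the new group.
 */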
void sched_move_task(struct task_struct *tsk)
{
	int queued, running, queue_flags =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(tsk, &rf);
	update_rq_clock(rq);

	running = task_current(rq, tsk);
	queued = task_on_rq_queued(tsk);

	if (queued)
		dequeue_task(rq, tsk, queue_flags);
	if (running)
		put_prev_task(rq, tsk);

	sched_change_group(tsk, TASK_MOVE_GROUP);

	if (queued)
		enqueue_task(rq, tsk, queue_flags);
	if (running) {
		set_next_task(rq, tsk);
		resched_curr(rq);
	}

	task_rq_unlock(rq, tsk, &rf);
}

static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct task_group, css) : NULL;
}

static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct task_group *parent = css_tg(parent_css);
	struct task_group *tg;

	if (!parent) {
		return &root_task_group.css;
	}

	tg = sched_create_group(parent);
	if (IS_ERR(tg))
		return ERR_PTR(-ENOMEM);

	return &tg->css;
}

static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
	struct task_group *tg = css_tg(css);
	struct task_group *parent = css_tg(css->parent);

	if (parent)
		sched_online_group(tg, parent);

#ifdef CONFIG_UCLAMP_TASK_GROUP
	mutex_lock(&uclamp_mutex);
	rcu_read_lock();
	cpu_util_update_eff(css);
	rcu_read_unlock();
	mutex_unlock(&uclamp_mutex);
#endif

	return 0;
}

static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
	struct task_group *tg = css_tg(css);

	sched_offline_group(tg);
}

static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct task_group *tg = css_tg(css);

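	/*
	 * Relies on the RCU grace period between css_released() and this
	 * callback: by now no path can still be iterating the group lists.
	 */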
	sched_free_group(tg);
}

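/* Set the new child's task group; runs before wake_up_new_task(). */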
static void cpu_cgroup_fork(struct task_struct *task)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(task, &rf);

	update_rq_clock(rq);
	sched_change_group(task, TASK_SET_GROUP);

	task_rq_unlock(rq, task, &rf);
}

static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	int ret = 0;

	cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
		if (!sched_rt_can_attach(css_tg(css), task))
			return -EINVAL;
#endif

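		/*
		 * Serialize against wake_up_new_task(): a task still in
		 * TASK_NEW has not yet been attached to its runqueue, so
		 * moving it between groups now would be unsafe.
		 */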
		raw_spin_lock_irq(&task->pi_lock);
		if (READ_ONCE(task->__state) == TASK_NEW)
			ret = -EINVAL;
		raw_spin_unlock_irq(&task->pi_lock);

		if (ret)
			break;
	}
	return ret;
}

static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		sched_move_task(task);
}

#ifdef CONFIG_UCLAMP_TASK_GROUP
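/*
 * Recompute the effective uclamp values of @css and all its descendants:
 * a child's request is capped by its parent's effective value, and the
 * MIN clamp may never exceed the MAX clamp. Subtrees whose effective
 * values did not change are skipped.
 */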
static void cpu_util_update_eff(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *top_css = css;
	struct uclamp_se *uc_parent = NULL;
	struct uclamp_se *uc_se = NULL;
	unsigned int eff[UCLAMP_CNT];
	enum uclamp_id clamp_id;
	unsigned int clamps;

	lockdep_assert_held(&uclamp_mutex);
	SCHED_WARN_ON(!rcu_read_lock_held());

	css_for_each_descendant_pre(css, top_css) {
		uc_parent = css_tg(css)->parent
			? css_tg(css)->parent->uclamp : NULL;

		for_each_clamp_id(clamp_id) {
			eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;

			if (uc_parent &&
			    eff[clamp_id] > uc_parent[clamp_id].value) {
				eff[clamp_id] = uc_parent[clamp_id].value;
			}
		}

		eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);

		clamps = 0x0;
		uc_se = css_tg(css)->uclamp;
		for_each_clamp_id(clamp_id) {
			if (eff[clamp_id] == uc_se[clamp_id].value)
				continue;
			uc_se[clamp_id].value = eff[clamp_id];
			uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
			clamps |= (0x1 << clamp_id);
		}
		if (!clamps) {
			css = css_rightmost_descendant(css);
			continue;
		}

		uclamp_update_active_tasks(css);
	}
}

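/* Compile-time integer power of ten, used for the percent fixed-point scale. */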
#define _POW10(exp) ((unsigned int)1e##exp)
#define POW10(exp) _POW10(exp)

struct uclamp_request {
#define UCLAMP_PERCENT_SHIFT 2
#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
	s64 percent;
	u64 util;
	int ret;
};

static inline struct uclamp_request
capacity_from_percent(char *buf)
{
	struct uclamp_request req = {
		.percent = UCLAMP_PERCENT_SCALE,
		.util = SCHED_CAPACITY_SCALE,
		.ret = 0,
	};

	buf = strim(buf);
	if (strcmp(buf, "max")) {
		req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
					     &req.percent);
		if (req.ret)
			return req;
		if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
			req.ret = -ERANGE;
			return req;
		}

		req.util = req.percent << SCHED_CAPACITY_SHIFT;
		req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
	}

	return req;
}

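/*
 * Handle writes to the cgroup-v2 "cpu.uclamp.min"/"cpu.uclamp.max" files.
 * The value is a utilization percentage with two decimals, or "max", e.g.:
 *
 *	echo "25.50" > cpu.uclamp.min
 *	echo "max"   > cpu.uclamp.max
 */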
static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off,
				enum uclamp_id clamp_id)
{
	struct uclamp_request req;
	struct task_group *tg;

	req = capacity_from_percent(buf);
	if (req.ret)
		return req.ret;

	static_branch_enable(&sched_uclamp_used);

	mutex_lock(&uclamp_mutex);
	rcu_read_lock();

	tg = css_tg(of_css(of));
	if (tg->uclamp_req[clamp_id].value != req.util)
		uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);

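	/*
	 * Conversion to and from the utilization scale rounds, so also
	 * remember the exact percentage that was requested for printing.
	 */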
	tg->uclamp_pct[clamp_id] = req.percent;

	cpu_util_update_eff(of_css(of));

	rcu_read_unlock();
	mutex_unlock(&uclamp_mutex);

	return nbytes;
}

static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes,
				    loff_t off)
{
	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
}

static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes,
				    loff_t off)
{
	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
}

static inline void cpu_uclamp_print(struct seq_file *sf,
				    enum uclamp_id clamp_id)
{
	struct task_group *tg;
	u64 util_clamp;
	u64 percent;
	u32 rem;

	rcu_read_lock();
	tg = css_tg(seq_css(sf));
	util_clamp = tg->uclamp_req[clamp_id].value;
	rcu_read_unlock();

	if (util_clamp == SCHED_CAPACITY_SCALE) {
		seq_puts(sf, "max\n");
		return;
	}

	percent = tg->uclamp_pct[clamp_id];
	percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
	seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
}

static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
{
	cpu_uclamp_print(sf, UCLAMP_MIN);
	return 0;
}

static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
{
	cpu_uclamp_print(sf, UCLAMP_MAX);
	return 0;
}
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
	if (shareval > scale_load_down(ULONG_MAX))
		shareval = MAX_SHARES;
	return sched_group_set_shares(css_tg(css), scale_load(shareval));
}

static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return (u64) scale_load_down(tg->shares);
}

#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;

static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

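/*
 * Apply a new period/quota/burst configuration to @tg's CFS bandwidth
 * controller after validating it: the period must lie within
 * [min_cfs_quota_period, max_cfs_quota_period], quota and quota+burst are
 * bounded by max_cfs_runtime, and the result must remain schedulable
 * with respect to the parent's quota.
 */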
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
				u64 burst)
{
	int i, ret = 0, runtime_enabled, runtime_was_enabled;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	if (tg == &root_task_group)
		return -EINVAL;

	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

	if (period > max_cfs_quota_period)
		return -EINVAL;

	if (quota != RUNTIME_INF && quota > max_cfs_runtime)
		return -EINVAL;

	if (quota != RUNTIME_INF && (burst > quota ||
				     burst + quota > max_cfs_runtime))
		return -EINVAL;

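	/*
	 * Prevent race between setting of cfs_rq->runtime_enabled and
	 * unthrottle_offline_cfs_rqs() on CPU hotplug.
	 */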
	cpus_read_lock();
	mutex_lock(&cfs_constraints_mutex);
	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		goto out_unlock;

	runtime_enabled = quota != RUNTIME_INF;
	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;

	if (runtime_enabled && !runtime_was_enabled)
		cfs_bandwidth_usage_inc();
	raw_spin_lock_irq(&cfs_b->lock);
	cfs_b->period = ns_to_ktime(period);
	cfs_b->quota = quota;
	cfs_b->burst = burst;

	__refill_cfs_bandwidth_runtime(cfs_b);

	if (runtime_enabled)
		start_cfs_bandwidth(cfs_b);

	raw_spin_unlock_irq(&cfs_b->lock);

	for_each_online_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
		struct rq *rq = cfs_rq->rq;
		struct rq_flags rf;

		rq_lock_irq(rq, &rf);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
		rq_unlock_irq(rq, &rf);
	}
	if (runtime_was_enabled && !runtime_enabled)
		cfs_bandwidth_usage_dec();
out_unlock:
	mutex_unlock(&cfs_constraints_mutex);
	cpus_read_unlock();

	return ret;
}

static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period, burst;

	period = ktime_to_ns(tg->cfs_bandwidth.period);
	burst = tg->cfs_bandwidth.burst;
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
	else
		return -EINVAL;

	return tg_set_cfs_bandwidth(tg, period, quota, burst);
}

static long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
		return -1;

	quota_us = tg->cfs_bandwidth.quota;
	do_div(quota_us, NSEC_PER_USEC);

	return quota_us;
}

static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
	u64 quota, period, burst;

	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	period = (u64)cfs_period_us * NSEC_PER_USEC;
	quota = tg->cfs_bandwidth.quota;
	burst = tg->cfs_bandwidth.burst;

	return tg_set_cfs_bandwidth(tg, period, quota, burst);
}

static long tg_get_cfs_period(struct task_group *tg)
{
	u64 cfs_period_us;

	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
	do_div(cfs_period_us, NSEC_PER_USEC);

	return cfs_period_us;
}

static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
{
	u64 quota, period, burst;

	if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	burst = (u64)cfs_burst_us * NSEC_PER_USEC;
	period = ktime_to_ns(tg->cfs_bandwidth.period);
	quota = tg->cfs_bandwidth.quota;

	return tg_set_cfs_bandwidth(tg, period, quota, burst);
}

static long tg_get_cfs_burst(struct task_group *tg)
{
	u64 burst_us;

	burst_us = tg->cfs_bandwidth.burst;
	do_div(burst_us, NSEC_PER_USEC);

	return burst_us;
}

static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	return tg_get_cfs_quota(css_tg(css));
}

static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, s64 cfs_quota_us)
{
	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}

static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return tg_get_cfs_period(css_tg(css));
}

static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 cfs_period_us)
{
	return tg_set_cfs_period(css_tg(css), cfs_period_us);
}

static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	return tg_get_cfs_burst(css_tg(css));
}

static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, u64 cfs_burst_us)
{
	return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
}

struct cfs_schedulable_data {
	struct task_group *tg;
	u64 period, quota;
};

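/*
 * Express a group's bandwidth as a quota/period ratio, in microseconds;
 * RUNTIME_INF means the group is unlimited.
 */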
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
			quota = min(quota, parent_quota);
		} else {
			if (quota == RUNTIME_INF)
				quota = parent_quota;
			else if (parent_quota != RUNTIME_INF && quota > parent_quota)
				return -EINVAL;
		}
	}
	cfs_b->hierarchical_quota = quota;

	return 0;
}

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	if (schedstat_enabled() && tg != &root_task_group) {
		u64 ws = 0;
		int i;

		for_each_possible_cpu(i)
			ws += schedstat_val(tg->se[i]->statistics.wait_sum);

		seq_printf(sf, "wait_sum %llu\n", ws);
	}

	return 0;
}
#endif
#endif

#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	return sched_group_set_rt_runtime(css_tg(css), val);
}

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return sched_group_rt_runtime(css_tg(css));
}

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	return sched_group_set_rt_period(css_tg(css), rt_period_us);
}

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return sched_group_rt_period(css_tg(css));
}
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	return css_tg(css)->idle;
}

static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
			      struct cftype *cft, s64 idle)
{
	return sched_group_set_idle(css_tg(css), idle);
}
#endif

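/* cgroup-v1 ("legacy") control files of the cpu controller. */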
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
	{
		.name = "idle",
		.read_s64 = cpu_idle_read_s64,
		.write_s64 = cpu_idle_write_s64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "cfs_quota_us",
		.read_s64 = cpu_cfs_quota_read_s64,
		.write_s64 = cpu_cfs_quota_write_s64,
	},
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_cfs_period_read_u64,
		.write_u64 = cpu_cfs_period_write_u64,
	},
	{
		.name = "cfs_burst_us",
		.read_u64 = cpu_cfs_burst_read_u64,
		.write_u64 = cpu_cfs_burst_write_u64,
	},
	{
		.name = "stat",
		.seq_show = cpu_cfs_stat_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
	{
		.name = "uclamp.min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_min_show,
		.write = cpu_uclamp_min_write,
	},
	{
		.name = "uclamp.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_max_show,
		.write = cpu_uclamp_max_write,
	},
#endif
	{ }
};

static int cpu_extra_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	{
		struct task_group *tg = css_tg(css);
		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
		u64 throttled_usec;

		throttled_usec = cfs_b->throttled_time;
		do_div(throttled_usec, NSEC_PER_USEC);

		seq_printf(sf, "nr_periods %d\n"
			   "nr_throttled %d\n"
			   "throttled_usec %llu\n",
			   cfs_b->nr_periods, cfs_b->nr_throttled,
			   throttled_usec);
	}
#endif
	return 0;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct task_group *tg = css_tg(css);
	u64 weight = scale_load_down(tg->shares);

	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
}

static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 weight)
{
	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
		return -ERANGE;

	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}

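/*
 * Map between the cgroup-v2 "cpu.weight.nice" interface and the internal
 * weight table: reading picks the nice level whose weight is closest to
 * the group's shares, writing translates a nice level to its weight.
 */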
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
				    struct cftype *cft)
{
	unsigned long weight = scale_load_down(css_tg(css)->shares);
	int last_delta = INT_MAX;
	int prio, delta;

	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
		delta = abs(sched_prio_to_weight[prio] - weight);
		if (delta >= last_delta)
			break;
		last_delta = delta;
	}

	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
}

static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft, s64 nice)
{
	unsigned long weight;
	int idx;

	if (nice < MIN_NICE || nice > MAX_NICE)
		return -ERANGE;

	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
	idx = array_index_nospec(idx, 40);
	weight = sched_prio_to_weight[idx];

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}
#endif

static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
						  long period, long quota)
{
	if (quota < 0)
		seq_puts(sf, "max");
	else
		seq_printf(sf, "%ld", quota);

	seq_printf(sf, " %ld\n", period);
}

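/* Parse a "<quota> <period>" string; "max" means no limit. Times in us, converted to ns. */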
static int __maybe_unused cpu_period_quota_parse(char *buf,
						 u64 *periodp, u64 *quotap)
{
	char tok[21];

	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
		return -EINVAL;

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap))
		*quotap *= NSEC_PER_USEC;
	else if (!strcmp(tok, "max"))
		*quotap = RUNTIME_INF;
	else
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_CFS_BANDWIDTH
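/*
 * The cgroup-v2 "cpu.max" file: "<quota> <period>" in microseconds, with
 * "max" for an unlimited quota, e.g.:
 *
 *	echo "50000 100000" > cpu.max	(50ms of runtime every 100ms)
 *	echo "max" > cpu.max		(remove the limit)
 */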
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));

	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
	return 0;
}

static ssize_t cpu_max_write(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	struct task_group *tg = css_tg(of_css(of));
	u64 period = tg_get_cfs_period(tg);
	u64 burst = tg_get_cfs_burst(tg);
	u64 quota;
	int ret;

	ret = cpu_period_quota_parse(buf, &period, &quota);
	if (!ret)
		ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
	return ret ?: nbytes;
}
#endif

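/* cgroup-v2 ("default hierarchy") control files of the cpu controller. */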
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = cpu_weight_read_u64,
		.write_u64 = cpu_weight_write_u64,
	},
	{
		.name = "weight.nice",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = cpu_weight_nice_read_s64,
		.write_s64 = cpu_weight_nice_write_s64,
	},
	{
		.name = "idle",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = cpu_idle_read_s64,
		.write_s64 = cpu_idle_write_s64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_max_show,
		.write = cpu_max_write,
	},
	{
		.name = "max.burst",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = cpu_cfs_burst_read_u64,
		.write_u64 = cpu_cfs_burst_write_u64,
	},
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
	{
		.name = "uclamp.min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_min_show,
		.write = cpu_uclamp_min_write,
	},
	{
		.name = "uclamp.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_max_show,
		.write = cpu_uclamp_max_write,
	},
#endif
	{ }
};

struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc	= cpu_cgroup_css_alloc,
	.css_online	= cpu_cgroup_css_online,
	.css_released	= cpu_cgroup_css_released,
	.css_free	= cpu_cgroup_css_free,
	.css_extra_stat_show	= cpu_extra_stat_show,
	.fork		= cpu_cgroup_fork,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.legacy_cftypes	= cpu_legacy_files,
	.dfl_cftypes	= cpu_files,
	.early_init	= true,
	.threaded	= true,
};

#endif

void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

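/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25:
 * if one task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */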
const int sched_prio_to_weight[40] = {
	88761, 71755, 56483, 46273, 36291,
	29154, 23254, 18705, 14949, 11916,
	9548, 7620, 6100, 4904, 3906,
	3121, 2501, 1991, 1586, 1277,
	1024, 820, 655, 526, 423,
	335, 272, 215, 172, 137,
	110, 87, 70, 56, 45,
	36, 29, 23, 18, 15,
};

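/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated
 * so that the weight division needed on every timeslice computation can be
 * replaced by a cheaper multiply and shift.
 */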
const u32 sched_prio_to_wmult[40] = {
	48388, 59856, 76040, 92818, 118348,
	147320, 184698, 229616, 287308, 360437,
	449829, 563644, 704093, 875809, 1099582,
	1376151, 1717300, 2157191, 2708050, 3363326,
	4194304, 5237765, 6557202, 8165337, 10153587,
	12820798, 15790321, 19976592, 24970740, 31350126,
	39045157, 49367440, 61356676, 76695844, 95443717,
	119304647, 148102320, 186737708, 238609294, 286331153,
};

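/* Out-of-line wrapper so other translation units can hit this tracepoint. */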
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
	trace_sched_update_nr_running_tp(rq, count);
}