1
2
3
4
5
6
7
8#include "sched.h"
9
10#include <linux/nospec.h>
11
12#include <linux/kcov.h>
13
14#include <asm/switch_to.h>
15#include <asm/tlb.h>
16
17#include "../workqueue_internal.h"
18#include "../smpboot.h"
19
20#define CREATE_TRACE_POINTS
21#include <trace/events/sched.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
24
25#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
26
27
28
29
30
31
32
33#define SCHED_FEAT(name, enabled) \
34 (1UL << __SCHED_FEAT_##name) * enabled |
35const_debug unsigned int sysctl_sched_features =
36#include "features.h"
37 0;
38#undef SCHED_FEAT
39#endif
40
41
42
43
44
45const_debug unsigned int sysctl_sched_nr_migrate = 32;
46
47
48
49
50
51
52
53const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
54
55
56
57
58
59unsigned int sysctl_sched_rt_period = 1000000;
60
61__read_mostly int scheduler_running;
62
63
64
65
66
67int sysctl_sched_rt_runtime = 950000;
68
69
70
71
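/*
 * __task_rq_lock - lock the rq @p resides on.
 *
 * The caller must hold p->pi_lock (asserted below); we spin until the
 * task's runqueue is stable, i.e. the task is not in the middle of a
 * migration, and return that rq locked and pinned.
 */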
72struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
73 __acquires(rq->lock)
74{
75 struct rq *rq;
76
77 lockdep_assert_held(&p->pi_lock);
78
79 for (;;) {
80 rq = task_rq(p);
81 raw_spin_lock(&rq->lock);
82 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
83 rq_pin_lock(rq, rf);
84 return rq;
85 }
86 raw_spin_unlock(&rq->lock);
87
88 while (unlikely(task_on_rq_migrating(p)))
89 cpu_relax();
90 }
91}
92
93
94
95
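/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 *
 * Like __task_rq_lock(), but also takes p->pi_lock with interrupts
 * disabled, retrying until the rq we locked is still the task's rq and
 * the task is not migrating.
 */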
96struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
97 __acquires(p->pi_lock)
98 __acquires(rq->lock)
99{
100 struct rq *rq;
101
102 for (;;) {
103 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
104 rq = task_rq(p);
105 raw_spin_lock(&rq->lock);
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
123 rq_pin_lock(rq, rf);
124 return rq;
125 }
126 raw_spin_unlock(&rq->lock);
127 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
128
129 while (unlikely(task_on_rq_migrating(p)))
130 cpu_relax();
131 }
132}
133
134
135
136
137
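/*
 * Advance rq->clock_task by @delta, minus any time accounted to hard/soft
 * IRQ context or stolen by the hypervisor, when IRQ time accounting or
 * paravirt steal time accounting is configured.
 */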
138static void update_rq_clock_task(struct rq *rq, s64 delta)
139{
140
141
142
143
144#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
145 s64 steal = 0, irq_delta = 0;
146#endif
147#ifdef CONFIG_IRQ_TIME_ACCOUNTING
148 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165 if (irq_delta > delta)
166 irq_delta = delta;
167
168 rq->prev_irq_time += irq_delta;
169 delta -= irq_delta;
170#endif
171#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
173 steal = paravirt_steal_clock(cpu_of(rq));
174 steal -= rq->prev_steal_time_rq;
175
176 if (unlikely(steal > delta))
177 steal = delta;
178
179 rq->prev_steal_time_rq += steal;
180 delta -= steal;
181 }
182#endif
183
184 rq->clock_task += delta;
185
186#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
187 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
188 sched_rt_avg_update(rq, irq_delta + steal);
189#endif
190}
191
192void update_rq_clock(struct rq *rq)
193{
194 s64 delta;
195
196 lockdep_assert_held(&rq->lock);
197
198 if (rq->clock_update_flags & RQCF_ACT_SKIP)
199 return;
200
201#ifdef CONFIG_SCHED_DEBUG
202 if (sched_feat(WARN_DOUBLE_CLOCK))
203 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
204 rq->clock_update_flags |= RQCF_UPDATED;
205#endif
206
207 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
208 if (delta < 0)
209 return;
210 rq->clock += delta;
211 update_rq_clock_task(rq, delta);
212}
213
214
215#ifdef CONFIG_SCHED_HRTICK
216
217
218
219
220static void hrtick_clear(struct rq *rq)
221{
222 if (hrtimer_active(&rq->hrtick_timer))
223 hrtimer_cancel(&rq->hrtick_timer);
224}
225
226
227
228
229
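/*
 * High-resolution timer tick: runs in hard-IRQ context on the CPU owning
 * @rq and delivers a task_tick() with the 'queued' argument set to 1.
 */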
230static enum hrtimer_restart hrtick(struct hrtimer *timer)
231{
232 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
233 struct rq_flags rf;
234
235 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
236
237 rq_lock(rq, &rf);
238 update_rq_clock(rq);
239 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
240 rq_unlock(rq, &rf);
241
242 return HRTIMER_NORESTART;
243}
244
245#ifdef CONFIG_SMP
246
247static void __hrtick_restart(struct rq *rq)
248{
249 struct hrtimer *timer = &rq->hrtick_timer;
250
251 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
252}
253
254
255
256
257static void __hrtick_start(void *arg)
258{
259 struct rq *rq = arg;
260 struct rq_flags rf;
261
262 rq_lock(rq, &rf);
263 __hrtick_restart(rq);
264 rq->hrtick_csd_pending = 0;
265 rq_unlock(rq, &rf);
266}
267
268
269
270
271
272
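/*
 * hrtick_start - arm the high-resolution preemption timer to fire after
 * @delay nanoseconds (clamped to a 10us minimum). Called with rq->lock
 * held; a remote rq is kicked via an async smp_call_function so the
 * timer is started on its own CPU.
 */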
273void hrtick_start(struct rq *rq, u64 delay)
274{
275 struct hrtimer *timer = &rq->hrtick_timer;
276 ktime_t time;
277 s64 delta;
278
279
280
281
282
283 delta = max_t(s64, delay, 10000LL);
284 time = ktime_add_ns(timer->base->get_time(), delta);
285
286 hrtimer_set_expires(timer, time);
287
288 if (rq == this_rq()) {
289 __hrtick_restart(rq);
290 } else if (!rq->hrtick_csd_pending) {
291 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
292 rq->hrtick_csd_pending = 1;
293 }
294}
295
296#else
297
298
299
300
301
302void hrtick_start(struct rq *rq, u64 delay)
303{
304
305
306
307
308 delay = max_t(u64, delay, 10000LL);
309 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
310 HRTIMER_MODE_REL_PINNED);
311}
312#endif
313
314static void hrtick_rq_init(struct rq *rq)
315{
316#ifdef CONFIG_SMP
317 rq->hrtick_csd_pending = 0;
318
319 rq->hrtick_csd.flags = 0;
320 rq->hrtick_csd.func = __hrtick_start;
321 rq->hrtick_csd.info = rq;
322#endif
323
324 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
325 rq->hrtick_timer.function = hrtick;
326}
327#else
328static inline void hrtick_clear(struct rq *rq)
329{
330}
331
332static inline void hrtick_rq_init(struct rq *rq)
333{
334}
335#endif
336
337
338
339
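/*
 * fetch_or - atomically OR @mask into *@ptr and return the previous
 * value, implemented as a cmpxchg() loop so it stays type-generic and
 * can be used on thread_info::flags.
 */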
340#define fetch_or(ptr, mask) \
341 ({ \
342 typeof(ptr) _ptr = (ptr); \
343 typeof(mask) _mask = (mask); \
344 typeof(*_ptr) _old, _val = *_ptr; \
345 \
346 for (;;) { \
347 _old = cmpxchg(_ptr, _val, _val | _mask); \
348 if (_old == _val) \
349 break; \
350 _val = _old; \
351 } \
352 _old; \
353})
354
355#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
356
357
358
359
360
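/*
 * Atomically set TIF_NEED_RESCHED and test whether TIF_POLLING_NRFLAG was
 * set; returns true if the task was *not* polling, i.e. the caller still
 * needs to send a reschedule IPI.
 */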
361static bool set_nr_and_not_polling(struct task_struct *p)
362{
363 struct thread_info *ti = task_thread_info(p);
364 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
365}
366
367
368
369
370
371
372
373static bool set_nr_if_polling(struct task_struct *p)
374{
375 struct thread_info *ti = task_thread_info(p);
376 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
377
378 for (;;) {
379 if (!(val & _TIF_POLLING_NRFLAG))
380 return false;
381 if (val & _TIF_NEED_RESCHED)
382 return true;
383 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
384 if (old == val)
385 break;
386 val = old;
387 }
388 return true;
389}
390
391#else
392static bool set_nr_and_not_polling(struct task_struct *p)
393{
394 set_tsk_need_resched(p);
395 return true;
396}
397
398#ifdef CONFIG_SMP
399static bool set_nr_if_polling(struct task_struct *p)
400{
401 return false;
402}
403#endif
404#endif
405
406
407
408
409
410
411
412
413
414
415
416
417
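/*
 * wake_q_add - queue a task for deferred wakeup by wake_up_q().
 *
 * The cmpxchg() on task->wake_q.next guarantees each task is enqueued at
 * most once; a task reference is taken here and dropped again in
 * wake_up_q() after the actual wakeup.
 */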
418void wake_q_add(struct wake_q_head *head, struct task_struct *task)
419{
420 struct wake_q_node *node = &task->wake_q;
421
422
423
424
425
426
427
428
429
430 smp_mb__before_atomic();
431 if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
432 return;
433
434 get_task_struct(task);
435
436
437
438
439 *head->lastp = node;
440 head->lastp = &node->next;
441}
442
443void wake_up_q(struct wake_q_head *head)
444{
445 struct wake_q_node *node = head->first;
446
447 while (node != WAKE_Q_TAIL) {
448 struct task_struct *task;
449
450 task = container_of(node, struct task_struct, wake_q);
451 BUG_ON(!task);
452
453 node = node->next;
454 task->wake_q.next = NULL;
455
456
457
458
459
460 wake_up_process(task);
461 put_task_struct(task);
462 }
463}
464
465
466
467
468
469
470
471
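/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On the local CPU this just sets TIF_NEED_RESCHED and folds it into the
 * preempt count; a remote CPU is sent a reschedule IPI unless it is
 * already polling need_resched.
 */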
472void resched_curr(struct rq *rq)
473{
474 struct task_struct *curr = rq->curr;
475 int cpu;
476
477 lockdep_assert_held(&rq->lock);
478
479 if (test_tsk_need_resched(curr))
480 return;
481
482 cpu = cpu_of(rq);
483
484 if (cpu == smp_processor_id()) {
485 set_tsk_need_resched(curr);
486 set_preempt_need_resched();
487 return;
488 }
489
490 if (set_nr_and_not_polling(curr))
491 smp_send_reschedule(cpu);
492 else
493 trace_sched_wake_idle_without_ipi(cpu);
494}
495
496void resched_cpu(int cpu)
497{
498 struct rq *rq = cpu_rq(cpu);
499 unsigned long flags;
500
501 raw_spin_lock_irqsave(&rq->lock, flags);
502 if (cpu_online(cpu) || cpu == smp_processor_id())
503 resched_curr(rq);
504 raw_spin_unlock_irqrestore(&rq->lock, flags);
505}
506
507#ifdef CONFIG_SMP
508#ifdef CONFIG_NO_HZ_COMMON
509
510
511
512
513
514
515
516
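/*
 * get_nohz_timer_target - pick a CPU to run a deferrable timer on.
 *
 * Prefer the current CPU if it is busy and does housekeeping; otherwise
 * search the sched domains for the nearest busy housekeeping CPU so that
 * an idle CPU is not pulled out of NOHZ sleep just to service a timer.
 */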
517int get_nohz_timer_target(void)
518{
519 int i, cpu = smp_processor_id();
520 struct sched_domain *sd;
521
522 if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
523 return cpu;
524
525 rcu_read_lock();
526 for_each_domain(cpu, sd) {
527 for_each_cpu(i, sched_domain_span(sd)) {
528 if (cpu == i)
529 continue;
530
531 if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
532 cpu = i;
533 goto unlock;
534 }
535 }
536 }
537
538 if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
539 cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
540unlock:
541 rcu_read_unlock();
542 return cpu;
543}
544
545
546
547
548
549
550
551
552
553
554
555static void wake_up_idle_cpu(int cpu)
556{
557 struct rq *rq = cpu_rq(cpu);
558
559 if (cpu == smp_processor_id())
560 return;
561
562 if (set_nr_and_not_polling(rq->idle))
563 smp_send_reschedule(cpu);
564 else
565 trace_sched_wake_idle_without_ipi(cpu);
566}
567
568static bool wake_up_full_nohz_cpu(int cpu)
569{
570
571
572
573
574
575
576 if (cpu_is_offline(cpu))
577 return true;
578 if (tick_nohz_full_cpu(cpu)) {
579 if (cpu != smp_processor_id() ||
580 tick_nohz_tick_stopped())
581 tick_nohz_full_kick_cpu(cpu);
582 return true;
583 }
584
585 return false;
586}
587
588
589
590
591
592
593void wake_up_nohz_cpu(int cpu)
594{
595 if (!wake_up_full_nohz_cpu(cpu))
596 wake_up_idle_cpu(cpu);
597}
598
599static inline bool got_nohz_idle_kick(void)
600{
601 int cpu = smp_processor_id();
602
603 if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
604 return false;
605
606 if (idle_cpu(cpu) && !need_resched())
607 return true;
608
609
610
611
612
613 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
614 return false;
615}
616
617#else
618
619static inline bool got_nohz_idle_kick(void)
620{
621 return false;
622}
623
624#endif
625
626#ifdef CONFIG_NO_HZ_FULL
627bool sched_can_stop_tick(struct rq *rq)
628{
629 int fifo_nr_running;
630
631
632 if (rq->dl.dl_nr_running)
633 return false;
634
635
636
637
638
639 if (rq->rt.rr_nr_running) {
640 if (rq->rt.rr_nr_running == 1)
641 return true;
642 else
643 return false;
644 }
645
646
647
648
649
650 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
651 if (fifo_nr_running)
652 return true;
653
654
655
656
657
658
659 if (rq->nr_running > 1)
660 return false;
661
662 return true;
663}
664#endif
665
666void sched_avg_update(struct rq *rq)
667{
668 s64 period = sched_avg_period();
669
670 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
671
672
673
674
675
676 asm("" : "+rm" (rq->age_stamp));
677 rq->age_stamp += period;
678 rq->rt_avg /= 2;
679 }
680}
681
682#endif
683
684#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
685 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
686
687
688
689
690
691
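/*
 * walk_tg_tree_from - iterate the task_group tree rooted at @from,
 * calling @down on the way down and @up on the way back up for each
 * group; iteration stops early when either callback returns non-zero.
 */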
692int walk_tg_tree_from(struct task_group *from,
693 tg_visitor down, tg_visitor up, void *data)
694{
695 struct task_group *parent, *child;
696 int ret;
697
698 parent = from;
699
700down:
701 ret = (*down)(parent, data);
702 if (ret)
703 goto out;
704 list_for_each_entry_rcu(child, &parent->children, siblings) {
705 parent = child;
706 goto down;
707
708up:
709 continue;
710 }
711 ret = (*up)(parent, data);
712 if (ret || parent == from)
713 goto out;
714
715 child = parent;
716 parent = parent->parent;
717 if (parent)
718 goto up;
719out:
720 return ret;
721}
722
723int tg_nop(struct task_group *tg, void *data)
724{
725 return 0;
726}
727#endif
728
729static void set_load_weight(struct task_struct *p, bool update_load)
730{
731 int prio = p->static_prio - MAX_RT_PRIO;
732 struct load_weight *load = &p->se.load;
733
734
735
736
737 if (idle_policy(p->policy)) {
738 load->weight = scale_load(WEIGHT_IDLEPRIO);
739 load->inv_weight = WMULT_IDLEPRIO;
740 return;
741 }
742
743
744
745
746
747 if (update_load && p->sched_class == &fair_sched_class) {
748 reweight_task(p, prio);
749 } else {
750 load->weight = scale_load(sched_prio_to_weight[prio]);
751 load->inv_weight = sched_prio_to_wmult[prio];
752 }
753}
754
755static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
756{
757 if (!(flags & ENQUEUE_NOCLOCK))
758 update_rq_clock(rq);
759
760 if (!(flags & ENQUEUE_RESTORE))
761 sched_info_queued(rq, p);
762
763 p->sched_class->enqueue_task(rq, p, flags);
764}
765
766static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
767{
768 if (!(flags & DEQUEUE_NOCLOCK))
769 update_rq_clock(rq);
770
771 if (!(flags & DEQUEUE_SAVE))
772 sched_info_dequeued(rq, p);
773
774 p->sched_class->dequeue_task(rq, p, flags);
775}
776
777void activate_task(struct rq *rq, struct task_struct *p, int flags)
778{
779 if (task_contributes_to_load(p))
780 rq->nr_uninterruptible--;
781
782 enqueue_task(rq, p, flags);
783}
784
785void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
786{
787 if (task_contributes_to_load(p))
788 rq->nr_uninterruptible++;
789
790 dequeue_task(rq, p, flags);
791}
792
793
794
795
796static inline int __normal_prio(struct task_struct *p)
797{
798 return p->static_prio;
799}
800
801
802
803
804
805
806
807
808static inline int normal_prio(struct task_struct *p)
809{
810 int prio;
811
812 if (task_has_dl_policy(p))
813 prio = MAX_DL_PRIO-1;
814 else if (task_has_rt_policy(p))
815 prio = MAX_RT_PRIO-1 - p->rt_priority;
816 else
817 prio = __normal_prio(p);
818 return prio;
819}
820
821
822
823
824
825
826
827
828static int effective_prio(struct task_struct *p)
829{
830 p->normal_prio = normal_prio(p);
831
832
833
834
835
836 if (!rt_prio(p->prio))
837 return p->normal_prio;
838 return p->prio;
839}
840
841
842
843
844
845
846
847inline int task_curr(const struct task_struct *p)
848{
849 return cpu_curr(task_cpu(p)) == p;
850}
851
852
853
854
855
856
857
858
859static inline void check_class_changed(struct rq *rq, struct task_struct *p,
860 const struct sched_class *prev_class,
861 int oldprio)
862{
863 if (prev_class != p->sched_class) {
864 if (prev_class->switched_from)
865 prev_class->switched_from(rq, p);
866
867 p->sched_class->switched_to(rq, p);
868 } else if (oldprio != p->prio || dl_task(p))
869 p->sched_class->prio_changed(rq, p, oldprio);
870}
871
872void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
873{
874 const struct sched_class *class;
875
876 if (p->sched_class == rq->curr->sched_class) {
877 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
878 } else {
879 for_each_class(class) {
880 if (class == rq->curr->sched_class)
881 break;
882 if (class == p->sched_class) {
883 resched_curr(rq);
884 break;
885 }
886 }
887 }
888
889
890
891
892
893 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
894 rq_clock_skip_update(rq);
895}
896
897#ifdef CONFIG_SMP
898
899static inline bool is_per_cpu_kthread(struct task_struct *p)
900{
901 if (!(p->flags & PF_KTHREAD))
902 return false;
903
904 if (p->nr_cpus_allowed != 1)
905 return false;
906
907 return true;
908}
909
910
911
912
913
914static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
915{
916 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
917 return false;
918
919 if (is_per_cpu_kthread(p))
920 return cpu_online(cpu);
921
922 return cpu_active(cpu);
923}
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
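/*
 * move_queued_task - move a queued (but not running) task to @new_cpu.
 *
 * Dequeues @p, marks it TASK_ON_RQ_MIGRATING, drops the old rq->lock,
 * locks the destination rq and enqueues it there; returns the new rq,
 * still locked.
 */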
944static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
945 struct task_struct *p, int new_cpu)
946{
947 lockdep_assert_held(&rq->lock);
948
949 p->on_rq = TASK_ON_RQ_MIGRATING;
950 dequeue_task(rq, p, DEQUEUE_NOCLOCK);
951 set_task_cpu(p, new_cpu);
952 rq_unlock(rq, rf);
953
954 rq = cpu_rq(new_cpu);
955
956 rq_lock(rq, rf);
957 BUG_ON(task_cpu(p) != new_cpu);
958 enqueue_task(rq, p, 0);
959 p->on_rq = TASK_ON_RQ_QUEUED;
960 check_preempt_curr(rq, p, 0);
961
962 return rq;
963}
964
965struct migration_arg {
966 struct task_struct *task;
967 int dest_cpu;
968};
969
970
971
972
973
974
975
976
977
978
979static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
980 struct task_struct *p, int dest_cpu)
981{
982
983 if (!is_cpu_allowed(p, dest_cpu))
984 return rq;
985
986 update_rq_clock(rq);
987 rq = move_queued_task(rq, rf, p, dest_cpu);
988
989 return rq;
990}
991
992
993
994
995
996
997static int migration_cpu_stop(void *data)
998{
999 struct migration_arg *arg = data;
1000 struct task_struct *p = arg->task;
1001 struct rq *rq = this_rq();
1002 struct rq_flags rf;
1003
1004
1005
1006
1007
1008 local_irq_disable();
1009
1010
1011
1012
1013
1014 sched_ttwu_pending();
1015
1016 raw_spin_lock(&p->pi_lock);
1017 rq_lock(rq, &rf);
1018
1019
1020
1021
1022
1023 if (task_rq(p) == rq) {
1024 if (task_on_rq_queued(p))
1025 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1026 else
1027 p->wake_cpu = arg->dest_cpu;
1028 }
1029 rq_unlock(rq, &rf);
1030 raw_spin_unlock(&p->pi_lock);
1031
1032 local_irq_enable();
1033 return 0;
1034}
1035
1036
1037
1038
1039
1040void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1041{
1042 cpumask_copy(&p->cpus_allowed, new_mask);
1043 p->nr_cpus_allowed = cpumask_weight(new_mask);
1044}
1045
1046void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1047{
1048 struct rq *rq = task_rq(p);
1049 bool queued, running;
1050
1051 lockdep_assert_held(&p->pi_lock);
1052
1053 queued = task_on_rq_queued(p);
1054 running = task_current(rq, p);
1055
1056 if (queued) {
1057
1058
1059
1060
1061 lockdep_assert_held(&rq->lock);
1062 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1063 }
1064 if (running)
1065 put_prev_task(rq, p);
1066
1067 p->sched_class->set_cpus_allowed(p, new_mask);
1068
1069 if (queued)
1070 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1071 if (running)
1072 set_curr_task(rq, p);
1073}
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
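/*
 * Change a task's CPU affinity. If the task is currently running (or
 * waking) on a CPU that the new mask excludes, push it away via the
 * migration stopper; a merely queued task is moved directly to a valid
 * runqueue.
 */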
1084static int __set_cpus_allowed_ptr(struct task_struct *p,
1085 const struct cpumask *new_mask, bool check)
1086{
1087 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1088 unsigned int dest_cpu;
1089 struct rq_flags rf;
1090 struct rq *rq;
1091 int ret = 0;
1092
1093 rq = task_rq_lock(p, &rf);
1094 update_rq_clock(rq);
1095
1096 if (p->flags & PF_KTHREAD) {
1097
1098
1099
1100 cpu_valid_mask = cpu_online_mask;
1101 }
1102
1103
1104
1105
1106
1107 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1108 ret = -EINVAL;
1109 goto out;
1110 }
1111
1112 if (cpumask_equal(&p->cpus_allowed, new_mask))
1113 goto out;
1114
1115 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
1116 ret = -EINVAL;
1117 goto out;
1118 }
1119
1120 do_set_cpus_allowed(p, new_mask);
1121
1122 if (p->flags & PF_KTHREAD) {
1123
1124
1125
1126
1127 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1128 !cpumask_intersects(new_mask, cpu_active_mask) &&
1129 p->nr_cpus_allowed != 1);
1130 }
1131
1132
1133 if (cpumask_test_cpu(task_cpu(p), new_mask))
1134 goto out;
1135
1136 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1137 if (task_running(rq, p) || p->state == TASK_WAKING) {
1138 struct migration_arg arg = { p, dest_cpu };
1139
1140 task_rq_unlock(rq, p, &rf);
1141 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1142 tlb_migrate_finish(p->mm);
1143 return 0;
1144 } else if (task_on_rq_queued(p)) {
1145
1146
1147
1148
1149 rq = move_queued_task(rq, &rf, p, dest_cpu);
1150 }
1151out:
1152 task_rq_unlock(rq, p, &rf);
1153
1154 return ret;
1155}
1156
1157int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1158{
1159 return __set_cpus_allowed_ptr(p, new_mask, false);
1160}
1161EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1162
1163void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1164{
1165#ifdef CONFIG_SCHED_DEBUG
1166
1167
1168
1169
1170 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1171 !p->on_rq);
1172
1173
1174
1175
1176
1177
1178 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1179 p->sched_class == &fair_sched_class &&
1180 (p->on_rq && !task_on_rq_migrating(p)));
1181
1182#ifdef CONFIG_LOCKDEP
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1194 lockdep_is_held(&task_rq(p)->lock)));
1195#endif
1196
1197
1198
1199 WARN_ON_ONCE(!cpu_online(new_cpu));
1200#endif
1201
1202 trace_sched_migrate_task(p, new_cpu);
1203
1204 if (task_cpu(p) != new_cpu) {
1205 if (p->sched_class->migrate_task_rq)
1206 p->sched_class->migrate_task_rq(p);
1207 p->se.nr_migrations++;
1208 rseq_migrate(p);
1209 perf_event_task_migrate(p);
1210 }
1211
1212 __set_task_cpu(p, new_cpu);
1213}
1214
1215static void __migrate_swap_task(struct task_struct *p, int cpu)
1216{
1217 if (task_on_rq_queued(p)) {
1218 struct rq *src_rq, *dst_rq;
1219 struct rq_flags srf, drf;
1220
1221 src_rq = task_rq(p);
1222 dst_rq = cpu_rq(cpu);
1223
1224 rq_pin_lock(src_rq, &srf);
1225 rq_pin_lock(dst_rq, &drf);
1226
1227 p->on_rq = TASK_ON_RQ_MIGRATING;
1228 deactivate_task(src_rq, p, 0);
1229 set_task_cpu(p, cpu);
1230 activate_task(dst_rq, p, 0);
1231 p->on_rq = TASK_ON_RQ_QUEUED;
1232 check_preempt_curr(dst_rq, p, 0);
1233
1234 rq_unpin_lock(dst_rq, &drf);
1235 rq_unpin_lock(src_rq, &srf);
1236
1237 } else {
1238
1239
1240
1241
1242
1243 p->wake_cpu = cpu;
1244 }
1245}
1246
1247struct migration_swap_arg {
1248 struct task_struct *src_task, *dst_task;
1249 int src_cpu, dst_cpu;
1250};
1251
1252static int migrate_swap_stop(void *data)
1253{
1254 struct migration_swap_arg *arg = data;
1255 struct rq *src_rq, *dst_rq;
1256 int ret = -EAGAIN;
1257
1258 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1259 return -EAGAIN;
1260
1261 src_rq = cpu_rq(arg->src_cpu);
1262 dst_rq = cpu_rq(arg->dst_cpu);
1263
1264 double_raw_lock(&arg->src_task->pi_lock,
1265 &arg->dst_task->pi_lock);
1266 double_rq_lock(src_rq, dst_rq);
1267
1268 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1269 goto unlock;
1270
1271 if (task_cpu(arg->src_task) != arg->src_cpu)
1272 goto unlock;
1273
1274 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
1275 goto unlock;
1276
1277 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
1278 goto unlock;
1279
1280 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1281 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1282
1283 ret = 0;
1284
1285unlock:
1286 double_rq_unlock(src_rq, dst_rq);
1287 raw_spin_unlock(&arg->dst_task->pi_lock);
1288 raw_spin_unlock(&arg->src_task->pi_lock);
1289
1290 return ret;
1291}
1292
1293
1294
1295
1296int migrate_swap(struct task_struct *cur, struct task_struct *p)
1297{
1298 struct migration_swap_arg arg;
1299 int ret = -EINVAL;
1300
1301 arg = (struct migration_swap_arg){
1302 .src_task = cur,
1303 .src_cpu = task_cpu(cur),
1304 .dst_task = p,
1305 .dst_cpu = task_cpu(p),
1306 };
1307
1308 if (arg.src_cpu == arg.dst_cpu)
1309 goto out;
1310
1311
1312
1313
1314
1315 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1316 goto out;
1317
1318 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
1319 goto out;
1320
1321 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
1322 goto out;
1323
1324 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1325 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1326
1327out:
1328 return ret;
1329}
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
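/*
 * wait_task_inactive - wait for @p to go fully off-CPU.
 *
 * Returns p->nvcsw | LONG_MIN (hence non-zero) once the task has been
 * observed neither running nor queued, or 0 if @match_state is set and
 * the task's state no longer matches. Busy-waits while the task is
 * running and sleeps one tick at a time while it is still queued.
 */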
1347unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1348{
1349 int running, queued;
1350 struct rq_flags rf;
1351 unsigned long ncsw;
1352 struct rq *rq;
1353
1354 for (;;) {
1355
1356
1357
1358
1359
1360
1361 rq = task_rq(p);
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374 while (task_running(rq, p)) {
1375 if (match_state && unlikely(p->state != match_state))
1376 return 0;
1377 cpu_relax();
1378 }
1379
1380
1381
1382
1383
1384
1385 rq = task_rq_lock(p, &rf);
1386 trace_sched_wait_task(p);
1387 running = task_running(rq, p);
1388 queued = task_on_rq_queued(p);
1389 ncsw = 0;
1390 if (!match_state || p->state == match_state)
1391 ncsw = p->nvcsw | LONG_MIN;
1392 task_rq_unlock(rq, p, &rf);
1393
1394
1395
1396
1397 if (unlikely(!ncsw))
1398 break;
1399
1400
1401
1402
1403
1404
1405
1406 if (unlikely(running)) {
1407 cpu_relax();
1408 continue;
1409 }
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420 if (unlikely(queued)) {
1421 ktime_t to = NSEC_PER_SEC / HZ;
1422
1423 set_current_state(TASK_UNINTERRUPTIBLE);
1424 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1425 continue;
1426 }
1427
1428
1429
1430
1431
1432
1433 break;
1434 }
1435
1436 return ncsw;
1437}
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452void kick_process(struct task_struct *p)
1453{
1454 int cpu;
1455
1456 preempt_disable();
1457 cpu = task_cpu(p);
1458 if ((cpu != smp_processor_id()) && task_curr(p))
1459 smp_send_reschedule(cpu);
1460 preempt_enable();
1461}
1462EXPORT_SYMBOL_GPL(kick_process);
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
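/*
 * select_fallback_rq - pick a usable CPU for @p when the chosen one is
 * not allowed or not online: try other allowed CPUs in the same NUMA
 * node first, then progressively widen the affinity via the cpuset
 * fallback and, as a last resort, cpu_possible_mask.
 */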
1486static int select_fallback_rq(int cpu, struct task_struct *p)
1487{
1488 int nid = cpu_to_node(cpu);
1489 const struct cpumask *nodemask = NULL;
1490 enum { cpuset, possible, fail } state = cpuset;
1491 int dest_cpu;
1492
1493
1494
1495
1496
1497
1498 if (nid != -1) {
1499 nodemask = cpumask_of_node(nid);
1500
1501
1502 for_each_cpu(dest_cpu, nodemask) {
1503 if (!cpu_active(dest_cpu))
1504 continue;
1505 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
1506 return dest_cpu;
1507 }
1508 }
1509
1510 for (;;) {
1511
1512 for_each_cpu(dest_cpu, &p->cpus_allowed) {
1513 if (!is_cpu_allowed(p, dest_cpu))
1514 continue;
1515
1516 goto out;
1517 }
1518
1519
1520 switch (state) {
1521 case cpuset:
1522 if (IS_ENABLED(CONFIG_CPUSETS)) {
1523 cpuset_cpus_allowed_fallback(p);
1524 state = possible;
1525 break;
1526 }
1527
1528 case possible:
1529 do_set_cpus_allowed(p, cpu_possible_mask);
1530 state = fail;
1531 break;
1532
1533 case fail:
1534 BUG();
1535 break;
1536 }
1537 }
1538
1539out:
1540 if (state != cpuset) {
1541
1542
1543
1544
1545
1546 if (p->mm && printk_ratelimit()) {
1547 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1548 task_pid_nr(p), p->comm, cpu);
1549 }
1550 }
1551
1552 return dest_cpu;
1553}
1554
1555
1556
1557
1558static inline
1559int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1560{
1561 lockdep_assert_held(&p->pi_lock);
1562
1563 if (p->nr_cpus_allowed > 1)
1564 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1565 else
1566 cpu = cpumask_any(&p->cpus_allowed);
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578 if (unlikely(!is_cpu_allowed(p, cpu)))
1579 cpu = select_fallback_rq(task_cpu(p), p);
1580
1581 return cpu;
1582}
1583
1584static void update_avg(u64 *avg, u64 sample)
1585{
1586 s64 diff = sample - *avg;
1587 *avg += diff >> 3;
1588}
1589
1590void sched_set_stop_task(int cpu, struct task_struct *stop)
1591{
1592 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
1593 struct task_struct *old_stop = cpu_rq(cpu)->stop;
1594
1595 if (stop) {
1596
1597
1598
1599
1600
1601
1602
1603
	sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
1605
1606 stop->sched_class = &stop_sched_class;
1607 }
1608
1609 cpu_rq(cpu)->stop = stop;
1610
1611 if (old_stop) {
1612
1613
1614
1615
1616 old_stop->sched_class = &rt_sched_class;
1617 }
1618}
1619
1620#else
1621
1622static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1623 const struct cpumask *new_mask, bool check)
1624{
1625 return set_cpus_allowed_ptr(p, new_mask);
1626}
1627
1628#endif
1629
1630static void
1631ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1632{
1633 struct rq *rq;
1634
1635 if (!schedstat_enabled())
1636 return;
1637
1638 rq = this_rq();
1639
1640#ifdef CONFIG_SMP
1641 if (cpu == rq->cpu) {
1642 __schedstat_inc(rq->ttwu_local);
1643 __schedstat_inc(p->se.statistics.nr_wakeups_local);
1644 } else {
1645 struct sched_domain *sd;
1646
1647 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
1648 rcu_read_lock();
1649 for_each_domain(rq->cpu, sd) {
1650 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1651 __schedstat_inc(sd->ttwu_wake_remote);
1652 break;
1653 }
1654 }
1655 rcu_read_unlock();
1656 }
1657
1658 if (wake_flags & WF_MIGRATED)
1659 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1660#endif
1661
1662 __schedstat_inc(rq->ttwu_count);
1663 __schedstat_inc(p->se.statistics.nr_wakeups);
1664
1665 if (wake_flags & WF_SYNC)
1666 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
1667}
1668
1669static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1670{
1671 activate_task(rq, p, en_flags);
1672 p->on_rq = TASK_ON_RQ_QUEUED;
1673
1674
1675 if (p->flags & PF_WQ_WORKER)
1676 wq_worker_waking_up(p, cpu_of(rq));
1677}
1678
1679
1680
1681
1682static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
1683 struct rq_flags *rf)
1684{
1685 check_preempt_curr(rq, p, wake_flags);
1686 p->state = TASK_RUNNING;
1687 trace_sched_wakeup(p);
1688
1689#ifdef CONFIG_SMP
1690 if (p->sched_class->task_woken) {
1691
1692
1693
1694
1695 rq_unpin_lock(rq, rf);
1696 p->sched_class->task_woken(rq, p);
1697 rq_repin_lock(rq, rf);
1698 }
1699
1700 if (rq->idle_stamp) {
1701 u64 delta = rq_clock(rq) - rq->idle_stamp;
1702 u64 max = 2*rq->max_idle_balance_cost;
1703
1704 update_avg(&rq->avg_idle, delta);
1705
1706 if (rq->avg_idle > max)
1707 rq->avg_idle = max;
1708
1709 rq->idle_stamp = 0;
1710 }
1711#endif
1712}
1713
1714static void
1715ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1716 struct rq_flags *rf)
1717{
1718 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1719
1720 lockdep_assert_held(&rq->lock);
1721
1722#ifdef CONFIG_SMP
1723 if (p->sched_contributes_to_load)
1724 rq->nr_uninterruptible--;
1725
1726 if (wake_flags & WF_MIGRATED)
1727 en_flags |= ENQUEUE_MIGRATED;
1728#endif
1729
1730 ttwu_activate(rq, p, en_flags);
1731 ttwu_do_wakeup(rq, p, wake_flags, rf);
1732}
1733
1734
1735
1736
1737
1738
1739
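/*
 * The caller holds p->pi_lock (asserted by __task_rq_lock()): if @p is
 * still queued on a runqueue the wakeup only needs to flip its state back
 * to TASK_RUNNING; returns 1 in that case, 0 if the task was already
 * dequeued.
 */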
1740static int ttwu_remote(struct task_struct *p, int wake_flags)
1741{
1742 struct rq_flags rf;
1743 struct rq *rq;
1744 int ret = 0;
1745
1746 rq = __task_rq_lock(p, &rf);
1747 if (task_on_rq_queued(p)) {
1748
1749 update_rq_clock(rq);
1750 ttwu_do_wakeup(rq, p, wake_flags, &rf);
1751 ret = 1;
1752 }
1753 __task_rq_unlock(rq, &rf);
1754
1755 return ret;
1756}
1757
1758#ifdef CONFIG_SMP
1759void sched_ttwu_pending(void)
1760{
1761 struct rq *rq = this_rq();
1762 struct llist_node *llist = llist_del_all(&rq->wake_list);
1763 struct task_struct *p, *t;
1764 struct rq_flags rf;
1765
1766 if (!llist)
1767 return;
1768
1769 rq_lock_irqsave(rq, &rf);
1770 update_rq_clock(rq);
1771
1772 llist_for_each_entry_safe(p, t, llist, wake_entry)
1773 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
1774
1775 rq_unlock_irqrestore(rq, &rf);
1776}
1777
1778void scheduler_ipi(void)
1779{
1780
1781
1782
1783
1784
1785 preempt_fold_need_resched();
1786
1787 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1788 return;
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803 irq_enter();
1804 sched_ttwu_pending();
1805
1806
1807
1808
1809 if (unlikely(got_nohz_idle_kick())) {
1810 this_rq()->idle_balance = 1;
1811 raise_softirq_irqoff(SCHED_SOFTIRQ);
1812 }
1813 irq_exit();
1814}
1815
1816static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1817{
1818 struct rq *rq = cpu_rq(cpu);
1819
1820 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1821
1822 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1823 if (!set_nr_if_polling(rq->idle))
1824 smp_send_reschedule(cpu);
1825 else
1826 trace_sched_wake_idle_without_ipi(cpu);
1827 }
1828}
1829
1830void wake_up_if_idle(int cpu)
1831{
1832 struct rq *rq = cpu_rq(cpu);
1833 struct rq_flags rf;
1834
1835 rcu_read_lock();
1836
1837 if (!is_idle_task(rcu_dereference(rq->curr)))
1838 goto out;
1839
1840 if (set_nr_if_polling(rq->idle)) {
1841 trace_sched_wake_idle_without_ipi(cpu);
1842 } else {
1843 rq_lock_irqsave(rq, &rf);
1844 if (is_idle_task(rq->curr))
1845 smp_send_reschedule(cpu);
1846
1847 rq_unlock_irqrestore(rq, &rf);
1848 }
1849
1850out:
1851 rcu_read_unlock();
1852}
1853
1854bool cpus_share_cache(int this_cpu, int that_cpu)
1855{
1856 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1857}
1858#endif
1859
1860static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1861{
1862 struct rq *rq = cpu_rq(cpu);
1863 struct rq_flags rf;
1864
1865#if defined(CONFIG_SMP)
1866 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1867 sched_clock_cpu(cpu);
1868 ttwu_queue_remote(p, cpu, wake_flags);
1869 return;
1870 }
1871#endif
1872
1873 rq_lock(rq, &rf);
1874 update_rq_clock(rq);
1875 ttwu_do_activate(rq, p, wake_flags, &rf);
1876 rq_unlock(rq, &rf);
1877}
1878
1980
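/*
 * try_to_wake_up - wake up a thread if it is in one of the @state states.
 *
 * With p->pi_lock held, either perform a "remote" wakeup of a task that
 * is still on a runqueue, or wait for any in-flight schedule() to take
 * the task off its CPU, pick a new runqueue and queue it there. Returns
 * 1 if @p's state matched @state (and a wakeup was issued or the task was
 * already runnable), 0 otherwise.
 */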
1981static int
1982try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1983{
1984 unsigned long flags;
1985 int cpu, success = 0;
1986
1987
1988
1989
1990
1991
1992
1993 raw_spin_lock_irqsave(&p->pi_lock, flags);
1994 smp_mb__after_spinlock();
1995 if (!(p->state & state))
1996 goto out;
1997
1998 trace_sched_waking(p);
1999
2000
2001 success = 1;
2002 cpu = task_cpu(p);
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025 smp_rmb();
2026 if (p->on_rq && ttwu_remote(p, wake_flags))
2027 goto stat;
2028
2029#ifdef CONFIG_SMP
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047 smp_rmb();
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058 smp_cond_load_acquire(&p->on_cpu, !VAL);
2059
2060 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2061 p->state = TASK_WAKING;
2062
2063 if (p->in_iowait) {
2064 delayacct_blkio_end(p);
2065 atomic_dec(&task_rq(p)->nr_iowait);
2066 }
2067
2068 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2069 if (task_cpu(p) != cpu) {
2070 wake_flags |= WF_MIGRATED;
2071 set_task_cpu(p, cpu);
2072 }
2073
2074#else
2075
2076 if (p->in_iowait) {
2077 delayacct_blkio_end(p);
2078 atomic_dec(&task_rq(p)->nr_iowait);
2079 }
2080
2081#endif
2082
2083 ttwu_queue(p, cpu, wake_flags);
2084stat:
2085 ttwu_stat(p, cpu, wake_flags);
2086out:
2087 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2088
2089 return success;
2090}
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
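/*
 * try_to_wake_up_local - wake up a task belonging to this_rq().
 *
 * Only valid for tasks on the local runqueue and with rq->lock held (both
 * are checked below); p->pi_lock is re-taken in the correct lock order if
 * the trylock fails.
 */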
2101static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2102{
2103 struct rq *rq = task_rq(p);
2104
2105 if (WARN_ON_ONCE(rq != this_rq()) ||
2106 WARN_ON_ONCE(p == current))
2107 return;
2108
2109 lockdep_assert_held(&rq->lock);
2110
2111 if (!raw_spin_trylock(&p->pi_lock)) {
2112
2113
2114
2115
2116
2117
2118 rq_unlock(rq, rf);
2119 raw_spin_lock(&p->pi_lock);
2120 rq_relock(rq, rf);
2121 }
2122
2123 if (!(p->state & TASK_NORMAL))
2124 goto out;
2125
2126 trace_sched_waking(p);
2127
2128 if (!task_on_rq_queued(p)) {
2129 if (p->in_iowait) {
2130 delayacct_blkio_end(p);
2131 atomic_dec(&rq->nr_iowait);
2132 }
2133 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2134 }
2135
2136 ttwu_do_wakeup(rq, p, 0, rf);
2137 ttwu_stat(p, smp_processor_id(), 0);
2138out:
2139 raw_spin_unlock(&p->pi_lock);
2140}
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153int wake_up_process(struct task_struct *p)
2154{
2155 return try_to_wake_up(p, TASK_NORMAL, 0);
2156}
2157EXPORT_SYMBOL(wake_up_process);
2158
2159int wake_up_state(struct task_struct *p, unsigned int state)
2160{
2161 return try_to_wake_up(p, state, 0);
2162}
2163
2164
2165
2166
2167
2168
2169
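/*
 * Reset the scheduler state of a freshly forked task: per-entity
 * accounting, deadline timers, RT run list and NUMA balancing state.
 */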
2170static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2171{
2172 p->on_rq = 0;
2173
2174 p->se.on_rq = 0;
2175 p->se.exec_start = 0;
2176 p->se.sum_exec_runtime = 0;
2177 p->se.prev_sum_exec_runtime = 0;
2178 p->se.nr_migrations = 0;
2179 p->se.vruntime = 0;
2180 INIT_LIST_HEAD(&p->se.group_node);
2181
2182#ifdef CONFIG_FAIR_GROUP_SCHED
2183 p->se.cfs_rq = NULL;
2184#endif
2185
2186#ifdef CONFIG_SCHEDSTATS
2187
2188 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2189#endif
2190
2191 RB_CLEAR_NODE(&p->dl.rb_node);
2192 init_dl_task_timer(&p->dl);
2193 init_dl_inactive_task_timer(&p->dl);
2194 __dl_clear_params(p);
2195
2196 INIT_LIST_HEAD(&p->rt.run_list);
2197 p->rt.timeout = 0;
2198 p->rt.time_slice = sched_rr_timeslice;
2199 p->rt.on_rq = 0;
2200 p->rt.on_list = 0;
2201
2202#ifdef CONFIG_PREEMPT_NOTIFIERS
2203 INIT_HLIST_HEAD(&p->preempt_notifiers);
2204#endif
2205
2206 init_numa_balancing(clone_flags, p);
2207}
2208
2209DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2210
2211#ifdef CONFIG_NUMA_BALANCING
2212
2213void set_numabalancing_state(bool enabled)
2214{
2215 if (enabled)
2216 static_branch_enable(&sched_numa_balancing);
2217 else
2218 static_branch_disable(&sched_numa_balancing);
2219}
2220
2221#ifdef CONFIG_PROC_SYSCTL
2222int sysctl_numa_balancing(struct ctl_table *table, int write,
2223 void __user *buffer, size_t *lenp, loff_t *ppos)
2224{
2225 struct ctl_table t;
2226 int err;
2227 int state = static_branch_likely(&sched_numa_balancing);
2228
2229 if (write && !capable(CAP_SYS_ADMIN))
2230 return -EPERM;
2231
2232 t = *table;
2233 t.data = &state;
2234 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2235 if (err < 0)
2236 return err;
2237 if (write)
2238 set_numabalancing_state(state);
2239 return err;
2240}
2241#endif
2242#endif
2243
2244#ifdef CONFIG_SCHEDSTATS
2245
2246DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2247static bool __initdata __sched_schedstats = false;
2248
2249static void set_schedstats(bool enabled)
2250{
2251 if (enabled)
2252 static_branch_enable(&sched_schedstats);
2253 else
2254 static_branch_disable(&sched_schedstats);
2255}
2256
2257void force_schedstat_enabled(void)
2258{
2259 if (!schedstat_enabled()) {
2260 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2261 static_branch_enable(&sched_schedstats);
2262 }
2263}
2264
2265static int __init setup_schedstats(char *str)
2266{
2267 int ret = 0;
2268 if (!str)
2269 goto out;
2270
2271
2272
2273
2274
2275
2276 if (!strcmp(str, "enable")) {
2277 __sched_schedstats = true;
2278 ret = 1;
2279 } else if (!strcmp(str, "disable")) {
2280 __sched_schedstats = false;
2281 ret = 1;
2282 }
2283out:
2284 if (!ret)
2285 pr_warn("Unable to parse schedstats=\n");
2286
2287 return ret;
2288}
2289__setup("schedstats=", setup_schedstats);
2290
2291static void __init init_schedstats(void)
2292{
2293 set_schedstats(__sched_schedstats);
2294}
2295
2296#ifdef CONFIG_PROC_SYSCTL
2297int sysctl_schedstats(struct ctl_table *table, int write,
2298 void __user *buffer, size_t *lenp, loff_t *ppos)
2299{
2300 struct ctl_table t;
2301 int err;
2302 int state = static_branch_likely(&sched_schedstats);
2303
2304 if (write && !capable(CAP_SYS_ADMIN))
2305 return -EPERM;
2306
2307 t = *table;
2308 t.data = &state;
2309 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2310 if (err < 0)
2311 return err;
2312 if (write)
2313 set_schedstats(state);
2314 return err;
2315}
2316#endif
2317#else
2318static inline void init_schedstats(void) {}
2319#endif
2320
2321
2322
2323
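/*
 * fork()/clone()-time setup: pick the child's scheduling class and
 * priority (optionally resetting it to SCHED_NORMAL), and attach the
 * not-yet-runnable task to a CPU.
 */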
2324int sched_fork(unsigned long clone_flags, struct task_struct *p)
2325{
2326 unsigned long flags;
2327 int cpu = get_cpu();
2328
2329 __sched_fork(clone_flags, p);
2330
2331
2332
2333
2334
2335 p->state = TASK_NEW;
2336
2337
2338
2339
2340 p->prio = current->normal_prio;
2341
2342
2343
2344
2345 if (unlikely(p->sched_reset_on_fork)) {
2346 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2347 p->policy = SCHED_NORMAL;
2348 p->static_prio = NICE_TO_PRIO(0);
2349 p->rt_priority = 0;
2350 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2351 p->static_prio = NICE_TO_PRIO(0);
2352
2353 p->prio = p->normal_prio = __normal_prio(p);
2354 set_load_weight(p, false);
2355
2356
2357
2358
2359
2360 p->sched_reset_on_fork = 0;
2361 }
2362
2363 if (dl_prio(p->prio)) {
2364 put_cpu();
2365 return -EAGAIN;
2366 } else if (rt_prio(p->prio)) {
2367 p->sched_class = &rt_sched_class;
2368 } else {
2369 p->sched_class = &fair_sched_class;
2370 }
2371
2372 init_entity_runnable_average(&p->se);
2373
2374
2375
2376
2377
2378
2379
2380
2381 raw_spin_lock_irqsave(&p->pi_lock, flags);
2382
2383
2384
2385
2386 __set_task_cpu(p, cpu);
2387 if (p->sched_class->task_fork)
2388 p->sched_class->task_fork(p);
2389 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2390
2391#ifdef CONFIG_SCHED_INFO
2392 if (likely(sched_info_on()))
2393 memset(&p->sched_info, 0, sizeof(p->sched_info));
2394#endif
2395#if defined(CONFIG_SMP)
2396 p->on_cpu = 0;
2397#endif
2398 init_task_preempt_count(p);
2399#ifdef CONFIG_SMP
2400 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2401 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2402#endif
2403
2404 put_cpu();
2405 return 0;
2406}
2407
2408unsigned long to_ratio(u64 period, u64 runtime)
2409{
2410 if (runtime == RUNTIME_INF)
2411 return BW_UNIT;
2412
2413
2414
2415
2416
2417
2418 if (period == 0)
2419 return 0;
2420
2421 return div64_u64(runtime << BW_SHIFT, period);
2422}
2423
2424
2425
2426
2427
2428
2429
2430
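/*
 * wake_up_new_task - first wakeup of a freshly created task.
 *
 * Select a runqueue for the new task, activate it there and let it
 * preempt the current task if the scheduling class says so.
 */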
2431void wake_up_new_task(struct task_struct *p)
2432{
2433 struct rq_flags rf;
2434 struct rq *rq;
2435
2436 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2437 p->state = TASK_RUNNING;
2438#ifdef CONFIG_SMP
2439
2440
2441
2442
2443
2444
2445
2446
2447 p->recent_used_cpu = task_cpu(p);
2448 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2449#endif
2450 rq = __task_rq_lock(p, &rf);
2451 update_rq_clock(rq);
2452 post_init_entity_util_avg(&p->se);
2453
2454 activate_task(rq, p, ENQUEUE_NOCLOCK);
2455 p->on_rq = TASK_ON_RQ_QUEUED;
2456 trace_sched_wakeup_new(p);
2457 check_preempt_curr(rq, p, WF_FORK);
2458#ifdef CONFIG_SMP
2459 if (p->sched_class->task_woken) {
2460
2461
2462
2463
2464 rq_unpin_lock(rq, &rf);
2465 p->sched_class->task_woken(rq, p);
2466 rq_repin_lock(rq, &rf);
2467 }
2468#endif
2469 task_rq_unlock(rq, p, &rf);
2470}
2471
2472#ifdef CONFIG_PREEMPT_NOTIFIERS
2473
2474static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
2475
2476void preempt_notifier_inc(void)
2477{
2478 static_branch_inc(&preempt_notifier_key);
2479}
2480EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2481
2482void preempt_notifier_dec(void)
2483{
2484 static_branch_dec(&preempt_notifier_key);
2485}
2486EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2487
2488
2489
2490
2491
2492void preempt_notifier_register(struct preempt_notifier *notifier)
2493{
2494 if (!static_branch_unlikely(&preempt_notifier_key))
2495 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2496
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2498}
2499EXPORT_SYMBOL_GPL(preempt_notifier_register);
2500
2501
2502
2503
2504
2505
2506
2507void preempt_notifier_unregister(struct preempt_notifier *notifier)
2508{
	hlist_del(&notifier->link);
2510}
2511EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2512
2513static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2514{
2515 struct preempt_notifier *notifier;
2516
2517 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2518 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2519}
2520
2521static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2522{
2523 if (static_branch_unlikely(&preempt_notifier_key))
2524 __fire_sched_in_preempt_notifiers(curr);
2525}
2526
2527static void
2528__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2529 struct task_struct *next)
2530{
2531 struct preempt_notifier *notifier;
2532
2533 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2534 notifier->ops->sched_out(notifier, next);
2535}
2536
2537static __always_inline void
2538fire_sched_out_preempt_notifiers(struct task_struct *curr,
2539 struct task_struct *next)
2540{
2541 if (static_branch_unlikely(&preempt_notifier_key))
2542 __fire_sched_out_preempt_notifiers(curr, next);
2543}
2544
2545#else
2546
2547static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2548{
2549}
2550
2551static inline void
2552fire_sched_out_preempt_notifiers(struct task_struct *curr,
2553 struct task_struct *next)
2554{
2555}
2556
2557#endif
2558
2559static inline void prepare_task(struct task_struct *next)
2560{
2561#ifdef CONFIG_SMP
2562
2563
2564
2565
2566 next->on_cpu = 1;
2567#endif
2568}
2569
2570static inline void finish_task(struct task_struct *prev)
2571{
2572#ifdef CONFIG_SMP
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583 smp_store_release(&prev->on_cpu, 0);
2584#endif
2585}
2586
2587static inline void
2588prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
2589{
2590
2591
2592
2593
2594
2595
2596 rq_unpin_lock(rq, rf);
2597 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2598#ifdef CONFIG_DEBUG_SPINLOCK
2599
2600 rq->lock.owner = next;
2601#endif
2602}
2603
2604static inline void finish_lock_switch(struct rq *rq)
2605{
2606
2607
2608
2609
2610
2611 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
2612 raw_spin_unlock_irq(&rq->lock);
2613}
2614
2615
2616
2617
2618
2619#ifndef prepare_arch_switch
2620# define prepare_arch_switch(next) do { } while (0)
2621#endif
2622
2623#ifndef finish_arch_post_lock_switch
2624# define finish_arch_post_lock_switch() do { } while (0)
2625#endif
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640static inline void
2641prepare_task_switch(struct rq *rq, struct task_struct *prev,
2642 struct task_struct *next)
2643{
2644 kcov_prepare_switch(prev);
2645 sched_info_switch(rq, prev, next);
2646 perf_event_task_sched_out(prev, next);
2647 rseq_preempt(prev);
2648 fire_sched_out_preempt_notifiers(prev, next);
2649 prepare_task(next);
2650 prepare_arch_switch(next);
2651}
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
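/*
 * finish_task_switch - clean up after a context switch, with the new
 * task already current. Releases rq->lock, drops the reference on the
 * previous task's mm and, if the previous task is dead (TASK_DEAD),
 * frees its stack and task_struct.
 */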
2672static struct rq *finish_task_switch(struct task_struct *prev)
2673 __releases(rq->lock)
2674{
2675 struct rq *rq = this_rq();
2676 struct mm_struct *mm = rq->prev_mm;
2677 long prev_state;
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2691 "corrupted preempt_count: %s/%d/0x%x\n",
2692 current->comm, current->pid, preempt_count()))
2693 preempt_count_set(FORK_PREEMPT_COUNT);
2694
2695 rq->prev_mm = NULL;
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708 prev_state = prev->state;
2709 vtime_task_switch(prev);
2710 perf_event_task_sched_in(prev, current);
2711 finish_task(prev);
2712 finish_lock_switch(rq);
2713 finish_arch_post_lock_switch();
2714 kcov_finish_switch(current);
2715
2716 fire_sched_in_preempt_notifiers(current);
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729 if (mm) {
2730 membarrier_mm_sync_core_before_usermode(mm);
2731 mmdrop(mm);
2732 }
2733 if (unlikely(prev_state == TASK_DEAD)) {
2734 if (prev->sched_class->task_dead)
2735 prev->sched_class->task_dead(prev);
2736
2737
2738
2739
2740
2741 kprobe_flush_task(prev);
2742
2743
2744 put_task_stack(prev);
2745
2746 put_task_struct(prev);
2747 }
2748
2749 tick_nohz_task_switch();
2750 return rq;
2751}
2752
2753#ifdef CONFIG_SMP
2754
2755
2756static void __balance_callback(struct rq *rq)
2757{
2758 struct callback_head *head, *next;
2759 void (*func)(struct rq *rq);
2760 unsigned long flags;
2761
2762 raw_spin_lock_irqsave(&rq->lock, flags);
2763 head = rq->balance_callback;
2764 rq->balance_callback = NULL;
2765 while (head) {
2766 func = (void (*)(struct rq *))head->func;
2767 next = head->next;
2768 head->next = NULL;
2769 head = next;
2770
2771 func(rq);
2772 }
2773 raw_spin_unlock_irqrestore(&rq->lock, flags);
2774}
2775
2776static inline void balance_callback(struct rq *rq)
2777{
2778 if (unlikely(rq->balance_callback))
2779 __balance_callback(rq);
2780}
2781
2782#else
2783
2784static inline void balance_callback(struct rq *rq)
2785{
2786}
2787
2788#endif
2789
2790
2791
2792
2793
2794asmlinkage __visible void schedule_tail(struct task_struct *prev)
2795 __releases(rq->lock)
2796{
2797 struct rq *rq;
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808 rq = finish_task_switch(prev);
2809 balance_callback(rq);
2810 preempt_enable();
2811
2812 if (current->set_child_tid)
2813 put_user(task_pid_vnr(current), current->set_child_tid);
2814}
2815
2816
2817
2818
2819static __always_inline struct rq *
2820context_switch(struct rq *rq, struct task_struct *prev,
2821 struct task_struct *next, struct rq_flags *rf)
2822{
2823 struct mm_struct *mm, *oldmm;
2824
2825 prepare_task_switch(rq, prev, next);
2826
2827 mm = next->mm;
2828 oldmm = prev->active_mm;
2829
2830
2831
2832
2833
2834 arch_start_context_switch(prev);
2835
2836
2837
2838
2839
2840
2841
2842
2843 if (!mm) {
2844 next->active_mm = oldmm;
2845 mmgrab(oldmm);
2846 enter_lazy_tlb(oldmm, next);
2847 } else
2848 switch_mm_irqs_off(oldmm, mm, next);
2849
2850 if (!prev->mm) {
2851 prev->active_mm = NULL;
2852 rq->prev_mm = oldmm;
2853 }
2854
2855 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
2856
2857 prepare_lock_switch(rq, next, rf);
2858
2859
2860 switch_to(prev, next, prev);
2861 barrier();
2862
2863 return finish_task_switch(prev);
2864}
2865
2866
2867
2868
2869
2870
2871
2872unsigned long nr_running(void)
2873{
2874 unsigned long i, sum = 0;
2875
2876 for_each_online_cpu(i)
2877 sum += cpu_rq(i)->nr_running;
2878
2879 return sum;
2880}
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895bool single_task_running(void)
2896{
2897 return raw_rq()->nr_running == 1;
2898}
2899EXPORT_SYMBOL(single_task_running);
2900
2901unsigned long long nr_context_switches(void)
2902{
2903 int i;
2904 unsigned long long sum = 0;
2905
2906 for_each_possible_cpu(i)
2907 sum += cpu_rq(i)->nr_switches;
2908
2909 return sum;
2910}
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942unsigned long nr_iowait(void)
2943{
2944 unsigned long i, sum = 0;
2945
2946 for_each_possible_cpu(i)
2947 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2948
2949 return sum;
2950}
2951
2952
2953
2954
2955
2956
2957
2958
2959unsigned long nr_iowait_cpu(int cpu)
2960{
2961 struct rq *this = cpu_rq(cpu);
2962 return atomic_read(&this->nr_iowait);
2963}
2964
2965void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2966{
2967 struct rq *rq = this_rq();
2968 *nr_waiters = atomic_read(&rq->nr_iowait);
2969 *load = rq->load.weight;
2970}
2971
2972#ifdef CONFIG_SMP
2973
2974
2975
2976
2977
2978void sched_exec(void)
2979{
2980 struct task_struct *p = current;
2981 unsigned long flags;
2982 int dest_cpu;
2983
2984 raw_spin_lock_irqsave(&p->pi_lock, flags);
2985 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2986 if (dest_cpu == smp_processor_id())
2987 goto unlock;
2988
2989 if (likely(cpu_active(dest_cpu))) {
2990 struct migration_arg arg = { p, dest_cpu };
2991
2992 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2993 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2994 return;
2995 }
2996unlock:
2997 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2998}
2999
3000#endif
3001
3002DEFINE_PER_CPU(struct kernel_stat, kstat);
3003DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3004
3005EXPORT_PER_CPU_SYMBOL(kstat);
3006EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3007
3008
3009
3010
3011
3012
3013
3014static inline void prefetch_curr_exec_start(struct task_struct *p)
3015{
3016#ifdef CONFIG_FAIR_GROUP_SCHED
3017 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3018#else
3019 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3020#endif
3021 prefetch(curr);
3022 prefetch(&curr->exec_start);
3023}
3024
3025
3026
3027
3028
3029
3030unsigned long long task_sched_runtime(struct task_struct *p)
3031{
3032 struct rq_flags rf;
3033 struct rq *rq;
3034 u64 ns;
3035
3036#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048 if (!p->on_cpu || !task_on_rq_queued(p))
3049 return p->se.sum_exec_runtime;
3050#endif
3051
3052 rq = task_rq_lock(p, &rf);
3053
3054
3055
3056
3057
3058 if (task_current(rq, p) && task_on_rq_queued(p)) {
3059 prefetch_curr_exec_start(p);
3060 update_rq_clock(rq);
3061 p->sched_class->update_curr(rq);
3062 }
3063 ns = p->se.sum_exec_runtime;
3064 task_rq_unlock(rq, p, &rf);
3065
3066 return ns;
3067}
3068
3069
3070
3071
3072
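/*
 * scheduler_tick - per-CPU scheduler tick, run from the timer interrupt
 * with interrupts disabled: update the rq clock and load statistics, run
 * the current class's task_tick() and trigger load balancing.
 */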
3073void scheduler_tick(void)
3074{
3075 int cpu = smp_processor_id();
3076 struct rq *rq = cpu_rq(cpu);
3077 struct task_struct *curr = rq->curr;
3078 struct rq_flags rf;
3079
3080 sched_clock_tick();
3081
3082 rq_lock(rq, &rf);
3083
3084 update_rq_clock(rq);
3085 curr->sched_class->task_tick(rq, curr, 0);
3086 cpu_load_update_active(rq);
3087 calc_global_load_tick(rq);
3088
3089 rq_unlock(rq, &rf);
3090
3091 perf_event_task_tick();
3092
3093#ifdef CONFIG_SMP
3094 rq->idle_balance = idle_cpu(cpu);
3095 trigger_load_balance(rq);
3096#endif
3097}
3098
3099#ifdef CONFIG_NO_HZ_FULL
3100
3101struct tick_work {
3102 int cpu;
3103 struct delayed_work work;
3104};
3105
3106static struct tick_work __percpu *tick_work_cpu;
3107
3108static void sched_tick_remote(struct work_struct *work)
3109{
3110 struct delayed_work *dwork = to_delayed_work(work);
3111 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3112 int cpu = twork->cpu;
3113 struct rq *rq = cpu_rq(cpu);
3114 struct task_struct *curr;
3115 struct rq_flags rf;
3116 u64 delta;
3117
3118
3119
3120
3121
3122
3123
3124
3125 if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
3126 goto out_requeue;
3127
3128 rq_lock_irq(rq, &rf);
3129 curr = rq->curr;
3130 if (is_idle_task(curr))
3131 goto out_unlock;
3132
3133 update_rq_clock(rq);
3134 delta = rq_clock_task(rq) - curr->se.exec_start;
3135
3136
3137
3138
3139
3140 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3141 curr->sched_class->task_tick(rq, curr, 0);
3142
3143out_unlock:
3144 rq_unlock_irq(rq, &rf);
3145
3146out_requeue:
3147
3148
3149
3150
3151
3152 queue_delayed_work(system_unbound_wq, dwork, HZ);
3153}
3154
3155static void sched_tick_start(int cpu)
3156{
3157 struct tick_work *twork;
3158
3159 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3160 return;
3161
3162 WARN_ON_ONCE(!tick_work_cpu);
3163
3164 twork = per_cpu_ptr(tick_work_cpu, cpu);
3165 twork->cpu = cpu;
3166 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3167 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3168}
3169
3170#ifdef CONFIG_HOTPLUG_CPU
3171static void sched_tick_stop(int cpu)
3172{
3173 struct tick_work *twork;
3174
3175 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3176 return;
3177
3178 WARN_ON_ONCE(!tick_work_cpu);
3179
3180 twork = per_cpu_ptr(tick_work_cpu, cpu);
3181 cancel_delayed_work_sync(&twork->work);
3182}
3183#endif
3184
3185int __init sched_tick_offload_init(void)
3186{
3187 tick_work_cpu = alloc_percpu(struct tick_work);
3188 BUG_ON(!tick_work_cpu);
3189
3190 return 0;
3191}
3192
3193#else
3194static inline void sched_tick_start(int cpu) { }
3195static inline void sched_tick_stop(int cpu) { }
3196#endif
3197
3198#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3199 defined(CONFIG_PREEMPT_TRACER))
3200
3201
3202
3203
3204static inline void preempt_latency_start(int val)
3205{
3206 if (preempt_count() == val) {
3207 unsigned long ip = get_lock_parent_ip();
3208#ifdef CONFIG_DEBUG_PREEMPT
3209 current->preempt_disable_ip = ip;
3210#endif
3211 trace_preempt_off(CALLER_ADDR0, ip);
3212 }
3213}
3214
3215void preempt_count_add(int val)
3216{
3217#ifdef CONFIG_DEBUG_PREEMPT
3218
3219
3220
3221 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3222 return;
3223#endif
3224 __preempt_count_add(val);
3225#ifdef CONFIG_DEBUG_PREEMPT
3226
3227
3228
3229 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3230 PREEMPT_MASK - 10);
3231#endif
3232 preempt_latency_start(val);
3233}
3234EXPORT_SYMBOL(preempt_count_add);
3235NOKPROBE_SYMBOL(preempt_count_add);
3236
3237
3238
3239
3240
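/*
 * If the value passed in equals the current preempt count
 * then we just enabled preemption. Stop timing the latency.
 */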
3241static inline void preempt_latency_stop(int val)
3242{
3243 if (preempt_count() == val)
3244 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3245}
3246
3247void preempt_count_sub(int val)
3248{
3249#ifdef CONFIG_DEBUG_PREEMPT
3250
3251
3252
3253 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3254 return;
3255
3256
3257
3258 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3259 !(preempt_count() & PREEMPT_MASK)))
3260 return;
3261#endif
3262
3263 preempt_latency_stop(val);
3264 __preempt_count_sub(val);
3265}
3266EXPORT_SYMBOL(preempt_count_sub);
3267NOKPROBE_SYMBOL(preempt_count_sub);
3268
3269#else
3270static inline void preempt_latency_start(int val) { }
3271static inline void preempt_latency_stop(int val) { }
3272#endif
3273
3274static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
3275{
3276#ifdef CONFIG_DEBUG_PREEMPT
3277 return p->preempt_disable_ip;
3278#else
3279 return 0;
3280#endif
3281}
3282
3283
3284
3285
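/*
 * Print scheduling while atomic bug:
 */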
3286static noinline void __schedule_bug(struct task_struct *prev)
3287{
3288
3289 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3290
3291 if (oops_in_progress)
3292 return;
3293
3294 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3295 prev->comm, prev->pid, preempt_count());
3296
3297 debug_show_held_locks(prev);
3298 print_modules();
3299 if (irqs_disabled())
3300 print_irqtrace_events(prev);
3301 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3302 && in_atomic_preempt_off()) {
3303 pr_err("Preemption disabled at:");
3304 print_ip_sym(preempt_disable_ip);
3305 pr_cont("\n");
3306 }
3307 if (panic_on_warn)
3308 panic("scheduling while atomic\n");
3309
3310 dump_stack();
3311 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3312}
3313
3314
3315
3316
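/*
 * Various schedule()-time debugging checks and statistics:
 */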
3317static inline void schedule_debug(struct task_struct *prev)
3318{
3319#ifdef CONFIG_SCHED_STACK_END_CHECK
3320 if (task_stack_end_corrupted(prev))
3321 panic("corrupted stack end detected inside scheduler\n");
3322#endif
3323
3324 if (unlikely(in_atomic_preempt_off())) {
3325 __schedule_bug(prev);
3326 preempt_count_set(PREEMPT_DISABLED);
3327 }
3328 rcu_sleep_check();
3329
3330 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3331
3332 schedstat_inc(this_rq()->sched_count);
3333}
3334
3335
3336
3337
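/*
 * Pick up the highest-prio task:
 */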
3338static inline struct task_struct *
3339pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
3340{
3341 const struct sched_class *class;
3342 struct task_struct *p;
3343
 /*
  * Optimization: we know that if all tasks are in the fair class we can
  * call that function directly, but only if the @prev task wasn't of a
  * higher scheduling class, because otherwise those lose the
  * opportunity to pull in more work from other CPUs.
  */
3350 if (likely((prev->sched_class == &idle_sched_class ||
3351 prev->sched_class == &fair_sched_class) &&
3352 rq->nr_running == rq->cfs.h_nr_running)) {
3353
3354 p = fair_sched_class.pick_next_task(rq, prev, rf);
3355 if (unlikely(p == RETRY_TASK))
3356 goto again;
3357
 /* Assumes fair_sched_class->next == idle_sched_class */
3359 if (unlikely(!p))
3360 p = idle_sched_class.pick_next_task(rq, prev, rf);
3361
3362 return p;
3363 }
3364
3365again:
3366 for_each_class(class) {
3367 p = class->pick_next_task(rq, prev, rf);
3368 if (p) {
3369 if (unlikely(p == RETRY_TASK))
3370 goto again;
3371 return p;
3372 }
3373 }
3374
 /* The idle class should always have a runnable task: */
3376 BUG();
3377}
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
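/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. To drive preemption between tasks, the scheduler sets the
 *      flag in the timer interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it. If the woken task preempts
 *      the current one, the wakeup sets TIF_NEED_RESCHED and schedule()
 *      gets called on the nearest possible occasion: at the next
 *      preempt_enable() or return from interrupt (CONFIG_PREEMPT=y),
 *      otherwise at the next cond_resched(), explicit schedule() call,
 *      or return to user-space.
 *
 * WARNING: must be called with preemption disabled!
 */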
3418static void __sched notrace __schedule(bool preempt)
3419{
3420 struct task_struct *prev, *next;
3421 unsigned long *switch_count;
3422 struct rq_flags rf;
3423 struct rq *rq;
3424 int cpu;
3425
3426 cpu = smp_processor_id();
3427 rq = cpu_rq(cpu);
3428 prev = rq->curr;
3429
3430 schedule_debug(prev);
3431
3432 if (sched_feat(HRTICK))
3433 hrtick_clear(rq);
3434
3435 local_irq_disable();
3436 rcu_note_context_switch(preempt);
3437
 /*
  * Make sure that signal_pending_state()->signal_pending() below
  * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
  * done by the caller to avoid the race with signal_wake_up().
  *
  * The membarrier system call requires a full memory barrier
  * after coming from user-space, before storing to rq->curr.
  */
3446 rq_lock(rq, &rf);
3447 smp_mb__after_spinlock();
3448
 /* Promote REQ to ACT */
3450 rq->clock_update_flags <<= 1;
3451 update_rq_clock(rq);
3452
3453 switch_count = &prev->nivcsw;
3454 if (!preempt && prev->state) {
3455 if (unlikely(signal_pending_state(prev->state, prev))) {
3456 prev->state = TASK_RUNNING;
3457 } else {
3458 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3459 prev->on_rq = 0;
3460
3461 if (prev->in_iowait) {
3462 atomic_inc(&rq->nr_iowait);
3463 delayacct_blkio_start();
3464 }
3465
 /*
  * If a worker went to sleep, notify and ask workqueue
  * whether it wants to wake up a task to maintain
  * concurrency.
  */
3471 if (prev->flags & PF_WQ_WORKER) {
3472 struct task_struct *to_wakeup;
3473
3474 to_wakeup = wq_worker_sleeping(prev);
3475 if (to_wakeup)
3476 try_to_wake_up_local(to_wakeup, &rf);
3477 }
3478 }
3479 switch_count = &prev->nvcsw;
3480 }
3481
3482 next = pick_next_task(rq, prev, &rf);
3483 clear_tsk_need_resched(prev);
3484 clear_preempt_need_resched();
3485
3486 if (likely(prev != next)) {
3487 rq->nr_switches++;
3488 rq->curr = next;
 /*
  * The membarrier system call requires each architecture
  * to have a full memory barrier after updating
  * rq->curr, before returning to user-space.
  *
  * The barrier is provided in different ways on the various
  * architectures: by switch_mm()/mmdrop(), by finish_lock_switch()
  * where spin_unlock is a full barrier, or by switch_to() on
  * weakly-ordered architectures such as arm64.
  */
3503 ++*switch_count;
3504
3505 trace_sched_switch(preempt, prev, next);
3506
 /* Also unlocks the rq: */
3508 rq = context_switch(rq, prev, next, &rf);
3509 } else {
3510 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3511 rq_unlock_irq(rq, &rf);
3512 }
3513
3514 balance_callback(rq);
3515}
3516
3517void __noreturn do_task_dead(void)
3518{
 /* Causes final put_task_struct in finish_task_switch(): */
3520 set_special_state(TASK_DEAD);
3521
 /* Tell freezer to ignore us: */
3523 current->flags |= PF_NOFREEZE;
3524
3525 __schedule(false);
3526 BUG();
3527
3528
3529 for (;;)
3530 cpu_relax();
3531}
3532
3533static inline void sched_submit_work(struct task_struct *tsk)
3534{
3535 if (!tsk->state || tsk_is_pi_blocked(tsk))
3536 return;
3537
3538
3539
3540
3541 if (blk_needs_flush_plug(tsk))
3542 blk_schedule_flush_plug(tsk);
3543}
3544
3545asmlinkage __visible void __sched schedule(void)
3546{
3547 struct task_struct *tsk = current;
3548
3549 sched_submit_work(tsk);
3550 do {
3551 preempt_disable();
3552 __schedule(false);
3553 sched_preempt_enable_no_resched();
3554 } while (need_resched());
3555}
3556EXPORT_SYMBOL(schedule);
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
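/*
 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
 * state (have scheduled out non-voluntarily) by making sure that all
 * tasks have either left the run queue or have gone into user space.
 * As idle tasks do not do either, they must not ever be preempted
 * (schedule out non-voluntarily).
 *
 * schedule_idle() is similar to schedule_preempt_disabled() except that it
 * never enables preemption because it does not call sched_submit_work().
 */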
3568void __sched schedule_idle(void)
3569{
3570
3571
3572
3573
3574
3575
3576
3577 WARN_ON_ONCE(current->state);
3578 do {
3579 __schedule(false);
3580 } while (need_resched());
3581}
3582
3583#ifdef CONFIG_CONTEXT_TRACKING
3584asmlinkage __visible void __sched schedule_user(void)
3585{
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596 enum ctx_state prev_state = exception_enter();
3597 schedule();
3598 exception_exit(prev_state);
3599}
3600#endif
3601
3602
3603
3604
3605
3606
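/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */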
3607void __sched schedule_preempt_disabled(void)
3608{
3609 sched_preempt_enable_no_resched();
3610 schedule();
3611 preempt_disable();
3612}
3613
3614static void __sched notrace preempt_schedule_common(void)
3615{
3616 do {
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630 preempt_disable_notrace();
3631 preempt_latency_start(1);
3632 __schedule(true);
3633 preempt_latency_stop(1);
3634 preempt_enable_no_resched_notrace();
3635
3636
3637
3638
3639
3640 } while (need_resched());
3641}
3642
3643#ifdef CONFIG_PREEMPT
3644
3645
3646
3647
3648
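/*
 * This is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */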
3649asmlinkage __visible void __sched notrace preempt_schedule(void)
3650{
3651
3652
3653
3654
3655 if (likely(!preemptible()))
3656 return;
3657
3658 preempt_schedule_common();
3659}
3660NOKPROBE_SYMBOL(preempt_schedule);
3661EXPORT_SYMBOL(preempt_schedule);
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
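/**
 * preempt_schedule_notrace - preempt_schedule called by tracing
 *
 * The tracing infrastructure uses preempt_enable_notrace to prevent
 * recursion and tracing preempt enabling caused by the tracing
 * infrastructure itself. But as tracing can happen in areas coming
 * from userspace or just about to enter userspace, a preempt enable
 * can occur before user_exit() is called. This will cause the scheduler
 * to be called when the system is still in usermode.
 *
 * To prevent this, the preempt_enable_notrace will use this function
 * instead of preempt_schedule() to exit user context if needed before
 * calling the scheduler.
 */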
3677asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3678{
3679 enum ctx_state prev_ctx;
3680
3681 if (likely(!preemptible()))
3682 return;
3683
3684 do {
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698 preempt_disable_notrace();
3699 preempt_latency_start(1);
3700
3701
3702
3703
3704
3705 prev_ctx = exception_enter();
3706 __schedule(true);
3707 exception_exit(prev_ctx);
3708
3709 preempt_latency_stop(1);
3710 preempt_enable_no_resched_notrace();
3711 } while (need_resched());
3712}
3713EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3714
3715#endif
3716
3717
3718
3719
3720
3721
3722
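/*
 * This is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and returns with irqs disabled. This will
 * protect us against recursive calling from irq.
 */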
3723asmlinkage __visible void __sched preempt_schedule_irq(void)
3724{
3725 enum ctx_state prev_state;
3726
3727
3728 BUG_ON(preempt_count() || !irqs_disabled());
3729
3730 prev_state = exception_enter();
3731
3732 do {
3733 preempt_disable();
3734 local_irq_enable();
3735 __schedule(true);
3736 local_irq_disable();
3737 sched_preempt_enable_no_resched();
3738 } while (need_resched());
3739
3740 exception_exit(prev_state);
3741}
3742
3743int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
3744 void *key)
3745{
3746 return try_to_wake_up(curr->private, mode, wake_flags);
3747}
3748EXPORT_SYMBOL(default_wake_function);
3749
3750#ifdef CONFIG_RT_MUTEXES
3751
3752static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
3753{
3754 if (pi_task)
3755 prio = min(prio, pi_task->prio);
3756
3757 return prio;
3758}
3759
3760static inline int rt_effective_prio(struct task_struct *p, int prio)
3761{
3762 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3763
3764 return __rt_effective_prio(pi_task, prio);
3765}
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
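/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic.
 */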
3778void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
3779{
3780 int prio, oldprio, queued, running, queue_flag =
3781 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
3782 const struct sched_class *prev_class;
3783 struct rq_flags rf;
3784 struct rq *rq;
3785
3786
3787 prio = __rt_effective_prio(pi_task, p->normal_prio);
3788
3789
3790
3791
3792 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
3793 return;
3794
3795 rq = __task_rq_lock(p, &rf);
3796 update_rq_clock(rq);
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807 p->pi_top_task = pi_task;
3808
3809
3810
3811
3812 if (prio == p->prio && !dl_prio(prio))
3813 goto out_unlock;
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827 if (unlikely(p == rq->idle)) {
3828 WARN_ON(p != rq->curr);
3829 WARN_ON(p->pi_blocked_on);
3830 goto out_unlock;
3831 }
3832
3833 trace_sched_pi_setprio(p, pi_task);
3834 oldprio = p->prio;
3835
3836 if (oldprio == prio)
3837 queue_flag &= ~DEQUEUE_MOVE;
3838
3839 prev_class = p->sched_class;
3840 queued = task_on_rq_queued(p);
3841 running = task_current(rq, p);
3842 if (queued)
3843 dequeue_task(rq, p, queue_flag);
3844 if (running)
3845 put_prev_task(rq, p);
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856 if (dl_prio(prio)) {
3857 if (!dl_prio(p->normal_prio) ||
3858 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3859 p->dl.dl_boosted = 1;
3860 queue_flag |= ENQUEUE_REPLENISH;
3861 } else
3862 p->dl.dl_boosted = 0;
3863 p->sched_class = &dl_sched_class;
3864 } else if (rt_prio(prio)) {
3865 if (dl_prio(oldprio))
3866 p->dl.dl_boosted = 0;
3867 if (oldprio < prio)
3868 queue_flag |= ENQUEUE_HEAD;
3869 p->sched_class = &rt_sched_class;
3870 } else {
3871 if (dl_prio(oldprio))
3872 p->dl.dl_boosted = 0;
3873 if (rt_prio(oldprio))
3874 p->rt.timeout = 0;
3875 p->sched_class = &fair_sched_class;
3876 }
3877
3878 p->prio = prio;
3879
3880 if (queued)
3881 enqueue_task(rq, p, queue_flag);
3882 if (running)
3883 set_curr_task(rq, p);
3884
3885 check_class_changed(rq, p, prev_class, oldprio);
3886out_unlock:
3887
3888 preempt_disable();
3889 __task_rq_unlock(rq, &rf);
3890
3891 balance_callback(rq);
3892 preempt_enable();
3893}
3894#else
3895static inline int rt_effective_prio(struct task_struct *p, int prio)
3896{
3897 return prio;
3898}
3899#endif
3900
3901void set_user_nice(struct task_struct *p, long nice)
3902{
3903 bool queued, running;
3904 int old_prio, delta;
3905 struct rq_flags rf;
3906 struct rq *rq;
3907
3908 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3909 return;
3910
3911
3912
3913
3914 rq = task_rq_lock(p, &rf);
3915 update_rq_clock(rq);
3916
3917
3918
3919
3920
3921
3922
3923 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3924 p->static_prio = NICE_TO_PRIO(nice);
3925 goto out_unlock;
3926 }
3927 queued = task_on_rq_queued(p);
3928 running = task_current(rq, p);
3929 if (queued)
3930 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
3931 if (running)
3932 put_prev_task(rq, p);
3933
3934 p->static_prio = NICE_TO_PRIO(nice);
3935 set_load_weight(p, true);
3936 old_prio = p->prio;
3937 p->prio = effective_prio(p);
3938 delta = p->prio - old_prio;
3939
3940 if (queued) {
3941 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
3942
3943
3944
3945
3946 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3947 resched_curr(rq);
3948 }
3949 if (running)
3950 set_curr_task(rq, p);
3951out_unlock:
3952 task_rq_unlock(rq, p, &rf);
3953}
3954EXPORT_SYMBOL(set_user_nice);
3955
3956
3957
3958
3959
3960
3961int can_nice(const struct task_struct *p, const int nice)
3962{
3963
3964 int nice_rlim = nice_to_rlimit(nice);
3965
3966 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3967 capable(CAP_SYS_NICE));
3968}
3969
3970#ifdef __ARCH_WANT_SYS_NICE
3971
3972
3973
3974
3975
3976
3977
3978
3979SYSCALL_DEFINE1(nice, int, increment)
3980{
3981 long nice, retval;
3982
3983
3984
3985
3986
3987
3988 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3989 nice = task_nice(current) + increment;
3990
3991 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3992 if (increment < 0 && !can_nice(current, nice))
3993 return -EPERM;
3994
3995 retval = security_task_setnice(current, nice);
3996 if (retval)
3997 return retval;
3998
3999 set_user_nice(current, nice);
4000 return 0;
4001}
4002
4003#endif
4004
4005
4006
4007
4008
4009
4010
4011
4012
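/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 */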
4013int task_prio(const struct task_struct *p)
4014{
4015 return p->prio - MAX_RT_PRIO;
4016}
4017
4018
4019
4020
4021
4022
4023
4024int idle_cpu(int cpu)
4025{
4026 struct rq *rq = cpu_rq(cpu);
4027
4028 if (rq->curr != rq->idle)
4029 return 0;
4030
4031 if (rq->nr_running)
4032 return 0;
4033
4034#ifdef CONFIG_SMP
4035 if (!llist_empty(&rq->wake_list))
4036 return 0;
4037#endif
4038
4039 return 1;
4040}
4041
4042
4043
4044
4045
4046
4047
4048int available_idle_cpu(int cpu)
4049{
4050 if (!idle_cpu(cpu))
4051 return 0;
4052
4053 if (vcpu_is_preempted(cpu))
4054 return 0;
4055
4056 return 1;
4057}
4058
4059
4060
4061
4062
4063
4064
4065struct task_struct *idle_task(int cpu)
4066{
4067 return cpu_rq(cpu)->idle;
4068}
4069
4070
4071
4072
4073
4074
4075
4076static struct task_struct *find_process_by_pid(pid_t pid)
4077{
4078 return pid ? find_task_by_vpid(pid) : current;
4079}
4080
4081
4082
4083
4084
4085#define SETPARAM_POLICY -1
4086
4087static void __setscheduler_params(struct task_struct *p,
4088 const struct sched_attr *attr)
4089{
4090 int policy = attr->sched_policy;
4091
4092 if (policy == SETPARAM_POLICY)
4093 policy = p->policy;
4094
4095 p->policy = policy;
4096
4097 if (dl_policy(policy))
4098 __setparam_dl(p, attr);
4099 else if (fair_policy(policy))
4100 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4101
4102
4103
4104
4105
4106
4107 p->rt_priority = attr->sched_priority;
4108 p->normal_prio = normal_prio(p);
4109 set_load_weight(p, true);
4110}
4111
4112
4113static void __setscheduler(struct rq *rq, struct task_struct *p,
4114 const struct sched_attr *attr, bool keep_boost)
4115{
4116 __setscheduler_params(p, attr);
4117
4118
4119
4120
4121
4122 p->prio = normal_prio(p);
4123 if (keep_boost)
4124 p->prio = rt_effective_prio(p, p->prio);
4125
4126 if (dl_prio(p->prio))
4127 p->sched_class = &dl_sched_class;
4128 else if (rt_prio(p->prio))
4129 p->sched_class = &rt_sched_class;
4130 else
4131 p->sched_class = &fair_sched_class;
4132}
4133
4134
4135
4136
4137static bool check_same_owner(struct task_struct *p)
4138{
4139 const struct cred *cred = current_cred(), *pcred;
4140 bool match;
4141
4142 rcu_read_lock();
4143 pcred = __task_cred(p);
4144 match = (uid_eq(cred->euid, pcred->euid) ||
4145 uid_eq(cred->euid, pcred->uid));
4146 rcu_read_unlock();
4147 return match;
4148}
4149
4150static int __sched_setscheduler(struct task_struct *p,
4151 const struct sched_attr *attr,
4152 bool user, bool pi)
4153{
4154 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4155 MAX_RT_PRIO - 1 - attr->sched_priority;
4156 int retval, oldprio, oldpolicy = -1, queued, running;
4157 int new_effective_prio, policy = attr->sched_policy;
4158 const struct sched_class *prev_class;
4159 struct rq_flags rf;
4160 int reset_on_fork;
4161 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4162 struct rq *rq;
4163
4164
4165 BUG_ON(pi && in_interrupt());
4166recheck:
4167
4168 if (policy < 0) {
4169 reset_on_fork = p->sched_reset_on_fork;
4170 policy = oldpolicy = p->policy;
4171 } else {
4172 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4173
4174 if (!valid_policy(policy))
4175 return -EINVAL;
4176 }
4177
4178 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
4179 return -EINVAL;
4180
 /*
  * Valid priorities for SCHED_FIFO and SCHED_RR are
  * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
  * SCHED_BATCH and SCHED_IDLE is 0.
  */
4186 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4187 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4188 return -EINVAL;
4189 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4190 (rt_policy(policy) != (attr->sched_priority != 0)))
4191 return -EINVAL;
4192
 /*
  * Allow unprivileged RT tasks to decrease priority:
  */
4196 if (user && !capable(CAP_SYS_NICE)) {
4197 if (fair_policy(policy)) {
4198 if (attr->sched_nice < task_nice(p) &&
4199 !can_nice(p, attr->sched_nice))
4200 return -EPERM;
4201 }
4202
4203 if (rt_policy(policy)) {
4204 unsigned long rlim_rtprio =
4205 task_rlimit(p, RLIMIT_RTPRIO);
4206
4207
4208 if (policy != p->policy && !rlim_rtprio)
4209 return -EPERM;
4210
4211
4212 if (attr->sched_priority > p->rt_priority &&
4213 attr->sched_priority > rlim_rtprio)
4214 return -EPERM;
4215 }
4216
4217
4218
4219
4220
4221
4222
4223 if (dl_policy(policy))
4224 return -EPERM;
4225
4226
4227
4228
4229
4230 if (idle_policy(p->policy) && !idle_policy(policy)) {
4231 if (!can_nice(p, task_nice(p)))
4232 return -EPERM;
4233 }
4234
4235
4236 if (!check_same_owner(p))
4237 return -EPERM;
4238
4239
4240 if (p->sched_reset_on_fork && !reset_on_fork)
4241 return -EPERM;
4242 }
4243
4244 if (user) {
4245 if (attr->sched_flags & SCHED_FLAG_SUGOV)
4246 return -EINVAL;
4247
4248 retval = security_task_setscheduler(p);
4249 if (retval)
4250 return retval;
4251 }
4252
4253
4254
4255
4256
4257
4258
4259
4260 rq = task_rq_lock(p, &rf);
4261 update_rq_clock(rq);
4262
4263
4264
4265
4266 if (p == rq->stop) {
4267 task_rq_unlock(rq, p, &rf);
4268 return -EINVAL;
4269 }
4270
4271
4272
4273
4274
4275 if (unlikely(policy == p->policy)) {
4276 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4277 goto change;
4278 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4279 goto change;
4280 if (dl_policy(policy) && dl_param_changed(p, attr))
4281 goto change;
4282
4283 p->sched_reset_on_fork = reset_on_fork;
4284 task_rq_unlock(rq, p, &rf);
4285 return 0;
4286 }
4287change:
4288
4289 if (user) {
4290#ifdef CONFIG_RT_GROUP_SCHED
4291
4292
4293
4294
4295 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4296 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4297 !task_group_is_autogroup(task_group(p))) {
4298 task_rq_unlock(rq, p, &rf);
4299 return -EPERM;
4300 }
4301#endif
4302#ifdef CONFIG_SMP
4303 if (dl_bandwidth_enabled() && dl_policy(policy) &&
4304 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
4305 cpumask_t *span = rq->rd->span;
4306
4307
4308
4309
4310
4311
4312 if (!cpumask_subset(span, &p->cpus_allowed) ||
4313 rq->rd->dl_bw.bw == 0) {
4314 task_rq_unlock(rq, p, &rf);
4315 return -EPERM;
4316 }
4317 }
4318#endif
4319 }
4320
4321
4322 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4323 policy = oldpolicy = -1;
4324 task_rq_unlock(rq, p, &rf);
4325 goto recheck;
4326 }
4327
4328
4329
4330
4331
4332
4333 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4334 task_rq_unlock(rq, p, &rf);
4335 return -EBUSY;
4336 }
4337
4338 p->sched_reset_on_fork = reset_on_fork;
4339 oldprio = p->prio;
4340
4341 if (pi) {
4342
4343
4344
4345
4346
4347
4348
4349 new_effective_prio = rt_effective_prio(p, newprio);
4350 if (new_effective_prio == oldprio)
4351 queue_flags &= ~DEQUEUE_MOVE;
4352 }
4353
4354 queued = task_on_rq_queued(p);
4355 running = task_current(rq, p);
4356 if (queued)
4357 dequeue_task(rq, p, queue_flags);
4358 if (running)
4359 put_prev_task(rq, p);
4360
4361 prev_class = p->sched_class;
4362 __setscheduler(rq, p, attr, pi);
4363
4364 if (queued) {
4365
4366
4367
4368
4369 if (oldprio < p->prio)
4370 queue_flags |= ENQUEUE_HEAD;
4371
4372 enqueue_task(rq, p, queue_flags);
4373 }
4374 if (running)
4375 set_curr_task(rq, p);
4376
4377 check_class_changed(rq, p, prev_class, oldprio);
4378
4379
4380 preempt_disable();
4381 task_rq_unlock(rq, p, &rf);
4382
4383 if (pi)
4384 rt_mutex_adjust_pi(p);
4385
4386
4387 balance_callback(rq);
4388 preempt_enable();
4389
4390 return 0;
4391}
4392
4393static int _sched_setscheduler(struct task_struct *p, int policy,
4394 const struct sched_param *param, bool check)
4395{
4396 struct sched_attr attr = {
4397 .sched_policy = policy,
4398 .sched_priority = param->sched_priority,
4399 .sched_nice = PRIO_TO_NICE(p->static_prio),
4400 };
4401
4402
4403 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4404 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4405 policy &= ~SCHED_RESET_ON_FORK;
4406 attr.sched_policy = policy;
4407 }
4408
4409 return __sched_setscheduler(p, &attr, check, true);
4410}
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
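/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */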
4421int sched_setscheduler(struct task_struct *p, int policy,
4422 const struct sched_param *param)
4423{
4424 return _sched_setscheduler(p, policy, param, true);
4425}
4426EXPORT_SYMBOL_GPL(sched_setscheduler);
4427
4428int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4429{
4430 return __sched_setscheduler(p, attr, true, true);
4431}
4432EXPORT_SYMBOL_GPL(sched_setattr);
4433
4434int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
4435{
4436 return __sched_setscheduler(p, attr, false, true);
4437}
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4453 const struct sched_param *param)
4454{
4455 return _sched_setscheduler(p, policy, param, false);
4456}
4457EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4458
4459static int
4460do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4461{
4462 struct sched_param lparam;
4463 struct task_struct *p;
4464 int retval;
4465
4466 if (!param || pid < 0)
4467 return -EINVAL;
4468 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4469 return -EFAULT;
4470
4471 rcu_read_lock();
4472 retval = -ESRCH;
4473 p = find_process_by_pid(pid);
4474 if (p != NULL)
4475 retval = sched_setscheduler(p, policy, &lparam);
4476 rcu_read_unlock();
4477
4478 return retval;
4479}
4480
4481
4482
4483
4484static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
4485{
4486 u32 size;
4487 int ret;
4488
4489 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4490 return -EFAULT;
4491
4492
4493 memset(attr, 0, sizeof(*attr));
4494
4495 ret = get_user(size, &uattr->size);
4496 if (ret)
4497 return ret;
4498
4499
4500 if (size > PAGE_SIZE)
4501 goto err_size;
4502
4503
4504 if (!size)
4505 size = SCHED_ATTR_SIZE_VER0;
4506
4507 if (size < SCHED_ATTR_SIZE_VER0)
4508 goto err_size;
4509
4510
4511
4512
4513
4514
4515
4516 if (size > sizeof(*attr)) {
4517 unsigned char __user *addr;
4518 unsigned char __user *end;
4519 unsigned char val;
4520
4521 addr = (void __user *)uattr + sizeof(*attr);
4522 end = (void __user *)uattr + size;
4523
4524 for (; addr < end; addr++) {
4525 ret = get_user(val, addr);
4526 if (ret)
4527 return ret;
4528 if (val)
4529 goto err_size;
4530 }
4531 size = sizeof(*attr);
4532 }
4533
4534 ret = copy_from_user(attr, uattr, size);
4535 if (ret)
4536 return -EFAULT;
4537
4538
4539
4540
4541
4542 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4543
4544 return 0;
4545
4546err_size:
4547 put_user(sizeof(*attr), &uattr->size);
4548 return -E2BIG;
4549}
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
4560{
4561 if (policy < 0)
4562 return -EINVAL;
4563
4564 return do_sched_setscheduler(pid, policy, param);
4565}
4566
4567
4568
4569
4570
4571
4572
4573
4574SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4575{
4576 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4577}
4578
4579
4580
4581
4582
4583
4584
4585SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4586 unsigned int, flags)
4587{
4588 struct sched_attr attr;
4589 struct task_struct *p;
4590 int retval;
4591
4592 if (!uattr || pid < 0 || flags)
4593 return -EINVAL;
4594
4595 retval = sched_copy_attr(uattr, &attr);
4596 if (retval)
4597 return retval;
4598
4599 if ((int)attr.sched_policy < 0)
4600 return -EINVAL;
4601
4602 rcu_read_lock();
4603 retval = -ESRCH;
4604 p = find_process_by_pid(pid);
4605 if (p != NULL)
4606 retval = sched_setattr(p, &attr);
4607 rcu_read_unlock();
4608
4609 return retval;
4610}
4611
4612
4613
4614
4615
4616
4617
4618
4619SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4620{
4621 struct task_struct *p;
4622 int retval;
4623
4624 if (pid < 0)
4625 return -EINVAL;
4626
4627 retval = -ESRCH;
4628 rcu_read_lock();
4629 p = find_process_by_pid(pid);
4630 if (p) {
4631 retval = security_task_getscheduler(p);
4632 if (!retval)
4633 retval = p->policy
4634 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4635 }
4636 rcu_read_unlock();
4637 return retval;
4638}
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4649{
4650 struct sched_param lp = { .sched_priority = 0 };
4651 struct task_struct *p;
4652 int retval;
4653
4654 if (!param || pid < 0)
4655 return -EINVAL;
4656
4657 rcu_read_lock();
4658 p = find_process_by_pid(pid);
4659 retval = -ESRCH;
4660 if (!p)
4661 goto out_unlock;
4662
4663 retval = security_task_getscheduler(p);
4664 if (retval)
4665 goto out_unlock;
4666
4667 if (task_has_rt_policy(p))
4668 lp.sched_priority = p->rt_priority;
4669 rcu_read_unlock();
4670
4671
4672
4673
4674 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4675
4676 return retval;
4677
4678out_unlock:
4679 rcu_read_unlock();
4680 return retval;
4681}
4682
4683static int sched_read_attr(struct sched_attr __user *uattr,
4684 struct sched_attr *attr,
4685 unsigned int usize)
4686{
4687 int ret;
4688
4689 if (!access_ok(VERIFY_WRITE, uattr, usize))
4690 return -EFAULT;
4691
4692
4693
4694
4695
4696
4697 if (usize < sizeof(*attr)) {
4698 unsigned char *addr;
4699 unsigned char *end;
4700
4701 addr = (void *)attr + usize;
4702 end = (void *)attr + sizeof(*attr);
4703
4704 for (; addr < end; addr++) {
4705 if (*addr)
4706 return -EFBIG;
4707 }
4708
4709 attr->size = usize;
4710 }
4711
4712 ret = copy_to_user(uattr, attr, attr->size);
4713 if (ret)
4714 return -EFAULT;
4715
4716 return 0;
4717}
4718
4719
4720
4721
4722
4723
4724
4725
4726SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4727 unsigned int, size, unsigned int, flags)
4728{
4729 struct sched_attr attr = {
4730 .size = sizeof(struct sched_attr),
4731 };
4732 struct task_struct *p;
4733 int retval;
4734
4735 if (!uattr || pid < 0 || size > PAGE_SIZE ||
4736 size < SCHED_ATTR_SIZE_VER0 || flags)
4737 return -EINVAL;
4738
4739 rcu_read_lock();
4740 p = find_process_by_pid(pid);
4741 retval = -ESRCH;
4742 if (!p)
4743 goto out_unlock;
4744
4745 retval = security_task_getscheduler(p);
4746 if (retval)
4747 goto out_unlock;
4748
4749 attr.sched_policy = p->policy;
4750 if (p->sched_reset_on_fork)
4751 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4752 if (task_has_dl_policy(p))
4753 __getparam_dl(p, &attr);
4754 else if (task_has_rt_policy(p))
4755 attr.sched_priority = p->rt_priority;
4756 else
4757 attr.sched_nice = task_nice(p);
4758
4759 rcu_read_unlock();
4760
4761 retval = sched_read_attr(uattr, &attr, size);
4762 return retval;
4763
4764out_unlock:
4765 rcu_read_unlock();
4766 return retval;
4767}
4768
4769long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4770{
4771 cpumask_var_t cpus_allowed, new_mask;
4772 struct task_struct *p;
4773 int retval;
4774
4775 rcu_read_lock();
4776
4777 p = find_process_by_pid(pid);
4778 if (!p) {
4779 rcu_read_unlock();
4780 return -ESRCH;
4781 }
4782
4783
4784 get_task_struct(p);
4785 rcu_read_unlock();
4786
4787 if (p->flags & PF_NO_SETAFFINITY) {
4788 retval = -EINVAL;
4789 goto out_put_task;
4790 }
4791 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4792 retval = -ENOMEM;
4793 goto out_put_task;
4794 }
4795 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4796 retval = -ENOMEM;
4797 goto out_free_cpus_allowed;
4798 }
4799 retval = -EPERM;
4800 if (!check_same_owner(p)) {
4801 rcu_read_lock();
4802 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4803 rcu_read_unlock();
4804 goto out_free_new_mask;
4805 }
4806 rcu_read_unlock();
4807 }
4808
4809 retval = security_task_setscheduler(p);
4810 if (retval)
4811 goto out_free_new_mask;
4812
4813
4814 cpuset_cpus_allowed(p, cpus_allowed);
4815 cpumask_and(new_mask, in_mask, cpus_allowed);
4816
4817
4818
4819
4820
4821
4822
4823#ifdef CONFIG_SMP
4824 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4825 rcu_read_lock();
4826 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4827 retval = -EBUSY;
4828 rcu_read_unlock();
4829 goto out_free_new_mask;
4830 }
4831 rcu_read_unlock();
4832 }
4833#endif
4834again:
4835 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4836
4837 if (!retval) {
4838 cpuset_cpus_allowed(p, cpus_allowed);
4839 if (!cpumask_subset(new_mask, cpus_allowed)) {
4840
4841
4842
4843
4844
4845 cpumask_copy(new_mask, cpus_allowed);
4846 goto again;
4847 }
4848 }
4849out_free_new_mask:
4850 free_cpumask_var(new_mask);
4851out_free_cpus_allowed:
4852 free_cpumask_var(cpus_allowed);
4853out_put_task:
4854 put_task_struct(p);
4855 return retval;
4856}
4857
4858static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4859 struct cpumask *new_mask)
4860{
4861 if (len < cpumask_size())
4862 cpumask_clear(new_mask);
4863 else if (len > cpumask_size())
4864 len = cpumask_size();
4865
4866 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4867}
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4878 unsigned long __user *, user_mask_ptr)
4879{
4880 cpumask_var_t new_mask;
4881 int retval;
4882
4883 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4884 return -ENOMEM;
4885
4886 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4887 if (retval == 0)
4888 retval = sched_setaffinity(pid, new_mask);
4889 free_cpumask_var(new_mask);
4890 return retval;
4891}
4892
4893long sched_getaffinity(pid_t pid, struct cpumask *mask)
4894{
4895 struct task_struct *p;
4896 unsigned long flags;
4897 int retval;
4898
4899 rcu_read_lock();
4900
4901 retval = -ESRCH;
4902 p = find_process_by_pid(pid);
4903 if (!p)
4904 goto out_unlock;
4905
4906 retval = security_task_getscheduler(p);
4907 if (retval)
4908 goto out_unlock;
4909
4910 raw_spin_lock_irqsave(&p->pi_lock, flags);
4911 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4912 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4913
4914out_unlock:
4915 rcu_read_unlock();
4916
4917 return retval;
4918}
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4930 unsigned long __user *, user_mask_ptr)
4931{
4932 int ret;
4933 cpumask_var_t mask;
4934
4935 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4936 return -EINVAL;
4937 if (len & (sizeof(unsigned long)-1))
4938 return -EINVAL;
4939
4940 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4941 return -ENOMEM;
4942
4943 ret = sched_getaffinity(pid, mask);
4944 if (ret == 0) {
4945 unsigned int retlen = min(len, cpumask_size());
4946
4947 if (copy_to_user(user_mask_ptr, mask, retlen))
4948 ret = -EFAULT;
4949 else
4950 ret = retlen;
4951 }
4952 free_cpumask_var(mask);
4953
4954 return ret;
4955}
4956
4957
4958
4959
4960
4961
4962
4963
4964
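/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */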
4965static void do_sched_yield(void)
4966{
4967 struct rq_flags rf;
4968 struct rq *rq;
4969
4970 local_irq_disable();
4971 rq = this_rq();
4972 rq_lock(rq, &rf);
4973
4974 schedstat_inc(rq->yld_count);
4975 current->sched_class->yield_task(rq);
4976
4977
4978
4979
4980
4981 preempt_disable();
4982 rq_unlock(rq, &rf);
4983 sched_preempt_enable_no_resched();
4984
4985 schedule();
4986}
4987
4988SYSCALL_DEFINE0(sched_yield)
4989{
4990 do_sched_yield();
4991 return 0;
4992}
4993
4994#ifndef CONFIG_PREEMPT
4995int __sched _cond_resched(void)
4996{
4997 if (should_resched(0)) {
4998 preempt_schedule_common();
4999 return 1;
5000 }
5001 rcu_all_qs();
5002 return 0;
5003}
5004EXPORT_SYMBOL(_cond_resched);
5005#endif
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015int __cond_resched_lock(spinlock_t *lock)
5016{
5017 int resched = should_resched(PREEMPT_LOCK_OFFSET);
5018 int ret = 0;
5019
5020 lockdep_assert_held(lock);
5021
5022 if (spin_needbreak(lock) || resched) {
5023 spin_unlock(lock);
5024 if (resched)
5025 preempt_schedule_common();
5026 else
5027 cpu_relax();
5028 ret = 1;
5029 spin_lock(lock);
5030 }
5031 return ret;
5032}
5033EXPORT_SYMBOL(__cond_resched_lock);
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
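/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run, if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 * while (!event)
 *	yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */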
5057void __sched yield(void)
5058{
5059 set_current_state(TASK_RUNNING);
5060 do_sched_yield();
5061}
5062EXPORT_SYMBOL(yield);
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
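/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */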
5079int __sched yield_to(struct task_struct *p, bool preempt)
5080{
5081 struct task_struct *curr = current;
5082 struct rq *rq, *p_rq;
5083 unsigned long flags;
5084 int yielded = 0;
5085
5086 local_irq_save(flags);
5087 rq = this_rq();
5088
5089again:
5090 p_rq = task_rq(p);
5091
5092
5093
5094
5095 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5096 yielded = -ESRCH;
5097 goto out_irq;
5098 }
5099
5100 double_rq_lock(rq, p_rq);
5101 if (task_rq(p) != p_rq) {
5102 double_rq_unlock(rq, p_rq);
5103 goto again;
5104 }
5105
5106 if (!curr->sched_class->yield_to_task)
5107 goto out_unlock;
5108
5109 if (curr->sched_class != p->sched_class)
5110 goto out_unlock;
5111
5112 if (task_running(p_rq, p) || p->state)
5113 goto out_unlock;
5114
5115 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5116 if (yielded) {
5117 schedstat_inc(rq->yld_count);
5118
5119
5120
5121
5122 if (preempt && rq != p_rq)
5123 resched_curr(p_rq);
5124 }
5125
5126out_unlock:
5127 double_rq_unlock(rq, p_rq);
5128out_irq:
5129 local_irq_restore(flags);
5130
5131 if (yielded > 0)
5132 schedule();
5133
5134 return yielded;
5135}
5136EXPORT_SYMBOL_GPL(yield_to);
5137
5138int io_schedule_prepare(void)
5139{
5140 int old_iowait = current->in_iowait;
5141
5142 current->in_iowait = 1;
5143 blk_schedule_flush_plug(current);
5144
5145 return old_iowait;
5146}
5147
5148void io_schedule_finish(int token)
5149{
5150 current->in_iowait = token;
5151}
5152
5153
5154
5155
5156
5157long __sched io_schedule_timeout(long timeout)
5158{
5159 int token;
5160 long ret;
5161
5162 token = io_schedule_prepare();
5163 ret = schedule_timeout(timeout);
5164 io_schedule_finish(token);
5165
5166 return ret;
5167}
5168EXPORT_SYMBOL(io_schedule_timeout);
5169
5170void io_schedule(void)
5171{
5172 int token;
5173
5174 token = io_schedule_prepare();
5175 schedule();
5176 io_schedule_finish(token);
5177}
5178EXPORT_SYMBOL(io_schedule);
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5189{
5190 int ret = -EINVAL;
5191
5192 switch (policy) {
5193 case SCHED_FIFO:
5194 case SCHED_RR:
5195 ret = MAX_USER_RT_PRIO-1;
5196 break;
5197 case SCHED_DEADLINE:
5198 case SCHED_NORMAL:
5199 case SCHED_BATCH:
5200 case SCHED_IDLE:
5201 ret = 0;
5202 break;
5203 }
5204 return ret;
5205}
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5216{
5217 int ret = -EINVAL;
5218
5219 switch (policy) {
5220 case SCHED_FIFO:
5221 case SCHED_RR:
5222 ret = 1;
5223 break;
5224 case SCHED_DEADLINE:
5225 case SCHED_NORMAL:
5226 case SCHED_BATCH:
5227 case SCHED_IDLE:
5228 ret = 0;
5229 }
5230 return ret;
5231}
5232
5233static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
5234{
5235 struct task_struct *p;
5236 unsigned int time_slice;
5237 struct rq_flags rf;
5238 struct rq *rq;
5239 int retval;
5240
5241 if (pid < 0)
5242 return -EINVAL;
5243
5244 retval = -ESRCH;
5245 rcu_read_lock();
5246 p = find_process_by_pid(pid);
5247 if (!p)
5248 goto out_unlock;
5249
5250 retval = security_task_getscheduler(p);
5251 if (retval)
5252 goto out_unlock;
5253
5254 rq = task_rq_lock(p, &rf);
5255 time_slice = 0;
5256 if (p->sched_class->get_rr_interval)
5257 time_slice = p->sched_class->get_rr_interval(rq, p);
5258 task_rq_unlock(rq, p, &rf);
5259
5260 rcu_read_unlock();
5261 jiffies_to_timespec64(time_slice, t);
5262 return 0;
5263
5264out_unlock:
5265 rcu_read_unlock();
5266 return retval;
5267}
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5281 struct timespec __user *, interval)
5282{
5283 struct timespec64 t;
5284 int retval = sched_rr_get_interval(pid, &t);
5285
5286 if (retval == 0)
5287 retval = put_timespec64(&t, interval);
5288
5289 return retval;
5290}
5291
5292#ifdef CONFIG_COMPAT
5293COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
5294 compat_pid_t, pid,
5295 struct compat_timespec __user *, interval)
5296{
5297 struct timespec64 t;
5298 int retval = sched_rr_get_interval(pid, &t);
5299
5300 if (retval == 0)
5301 retval = compat_put_timespec64(&t, interval);
5302 return retval;
5303}
5304#endif
5305
5306void sched_show_task(struct task_struct *p)
5307{
5308 unsigned long free = 0;
5309 int ppid;
5310
5311 if (!try_get_task_stack(p))
5312 return;
5313
5314 printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
5315
5316 if (p->state == TASK_RUNNING)
5317 printk(KERN_CONT " running task ");
5318#ifdef CONFIG_DEBUG_STACK_USAGE
5319 free = stack_not_used(p);
5320#endif
5321 ppid = 0;
5322 rcu_read_lock();
5323 if (pid_alive(p))
5324 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5325 rcu_read_unlock();
5326 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5327 task_pid_nr(p), ppid,
5328 (unsigned long)task_thread_info(p)->flags);
5329
5330 print_worker_info(KERN_INFO, p);
5331 show_stack(p, NULL);
5332 put_task_stack(p);
5333}
5334EXPORT_SYMBOL_GPL(sched_show_task);
5335
5336static inline bool
5337state_filter_match(unsigned long state_filter, struct task_struct *p)
5338{
5339
5340 if (!state_filter)
5341 return true;
5342
5343
5344 if (!(p->state & state_filter))
5345 return false;
5346
 /*
  * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
  * TASK_KILLABLE).
  */
5351 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
5352 return false;
5353
5354 return true;
5355}
5356
5357
5358void show_state_filter(unsigned long state_filter)
5359{
5360 struct task_struct *g, *p;
5361
#if BITS_PER_LONG == 32
 printk(KERN_INFO
  "  task                PC stack   pid father\n");
#else
 printk(KERN_INFO
  "  task                        PC stack   pid father\n");
#endif
5369 rcu_read_lock();
5370 for_each_process_thread(g, p) {
5371
5372
5373
5374
5375
5376
5377
5378 touch_nmi_watchdog();
5379 touch_all_softlockup_watchdogs();
5380 if (state_filter_match(state_filter, p))
5381 sched_show_task(p);
5382 }
5383
5384#ifdef CONFIG_SCHED_DEBUG
5385 if (!state_filter)
5386 sysrq_sched_debug_show();
5387#endif
5388 rcu_read_unlock();
5389
5390
5391
5392 if (!state_filter)
5393 debug_show_all_locks();
5394}
5395
5396
5397
5398
5399
5400
5401
5402
5403
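/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */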
5404void init_idle(struct task_struct *idle, int cpu)
5405{
5406 struct rq *rq = cpu_rq(cpu);
5407 unsigned long flags;
5408
5409 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5410 raw_spin_lock(&rq->lock);
5411
5412 __sched_fork(0, idle);
5413 idle->state = TASK_RUNNING;
5414 idle->se.exec_start = sched_clock();
5415 idle->flags |= PF_IDLE;
5416
5417 kasan_unpoison_task_stack(idle);
5418
5419#ifdef CONFIG_SMP
5420
5421
5422
5423
5424
5425
5426 set_cpus_allowed_common(idle, cpumask_of(cpu));
5427#endif
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438 rcu_read_lock();
5439 __set_task_cpu(idle, cpu);
5440 rcu_read_unlock();
5441
5442 rq->curr = rq->idle = idle;
5443 idle->on_rq = TASK_ON_RQ_QUEUED;
5444#ifdef CONFIG_SMP
5445 idle->on_cpu = 1;
5446#endif
5447 raw_spin_unlock(&rq->lock);
5448 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5449
5450
5451 init_idle_preempt_count(idle, cpu);
5452
5453
5454
5455
5456 idle->sched_class = &idle_sched_class;
5457 ftrace_graph_init_idle_task(idle, cpu);
5458 vtime_init_idle(idle, cpu);
5459#ifdef CONFIG_SMP
5460 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5461#endif
5462}
5463
5464#ifdef CONFIG_SMP
5465
5466int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5467 const struct cpumask *trial)
5468{
5469 int ret = 1;
5470
5471 if (!cpumask_weight(cur))
5472 return ret;
5473
5474 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
5475
5476 return ret;
5477}
5478
5479int task_can_attach(struct task_struct *p,
5480 const struct cpumask *cs_cpus_allowed)
5481{
5482 int ret = 0;
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493 if (p->flags & PF_NO_SETAFFINITY) {
5494 ret = -EINVAL;
5495 goto out;
5496 }
5497
5498 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5499 cs_cpus_allowed))
5500 ret = dl_task_can_attach(p, cs_cpus_allowed);
5501
5502out:
5503 return ret;
5504}
5505
5506bool sched_smp_initialized __read_mostly;
5507
5508#ifdef CONFIG_NUMA_BALANCING
5509
5510int migrate_task_to(struct task_struct *p, int target_cpu)
5511{
5512 struct migration_arg arg = { p, target_cpu };
5513 int curr_cpu = task_cpu(p);
5514
5515 if (curr_cpu == target_cpu)
5516 return 0;
5517
5518 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
5519 return -EINVAL;
5520
5521
5522
5523 trace_sched_move_numa(p, curr_cpu, target_cpu);
5524 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5525}
5526
5527
5528
5529
5530
5531void sched_setnuma(struct task_struct *p, int nid)
5532{
5533 bool queued, running;
5534 struct rq_flags rf;
5535 struct rq *rq;
5536
5537 rq = task_rq_lock(p, &rf);
5538 queued = task_on_rq_queued(p);
5539 running = task_current(rq, p);
5540
5541 if (queued)
5542 dequeue_task(rq, p, DEQUEUE_SAVE);
5543 if (running)
5544 put_prev_task(rq, p);
5545
5546 p->numa_preferred_nid = nid;
5547
5548 if (queued)
5549 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5550 if (running)
5551 set_curr_task(rq, p);
5552 task_rq_unlock(rq, p, &rf);
5553}
5554#endif
5555
5556#ifdef CONFIG_HOTPLUG_CPU
5557
5558
5559
5560
5561void idle_task_exit(void)
5562{
5563 struct mm_struct *mm = current->active_mm;
5564
5565 BUG_ON(cpu_online(smp_processor_id()));
5566
5567 if (mm != &init_mm) {
5568 switch_mm(mm, &init_mm, current);
5569 current->active_mm = &init_mm;
5570 finish_arch_post_lock_switch();
5571 }
5572 mmdrop(mm);
5573}
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584static void calc_load_migrate(struct rq *rq)
5585{
5586 long delta = calc_load_fold_active(rq, 1);
5587 if (delta)
5588 atomic_long_add(delta, &calc_load_tasks);
5589}
5590
5591static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5592{
5593}
5594
5595static const struct sched_class fake_sched_class = {
5596 .put_prev_task = put_prev_task_fake,
5597};
5598
5599static struct task_struct fake_task = {
5600
5601
5602
5603 .prio = MAX_PRIO + 1,
5604 .sched_class = &fake_sched_class,
5605};
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5616{
5617 struct rq *rq = dead_rq;
5618 struct task_struct *next, *stop = rq->stop;
5619 struct rq_flags orf = *rf;
5620 int dest_cpu;
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631 rq->stop = NULL;
5632
5633
5634
5635
5636
5637
5638 update_rq_clock(rq);
5639
5640 for (;;) {
5641
5642
5643
5644
5645 if (rq->nr_running == 1)
5646 break;
5647
5648
5649
5650
5651 next = pick_next_task(rq, &fake_task, rf);
5652 BUG_ON(!next);
5653 put_prev_task(rq, next);
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664 rq_unlock(rq, rf);
5665 raw_spin_lock(&next->pi_lock);
5666 rq_relock(rq, rf);
5667
5668
5669
5670
5671
5672
5673 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5674 raw_spin_unlock(&next->pi_lock);
5675 continue;
5676 }
5677
5678
5679 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5680 rq = __migrate_task(rq, rf, next, dest_cpu);
5681 if (rq != dead_rq) {
5682 rq_unlock(rq, rf);
5683 rq = dead_rq;
5684 *rf = orf;
5685 rq_relock(rq, rf);
5686 }
5687 raw_spin_unlock(&next->pi_lock);
5688 }
5689
5690 rq->stop = stop;
5691}
5692#endif
5693
5694void set_rq_online(struct rq *rq)
5695{
5696 if (!rq->online) {
5697 const struct sched_class *class;
5698
5699 cpumask_set_cpu(rq->cpu, rq->rd->online);
5700 rq->online = 1;
5701
5702 for_each_class(class) {
5703 if (class->rq_online)
5704 class->rq_online(rq);
5705 }
5706 }
5707}
5708
5709void set_rq_offline(struct rq *rq)
5710{
5711 if (rq->online) {
5712 const struct sched_class *class;
5713
5714 for_each_class(class) {
5715 if (class->rq_offline)
5716 class->rq_offline(rq);
5717 }
5718
5719 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5720 rq->online = 0;
5721 }
5722}
5723
5724static void set_cpu_rq_start_time(unsigned int cpu)
5725{
5726 struct rq *rq = cpu_rq(cpu);
5727
5728 rq->age_stamp = sched_clock_cpu(cpu);
5729}
5730
5731
5732
5733
5734static int num_cpus_frozen;
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744static void cpuset_cpu_active(void)
5745{
5746 if (cpuhp_tasks_frozen) {
5747
5748
5749
5750
5751
5752
5753 partition_sched_domains(1, NULL, NULL);
5754 if (--num_cpus_frozen)
5755 return;
5756
5757
5758
5759
5760
5761 cpuset_force_rebuild();
5762 }
5763 cpuset_update_active_cpus();
5764}
5765
5766static int cpuset_cpu_inactive(unsigned int cpu)
5767{
5768 if (!cpuhp_tasks_frozen) {
5769 if (dl_cpu_busy(cpu))
5770 return -EBUSY;
5771 cpuset_update_active_cpus();
5772 } else {
5773 num_cpus_frozen++;
5774 partition_sched_domains(1, NULL, NULL);
5775 }
5776 return 0;
5777}
5778
5779int sched_cpu_activate(unsigned int cpu)
5780{
5781 struct rq *rq = cpu_rq(cpu);
5782 struct rq_flags rf;
5783
5784#ifdef CONFIG_SCHED_SMT
5785
5786
5787
5788 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
5789 static_branch_inc_cpuslocked(&sched_smt_present);
5790#endif
5791 set_cpu_active(cpu, true);
5792
5793 if (sched_smp_initialized) {
5794 sched_domains_numa_masks_set(cpu);
5795 cpuset_cpu_active();
5796 }
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807 rq_lock_irqsave(rq, &rf);
5808 if (rq->rd) {
5809 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5810 set_rq_online(rq);
5811 }
5812 rq_unlock_irqrestore(rq, &rf);
5813
5814 update_max_interval();
5815
5816 return 0;
5817}
5818
5819int sched_cpu_deactivate(unsigned int cpu)
5820{
5821 int ret;
5822
5823 set_cpu_active(cpu, false);
5824
5825
5826
5827
5828
5829
5830
5831 synchronize_rcu_mult(call_rcu, call_rcu_sched);
5832
5833#ifdef CONFIG_SCHED_SMT
5834
5835
5836
5837 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
5838 static_branch_dec_cpuslocked(&sched_smt_present);
5839#endif
5840
5841 if (!sched_smp_initialized)
5842 return 0;
5843
5844 ret = cpuset_cpu_inactive(cpu);
5845 if (ret) {
5846 set_cpu_active(cpu, true);
5847 return ret;
5848 }
5849 sched_domains_numa_masks_clear(cpu);
5850 return 0;
5851}
5852
5853static void sched_rq_cpu_starting(unsigned int cpu)
5854{
5855 struct rq *rq = cpu_rq(cpu);
5856
5857 rq->calc_load_update = calc_load_update;
5858 update_max_interval();
5859}
5860
5861int sched_cpu_starting(unsigned int cpu)
5862{
5863 set_cpu_rq_start_time(cpu);
5864 sched_rq_cpu_starting(cpu);
5865 sched_tick_start(cpu);
5866 return 0;
5867}
5868
5869#ifdef CONFIG_HOTPLUG_CPU
5870int sched_cpu_dying(unsigned int cpu)
5871{
5872 struct rq *rq = cpu_rq(cpu);
5873 struct rq_flags rf;
5874
5875
5876 sched_ttwu_pending();
5877 sched_tick_stop(cpu);
5878
5879 rq_lock_irqsave(rq, &rf);
5880 if (rq->rd) {
5881 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5882 set_rq_offline(rq);
5883 }
5884 migrate_tasks(rq, &rf);
5885 BUG_ON(rq->nr_running != 1);
5886 rq_unlock_irqrestore(rq, &rf);
5887
5888 calc_load_migrate(rq);
5889 update_max_interval();
5890 nohz_balance_exit_idle(rq);
5891 hrtick_clear(rq);
5892 return 0;
5893}
5894#endif
5895
5896void __init sched_init_smp(void)
5897{
5898 sched_init_numa();
5899
5900
5901
5902
5903
5904
5905 mutex_lock(&sched_domains_mutex);
5906 sched_init_domains(cpu_active_mask);
5907 mutex_unlock(&sched_domains_mutex);
5908
5909
5910 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
5911 BUG();
5912 sched_init_granularity();
5913
5914 init_sched_rt_class();
5915 init_sched_dl_class();
5916
5917 sched_smp_initialized = true;
5918}
5919
5920static int __init migration_init(void)
5921{
5922 sched_rq_cpu_starting(smp_processor_id());
5923 return 0;
5924}
5925early_initcall(migration_init);
5926
5927#else
5928void __init sched_init_smp(void)
5929{
5930 sched_init_granularity();
5931}
5932#endif
5933
5934int in_sched_functions(unsigned long addr)
5935{
5936 return in_lock_functions(addr) ||
5937 (addr >= (unsigned long)__sched_text_start
5938 && addr < (unsigned long)__sched_text_end);
5939}
5940
5941#ifdef CONFIG_CGROUP_SCHED
5942
5943
5944
5945
5946struct task_group root_task_group;
5947LIST_HEAD(task_groups);
5948
5949
5950static struct kmem_cache *task_group_cache __read_mostly;
5951#endif
5952
5953DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
5954DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5955
5956void __init sched_init(void)
5957{
5958 int i, j;
5959 unsigned long alloc_size = 0, ptr;
5960
5961 sched_clock_init();
5962 wait_bit_init();
5963
5964#ifdef CONFIG_FAIR_GROUP_SCHED
5965 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5966#endif
5967#ifdef CONFIG_RT_GROUP_SCHED
5968 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5969#endif
5970 if (alloc_size) {
5971 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
5972
5973#ifdef CONFIG_FAIR_GROUP_SCHED
5974 root_task_group.se = (struct sched_entity **)ptr;
5975 ptr += nr_cpu_ids * sizeof(void **);
5976
5977 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
5978 ptr += nr_cpu_ids * sizeof(void **);
5979
5980#endif
5981#ifdef CONFIG_RT_GROUP_SCHED
5982 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
5983 ptr += nr_cpu_ids * sizeof(void **);
5984
5985 root_task_group.rt_rq = (struct rt_rq **)ptr;
5986 ptr += nr_cpu_ids * sizeof(void **);
5987
5988#endif
5989 }
5990#ifdef CONFIG_CPUMASK_OFFSTACK
5991 for_each_possible_cpu(i) {
5992 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
5993 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5994 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
5995 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5996 }
5997#endif
5998
5999 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
6000 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
6001
6002#ifdef CONFIG_SMP
6003 init_defrootdomain();
6004#endif
6005
6006#ifdef CONFIG_RT_GROUP_SCHED
6007 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6008 global_rt_period(), global_rt_runtime());
6009#endif
6010
6011#ifdef CONFIG_CGROUP_SCHED
6012 task_group_cache = KMEM_CACHE(task_group, 0);
6013
6014 list_add(&root_task_group.list, &task_groups);
6015 INIT_LIST_HEAD(&root_task_group.children);
6016 INIT_LIST_HEAD(&root_task_group.siblings);
6017 autogroup_init(&init_task);
6018#endif
6019
6020 for_each_possible_cpu(i) {
6021 struct rq *rq;
6022
6023 rq = cpu_rq(i);
6024 raw_spin_lock_init(&rq->lock);
6025 rq->nr_running = 0;
6026 rq->calc_load_active = 0;
6027 rq->calc_load_update = jiffies + LOAD_FREQ;
6028 init_cfs_rq(&rq->cfs);
6029 init_rt_rq(&rq->rt);
6030 init_dl_rq(&rq->dl);
6031#ifdef CONFIG_FAIR_GROUP_SCHED
6032 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6033 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6034 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6055 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6056#endif
6057
6058 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6059#ifdef CONFIG_RT_GROUP_SCHED
6060 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6061#endif
6062
6063 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6064 rq->cpu_load[j] = 0;
6065
6066#ifdef CONFIG_SMP
6067 rq->sd = NULL;
6068 rq->rd = NULL;
6069 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
6070 rq->balance_callback = NULL;
6071 rq->active_balance = 0;
6072 rq->next_balance = jiffies;
6073 rq->push_cpu = 0;
6074 rq->cpu = i;
6075 rq->online = 0;
6076 rq->idle_stamp = 0;
6077 rq->avg_idle = 2*sysctl_sched_migration_cost;
6078 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6079
6080 INIT_LIST_HEAD(&rq->cfs_tasks);
6081
6082 rq_attach_root(rq, &def_root_domain);
6083#ifdef CONFIG_NO_HZ_COMMON
6084 rq->last_load_update_tick = jiffies;
6085 rq->last_blocked_load_update_tick = jiffies;
6086 atomic_set(&rq->nohz_flags, 0);
6087#endif
6088#endif
6089 hrtick_rq_init(rq);
6090 atomic_set(&rq->nr_iowait, 0);
6091 }
6092
6093 set_load_weight(&init_task, false);
6094
6095
6096
6097
6098 mmgrab(&init_mm);
6099 enter_lazy_tlb(&init_mm, current);
6100
6101
6102
6103
6104
6105
6106
6107 init_idle(current, smp_processor_id());
6108
6109 calc_load_update = jiffies + LOAD_FREQ;
6110
6111#ifdef CONFIG_SMP
6112 idle_thread_set_boot_cpu();
6113 set_cpu_rq_start_time(smp_processor_id());
6114#endif
6115 init_sched_fair_class();
6116
6117 init_schedstats();
6118
6119 scheduler_running = 1;
6120}
6121
6122#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6123static inline int preempt_count_equals(int preempt_offset)
6124{
6125 int nested = preempt_count() + rcu_preempt_depth();
6126
6127 return (nested == preempt_offset);
6128}
6129
6130void __might_sleep(const char *file, int line, int preempt_offset)
6131{
6132
6133
6134
6135
6136
6137 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
6138 "do not call blocking ops when !TASK_RUNNING; "
6139 "state=%lx set at [<%p>] %pS\n",
6140 current->state,
6141 (void *)current->task_state_change,
6142 (void *)current->task_state_change);
6143
6144 ___might_sleep(file, line, preempt_offset);
6145}
6146EXPORT_SYMBOL(__might_sleep);
6147
6148void ___might_sleep(const char *file, int line, int preempt_offset)
6149{
6150
6151 static unsigned long prev_jiffy;
6152
6153 unsigned long preempt_disable_ip;
6154
6155
6156 rcu_sleep_check();
6157
6158 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6159 !is_idle_task(current)) ||
6160 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
6161 oops_in_progress)
6162 return;
6163
6164 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6165 return;
6166 prev_jiffy = jiffies;
6167
6168
6169 preempt_disable_ip = get_preempt_disable_ip(current);
6170
6171 printk(KERN_ERR
6172 "BUG: sleeping function called from invalid context at %s:%d\n",
6173 file, line);
6174 printk(KERN_ERR
6175 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6176 in_atomic(), irqs_disabled(),
6177 current->pid, current->comm);
6178
6179 if (task_stack_end_corrupted(current))
6180 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
6181
6182 debug_show_held_locks(current);
6183 if (irqs_disabled())
6184 print_irqtrace_events(current);
6185 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
6186 && !preempt_count_equals(preempt_offset)) {
6187 pr_err("Preemption disabled at:");
6188 print_ip_sym(preempt_disable_ip);
6189 pr_cont("\n");
6190 }
6191 dump_stack();
6192 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
6193}
6194EXPORT_SYMBOL(___might_sleep);
6195#endif
6196
6197#ifdef CONFIG_MAGIC_SYSRQ
6198void normalize_rt_tasks(void)
6199{
6200 struct task_struct *g, *p;
6201 struct sched_attr attr = {
6202 .sched_policy = SCHED_NORMAL,
6203 };
6204
6205 read_lock(&tasklist_lock);
6206 for_each_process_thread(g, p) {
6207
6208
6209
6210 if (p->flags & PF_KTHREAD)
6211 continue;
6212
6213 p->se.exec_start = 0;
6214 schedstat_set(p->se.statistics.wait_start, 0);
6215 schedstat_set(p->se.statistics.sleep_start, 0);
6216 schedstat_set(p->se.statistics.block_start, 0);
6217
6218 if (!dl_task(p) && !rt_task(p)) {
 /*
  * Renice negative nice level userspace
  * tasks back to 0:
  */
6223 if (task_nice(p) < 0)
6224 set_user_nice(p, 0);
6225 continue;
6226 }
6227
6228 __sched_setscheduler(p, &attr, false, false);
6229 }
6230 read_unlock(&tasklist_lock);
6231}
6232
6233#endif
6234
6235#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
6254struct task_struct *curr_task(int cpu)
6255{
6256 return cpu_curr(cpu);
6257}
6258
6259#endif
6260
6261#ifdef CONFIG_IA64
/**
 * ia64_set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a CPU in a non-blocking manner. This function
 * must be called with all CPUs synchronized, and interrupts disabled, and
 * the caller must save the original value of the current task (see
 * curr_task() above) and restore that value before reenabling interrupts and
 * re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6277void ia64_set_curr_task(int cpu, struct task_struct *p)
6278{
6279 cpu_curr(cpu) = p;
6280}
6281
6282#endif
6283
6284#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
6286static DEFINE_SPINLOCK(task_group_lock);
6287
6288static void sched_free_group(struct task_group *tg)
6289{
6290 free_fair_sched_group(tg);
6291 free_rt_sched_group(tg);
6292 autogroup_free(tg);
6293 kmem_cache_free(task_group_cache, tg);
6294}
6295
/* Allocate runqueue etc. for a new task group: */
6297struct task_group *sched_create_group(struct task_group *parent)
6298{
6299 struct task_group *tg;
6300
6301 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
6302 if (!tg)
6303 return ERR_PTR(-ENOMEM);
6304
6305 if (!alloc_fair_sched_group(tg, parent))
6306 goto err;
6307
6308 if (!alloc_rt_sched_group(tg, parent))
6309 goto err;
6310
6311 return tg;
6312
6313err:
6314 sched_free_group(tg);
6315 return ERR_PTR(-ENOMEM);
6316}
6317
6318void sched_online_group(struct task_group *tg, struct task_group *parent)
6319{
6320 unsigned long flags;
6321
6322 spin_lock_irqsave(&task_group_lock, flags);
6323 list_add_rcu(&tg->list, &task_groups);
6324
 /* Root should already exist: */
6326 WARN_ON(!parent);
6327
6328 tg->parent = parent;
6329 INIT_LIST_HEAD(&tg->children);
6330 list_add_rcu(&tg->siblings, &parent->children);
6331 spin_unlock_irqrestore(&task_group_lock, flags);
6332
6333 online_fair_sched_group(tg);
6334}
6335
/* RCU callback to free the various structures associated with a task group */
6337static void sched_free_group_rcu(struct rcu_head *rhp)
6338{
 /* Now it should be safe to free those cfs_rqs: */
6340 sched_free_group(container_of(rhp, struct task_group, rcu));
6341}
6342
6343void sched_destroy_group(struct task_group *tg)
6344{
 /* Wait for possible concurrent references to cfs_rqs to complete: */
6346 call_rcu(&tg->rcu, sched_free_group_rcu);
6347}
6348
6349void sched_offline_group(struct task_group *tg)
6350{
6351 unsigned long flags;
6352
 /* End participation in shares distribution: */
6354 unregister_fair_sched_group(tg);
6355
6356 spin_lock_irqsave(&task_group_lock, flags);
6357 list_del_rcu(&tg->list);
6358 list_del_rcu(&tg->siblings);
6359 spin_unlock_irqrestore(&task_group_lock, flags);
6360}
6361
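/*
 * Look up the task_group that corresponds to @tsk's current cpu cgroup
 * (possibly substituted by the autogroup), cache it in
 * tsk->sched_task_group, and let the scheduling class react to the move,
 * or simply re-set the task's runqueue pointers.
 */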
6362static void sched_change_group(struct task_struct *tsk, int type)
6363{
6364 struct task_group *tg;
6365
 /*
  * All callers are synchronized by task_rq_lock(); we do not use RCU
  * which is pointless here. Thus, we pass "true" to task_css_check()
  * to prevent lockdep warnings.
  */
6371 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
6372 struct task_group, css);
6373 tg = autogroup_task_group(tsk, tg);
6374 tsk->sched_task_group = tg;
6375
6376#ifdef CONFIG_FAIR_GROUP_SCHED
6377 if (tsk->sched_class->task_change_group)
6378 tsk->sched_class->task_change_group(tsk, type);
6379 else
6380#endif
6381 set_task_rq(tsk, task_cpu(tsk));
6382}
6383
/*
 * Change task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
 * reflect its new group.
 */
6391void sched_move_task(struct task_struct *tsk)
6392{
6393 int queued, running, queue_flags =
6394 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6395 struct rq_flags rf;
6396 struct rq *rq;
6397
6398 rq = task_rq_lock(tsk, &rf);
6399 update_rq_clock(rq);
6400
6401 running = task_current(rq, tsk);
6402 queued = task_on_rq_queued(tsk);
6403
6404 if (queued)
6405 dequeue_task(rq, tsk, queue_flags);
6406 if (running)
6407 put_prev_task(rq, tsk);
6408
6409 sched_change_group(tsk, TASK_MOVE_GROUP);
6410
6411 if (queued)
6412 enqueue_task(rq, tsk, queue_flags);
6413 if (running)
6414 set_curr_task(rq, tsk);
6415
6416 task_rq_unlock(rq, tsk, &rf);
6417}
6418
6419static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
6420{
6421 return css ? container_of(css, struct task_group, css) : NULL;
6422}
6423
6424static struct cgroup_subsys_state *
6425cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6426{
6427 struct task_group *parent = css_tg(parent_css);
6428 struct task_group *tg;
6429
6430 if (!parent) {
 /* This is early initialization for the top cgroup: */
6432 return &root_task_group.css;
6433 }
6434
6435 tg = sched_create_group(parent);
6436 if (IS_ERR(tg))
6437 return ERR_PTR(-ENOMEM);
6438
6439 return &tg->css;
6440}
6441
/* Expose the task group only after completing cgroup initialization: */
6443static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
6444{
6445 struct task_group *tg = css_tg(css);
6446 struct task_group *parent = css_tg(css->parent);
6447
6448 if (parent)
6449 sched_online_group(tg, parent);
6450 return 0;
6451}
6452
6453static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
6454{
6455 struct task_group *tg = css_tg(css);
6456
6457 sched_offline_group(tg);
6458}
6459
6460static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
6461{
6462 struct task_group *tg = css_tg(css);
6463
 /*
  * Relies on the RCU grace period between css_released() and this.
  */
6467 sched_free_group(tg);
6468}
6469
/*
 * This is called before wake_up_new_task(), therefore we really only
 * have to set its group bits, all the other stuff does not apply.
 */
6474static void cpu_cgroup_fork(struct task_struct *task)
6475{
6476 struct rq_flags rf;
6477 struct rq *rq;
6478
6479 rq = task_rq_lock(task, &rf);
6480
6481 update_rq_clock(rq);
6482 sched_change_group(task, TASK_SET_GROUP);
6483
6484 task_rq_unlock(rq, task, &rf);
6485}
6486
6487static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
6488{
6489 struct task_struct *task;
6490 struct cgroup_subsys_state *css;
6491 int ret = 0;
6492
6493 cgroup_taskset_for_each(task, css, tset) {
6494#ifdef CONFIG_RT_GROUP_SCHED
6495 if (!sched_rt_can_attach(css_tg(css), task))
6496 return -EINVAL;
6497#else
 /* We don't support RT-tasks being in separate groups */
6499 if (task->sched_class != &fair_sched_class)
6500 return -EINVAL;
6501#endif
 /*
  * Serialize against wake_up_new_task() such that if it's
  * running, we're sure to observe its full state.
  */
6506 raw_spin_lock_irq(&task->pi_lock);
 /*
  * Avoid calling sched_move_task() before wake_up_new_task()
  * has happened. This would lead to problems with PELT, due to
  * move wanting to detach+attach while we're not attached yet.
  */
6512 if (task->state == TASK_NEW)
6513 ret = -EINVAL;
6514 raw_spin_unlock_irq(&task->pi_lock);
6515
6516 if (ret)
6517 break;
6518 }
6519 return ret;
6520}
6521
6522static void cpu_cgroup_attach(struct cgroup_taskset *tset)
6523{
6524 struct task_struct *task;
6525 struct cgroup_subsys_state *css;
6526
6527 cgroup_taskset_for_each(task, css, tset)
6528 sched_move_task(task);
6529}
6530
6531#ifdef CONFIG_FAIR_GROUP_SCHED
6532static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
6533 struct cftype *cftype, u64 shareval)
6534{
6535 return sched_group_set_shares(css_tg(css), scale_load(shareval));
6536}
6537
6538static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
6539 struct cftype *cft)
6540{
6541 struct task_group *tg = css_tg(css);
6542
6543 return (u64) scale_load_down(tg->shares);
6544}
6545
6546#ifdef CONFIG_CFS_BANDWIDTH
6547static DEFINE_MUTEX(cfs_constraints_mutex);
6548
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;	/* 1s */
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;	/* 1ms */
6551
6552static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
6553
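/*
 * Update a group's CFS bandwidth limits. @period and @quota are in
 * nanoseconds; the new settings are first validated against the parent
 * hierarchy via __cfs_schedulable(), then propagated to every online CPU's
 * cfs_rq, unthrottling any runqueue that was throttled under the old quota.
 */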
6554static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
6555{
6556 int i, ret = 0, runtime_enabled, runtime_was_enabled;
6557 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6558
6559 if (tg == &root_task_group)
6560 return -EINVAL;
6561
 /*
  * Ensure we have at least some amount of bandwidth every period. This
  * is to prevent reaching a state of large arrears when throttled via
  * tg_set_cfs_quota() while in a state of throttled.
  */
6567 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
6568 return -EINVAL;
6569
 /*
  * Likewise, bound things on the other side by preventing insane quota
  * periods. This also allows us to normalize in computing quota
  * feasibility.
  */
6575 if (period > max_cfs_quota_period)
6576 return -EINVAL;
6577
 /*
  * Prevent a race between setting of cfs_rq->runtime_enabled and
  * unthrottle_offline_cfs_rqs().
  */
6582 get_online_cpus();
6583 mutex_lock(&cfs_constraints_mutex);
6584 ret = __cfs_schedulable(tg, period, quota);
6585 if (ret)
6586 goto out_unlock;
6587
6588 runtime_enabled = quota != RUNTIME_INF;
6589 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
6590
 /* If we need to toggle cfs_bandwidth_used, off->on must occur before
  * making related changes, and on->off must occur afterwards:
  */
6594 if (runtime_enabled && !runtime_was_enabled)
6595 cfs_bandwidth_usage_inc();
6596 raw_spin_lock_irq(&cfs_b->lock);
6597 cfs_b->period = ns_to_ktime(period);
6598 cfs_b->quota = quota;
6599
6600 __refill_cfs_bandwidth_runtime(cfs_b);
6601
 /* Restart the period timer (if active) to handle new period expiry: */
6603 if (runtime_enabled)
6604 start_cfs_bandwidth(cfs_b);
6605
6606 raw_spin_unlock_irq(&cfs_b->lock);
6607
6608 for_each_online_cpu(i) {
6609 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
6610 struct rq *rq = cfs_rq->rq;
6611 struct rq_flags rf;
6612
6613 rq_lock_irq(rq, &rf);
6614 cfs_rq->runtime_enabled = runtime_enabled;
6615 cfs_rq->runtime_remaining = 0;
6616
6617 if (cfs_rq->throttled)
6618 unthrottle_cfs_rq(cfs_rq);
6619 rq_unlock_irq(rq, &rf);
6620 }
6621 if (runtime_was_enabled && !runtime_enabled)
6622 cfs_bandwidth_usage_dec();
6623out_unlock:
6624 mutex_unlock(&cfs_constraints_mutex);
6625 put_online_cpus();
6626
6627 return ret;
6628}
6629
6630int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
6631{
6632 u64 quota, period;
6633
6634 period = ktime_to_ns(tg->cfs_bandwidth.period);
6635 if (cfs_quota_us < 0)
6636 quota = RUNTIME_INF;
6637 else
6638 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
6639
6640 return tg_set_cfs_bandwidth(tg, period, quota);
6641}
6642
6643long tg_get_cfs_quota(struct task_group *tg)
6644{
6645 u64 quota_us;
6646
6647 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
6648 return -1;
6649
6650 quota_us = tg->cfs_bandwidth.quota;
6651 do_div(quota_us, NSEC_PER_USEC);
6652
6653 return quota_us;
6654}
6655
6656int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
6657{
6658 u64 quota, period;
6659
6660 period = (u64)cfs_period_us * NSEC_PER_USEC;
6661 quota = tg->cfs_bandwidth.quota;
6662
6663 return tg_set_cfs_bandwidth(tg, period, quota);
6664}
6665
6666long tg_get_cfs_period(struct task_group *tg)
6667{
6668 u64 cfs_period_us;
6669
6670 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
6671 do_div(cfs_period_us, NSEC_PER_USEC);
6672
6673 return cfs_period_us;
6674}
6675
6676static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
6677 struct cftype *cft)
6678{
6679 return tg_get_cfs_quota(css_tg(css));
6680}
6681
6682static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
6683 struct cftype *cftype, s64 cfs_quota_us)
6684{
6685 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
6686}
6687
6688static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
6689 struct cftype *cft)
6690{
6691 return tg_get_cfs_period(css_tg(css));
6692}
6693
6694static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
6695 struct cftype *cftype, u64 cfs_period_us)
6696{
6697 return tg_set_cfs_period(css_tg(css), cfs_period_us);
6698}
6699
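/*
 * Carries the proposed period/quota (in usecs when finite) for the group
 * being changed while __cfs_schedulable() walks the task_group tree.
 */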
6700struct cfs_schedulable_data {
6701 struct task_group *tg;
6702 u64 period, quota;
6703};
6704
/*
 * Express a group's bandwidth as a utilization ratio (quota/period) so the
 * hierarchy can be checked for feasibility. For the group being changed we
 * use the proposed values from @d, otherwise the group's current settings.
 */
6709static u64 normalize_cfs_quota(struct task_group *tg,
6710 struct cfs_schedulable_data *d)
6711{
6712 u64 quota, period;
6713
6714 if (tg == d->tg) {
6715 period = d->period;
6716 quota = d->quota;
6717 } else {
6718 period = tg_get_cfs_period(tg);
6719 quota = tg_get_cfs_quota(tg);
6720 }
6721
 /* Note: these should typically be equivalent: */
6723 if (quota == RUNTIME_INF || quota == -1)
6724 return RUNTIME_INF;
6725
6726 return to_ratio(period, quota);
6727}
6728
6729static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
6730{
6731 struct cfs_schedulable_data *d = data;
6732 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6733 s64 quota = 0, parent_quota = -1;
6734
6735 if (!tg->parent) {
6736 quota = RUNTIME_INF;
6737 } else {
6738 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
6739
6740 quota = normalize_cfs_quota(tg, d);
6741 parent_quota = parent_b->hierarchical_quota;
6742
 /*
  * Ensure max(child_quota) <= parent_quota.  On cgroup2,
  * always take the min.  On cgroup1, only inherit when no
  * limit is set:
  */
6748 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
6749 quota = min(quota, parent_quota);
6750 } else {
6751 if (quota == RUNTIME_INF)
6752 quota = parent_quota;
6753 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
6754 return -EINVAL;
6755 }
6756 }
6757 cfs_b->hierarchical_quota = quota;
6758
6759 return 0;
6760}
6761
6762static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
6763{
6764 int ret;
6765 struct cfs_schedulable_data data = {
6766 .tg = tg,
6767 .period = period,
6768 .quota = quota,
6769 };
6770
6771 if (quota != RUNTIME_INF) {
6772 do_div(data.period, NSEC_PER_USEC);
6773 do_div(data.quota, NSEC_PER_USEC);
6774 }
6775
6776 rcu_read_lock();
6777 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
6778 rcu_read_unlock();
6779
6780 return ret;
6781}
6782
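/* Bandwidth statistics shown in the cgroup v1 "cpu.stat" file: */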
6783static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
6784{
6785 struct task_group *tg = css_tg(seq_css(sf));
6786 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6787
6788 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
6789 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
6790 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
6791
6792 return 0;
6793}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
6796
6797#ifdef CONFIG_RT_GROUP_SCHED
6798static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
6799 struct cftype *cft, s64 val)
6800{
6801 return sched_group_set_rt_runtime(css_tg(css), val);
6802}
6803
6804static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
6805 struct cftype *cft)
6806{
6807 return sched_group_rt_runtime(css_tg(css));
6808}
6809
6810static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
6811 struct cftype *cftype, u64 rt_period_us)
6812{
6813 return sched_group_set_rt_period(css_tg(css), rt_period_us);
6814}
6815
6816static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
6817 struct cftype *cft)
6818{
6819 return sched_group_rt_period(css_tg(css));
6820}
6821#endif
6822
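/* Control files exposed on the legacy (cgroup v1) hierarchy: */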
6823static struct cftype cpu_legacy_files[] = {
6824#ifdef CONFIG_FAIR_GROUP_SCHED
6825 {
6826 .name = "shares",
6827 .read_u64 = cpu_shares_read_u64,
6828 .write_u64 = cpu_shares_write_u64,
6829 },
6830#endif
6831#ifdef CONFIG_CFS_BANDWIDTH
6832 {
6833 .name = "cfs_quota_us",
6834 .read_s64 = cpu_cfs_quota_read_s64,
6835 .write_s64 = cpu_cfs_quota_write_s64,
6836 },
6837 {
6838 .name = "cfs_period_us",
6839 .read_u64 = cpu_cfs_period_read_u64,
6840 .write_u64 = cpu_cfs_period_write_u64,
6841 },
6842 {
6843 .name = "stat",
6844 .seq_show = cpu_cfs_stat_show,
6845 },
6846#endif
6847#ifdef CONFIG_RT_GROUP_SCHED
6848 {
6849 .name = "rt_runtime_us",
6850 .read_s64 = cpu_rt_runtime_read,
6851 .write_s64 = cpu_rt_runtime_write,
6852 },
6853 {
6854 .name = "rt_period_us",
6855 .read_u64 = cpu_rt_period_read_uint,
6856 .write_u64 = cpu_rt_period_write_uint,
6857 },
6858#endif
6859 { }
6860};
6861
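/*
 * Appends the CFS bandwidth numbers (converted to microseconds) to the
 * cgroup v2 "cpu.stat" file via the css_extra_stat_show hook below.
 */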
6862static int cpu_extra_stat_show(struct seq_file *sf,
6863 struct cgroup_subsys_state *css)
6864{
6865#ifdef CONFIG_CFS_BANDWIDTH
6866 {
6867 struct task_group *tg = css_tg(css);
6868 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6869 u64 throttled_usec;
6870
6871 throttled_usec = cfs_b->throttled_time;
6872 do_div(throttled_usec, NSEC_PER_USEC);
6873
6874 seq_printf(sf, "nr_periods %d\n"
6875 "nr_throttled %d\n"
6876 "throttled_usec %llu\n",
6877 cfs_b->nr_periods, cfs_b->nr_throttled,
6878 throttled_usec);
6879 }
6880#endif
6881 return 0;
6882}
6883
6884#ifdef CONFIG_FAIR_GROUP_SCHED
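/*
 * "cpu.weight" maps the group's shares onto the common cgroup weight range:
 * weight = shares * CGROUP_WEIGHT_DFL / 1024, so the default 1024 shares
 * read back as a weight of 100.
 */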
6885static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
6886 struct cftype *cft)
6887{
6888 struct task_group *tg = css_tg(css);
6889 u64 weight = scale_load_down(tg->shares);
6890
6891 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
6892}
6893
6894static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
6895 struct cftype *cft, u64 weight)
6896{
 /*
  * cgroup weight knobs should use the common MIN, DFL and MAX
  * values which are 1, 100 and 10000 respectively.  While it loses
  * a bit of range on both ends, it maps pretty well onto the shares
  * value used by the scheduler and the round-trip conversions
  * preserve the original value over the entire range.
  */
6904 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
6905 return -ERANGE;
6906
6907 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
6908
6909 return sched_group_set_shares(css_tg(css), scale_load(weight));
6910}
6911
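/*
 * "cpu.weight.nice" reads back as the nice level whose entry in
 * sched_prio_to_weight[] is closest to the group's current shares; writes
 * set the shares to that nice level's canonical weight.
 */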
6912static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
6913 struct cftype *cft)
6914{
6915 unsigned long weight = scale_load_down(css_tg(css)->shares);
6916 int last_delta = INT_MAX;
6917 int prio, delta;
6918
 /* Find the closest nice value to the current weight: */
6920 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
6921 delta = abs(sched_prio_to_weight[prio] - weight);
6922 if (delta >= last_delta)
6923 break;
6924 last_delta = delta;
6925 }
6926
6927 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
6928}
6929
6930static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
6931 struct cftype *cft, s64 nice)
6932{
6933 unsigned long weight;
6934 int idx;
6935
6936 if (nice < MIN_NICE || nice > MAX_NICE)
6937 return -ERANGE;
6938
6939 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
6940 idx = array_index_nospec(idx, 40);
6941 weight = sched_prio_to_weight[idx];
6942
6943 return sched_group_set_shares(css_tg(css), scale_load(weight));
6944}
6945#endif
6946
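/*
 * Helpers for the cgroup v2 "$MAX $PERIOD" bandwidth format used by
 * "cpu.max": a negative quota is printed as "max", "max" parses back to
 * RUNTIME_INF, and parsed values are scaled from usecs to nanoseconds.
 */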
6947static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
6948 long period, long quota)
6949{
6950 if (quota < 0)
6951 seq_puts(sf, "max");
6952 else
6953 seq_printf(sf, "%ld", quota);
6954
6955 seq_printf(sf, " %ld\n", period);
6956}
6957
/* The caller should put the current period in *periodp before calling: */
6959static int __maybe_unused cpu_period_quota_parse(char *buf,
6960 u64 *periodp, u64 *quotap)
6961{
6962 char tok[21];
6963
6964 if (!sscanf(buf, "%s %llu", tok, periodp))
6965 return -EINVAL;
6966
6967 *periodp *= NSEC_PER_USEC;
6968
6969 if (sscanf(tok, "%llu", quotap))
6970 *quotap *= NSEC_PER_USEC;
6971 else if (!strcmp(tok, "max"))
6972 *quotap = RUNTIME_INF;
6973 else
6974 return -EINVAL;
6975
6976 return 0;
6977}
6978
6979#ifdef CONFIG_CFS_BANDWIDTH
6980static int cpu_max_show(struct seq_file *sf, void *v)
6981{
6982 struct task_group *tg = css_tg(seq_css(sf));
6983
6984 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
6985 return 0;
6986}
6987
6988static ssize_t cpu_max_write(struct kernfs_open_file *of,
6989 char *buf, size_t nbytes, loff_t off)
6990{
6991 struct task_group *tg = css_tg(of_css(of));
6992 u64 period = tg_get_cfs_period(tg);
6993 u64 quota;
6994 int ret;
6995
 ret = cpu_period_quota_parse(buf, &period, &quota);
6997 if (!ret)
6998 ret = tg_set_cfs_bandwidth(tg, period, quota);
6999 return ret ?: nbytes;
7000}
7001#endif
7002
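/* Control files exposed on the default (cgroup v2) hierarchy: */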
7003static struct cftype cpu_files[] = {
7004#ifdef CONFIG_FAIR_GROUP_SCHED
7005 {
7006 .name = "weight",
7007 .flags = CFTYPE_NOT_ON_ROOT,
7008 .read_u64 = cpu_weight_read_u64,
7009 .write_u64 = cpu_weight_write_u64,
7010 },
7011 {
7012 .name = "weight.nice",
7013 .flags = CFTYPE_NOT_ON_ROOT,
7014 .read_s64 = cpu_weight_nice_read_s64,
7015 .write_s64 = cpu_weight_nice_write_s64,
7016 },
7017#endif
7018#ifdef CONFIG_CFS_BANDWIDTH
7019 {
7020 .name = "max",
7021 .flags = CFTYPE_NOT_ON_ROOT,
7022 .seq_show = cpu_max_show,
7023 .write = cpu_max_write,
7024 },
7025#endif
7026 { }
7027};
7028
7029struct cgroup_subsys cpu_cgrp_subsys = {
7030 .css_alloc = cpu_cgroup_css_alloc,
7031 .css_online = cpu_cgroup_css_online,
7032 .css_released = cpu_cgroup_css_released,
7033 .css_free = cpu_cgroup_css_free,
7034 .css_extra_stat_show = cpu_extra_stat_show,
7035 .fork = cpu_cgroup_fork,
7036 .can_attach = cpu_cgroup_can_attach,
7037 .attach = cpu_cgroup_attach,
7038 .legacy_cftypes = cpu_legacy_files,
7039 .dfl_cftypes = cpu_files,
7040 .early_init = true,
7041 .threaded = true,
7042};
7043
#endif /* CONFIG_CGROUP_SCHED */
7045
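/* Print the state and backtrace of the task currently running on @cpu: */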
7046void dump_cpu_task(int cpu)
7047{
7048 pr_info("Task dump for CPU %d:\n", cpu);
7049 sched_show_task(cpu_curr(cpu));
7050}
7051
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
7064const int sched_prio_to_weight[40] = {
7065 88761, 71755, 56483, 46273, 36291,
7066 29154, 23254, 18705, 14949, 11916,
7067 9548, 7620, 6100, 4904, 3906,
7068 3121, 2501, 1991, 1586, 1277,
7069 1024, 820, 655, 526, 423,
7070 335, 272, 215, 172, 137,
7071 110, 87, 70, 56, 45,
7072 36, 29, 23, 18, 15,
7073};
7074
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
7082const u32 sched_prio_to_wmult[40] = {
7083 48388, 59856, 76040, 92818, 118348,
7084 147320, 184698, 229616, 287308, 360437,
7085 449829, 563644, 704093, 875809, 1099582,
7086 1376151, 1717300, 2157191, 2708050, 3363326,
7087 4194304, 5237765, 6557202, 8165337, 10153587,
7088 12820798, 15790321, 19976592, 24970740, 31350126,
7089 39045157, 49367440, 61356676, 76695844, 95443717,
7090 119304647, 148102320, 186737708, 238609294, 286331153,
7091};
7092
7093#undef CREATE_TRACE_POINTS
7094