/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
8#include <linux/sched.h>
9#include <linux/sched/clock.h>
10#include <uapi/linux/sched/types.h>
11#include <linux/sched/loadavg.h>
12#include <linux/sched/hotplug.h>
13#include <linux/cpuset.h>
14#include <linux/delayacct.h>
15#include <linux/init_task.h>
16#include <linux/context_tracking.h>
17#include <linux/rcupdate_wait.h>
18
19#include <linux/blkdev.h>
20#include <linux/kprobes.h>
21#include <linux/mmu_context.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/prefetch.h>
25#include <linux/profile.h>
26#include <linux/security.h>
27#include <linux/syscalls.h>
28
29#include <asm/switch_to.h>
30#include <asm/tlb.h>
31#ifdef CONFIG_PARAVIRT
32#include <asm/paravirt.h>
33#endif
34
35#include "sched.h"
36#include "../workqueue_internal.h"
37#include "../smpboot.h"
38
39#define CREATE_TRACE_POINTS
40#include <trace/events/sched.h>
41
42DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
43
44
45
46
47
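/*
 * Debugging: various scheduler feature bits.
 *
 * Each SCHED_FEAT(name, enabled) entry in "features.h" is expanded by the
 * macro below into (1UL << __SCHED_FEAT_name) * enabled |, so including the
 * header builds the default bitmask stored in sysctl_sched_features.
 */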
48#define SCHED_FEAT(name, enabled) \
49 (1UL << __SCHED_FEAT_##name) * enabled |
50
51const_debug unsigned int sysctl_sched_features =
52#include "features.h"
53 0;
54
55#undef SCHED_FEAT
56
57
58
59
60
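/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */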
61const_debug unsigned int sysctl_sched_nr_migrate = 32;
62
63
64
65
66
67
68
69const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
70
71
72
73
74
75unsigned int sysctl_sched_rt_period = 1000000;
76
77__read_mostly int scheduler_running;
78
79
80
81
82
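/*
 * Part of the period that we allow rt tasks to run, in us.
 * Default: 0.95s of the 1s period above.
 */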
83int sysctl_sched_rt_runtime = 950000;
84
85
86cpumask_var_t cpu_isolated_map;
87
88
89
90
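/*
 * __task_rq_lock - lock the rq @p resides on; caller holds p->pi_lock.
 * Loops until the rq read and the rq locked are the same and @p is not
 * in the middle of a migration.
 */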
91struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
92 __acquires(rq->lock)
93{
94 struct rq *rq;
95
96 lockdep_assert_held(&p->pi_lock);
97
98 for (;;) {
99 rq = task_rq(p);
100 raw_spin_lock(&rq->lock);
101 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
102 rq_pin_lock(rq, rf);
103 return rq;
104 }
105 raw_spin_unlock(&rq->lock);
106
107 while (unlikely(task_on_rq_migrating(p)))
108 cpu_relax();
109 }
110}
111
112
113
114
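/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */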
115struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
116 __acquires(p->pi_lock)
117 __acquires(rq->lock)
118{
119 struct rq *rq;
120
121 for (;;) {
122 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
123 rq = task_rq(p);
124 raw_spin_lock(&rq->lock);
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
142 rq_pin_lock(rq, rf);
143 return rq;
144 }
145 raw_spin_unlock(&rq->lock);
146 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
147
148 while (unlikely(task_on_rq_migrating(p)))
149 cpu_relax();
150 }
151}
152
153
154
155
156
157static void update_rq_clock_task(struct rq *rq, s64 delta)
158{
159
160
161
162
163#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
164 s64 steal = 0, irq_delta = 0;
165#endif
166#ifdef CONFIG_IRQ_TIME_ACCOUNTING
167 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184 if (irq_delta > delta)
185 irq_delta = delta;
186
187 rq->prev_irq_time += irq_delta;
188 delta -= irq_delta;
189#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif
202
203 rq->clock_task += delta;
204
205#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
206 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
207 sched_rt_avg_update(rq, irq_delta + steal);
208#endif
209}
210
211void update_rq_clock(struct rq *rq)
212{
213 s64 delta;
214
215 lockdep_assert_held(&rq->lock);
216
217 if (rq->clock_update_flags & RQCF_ACT_SKIP)
218 return;
219
220#ifdef CONFIG_SCHED_DEBUG
221 if (sched_feat(WARN_DOUBLE_CLOCK))
222 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
223 rq->clock_update_flags |= RQCF_UPDATED;
224#endif
225
226 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
227 if (delta < 0)
228 return;
229 rq->clock += delta;
230 update_rq_clock_task(rq, delta);
231}
232
233
234#ifdef CONFIG_SCHED_HRTICK
235
236
237
238
239static void hrtick_clear(struct rq *rq)
240{
241 if (hrtimer_active(&rq->hrtick_timer))
242 hrtimer_cancel(&rq->hrtick_timer);
243}
244
245
246
247
248
249static enum hrtimer_restart hrtick(struct hrtimer *timer)
250{
251 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
252 struct rq_flags rf;
253
254 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
255
256 rq_lock(rq, &rf);
257 update_rq_clock(rq);
258 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
259 rq_unlock(rq, &rf);
260
261 return HRTIMER_NORESTART;
262}
263
264#ifdef CONFIG_SMP
265
266static void __hrtick_restart(struct rq *rq)
267{
268 struct hrtimer *timer = &rq->hrtick_timer;
269
270 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
271}
272
273
274
275
276static void __hrtick_start(void *arg)
277{
278 struct rq *rq = arg;
279 struct rq_flags rf;
280
281 rq_lock(rq, &rf);
282 __hrtick_restart(rq);
283 rq->hrtick_csd_pending = 0;
284 rq_unlock(rq, &rf);
285}
286
287
288
289
290
291
292void hrtick_start(struct rq *rq, u64 delay)
293{
294 struct hrtimer *timer = &rq->hrtick_timer;
295 ktime_t time;
296 s64 delta;
297
298
299
300
301
302 delta = max_t(s64, delay, 10000LL);
303 time = ktime_add_ns(timer->base->get_time(), delta);
304
305 hrtimer_set_expires(timer, time);
306
307 if (rq == this_rq()) {
308 __hrtick_restart(rq);
309 } else if (!rq->hrtick_csd_pending) {
310 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
311 rq->hrtick_csd_pending = 1;
312 }
313}
314
315#else
316
317
318
319
320
321void hrtick_start(struct rq *rq, u64 delay)
322{
323
324
325
326
327 delay = max_t(u64, delay, 10000LL);
328 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
329 HRTIMER_MODE_REL_PINNED);
330}
331#endif
332
333static void init_rq_hrtick(struct rq *rq)
334{
335#ifdef CONFIG_SMP
336 rq->hrtick_csd_pending = 0;
337
338 rq->hrtick_csd.flags = 0;
339 rq->hrtick_csd.func = __hrtick_start;
340 rq->hrtick_csd.info = rq;
341#endif
342
343 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
344 rq->hrtick_timer.function = hrtick;
345}
346#else
347static inline void hrtick_clear(struct rq *rq)
348{
349}
350
351static inline void init_rq_hrtick(struct rq *rq)
352{
353}
354#endif
355
356
357
358
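/*
 * fetch_or() - atomically OR @mask into *@ptr, looping on cmpxchg() until
 * the update sticks, and return the value observed before the OR.
 */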
359#define fetch_or(ptr, mask) \
360 ({ \
361 typeof(ptr) _ptr = (ptr); \
362 typeof(mask) _mask = (mask); \
363 typeof(*_ptr) _old, _val = *_ptr; \
364 \
365 for (;;) { \
366 _old = cmpxchg(_ptr, _val, _val | _mask); \
367 if (_old == _val) \
368 break; \
369 _val = _old; \
370 } \
371 _old; \
372})
373
374#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
375
376
377
378
379
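/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * which avoids races against polling-state changes: the caller only
 * sends a reschedule IPI when this returns true.
 */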
380static bool set_nr_and_not_polling(struct task_struct *p)
381{
382 struct thread_info *ti = task_thread_info(p);
383 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
384}
385
386
387
388
389
390
391
392static bool set_nr_if_polling(struct task_struct *p)
393{
394 struct thread_info *ti = task_thread_info(p);
395 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
396
397 for (;;) {
398 if (!(val & _TIF_POLLING_NRFLAG))
399 return false;
400 if (val & _TIF_NEED_RESCHED)
401 return true;
402 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
403 if (old == val)
404 break;
405 val = old;
406 }
407 return true;
408}
409
410#else
411static bool set_nr_and_not_polling(struct task_struct *p)
412{
413 set_tsk_need_resched(p);
414 return true;
415}
416
417#ifdef CONFIG_SMP
418static bool set_nr_if_polling(struct task_struct *p)
419{
420 return false;
421}
422#endif
423#endif
424
425void wake_q_add(struct wake_q_head *head, struct task_struct *task)
426{
427 struct wake_q_node *node = &task->wake_q;
428
429
430
431
432
433
434
435
436
437 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
438 return;
439
440 get_task_struct(task);
441
442
443
444
445 *head->lastp = node;
446 head->lastp = &node->next;
447}
448
449void wake_up_q(struct wake_q_head *head)
450{
451 struct wake_q_node *node = head->first;
452
453 while (node != WAKE_Q_TAIL) {
454 struct task_struct *task;
455
456 task = container_of(node, struct task_struct, wake_q);
457 BUG_ON(!task);
458
459 node = node->next;
460 task->wake_q.next = NULL;
461
462
463
464
465
466 wake_up_process(task);
467 put_task_struct(task);
468 }
469}
470
471
472
473
474
475
476
477
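/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On the local CPU this just sets the need_resched state; for a remote
 * CPU it may also send a reschedule IPI, unless the target is polling
 * and will notice the flag on its own.
 */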
478void resched_curr(struct rq *rq)
479{
480 struct task_struct *curr = rq->curr;
481 int cpu;
482
483 lockdep_assert_held(&rq->lock);
484
485 if (test_tsk_need_resched(curr))
486 return;
487
488 cpu = cpu_of(rq);
489
490 if (cpu == smp_processor_id()) {
491 set_tsk_need_resched(curr);
492 set_preempt_need_resched();
493 return;
494 }
495
496 if (set_nr_and_not_polling(curr))
497 smp_send_reschedule(cpu);
498 else
499 trace_sched_wake_idle_without_ipi(cpu);
500}
501
502void resched_cpu(int cpu)
503{
504 struct rq *rq = cpu_rq(cpu);
505 unsigned long flags;
506
507 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
508 return;
509 resched_curr(rq);
510 raw_spin_unlock_irqrestore(&rq->lock, flags);
511}
512
513#ifdef CONFIG_SMP
514#ifdef CONFIG_NO_HZ_COMMON
515
516
517
518
519
520
521
522
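/*
 * get_nohz_timer_target() - pick a CPU to queue a timer on when this CPU
 * is idle or not a housekeeping CPU: prefer a busy housekeeping CPU in
 * this CPU's sched domains, otherwise fall back to any housekeeping CPU
 * if this CPU is not one itself.
 */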
523int get_nohz_timer_target(void)
524{
525 int i, cpu = smp_processor_id();
526 struct sched_domain *sd;
527
528 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
529 return cpu;
530
531 rcu_read_lock();
532 for_each_domain(cpu, sd) {
533 for_each_cpu(i, sched_domain_span(sd)) {
534 if (cpu == i)
535 continue;
536
537 if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
538 cpu = i;
539 goto unlock;
540 }
541 }
542 }
543
544 if (!is_housekeeping_cpu(cpu))
545 cpu = housekeeping_any_cpu();
546unlock:
547 rcu_read_unlock();
548 return cpu;
549}
550
551
552
553
554
555
556
557
558
559
560
561static void wake_up_idle_cpu(int cpu)
562{
563 struct rq *rq = cpu_rq(cpu);
564
565 if (cpu == smp_processor_id())
566 return;
567
568 if (set_nr_and_not_polling(rq->idle))
569 smp_send_reschedule(cpu);
570 else
571 trace_sched_wake_idle_without_ipi(cpu);
572}
573
574static bool wake_up_full_nohz_cpu(int cpu)
575{
576
577
578
579
580
581
582 if (cpu_is_offline(cpu))
583 return true;
584 if (tick_nohz_full_cpu(cpu)) {
585 if (cpu != smp_processor_id() ||
586 tick_nohz_tick_stopped())
587 tick_nohz_full_kick_cpu(cpu);
588 return true;
589 }
590
591 return false;
592}
593
594
595
596
597
598
599void wake_up_nohz_cpu(int cpu)
600{
601 if (!wake_up_full_nohz_cpu(cpu))
602 wake_up_idle_cpu(cpu);
603}
604
605static inline bool got_nohz_idle_kick(void)
606{
607 int cpu = smp_processor_id();
608
609 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
610 return false;
611
612 if (idle_cpu(cpu) && !need_resched())
613 return true;
614
615
616
617
618
619 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
620 return false;
621}
622
623#else
624
625static inline bool got_nohz_idle_kick(void)
626{
627 return false;
628}
629
630#endif
631
632#ifdef CONFIG_NO_HZ_FULL
633bool sched_can_stop_tick(struct rq *rq)
634{
635 int fifo_nr_running;
636
637
638 if (rq->dl.dl_nr_running)
639 return false;
640
641
642
643
644
645 if (rq->rt.rr_nr_running) {
646 if (rq->rt.rr_nr_running == 1)
647 return true;
648 else
649 return false;
650 }
651
652
653
654
655
656 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
657 if (fifo_nr_running)
658 return true;
659
660
661
662
663
664
665 if (rq->nr_running > 1)
666 return false;
667
668 return true;
669}
670#endif
671
672void sched_avg_update(struct rq *rq)
673{
674 s64 period = sched_avg_period();
675
676 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
677
678
679
680
681
682 asm("" : "+rm" (rq->age_stamp));
683 rq->age_stamp += period;
684 rq->rt_avg /= 2;
685 }
686}
687
688#endif
689
690#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
691 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
692
693
694
695
696
697
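/*
 * Walk the task_group tree rooted at *from, calling @down when first
 * entering a group and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or a sufficient equivalent.
 */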
698int walk_tg_tree_from(struct task_group *from,
699 tg_visitor down, tg_visitor up, void *data)
700{
701 struct task_group *parent, *child;
702 int ret;
703
704 parent = from;
705
706down:
707 ret = (*down)(parent, data);
708 if (ret)
709 goto out;
710 list_for_each_entry_rcu(child, &parent->children, siblings) {
711 parent = child;
712 goto down;
713
714up:
715 continue;
716 }
717 ret = (*up)(parent, data);
718 if (ret || parent == from)
719 goto out;
720
721 child = parent;
722 parent = parent->parent;
723 if (parent)
724 goto up;
725out:
726 return ret;
727}
728
729int tg_nop(struct task_group *tg, void *data)
730{
731 return 0;
732}
733#endif
734
735static void set_load_weight(struct task_struct *p)
736{
737 int prio = p->static_prio - MAX_RT_PRIO;
738 struct load_weight *load = &p->se.load;
739
740
741
742
743 if (idle_policy(p->policy)) {
744 load->weight = scale_load(WEIGHT_IDLEPRIO);
745 load->inv_weight = WMULT_IDLEPRIO;
746 return;
747 }
748
749 load->weight = scale_load(sched_prio_to_weight[prio]);
750 load->inv_weight = sched_prio_to_wmult[prio];
751}
752
753static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
754{
755 if (!(flags & ENQUEUE_NOCLOCK))
756 update_rq_clock(rq);
757
758 if (!(flags & ENQUEUE_RESTORE))
759 sched_info_queued(rq, p);
760
761 p->sched_class->enqueue_task(rq, p, flags);
762}
763
764static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
765{
766 if (!(flags & DEQUEUE_NOCLOCK))
767 update_rq_clock(rq);
768
769 if (!(flags & DEQUEUE_SAVE))
770 sched_info_dequeued(rq, p);
771
772 p->sched_class->dequeue_task(rq, p, flags);
773}
774
775void activate_task(struct rq *rq, struct task_struct *p, int flags)
776{
777 if (task_contributes_to_load(p))
778 rq->nr_uninterruptible--;
779
780 enqueue_task(rq, p, flags);
781}
782
783void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
784{
785 if (task_contributes_to_load(p))
786 rq->nr_uninterruptible++;
787
788 dequeue_task(rq, p, flags);
789}
790
791void sched_set_stop_task(int cpu, struct task_struct *stop)
792{
793 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
794 struct task_struct *old_stop = cpu_rq(cpu)->stop;
795
796 if (stop) {
797
798
799
800
801
802
803
804
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
806
807 stop->sched_class = &stop_sched_class;
808 }
809
810 cpu_rq(cpu)->stop = stop;
811
812 if (old_stop) {
813
814
815
816
817 old_stop->sched_class = &rt_sched_class;
818 }
819}
820
821
822
823
824static inline int __normal_prio(struct task_struct *p)
825{
826 return p->static_prio;
827}
828
829
830
831
832
833
834
835
836static inline int normal_prio(struct task_struct *p)
837{
838 int prio;
839
840 if (task_has_dl_policy(p))
841 prio = MAX_DL_PRIO-1;
842 else if (task_has_rt_policy(p))
843 prio = MAX_RT_PRIO-1 - p->rt_priority;
844 else
845 prio = __normal_prio(p);
846 return prio;
847}
848
849
850
851
852
853
854
855
856static int effective_prio(struct task_struct *p)
857{
858 p->normal_prio = normal_prio(p);
859
860
861
862
863
864 if (!rt_prio(p->prio))
865 return p->normal_prio;
866 return p->prio;
867}
868
869
870
871
872
873
874
875inline int task_curr(const struct task_struct *p)
876{
877 return cpu_curr(task_cpu(p)) == p;
878}
879
880
881
882
883
884
885
886
887static inline void check_class_changed(struct rq *rq, struct task_struct *p,
888 const struct sched_class *prev_class,
889 int oldprio)
890{
891 if (prev_class != p->sched_class) {
892 if (prev_class->switched_from)
893 prev_class->switched_from(rq, p);
894
895 p->sched_class->switched_to(rq, p);
896 } else if (oldprio != p->prio || dl_task(p))
897 p->sched_class->prio_changed(rq, p, oldprio);
898}
899
900void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
901{
902 const struct sched_class *class;
903
904 if (p->sched_class == rq->curr->sched_class) {
905 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
906 } else {
907 for_each_class(class) {
908 if (class == rq->curr->sched_class)
909 break;
910 if (class == p->sched_class) {
911 resched_curr(rq);
912 break;
913 }
914 }
915 }
916
917
918
919
920
921 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
922 rq_clock_skip_update(rq, true);
923}
924
925#ifdef CONFIG_SMP
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
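/*
 * move_queued_task() - move a queued task to a new rq.
 *
 * Returns (locked) new rq.  The old rq's lock is released.
 */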
945static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
946 struct task_struct *p, int new_cpu)
947{
948 lockdep_assert_held(&rq->lock);
949
950 p->on_rq = TASK_ON_RQ_MIGRATING;
951 dequeue_task(rq, p, DEQUEUE_NOCLOCK);
952 set_task_cpu(p, new_cpu);
953 rq_unlock(rq, rf);
954
955 rq = cpu_rq(new_cpu);
956
957 rq_lock(rq, rf);
958 BUG_ON(task_cpu(p) != new_cpu);
959 enqueue_task(rq, p, 0);
960 p->on_rq = TASK_ON_RQ_QUEUED;
961 check_preempt_curr(rq, p, 0);
962
963 return rq;
964}
965
966struct migration_arg {
967 struct task_struct *task;
968 int dest_cpu;
969};
970
971
972
973
974
975
976
977
978
979
980static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
981 struct task_struct *p, int dest_cpu)
982{
983 if (unlikely(!cpu_active(dest_cpu)))
984 return rq;
985
986
987 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
988 return rq;
989
990 update_rq_clock(rq);
991 rq = move_queued_task(rq, rf, p, dest_cpu);
992
993 return rq;
994}
995
996
997
998
999
1000
1001static int migration_cpu_stop(void *data)
1002{
1003 struct migration_arg *arg = data;
1004 struct task_struct *p = arg->task;
1005 struct rq *rq = this_rq();
1006 struct rq_flags rf;
1007
1008
1009
1010
1011
1012 local_irq_disable();
1013
1014
1015
1016
1017
1018 sched_ttwu_pending();
1019
1020 raw_spin_lock(&p->pi_lock);
1021 rq_lock(rq, &rf);
1022
1023
1024
1025
1026
1027 if (task_rq(p) == rq) {
1028 if (task_on_rq_queued(p))
1029 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1030 else
1031 p->wake_cpu = arg->dest_cpu;
1032 }
1033 rq_unlock(rq, &rf);
1034 raw_spin_unlock(&p->pi_lock);
1035
1036 local_irq_enable();
1037 return 0;
1038}
1039
1040
1041
1042
1043
1044void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1045{
1046 cpumask_copy(&p->cpus_allowed, new_mask);
1047 p->nr_cpus_allowed = cpumask_weight(new_mask);
1048}
1049
1050void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1051{
1052 struct rq *rq = task_rq(p);
1053 bool queued, running;
1054
1055 lockdep_assert_held(&p->pi_lock);
1056
1057 queued = task_on_rq_queued(p);
1058 running = task_current(rq, p);
1059
1060 if (queued) {
1061
1062
1063
1064
1065 lockdep_assert_held(&rq->lock);
1066 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1067 }
1068 if (running)
1069 put_prev_task(rq, p);
1070
1071 p->sched_class->set_cpus_allowed(p, new_mask);
1072
1073 if (queued)
1074 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1075 if (running)
1076 set_curr_task(rq, p);
1077}
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088static int __set_cpus_allowed_ptr(struct task_struct *p,
1089 const struct cpumask *new_mask, bool check)
1090{
1091 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1092 unsigned int dest_cpu;
1093 struct rq_flags rf;
1094 struct rq *rq;
1095 int ret = 0;
1096
1097 rq = task_rq_lock(p, &rf);
1098 update_rq_clock(rq);
1099
1100 if (p->flags & PF_KTHREAD) {
1101
1102
1103
1104 cpu_valid_mask = cpu_online_mask;
1105 }
1106
1107
1108
1109
1110
1111 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1112 ret = -EINVAL;
1113 goto out;
1114 }
1115
1116 if (cpumask_equal(&p->cpus_allowed, new_mask))
1117 goto out;
1118
1119 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
1120 ret = -EINVAL;
1121 goto out;
1122 }
1123
1124 do_set_cpus_allowed(p, new_mask);
1125
1126 if (p->flags & PF_KTHREAD) {
1127
1128
1129
1130
1131 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1132 !cpumask_intersects(new_mask, cpu_active_mask) &&
1133 p->nr_cpus_allowed != 1);
1134 }
1135
1136
1137 if (cpumask_test_cpu(task_cpu(p), new_mask))
1138 goto out;
1139
1140 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1141 if (task_running(rq, p) || p->state == TASK_WAKING) {
1142 struct migration_arg arg = { p, dest_cpu };
1143
1144 task_rq_unlock(rq, p, &rf);
1145 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1146 tlb_migrate_finish(p->mm);
1147 return 0;
1148 } else if (task_on_rq_queued(p)) {
1149
1150
1151
1152
1153 rq = move_queued_task(rq, &rf, p, dest_cpu);
1154 }
1155out:
1156 task_rq_unlock(rq, p, &rf);
1157
1158 return ret;
1159}
1160
1161int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1162{
1163 return __set_cpus_allowed_ptr(p, new_mask, false);
1164}
1165EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1166
1167void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1168{
1169#ifdef CONFIG_SCHED_DEBUG
1170
1171
1172
1173
1174 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1175 !p->on_rq);
1176
1177
1178
1179
1180
1181
1182 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1183 p->sched_class == &fair_sched_class &&
1184 (p->on_rq && !task_on_rq_migrating(p)));
1185
1186#ifdef CONFIG_LOCKDEP
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1198 lockdep_is_held(&task_rq(p)->lock)));
1199#endif
1200#endif
1201
1202 trace_sched_migrate_task(p, new_cpu);
1203
1204 if (task_cpu(p) != new_cpu) {
1205 if (p->sched_class->migrate_task_rq)
1206 p->sched_class->migrate_task_rq(p);
1207 p->se.nr_migrations++;
1208 perf_event_task_migrate(p);
1209 }
1210
1211 __set_task_cpu(p, new_cpu);
1212}
1213
1214static void __migrate_swap_task(struct task_struct *p, int cpu)
1215{
1216 if (task_on_rq_queued(p)) {
1217 struct rq *src_rq, *dst_rq;
1218 struct rq_flags srf, drf;
1219
1220 src_rq = task_rq(p);
1221 dst_rq = cpu_rq(cpu);
1222
1223 rq_pin_lock(src_rq, &srf);
1224 rq_pin_lock(dst_rq, &drf);
1225
1226 p->on_rq = TASK_ON_RQ_MIGRATING;
1227 deactivate_task(src_rq, p, 0);
1228 set_task_cpu(p, cpu);
1229 activate_task(dst_rq, p, 0);
1230 p->on_rq = TASK_ON_RQ_QUEUED;
1231 check_preempt_curr(dst_rq, p, 0);
1232
1233 rq_unpin_lock(dst_rq, &drf);
1234 rq_unpin_lock(src_rq, &srf);
1235
1236 } else {
1237
1238
1239
1240
1241
1242 p->wake_cpu = cpu;
1243 }
1244}
1245
1246struct migration_swap_arg {
1247 struct task_struct *src_task, *dst_task;
1248 int src_cpu, dst_cpu;
1249};
1250
1251static int migrate_swap_stop(void *data)
1252{
1253 struct migration_swap_arg *arg = data;
1254 struct rq *src_rq, *dst_rq;
1255 int ret = -EAGAIN;
1256
1257 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1258 return -EAGAIN;
1259
1260 src_rq = cpu_rq(arg->src_cpu);
1261 dst_rq = cpu_rq(arg->dst_cpu);
1262
1263 double_raw_lock(&arg->src_task->pi_lock,
1264 &arg->dst_task->pi_lock);
1265 double_rq_lock(src_rq, dst_rq);
1266
1267 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1268 goto unlock;
1269
1270 if (task_cpu(arg->src_task) != arg->src_cpu)
1271 goto unlock;
1272
1273 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
1274 goto unlock;
1275
1276 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
1277 goto unlock;
1278
1279 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1280 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1281
1282 ret = 0;
1283
1284unlock:
1285 double_rq_unlock(src_rq, dst_rq);
1286 raw_spin_unlock(&arg->dst_task->pi_lock);
1287 raw_spin_unlock(&arg->src_task->pi_lock);
1288
1289 return ret;
1290}
1291
1292
1293
1294
1295int migrate_swap(struct task_struct *cur, struct task_struct *p)
1296{
1297 struct migration_swap_arg arg;
1298 int ret = -EINVAL;
1299
1300 arg = (struct migration_swap_arg){
1301 .src_task = cur,
1302 .src_cpu = task_cpu(cur),
1303 .dst_task = p,
1304 .dst_cpu = task_cpu(p),
1305 };
1306
1307 if (arg.src_cpu == arg.dst_cpu)
1308 goto out;
1309
1310
1311
1312
1313
1314 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1315 goto out;
1316
1317 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
1318 goto out;
1319
1320 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
1321 goto out;
1322
1323 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1324 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1325
1326out:
1327 return ret;
1328}
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
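/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero and @p->state no longer matches it, return 0.
 * Otherwise spin/sleep until @p is off its CPU and off the runqueue, then
 * return a nonzero cookie (p->nvcsw | LONG_MIN) that the caller can compare
 * against a later call to detect whether @p was rescheduled in between.
 */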
1346unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1347{
1348 int running, queued;
1349 struct rq_flags rf;
1350 unsigned long ncsw;
1351 struct rq *rq;
1352
1353 for (;;) {
1354
1355
1356
1357
1358
1359
1360 rq = task_rq(p);
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373 while (task_running(rq, p)) {
1374 if (match_state && unlikely(p->state != match_state))
1375 return 0;
1376 cpu_relax();
1377 }
1378
1379
1380
1381
1382
1383
1384 rq = task_rq_lock(p, &rf);
1385 trace_sched_wait_task(p);
1386 running = task_running(rq, p);
1387 queued = task_on_rq_queued(p);
1388 ncsw = 0;
1389 if (!match_state || p->state == match_state)
1390 ncsw = p->nvcsw | LONG_MIN;
1391 task_rq_unlock(rq, p, &rf);
1392
1393
1394
1395
1396 if (unlikely(!ncsw))
1397 break;
1398
1399
1400
1401
1402
1403
1404
1405 if (unlikely(running)) {
1406 cpu_relax();
1407 continue;
1408 }
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419 if (unlikely(queued)) {
1420 ktime_t to = NSEC_PER_SEC / HZ;
1421
1422 set_current_state(TASK_UNINTERRUPTIBLE);
1423 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1424 continue;
1425 }
1426
1427
1428
1429
1430
1431
1432 break;
1433 }
1434
1435 return ncsw;
1436}
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451void kick_process(struct task_struct *p)
1452{
1453 int cpu;
1454
1455 preempt_disable();
1456 cpu = task_cpu(p);
1457 if ((cpu != smp_processor_id()) && task_curr(p))
1458 smp_send_reschedule(cpu);
1459 preempt_enable();
1460}
1461EXPORT_SYMBOL_GPL(kick_process);
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
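/*
 * select_fallback_rq() - find a usable CPU for @p when task_cpu(p) is no
 * longer valid: try other allowed CPUs on the same NUMA node, then any
 * allowed online CPU, then progressively widen the affinity mask via the
 * cpuset fallback and finally cpu_possible_mask.
 */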
1485static int select_fallback_rq(int cpu, struct task_struct *p)
1486{
1487 int nid = cpu_to_node(cpu);
1488 const struct cpumask *nodemask = NULL;
1489 enum { cpuset, possible, fail } state = cpuset;
1490 int dest_cpu;
1491
1492
1493
1494
1495
1496
1497 if (nid != -1) {
1498 nodemask = cpumask_of_node(nid);
1499
1500
1501 for_each_cpu(dest_cpu, nodemask) {
1502 if (!cpu_active(dest_cpu))
1503 continue;
1504 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
1505 return dest_cpu;
1506 }
1507 }
1508
1509 for (;;) {
1510
1511 for_each_cpu(dest_cpu, &p->cpus_allowed) {
1512 if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1513 continue;
1514 if (!cpu_online(dest_cpu))
1515 continue;
1516 goto out;
1517 }
1518
1519
1520 switch (state) {
1521 case cpuset:
1522 if (IS_ENABLED(CONFIG_CPUSETS)) {
1523 cpuset_cpus_allowed_fallback(p);
1524 state = possible;
1525 break;
1526 }
1527
1528 case possible:
1529 do_set_cpus_allowed(p, cpu_possible_mask);
1530 state = fail;
1531 break;
1532
1533 case fail:
1534 BUG();
1535 break;
1536 }
1537 }
1538
1539out:
1540 if (state != cpuset) {
1541
1542
1543
1544
1545
1546 if (p->mm && printk_ratelimit()) {
1547 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1548 task_pid_nr(p), p->comm, cpu);
1549 }
1550 }
1551
1552 return dest_cpu;
1553}
1554
1555
1556
1557
1558static inline
1559int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1560{
1561 lockdep_assert_held(&p->pi_lock);
1562
1563 if (p->nr_cpus_allowed > 1)
1564 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1565 else
1566 cpu = cpumask_any(&p->cpus_allowed);
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
1579 !cpu_online(cpu)))
1580 cpu = select_fallback_rq(task_cpu(p), p);
1581
1582 return cpu;
1583}
1584
1585static void update_avg(u64 *avg, u64 sample)
1586{
1587 s64 diff = sample - *avg;
1588 *avg += diff >> 3;
1589}
1590
1591#else
1592
1593static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1594 const struct cpumask *new_mask, bool check)
1595{
1596 return set_cpus_allowed_ptr(p, new_mask);
1597}
1598
1599#endif
1600
1601static void
1602ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1603{
1604 struct rq *rq;
1605
1606 if (!schedstat_enabled())
1607 return;
1608
1609 rq = this_rq();
1610
1611#ifdef CONFIG_SMP
1612 if (cpu == rq->cpu) {
1613 schedstat_inc(rq->ttwu_local);
1614 schedstat_inc(p->se.statistics.nr_wakeups_local);
1615 } else {
1616 struct sched_domain *sd;
1617
1618 schedstat_inc(p->se.statistics.nr_wakeups_remote);
1619 rcu_read_lock();
1620 for_each_domain(rq->cpu, sd) {
1621 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1622 schedstat_inc(sd->ttwu_wake_remote);
1623 break;
1624 }
1625 }
1626 rcu_read_unlock();
1627 }
1628
1629 if (wake_flags & WF_MIGRATED)
1630 schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1631#endif
1632
1633 schedstat_inc(rq->ttwu_count);
1634 schedstat_inc(p->se.statistics.nr_wakeups);
1635
1636 if (wake_flags & WF_SYNC)
1637 schedstat_inc(p->se.statistics.nr_wakeups_sync);
1638}
1639
1640static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1641{
1642 activate_task(rq, p, en_flags);
1643 p->on_rq = TASK_ON_RQ_QUEUED;
1644
1645
1646 if (p->flags & PF_WQ_WORKER)
1647 wq_worker_waking_up(p, cpu_of(rq));
1648}
1649
1650
1651
1652
1653static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
1654 struct rq_flags *rf)
1655{
1656 check_preempt_curr(rq, p, wake_flags);
1657 p->state = TASK_RUNNING;
1658 trace_sched_wakeup(p);
1659
1660#ifdef CONFIG_SMP
1661 if (p->sched_class->task_woken) {
1662
1663
1664
1665
1666 rq_unpin_lock(rq, rf);
1667 p->sched_class->task_woken(rq, p);
1668 rq_repin_lock(rq, rf);
1669 }
1670
1671 if (rq->idle_stamp) {
1672 u64 delta = rq_clock(rq) - rq->idle_stamp;
1673 u64 max = 2*rq->max_idle_balance_cost;
1674
1675 update_avg(&rq->avg_idle, delta);
1676
1677 if (rq->avg_idle > max)
1678 rq->avg_idle = max;
1679
1680 rq->idle_stamp = 0;
1681 }
1682#endif
1683}
1684
1685static void
1686ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1687 struct rq_flags *rf)
1688{
1689 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1690
1691 lockdep_assert_held(&rq->lock);
1692
1693#ifdef CONFIG_SMP
1694 if (p->sched_contributes_to_load)
1695 rq->nr_uninterruptible--;
1696
1697 if (wake_flags & WF_MIGRATED)
1698 en_flags |= ENQUEUE_MIGRATED;
1699#endif
1700
1701 ttwu_activate(rq, p, en_flags);
1702 ttwu_do_wakeup(rq, p, wake_flags, rf);
1703}
1704
1705
1706
1707
1708
1709
1710
1711static int ttwu_remote(struct task_struct *p, int wake_flags)
1712{
1713 struct rq_flags rf;
1714 struct rq *rq;
1715 int ret = 0;
1716
1717 rq = __task_rq_lock(p, &rf);
1718 if (task_on_rq_queued(p)) {
1719
1720 update_rq_clock(rq);
1721 ttwu_do_wakeup(rq, p, wake_flags, &rf);
1722 ret = 1;
1723 }
1724 __task_rq_unlock(rq, &rf);
1725
1726 return ret;
1727}
1728
1729#ifdef CONFIG_SMP
1730void sched_ttwu_pending(void)
1731{
1732 struct rq *rq = this_rq();
1733 struct llist_node *llist = llist_del_all(&rq->wake_list);
1734 struct task_struct *p;
1735 struct rq_flags rf;
1736
1737 if (!llist)
1738 return;
1739
1740 rq_lock_irqsave(rq, &rf);
1741 update_rq_clock(rq);
1742
1743 while (llist) {
1744 int wake_flags = 0;
1745
1746 p = llist_entry(llist, struct task_struct, wake_entry);
1747 llist = llist_next(llist);
1748
1749 if (p->sched_remote_wakeup)
1750 wake_flags = WF_MIGRATED;
1751
1752 ttwu_do_activate(rq, p, wake_flags, &rf);
1753 }
1754
1755 rq_unlock_irqrestore(rq, &rf);
1756}
1757
1758void scheduler_ipi(void)
1759{
1760
1761
1762
1763
1764
1765 preempt_fold_need_resched();
1766
1767 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1768 return;
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783 irq_enter();
1784 sched_ttwu_pending();
1785
1786
1787
1788
1789 if (unlikely(got_nohz_idle_kick())) {
1790 this_rq()->idle_balance = 1;
1791 raise_softirq_irqoff(SCHED_SOFTIRQ);
1792 }
1793 irq_exit();
1794}
1795
1796static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1797{
1798 struct rq *rq = cpu_rq(cpu);
1799
1800 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1801
1802 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1803 if (!set_nr_if_polling(rq->idle))
1804 smp_send_reschedule(cpu);
1805 else
1806 trace_sched_wake_idle_without_ipi(cpu);
1807 }
1808}
1809
1810void wake_up_if_idle(int cpu)
1811{
1812 struct rq *rq = cpu_rq(cpu);
1813 struct rq_flags rf;
1814
1815 rcu_read_lock();
1816
1817 if (!is_idle_task(rcu_dereference(rq->curr)))
1818 goto out;
1819
1820 if (set_nr_if_polling(rq->idle)) {
1821 trace_sched_wake_idle_without_ipi(cpu);
1822 } else {
1823 rq_lock_irqsave(rq, &rf);
1824 if (is_idle_task(rq->curr))
1825 smp_send_reschedule(cpu);
1826
1827 rq_unlock_irqrestore(rq, &rf);
1828 }
1829
1830out:
1831 rcu_read_unlock();
1832}
1833
1834bool cpus_share_cache(int this_cpu, int that_cpu)
1835{
1836 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1837}
1838#endif
1839
1840static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1841{
1842 struct rq *rq = cpu_rq(cpu);
1843 struct rq_flags rf;
1844
1845#if defined(CONFIG_SMP)
1846 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1847 sched_clock_cpu(cpu);
1848 ttwu_queue_remote(p, cpu, wake_flags);
1849 return;
1850 }
1851#endif
1852
1853 rq_lock(rq, &rf);
1854 update_rq_clock(rq);
1855 ttwu_do_activate(rq, p, wake_flags, &rf);
1856 rq_unlock(rq, &rf);
1857}
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
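/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * If (@p->state & @state), put @p back on a runqueue (possibly on a
 * different CPU chosen by select_task_rq()) and mark it TASK_RUNNING.
 *
 * Return: 1 if @p was woken up, 0 if its state did not match @state.
 */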
1966static int
1967try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1968{
1969 unsigned long flags;
1970 int cpu, success = 0;
1971
1972
1973
1974
1975
1976
1977
1978 smp_mb__before_spinlock();
1979 raw_spin_lock_irqsave(&p->pi_lock, flags);
1980 if (!(p->state & state))
1981 goto out;
1982
1983 trace_sched_waking(p);
1984
1985
1986 success = 1;
1987 cpu = task_cpu(p);
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010 smp_rmb();
2011 if (p->on_rq && ttwu_remote(p, wake_flags))
2012 goto stat;
2013
2014#ifdef CONFIG_SMP
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032 smp_rmb();
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043 smp_cond_load_acquire(&p->on_cpu, !VAL);
2044
2045 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2046 p->state = TASK_WAKING;
2047
2048 if (p->in_iowait) {
2049 delayacct_blkio_end();
2050 atomic_dec(&task_rq(p)->nr_iowait);
2051 }
2052
2053 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2054 if (task_cpu(p) != cpu) {
2055 wake_flags |= WF_MIGRATED;
2056 set_task_cpu(p, cpu);
2057 }
2058
2059#else
2060
2061 if (p->in_iowait) {
2062 delayacct_blkio_end();
2063 atomic_dec(&task_rq(p)->nr_iowait);
2064 }
2065
2066#endif
2067
2068 ttwu_queue(p, cpu, wake_flags);
2069stat:
2070 ttwu_stat(p, cpu, wake_flags);
2071out:
2072 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2073
2074 return success;
2075}
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2087{
2088 struct rq *rq = task_rq(p);
2089
2090 if (WARN_ON_ONCE(rq != this_rq()) ||
2091 WARN_ON_ONCE(p == current))
2092 return;
2093
2094 lockdep_assert_held(&rq->lock);
2095
2096 if (!raw_spin_trylock(&p->pi_lock)) {
2097
2098
2099
2100
2101
2102
2103 rq_unlock(rq, rf);
2104 raw_spin_lock(&p->pi_lock);
2105 rq_relock(rq, rf);
2106 }
2107
2108 if (!(p->state & TASK_NORMAL))
2109 goto out;
2110
2111 trace_sched_waking(p);
2112
2113 if (!task_on_rq_queued(p)) {
2114 if (p->in_iowait) {
2115 delayacct_blkio_end();
2116 atomic_dec(&rq->nr_iowait);
2117 }
2118 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2119 }
2120
2121 ttwu_do_wakeup(rq, p, 0, rf);
2122 ttwu_stat(p, smp_processor_id(), 0);
2123out:
2124 raw_spin_unlock(&p->pi_lock);
2125}
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139int wake_up_process(struct task_struct *p)
2140{
2141 return try_to_wake_up(p, TASK_NORMAL, 0);
2142}
2143EXPORT_SYMBOL(wake_up_process);
2144
2145int wake_up_state(struct task_struct *p, unsigned int state)
2146{
2147 return try_to_wake_up(p, state, 0);
2148}
2149
2150
2151
2152
2153void __dl_clear_params(struct task_struct *p)
2154{
2155 struct sched_dl_entity *dl_se = &p->dl;
2156
2157 dl_se->dl_runtime = 0;
2158 dl_se->dl_deadline = 0;
2159 dl_se->dl_period = 0;
2160 dl_se->flags = 0;
2161 dl_se->dl_bw = 0;
2162
2163 dl_se->dl_throttled = 0;
2164 dl_se->dl_yielded = 0;
2165}
2166
2167
2168
2169
2170
2171
2172
2173static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2174{
2175 p->on_rq = 0;
2176
2177 p->se.on_rq = 0;
2178 p->se.exec_start = 0;
2179 p->se.sum_exec_runtime = 0;
2180 p->se.prev_sum_exec_runtime = 0;
2181 p->se.nr_migrations = 0;
2182 p->se.vruntime = 0;
2183 INIT_LIST_HEAD(&p->se.group_node);
2184
2185#ifdef CONFIG_FAIR_GROUP_SCHED
2186 p->se.cfs_rq = NULL;
2187#endif
2188
2189#ifdef CONFIG_SCHEDSTATS
2190
2191 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2192#endif
2193
2194 RB_CLEAR_NODE(&p->dl.rb_node);
2195 init_dl_task_timer(&p->dl);
2196 __dl_clear_params(p);
2197
2198 INIT_LIST_HEAD(&p->rt.run_list);
2199 p->rt.timeout = 0;
2200 p->rt.time_slice = sched_rr_timeslice;
2201 p->rt.on_rq = 0;
2202 p->rt.on_list = 0;
2203
2204#ifdef CONFIG_PREEMPT_NOTIFIERS
2205 INIT_HLIST_HEAD(&p->preempt_notifiers);
2206#endif
2207
2208#ifdef CONFIG_NUMA_BALANCING
2209 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
2210 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2211 p->mm->numa_scan_seq = 0;
2212 }
2213
2214 if (clone_flags & CLONE_VM)
2215 p->numa_preferred_nid = current->numa_preferred_nid;
2216 else
2217 p->numa_preferred_nid = -1;
2218
2219 p->node_stamp = 0ULL;
2220 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
2221 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2222 p->numa_work.next = &p->numa_work;
2223 p->numa_faults = NULL;
2224 p->last_task_numa_placement = 0;
2225 p->last_sum_exec_runtime = 0;
2226
2227 p->numa_group = NULL;
2228#endif
2229}
2230
2231DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2232
2233#ifdef CONFIG_NUMA_BALANCING
2234
2235void set_numabalancing_state(bool enabled)
2236{
2237 if (enabled)
2238 static_branch_enable(&sched_numa_balancing);
2239 else
2240 static_branch_disable(&sched_numa_balancing);
2241}
2242
2243#ifdef CONFIG_PROC_SYSCTL
2244int sysctl_numa_balancing(struct ctl_table *table, int write,
2245 void __user *buffer, size_t *lenp, loff_t *ppos)
2246{
2247 struct ctl_table t;
2248 int err;
2249 int state = static_branch_likely(&sched_numa_balancing);
2250
2251 if (write && !capable(CAP_SYS_ADMIN))
2252 return -EPERM;
2253
2254 t = *table;
2255 t.data = &state;
2256 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2257 if (err < 0)
2258 return err;
2259 if (write)
2260 set_numabalancing_state(state);
2261 return err;
2262}
2263#endif
2264#endif
2265
2266#ifdef CONFIG_SCHEDSTATS
2267
2268DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2269static bool __initdata __sched_schedstats = false;
2270
2271static void set_schedstats(bool enabled)
2272{
2273 if (enabled)
2274 static_branch_enable(&sched_schedstats);
2275 else
2276 static_branch_disable(&sched_schedstats);
2277}
2278
2279void force_schedstat_enabled(void)
2280{
2281 if (!schedstat_enabled()) {
2282 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2283 static_branch_enable(&sched_schedstats);
2284 }
2285}
2286
2287static int __init setup_schedstats(char *str)
2288{
2289 int ret = 0;
2290 if (!str)
2291 goto out;
2292
2293
2294
2295
2296
2297
2298 if (!strcmp(str, "enable")) {
2299 __sched_schedstats = true;
2300 ret = 1;
2301 } else if (!strcmp(str, "disable")) {
2302 __sched_schedstats = false;
2303 ret = 1;
2304 }
2305out:
2306 if (!ret)
2307 pr_warn("Unable to parse schedstats=\n");
2308
2309 return ret;
2310}
2311__setup("schedstats=", setup_schedstats);
2312
2313static void __init init_schedstats(void)
2314{
2315 set_schedstats(__sched_schedstats);
2316}
2317
2318#ifdef CONFIG_PROC_SYSCTL
2319int sysctl_schedstats(struct ctl_table *table, int write,
2320 void __user *buffer, size_t *lenp, loff_t *ppos)
2321{
2322 struct ctl_table t;
2323 int err;
2324 int state = static_branch_likely(&sched_schedstats);
2325
2326 if (write && !capable(CAP_SYS_ADMIN))
2327 return -EPERM;
2328
2329 t = *table;
2330 t.data = &state;
2331 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2332 if (err < 0)
2333 return err;
2334 if (write)
2335 set_schedstats(state);
2336 return err;
2337}
2338#endif
2339#else
2340static inline void init_schedstats(void) {}
2341#endif
2342
2343
2344
2345
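/*
 * fork()/clone()-time setup:
 */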
2346int sched_fork(unsigned long clone_flags, struct task_struct *p)
2347{
2348 unsigned long flags;
2349 int cpu = get_cpu();
2350
2351 __sched_fork(clone_flags, p);
2352
2353
2354
2355
2356
2357 p->state = TASK_NEW;
2358
2359
2360
2361
2362 p->prio = current->normal_prio;
2363
2364
2365
2366
2367 if (unlikely(p->sched_reset_on_fork)) {
2368 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2369 p->policy = SCHED_NORMAL;
2370 p->static_prio = NICE_TO_PRIO(0);
2371 p->rt_priority = 0;
2372 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2373 p->static_prio = NICE_TO_PRIO(0);
2374
2375 p->prio = p->normal_prio = __normal_prio(p);
2376 set_load_weight(p);
2377
2378
2379
2380
2381
2382 p->sched_reset_on_fork = 0;
2383 }
2384
2385 if (dl_prio(p->prio)) {
2386 put_cpu();
2387 return -EAGAIN;
2388 } else if (rt_prio(p->prio)) {
2389 p->sched_class = &rt_sched_class;
2390 } else {
2391 p->sched_class = &fair_sched_class;
2392 }
2393
2394 init_entity_runnable_average(&p->se);
2395
2396
2397
2398
2399
2400
2401
2402
2403 raw_spin_lock_irqsave(&p->pi_lock, flags);
2404
2405
2406
2407
2408 __set_task_cpu(p, cpu);
2409 if (p->sched_class->task_fork)
2410 p->sched_class->task_fork(p);
2411 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2412
2413#ifdef CONFIG_SCHED_INFO
2414 if (likely(sched_info_on()))
2415 memset(&p->sched_info, 0, sizeof(p->sched_info));
2416#endif
2417#if defined(CONFIG_SMP)
2418 p->on_cpu = 0;
2419#endif
2420 init_task_preempt_count(p);
2421#ifdef CONFIG_SMP
2422 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2423 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2424#endif
2425
2426 put_cpu();
2427 return 0;
2428}
2429
2430unsigned long to_ratio(u64 period, u64 runtime)
2431{
2432 if (runtime == RUNTIME_INF)
2433 return 1ULL << 20;
2434
2435
2436
2437
2438
2439
2440 if (period == 0)
2441 return 0;
2442
2443 return div64_u64(runtime << 20, period);
2444}
2445
2446#ifdef CONFIG_SMP
2447inline struct dl_bw *dl_bw_of(int i)
2448{
2449 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2450 "sched RCU must be held");
2451 return &cpu_rq(i)->rd->dl_bw;
2452}
2453
2454static inline int dl_bw_cpus(int i)
2455{
2456 struct root_domain *rd = cpu_rq(i)->rd;
2457 int cpus = 0;
2458
2459 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2460 "sched RCU must be held");
2461 for_each_cpu_and(i, rd->span, cpu_active_mask)
2462 cpus++;
2463
2464 return cpus;
2465}
2466#else
2467inline struct dl_bw *dl_bw_of(int i)
2468{
2469 return &cpu_rq(i)->dl.dl_bw;
2470}
2471
2472static inline int dl_bw_cpus(int i)
2473{
2474 return 1;
2475}
2476#endif
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489static int dl_overflow(struct task_struct *p, int policy,
2490 const struct sched_attr *attr)
2491{
2492
2493 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2494 u64 period = attr->sched_period ?: attr->sched_deadline;
2495 u64 runtime = attr->sched_runtime;
2496 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2497 int cpus, err = -1;
2498
2499
2500 if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
2501 return 0;
2502
2503
2504
2505
2506
2507
2508 raw_spin_lock(&dl_b->lock);
2509 cpus = dl_bw_cpus(task_cpu(p));
2510 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2511 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2512 __dl_add(dl_b, new_bw);
2513 err = 0;
2514 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2515 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2516 __dl_clear(dl_b, p->dl.dl_bw);
2517 __dl_add(dl_b, new_bw);
2518 err = 0;
2519 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2520 __dl_clear(dl_b, p->dl.dl_bw);
2521 err = 0;
2522 }
2523 raw_spin_unlock(&dl_b->lock);
2524
2525 return err;
2526}
2527
2528extern void init_dl_bw(struct dl_bw *dl_b);
2529
2530
2531
2532
2533
2534
2535
2536
2537void wake_up_new_task(struct task_struct *p)
2538{
2539 struct rq_flags rf;
2540 struct rq *rq;
2541
2542 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2543 p->state = TASK_RUNNING;
2544#ifdef CONFIG_SMP
2545
2546
2547
2548
2549
2550
2551
2552
2553 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2554#endif
2555 rq = __task_rq_lock(p, &rf);
2556 update_rq_clock(rq);
2557 post_init_entity_util_avg(&p->se);
2558
2559 activate_task(rq, p, ENQUEUE_NOCLOCK);
2560 p->on_rq = TASK_ON_RQ_QUEUED;
2561 trace_sched_wakeup_new(p);
2562 check_preempt_curr(rq, p, WF_FORK);
2563#ifdef CONFIG_SMP
2564 if (p->sched_class->task_woken) {
2565
2566
2567
2568
2569 rq_unpin_lock(rq, &rf);
2570 p->sched_class->task_woken(rq, p);
2571 rq_repin_lock(rq, &rf);
2572 }
2573#endif
2574 task_rq_unlock(rq, p, &rf);
2575}
2576
2577#ifdef CONFIG_PREEMPT_NOTIFIERS
2578
2579static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
2580
2581void preempt_notifier_inc(void)
2582{
2583 static_key_slow_inc(&preempt_notifier_key);
2584}
2585EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2586
2587void preempt_notifier_dec(void)
2588{
2589 static_key_slow_dec(&preempt_notifier_key);
2590}
2591EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2592
2593
2594
2595
2596
2597void preempt_notifier_register(struct preempt_notifier *notifier)
2598{
2599 if (!static_key_false(&preempt_notifier_key))
2600 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2601
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2603}
2604EXPORT_SYMBOL_GPL(preempt_notifier_register);
2605
2606
2607
2608
2609
2610
2611
2612void preempt_notifier_unregister(struct preempt_notifier *notifier)
2613{
	hlist_del(&notifier->link);
2615}
2616EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2617
2618static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2619{
2620 struct preempt_notifier *notifier;
2621
2622 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2623 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2624}
2625
2626static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2627{
2628 if (static_key_false(&preempt_notifier_key))
2629 __fire_sched_in_preempt_notifiers(curr);
2630}
2631
2632static void
2633__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2634 struct task_struct *next)
2635{
2636 struct preempt_notifier *notifier;
2637
2638 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2639 notifier->ops->sched_out(notifier, next);
2640}
2641
2642static __always_inline void
2643fire_sched_out_preempt_notifiers(struct task_struct *curr,
2644 struct task_struct *next)
2645{
2646 if (static_key_false(&preempt_notifier_key))
2647 __fire_sched_out_preempt_notifiers(curr, next);
2648}
2649
2650#else
2651
2652static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2653{
2654}
2655
2656static inline void
2657fire_sched_out_preempt_notifiers(struct task_struct *curr,
2658 struct task_struct *next)
2659{
2660}
2661
2662#endif
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677static inline void
2678prepare_task_switch(struct rq *rq, struct task_struct *prev,
2679 struct task_struct *next)
2680{
2681 sched_info_switch(rq, prev, next);
2682 perf_event_task_sched_out(prev, next);
2683 fire_sched_out_preempt_notifiers(prev, next);
2684 prepare_lock_switch(rq, next);
2685 prepare_arch_switch(next);
2686}
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
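/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * Must be called after the context switch, paired with prepare_task_switch().
 * Releases the runqueue lock state taken across the switch, drops the old mm
 * reference, and frees @prev if it has exited.  Returns the rq we switched to
 * so the caller can run the balance callbacks against the correct rq.
 */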
2707static struct rq *finish_task_switch(struct task_struct *prev)
2708 __releases(rq->lock)
2709{
2710 struct rq *rq = this_rq();
2711 struct mm_struct *mm = rq->prev_mm;
2712 long prev_state;
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2726 "corrupted preempt_count: %s/%d/0x%x\n",
2727 current->comm, current->pid, preempt_count()))
2728 preempt_count_set(FORK_PREEMPT_COUNT);
2729
2730 rq->prev_mm = NULL;
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743 prev_state = prev->state;
2744 vtime_task_switch(prev);
2745 perf_event_task_sched_in(prev, current);
2746 finish_lock_switch(rq, prev);
2747 finish_arch_post_lock_switch();
2748
2749 fire_sched_in_preempt_notifiers(current);
2750 if (mm)
2751 mmdrop(mm);
2752 if (unlikely(prev_state == TASK_DEAD)) {
2753 if (prev->sched_class->task_dead)
2754 prev->sched_class->task_dead(prev);
2755
2756
2757
2758
2759
2760 kprobe_flush_task(prev);
2761
2762
2763 put_task_stack(prev);
2764
2765 put_task_struct(prev);
2766 }
2767
2768 tick_nohz_task_switch();
2769 return rq;
2770}
2771
2772#ifdef CONFIG_SMP
2773
2774
2775static void __balance_callback(struct rq *rq)
2776{
2777 struct callback_head *head, *next;
2778 void (*func)(struct rq *rq);
2779 unsigned long flags;
2780
2781 raw_spin_lock_irqsave(&rq->lock, flags);
2782 head = rq->balance_callback;
2783 rq->balance_callback = NULL;
2784 while (head) {
2785 func = (void (*)(struct rq *))head->func;
2786 next = head->next;
2787 head->next = NULL;
2788 head = next;
2789
2790 func(rq);
2791 }
2792 raw_spin_unlock_irqrestore(&rq->lock, flags);
2793}
2794
2795static inline void balance_callback(struct rq *rq)
2796{
2797 if (unlikely(rq->balance_callback))
2798 __balance_callback(rq);
2799}
2800
2801#else
2802
2803static inline void balance_callback(struct rq *rq)
2804{
2805}
2806
2807#endif
2808
2809
2810
2811
2812
2813asmlinkage __visible void schedule_tail(struct task_struct *prev)
2814 __releases(rq->lock)
2815{
2816 struct rq *rq;
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827 rq = finish_task_switch(prev);
2828 balance_callback(rq);
2829 preempt_enable();
2830
2831 if (current->set_child_tid)
2832 put_user(task_pid_vnr(current), current->set_child_tid);
2833}
2834
2835
2836
2837
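/*
 * context_switch - switch to the new MM and the new thread's register state.
 */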
2838static __always_inline struct rq *
2839context_switch(struct rq *rq, struct task_struct *prev,
2840 struct task_struct *next, struct rq_flags *rf)
2841{
2842 struct mm_struct *mm, *oldmm;
2843
2844 prepare_task_switch(rq, prev, next);
2845
2846 mm = next->mm;
2847 oldmm = prev->active_mm;
2848
2849
2850
2851
2852
2853 arch_start_context_switch(prev);
2854
2855 if (!mm) {
2856 next->active_mm = oldmm;
2857 mmgrab(oldmm);
2858 enter_lazy_tlb(oldmm, next);
2859 } else
2860 switch_mm_irqs_off(oldmm, mm, next);
2861
2862 if (!prev->mm) {
2863 prev->active_mm = NULL;
2864 rq->prev_mm = oldmm;
2865 }
2866
2867 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
2868
2869
2870
2871
2872
2873
2874
2875 rq_unpin_lock(rq, rf);
2876 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2877
2878
2879 switch_to(prev, next, prev);
2880 barrier();
2881
2882 return finish_task_switch(prev);
2883}
2884
2885
2886
2887
2888
2889
2890
2891unsigned long nr_running(void)
2892{
2893 unsigned long i, sum = 0;
2894
2895 for_each_online_cpu(i)
2896 sum += cpu_rq(i)->nr_running;
2897
2898 return sum;
2899}
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914bool single_task_running(void)
2915{
2916 return raw_rq()->nr_running == 1;
2917}
2918EXPORT_SYMBOL(single_task_running);
2919
2920unsigned long long nr_context_switches(void)
2921{
2922 int i;
2923 unsigned long long sum = 0;
2924
2925 for_each_possible_cpu(i)
2926 sum += cpu_rq(i)->nr_switches;
2927
2928 return sum;
2929}
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961unsigned long nr_iowait(void)
2962{
2963 unsigned long i, sum = 0;
2964
2965 for_each_possible_cpu(i)
2966 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2967
2968 return sum;
2969}
2970
2971
2972
2973
2974
2975
2976
2977
2978unsigned long nr_iowait_cpu(int cpu)
2979{
2980 struct rq *this = cpu_rq(cpu);
2981 return atomic_read(&this->nr_iowait);
2982}
2983
2984void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2985{
2986 struct rq *rq = this_rq();
2987 *nr_waiters = atomic_read(&rq->nr_iowait);
2988 *load = rq->load.weight;
2989}
2990
2991#ifdef CONFIG_SMP
2992
2993
2994
2995
2996
2997void sched_exec(void)
2998{
2999 struct task_struct *p = current;
3000 unsigned long flags;
3001 int dest_cpu;
3002
3003 raw_spin_lock_irqsave(&p->pi_lock, flags);
3004 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
3005 if (dest_cpu == smp_processor_id())
3006 goto unlock;
3007
3008 if (likely(cpu_active(dest_cpu))) {
3009 struct migration_arg arg = { p, dest_cpu };
3010
3011 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3012 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3013 return;
3014 }
3015unlock:
3016 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3017}
3018
3019#endif
3020
3021DEFINE_PER_CPU(struct kernel_stat, kstat);
3022DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3023
3024EXPORT_PER_CPU_SYMBOL(kstat);
3025EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3026
3027
3028
3029
3030
3031
3032
3033static inline void prefetch_curr_exec_start(struct task_struct *p)
3034{
3035#ifdef CONFIG_FAIR_GROUP_SCHED
3036 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3037#else
3038 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3039#endif
3040 prefetch(curr);
3041 prefetch(&curr->exec_start);
3042}
3043
3044
3045
3046
3047
3048
3049unsigned long long task_sched_runtime(struct task_struct *p)
3050{
3051 struct rq_flags rf;
3052 struct rq *rq;
3053 u64 ns;
3054
3055#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067 if (!p->on_cpu || !task_on_rq_queued(p))
3068 return p->se.sum_exec_runtime;
3069#endif
3070
3071 rq = task_rq_lock(p, &rf);
3072
3073
3074
3075
3076
3077 if (task_current(rq, p) && task_on_rq_queued(p)) {
3078 prefetch_curr_exec_start(p);
3079 update_rq_clock(rq);
3080 p->sched_class->update_curr(rq);
3081 }
3082 ns = p->se.sum_exec_runtime;
3083 task_rq_unlock(rq, p, &rf);
3084
3085 return ns;
3086}
3087
3088
3089
3090
3091
3092void scheduler_tick(void)
3093{
3094 int cpu = smp_processor_id();
3095 struct rq *rq = cpu_rq(cpu);
3096 struct task_struct *curr = rq->curr;
3097 struct rq_flags rf;
3098
3099 sched_clock_tick();
3100
3101 rq_lock(rq, &rf);
3102
3103 update_rq_clock(rq);
3104 curr->sched_class->task_tick(rq, curr, 0);
3105 cpu_load_update_active(rq);
3106 calc_global_load_tick(rq);
3107
3108 rq_unlock(rq, &rf);
3109
3110 perf_event_task_tick();
3111
3112#ifdef CONFIG_SMP
3113 rq->idle_balance = idle_cpu(cpu);
3114 trigger_load_balance(rq);
3115#endif
3116 rq_last_tick_reset(rq);
3117}
3118
3119#ifdef CONFIG_NO_HZ_FULL
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133u64 scheduler_tick_max_deferment(void)
3134{
3135 struct rq *rq = this_rq();
3136 unsigned long next, now = READ_ONCE(jiffies);
3137
3138 next = rq->last_sched_tick + HZ;
3139
3140 if (time_before_eq(next, now))
3141 return 0;
3142
3143 return jiffies_to_nsecs(next - now);
3144}
3145#endif
3146
3147#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3148 defined(CONFIG_PREEMPT_TRACER))
3149
3150
3151
3152
3153static inline void preempt_latency_start(int val)
3154{
3155 if (preempt_count() == val) {
3156 unsigned long ip = get_lock_parent_ip();
3157#ifdef CONFIG_DEBUG_PREEMPT
3158 current->preempt_disable_ip = ip;
3159#endif
3160 trace_preempt_off(CALLER_ADDR0, ip);
3161 }
3162}
3163
3164void preempt_count_add(int val)
3165{
3166#ifdef CONFIG_DEBUG_PREEMPT
3167
3168
3169
3170 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3171 return;
3172#endif
3173 __preempt_count_add(val);
3174#ifdef CONFIG_DEBUG_PREEMPT
3175
3176
3177
3178 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3179 PREEMPT_MASK - 10);
3180#endif
3181 preempt_latency_start(val);
3182}
3183EXPORT_SYMBOL(preempt_count_add);
3184NOKPROBE_SYMBOL(preempt_count_add);
3185
3186
3187
3188
3189
3190static inline void preempt_latency_stop(int val)
3191{
3192 if (preempt_count() == val)
3193 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3194}
3195
3196void preempt_count_sub(int val)
3197{
3198#ifdef CONFIG_DEBUG_PREEMPT
3199
3200
3201
3202 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3203 return;
3204
3205
3206
3207 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3208 !(preempt_count() & PREEMPT_MASK)))
3209 return;
3210#endif
3211
3212 preempt_latency_stop(val);
3213 __preempt_count_sub(val);
3214}
3215EXPORT_SYMBOL(preempt_count_sub);
3216NOKPROBE_SYMBOL(preempt_count_sub);
3217
3218#else
3219static inline void preempt_latency_start(int val) { }
3220static inline void preempt_latency_stop(int val) { }
3221#endif
3222
3223static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
3224{
3225#ifdef CONFIG_DEBUG_PREEMPT
3226 return p->preempt_disable_ip;
3227#else
3228 return 0;
3229#endif
3230}
3231
3232
3233
3234
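/*
 * Print diagnostics for a "scheduling while atomic" bug: the offending
 * task and its preempt count, held locks, loaded modules and (with
 * CONFIG_DEBUG_PREEMPT) the place where preemption was disabled.
 */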
3235static noinline void __schedule_bug(struct task_struct *prev)
3236{
3237
3238 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3239
3240 if (oops_in_progress)
3241 return;
3242
3243 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3244 prev->comm, prev->pid, preempt_count());
3245
3246 debug_show_held_locks(prev);
3247 print_modules();
3248 if (irqs_disabled())
3249 print_irqtrace_events(prev);
3250 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3251 && in_atomic_preempt_off()) {
3252 pr_err("Preemption disabled at:");
3253 print_ip_sym(preempt_disable_ip);
3254 pr_cont("\n");
3255 }
3256 if (panic_on_warn)
3257 panic("scheduling while atomic\n");
3258
3259 dump_stack();
3260 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3261}
3262
3263
3264
3265
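/*
 * Various sanity checks and bookkeeping on every schedule: detect stack
 * end corruption, scheduling in atomic context and illegal RCU usage, and
 * update profiling/schedstat counters.
 */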
3266static inline void schedule_debug(struct task_struct *prev)
3267{
3268#ifdef CONFIG_SCHED_STACK_END_CHECK
3269 if (task_stack_end_corrupted(prev))
3270 panic("corrupted stack end detected inside scheduler\n");
3271#endif
3272
3273 if (unlikely(in_atomic_preempt_off())) {
3274 __schedule_bug(prev);
3275 preempt_count_set(PREEMPT_DISABLED);
3276 }
3277 rcu_sleep_check();
3278
3279 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3280
3281 schedstat_inc(this_rq()->sched_count);
3282}
3283
3284
3285
3286
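/*
 * Pick up the highest-priority runnable task.  Fast path: when all
 * runnable tasks are in the fair class (and prev was fair or idle), ask
 * CFS directly and fall back to the idle task; otherwise walk the
 * scheduling classes in priority order.
 */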
3287static inline struct task_struct *
3288pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
3289{
3290 const struct sched_class *class;
3291 struct task_struct *p;
3292
3293
3294
3295
3296
3297
3298
3299 if (likely((prev->sched_class == &idle_sched_class ||
3300 prev->sched_class == &fair_sched_class) &&
3301 rq->nr_running == rq->cfs.h_nr_running)) {
3302
3303 p = fair_sched_class.pick_next_task(rq, prev, rf);
3304 if (unlikely(p == RETRY_TASK))
3305 goto again;
3306
3307
3308 if (unlikely(!p))
3309 p = idle_sched_class.pick_next_task(rq, prev, rf);
3310
3311 return p;
3312 }
3313
3314again:
3315 for_each_class(class) {
3316 p = class->pick_next_task(rq, prev, rf);
3317 if (p) {
3318 if (unlikely(p == RETRY_TASK))
3319 goto again;
3320 return p;
3321 }
3322 }
3323
3324
3325 BUG();
3326}
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
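/*
 * __schedule() is the main scheduler function.
 *
 * The 'preempt' argument distinguishes involuntary preemption (the task
 * stays runnable) from voluntary schedule() calls, where a task that set
 * a non-running ->state is deactivated unless a signal is pending.  The
 * next task is picked, the context is switched if it differs from prev,
 * and any queued balance callbacks are run before returning.
 */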
3367static void __sched notrace __schedule(bool preempt)
3368{
3369 struct task_struct *prev, *next;
3370 unsigned long *switch_count;
3371 struct rq_flags rf;
3372 struct rq *rq;
3373 int cpu;
3374
3375 cpu = smp_processor_id();
3376 rq = cpu_rq(cpu);
3377 prev = rq->curr;
3378
3379 schedule_debug(prev);
3380
3381 if (sched_feat(HRTICK))
3382 hrtick_clear(rq);
3383
3384 local_irq_disable();
3385 rcu_note_context_switch(preempt);
3386
3387
3388
3389
3390
3391
3392 smp_mb__before_spinlock();
3393 rq_lock(rq, &rf);
3394
3395
3396 rq->clock_update_flags <<= 1;
3397 update_rq_clock(rq);
3398
3399 switch_count = &prev->nivcsw;
3400 if (!preempt && prev->state) {
3401 if (unlikely(signal_pending_state(prev->state, prev))) {
3402 prev->state = TASK_RUNNING;
3403 } else {
3404 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3405 prev->on_rq = 0;
3406
3407 if (prev->in_iowait) {
3408 atomic_inc(&rq->nr_iowait);
3409 delayacct_blkio_start();
3410 }
3411
3412
3413
3414
3415
3416
3417 if (prev->flags & PF_WQ_WORKER) {
3418 struct task_struct *to_wakeup;
3419
3420 to_wakeup = wq_worker_sleeping(prev);
3421 if (to_wakeup)
3422 try_to_wake_up_local(to_wakeup, &rf);
3423 }
3424 }
3425 switch_count = &prev->nvcsw;
3426 }
3427
3428 next = pick_next_task(rq, prev, &rf);
3429 clear_tsk_need_resched(prev);
3430 clear_preempt_need_resched();
3431
3432 if (likely(prev != next)) {
3433 rq->nr_switches++;
3434 rq->curr = next;
3435 ++*switch_count;
3436
3437 trace_sched_switch(preempt, prev, next);
3438
3439
3440 rq = context_switch(rq, prev, next, &rf);
3441 } else {
3442 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3443 rq_unlock_irq(rq, &rf);
3444 }
3445
3446 balance_callback(rq);
3447}
3448
3449void __noreturn do_task_dead(void)
3450{
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463 smp_mb();
3464 raw_spin_unlock_wait(&current->pi_lock);
3465
3466
3467 __set_current_state(TASK_DEAD);
3468
3469
3470 current->flags |= PF_NOFREEZE;
3471
3472 __schedule(false);
3473 BUG();
3474
3475
3476 for (;;)
3477 cpu_relax();
3478}
3479
3480static inline void sched_submit_work(struct task_struct *tsk)
3481{
3482 if (!tsk->state || tsk_is_pi_blocked(tsk))
3483 return;
3484
3485
3486
3487
3488 if (blk_needs_flush_plug(tsk))
3489 blk_schedule_flush_plug(tsk);
3490}
3491
3492asmlinkage __visible void __sched schedule(void)
3493{
3494 struct task_struct *tsk = current;
3495
3496 sched_submit_work(tsk);
3497 do {
3498 preempt_disable();
3499 __schedule(false);
3500 sched_preempt_enable_no_resched();
3501 } while (need_resched());
3502}
3503EXPORT_SYMBOL(schedule);
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
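/*
 * Lightweight variant of schedule() used by the idle loop: it skips
 * sched_submit_work() and the preempt_disable()/enable dance, and assumes
 * ->state is TASK_RUNNING (hence the WARN_ON_ONCE below).
 */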
3515void __sched schedule_idle(void)
3516{
3517
3518
3519
3520
3521
3522
3523
3524 WARN_ON_ONCE(current->state);
3525 do {
3526 __schedule(false);
3527 } while (need_resched());
3528}
3529
3530#ifdef CONFIG_CONTEXT_TRACKING
3531asmlinkage __visible void __sched schedule_user(void)
3532{
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543 enum ctx_state prev_state = exception_enter();
3544 schedule();
3545 exception_exit(prev_state);
3546}
3547#endif
3548
3549
3550
3551
3552
3553
3554void __sched schedule_preempt_disabled(void)
3555{
3556 sched_preempt_enable_no_resched();
3557 schedule();
3558 preempt_disable();
3559}
3560
3561static void __sched notrace preempt_schedule_common(void)
3562{
3563 do {
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577 preempt_disable_notrace();
3578 preempt_latency_start(1);
3579 __schedule(true);
3580 preempt_latency_stop(1);
3581 preempt_enable_no_resched_notrace();
3582
3583
3584
3585
3586
3587 } while (need_resched());
3588}
3589
3590#ifdef CONFIG_PREEMPT
3591
3592
3593
3594
3595
3596asmlinkage __visible void __sched notrace preempt_schedule(void)
3597{
3598
3599
3600
3601
3602 if (likely(!preemptible()))
3603 return;
3604
3605 preempt_schedule_common();
3606}
3607NOKPROBE_SYMBOL(preempt_schedule);
3608EXPORT_SYMBOL(preempt_schedule);
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3625{
3626 enum ctx_state prev_ctx;
3627
3628 if (likely(!preemptible()))
3629 return;
3630
3631 do {
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645 preempt_disable_notrace();
3646 preempt_latency_start(1);
3647
3648
3649
3650
3651
3652 prev_ctx = exception_enter();
3653 __schedule(true);
3654 exception_exit(prev_ctx);
3655
3656 preempt_latency_stop(1);
3657 preempt_enable_no_resched_notrace();
3658 } while (need_resched());
3659}
3660EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3661
3662#endif
3663
3664
3665
3666
3667
3668
3669
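/*
 * This is the entry point to schedule() off the interrupt-return path
 * when NEED_RESCHED is set and kernel preemption is enabled.  It is
 * called, and returns, with interrupts disabled; they are re-enabled
 * around each __schedule() invocation.
 */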
3670asmlinkage __visible void __sched preempt_schedule_irq(void)
3671{
3672 enum ctx_state prev_state;
3673
3674
3675 BUG_ON(preempt_count() || !irqs_disabled());
3676
3677 prev_state = exception_enter();
3678
3679 do {
3680 preempt_disable();
3681 local_irq_enable();
3682 __schedule(true);
3683 local_irq_disable();
3684 sched_preempt_enable_no_resched();
3685 } while (need_resched());
3686
3687 exception_exit(prev_state);
3688}
3689
3690int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3691 void *key)
3692{
3693 return try_to_wake_up(curr->private, mode, wake_flags);
3694}
3695EXPORT_SYMBOL(default_wake_function);
3696
3697#ifdef CONFIG_RT_MUTEXES
3698
3699static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
3700{
3701 if (pi_task)
3702 prio = min(prio, pi_task->prio);
3703
3704 return prio;
3705}
3706
3707static inline int rt_effective_prio(struct task_struct *p, int prio)
3708{
3709 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3710
3711 return __rt_effective_prio(pi_task, prio);
3712}
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
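/*
 * rt_mutex_setprio - set the effective priority of a task for PI boosting
 * @p: the task to (de)boost
 * @pi_task: the donor task (top priority-inheritance waiter), or NULL
 *
 * Recomputes @p's effective ->prio from @pi_task and @p's normal priority
 * and switches scheduling classes (deadline/rt/fair) accordingly.  Unlike
 * __setscheduler() it does not touch ->normal_prio.
 */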
3725void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
3726{
3727 int prio, oldprio, queued, running, queue_flag =
3728 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
3729 const struct sched_class *prev_class;
3730 struct rq_flags rf;
3731 struct rq *rq;
3732
3733
3734 prio = __rt_effective_prio(pi_task, p->normal_prio);
3735
3736
3737
3738
3739 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
3740 return;
3741
3742 rq = __task_rq_lock(p, &rf);
3743 update_rq_clock(rq);
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754 p->pi_top_task = pi_task;
3755
3756
3757
3758
3759 if (prio == p->prio && !dl_prio(prio))
3760 goto out_unlock;
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774 if (unlikely(p == rq->idle)) {
3775 WARN_ON(p != rq->curr);
3776 WARN_ON(p->pi_blocked_on);
3777 goto out_unlock;
3778 }
3779
3780 trace_sched_pi_setprio(p, pi_task);
3781 oldprio = p->prio;
3782
3783 if (oldprio == prio)
3784 queue_flag &= ~DEQUEUE_MOVE;
3785
3786 prev_class = p->sched_class;
3787 queued = task_on_rq_queued(p);
3788 running = task_current(rq, p);
3789 if (queued)
3790 dequeue_task(rq, p, queue_flag);
3791 if (running)
3792 put_prev_task(rq, p);
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803 if (dl_prio(prio)) {
3804 if (!dl_prio(p->normal_prio) ||
3805 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3806 p->dl.dl_boosted = 1;
3807 queue_flag |= ENQUEUE_REPLENISH;
3808 } else
3809 p->dl.dl_boosted = 0;
3810 p->sched_class = &dl_sched_class;
3811 } else if (rt_prio(prio)) {
3812 if (dl_prio(oldprio))
3813 p->dl.dl_boosted = 0;
3814 if (oldprio < prio)
3815 queue_flag |= ENQUEUE_HEAD;
3816 p->sched_class = &rt_sched_class;
3817 } else {
3818 if (dl_prio(oldprio))
3819 p->dl.dl_boosted = 0;
3820 if (rt_prio(oldprio))
3821 p->rt.timeout = 0;
3822 p->sched_class = &fair_sched_class;
3823 }
3824
3825 p->prio = prio;
3826
3827 if (queued)
3828 enqueue_task(rq, p, queue_flag);
3829 if (running)
3830 set_curr_task(rq, p);
3831
3832 check_class_changed(rq, p, prev_class, oldprio);
3833out_unlock:
3834
3835 preempt_disable();
3836 __task_rq_unlock(rq, &rf);
3837
3838 balance_callback(rq);
3839 preempt_enable();
3840}
3841#else
3842static inline int rt_effective_prio(struct task_struct *p, int prio)
3843{
3844 return prio;
3845}
3846#endif
3847
3848void set_user_nice(struct task_struct *p, long nice)
3849{
3850 bool queued, running;
3851 int old_prio, delta;
3852 struct rq_flags rf;
3853 struct rq *rq;
3854
3855 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3856 return;
3857
3858
3859
3860
3861 rq = task_rq_lock(p, &rf);
3862 update_rq_clock(rq);
3863
3864
3865
3866
3867
3868
3869
3870 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3871 p->static_prio = NICE_TO_PRIO(nice);
3872 goto out_unlock;
3873 }
3874 queued = task_on_rq_queued(p);
3875 running = task_current(rq, p);
3876 if (queued)
3877 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
3878 if (running)
3879 put_prev_task(rq, p);
3880
3881 p->static_prio = NICE_TO_PRIO(nice);
3882 set_load_weight(p);
3883 old_prio = p->prio;
3884 p->prio = effective_prio(p);
3885 delta = p->prio - old_prio;
3886
3887 if (queued) {
3888 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
3889
3890
3891
3892
3893 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3894 resched_curr(rq);
3895 }
3896 if (running)
3897 set_curr_task(rq, p);
3898out_unlock:
3899 task_rq_unlock(rq, p, &rf);
3900}
3901EXPORT_SYMBOL(set_user_nice);
3902
3903
3904
3905
3906
3907
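/*
 * can_nice - check whether @p may change its nice value to @nice:
 * allowed when @nice is within @p's RLIMIT_NICE or the caller has
 * CAP_SYS_NICE.
 */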
3908int can_nice(const struct task_struct *p, const int nice)
3909{
3910
3911 int nice_rlim = nice_to_rlimit(nice);
3912
3913 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3914 capable(CAP_SYS_NICE));
3915}
3916
3917#ifdef __ARCH_WANT_SYS_NICE
3918
3919
3920
3921
3922
3923
3924
3925
3926SYSCALL_DEFINE1(nice, int, increment)
3927{
3928 long nice, retval;
3929
3930
3931
3932
3933
3934
3935 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3936 nice = task_nice(current) + increment;
3937
3938 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3939 if (increment < 0 && !can_nice(current, nice))
3940 return -EPERM;
3941
3942 retval = security_task_setnice(current, nice);
3943 if (retval)
3944 return retval;
3945
3946 set_user_nice(current, nice);
3947 return 0;
3948}
3949
3950#endif
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960int task_prio(const struct task_struct *p)
3961{
3962 return p->prio - MAX_RT_PRIO;
3963}
3964
3965
3966
3967
3968
3969
3970
3971int idle_cpu(int cpu)
3972{
3973 struct rq *rq = cpu_rq(cpu);
3974
3975 if (rq->curr != rq->idle)
3976 return 0;
3977
3978 if (rq->nr_running)
3979 return 0;
3980
3981#ifdef CONFIG_SMP
3982 if (!llist_empty(&rq->wake_list))
3983 return 0;
3984#endif
3985
3986 return 1;
3987}
3988
3989
3990
3991
3992
3993
3994
3995struct task_struct *idle_task(int cpu)
3996{
3997 return cpu_rq(cpu)->idle;
3998}
3999
4000
4001
4002
4003
4004
4005
4006static struct task_struct *find_process_by_pid(pid_t pid)
4007{
4008 return pid ? find_task_by_vpid(pid) : current;
4009}
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019static void
4020__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
4021{
4022 struct sched_dl_entity *dl_se = &p->dl;
4023
4024 dl_se->dl_runtime = attr->sched_runtime;
4025 dl_se->dl_deadline = attr->sched_deadline;
4026 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
4027 dl_se->flags = attr->sched_flags;
4028 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049}
4050
4051
4052
4053
4054
4055#define SETPARAM_POLICY -1
4056
4057static void __setscheduler_params(struct task_struct *p,
4058 const struct sched_attr *attr)
4059{
4060 int policy = attr->sched_policy;
4061
4062 if (policy == SETPARAM_POLICY)
4063 policy = p->policy;
4064
4065 p->policy = policy;
4066
4067 if (dl_policy(policy))
4068 __setparam_dl(p, attr);
4069 else if (fair_policy(policy))
4070 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4071
4072
4073
4074
4075
4076
4077 p->rt_priority = attr->sched_priority;
4078 p->normal_prio = normal_prio(p);
4079 set_load_weight(p);
4080}
4081
4082
4083static void __setscheduler(struct rq *rq, struct task_struct *p,
4084 const struct sched_attr *attr, bool keep_boost)
4085{
4086 __setscheduler_params(p, attr);
4087
4088
4089
4090
4091
4092 p->prio = normal_prio(p);
4093 if (keep_boost)
4094 p->prio = rt_effective_prio(p, p->prio);
4095
4096 if (dl_prio(p->prio))
4097 p->sched_class = &dl_sched_class;
4098 else if (rt_prio(p->prio))
4099 p->sched_class = &rt_sched_class;
4100 else
4101 p->sched_class = &fair_sched_class;
4102}
4103
4104static void
4105__getparam_dl(struct task_struct *p, struct sched_attr *attr)
4106{
4107 struct sched_dl_entity *dl_se = &p->dl;
4108
4109 attr->sched_priority = p->rt_priority;
4110 attr->sched_runtime = dl_se->dl_runtime;
4111 attr->sched_deadline = dl_se->dl_deadline;
4112 attr->sched_period = dl_se->dl_period;
4113 attr->sched_flags = dl_se->flags;
4114}
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
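/*
 * Validate user-supplied SCHED_DEADLINE parameters:
 *  - the deadline must be non-zero,
 *  - the runtime must be at least 2^DL_SCALE ns,
 *  - deadline and period must fit in a signed 64-bit value,
 *  - runtime <= deadline <= period (a zero period defaults to the
 *    deadline, see __setparam_dl()).
 */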
4126static bool
4127__checkparam_dl(const struct sched_attr *attr)
4128{
4129
4130 if (attr->sched_deadline == 0)
4131 return false;
4132
4133
4134
4135
4136
4137 if (attr->sched_runtime < (1ULL << DL_SCALE))
4138 return false;
4139
4140
4141
4142
4143
4144 if (attr->sched_deadline & (1ULL << 63) ||
4145 attr->sched_period & (1ULL << 63))
4146 return false;
4147
4148
4149 if ((attr->sched_period != 0 &&
4150 attr->sched_period < attr->sched_deadline) ||
4151 attr->sched_deadline < attr->sched_runtime)
4152 return false;
4153
4154 return true;
4155}
4156
4157
4158
4159
4160static bool check_same_owner(struct task_struct *p)
4161{
4162 const struct cred *cred = current_cred(), *pcred;
4163 bool match;
4164
4165 rcu_read_lock();
4166 pcred = __task_cred(p);
4167 match = (uid_eq(cred->euid, pcred->euid) ||
4168 uid_eq(cred->euid, pcred->uid));
4169 rcu_read_unlock();
4170 return match;
4171}
4172
4173static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
4174{
4175 struct sched_dl_entity *dl_se = &p->dl;
4176
4177 if (dl_se->dl_runtime != attr->sched_runtime ||
4178 dl_se->dl_deadline != attr->sched_deadline ||
4179 dl_se->dl_period != attr->sched_period ||
4180 dl_se->flags != attr->sched_flags)
4181 return true;
4182
4183 return false;
4184}
4185
4186static int __sched_setscheduler(struct task_struct *p,
4187 const struct sched_attr *attr,
4188 bool user, bool pi)
4189{
4190 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4191 MAX_RT_PRIO - 1 - attr->sched_priority;
4192 int retval, oldprio, oldpolicy = -1, queued, running;
4193 int new_effective_prio, policy = attr->sched_policy;
4194 const struct sched_class *prev_class;
4195 struct rq_flags rf;
4196 int reset_on_fork;
4197 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4198 struct rq *rq;
4199
4200
4201 BUG_ON(in_interrupt());
4202recheck:
4203
4204 if (policy < 0) {
4205 reset_on_fork = p->sched_reset_on_fork;
4206 policy = oldpolicy = p->policy;
4207 } else {
4208 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4209
4210 if (!valid_policy(policy))
4211 return -EINVAL;
4212 }
4213
4214 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
4215 return -EINVAL;
4216
4217
4218
4219
4220
4221
4222 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4223 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4224 return -EINVAL;
4225 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4226 (rt_policy(policy) != (attr->sched_priority != 0)))
4227 return -EINVAL;
4228
4229
4230
4231
4232 if (user && !capable(CAP_SYS_NICE)) {
4233 if (fair_policy(policy)) {
4234 if (attr->sched_nice < task_nice(p) &&
4235 !can_nice(p, attr->sched_nice))
4236 return -EPERM;
4237 }
4238
4239 if (rt_policy(policy)) {
4240 unsigned long rlim_rtprio =
4241 task_rlimit(p, RLIMIT_RTPRIO);
4242
4243
4244 if (policy != p->policy && !rlim_rtprio)
4245 return -EPERM;
4246
4247
4248 if (attr->sched_priority > p->rt_priority &&
4249 attr->sched_priority > rlim_rtprio)
4250 return -EPERM;
4251 }
4252
4253
4254
4255
4256
4257
4258
4259 if (dl_policy(policy))
4260 return -EPERM;
4261
4262
4263
4264
4265
4266 if (idle_policy(p->policy) && !idle_policy(policy)) {
4267 if (!can_nice(p, task_nice(p)))
4268 return -EPERM;
4269 }
4270
4271
4272 if (!check_same_owner(p))
4273 return -EPERM;
4274
4275
4276 if (p->sched_reset_on_fork && !reset_on_fork)
4277 return -EPERM;
4278 }
4279
4280 if (user) {
4281 retval = security_task_setscheduler(p);
4282 if (retval)
4283 return retval;
4284 }
4285
4286
4287
4288
4289
4290
4291
4292
4293 rq = task_rq_lock(p, &rf);
4294 update_rq_clock(rq);
4295
4296
4297
4298
4299 if (p == rq->stop) {
4300 task_rq_unlock(rq, p, &rf);
4301 return -EINVAL;
4302 }
4303
4304
4305
4306
4307
4308 if (unlikely(policy == p->policy)) {
4309 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4310 goto change;
4311 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4312 goto change;
4313 if (dl_policy(policy) && dl_param_changed(p, attr))
4314 goto change;
4315
4316 p->sched_reset_on_fork = reset_on_fork;
4317 task_rq_unlock(rq, p, &rf);
4318 return 0;
4319 }
4320change:
4321
4322 if (user) {
4323#ifdef CONFIG_RT_GROUP_SCHED
4324
4325
4326
4327
4328 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4329 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4330 !task_group_is_autogroup(task_group(p))) {
4331 task_rq_unlock(rq, p, &rf);
4332 return -EPERM;
4333 }
4334#endif
4335#ifdef CONFIG_SMP
4336 if (dl_bandwidth_enabled() && dl_policy(policy)) {
4337 cpumask_t *span = rq->rd->span;
4338
4339
4340
4341
4342
4343
4344 if (!cpumask_subset(span, &p->cpus_allowed) ||
4345 rq->rd->dl_bw.bw == 0) {
4346 task_rq_unlock(rq, p, &rf);
4347 return -EPERM;
4348 }
4349 }
4350#endif
4351 }
4352
4353
4354 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4355 policy = oldpolicy = -1;
4356 task_rq_unlock(rq, p, &rf);
4357 goto recheck;
4358 }
4359
4360
4361
4362
4363
4364
4365 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
4366 task_rq_unlock(rq, p, &rf);
4367 return -EBUSY;
4368 }
4369
4370 p->sched_reset_on_fork = reset_on_fork;
4371 oldprio = p->prio;
4372
4373 if (pi) {
4374
4375
4376
4377
4378
4379
4380
4381 new_effective_prio = rt_effective_prio(p, newprio);
4382 if (new_effective_prio == oldprio)
4383 queue_flags &= ~DEQUEUE_MOVE;
4384 }
4385
4386 queued = task_on_rq_queued(p);
4387 running = task_current(rq, p);
4388 if (queued)
4389 dequeue_task(rq, p, queue_flags);
4390 if (running)
4391 put_prev_task(rq, p);
4392
4393 prev_class = p->sched_class;
4394 __setscheduler(rq, p, attr, pi);
4395
4396 if (queued) {
4397
4398
4399
4400
4401 if (oldprio < p->prio)
4402 queue_flags |= ENQUEUE_HEAD;
4403
4404 enqueue_task(rq, p, queue_flags);
4405 }
4406 if (running)
4407 set_curr_task(rq, p);
4408
4409 check_class_changed(rq, p, prev_class, oldprio);
4410
4411
4412 preempt_disable();
4413 task_rq_unlock(rq, p, &rf);
4414
4415 if (pi)
4416 rt_mutex_adjust_pi(p);
4417
4418
4419 balance_callback(rq);
4420 preempt_enable();
4421
4422 return 0;
4423}
4424
4425static int _sched_setscheduler(struct task_struct *p, int policy,
4426 const struct sched_param *param, bool check)
4427{
4428 struct sched_attr attr = {
4429 .sched_policy = policy,
4430 .sched_priority = param->sched_priority,
4431 .sched_nice = PRIO_TO_NICE(p->static_prio),
4432 };
4433
4434
4435 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4436 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4437 policy &= ~SCHED_RESET_ON_FORK;
4438 attr.sched_policy = policy;
4439 }
4440
4441 return __sched_setscheduler(p, &attr, check, true);
4442}
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453int sched_setscheduler(struct task_struct *p, int policy,
4454 const struct sched_param *param)
4455{
4456 return _sched_setscheduler(p, policy, param, true);
4457}
4458EXPORT_SYMBOL_GPL(sched_setscheduler);
4459
4460int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4461{
4462 return __sched_setscheduler(p, attr, true, true);
4463}
4464EXPORT_SYMBOL_GPL(sched_setattr);
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4480 const struct sched_param *param)
4481{
4482 return _sched_setscheduler(p, policy, param, false);
4483}
4484EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4485
4486static int
4487do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4488{
4489 struct sched_param lparam;
4490 struct task_struct *p;
4491 int retval;
4492
4493 if (!param || pid < 0)
4494 return -EINVAL;
4495 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4496 return -EFAULT;
4497
4498 rcu_read_lock();
4499 retval = -ESRCH;
4500 p = find_process_by_pid(pid);
4501 if (p != NULL)
4502 retval = sched_setscheduler(p, policy, &lparam);
4503 rcu_read_unlock();
4504
4505 return retval;
4506}
4507
4508
4509
4510
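/*
 * Copy a struct sched_attr from user space, coping with ABI size
 * differences: a zero size is treated as the original VER0 layout, a
 * larger user structure is accepted only if the extra trailing bytes are
 * zero (otherwise -E2BIG), and the nice value is clamped to the valid
 * range.
 */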
4511static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
4512{
4513 u32 size;
4514 int ret;
4515
4516 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4517 return -EFAULT;
4518
4519
4520 memset(attr, 0, sizeof(*attr));
4521
4522 ret = get_user(size, &uattr->size);
4523 if (ret)
4524 return ret;
4525
4526
4527 if (size > PAGE_SIZE)
4528 goto err_size;
4529
4530
4531 if (!size)
4532 size = SCHED_ATTR_SIZE_VER0;
4533
4534 if (size < SCHED_ATTR_SIZE_VER0)
4535 goto err_size;
4536
4537
4538
4539
4540
4541
4542
4543 if (size > sizeof(*attr)) {
4544 unsigned char __user *addr;
4545 unsigned char __user *end;
4546 unsigned char val;
4547
4548 addr = (void __user *)uattr + sizeof(*attr);
4549 end = (void __user *)uattr + size;
4550
4551 for (; addr < end; addr++) {
4552 ret = get_user(val, addr);
4553 if (ret)
4554 return ret;
4555 if (val)
4556 goto err_size;
4557 }
4558 size = sizeof(*attr);
4559 }
4560
4561 ret = copy_from_user(attr, uattr, size);
4562 if (ret)
4563 return -EFAULT;
4564
4565
4566
4567
4568
4569 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4570
4571 return 0;
4572
4573err_size:
4574 put_user(sizeof(*attr), &uattr->size);
4575 return -E2BIG;
4576}
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
4587{
4588 if (policy < 0)
4589 return -EINVAL;
4590
4591 return do_sched_setscheduler(pid, policy, param);
4592}
4593
4594
4595
4596
4597
4598
4599
4600
4601SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4602{
4603 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4604}
4605
4606
4607
4608
4609
4610
4611
4612SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4613 unsigned int, flags)
4614{
4615 struct sched_attr attr;
4616 struct task_struct *p;
4617 int retval;
4618
4619 if (!uattr || pid < 0 || flags)
4620 return -EINVAL;
4621
4622 retval = sched_copy_attr(uattr, &attr);
4623 if (retval)
4624 return retval;
4625
4626 if ((int)attr.sched_policy < 0)
4627 return -EINVAL;
4628
4629 rcu_read_lock();
4630 retval = -ESRCH;
4631 p = find_process_by_pid(pid);
4632 if (p != NULL)
4633 retval = sched_setattr(p, &attr);
4634 rcu_read_unlock();
4635
4636 return retval;
4637}
4638
4639
4640
4641
4642
4643
4644
4645
4646SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4647{
4648 struct task_struct *p;
4649 int retval;
4650
4651 if (pid < 0)
4652 return -EINVAL;
4653
4654 retval = -ESRCH;
4655 rcu_read_lock();
4656 p = find_process_by_pid(pid);
4657 if (p) {
4658 retval = security_task_getscheduler(p);
4659 if (!retval)
4660 retval = p->policy
4661 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4662 }
4663 rcu_read_unlock();
4664 return retval;
4665}
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4676{
4677 struct sched_param lp = { .sched_priority = 0 };
4678 struct task_struct *p;
4679 int retval;
4680
4681 if (!param || pid < 0)
4682 return -EINVAL;
4683
4684 rcu_read_lock();
4685 p = find_process_by_pid(pid);
4686 retval = -ESRCH;
4687 if (!p)
4688 goto out_unlock;
4689
4690 retval = security_task_getscheduler(p);
4691 if (retval)
4692 goto out_unlock;
4693
4694 if (task_has_rt_policy(p))
4695 lp.sched_priority = p->rt_priority;
4696 rcu_read_unlock();
4697
4698
4699
4700
4701 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4702
4703 return retval;
4704
4705out_unlock:
4706 rcu_read_unlock();
4707 return retval;
4708}
4709
4710static int sched_read_attr(struct sched_attr __user *uattr,
4711 struct sched_attr *attr,
4712 unsigned int usize)
4713{
4714 int ret;
4715
4716 if (!access_ok(VERIFY_WRITE, uattr, usize))
4717 return -EFAULT;
4718
4719
4720
4721
4722
4723
4724 if (usize < sizeof(*attr)) {
4725 unsigned char *addr;
4726 unsigned char *end;
4727
4728 addr = (void *)attr + usize;
4729 end = (void *)attr + sizeof(*attr);
4730
4731 for (; addr < end; addr++) {
4732 if (*addr)
4733 return -EFBIG;
4734 }
4735
4736 attr->size = usize;
4737 }
4738
4739 ret = copy_to_user(uattr, attr, attr->size);
4740 if (ret)
4741 return -EFAULT;
4742
4743 return 0;
4744}
4745
4746
4747
4748
4749
4750
4751
4752
4753SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4754 unsigned int, size, unsigned int, flags)
4755{
4756 struct sched_attr attr = {
4757 .size = sizeof(struct sched_attr),
4758 };
4759 struct task_struct *p;
4760 int retval;
4761
4762 if (!uattr || pid < 0 || size > PAGE_SIZE ||
4763 size < SCHED_ATTR_SIZE_VER0 || flags)
4764 return -EINVAL;
4765
4766 rcu_read_lock();
4767 p = find_process_by_pid(pid);
4768 retval = -ESRCH;
4769 if (!p)
4770 goto out_unlock;
4771
4772 retval = security_task_getscheduler(p);
4773 if (retval)
4774 goto out_unlock;
4775
4776 attr.sched_policy = p->policy;
4777 if (p->sched_reset_on_fork)
4778 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4779 if (task_has_dl_policy(p))
4780 __getparam_dl(p, &attr);
4781 else if (task_has_rt_policy(p))
4782 attr.sched_priority = p->rt_priority;
4783 else
4784 attr.sched_nice = task_nice(p);
4785
4786 rcu_read_unlock();
4787
4788 retval = sched_read_attr(uattr, &attr, size);
4789 return retval;
4790
4791out_unlock:
4792 rcu_read_unlock();
4793 return retval;
4794}
4795
4796long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4797{
4798 cpumask_var_t cpus_allowed, new_mask;
4799 struct task_struct *p;
4800 int retval;
4801
4802 rcu_read_lock();
4803
4804 p = find_process_by_pid(pid);
4805 if (!p) {
4806 rcu_read_unlock();
4807 return -ESRCH;
4808 }
4809
4810
4811 get_task_struct(p);
4812 rcu_read_unlock();
4813
4814 if (p->flags & PF_NO_SETAFFINITY) {
4815 retval = -EINVAL;
4816 goto out_put_task;
4817 }
4818 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4819 retval = -ENOMEM;
4820 goto out_put_task;
4821 }
4822 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4823 retval = -ENOMEM;
4824 goto out_free_cpus_allowed;
4825 }
4826 retval = -EPERM;
4827 if (!check_same_owner(p)) {
4828 rcu_read_lock();
4829 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4830 rcu_read_unlock();
4831 goto out_free_new_mask;
4832 }
4833 rcu_read_unlock();
4834 }
4835
4836 retval = security_task_setscheduler(p);
4837 if (retval)
4838 goto out_free_new_mask;
4839
4840
4841 cpuset_cpus_allowed(p, cpus_allowed);
4842 cpumask_and(new_mask, in_mask, cpus_allowed);
4843
4844
4845
4846
4847
4848
4849
4850#ifdef CONFIG_SMP
4851 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4852 rcu_read_lock();
4853 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4854 retval = -EBUSY;
4855 rcu_read_unlock();
4856 goto out_free_new_mask;
4857 }
4858 rcu_read_unlock();
4859 }
4860#endif
4861again:
4862 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4863
4864 if (!retval) {
4865 cpuset_cpus_allowed(p, cpus_allowed);
4866 if (!cpumask_subset(new_mask, cpus_allowed)) {
4867
4868
4869
4870
4871
4872 cpumask_copy(new_mask, cpus_allowed);
4873 goto again;
4874 }
4875 }
4876out_free_new_mask:
4877 free_cpumask_var(new_mask);
4878out_free_cpus_allowed:
4879 free_cpumask_var(cpus_allowed);
4880out_put_task:
4881 put_task_struct(p);
4882 return retval;
4883}
4884
4885static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4886 struct cpumask *new_mask)
4887{
4888 if (len < cpumask_size())
4889 cpumask_clear(new_mask);
4890 else if (len > cpumask_size())
4891 len = cpumask_size();
4892
4893 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4894}
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4905 unsigned long __user *, user_mask_ptr)
4906{
4907 cpumask_var_t new_mask;
4908 int retval;
4909
4910 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4911 return -ENOMEM;
4912
4913 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4914 if (retval == 0)
4915 retval = sched_setaffinity(pid, new_mask);
4916 free_cpumask_var(new_mask);
4917 return retval;
4918}
4919
4920long sched_getaffinity(pid_t pid, struct cpumask *mask)
4921{
4922 struct task_struct *p;
4923 unsigned long flags;
4924 int retval;
4925
4926 rcu_read_lock();
4927
4928 retval = -ESRCH;
4929 p = find_process_by_pid(pid);
4930 if (!p)
4931 goto out_unlock;
4932
4933 retval = security_task_getscheduler(p);
4934 if (retval)
4935 goto out_unlock;
4936
4937 raw_spin_lock_irqsave(&p->pi_lock, flags);
4938 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4939 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4940
4941out_unlock:
4942 rcu_read_unlock();
4943
4944 return retval;
4945}
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4957 unsigned long __user *, user_mask_ptr)
4958{
4959 int ret;
4960 cpumask_var_t mask;
4961
4962 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4963 return -EINVAL;
4964 if (len & (sizeof(unsigned long)-1))
4965 return -EINVAL;
4966
4967 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4968 return -ENOMEM;
4969
4970 ret = sched_getaffinity(pid, mask);
4971 if (ret == 0) {
4972 size_t retlen = min_t(size_t, len, cpumask_size());
4973
4974 if (copy_to_user(user_mask_ptr, mask, retlen))
4975 ret = -EFAULT;
4976 else
4977 ret = retlen;
4978 }
4979 free_cpumask_var(mask);
4980
4981 return ret;
4982}
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992SYSCALL_DEFINE0(sched_yield)
4993{
4994 struct rq_flags rf;
4995 struct rq *rq;
4996
4997 local_irq_disable();
4998 rq = this_rq();
4999 rq_lock(rq, &rf);
5000
5001 schedstat_inc(rq->yld_count);
5002 current->sched_class->yield_task(rq);
5003
5004
5005
5006
5007
5008 preempt_disable();
5009 rq_unlock(rq, &rf);
5010 sched_preempt_enable_no_resched();
5011
5012 schedule();
5013
5014 return 0;
5015}
5016
5017#ifndef CONFIG_PREEMPT
5018int __sched _cond_resched(void)
5019{
5020 if (should_resched(0)) {
5021 preempt_schedule_common();
5022 return 1;
5023 }
5024 return 0;
5025}
5026EXPORT_SYMBOL(_cond_resched);
5027#endif
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037int __cond_resched_lock(spinlock_t *lock)
5038{
5039 int resched = should_resched(PREEMPT_LOCK_OFFSET);
5040 int ret = 0;
5041
5042 lockdep_assert_held(lock);
5043
5044 if (spin_needbreak(lock) || resched) {
5045 spin_unlock(lock);
5046 if (resched)
5047 preempt_schedule_common();
5048 else
5049 cpu_relax();
5050 ret = 1;
5051 spin_lock(lock);
5052 }
5053 return ret;
5054}
5055EXPORT_SYMBOL(__cond_resched_lock);
5056
5057int __sched __cond_resched_softirq(void)
5058{
5059 BUG_ON(!in_softirq());
5060
5061 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
5062 local_bh_enable();
5063 preempt_schedule_common();
5064 local_bh_disable();
5065 return 1;
5066 }
5067 return 0;
5068}
5069EXPORT_SYMBOL(__cond_resched_softirq);
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093void __sched yield(void)
5094{
5095 set_current_state(TASK_RUNNING);
5096 sys_sched_yield();
5097}
5098EXPORT_SYMBOL(yield);
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
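/*
 * yield_to - yield the current CPU to another task
 * @p: the task to yield to
 * @preempt: whether @p should preempt the current task on its runqueue
 *
 * Return: >0 if the yield succeeded (schedule() has already been called),
 * 0 if it could not be performed, -ESRCH if both runqueues only have a
 * single runnable task so there is nothing to yield to.
 */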
5115int __sched yield_to(struct task_struct *p, bool preempt)
5116{
5117 struct task_struct *curr = current;
5118 struct rq *rq, *p_rq;
5119 unsigned long flags;
5120 int yielded = 0;
5121
5122 local_irq_save(flags);
5123 rq = this_rq();
5124
5125again:
5126 p_rq = task_rq(p);
5127
5128
5129
5130
5131 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5132 yielded = -ESRCH;
5133 goto out_irq;
5134 }
5135
5136 double_rq_lock(rq, p_rq);
5137 if (task_rq(p) != p_rq) {
5138 double_rq_unlock(rq, p_rq);
5139 goto again;
5140 }
5141
5142 if (!curr->sched_class->yield_to_task)
5143 goto out_unlock;
5144
5145 if (curr->sched_class != p->sched_class)
5146 goto out_unlock;
5147
5148 if (task_running(p_rq, p) || p->state)
5149 goto out_unlock;
5150
5151 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5152 if (yielded) {
5153 schedstat_inc(rq->yld_count);
5154
5155
5156
5157
5158 if (preempt && rq != p_rq)
5159 resched_curr(p_rq);
5160 }
5161
5162out_unlock:
5163 double_rq_unlock(rq, p_rq);
5164out_irq:
5165 local_irq_restore(flags);
5166
5167 if (yielded > 0)
5168 schedule();
5169
5170 return yielded;
5171}
5172EXPORT_SYMBOL_GPL(yield_to);
5173
5174int io_schedule_prepare(void)
5175{
5176 int old_iowait = current->in_iowait;
5177
5178 current->in_iowait = 1;
5179 blk_schedule_flush_plug(current);
5180
5181 return old_iowait;
5182}
5183
5184void io_schedule_finish(int token)
5185{
5186 current->in_iowait = token;
5187}
5188
5189
5190
5191
5192
5193long __sched io_schedule_timeout(long timeout)
5194{
5195 int token;
5196 long ret;
5197
5198 token = io_schedule_prepare();
5199 ret = schedule_timeout(timeout);
5200 io_schedule_finish(token);
5201
5202 return ret;
5203}
5204EXPORT_SYMBOL(io_schedule_timeout);
5205
5206void io_schedule(void)
5207{
5208 int token;
5209
5210 token = io_schedule_prepare();
5211 schedule();
5212 io_schedule_finish(token);
5213}
5214EXPORT_SYMBOL(io_schedule);
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5225{
5226 int ret = -EINVAL;
5227
5228 switch (policy) {
5229 case SCHED_FIFO:
5230 case SCHED_RR:
5231 ret = MAX_USER_RT_PRIO-1;
5232 break;
5233 case SCHED_DEADLINE:
5234 case SCHED_NORMAL:
5235 case SCHED_BATCH:
5236 case SCHED_IDLE:
5237 ret = 0;
5238 break;
5239 }
5240 return ret;
5241}
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5252{
5253 int ret = -EINVAL;
5254
5255 switch (policy) {
5256 case SCHED_FIFO:
5257 case SCHED_RR:
5258 ret = 1;
5259 break;
5260 case SCHED_DEADLINE:
5261 case SCHED_NORMAL:
5262 case SCHED_BATCH:
5263 case SCHED_IDLE:
5264 ret = 0;
5265 }
5266 return ret;
5267}
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5281 struct timespec __user *, interval)
5282{
5283 struct task_struct *p;
5284 unsigned int time_slice;
5285 struct rq_flags rf;
5286 struct timespec t;
5287 struct rq *rq;
5288 int retval;
5289
5290 if (pid < 0)
5291 return -EINVAL;
5292
5293 retval = -ESRCH;
5294 rcu_read_lock();
5295 p = find_process_by_pid(pid);
5296 if (!p)
5297 goto out_unlock;
5298
5299 retval = security_task_getscheduler(p);
5300 if (retval)
5301 goto out_unlock;
5302
5303 rq = task_rq_lock(p, &rf);
5304 time_slice = 0;
5305 if (p->sched_class->get_rr_interval)
5306 time_slice = p->sched_class->get_rr_interval(rq, p);
5307 task_rq_unlock(rq, p, &rf);
5308
5309 rcu_read_unlock();
5310 jiffies_to_timespec(time_slice, &t);
5311 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5312 return retval;
5313
5314out_unlock:
5315 rcu_read_unlock();
5316 return retval;
5317}
5318
5319static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5320
5321void sched_show_task(struct task_struct *p)
5322{
5323 unsigned long free = 0;
5324 int ppid;
5325 unsigned long state = p->state;
5326
5327
5328 BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
5329
5330 if (!try_get_task_stack(p))
5331 return;
5332 if (state)
5333 state = __ffs(state) + 1;
5334 printk(KERN_INFO "%-15.15s %c", p->comm,
5335 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5336 if (state == TASK_RUNNING)
5337 printk(KERN_CONT " running task ");
5338#ifdef CONFIG_DEBUG_STACK_USAGE
5339 free = stack_not_used(p);
5340#endif
5341 ppid = 0;
5342 rcu_read_lock();
5343 if (pid_alive(p))
5344 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5345 rcu_read_unlock();
5346 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5347 task_pid_nr(p), ppid,
5348 (unsigned long)task_thread_info(p)->flags);
5349
5350 print_worker_info(KERN_INFO, p);
5351 show_stack(p, NULL);
5352 put_task_stack(p);
5353}
5354
5355void show_state_filter(unsigned long state_filter)
5356{
5357 struct task_struct *g, *p;
5358
5359#if BITS_PER_LONG == 32
5360 printk(KERN_INFO
5361 "  task                PC stack   pid father\n");
5362 #else
5363 printk(KERN_INFO
5364 "  task                        PC stack   pid father\n");
5365#endif
5366 rcu_read_lock();
5367 for_each_process_thread(g, p) {
5368
5369
5370
5371
5372
5373
5374
5375 touch_nmi_watchdog();
5376 touch_all_softlockup_watchdogs();
5377 if (!state_filter || (p->state & state_filter))
5378 sched_show_task(p);
5379 }
5380
5381#ifdef CONFIG_SCHED_DEBUG
5382 if (!state_filter)
5383 sysrq_sched_debug_show();
5384#endif
5385 rcu_read_unlock();
5386
5387
5388
5389 if (!state_filter)
5390 debug_show_all_locks();
5391}
5392
5393void init_idle_bootup_task(struct task_struct *idle)
5394{
5395 idle->sched_class = &idle_sched_class;
5396}
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406void init_idle(struct task_struct *idle, int cpu)
5407{
5408 struct rq *rq = cpu_rq(cpu);
5409 unsigned long flags;
5410
5411 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5412 raw_spin_lock(&rq->lock);
5413
5414 __sched_fork(0, idle);
5415 idle->state = TASK_RUNNING;
5416 idle->se.exec_start = sched_clock();
5417 idle->flags |= PF_IDLE;
5418
5419 kasan_unpoison_task_stack(idle);
5420
5421#ifdef CONFIG_SMP
5422
5423
5424
5425
5426
5427
5428 set_cpus_allowed_common(idle, cpumask_of(cpu));
5429#endif
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440 rcu_read_lock();
5441 __set_task_cpu(idle, cpu);
5442 rcu_read_unlock();
5443
5444 rq->curr = rq->idle = idle;
5445 idle->on_rq = TASK_ON_RQ_QUEUED;
5446#ifdef CONFIG_SMP
5447 idle->on_cpu = 1;
5448#endif
5449 raw_spin_unlock(&rq->lock);
5450 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5451
5452
5453 init_idle_preempt_count(idle, cpu);
5454
5455
5456
5457
5458 idle->sched_class = &idle_sched_class;
5459 ftrace_graph_init_idle_task(idle, cpu);
5460 vtime_init_idle(idle, cpu);
5461#ifdef CONFIG_SMP
5462 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5463#endif
5464}
5465
5466int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5467 const struct cpumask *trial)
5468{
5469 int ret = 1, trial_cpus;
5470 struct dl_bw *cur_dl_b;
5471 unsigned long flags;
5472
5473 if (!cpumask_weight(cur))
5474 return ret;
5475
5476 rcu_read_lock_sched();
5477 cur_dl_b = dl_bw_of(cpumask_any(cur));
5478 trial_cpus = cpumask_weight(trial);
5479
5480 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
5481 if (cur_dl_b->bw != -1 &&
5482 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
5483 ret = 0;
5484 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
5485 rcu_read_unlock_sched();
5486
5487 return ret;
5488}
5489
5490int task_can_attach(struct task_struct *p,
5491 const struct cpumask *cs_cpus_allowed)
5492{
5493 int ret = 0;
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504 if (p->flags & PF_NO_SETAFFINITY) {
5505 ret = -EINVAL;
5506 goto out;
5507 }
5508
5509#ifdef CONFIG_SMP
5510 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5511 cs_cpus_allowed)) {
5512 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
5513 cs_cpus_allowed);
5514 struct dl_bw *dl_b;
5515 bool overflow;
5516 int cpus;
5517 unsigned long flags;
5518
5519 rcu_read_lock_sched();
5520 dl_b = dl_bw_of(dest_cpu);
5521 raw_spin_lock_irqsave(&dl_b->lock, flags);
5522 cpus = dl_bw_cpus(dest_cpu);
5523 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
5524 if (overflow)
5525 ret = -EBUSY;
5526 else {
5527
5528
5529
5530
5531
5532
5533 __dl_add(dl_b, p->dl.dl_bw);
5534 }
5535 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5536 rcu_read_unlock_sched();
5537
5538 }
5539#endif
5540out:
5541 return ret;
5542}
5543
5544#ifdef CONFIG_SMP
5545
5546bool sched_smp_initialized __read_mostly;
5547
5548#ifdef CONFIG_NUMA_BALANCING
5549
5550int migrate_task_to(struct task_struct *p, int target_cpu)
5551{
5552 struct migration_arg arg = { p, target_cpu };
5553 int curr_cpu = task_cpu(p);
5554
5555 if (curr_cpu == target_cpu)
5556 return 0;
5557
5558 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
5559 return -EINVAL;
5560
5561
5562
5563 trace_sched_move_numa(p, curr_cpu, target_cpu);
5564 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5565}
5566
5567
5568
5569
5570
5571void sched_setnuma(struct task_struct *p, int nid)
5572{
5573 bool queued, running;
5574 struct rq_flags rf;
5575 struct rq *rq;
5576
5577 rq = task_rq_lock(p, &rf);
5578 queued = task_on_rq_queued(p);
5579 running = task_current(rq, p);
5580
5581 if (queued)
5582 dequeue_task(rq, p, DEQUEUE_SAVE);
5583 if (running)
5584 put_prev_task(rq, p);
5585
5586 p->numa_preferred_nid = nid;
5587
5588 if (queued)
5589 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5590 if (running)
5591 set_curr_task(rq, p);
5592 task_rq_unlock(rq, p, &rf);
5593}
5594#endif
5595
5596#ifdef CONFIG_HOTPLUG_CPU
5597
5598
5599
5600
5601void idle_task_exit(void)
5602{
5603 struct mm_struct *mm = current->active_mm;
5604
5605 BUG_ON(cpu_online(smp_processor_id()));
5606
5607 if (mm != &init_mm) {
5608 switch_mm(mm, &init_mm, current);
5609 finish_arch_post_lock_switch();
5610 }
5611 mmdrop(mm);
5612}
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
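/*
 * Fold the outgoing CPU's remaining contribution to the load average into
 * the global calc_load_tasks count when the CPU goes offline.
 */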
5623static void calc_load_migrate(struct rq *rq)
5624{
5625 long delta = calc_load_fold_active(rq, 1);
5626 if (delta)
5627 atomic_long_add(delta, &calc_load_tasks);
5628}
5629
5630static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5631{
5632}
5633
5634static const struct sched_class fake_sched_class = {
5635 .put_prev_task = put_prev_task_fake,
5636};
5637
5638static struct task_struct fake_task = {
5639
5640
5641
5642 .prio = MAX_PRIO + 1,
5643 .sched_class = &fake_sched_class,
5644};
5645
5646
5647
5648
5649
5650
5651
5652
5653
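/*
 * Migrate all queued tasks away from the runqueue of a CPU that is going
 * offline, picking a destination for each with select_fallback_rq().
 * Called with the dead runqueue's lock held; rq->stop is temporarily
 * cleared so pick_next_task() never returns the stop task, and sleeping
 * tasks are left to be placed by try_to_wake_up() when they wake.
 */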
5654static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5655{
5656 struct rq *rq = dead_rq;
5657 struct task_struct *next, *stop = rq->stop;
5658 struct rq_flags orf = *rf;
5659 int dest_cpu;
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670 rq->stop = NULL;
5671
5672
5673
5674
5675
5676
5677 update_rq_clock(rq);
5678
5679 for (;;) {
5680
5681
5682
5683
5684 if (rq->nr_running == 1)
5685 break;
5686
5687
5688
5689
5690 next = pick_next_task(rq, &fake_task, rf);
5691 BUG_ON(!next);
5692 next->sched_class->put_prev_task(rq, next);
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703 rq_unlock(rq, rf);
5704 raw_spin_lock(&next->pi_lock);
5705 rq_relock(rq, rf);
5706
5707
5708
5709
5710
5711
5712 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5713 raw_spin_unlock(&next->pi_lock);
5714 continue;
5715 }
5716
5717
5718 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5719 rq = __migrate_task(rq, rf, next, dest_cpu);
5720 if (rq != dead_rq) {
5721 rq_unlock(rq, rf);
5722 rq = dead_rq;
5723 *rf = orf;
5724 rq_relock(rq, rf);
5725 }
5726 raw_spin_unlock(&next->pi_lock);
5727 }
5728
5729 rq->stop = stop;
5730}
5731#endif
5732
5733void set_rq_online(struct rq *rq)
5734{
5735 if (!rq->online) {
5736 const struct sched_class *class;
5737
5738 cpumask_set_cpu(rq->cpu, rq->rd->online);
5739 rq->online = 1;
5740
5741 for_each_class(class) {
5742 if (class->rq_online)
5743 class->rq_online(rq);
5744 }
5745 }
5746}
5747
5748void set_rq_offline(struct rq *rq)
5749{
5750 if (rq->online) {
5751 const struct sched_class *class;
5752
5753 for_each_class(class) {
5754 if (class->rq_offline)
5755 class->rq_offline(rq);
5756 }
5757
5758 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5759 rq->online = 0;
5760 }
5761}
5762
5763static void set_cpu_rq_start_time(unsigned int cpu)
5764{
5765 struct rq *rq = cpu_rq(cpu);
5766
5767 rq->age_stamp = sched_clock_cpu(cpu);
5768}
5769
5770
5771
5772
5773static int num_cpus_frozen;
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783static void cpuset_cpu_active(void)
5784{
5785 if (cpuhp_tasks_frozen) {
5786
5787
5788
5789
5790
5791
5792 num_cpus_frozen--;
5793 if (likely(num_cpus_frozen)) {
5794 partition_sched_domains(1, NULL, NULL);
5795 return;
5796 }
5797
5798
5799
5800
5801
5802 }
5803 cpuset_update_active_cpus();
5804}
5805
5806static int cpuset_cpu_inactive(unsigned int cpu)
5807{
5808 unsigned long flags;
5809 struct dl_bw *dl_b;
5810 bool overflow;
5811 int cpus;
5812
5813 if (!cpuhp_tasks_frozen) {
5814 rcu_read_lock_sched();
5815 dl_b = dl_bw_of(cpu);
5816
5817 raw_spin_lock_irqsave(&dl_b->lock, flags);
5818 cpus = dl_bw_cpus(cpu);
5819 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5820 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5821
5822 rcu_read_unlock_sched();
5823
5824 if (overflow)
5825 return -EBUSY;
5826 cpuset_update_active_cpus();
5827 } else {
5828 num_cpus_frozen++;
5829 partition_sched_domains(1, NULL, NULL);
5830 }
5831 return 0;
5832}
5833
5834int sched_cpu_activate(unsigned int cpu)
5835{
5836 struct rq *rq = cpu_rq(cpu);
5837 struct rq_flags rf;
5838
5839 set_cpu_active(cpu, true);
5840
5841 if (sched_smp_initialized) {
5842 sched_domains_numa_masks_set(cpu);
5843 cpuset_cpu_active();
5844 }
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855 rq_lock_irqsave(rq, &rf);
5856 if (rq->rd) {
5857 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5858 set_rq_online(rq);
5859 }
5860 rq_unlock_irqrestore(rq, &rf);
5861
5862 update_max_interval();
5863
5864 return 0;
5865}
5866
5867int sched_cpu_deactivate(unsigned int cpu)
5868{
5869 int ret;
5870
5871 set_cpu_active(cpu, false);
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882 if (IS_ENABLED(CONFIG_PREEMPT))
5883 synchronize_rcu_mult(call_rcu, call_rcu_sched);
5884 else
5885 synchronize_rcu();
5886
5887 if (!sched_smp_initialized)
5888 return 0;
5889
5890 ret = cpuset_cpu_inactive(cpu);
5891 if (ret) {
5892 set_cpu_active(cpu, true);
5893 return ret;
5894 }
5895 sched_domains_numa_masks_clear(cpu);
5896 return 0;
5897}
5898
5899static void sched_rq_cpu_starting(unsigned int cpu)
5900{
5901 struct rq *rq = cpu_rq(cpu);
5902
5903 rq->calc_load_update = calc_load_update;
5904 update_max_interval();
5905}
5906
5907int sched_cpu_starting(unsigned int cpu)
5908{
5909 set_cpu_rq_start_time(cpu);
5910 sched_rq_cpu_starting(cpu);
5911 return 0;
5912}
5913
5914#ifdef CONFIG_HOTPLUG_CPU
5915int sched_cpu_dying(unsigned int cpu)
5916{
5917 struct rq *rq = cpu_rq(cpu);
5918 struct rq_flags rf;
5919
5920
5921 sched_ttwu_pending();
5922
5923 rq_lock_irqsave(rq, &rf);
5924 if (rq->rd) {
5925 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5926 set_rq_offline(rq);
5927 }
5928 migrate_tasks(rq, &rf);
5929 BUG_ON(rq->nr_running != 1);
5930 rq_unlock_irqrestore(rq, &rf);
5931
5932 calc_load_migrate(rq);
5933 update_max_interval();
5934 nohz_balance_exit_idle(cpu);
5935 hrtick_clear(rq);
5936 return 0;
5937}
5938#endif
5939
5940#ifdef CONFIG_SCHED_SMT
5941DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5942
5943static void sched_init_smt(void)
5944{
5945
5946
5947
5948
5949 if (cpumask_weight(cpu_smt_mask(0)) > 1)
5950 static_branch_enable(&sched_smt_present);
5951}
5952#else
5953static inline void sched_init_smt(void) { }
5954#endif
5955
5956void __init sched_init_smp(void)
5957{
5958 cpumask_var_t non_isolated_cpus;
5959
5960 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
5961 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
5962
5963 sched_init_numa();
5964
5965
5966
5967
5968
5969
5970 mutex_lock(&sched_domains_mutex);
5971 init_sched_domains(cpu_active_mask);
5972 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
5973 if (cpumask_empty(non_isolated_cpus))
5974 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
5975 mutex_unlock(&sched_domains_mutex);
5976
5977
5978 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
5979 BUG();
5980 sched_init_granularity();
5981 free_cpumask_var(non_isolated_cpus);
5982
5983 init_sched_rt_class();
5984 init_sched_dl_class();
5985
5986 sched_init_smt();
5987 sched_clock_init_late();
5988
5989 sched_smp_initialized = true;
5990}
5991
5992static int __init migration_init(void)
5993{
5994 sched_rq_cpu_starting(smp_processor_id());
5995 return 0;
5996}
5997early_initcall(migration_init);
5998
5999#else
6000void __init sched_init_smp(void)
6001{
6002 sched_init_granularity();
6003 sched_clock_init_late();
6004}
6005#endif
6006
6007int in_sched_functions(unsigned long addr)
6008{
6009 return in_lock_functions(addr) ||
6010 (addr >= (unsigned long)__sched_text_start
6011 && addr < (unsigned long)__sched_text_end);
6012}
6013
6014#ifdef CONFIG_CGROUP_SCHED
6015
6016
6017
6018
6019struct task_group root_task_group;
6020LIST_HEAD(task_groups);
6021
6022
6023static struct kmem_cache *task_group_cache __read_mostly;
6024#endif
6025
6026DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6027DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
6028
6029#define WAIT_TABLE_BITS 8
6030#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
6031static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
6032
6033wait_queue_head_t *bit_waitqueue(void *word, int bit)
6034{
6035 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
6036 unsigned long val = (unsigned long)word << shift | bit;
6037
6038 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
6039}
6040EXPORT_SYMBOL(bit_waitqueue);
6041
6042void __init sched_init(void)
6043{
6044 int i, j;
6045 unsigned long alloc_size = 0, ptr;
6046
6047 sched_clock_init();
6048
6049 for (i = 0; i < WAIT_TABLE_SIZE; i++)
6050 init_waitqueue_head(bit_wait_table + i);
6051
6052#ifdef CONFIG_FAIR_GROUP_SCHED
6053 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6054#endif
6055#ifdef CONFIG_RT_GROUP_SCHED
6056 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6057#endif
6058 if (alloc_size) {
6059 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6060
6061#ifdef CONFIG_FAIR_GROUP_SCHED
6062 root_task_group.se = (struct sched_entity **)ptr;
6063 ptr += nr_cpu_ids * sizeof(void **);
6064
6065 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6066 ptr += nr_cpu_ids * sizeof(void **);
6067
6068#endif
6069#ifdef CONFIG_RT_GROUP_SCHED
6070 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6071 ptr += nr_cpu_ids * sizeof(void **);
6072
6073 root_task_group.rt_rq = (struct rt_rq **)ptr;
6074 ptr += nr_cpu_ids * sizeof(void **);
6075
6076#endif
6077 }
6078#ifdef CONFIG_CPUMASK_OFFSTACK
6079 for_each_possible_cpu(i) {
6080 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
6081 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
6082 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
6083 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
6084 }
6085#endif
6086
6087 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
6088 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
6089
6090#ifdef CONFIG_SMP
6091 init_defrootdomain();
6092#endif
6093
6094#ifdef CONFIG_RT_GROUP_SCHED
6095 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6096 global_rt_period(), global_rt_runtime());
6097#endif
6098
6099#ifdef CONFIG_CGROUP_SCHED
6100 task_group_cache = KMEM_CACHE(task_group, 0);
6101
6102 list_add(&root_task_group.list, &task_groups);
6103 INIT_LIST_HEAD(&root_task_group.children);
6104 INIT_LIST_HEAD(&root_task_group.siblings);
6105 autogroup_init(&init_task);
6106#endif
6107
6108 for_each_possible_cpu(i) {
6109 struct rq *rq;
6110
6111 rq = cpu_rq(i);
6112 raw_spin_lock_init(&rq->lock);
6113 rq->nr_running = 0;
6114 rq->calc_load_active = 0;
6115 rq->calc_load_update = jiffies + LOAD_FREQ;
6116 init_cfs_rq(&rq->cfs);
6117 init_rt_rq(&rq->rt);
6118 init_dl_rq(&rq->dl);
6119#ifdef CONFIG_FAIR_GROUP_SCHED
6120 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6121 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6122 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6143 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6144#endif
6145
6146 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6147#ifdef CONFIG_RT_GROUP_SCHED
6148 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6149#endif
6150
6151 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6152 rq->cpu_load[j] = 0;
6153
6154#ifdef CONFIG_SMP
6155 rq->sd = NULL;
6156 rq->rd = NULL;
6157 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
6158 rq->balance_callback = NULL;
6159 rq->active_balance = 0;
6160 rq->next_balance = jiffies;
6161 rq->push_cpu = 0;
6162 rq->cpu = i;
6163 rq->online = 0;
6164 rq->idle_stamp = 0;
6165 rq->avg_idle = 2*sysctl_sched_migration_cost;
6166 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6167
6168 INIT_LIST_HEAD(&rq->cfs_tasks);
6169
6170 rq_attach_root(rq, &def_root_domain);
6171#ifdef CONFIG_NO_HZ_COMMON
6172 rq->last_load_update_tick = jiffies;
6173 rq->nohz_flags = 0;
6174#endif
6175#ifdef CONFIG_NO_HZ_FULL
6176 rq->last_sched_tick = 0;
6177#endif
6178#endif
6179 init_rq_hrtick(rq);
6180 atomic_set(&rq->nr_iowait, 0);
6181 }
6182
6183 set_load_weight(&init_task);
6184
	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
6188 mmgrab(&init_mm);
6189 enter_lazy_tlb(&init_mm, current);
6190
	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
6197 init_idle(current, smp_processor_id());
6198
6199 calc_load_update = jiffies + LOAD_FREQ;
6200
6201#ifdef CONFIG_SMP
6202 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6203
6204 if (cpu_isolated_map == NULL)
6205 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6206 idle_thread_set_boot_cpu();
6207 set_cpu_rq_start_time(smp_processor_id());
6208#endif
6209 init_sched_fair_class();
6210
6211 init_schedstats();
6212
6213 scheduler_running = 1;
6214}
6215
6216#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6217static inline int preempt_count_equals(int preempt_offset)
6218{
6219 int nested = preempt_count() + rcu_preempt_depth();
6220
6221 return (nested == preempt_offset);
6222}
6223
6224void __might_sleep(const char *file, int line, int preempt_offset)
6225{
	/*
	 * Blocking primitives will set (and therefore destroy) current->state,
	 * since we will exit with TASK_RUNNING make sure we enter with it,
	 * otherwise we will destroy state.
	 */
6231 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
6232 "do not call blocking ops when !TASK_RUNNING; "
6233 "state=%lx set at [<%p>] %pS\n",
6234 current->state,
6235 (void *)current->task_state_change,
6236 (void *)current->task_state_change);
6237
6238 ___might_sleep(file, line, preempt_offset);
6239}
6240EXPORT_SYMBOL(__might_sleep);
6241
6242void ___might_sleep(const char *file, int line, int preempt_offset)
6243{
	/* Ratelimiting timestamp: */
6245 static unsigned long prev_jiffy;
6246
6247 unsigned long preempt_disable_ip;
6248
	/* Complain about sleeping within an RCU read-side critical section: */
6250 rcu_sleep_check();
6251
6252 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6253 !is_idle_task(current)) ||
6254 system_state != SYSTEM_RUNNING || oops_in_progress)
6255 return;
6256 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6257 return;
6258 prev_jiffy = jiffies;
6259
	/* Save this before calling printk(), since that will clobber it: */
6261 preempt_disable_ip = get_preempt_disable_ip(current);
6262
6263 printk(KERN_ERR
6264 "BUG: sleeping function called from invalid context at %s:%d\n",
6265 file, line);
6266 printk(KERN_ERR
6267 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6268 in_atomic(), irqs_disabled(),
6269 current->pid, current->comm);
6270
6271 if (task_stack_end_corrupted(current))
6272 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
6273
6274 debug_show_held_locks(current);
6275 if (irqs_disabled())
6276 print_irqtrace_events(current);
6277 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
6278 && !preempt_count_equals(preempt_offset)) {
6279 pr_err("Preemption disabled at:");
6280 print_ip_sym(preempt_disable_ip);
6281 pr_cont("\n");
6282 }
6283 dump_stack();
6284 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
6285}
6286EXPORT_SYMBOL(___might_sleep);
6287#endif
6288
6289#ifdef CONFIG_MAGIC_SYSRQ
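/*
 * Reached via the magic SysRq 'n' key: demote all deadline and RT tasks
 * back to SCHED_NORMAL and reset negative nice values, as an emergency
 * way to recover a machine monopolized by runaway realtime tasks.
 */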
6290void normalize_rt_tasks(void)
6291{
6292 struct task_struct *g, *p;
6293 struct sched_attr attr = {
6294 .sched_policy = SCHED_NORMAL,
6295 };
6296
6297 read_lock(&tasklist_lock);
6298 for_each_process_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
6302 if (p->flags & PF_KTHREAD)
6303 continue;
6304
6305 p->se.exec_start = 0;
6306 schedstat_set(p->se.statistics.wait_start, 0);
6307 schedstat_set(p->se.statistics.sleep_start, 0);
6308 schedstat_set(p->se.statistics.block_start, 0);
6309
6310 if (!dl_task(p) && !rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
6315 if (task_nice(p) < 0)
6316 set_user_nice(p, 0);
6317 continue;
6318 }
6319
6320 __sched_setscheduler(p, &attr, false, false);
6321 }
6322 read_unlock(&tasklist_lock);
6323}
6324
6325#endif
6326
6327#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
6346struct task_struct *curr_task(int cpu)
6347{
6348 return cpu_curr(cpu);
6349}
6350
6351#endif
6352
6353#ifdef CONFIG_IA64
/**
 * ia64_set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a CPU in a non-blocking manner. It must be
 * called with all CPUs synchronized and interrupts disabled, and the caller
 * must save the original value of the current task (see curr_task() above)
 * and restore that value before reenabling interrupts and re-starting the
 * system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6369void ia64_set_curr_task(int cpu, struct task_struct *p)
6370{
6371 cpu_curr(cpu) = p;
6372}
6373
6374#endif
6375
6376#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
6378static DEFINE_SPINLOCK(task_group_lock);
6379
6380static void sched_free_group(struct task_group *tg)
6381{
6382 free_fair_sched_group(tg);
6383 free_rt_sched_group(tg);
6384 autogroup_free(tg);
6385 kmem_cache_free(task_group_cache, tg);
6386}
6387
/* Allocate runqueue etc for a new task group */
6389struct task_group *sched_create_group(struct task_group *parent)
6390{
6391 struct task_group *tg;
6392
6393 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
6394 if (!tg)
6395 return ERR_PTR(-ENOMEM);
6396
6397 if (!alloc_fair_sched_group(tg, parent))
6398 goto err;
6399
6400 if (!alloc_rt_sched_group(tg, parent))
6401 goto err;
6402
6403 return tg;
6404
6405err:
6406 sched_free_group(tg);
6407 return ERR_PTR(-ENOMEM);
6408}
6409
6410void sched_online_group(struct task_group *tg, struct task_group *parent)
6411{
6412 unsigned long flags;
6413
6414 spin_lock_irqsave(&task_group_lock, flags);
6415 list_add_rcu(&tg->list, &task_groups);
6416
	/* Root should already exist: */
6418 WARN_ON(!parent);
6419
6420 tg->parent = parent;
6421 INIT_LIST_HEAD(&tg->children);
6422 list_add_rcu(&tg->siblings, &parent->children);
6423 spin_unlock_irqrestore(&task_group_lock, flags);
6424
6425 online_fair_sched_group(tg);
6426}
6427
/* RCU callback to free the structures associated with a task group */
6429static void sched_free_group_rcu(struct rcu_head *rhp)
6430{
	/* Now it should be safe to free those cfs_rqs: */
6432 sched_free_group(container_of(rhp, struct task_group, rcu));
6433}
6434
6435void sched_destroy_group(struct task_group *tg)
6436{
	/* Wait for possible concurrent references to cfs_rqs to complete: */
6438 call_rcu(&tg->rcu, sched_free_group_rcu);
6439}
6440
6441void sched_offline_group(struct task_group *tg)
6442{
6443 unsigned long flags;
6444
	/* End participation in shares distribution: */
6446 unregister_fair_sched_group(tg);
6447
6448 spin_lock_irqsave(&task_group_lock, flags);
6449 list_del_rcu(&tg->list);
6450 list_del_rcu(&tg->siblings);
6451 spin_unlock_irqrestore(&task_group_lock, flags);
6452}
6453
6454static void sched_change_group(struct task_struct *tsk, int type)
6455{
6456 struct task_group *tg;
6457
	/*
	 * All callers are synchronized by task_rq_lock(); we do not use RCU
	 * which is pointless here. Thus, we pass "true" to task_css_check()
	 * to prevent lockdep warnings.
	 */
6463 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
6464 struct task_group, css);
6465 tg = autogroup_task_group(tsk, tg);
6466 tsk->sched_task_group = tg;
6467
6468#ifdef CONFIG_FAIR_GROUP_SCHED
6469 if (tsk->sched_class->task_change_group)
6470 tsk->sched_class->task_change_group(tsk, type);
6471 else
6472#endif
6473 set_task_rq(tsk, task_cpu(tsk));
6474}
6475
/*
 * Change task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now. This function just updates tsk->sched_task_group with the new group.
 */
6483void sched_move_task(struct task_struct *tsk)
6484{
6485 int queued, running, queue_flags =
6486 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6487 struct rq_flags rf;
6488 struct rq *rq;
6489
6490 rq = task_rq_lock(tsk, &rf);
6491 update_rq_clock(rq);
6492
6493 running = task_current(rq, tsk);
6494 queued = task_on_rq_queued(tsk);
6495
6496 if (queued)
6497 dequeue_task(rq, tsk, queue_flags);
6498 if (running)
6499 put_prev_task(rq, tsk);
6500
6501 sched_change_group(tsk, TASK_MOVE_GROUP);
6502
6503 if (queued)
6504 enqueue_task(rq, tsk, queue_flags);
6505 if (running)
6506 set_curr_task(rq, tsk);
6507
6508 task_rq_unlock(rq, tsk, &rf);
6509}
6510#endif
6511
6512#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
6516static DEFINE_MUTEX(rt_constraints_mutex);
6517
/* Must be called with tasklist_lock held */
6519static inline int tg_has_rt_tasks(struct task_group *tg)
6520{
6521 struct task_struct *g, *p;
6522
	/*
	 * Autogroups do not have RT tasks; see autogroup_create().
	 */
6526 if (task_group_is_autogroup(tg))
6527 return 0;
6528
6529 for_each_process_thread(g, p) {
6530 if (rt_task(p) && task_group(p) == tg)
6531 return 1;
6532 }
6533
6534 return 0;
6535}
6536
6537struct rt_schedulable_data {
6538 struct task_group *tg;
6539 u64 rt_period;
6540 u64 rt_runtime;
6541};
6542
6543static int tg_rt_schedulable(struct task_group *tg, void *data)
6544{
6545 struct rt_schedulable_data *d = data;
6546 struct task_group *child;
6547 unsigned long total, sum = 0;
6548 u64 period, runtime;
6549
6550 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6551 runtime = tg->rt_bandwidth.rt_runtime;
6552
6553 if (tg == d->tg) {
6554 period = d->rt_period;
6555 runtime = d->rt_runtime;
6556 }
6557
	/*
	 * Cannot have more runtime than the period.
	 */
6561 if (runtime > period && runtime != RUNTIME_INF)
6562 return -EINVAL;
6563
	/*
	 * Ensure we don't starve existing RT tasks.
	 */
6567 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
6568 return -EBUSY;
6569
6570 total = to_ratio(period, runtime);
6571
	/*
	 * Nobody can have more than the global setting allows.
	 */
6575 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
6576 return -EINVAL;
6577
	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
6581 list_for_each_entry_rcu(child, &tg->children, siblings) {
6582 period = ktime_to_ns(child->rt_bandwidth.rt_period);
6583 runtime = child->rt_bandwidth.rt_runtime;
6584
6585 if (child == d->tg) {
6586 period = d->rt_period;
6587 runtime = d->rt_runtime;
6588 }
6589
6590 sum += to_ratio(period, runtime);
6591 }
6592
6593 if (sum > total)
6594 return -EINVAL;
6595
6596 return 0;
6597}
6598
6599static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
6600{
6601 int ret;
6602
6603 struct rt_schedulable_data data = {
6604 .tg = tg,
6605 .rt_period = period,
6606 .rt_runtime = runtime,
6607 };
6608
6609 rcu_read_lock();
6610 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
6611 rcu_read_unlock();
6612
6613 return ret;
6614}
6615
6616static int tg_set_rt_bandwidth(struct task_group *tg,
6617 u64 rt_period, u64 rt_runtime)
6618{
6619 int i, err = 0;
6620
	/*
	 * Disallowing the root group RT runtime is BAD, it would disallow the
	 * kernel creating (and or operating) RT threads.
	 */
6625 if (tg == &root_task_group && rt_runtime == 0)
6626 return -EINVAL;
6627
	/* No period doesn't make any sense. */
6629 if (rt_period == 0)
6630 return -EINVAL;
6631
6632 mutex_lock(&rt_constraints_mutex);
6633 read_lock(&tasklist_lock);
6634 err = __rt_schedulable(tg, rt_period, rt_runtime);
6635 if (err)
6636 goto unlock;
6637
6638 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6639 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
6640 tg->rt_bandwidth.rt_runtime = rt_runtime;
6641
6642 for_each_possible_cpu(i) {
6643 struct rt_rq *rt_rq = tg->rt_rq[i];
6644
6645 raw_spin_lock(&rt_rq->rt_runtime_lock);
6646 rt_rq->rt_runtime = rt_runtime;
6647 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6648 }
6649 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6650unlock:
6651 read_unlock(&tasklist_lock);
6652 mutex_unlock(&rt_constraints_mutex);
6653
6654 return err;
6655}
6656
6657static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
6658{
6659 u64 rt_runtime, rt_period;
6660
6661 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6662 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
6663 if (rt_runtime_us < 0)
6664 rt_runtime = RUNTIME_INF;
6665
6666 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6667}
6668
6669static long sched_group_rt_runtime(struct task_group *tg)
6670{
6671 u64 rt_runtime_us;
6672
6673 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
6674 return -1;
6675
6676 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
6677 do_div(rt_runtime_us, NSEC_PER_USEC);
6678 return rt_runtime_us;
6679}
6680
6681static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
6682{
6683 u64 rt_runtime, rt_period;
6684
6685 rt_period = rt_period_us * NSEC_PER_USEC;
6686 rt_runtime = tg->rt_bandwidth.rt_runtime;
6687
6688 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6689}
6690
6691static long sched_group_rt_period(struct task_group *tg)
6692{
6693 u64 rt_period_us;
6694
6695 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
6696 do_div(rt_period_us, NSEC_PER_USEC);
6697 return rt_period_us;
6698}
6699#endif
6700
6701#ifdef CONFIG_RT_GROUP_SCHED
6702static int sched_rt_global_constraints(void)
6703{
6704 int ret = 0;
6705
6706 mutex_lock(&rt_constraints_mutex);
6707 read_lock(&tasklist_lock);
6708 ret = __rt_schedulable(NULL, 0, 0);
6709 read_unlock(&tasklist_lock);
6710 mutex_unlock(&rt_constraints_mutex);
6711
6712 return ret;
6713}
6714
6715static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6716{
	/* Don't accept realtime tasks when there is no way for them to run: */
6718 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
6719 return 0;
6720
6721 return 1;
6722}
6723
6724#else
6725static int sched_rt_global_constraints(void)
6726{
6727 unsigned long flags;
6728 int i;
6729
6730 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6731 for_each_possible_cpu(i) {
6732 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
6733
6734 raw_spin_lock(&rt_rq->rt_runtime_lock);
6735 rt_rq->rt_runtime = global_rt_runtime();
6736 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6737 }
6738 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6739
6740 return 0;
6741}
6742#endif
6743
6744static int sched_dl_global_validate(void)
6745{
6746 u64 runtime = global_rt_runtime();
6747 u64 period = global_rt_period();
6748 u64 new_bw = to_ratio(period, runtime);
6749 struct dl_bw *dl_b;
6750 int cpu, ret = 0;
6751 unsigned long flags;
6752
	/*
	 * Here we want to check that the new bandwidth is not smaller than
	 * the bandwidth currently allocated to deadline tasks in any of the
	 * root_domains: shrinking below what is already committed would
	 * break existing admission-control guarantees.
	 */
6762 for_each_possible_cpu(cpu) {
6763 rcu_read_lock_sched();
6764 dl_b = dl_bw_of(cpu);
6765
6766 raw_spin_lock_irqsave(&dl_b->lock, flags);
6767 if (new_bw < dl_b->total_bw)
6768 ret = -EBUSY;
6769 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
6770
6771 rcu_read_unlock_sched();
6772
6773 if (ret)
6774 break;
6775 }
6776
6777 return ret;
6778}
6779
6780static void sched_dl_do_global(void)
6781{
6782 u64 new_bw = -1;
6783 struct dl_bw *dl_b;
6784 int cpu;
6785 unsigned long flags;
6786
6787 def_dl_bandwidth.dl_period = global_rt_period();
6788 def_dl_bandwidth.dl_runtime = global_rt_runtime();
6789
6790 if (global_rt_runtime() != RUNTIME_INF)
6791 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
6792
	/*
	 * Propagate the new global bandwidth to the dl_bw of every possible
	 * CPU's root domain.
	 */
6796 for_each_possible_cpu(cpu) {
6797 rcu_read_lock_sched();
6798 dl_b = dl_bw_of(cpu);
6799
6800 raw_spin_lock_irqsave(&dl_b->lock, flags);
6801 dl_b->bw = new_bw;
6802 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
6803
6804 rcu_read_unlock_sched();
6805 }
6806}
6807
6808static int sched_rt_global_validate(void)
6809{
6810 if (sysctl_sched_rt_period <= 0)
6811 return -EINVAL;
6812
6813 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
6814 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
6815 return -EINVAL;
6816
6817 return 0;
6818}
6819
6820static void sched_rt_do_global(void)
6821{
6822 def_rt_bandwidth.rt_runtime = global_rt_runtime();
6823 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
6824}
6825
6826int sched_rt_handler(struct ctl_table *table, int write,
6827 void __user *buffer, size_t *lenp,
6828 loff_t *ppos)
6829{
6830 int old_period, old_runtime;
6831 static DEFINE_MUTEX(mutex);
6832 int ret;
6833
6834 mutex_lock(&mutex);
6835 old_period = sysctl_sched_rt_period;
6836 old_runtime = sysctl_sched_rt_runtime;
6837
6838 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6839
6840 if (!ret && write) {
6841 ret = sched_rt_global_validate();
6842 if (ret)
6843 goto undo;
6844
6845 ret = sched_dl_global_validate();
6846 if (ret)
6847 goto undo;
6848
6849 ret = sched_rt_global_constraints();
6850 if (ret)
6851 goto undo;
6852
6853 sched_rt_do_global();
6854 sched_dl_do_global();
6855 }
6856 if (0) {
6857undo:
6858 sysctl_sched_rt_period = old_period;
6859 sysctl_sched_rt_runtime = old_runtime;
6860 }
6861 mutex_unlock(&mutex);
6862
6863 return ret;
6864}
6865
6866int sched_rr_handler(struct ctl_table *table, int write,
6867 void __user *buffer, size_t *lenp,
6868 loff_t *ppos)
6869{
6870 int ret;
6871 static DEFINE_MUTEX(mutex);
6872
6873 mutex_lock(&mutex);
6874 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6875
	/*
	 * Make sure that internally we keep jiffies.
	 * Also, writing zero resets the timeslice to default:
	 */
6879 if (!ret && write) {
6880 sched_rr_timeslice =
6881 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
6882 msecs_to_jiffies(sysctl_sched_rr_timeslice);
6883 }
6884 mutex_unlock(&mutex);
6885 return ret;
6886}
6887
6888#ifdef CONFIG_CGROUP_SCHED
6889
6890static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
6891{
6892 return css ? container_of(css, struct task_group, css) : NULL;
6893}
6894
6895static struct cgroup_subsys_state *
6896cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6897{
6898 struct task_group *parent = css_tg(parent_css);
6899 struct task_group *tg;
6900
6901 if (!parent) {
		/* This is early initialization for the top cgroup */
6903 return &root_task_group.css;
6904 }
6905
6906 tg = sched_create_group(parent);
6907 if (IS_ERR(tg))
6908 return ERR_PTR(-ENOMEM);
6909
6910 return &tg->css;
6911}
6912
/* Expose the task group only after completing cgroup initialization */
6914static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
6915{
6916 struct task_group *tg = css_tg(css);
6917 struct task_group *parent = css_tg(css->parent);
6918
6919 if (parent)
6920 sched_online_group(tg, parent);
6921 return 0;
6922}
6923
6924static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
6925{
6926 struct task_group *tg = css_tg(css);
6927
6928 sched_offline_group(tg);
6929}
6930
6931static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
6932{
6933 struct task_group *tg = css_tg(css);
6934
	/*
	 * Relies on the RCU grace period between css_released() and this.
	 */
6938 sched_free_group(tg);
6939}
6940
/*
 * This is called before wake_up_new_task(), therefore we really only
 * have to set its group bits, all the other stuff does not apply.
 */
6945static void cpu_cgroup_fork(struct task_struct *task)
6946{
6947 struct rq_flags rf;
6948 struct rq *rq;
6949
6950 rq = task_rq_lock(task, &rf);
6951
6952 update_rq_clock(rq);
6953 sched_change_group(task, TASK_SET_GROUP);
6954
6955 task_rq_unlock(rq, task, &rf);
6956}
6957
6958static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
6959{
6960 struct task_struct *task;
6961 struct cgroup_subsys_state *css;
6962 int ret = 0;
6963
6964 cgroup_taskset_for_each(task, css, tset) {
6965#ifdef CONFIG_RT_GROUP_SCHED
6966 if (!sched_rt_can_attach(css_tg(css), task))
6967 return -EINVAL;
6968#else
		/* We don't support RT-tasks being in separate groups */
6970 if (task->sched_class != &fair_sched_class)
6971 return -EINVAL;
6972#endif
6973
		/*
		 * Serialize against wake_up_new_task() such that if it is
		 * running, we're sure to observe its full state.
		 */
6977 raw_spin_lock_irq(&task->pi_lock);
		/*
		 * Avoid calling sched_move_task() before wake_up_new_task()
		 * has happened. This would lead to problems with PELT, due to
		 * move wanting to detach+attach while we're not attached yet.
		 */
6983 if (task->state == TASK_NEW)
6984 ret = -EINVAL;
6985 raw_spin_unlock_irq(&task->pi_lock);
6986
6987 if (ret)
6988 break;
6989 }
6990 return ret;
6991}
6992
6993static void cpu_cgroup_attach(struct cgroup_taskset *tset)
6994{
6995 struct task_struct *task;
6996 struct cgroup_subsys_state *css;
6997
6998 cgroup_taskset_for_each(task, css, tset)
6999 sched_move_task(task);
7000}
7001
7002#ifdef CONFIG_FAIR_GROUP_SCHED
7003static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7004 struct cftype *cftype, u64 shareval)
7005{
7006 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7007}
7008
7009static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7010 struct cftype *cft)
7011{
7012 struct task_group *tg = css_tg(css);
7013
7014 return (u64) scale_load_down(tg->shares);
7015}
7016
7017#ifdef CONFIG_CFS_BANDWIDTH
7018static DEFINE_MUTEX(cfs_constraints_mutex);
7019
7020const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7021const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
7022
7023static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7024
7025static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7026{
7027 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7028 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7029
7030 if (tg == &root_task_group)
7031 return -EINVAL;
7032
	/*
	 * Ensure we have at least some amount of bandwidth every period. This
	 * is to prevent reaching a state of large arrears when throttled via
	 * entity_tick(), resulting in prolonged exit starvation.
	 */
7038 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7039 return -EINVAL;
7040
	/*
	 * Likewise, bound things on the other side by preventing insane quota
	 * periods. This also allows us to normalize in computing quota
	 * feasibility.
	 */
7046 if (period > max_cfs_quota_period)
7047 return -EINVAL;
7048
	/*
	 * Prevent race between setting of cfs_rq->runtime_enabled and
	 * unthrottle_offline_cfs_rqs().
	 */
7053 get_online_cpus();
7054 mutex_lock(&cfs_constraints_mutex);
7055 ret = __cfs_schedulable(tg, period, quota);
7056 if (ret)
7057 goto out_unlock;
7058
7059 runtime_enabled = quota != RUNTIME_INF;
7060 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7061
	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards.
	 */
7065 if (runtime_enabled && !runtime_was_enabled)
7066 cfs_bandwidth_usage_inc();
7067 raw_spin_lock_irq(&cfs_b->lock);
7068 cfs_b->period = ns_to_ktime(period);
7069 cfs_b->quota = quota;
7070
7071 __refill_cfs_bandwidth_runtime(cfs_b);
7072
	/* Restart the period timer (if active) to handle new period expiry: */
7074 if (runtime_enabled)
7075 start_cfs_bandwidth(cfs_b);
7076
7077 raw_spin_unlock_irq(&cfs_b->lock);
7078
7079 for_each_online_cpu(i) {
7080 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7081 struct rq *rq = cfs_rq->rq;
7082 struct rq_flags rf;
7083
7084 rq_lock_irq(rq, &rf);
7085 cfs_rq->runtime_enabled = runtime_enabled;
7086 cfs_rq->runtime_remaining = 0;
7087
7088 if (cfs_rq->throttled)
7089 unthrottle_cfs_rq(cfs_rq);
7090 rq_unlock_irq(rq, &rf);
7091 }
7092 if (runtime_was_enabled && !runtime_enabled)
7093 cfs_bandwidth_usage_dec();
7094out_unlock:
7095 mutex_unlock(&cfs_constraints_mutex);
7096 put_online_cpus();
7097
7098 return ret;
7099}
7100
7101int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7102{
7103 u64 quota, period;
7104
7105 period = ktime_to_ns(tg->cfs_bandwidth.period);
7106 if (cfs_quota_us < 0)
7107 quota = RUNTIME_INF;
7108 else
7109 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7110
7111 return tg_set_cfs_bandwidth(tg, period, quota);
7112}
7113
7114long tg_get_cfs_quota(struct task_group *tg)
7115{
7116 u64 quota_us;
7117
7118 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7119 return -1;
7120
7121 quota_us = tg->cfs_bandwidth.quota;
7122 do_div(quota_us, NSEC_PER_USEC);
7123
7124 return quota_us;
7125}
7126
7127int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7128{
7129 u64 quota, period;
7130
7131 period = (u64)cfs_period_us * NSEC_PER_USEC;
7132 quota = tg->cfs_bandwidth.quota;
7133
7134 return tg_set_cfs_bandwidth(tg, period, quota);
7135}
7136
7137long tg_get_cfs_period(struct task_group *tg)
7138{
7139 u64 cfs_period_us;
7140
7141 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7142 do_div(cfs_period_us, NSEC_PER_USEC);
7143
7144 return cfs_period_us;
7145}
7146
7147static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7148 struct cftype *cft)
7149{
7150 return tg_get_cfs_quota(css_tg(css));
7151}
7152
7153static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7154 struct cftype *cftype, s64 cfs_quota_us)
7155{
7156 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7157}
7158
7159static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7160 struct cftype *cft)
7161{
7162 return tg_get_cfs_period(css_tg(css));
7163}
7164
7165static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7166 struct cftype *cftype, u64 cfs_period_us)
7167{
7168 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7169}
7170
7171struct cfs_schedulable_data {
7172 struct task_group *tg;
7173 u64 period, quota;
7174};
7175
/*
 * Convert a group's quota/period pair into a normalized bandwidth ratio
 * (via to_ratio()) so that groups using different periods can be compared
 * during the hierarchy walk below.
 */
7180static u64 normalize_cfs_quota(struct task_group *tg,
7181 struct cfs_schedulable_data *d)
7182{
7183 u64 quota, period;
7184
7185 if (tg == d->tg) {
7186 period = d->period;
7187 quota = d->quota;
7188 } else {
7189 period = tg_get_cfs_period(tg);
7190 quota = tg_get_cfs_quota(tg);
7191 }
7192
	/* Note: these should typically be equivalent */
7194 if (quota == RUNTIME_INF || quota == -1)
7195 return RUNTIME_INF;
7196
7197 return to_ratio(period, quota);
7198}
7199
7200static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7201{
7202 struct cfs_schedulable_data *d = data;
7203 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7204 s64 quota = 0, parent_quota = -1;
7205
7206 if (!tg->parent) {
7207 quota = RUNTIME_INF;
7208 } else {
7209 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7210
7211 quota = normalize_cfs_quota(tg, d);
7212 parent_quota = parent_b->hierarchical_quota;
7213
		/*
		 * Ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set:
		 */
7218 if (quota == RUNTIME_INF)
7219 quota = parent_quota;
7220 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7221 return -EINVAL;
7222 }
7223 cfs_b->hierarchical_quota = quota;
7224
7225 return 0;
7226}
7227
7228static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7229{
7230 int ret;
7231 struct cfs_schedulable_data data = {
7232 .tg = tg,
7233 .period = period,
7234 .quota = quota,
7235 };
7236
7237 if (quota != RUNTIME_INF) {
7238 do_div(data.period, NSEC_PER_USEC);
7239 do_div(data.quota, NSEC_PER_USEC);
7240 }
7241
7242 rcu_read_lock();
7243 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7244 rcu_read_unlock();
7245
7246 return ret;
7247}
7248
7249static int cpu_stats_show(struct seq_file *sf, void *v)
7250{
7251 struct task_group *tg = css_tg(seq_css(sf));
7252 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7253
7254 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7255 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7256 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7257
7258 return 0;
7259}
7260#endif
7261#endif
7262
7263#ifdef CONFIG_RT_GROUP_SCHED
7264static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7265 struct cftype *cft, s64 val)
7266{
7267 return sched_group_set_rt_runtime(css_tg(css), val);
7268}
7269
7270static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7271 struct cftype *cft)
7272{
7273 return sched_group_rt_runtime(css_tg(css));
7274}
7275
7276static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7277 struct cftype *cftype, u64 rt_period_us)
7278{
7279 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7280}
7281
7282static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7283 struct cftype *cft)
7284{
7285 return sched_group_rt_period(css_tg(css));
7286}
7287#endif
7288
7289static struct cftype cpu_files[] = {
7290#ifdef CONFIG_FAIR_GROUP_SCHED
7291 {
7292 .name = "shares",
7293 .read_u64 = cpu_shares_read_u64,
7294 .write_u64 = cpu_shares_write_u64,
7295 },
7296#endif
7297#ifdef CONFIG_CFS_BANDWIDTH
7298 {
7299 .name = "cfs_quota_us",
7300 .read_s64 = cpu_cfs_quota_read_s64,
7301 .write_s64 = cpu_cfs_quota_write_s64,
7302 },
7303 {
7304 .name = "cfs_period_us",
7305 .read_u64 = cpu_cfs_period_read_u64,
7306 .write_u64 = cpu_cfs_period_write_u64,
7307 },
7308 {
7309 .name = "stat",
7310 .seq_show = cpu_stats_show,
7311 },
7312#endif
7313#ifdef CONFIG_RT_GROUP_SCHED
7314 {
7315 .name = "rt_runtime_us",
7316 .read_s64 = cpu_rt_runtime_read,
7317 .write_s64 = cpu_rt_runtime_write,
7318 },
7319 {
7320 .name = "rt_period_us",
7321 .read_u64 = cpu_rt_period_read_uint,
7322 .write_u64 = cpu_rt_period_write_uint,
7323 },
7324#endif
7325 { }
7326};
7327
7328struct cgroup_subsys cpu_cgrp_subsys = {
7329 .css_alloc = cpu_cgroup_css_alloc,
7330 .css_online = cpu_cgroup_css_online,
7331 .css_released = cpu_cgroup_css_released,
7332 .css_free = cpu_cgroup_css_free,
7333 .fork = cpu_cgroup_fork,
7334 .can_attach = cpu_cgroup_can_attach,
7335 .attach = cpu_cgroup_attach,
7336 .legacy_cftypes = cpu_files,
7337 .early_init = true,
7338};
7339
7340#endif
7341
7342void dump_cpu_task(int cpu)
7343{
7344 pr_info("Task dump for CPU %d:\n", cpu);
7345 sched_show_task(cpu_curr(cpu));
7346}
7347
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
7360const int sched_prio_to_weight[40] = {
7361 88761, 71755, 56483, 46273, 36291,
7362 29154, 23254, 18705, 14949, 11916,
7363 9548, 7620, 6100, 4904, 3906,
7364 3121, 2501, 1991, 1586, 1277,
7365 1024, 820, 655, 526, 423,
7366 335, 272, 215, 172, 137,
7367 110, 87, 70, 56, 45,
7368 36, 29, 23, 18, 15,
7369};
7370
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
7378const u32 sched_prio_to_wmult[40] = {
7379 48388, 59856, 76040, 92818, 118348,
7380 147320, 184698, 229616, 287308, 360437,
7381 449829, 563644, 704093, 875809, 1099582,
7382 1376151, 1717300, 2157191, 2708050, 3363326,
7383 4194304, 5237765, 6557202, 8165337, 10153587,
7384 12820798, 15790321, 19976592, 24970740, 31350126,
7385 39045157, 49367440, 61356676, 76695844, 95443717,
7386 119304647, 148102320, 186737708, 238609294, 286331153,
7387};
7388