/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls.
 */
29#include <linux/kasan.h>
30#include <linux/mm.h>
31#include <linux/module.h>
32#include <linux/nmi.h>
33#include <linux/init.h>
34#include <linux/uaccess.h>
35#include <linux/highmem.h>
36#include <linux/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/perf_event.h>
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
46#include <linux/freezer.h>
47#include <linux/vmalloc.h>
48#include <linux/blkdev.h>
49#include <linux/delay.h>
50#include <linux/pid_namespace.h>
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
58#include <linux/proc_fs.h>
59#include <linux/seq_file.h>
60#include <linux/sysctl.h>
61#include <linux/syscalls.h>
62#include <linux/times.h>
63#include <linux/tsacct_kern.h>
64#include <linux/kprobes.h>
65#include <linux/delayacct.h>
66#include <linux/unistd.h>
67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/context_tracking.h>
75#include <linux/compiler.h>
76#include <linux/frame.h>
77#include <linux/prefetch.h>
78
79#include <asm/switch_to.h>
80#include <asm/tlb.h>
81#include <asm/irq_regs.h>
82#include <asm/mutex.h>
83#ifdef CONFIG_PARAVIRT
84#include <asm/paravirt.h>
85#endif
86
87#include "sched.h"
88#include "../workqueue_internal.h"
89#include "../smpboot.h"
90
91#define CREATE_TRACE_POINTS
92#include <trace/events/sched.h>
93
94DEFINE_MUTEX(sched_domains_mutex);
95DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
96
97static void update_rq_clock_task(struct rq *rq, s64 delta);
98
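/*
 * update_rq_clock() - advance rq->clock by the time elapsed since the last
 * update, as reported by sched_clock_cpu().  Requires rq->lock to be held;
 * the update is skipped while RQCF_ACT_SKIP is set, and negative deltas
 * are ignored.
 */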
99void update_rq_clock(struct rq *rq)
100{
101 s64 delta;
102
103 lockdep_assert_held(&rq->lock);
104
105 if (rq->clock_skip_update & RQCF_ACT_SKIP)
106 return;
107
108 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
109 if (delta < 0)
110 return;
111 rq->clock += delta;
112 update_rq_clock_task(rq, delta);
113}
114
115
116
117
118
119#define SCHED_FEAT(name, enabled) \
120 (1UL << __SCHED_FEAT_##name) * enabled |
121
122const_debug unsigned int sysctl_sched_features =
123#include "features.h"
124 0;
125
126#undef SCHED_FEAT
127
128
129
130
131
132const_debug unsigned int sysctl_sched_nr_migrate = 32;
133
134
135
136
137
138
139
140const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
141
142
143
144
145
146unsigned int sysctl_sched_rt_period = 1000000;
147
148__read_mostly int scheduler_running;
149
150
151
152
153
154int sysctl_sched_rt_runtime = 950000;
155
156
157cpumask_var_t cpu_isolated_map;
158
159
160
161
162static struct rq *this_rq_lock(void)
163 __acquires(rq->lock)
164{
165 struct rq *rq;
166
167 local_irq_disable();
168 rq = this_rq();
169 raw_spin_lock(&rq->lock);
170
171 return rq;
172}
173
174
175
176
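/*
 * __task_rq_lock() - lock the runqueue @p resides on.  The caller must
 * already hold p->pi_lock; we retry until the runqueue is stable and the
 * task is not marked as migrating, then pin rq->lock and return it.
 */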
177struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
178 __acquires(rq->lock)
179{
180 struct rq *rq;
181
182 lockdep_assert_held(&p->pi_lock);
183
184 for (;;) {
185 rq = task_rq(p);
186 raw_spin_lock(&rq->lock);
187 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
188 rf->cookie = lockdep_pin_lock(&rq->lock);
189 return rq;
190 }
191 raw_spin_unlock(&rq->lock);
192
193 while (unlikely(task_on_rq_migrating(p)))
194 cpu_relax();
195 }
196}
197
198
199
200
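/*
 * task_rq_lock() - take p->pi_lock (disabling interrupts) and the runqueue
 * lock of the CPU @p resides on, retrying until both remain valid together.
 */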
201struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
202 __acquires(p->pi_lock)
203 __acquires(rq->lock)
204{
205 struct rq *rq;
206
207 for (;;) {
208 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
209 rq = task_rq(p);
210 raw_spin_lock(&rq->lock);
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
228 rf->cookie = lockdep_pin_lock(&rq->lock);
229 return rq;
230 }
231 raw_spin_unlock(&rq->lock);
232 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
233
234 while (unlikely(task_on_rq_migrating(p)))
235 cpu_relax();
236 }
237}
238
239#ifdef CONFIG_SCHED_HRTICK
240
241
242
243
244static void hrtick_clear(struct rq *rq)
245{
246 if (hrtimer_active(&rq->hrtick_timer))
247 hrtimer_cancel(&rq->hrtick_timer);
248}
249
250
251
252
253
254static enum hrtimer_restart hrtick(struct hrtimer *timer)
255{
256 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
257
258 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
259
260 raw_spin_lock(&rq->lock);
261 update_rq_clock(rq);
262 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
263 raw_spin_unlock(&rq->lock);
264
265 return HRTIMER_NORESTART;
266}
267
268#ifdef CONFIG_SMP
269
270static void __hrtick_restart(struct rq *rq)
271{
272 struct hrtimer *timer = &rq->hrtick_timer;
273
274 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
275}
276
277
278
279
280static void __hrtick_start(void *arg)
281{
282 struct rq *rq = arg;
283
284 raw_spin_lock(&rq->lock);
285 __hrtick_restart(rq);
286 rq->hrtick_csd_pending = 0;
287 raw_spin_unlock(&rq->lock);
288}
289
290
291
292
293
294
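/*
 * hrtick_start() - arm the high-resolution tick to fire after @delay ns,
 * clamped to a 10us minimum, so the slice can be enforced more precisely
 * than the regular tick allows.  Called with rq->lock held; remote
 * runqueues are kicked via the hrtick_csd smp_call_function entry.
 */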
295void hrtick_start(struct rq *rq, u64 delay)
296{
297 struct hrtimer *timer = &rq->hrtick_timer;
298 ktime_t time;
299 s64 delta;
300
301
302
303
304
305 delta = max_t(s64, delay, 10000LL);
306 time = ktime_add_ns(timer->base->get_time(), delta);
307
308 hrtimer_set_expires(timer, time);
309
310 if (rq == this_rq()) {
311 __hrtick_restart(rq);
312 } else if (!rq->hrtick_csd_pending) {
313 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
314 rq->hrtick_csd_pending = 1;
315 }
316}
317
318#else
319
320
321
322
323
324void hrtick_start(struct rq *rq, u64 delay)
325{
326
327
328
329
330 delay = max_t(u64, delay, 10000LL);
331 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
332 HRTIMER_MODE_REL_PINNED);
333}
334#endif
335
336static void init_rq_hrtick(struct rq *rq)
337{
338#ifdef CONFIG_SMP
339 rq->hrtick_csd_pending = 0;
340
341 rq->hrtick_csd.flags = 0;
342 rq->hrtick_csd.func = __hrtick_start;
343 rq->hrtick_csd.info = rq;
344#endif
345
346 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
347 rq->hrtick_timer.function = hrtick;
348}
349#else
350static inline void hrtick_clear(struct rq *rq)
351{
352}
353
354static inline void init_rq_hrtick(struct rq *rq)
355{
356}
357#endif
358
359
360
361
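/*
 * fetch_or() - atomically OR @mask into *@ptr and return the value it had
 * before the update, implemented as a cmpxchg() retry loop.
 */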
362#define fetch_or(ptr, mask) \
363 ({ \
364 typeof(ptr) _ptr = (ptr); \
365 typeof(mask) _mask = (mask); \
366 typeof(*_ptr) _old, _val = *_ptr; \
367 \
368 for (;;) { \
369 _old = cmpxchg(_ptr, _val, _val | _mask); \
370 if (_old == _val) \
371 break; \
372 _val = _old; \
373 } \
374 _old; \
375})
376
377#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
378
379
380
381
382
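/*
 * set_nr_and_not_polling() - atomically set TIF_NEED_RESCHED and return
 * true if the task was not polling the flag, i.e. a reschedule IPI is
 * actually needed for it to notice.
 */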
383static bool set_nr_and_not_polling(struct task_struct *p)
384{
385 struct thread_info *ti = task_thread_info(p);
386 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
387}
388
389
390
391
392
393
394
395static bool set_nr_if_polling(struct task_struct *p)
396{
397 struct thread_info *ti = task_thread_info(p);
398 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
399
400 for (;;) {
401 if (!(val & _TIF_POLLING_NRFLAG))
402 return false;
403 if (val & _TIF_NEED_RESCHED)
404 return true;
405 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
406 if (old == val)
407 break;
408 val = old;
409 }
410 return true;
411}
412
413#else
414static bool set_nr_and_not_polling(struct task_struct *p)
415{
416 set_tsk_need_resched(p);
417 return true;
418}
419
420#ifdef CONFIG_SMP
421static bool set_nr_if_polling(struct task_struct *p)
422{
423 return false;
424}
425#endif
426#endif
427
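/*
 * wake_q_add() - queue @task for deferred wakeup by wake_up_q().  The
 * cmpxchg() on node->next serializes concurrent enqueues: only the winner
 * appends the task and takes a reference, which wake_up_q() drops again
 * after issuing the wakeup.
 */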
428void wake_q_add(struct wake_q_head *head, struct task_struct *task)
429{
430 struct wake_q_node *node = &task->wake_q;
431
432
433
434
435
436
437
438
439
440 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
441 return;
442
443 get_task_struct(task);
444
445
446
447
448 *head->lastp = node;
449 head->lastp = &node->next;
450}
451
452void wake_up_q(struct wake_q_head *head)
453{
454 struct wake_q_node *node = head->first;
455
456 while (node != WAKE_Q_TAIL) {
457 struct task_struct *task;
458
459 task = container_of(node, struct task_struct, wake_q);
460 BUG_ON(!task);
461
462 node = node->next;
463 task->wake_q.next = NULL;
464
465
466
467
468
469 wake_up_process(task);
470 put_task_struct(task);
471 }
472}
473
474
475
476
477
478
479
480
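/*
 * resched_curr() - mark the task currently running on @rq for rescheduling.
 * On the local CPU this sets TIF_NEED_RESCHED and folds the preempt count;
 * remote CPUs get an IPI unless they are already polling the flag.
 */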
481void resched_curr(struct rq *rq)
482{
483 struct task_struct *curr = rq->curr;
484 int cpu;
485
486 lockdep_assert_held(&rq->lock);
487
488 if (test_tsk_need_resched(curr))
489 return;
490
491 cpu = cpu_of(rq);
492
493 if (cpu == smp_processor_id()) {
494 set_tsk_need_resched(curr);
495 set_preempt_need_resched();
496 return;
497 }
498
499 if (set_nr_and_not_polling(curr))
500 smp_send_reschedule(cpu);
501 else
502 trace_sched_wake_idle_without_ipi(cpu);
503}
504
505void resched_cpu(int cpu)
506{
507 struct rq *rq = cpu_rq(cpu);
508 unsigned long flags;
509
510 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
511 return;
512 resched_curr(rq);
513 raw_spin_unlock_irqrestore(&rq->lock, flags);
514}
515
516#ifdef CONFIG_SMP
517#ifdef CONFIG_NO_HZ_COMMON
518
519
520
521
522
523
524
525
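/*
 * get_nohz_timer_target() - pick a CPU to run an unpinned timer on: the
 * current CPU if it is busy and doing housekeeping, otherwise the nearest
 * busy housekeeping CPU found by walking the scheduler domains.
 */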
526int get_nohz_timer_target(void)
527{
528 int i, cpu = smp_processor_id();
529 struct sched_domain *sd;
530
531 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
532 return cpu;
533
534 rcu_read_lock();
535 for_each_domain(cpu, sd) {
536 for_each_cpu(i, sched_domain_span(sd)) {
537 if (cpu == i)
538 continue;
539
540 if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
541 cpu = i;
542 goto unlock;
543 }
544 }
545 }
546
547 if (!is_housekeeping_cpu(cpu))
548 cpu = housekeeping_any_cpu();
549unlock:
550 rcu_read_unlock();
551 return cpu;
552}
553
554
555
556
557
558
559
560
561
562
563static void wake_up_idle_cpu(int cpu)
564{
565 struct rq *rq = cpu_rq(cpu);
566
567 if (cpu == smp_processor_id())
568 return;
569
570 if (set_nr_and_not_polling(rq->idle))
571 smp_send_reschedule(cpu);
572 else
573 trace_sched_wake_idle_without_ipi(cpu);
574}
575
576static bool wake_up_full_nohz_cpu(int cpu)
577{
578
579
580
581
582
583
584 if (cpu_is_offline(cpu))
585 return true;
586 if (tick_nohz_full_cpu(cpu)) {
587 if (cpu != smp_processor_id() ||
588 tick_nohz_tick_stopped())
589 tick_nohz_full_kick_cpu(cpu);
590 return true;
591 }
592
593 return false;
594}
595
596
597
598
599
600
601void wake_up_nohz_cpu(int cpu)
602{
603 if (!wake_up_full_nohz_cpu(cpu))
604 wake_up_idle_cpu(cpu);
605}
606
607static inline bool got_nohz_idle_kick(void)
608{
609 int cpu = smp_processor_id();
610
611 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
612 return false;
613
614 if (idle_cpu(cpu) && !need_resched())
615 return true;
616
617
618
619
620
621 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
622 return false;
623}
624
625#else
626
627static inline bool got_nohz_idle_kick(void)
628{
629 return false;
630}
631
632#endif
633
634#ifdef CONFIG_NO_HZ_FULL
635bool sched_can_stop_tick(struct rq *rq)
636{
637 int fifo_nr_running;
638
639
640 if (rq->dl.dl_nr_running)
641 return false;
642
643
644
645
646
647 if (rq->rt.rr_nr_running) {
648 if (rq->rt.rr_nr_running == 1)
649 return true;
650 else
651 return false;
652 }
653
654
655
656
657
658 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
659 if (fifo_nr_running)
660 return true;
661
662
663
664
665
666
667 if (rq->nr_running > 1)
668 return false;
669
670 return true;
671}
672#endif
673
674void sched_avg_update(struct rq *rq)
675{
676 s64 period = sched_avg_period();
677
678 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
679
680
681
682
683
684 asm("" : "+rm" (rq->age_stamp));
685 rq->age_stamp += period;
686 rq->rt_avg /= 2;
687 }
688}
689
690#endif
691
692#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
693 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
694
695
696
697
698
699
700int walk_tg_tree_from(struct task_group *from,
701 tg_visitor down, tg_visitor up, void *data)
702{
703 struct task_group *parent, *child;
704 int ret;
705
706 parent = from;
707
708down:
709 ret = (*down)(parent, data);
710 if (ret)
711 goto out;
712 list_for_each_entry_rcu(child, &parent->children, siblings) {
713 parent = child;
714 goto down;
715
716up:
717 continue;
718 }
719 ret = (*up)(parent, data);
720 if (ret || parent == from)
721 goto out;
722
723 child = parent;
724 parent = parent->parent;
725 if (parent)
726 goto up;
727out:
728 return ret;
729}
730
731int tg_nop(struct task_group *tg, void *data)
732{
733 return 0;
734}
735#endif
736
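/*
 * set_load_weight() - derive the CFS load weight from the task's static
 * priority.  SCHED_IDLE tasks get the minimal WEIGHT_IDLEPRIO weight;
 * everything else comes from the sched_prio_to_weight[] tables.
 */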
737static void set_load_weight(struct task_struct *p)
738{
739 int prio = p->static_prio - MAX_RT_PRIO;
740 struct load_weight *load = &p->se.load;
741
742
743
744
745 if (idle_policy(p->policy)) {
746 load->weight = scale_load(WEIGHT_IDLEPRIO);
747 load->inv_weight = WMULT_IDLEPRIO;
748 return;
749 }
750
751 load->weight = scale_load(sched_prio_to_weight[prio]);
752 load->inv_weight = sched_prio_to_wmult[prio];
753}
754
755static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
756{
757 update_rq_clock(rq);
758 if (!(flags & ENQUEUE_RESTORE))
759 sched_info_queued(rq, p);
760 p->sched_class->enqueue_task(rq, p, flags);
761}
762
763static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
764{
765 update_rq_clock(rq);
766 if (!(flags & DEQUEUE_SAVE))
767 sched_info_dequeued(rq, p);
768 p->sched_class->dequeue_task(rq, p, flags);
769}
770
771void activate_task(struct rq *rq, struct task_struct *p, int flags)
772{
773 if (task_contributes_to_load(p))
774 rq->nr_uninterruptible--;
775
776 enqueue_task(rq, p, flags);
777}
778
779void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
780{
781 if (task_contributes_to_load(p))
782 rq->nr_uninterruptible++;
783
784 dequeue_task(rq, p, flags);
785}
786
787static void update_rq_clock_task(struct rq *rq, s64 delta)
788{
789
790
791
792
793#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
794 s64 steal = 0, irq_delta = 0;
795#endif
796#ifdef CONFIG_IRQ_TIME_ACCOUNTING
797 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814 if (irq_delta > delta)
815 irq_delta = delta;
816
817 rq->prev_irq_time += irq_delta;
818 delta -= irq_delta;
819#endif
820#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
        if (static_key_false((&paravirt_steal_rq_enabled))) {
822 steal = paravirt_steal_clock(cpu_of(rq));
823 steal -= rq->prev_steal_time_rq;
824
825 if (unlikely(steal > delta))
826 steal = delta;
827
828 rq->prev_steal_time_rq += steal;
829 delta -= steal;
830 }
831#endif
832
833 rq->clock_task += delta;
834
835#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
836 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
837 sched_rt_avg_update(rq, irq_delta + steal);
838#endif
839}
840
841void sched_set_stop_task(int cpu, struct task_struct *stop)
842{
843 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
844 struct task_struct *old_stop = cpu_rq(cpu)->stop;
845
846 if (stop) {
847
848
849
850
851
852
853
854
        sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
856
857 stop->sched_class = &stop_sched_class;
858 }
859
860 cpu_rq(cpu)->stop = stop;
861
862 if (old_stop) {
863
864
865
866
867 old_stop->sched_class = &rt_sched_class;
868 }
869}
870
871
872
873
874static inline int __normal_prio(struct task_struct *p)
875{
876 return p->static_prio;
877}
878
879
880
881
882
883
884
885
886static inline int normal_prio(struct task_struct *p)
887{
888 int prio;
889
890 if (task_has_dl_policy(p))
891 prio = MAX_DL_PRIO-1;
892 else if (task_has_rt_policy(p))
893 prio = MAX_RT_PRIO-1 - p->rt_priority;
894 else
895 prio = __normal_prio(p);
896 return prio;
897}
898
899
900
901
902
903
904
905
906static int effective_prio(struct task_struct *p)
907{
908 p->normal_prio = normal_prio(p);
909
910
911
912
913
914 if (!rt_prio(p->prio))
915 return p->normal_prio;
916 return p->prio;
917}
918
919
920
921
922
923
924
925inline int task_curr(const struct task_struct *p)
926{
927 return cpu_curr(task_cpu(p)) == p;
928}
929
930
931
932
933
934
935
936
937static inline void check_class_changed(struct rq *rq, struct task_struct *p,
938 const struct sched_class *prev_class,
939 int oldprio)
940{
941 if (prev_class != p->sched_class) {
942 if (prev_class->switched_from)
943 prev_class->switched_from(rq, p);
944
945 p->sched_class->switched_to(rq, p);
946 } else if (oldprio != p->prio || dl_task(p))
947 p->sched_class->prio_changed(rq, p, oldprio);
948}
949
950void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
951{
952 const struct sched_class *class;
953
954 if (p->sched_class == rq->curr->sched_class) {
955 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
956 } else {
957 for_each_class(class) {
958 if (class == rq->curr->sched_class)
959 break;
960 if (class == p->sched_class) {
961 resched_curr(rq);
962 break;
963 }
964 }
965 }
966
967
968
969
970
971 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
972 rq_clock_skip_update(rq, true);
973}
974
975#ifdef CONFIG_SMP
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
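/*
 * move_queued_task() - move a queued task to @new_cpu: dequeue it, mark it
 * migrating, switch runqueues and re-enqueue it.  Called with rq->lock
 * held; returns the new runqueue, locked.
 */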
995static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
996{
997 lockdep_assert_held(&rq->lock);
998
999 p->on_rq = TASK_ON_RQ_MIGRATING;
1000 dequeue_task(rq, p, 0);
1001 set_task_cpu(p, new_cpu);
1002 raw_spin_unlock(&rq->lock);
1003
1004 rq = cpu_rq(new_cpu);
1005
1006 raw_spin_lock(&rq->lock);
1007 BUG_ON(task_cpu(p) != new_cpu);
1008 enqueue_task(rq, p, 0);
1009 p->on_rq = TASK_ON_RQ_QUEUED;
1010 check_preempt_curr(rq, p, 0);
1011
1012 return rq;
1013}
1014
1015struct migration_arg {
1016 struct task_struct *task;
1017 int dest_cpu;
1018};
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
1030{
1031 if (unlikely(!cpu_active(dest_cpu)))
1032 return rq;
1033
1034
1035 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1036 return rq;
1037
1038 rq = move_queued_task(rq, p, dest_cpu);
1039
1040 return rq;
1041}
1042
1043
1044
1045
1046
1047
1048static int migration_cpu_stop(void *data)
1049{
1050 struct migration_arg *arg = data;
1051 struct task_struct *p = arg->task;
1052 struct rq *rq = this_rq();
1053
1054
1055
1056
1057
1058 local_irq_disable();
1059
1060
1061
1062
1063
1064 sched_ttwu_pending();
1065
1066 raw_spin_lock(&p->pi_lock);
1067 raw_spin_lock(&rq->lock);
1068
1069
1070
1071
1072
1073 if (task_rq(p) == rq) {
1074 if (task_on_rq_queued(p))
1075 rq = __migrate_task(rq, p, arg->dest_cpu);
1076 else
1077 p->wake_cpu = arg->dest_cpu;
1078 }
1079 raw_spin_unlock(&rq->lock);
1080 raw_spin_unlock(&p->pi_lock);
1081
1082 local_irq_enable();
1083 return 0;
1084}
1085
1086
1087
1088
1089
1090void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1091{
1092 cpumask_copy(&p->cpus_allowed, new_mask);
1093 p->nr_cpus_allowed = cpumask_weight(new_mask);
1094}
1095
1096void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1097{
1098 struct rq *rq = task_rq(p);
1099 bool queued, running;
1100
1101 lockdep_assert_held(&p->pi_lock);
1102
1103 queued = task_on_rq_queued(p);
1104 running = task_current(rq, p);
1105
1106 if (queued) {
1107
1108
1109
1110
1111 lockdep_assert_held(&rq->lock);
1112 dequeue_task(rq, p, DEQUEUE_SAVE);
1113 }
1114 if (running)
1115 put_prev_task(rq, p);
1116
1117 p->sched_class->set_cpus_allowed(p, new_mask);
1118
1119 if (queued)
1120 enqueue_task(rq, p, ENQUEUE_RESTORE);
1121 if (running)
1122 set_curr_task(rq, p);
1123}
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134static int __set_cpus_allowed_ptr(struct task_struct *p,
1135 const struct cpumask *new_mask, bool check)
1136{
1137 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1138 unsigned int dest_cpu;
1139 struct rq_flags rf;
1140 struct rq *rq;
1141 int ret = 0;
1142
1143 rq = task_rq_lock(p, &rf);
1144
1145 if (p->flags & PF_KTHREAD) {
1146
1147
1148
1149 cpu_valid_mask = cpu_online_mask;
1150 }
1151
1152
1153
1154
1155
1156 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1157 ret = -EINVAL;
1158 goto out;
1159 }
1160
1161 if (cpumask_equal(&p->cpus_allowed, new_mask))
1162 goto out;
1163
1164 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
1165 ret = -EINVAL;
1166 goto out;
1167 }
1168
1169 do_set_cpus_allowed(p, new_mask);
1170
1171 if (p->flags & PF_KTHREAD) {
1172
1173
1174
1175
1176 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1177 !cpumask_intersects(new_mask, cpu_active_mask) &&
1178 p->nr_cpus_allowed != 1);
1179 }
1180
1181
1182 if (cpumask_test_cpu(task_cpu(p), new_mask))
1183 goto out;
1184
1185 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1186 if (task_running(rq, p) || p->state == TASK_WAKING) {
1187 struct migration_arg arg = { p, dest_cpu };
1188
1189 task_rq_unlock(rq, p, &rf);
1190 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1191 tlb_migrate_finish(p->mm);
1192 return 0;
1193 } else if (task_on_rq_queued(p)) {
1194
1195
1196
1197
1198 lockdep_unpin_lock(&rq->lock, rf.cookie);
1199 rq = move_queued_task(rq, p, dest_cpu);
1200 lockdep_repin_lock(&rq->lock, rf.cookie);
1201 }
1202out:
1203 task_rq_unlock(rq, p, &rf);
1204
1205 return ret;
1206}
1207
1208int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1209{
1210 return __set_cpus_allowed_ptr(p, new_mask, false);
1211}
1212EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1213
1214void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1215{
1216#ifdef CONFIG_SCHED_DEBUG
1217
1218
1219
1220
1221 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1222 !p->on_rq);
1223
1224
1225
1226
1227
1228
1229 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1230 p->sched_class == &fair_sched_class &&
1231 (p->on_rq && !task_on_rq_migrating(p)));
1232
1233#ifdef CONFIG_LOCKDEP
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1245 lockdep_is_held(&task_rq(p)->lock)));
1246#endif
1247#endif
1248
1249 trace_sched_migrate_task(p, new_cpu);
1250
1251 if (task_cpu(p) != new_cpu) {
1252 if (p->sched_class->migrate_task_rq)
1253 p->sched_class->migrate_task_rq(p);
1254 p->se.nr_migrations++;
1255 perf_event_task_migrate(p);
1256 }
1257
1258 __set_task_cpu(p, new_cpu);
1259}
1260
1261static void __migrate_swap_task(struct task_struct *p, int cpu)
1262{
1263 if (task_on_rq_queued(p)) {
1264 struct rq *src_rq, *dst_rq;
1265
1266 src_rq = task_rq(p);
1267 dst_rq = cpu_rq(cpu);
1268
1269 p->on_rq = TASK_ON_RQ_MIGRATING;
1270 deactivate_task(src_rq, p, 0);
1271 set_task_cpu(p, cpu);
1272 activate_task(dst_rq, p, 0);
1273 p->on_rq = TASK_ON_RQ_QUEUED;
1274 check_preempt_curr(dst_rq, p, 0);
1275 } else {
1276
1277
1278
1279
1280
1281 p->wake_cpu = cpu;
1282 }
1283}
1284
1285struct migration_swap_arg {
1286 struct task_struct *src_task, *dst_task;
1287 int src_cpu, dst_cpu;
1288};
1289
1290static int migrate_swap_stop(void *data)
1291{
1292 struct migration_swap_arg *arg = data;
1293 struct rq *src_rq, *dst_rq;
1294 int ret = -EAGAIN;
1295
1296 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1297 return -EAGAIN;
1298
1299 src_rq = cpu_rq(arg->src_cpu);
1300 dst_rq = cpu_rq(arg->dst_cpu);
1301
1302 double_raw_lock(&arg->src_task->pi_lock,
1303 &arg->dst_task->pi_lock);
1304 double_rq_lock(src_rq, dst_rq);
1305
1306 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1307 goto unlock;
1308
1309 if (task_cpu(arg->src_task) != arg->src_cpu)
1310 goto unlock;
1311
1312 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1313 goto unlock;
1314
1315 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1316 goto unlock;
1317
1318 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1319 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1320
1321 ret = 0;
1322
1323unlock:
1324 double_rq_unlock(src_rq, dst_rq);
1325 raw_spin_unlock(&arg->dst_task->pi_lock);
1326 raw_spin_unlock(&arg->src_task->pi_lock);
1327
1328 return ret;
1329}
1330
1331
1332
1333
1334int migrate_swap(struct task_struct *cur, struct task_struct *p)
1335{
1336 struct migration_swap_arg arg;
1337 int ret = -EINVAL;
1338
1339 arg = (struct migration_swap_arg){
1340 .src_task = cur,
1341 .src_cpu = task_cpu(cur),
1342 .dst_task = p,
1343 .dst_cpu = task_cpu(p),
1344 };
1345
1346 if (arg.src_cpu == arg.dst_cpu)
1347 goto out;
1348
1349
1350
1351
1352
1353 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1354 goto out;
1355
1356 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1357 goto out;
1358
1359 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1360 goto out;
1361
1362 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1363 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1364
1365out:
1366 return ret;
1367}
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
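/*
 * wait_task_inactive() - wait until @p is no longer running on any CPU.
 * Returns the task's nvcsw count or'ed with LONG_MIN (never 0) once it has
 * gone inactive, or 0 if @match_state was given and the task changed state
 * before descheduling.
 */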
1385unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1386{
1387 int running, queued;
1388 struct rq_flags rf;
1389 unsigned long ncsw;
1390 struct rq *rq;
1391
1392 for (;;) {
1393
1394
1395
1396
1397
1398
1399 rq = task_rq(p);
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412 while (task_running(rq, p)) {
1413 if (match_state && unlikely(p->state != match_state))
1414 return 0;
1415 cpu_relax();
1416 }
1417
1418
1419
1420
1421
1422
1423 rq = task_rq_lock(p, &rf);
1424 trace_sched_wait_task(p);
1425 running = task_running(rq, p);
1426 queued = task_on_rq_queued(p);
1427 ncsw = 0;
1428 if (!match_state || p->state == match_state)
1429 ncsw = p->nvcsw | LONG_MIN;
1430 task_rq_unlock(rq, p, &rf);
1431
1432
1433
1434
1435 if (unlikely(!ncsw))
1436 break;
1437
1438
1439
1440
1441
1442
1443
1444 if (unlikely(running)) {
1445 cpu_relax();
1446 continue;
1447 }
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458 if (unlikely(queued)) {
1459 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1460
1461 set_current_state(TASK_UNINTERRUPTIBLE);
1462 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1463 continue;
1464 }
1465
1466
1467
1468
1469
1470
1471 break;
1472 }
1473
1474 return ncsw;
1475}
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490void kick_process(struct task_struct *p)
1491{
1492 int cpu;
1493
1494 preempt_disable();
1495 cpu = task_cpu(p);
1496 if ((cpu != smp_processor_id()) && task_curr(p))
1497 smp_send_reschedule(cpu);
1498 preempt_enable();
1499}
1500EXPORT_SYMBOL_GPL(kick_process);
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
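/*
 * select_fallback_rq() - pick a usable CPU when the chosen one is offline,
 * inactive or outside the task's affinity mask: try the same NUMA node
 * first, then any allowed CPU, then the cpuset fallback mask and, as a
 * last resort, cpu_possible_mask.
 */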
1524static int select_fallback_rq(int cpu, struct task_struct *p)
1525{
1526 int nid = cpu_to_node(cpu);
1527 const struct cpumask *nodemask = NULL;
1528 enum { cpuset, possible, fail } state = cpuset;
1529 int dest_cpu;
1530
1531
1532
1533
1534
1535
1536 if (nid != -1) {
1537 nodemask = cpumask_of_node(nid);
1538
1539
1540 for_each_cpu(dest_cpu, nodemask) {
1541 if (!cpu_active(dest_cpu))
1542 continue;
1543 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1544 return dest_cpu;
1545 }
1546 }
1547
1548 for (;;) {
1549
1550 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1551 if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1552 continue;
1553 if (!cpu_online(dest_cpu))
1554 continue;
1555 goto out;
1556 }
1557
1558
1559 switch (state) {
1560 case cpuset:
1561 if (IS_ENABLED(CONFIG_CPUSETS)) {
1562 cpuset_cpus_allowed_fallback(p);
1563 state = possible;
1564 break;
1565 }
1566
1567 case possible:
1568 do_set_cpus_allowed(p, cpu_possible_mask);
1569 state = fail;
1570 break;
1571
1572 case fail:
1573 BUG();
1574 break;
1575 }
1576 }
1577
1578out:
1579 if (state != cpuset) {
1580
1581
1582
1583
1584
1585 if (p->mm && printk_ratelimit()) {
1586 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1587 task_pid_nr(p), p->comm, cpu);
1588 }
1589 }
1590
1591 return dest_cpu;
1592}
1593
1594
1595
1596
1597static inline
1598int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1599{
1600 lockdep_assert_held(&p->pi_lock);
1601
1602 if (tsk_nr_cpus_allowed(p) > 1)
1603 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1604 else
1605 cpu = cpumask_any(tsk_cpus_allowed(p));
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1618 !cpu_online(cpu)))
1619 cpu = select_fallback_rq(task_cpu(p), p);
1620
1621 return cpu;
1622}
1623
1624static void update_avg(u64 *avg, u64 sample)
1625{
1626 s64 diff = sample - *avg;
1627 *avg += diff >> 3;
1628}
1629
1630#else
1631
1632static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1633 const struct cpumask *new_mask, bool check)
1634{
1635 return set_cpus_allowed_ptr(p, new_mask);
1636}
1637
1638#endif
1639
1640static void
1641ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1642{
1643 struct rq *rq;
1644
1645 if (!schedstat_enabled())
1646 return;
1647
1648 rq = this_rq();
1649
1650#ifdef CONFIG_SMP
1651 if (cpu == rq->cpu) {
1652 schedstat_inc(rq->ttwu_local);
1653 schedstat_inc(p->se.statistics.nr_wakeups_local);
1654 } else {
1655 struct sched_domain *sd;
1656
1657 schedstat_inc(p->se.statistics.nr_wakeups_remote);
1658 rcu_read_lock();
1659 for_each_domain(rq->cpu, sd) {
1660 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1661 schedstat_inc(sd->ttwu_wake_remote);
1662 break;
1663 }
1664 }
1665 rcu_read_unlock();
1666 }
1667
1668 if (wake_flags & WF_MIGRATED)
1669 schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1670#endif
1671
1672 schedstat_inc(rq->ttwu_count);
1673 schedstat_inc(p->se.statistics.nr_wakeups);
1674
1675 if (wake_flags & WF_SYNC)
1676 schedstat_inc(p->se.statistics.nr_wakeups_sync);
1677}
1678
1679static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1680{
1681 activate_task(rq, p, en_flags);
1682 p->on_rq = TASK_ON_RQ_QUEUED;
1683
1684
1685 if (p->flags & PF_WQ_WORKER)
1686 wq_worker_waking_up(p, cpu_of(rq));
1687}
1688
1689
1690
1691
1692static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
1693 struct pin_cookie cookie)
1694{
1695 check_preempt_curr(rq, p, wake_flags);
1696 p->state = TASK_RUNNING;
1697 trace_sched_wakeup(p);
1698
1699#ifdef CONFIG_SMP
1700 if (p->sched_class->task_woken) {
1701
1702
1703
1704
1705 lockdep_unpin_lock(&rq->lock, cookie);
1706 p->sched_class->task_woken(rq, p);
1707 lockdep_repin_lock(&rq->lock, cookie);
1708 }
1709
1710 if (rq->idle_stamp) {
1711 u64 delta = rq_clock(rq) - rq->idle_stamp;
1712 u64 max = 2*rq->max_idle_balance_cost;
1713
1714 update_avg(&rq->avg_idle, delta);
1715
1716 if (rq->avg_idle > max)
1717 rq->avg_idle = max;
1718
1719 rq->idle_stamp = 0;
1720 }
1721#endif
1722}
1723
1724static void
1725ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1726 struct pin_cookie cookie)
1727{
1728 int en_flags = ENQUEUE_WAKEUP;
1729
1730 lockdep_assert_held(&rq->lock);
1731
1732#ifdef CONFIG_SMP
1733 if (p->sched_contributes_to_load)
1734 rq->nr_uninterruptible--;
1735
1736 if (wake_flags & WF_MIGRATED)
1737 en_flags |= ENQUEUE_MIGRATED;
1738#endif
1739
1740 ttwu_activate(rq, p, en_flags);
1741 ttwu_do_wakeup(rq, p, wake_flags, cookie);
1742}
1743
1744
1745
1746
1747
1748
1749
1750static int ttwu_remote(struct task_struct *p, int wake_flags)
1751{
1752 struct rq_flags rf;
1753 struct rq *rq;
1754 int ret = 0;
1755
1756 rq = __task_rq_lock(p, &rf);
1757 if (task_on_rq_queued(p)) {
1758
1759 update_rq_clock(rq);
1760 ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
1761 ret = 1;
1762 }
1763 __task_rq_unlock(rq, &rf);
1764
1765 return ret;
1766}
1767
1768#ifdef CONFIG_SMP
1769void sched_ttwu_pending(void)
1770{
1771 struct rq *rq = this_rq();
1772 struct llist_node *llist = llist_del_all(&rq->wake_list);
1773 struct pin_cookie cookie;
1774 struct task_struct *p;
1775 unsigned long flags;
1776
1777 if (!llist)
1778 return;
1779
1780 raw_spin_lock_irqsave(&rq->lock, flags);
1781 cookie = lockdep_pin_lock(&rq->lock);
1782
1783 while (llist) {
1784 int wake_flags = 0;
1785
1786 p = llist_entry(llist, struct task_struct, wake_entry);
1787 llist = llist_next(llist);
1788
1789 if (p->sched_remote_wakeup)
1790 wake_flags = WF_MIGRATED;
1791
1792 ttwu_do_activate(rq, p, wake_flags, cookie);
1793 }
1794
1795 lockdep_unpin_lock(&rq->lock, cookie);
1796 raw_spin_unlock_irqrestore(&rq->lock, flags);
1797}
1798
1799void scheduler_ipi(void)
1800{
1801
1802
1803
1804
1805
1806 preempt_fold_need_resched();
1807
1808 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1809 return;
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824 irq_enter();
1825 sched_ttwu_pending();
1826
1827
1828
1829
1830 if (unlikely(got_nohz_idle_kick())) {
1831 this_rq()->idle_balance = 1;
1832 raise_softirq_irqoff(SCHED_SOFTIRQ);
1833 }
1834 irq_exit();
1835}
1836
1837static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1838{
1839 struct rq *rq = cpu_rq(cpu);
1840
1841 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1842
1843 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1844 if (!set_nr_if_polling(rq->idle))
1845 smp_send_reschedule(cpu);
1846 else
1847 trace_sched_wake_idle_without_ipi(cpu);
1848 }
1849}
1850
1851void wake_up_if_idle(int cpu)
1852{
1853 struct rq *rq = cpu_rq(cpu);
1854 unsigned long flags;
1855
1856 rcu_read_lock();
1857
1858 if (!is_idle_task(rcu_dereference(rq->curr)))
1859 goto out;
1860
1861 if (set_nr_if_polling(rq->idle)) {
1862 trace_sched_wake_idle_without_ipi(cpu);
1863 } else {
1864 raw_spin_lock_irqsave(&rq->lock, flags);
1865 if (is_idle_task(rq->curr))
1866 smp_send_reschedule(cpu);
1867
1868 raw_spin_unlock_irqrestore(&rq->lock, flags);
1869 }
1870
1871out:
1872 rcu_read_unlock();
1873}
1874
1875bool cpus_share_cache(int this_cpu, int that_cpu)
1876{
1877 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1878}
1879#endif
1880
1881static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1882{
1883 struct rq *rq = cpu_rq(cpu);
1884 struct pin_cookie cookie;
1885
1886#if defined(CONFIG_SMP)
1887 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1888 sched_clock_cpu(cpu);
1889 ttwu_queue_remote(p, cpu, wake_flags);
1890 return;
1891 }
1892#endif
1893
1894 raw_spin_lock(&rq->lock);
1895 cookie = lockdep_pin_lock(&rq->lock);
1896 ttwu_do_activate(rq, p, wake_flags, cookie);
1897 lockdep_unpin_lock(&rq->lock, cookie);
1898 raw_spin_unlock(&rq->lock);
1899}
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
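/*
 * try_to_wake_up() - wake @p if it is in one of the @state bits.  Returns
 * 1 if a wakeup was issued (including the fast path for a task that is
 * still on a runqueue), 0 if the task was not in the requested state.
 * Waits for any concurrent schedule-out to finish and may migrate the task
 * to the CPU chosen by select_task_rq() before queueing it.
 */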
2007static int
2008try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2009{
2010 unsigned long flags;
2011 int cpu, success = 0;
2012
2013
2014
2015
2016
2017
2018
2019 smp_mb__before_spinlock();
2020 raw_spin_lock_irqsave(&p->pi_lock, flags);
2021 if (!(p->state & state))
2022 goto out;
2023
2024 trace_sched_waking(p);
2025
2026 success = 1;
2027 cpu = task_cpu(p);
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050 smp_rmb();
2051 if (p->on_rq && ttwu_remote(p, wake_flags))
2052 goto stat;
2053
2054#ifdef CONFIG_SMP
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072 smp_rmb();
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083 smp_cond_load_acquire(&p->on_cpu, !VAL);
2084
2085 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2086 p->state = TASK_WAKING;
2087
2088 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2089 if (task_cpu(p) != cpu) {
2090 wake_flags |= WF_MIGRATED;
2091 set_task_cpu(p, cpu);
2092 }
2093#endif
2094
2095 ttwu_queue(p, cpu, wake_flags);
2096stat:
2097 ttwu_stat(p, cpu, wake_flags);
2098out:
2099 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2100
2101 return success;
2102}
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
2114{
2115 struct rq *rq = task_rq(p);
2116
2117 if (WARN_ON_ONCE(rq != this_rq()) ||
2118 WARN_ON_ONCE(p == current))
2119 return;
2120
2121 lockdep_assert_held(&rq->lock);
2122
2123 if (!raw_spin_trylock(&p->pi_lock)) {
2124
2125
2126
2127
2128
2129
2130 lockdep_unpin_lock(&rq->lock, cookie);
2131 raw_spin_unlock(&rq->lock);
2132 raw_spin_lock(&p->pi_lock);
2133 raw_spin_lock(&rq->lock);
2134 lockdep_repin_lock(&rq->lock, cookie);
2135 }
2136
2137 if (!(p->state & TASK_NORMAL))
2138 goto out;
2139
2140 trace_sched_waking(p);
2141
2142 if (!task_on_rq_queued(p))
2143 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2144
2145 ttwu_do_wakeup(rq, p, 0, cookie);
2146 ttwu_stat(p, smp_processor_id(), 0);
2147out:
2148 raw_spin_unlock(&p->pi_lock);
2149}
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163int wake_up_process(struct task_struct *p)
2164{
2165 return try_to_wake_up(p, TASK_NORMAL, 0);
2166}
2167EXPORT_SYMBOL(wake_up_process);
2168
2169int wake_up_state(struct task_struct *p, unsigned int state)
2170{
2171 return try_to_wake_up(p, state, 0);
2172}
2173
2174
2175
2176
2177void __dl_clear_params(struct task_struct *p)
2178{
2179 struct sched_dl_entity *dl_se = &p->dl;
2180
2181 dl_se->dl_runtime = 0;
2182 dl_se->dl_deadline = 0;
2183 dl_se->dl_period = 0;
2184 dl_se->flags = 0;
2185 dl_se->dl_bw = 0;
2186
2187 dl_se->dl_throttled = 0;
2188 dl_se->dl_yielded = 0;
2189}
2190
2191
2192
2193
2194
2195
2196
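/*
 * __sched_fork() - initialize the scheduler-related fields of a newly
 * forked task: the fair, deadline and RT entities, preempt notifiers and,
 * when enabled, the NUMA balancing state.
 */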
2197static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2198{
2199 p->on_rq = 0;
2200
2201 p->se.on_rq = 0;
2202 p->se.exec_start = 0;
2203 p->se.sum_exec_runtime = 0;
2204 p->se.prev_sum_exec_runtime = 0;
2205 p->se.nr_migrations = 0;
2206 p->se.vruntime = 0;
2207 INIT_LIST_HEAD(&p->se.group_node);
2208
2209#ifdef CONFIG_FAIR_GROUP_SCHED
2210 p->se.cfs_rq = NULL;
2211#endif
2212
2213#ifdef CONFIG_SCHEDSTATS
2214
2215 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2216#endif
2217
2218 RB_CLEAR_NODE(&p->dl.rb_node);
2219 init_dl_task_timer(&p->dl);
2220 __dl_clear_params(p);
2221
2222 INIT_LIST_HEAD(&p->rt.run_list);
2223 p->rt.timeout = 0;
2224 p->rt.time_slice = sched_rr_timeslice;
2225 p->rt.on_rq = 0;
2226 p->rt.on_list = 0;
2227
2228#ifdef CONFIG_PREEMPT_NOTIFIERS
2229 INIT_HLIST_HEAD(&p->preempt_notifiers);
2230#endif
2231
2232#ifdef CONFIG_NUMA_BALANCING
2233 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
2234 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2235 p->mm->numa_scan_seq = 0;
2236 }
2237
2238 if (clone_flags & CLONE_VM)
2239 p->numa_preferred_nid = current->numa_preferred_nid;
2240 else
2241 p->numa_preferred_nid = -1;
2242
2243 p->node_stamp = 0ULL;
2244 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
2245 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2246 p->numa_work.next = &p->numa_work;
2247 p->numa_faults = NULL;
2248 p->last_task_numa_placement = 0;
2249 p->last_sum_exec_runtime = 0;
2250
2251 p->numa_group = NULL;
2252#endif
2253}
2254
2255DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2256
2257#ifdef CONFIG_NUMA_BALANCING
2258
2259void set_numabalancing_state(bool enabled)
2260{
2261 if (enabled)
2262 static_branch_enable(&sched_numa_balancing);
2263 else
2264 static_branch_disable(&sched_numa_balancing);
2265}
2266
2267#ifdef CONFIG_PROC_SYSCTL
2268int sysctl_numa_balancing(struct ctl_table *table, int write,
2269 void __user *buffer, size_t *lenp, loff_t *ppos)
2270{
2271 struct ctl_table t;
2272 int err;
2273 int state = static_branch_likely(&sched_numa_balancing);
2274
2275 if (write && !capable(CAP_SYS_ADMIN))
2276 return -EPERM;
2277
2278 t = *table;
2279 t.data = &state;
2280 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2281 if (err < 0)
2282 return err;
2283 if (write)
2284 set_numabalancing_state(state);
2285 return err;
2286}
2287#endif
2288#endif
2289
2290#ifdef CONFIG_SCHEDSTATS
2291
2292DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2293static bool __initdata __sched_schedstats = false;
2294
2295static void set_schedstats(bool enabled)
2296{
2297 if (enabled)
2298 static_branch_enable(&sched_schedstats);
2299 else
2300 static_branch_disable(&sched_schedstats);
2301}
2302
2303void force_schedstat_enabled(void)
2304{
2305 if (!schedstat_enabled()) {
2306 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2307 static_branch_enable(&sched_schedstats);
2308 }
2309}
2310
2311static int __init setup_schedstats(char *str)
2312{
2313 int ret = 0;
2314 if (!str)
2315 goto out;
2316
2317
2318
2319
2320
2321
2322 if (!strcmp(str, "enable")) {
2323 __sched_schedstats = true;
2324 ret = 1;
2325 } else if (!strcmp(str, "disable")) {
2326 __sched_schedstats = false;
2327 ret = 1;
2328 }
2329out:
2330 if (!ret)
2331 pr_warn("Unable to parse schedstats=\n");
2332
2333 return ret;
2334}
2335__setup("schedstats=", setup_schedstats);
2336
2337static void __init init_schedstats(void)
2338{
2339 set_schedstats(__sched_schedstats);
2340}
2341
2342#ifdef CONFIG_PROC_SYSCTL
2343int sysctl_schedstats(struct ctl_table *table, int write,
2344 void __user *buffer, size_t *lenp, loff_t *ppos)
2345{
2346 struct ctl_table t;
2347 int err;
2348 int state = static_branch_likely(&sched_schedstats);
2349
2350 if (write && !capable(CAP_SYS_ADMIN))
2351 return -EPERM;
2352
2353 t = *table;
2354 t.data = &state;
2355 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2356 if (err < 0)
2357 return err;
2358 if (write)
2359 set_schedstats(state);
2360 return err;
2361}
2362#endif
2363#else
2364static inline void init_schedstats(void) {}
2365#endif
2366
2367
2368
2369
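/*
 * sched_fork() - fork-time scheduler setup: the child starts in TASK_NEW,
 * priority and policy are reset when sched_reset_on_fork is set, the
 * scheduling class is chosen, and the task is placed on the current CPU
 * until wake_up_new_task() picks its final runqueue.  Forking a deadline
 * task is refused with -EAGAIN.
 */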
2370int sched_fork(unsigned long clone_flags, struct task_struct *p)
2371{
2372 unsigned long flags;
2373 int cpu = get_cpu();
2374
2375 __sched_fork(clone_flags, p);
2376
2377
2378
2379
2380
2381 p->state = TASK_NEW;
2382
2383
2384
2385
2386 p->prio = current->normal_prio;
2387
2388
2389
2390
2391 if (unlikely(p->sched_reset_on_fork)) {
2392 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2393 p->policy = SCHED_NORMAL;
2394 p->static_prio = NICE_TO_PRIO(0);
2395 p->rt_priority = 0;
2396 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2397 p->static_prio = NICE_TO_PRIO(0);
2398
2399 p->prio = p->normal_prio = __normal_prio(p);
2400 set_load_weight(p);
2401
2402
2403
2404
2405
2406 p->sched_reset_on_fork = 0;
2407 }
2408
2409 if (dl_prio(p->prio)) {
2410 put_cpu();
2411 return -EAGAIN;
2412 } else if (rt_prio(p->prio)) {
2413 p->sched_class = &rt_sched_class;
2414 } else {
2415 p->sched_class = &fair_sched_class;
2416 }
2417
2418 init_entity_runnable_average(&p->se);
2419
2420
2421
2422
2423
2424
2425
2426
2427 raw_spin_lock_irqsave(&p->pi_lock, flags);
2428
2429
2430
2431
2432 __set_task_cpu(p, cpu);
2433 if (p->sched_class->task_fork)
2434 p->sched_class->task_fork(p);
2435 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2436
2437#ifdef CONFIG_SCHED_INFO
2438 if (likely(sched_info_on()))
2439 memset(&p->sched_info, 0, sizeof(p->sched_info));
2440#endif
2441#if defined(CONFIG_SMP)
2442 p->on_cpu = 0;
2443#endif
2444 init_task_preempt_count(p);
2445#ifdef CONFIG_SMP
2446 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2447 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2448#endif
2449
2450 put_cpu();
2451 return 0;
2452}
2453
2454unsigned long to_ratio(u64 period, u64 runtime)
2455{
2456 if (runtime == RUNTIME_INF)
2457 return 1ULL << 20;
2458
2459
2460
2461
2462
2463
2464 if (period == 0)
2465 return 0;
2466
2467 return div64_u64(runtime << 20, period);
2468}
2469
2470#ifdef CONFIG_SMP
2471inline struct dl_bw *dl_bw_of(int i)
2472{
2473 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2474 "sched RCU must be held");
2475 return &cpu_rq(i)->rd->dl_bw;
2476}
2477
2478static inline int dl_bw_cpus(int i)
2479{
2480 struct root_domain *rd = cpu_rq(i)->rd;
2481 int cpus = 0;
2482
2483 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2484 "sched RCU must be held");
2485 for_each_cpu_and(i, rd->span, cpu_active_mask)
2486 cpus++;
2487
2488 return cpus;
2489}
2490#else
2491inline struct dl_bw *dl_bw_of(int i)
2492{
2493 return &cpu_rq(i)->dl.dl_bw;
2494}
2495
2496static inline int dl_bw_cpus(int i)
2497{
2498 return 1;
2499}
2500#endif
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513static int dl_overflow(struct task_struct *p, int policy,
2514 const struct sched_attr *attr)
2515{
2516
2517 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2518 u64 period = attr->sched_period ?: attr->sched_deadline;
2519 u64 runtime = attr->sched_runtime;
2520 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2521 int cpus, err = -1;
2522
2523
2524 if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
2525 return 0;
2526
2527
2528
2529
2530
2531
2532 raw_spin_lock(&dl_b->lock);
2533 cpus = dl_bw_cpus(task_cpu(p));
2534 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2535 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2536 __dl_add(dl_b, new_bw);
2537 err = 0;
2538 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2539 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2540 __dl_clear(dl_b, p->dl.dl_bw);
2541 __dl_add(dl_b, new_bw);
2542 err = 0;
2543 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2544 __dl_clear(dl_b, p->dl.dl_bw);
2545 err = 0;
2546 }
2547 raw_spin_unlock(&dl_b->lock);
2548
2549 return err;
2550}
2551
2552extern void init_dl_bw(struct dl_bw *dl_b);
2553
2554
2555
2556
2557
2558
2559
2560
2561void wake_up_new_task(struct task_struct *p)
2562{
2563 struct rq_flags rf;
2564 struct rq *rq;
2565
2566 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2567 p->state = TASK_RUNNING;
2568#ifdef CONFIG_SMP
2569
2570
2571
2572
2573
2574
2575
2576
2577 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2578#endif
2579 rq = __task_rq_lock(p, &rf);
2580 post_init_entity_util_avg(&p->se);
2581
2582 activate_task(rq, p, 0);
2583 p->on_rq = TASK_ON_RQ_QUEUED;
2584 trace_sched_wakeup_new(p);
2585 check_preempt_curr(rq, p, WF_FORK);
2586#ifdef CONFIG_SMP
2587 if (p->sched_class->task_woken) {
2588
2589
2590
2591
2592 lockdep_unpin_lock(&rq->lock, rf.cookie);
2593 p->sched_class->task_woken(rq, p);
2594 lockdep_repin_lock(&rq->lock, rf.cookie);
2595 }
2596#endif
2597 task_rq_unlock(rq, p, &rf);
2598}
2599
2600#ifdef CONFIG_PREEMPT_NOTIFIERS
2601
2602static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
2603
2604void preempt_notifier_inc(void)
2605{
2606 static_key_slow_inc(&preempt_notifier_key);
2607}
2608EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2609
2610void preempt_notifier_dec(void)
2611{
2612 static_key_slow_dec(&preempt_notifier_key);
2613}
2614EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2615
2616
2617
2618
2619
2620void preempt_notifier_register(struct preempt_notifier *notifier)
2621{
2622 if (!static_key_false(&preempt_notifier_key))
2623 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2624
        hlist_add_head(&notifier->link, &current->preempt_notifiers);
2626}
2627EXPORT_SYMBOL_GPL(preempt_notifier_register);
2628
2629
2630
2631
2632
2633
2634
2635void preempt_notifier_unregister(struct preempt_notifier *notifier)
2636{
        hlist_del(&notifier->link);
2638}
2639EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2640
2641static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2642{
2643 struct preempt_notifier *notifier;
2644
2645 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2646 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2647}
2648
2649static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2650{
2651 if (static_key_false(&preempt_notifier_key))
2652 __fire_sched_in_preempt_notifiers(curr);
2653}
2654
2655static void
2656__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2657 struct task_struct *next)
2658{
2659 struct preempt_notifier *notifier;
2660
2661 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2662 notifier->ops->sched_out(notifier, next);
2663}
2664
2665static __always_inline void
2666fire_sched_out_preempt_notifiers(struct task_struct *curr,
2667 struct task_struct *next)
2668{
2669 if (static_key_false(&preempt_notifier_key))
2670 __fire_sched_out_preempt_notifiers(curr, next);
2671}
2672
2673#else
2674
2675static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2676{
2677}
2678
2679static inline void
2680fire_sched_out_preempt_notifiers(struct task_struct *curr,
2681 struct task_struct *next)
2682{
2683}
2684
2685#endif
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700static inline void
2701prepare_task_switch(struct rq *rq, struct task_struct *prev,
2702 struct task_struct *next)
2703{
2704 sched_info_switch(rq, prev, next);
2705 perf_event_task_sched_out(prev, next);
2706 fire_sched_out_preempt_notifiers(prev, next);
2707 prepare_lock_switch(rq, next);
2708 prepare_arch_switch(next);
2709}
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
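/*
 * finish_task_switch() - second half of a context switch, run by the
 * incoming task.  Finishes the lock switch begun in prepare_task_switch(),
 * drops the borrowed mm reference and, if the previous task is TASK_DEAD,
 * its stack and task_struct references.  Returns this_rq().
 */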
2730static struct rq *finish_task_switch(struct task_struct *prev)
2731 __releases(rq->lock)
2732{
2733 struct rq *rq = this_rq();
2734 struct mm_struct *mm = rq->prev_mm;
2735 long prev_state;
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2749 "corrupted preempt_count: %s/%d/0x%x\n",
2750 current->comm, current->pid, preempt_count()))
2751 preempt_count_set(FORK_PREEMPT_COUNT);
2752
2753 rq->prev_mm = NULL;
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766 prev_state = prev->state;
2767 vtime_task_switch(prev);
2768 perf_event_task_sched_in(prev, current);
2769 finish_lock_switch(rq, prev);
2770 finish_arch_post_lock_switch();
2771
2772 fire_sched_in_preempt_notifiers(current);
2773 if (mm)
2774 mmdrop(mm);
2775 if (unlikely(prev_state == TASK_DEAD)) {
2776 if (prev->sched_class->task_dead)
2777 prev->sched_class->task_dead(prev);
2778
2779
2780
2781
2782
2783 kprobe_flush_task(prev);
2784
2785
2786 put_task_stack(prev);
2787
2788 put_task_struct(prev);
2789 }
2790
2791 tick_nohz_task_switch();
2792 return rq;
2793}
2794
2795#ifdef CONFIG_SMP
2796
2797
2798static void __balance_callback(struct rq *rq)
2799{
2800 struct callback_head *head, *next;
2801 void (*func)(struct rq *rq);
2802 unsigned long flags;
2803
2804 raw_spin_lock_irqsave(&rq->lock, flags);
2805 head = rq->balance_callback;
2806 rq->balance_callback = NULL;
2807 while (head) {
2808 func = (void (*)(struct rq *))head->func;
2809 next = head->next;
2810 head->next = NULL;
2811 head = next;
2812
2813 func(rq);
2814 }
2815 raw_spin_unlock_irqrestore(&rq->lock, flags);
2816}
2817
2818static inline void balance_callback(struct rq *rq)
2819{
2820 if (unlikely(rq->balance_callback))
2821 __balance_callback(rq);
2822}
2823
2824#else
2825
2826static inline void balance_callback(struct rq *rq)
2827{
2828}
2829
2830#endif
2831
2832
2833
2834
2835
2836asmlinkage __visible void schedule_tail(struct task_struct *prev)
2837 __releases(rq->lock)
2838{
2839 struct rq *rq;
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850 rq = finish_task_switch(prev);
2851 balance_callback(rq);
2852 preempt_enable();
2853
2854 if (current->set_child_tid)
2855 put_user(task_pid_vnr(current), current->set_child_tid);
2856}
2857
2858
2859
2860
2861static __always_inline struct rq *
2862context_switch(struct rq *rq, struct task_struct *prev,
2863 struct task_struct *next, struct pin_cookie cookie)
2864{
2865 struct mm_struct *mm, *oldmm;
2866
2867 prepare_task_switch(rq, prev, next);
2868
2869 mm = next->mm;
2870 oldmm = prev->active_mm;
2871
2872
2873
2874
2875
2876 arch_start_context_switch(prev);
2877
2878 if (!mm) {
2879 next->active_mm = oldmm;
2880 atomic_inc(&oldmm->mm_count);
2881 enter_lazy_tlb(oldmm, next);
2882 } else
2883 switch_mm_irqs_off(oldmm, mm, next);
2884
2885 if (!prev->mm) {
2886 prev->active_mm = NULL;
2887 rq->prev_mm = oldmm;
2888 }
2889
2890
2891
2892
2893
2894
2895 lockdep_unpin_lock(&rq->lock, cookie);
2896 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2897
2898
2899 switch_to(prev, next, prev);
2900 barrier();
2901
2902 return finish_task_switch(prev);
2903}
2904
2905
2906
2907
2908
2909
2910
2911unsigned long nr_running(void)
2912{
2913 unsigned long i, sum = 0;
2914
2915 for_each_online_cpu(i)
2916 sum += cpu_rq(i)->nr_running;
2917
2918 return sum;
2919}
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934bool single_task_running(void)
2935{
2936 return raw_rq()->nr_running == 1;
2937}
2938EXPORT_SYMBOL(single_task_running);
2939
2940unsigned long long nr_context_switches(void)
2941{
2942 int i;
2943 unsigned long long sum = 0;
2944
2945 for_each_possible_cpu(i)
2946 sum += cpu_rq(i)->nr_switches;
2947
2948 return sum;
2949}
2950
2951unsigned long nr_iowait(void)
2952{
2953 unsigned long i, sum = 0;
2954
2955 for_each_possible_cpu(i)
2956 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2957
2958 return sum;
2959}
2960
2961unsigned long nr_iowait_cpu(int cpu)
2962{
2963 struct rq *this = cpu_rq(cpu);
2964 return atomic_read(&this->nr_iowait);
2965}
2966
2967void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2968{
2969 struct rq *rq = this_rq();
2970 *nr_waiters = atomic_read(&rq->nr_iowait);
2971 *load = rq->load.weight;
2972}
2973
2974#ifdef CONFIG_SMP
2975
2976
2977
2978
2979
2980void sched_exec(void)
2981{
2982 struct task_struct *p = current;
2983 unsigned long flags;
2984 int dest_cpu;
2985
2986 raw_spin_lock_irqsave(&p->pi_lock, flags);
2987 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2988 if (dest_cpu == smp_processor_id())
2989 goto unlock;
2990
2991 if (likely(cpu_active(dest_cpu))) {
2992 struct migration_arg arg = { p, dest_cpu };
2993
2994 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2995 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2996 return;
2997 }
2998unlock:
2999 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3000}
3001
3002#endif
3003
3004DEFINE_PER_CPU(struct kernel_stat, kstat);
3005DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3006
3007EXPORT_PER_CPU_SYMBOL(kstat);
3008EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3009
3010
3011
3012
3013
3014
3015
3016static inline void prefetch_curr_exec_start(struct task_struct *p)
3017{
3018#ifdef CONFIG_FAIR_GROUP_SCHED
3019 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3020#else
3021 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3022#endif
3023 prefetch(curr);
3024 prefetch(&curr->exec_start);
3025}
3026
3027
3028
3029
3030
3031
3032unsigned long long task_sched_runtime(struct task_struct *p)
3033{
3034 struct rq_flags rf;
3035 struct rq *rq;
3036 u64 ns;
3037
3038#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050 if (!p->on_cpu || !task_on_rq_queued(p))
3051 return p->se.sum_exec_runtime;
3052#endif
3053
3054 rq = task_rq_lock(p, &rf);
3055
3056
3057
3058
3059
3060 if (task_current(rq, p) && task_on_rq_queued(p)) {
3061 prefetch_curr_exec_start(p);
3062 update_rq_clock(rq);
3063 p->sched_class->update_curr(rq);
3064 }
3065 ns = p->se.sum_exec_runtime;
3066 task_rq_unlock(rq, p, &rf);
3067
3068 return ns;
3069}
3070
3071
3072
3073
3074
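/*
 * scheduler_tick() - called from the timer interrupt with HZ frequency.
 * Updates the runqueue clock, runs the current class's task_tick(),
 * refreshes the CPU and global load averages and, on SMP, triggers load
 * balancing.
 */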
3075void scheduler_tick(void)
3076{
3077 int cpu = smp_processor_id();
3078 struct rq *rq = cpu_rq(cpu);
3079 struct task_struct *curr = rq->curr;
3080
3081 sched_clock_tick();
3082
3083 raw_spin_lock(&rq->lock);
3084 update_rq_clock(rq);
3085 curr->sched_class->task_tick(rq, curr, 0);
3086 cpu_load_update_active(rq);
3087 calc_global_load_tick(rq);
3088 raw_spin_unlock(&rq->lock);
3089
3090 perf_event_task_tick();
3091
3092#ifdef CONFIG_SMP
3093 rq->idle_balance = idle_cpu(cpu);
3094 trigger_load_balance(rq);
3095#endif
3096 rq_last_tick_reset(rq);
3097}
3098
3099#ifdef CONFIG_NO_HZ_FULL
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113u64 scheduler_tick_max_deferment(void)
3114{
3115 struct rq *rq = this_rq();
3116 unsigned long next, now = READ_ONCE(jiffies);
3117
3118 next = rq->last_sched_tick + HZ;
3119
3120 if (time_before_eq(next, now))
3121 return 0;
3122
3123 return jiffies_to_nsecs(next - now);
3124}
3125#endif
3126
3127#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3128 defined(CONFIG_PREEMPT_TRACER))
3129
3130
3131
3132
3133static inline void preempt_latency_start(int val)
3134{
3135 if (preempt_count() == val) {
3136 unsigned long ip = get_lock_parent_ip();
3137#ifdef CONFIG_DEBUG_PREEMPT
3138 current->preempt_disable_ip = ip;
3139#endif
3140 trace_preempt_off(CALLER_ADDR0, ip);
3141 }
3142}
3143
3144void preempt_count_add(int val)
3145{
3146#ifdef CONFIG_DEBUG_PREEMPT
3147 /*
3148 * Underflow?
3149 */
3150 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3151 return;
3152#endif
3153 __preempt_count_add(val);
3154#ifdef CONFIG_DEBUG_PREEMPT
3155 /*
3156 * Spinlock count overflowing soon?
3157 */
3158 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3159 PREEMPT_MASK - 10);
3160#endif
3161 preempt_latency_start(val);
3162}
3163EXPORT_SYMBOL(preempt_count_add);
3164NOKPROBE_SYMBOL(preempt_count_add);
3165
3166
3167
3168
3169
3170static inline void preempt_latency_stop(int val)
3171{
3172 if (preempt_count() == val)
3173 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3174}
3175
3176void preempt_count_sub(int val)
3177{
3178#ifdef CONFIG_DEBUG_PREEMPT
3179 /*
3180 * Underflow?
3181 */
3182 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3183 return;
3184 /*
3185 * Is the spinlock portion underflowing?
3186 */
3187 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3188 !(preempt_count() & PREEMPT_MASK)))
3189 return;
3190#endif
3191
3192 preempt_latency_stop(val);
3193 __preempt_count_sub(val);
3194}
3195EXPORT_SYMBOL(preempt_count_sub);
3196NOKPROBE_SYMBOL(preempt_count_sub);
3197
3198#else
3199static inline void preempt_latency_start(int val) { }
3200static inline void preempt_latency_stop(int val) { }
3201#endif
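
/*
 * For illustration: these counters are normally driven through the
 * preempt_disable()/preempt_enable() pair. A rough sketch of the usual
 * pattern ("struct foo" and "my_percpu_foo" are made-up names):
 *
 *	struct foo { int counter; };
 *	static DEFINE_PER_CPU(struct foo, my_percpu_foo);
 *
 *	preempt_disable();				// preempt_count_add(1)
 *	this_cpu_ptr(&my_percpu_foo)->counter++;	// no migration while atomic
 *	preempt_enable();				// preempt_count_sub(1), may resched
 */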
3202
3203/*
3204 * Print scheduling while atomic bug:
3205 */
3206static noinline void __schedule_bug(struct task_struct *prev)
3207{
3208
3209 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3210
3211 if (oops_in_progress)
3212 return;
3213
3214 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3215 prev->comm, prev->pid, preempt_count());
3216
3217 debug_show_held_locks(prev);
3218 print_modules();
3219 if (irqs_disabled())
3220 print_irqtrace_events(prev);
3221 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3222 && in_atomic_preempt_off()) {
3223 pr_err("Preemption disabled at:");
3224 print_ip_sym(preempt_disable_ip);
3225 pr_cont("\n");
3226 }
3227 if (panic_on_warn)
3228 panic("scheduling while atomic\n");
3229
3230 dump_stack();
3231 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3232}
3233
3234/*
3235 * Various schedule()-time debugging checks and statistics:
3236 */
3237static inline void schedule_debug(struct task_struct *prev)
3238{
3239#ifdef CONFIG_SCHED_STACK_END_CHECK
3240 if (task_stack_end_corrupted(prev))
3241 panic("corrupted stack end detected inside scheduler\n");
3242#endif
3243
3244 if (unlikely(in_atomic_preempt_off())) {
3245 __schedule_bug(prev);
3246 preempt_count_set(PREEMPT_DISABLED);
3247 }
3248 rcu_sleep_check();
3249
3250 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3251
3252 schedstat_inc(this_rq()->sched_count);
3253}
3254
3255/*
3256 * Pick up the highest-prio task:
3257 */
3258static inline struct task_struct *
3259pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
3260{
3261 const struct sched_class *class = &fair_sched_class;
3262 struct task_struct *p;
3263
3264
3265
3266
3267
3268 if (likely(prev->sched_class == class &&
3269 rq->nr_running == rq->cfs.h_nr_running)) {
3270 p = fair_sched_class.pick_next_task(rq, prev, cookie);
3271 if (unlikely(p == RETRY_TASK))
3272 goto again;
3273
3274
3275 if (unlikely(!p))
3276 p = idle_sched_class.pick_next_task(rq, prev, cookie);
3277
3278 return p;
3279 }
3280
3281again:
3282 for_each_class(class) {
3283 p = class->pick_next_task(rq, prev, cookie);
3284 if (p) {
3285 if (unlikely(p == RETRY_TASK))
3286 goto again;
3287 return p;
3288 }
3289 }
3290
3291 BUG();
3292}
3293
3294/*
3295 * __schedule() is the main scheduler function.
3296 *
3297 * The main means of driving the scheduler and thus entering this function are:
3298 *
3299 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
3300 *
3301 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
3302 *      paths. For example, see arch/x86/entry/entry_64.S.
3303 *
3304 *      To drive preemption between tasks, the scheduler sets the flag in the
3305 *      timer interrupt handler scheduler_tick().
3306 *
3307 *   3. Wakeups don't really cause entry into schedule(). They add a
3308 *      task to the run-queue and that's it.
3309 *
3310 *      Now, if the new task added to the run-queue preempts the current
3311 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
3312 *      called on the nearest possible occasion:
3313 *
3314 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
3315 *
3316 *         - in syscall or exception context, at the next outmost
3317 *           preempt_enable(). (this might be as soon as the wake_up()'s
3318 *           spin_unlock()!)
3319 *
3320 *         - in IRQ context, return from interrupt-handler to
3321 *           preemptible context
3322 *
3323 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
3324 *         then at the next:
3325 *
3326 *          - cond_resched() call
3327 *          - explicit schedule() call
3328 *          - return to userspace
3329 *          - return from interrupt
3330 *
3331 * WARNING: must be called with preemption disabled!
3332 */
3333static void __sched notrace __schedule(bool preempt)
3334{
3335 struct task_struct *prev, *next;
3336 unsigned long *switch_count;
3337 struct pin_cookie cookie;
3338 struct rq *rq;
3339 int cpu;
3340
3341 cpu = smp_processor_id();
3342 rq = cpu_rq(cpu);
3343 prev = rq->curr;
3344
3345 schedule_debug(prev);
3346
3347 if (sched_feat(HRTICK))
3348 hrtick_clear(rq);
3349
3350 local_irq_disable();
3351 rcu_note_context_switch();
3352
3353
3354
3355
3356
3357
3358 smp_mb__before_spinlock();
3359 raw_spin_lock(&rq->lock);
3360 cookie = lockdep_pin_lock(&rq->lock);
3361
3362 rq->clock_skip_update <<= 1;
3363
3364 switch_count = &prev->nivcsw;
3365 if (!preempt && prev->state) {
3366 if (unlikely(signal_pending_state(prev->state, prev))) {
3367 prev->state = TASK_RUNNING;
3368 } else {
3369 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3370 prev->on_rq = 0;
3371
3372
3373
3374
3375
3376
3377 if (prev->flags & PF_WQ_WORKER) {
3378 struct task_struct *to_wakeup;
3379
3380 to_wakeup = wq_worker_sleeping(prev);
3381 if (to_wakeup)
3382 try_to_wake_up_local(to_wakeup, cookie);
3383 }
3384 }
3385 switch_count = &prev->nvcsw;
3386 }
3387
3388 if (task_on_rq_queued(prev))
3389 update_rq_clock(rq);
3390
3391 next = pick_next_task(rq, prev, cookie);
3392 clear_tsk_need_resched(prev);
3393 clear_preempt_need_resched();
3394 rq->clock_skip_update = 0;
3395
3396 if (likely(prev != next)) {
3397 rq->nr_switches++;
3398 rq->curr = next;
3399 ++*switch_count;
3400
3401 trace_sched_switch(preempt, prev, next);
3402 rq = context_switch(rq, prev, next, cookie);
3403 } else {
3404 lockdep_unpin_lock(&rq->lock, cookie);
3405 raw_spin_unlock_irq(&rq->lock);
3406 }
3407
3408 balance_callback(rq);
3409}
3410
3411void __noreturn do_task_dead(void)
3412{
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425 smp_mb();
3426 raw_spin_unlock_wait(&current->pi_lock);
3427
3428
3429 __set_current_state(TASK_DEAD);
3430 current->flags |= PF_NOFREEZE;
3431 __schedule(false);
3432 BUG();
3433
3434 for (;;)
3435 cpu_relax();
3436}
3437
3438static inline void sched_submit_work(struct task_struct *tsk)
3439{
3440 if (!tsk->state || tsk_is_pi_blocked(tsk))
3441 return;
3442
3443
3444
3445
3446 if (blk_needs_flush_plug(tsk))
3447 blk_schedule_flush_plug(tsk);
3448}
3449
3450asmlinkage __visible void __sched schedule(void)
3451{
3452 struct task_struct *tsk = current;
3453
3454 sched_submit_work(tsk);
3455 do {
3456 preempt_disable();
3457 __schedule(false);
3458 sched_preempt_enable_no_resched();
3459 } while (need_resched());
3460}
3461EXPORT_SYMBOL(schedule);
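
/*
 * For illustration: the canonical way for kernel code to sleep via
 * schedule() is to publish a sleep state first and then re-check the
 * condition, so a concurrent wakeup cannot be lost ("my_condition" is a
 * made-up flag set by the waker):
 *
 *	for (;;) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (my_condition)		// set by the waker before wake_up
 *			break;
 *		if (signal_pending(current))	// TASK_INTERRUPTIBLE honours signals
 *			break;
 *		schedule();			// sleep until woken or signalled
 *	}
 *	__set_current_state(TASK_RUNNING);
 */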
3462
3463#ifdef CONFIG_CONTEXT_TRACKING
3464asmlinkage __visible void __sched schedule_user(void)
3465{
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476 enum ctx_state prev_state = exception_enter();
3477 schedule();
3478 exception_exit(prev_state);
3479}
3480#endif
3481
3482
3483
3484
3485
3486
3487void __sched schedule_preempt_disabled(void)
3488{
3489 sched_preempt_enable_no_resched();
3490 schedule();
3491 preempt_disable();
3492}
3493
3494static void __sched notrace preempt_schedule_common(void)
3495{
3496 do {
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510 preempt_disable_notrace();
3511 preempt_latency_start(1);
3512 __schedule(true);
3513 preempt_latency_stop(1);
3514 preempt_enable_no_resched_notrace();
3515
3516
3517
3518
3519
3520 } while (need_resched());
3521}
3522
3523#ifdef CONFIG_PREEMPT
3524/*
3525 * this is the entry point to schedule() from in-kernel preemption
3526 * off of preempt_enable. Kernel preemptions off return from interrupt
3527 * occur there and call schedule directly.
3528 */
3529asmlinkage __visible void __sched notrace preempt_schedule(void)
3530{
3531
3532
3533
3534
3535 if (likely(!preemptible()))
3536 return;
3537
3538 preempt_schedule_common();
3539}
3540NOKPROBE_SYMBOL(preempt_schedule);
3541EXPORT_SYMBOL(preempt_schedule);
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3558{
3559 enum ctx_state prev_ctx;
3560
3561 if (likely(!preemptible()))
3562 return;
3563
3564 do {
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578 preempt_disable_notrace();
3579 preempt_latency_start(1);
3580
3581
3582
3583
3584
3585 prev_ctx = exception_enter();
3586 __schedule(true);
3587 exception_exit(prev_ctx);
3588
3589 preempt_latency_stop(1);
3590 preempt_enable_no_resched_notrace();
3591 } while (need_resched());
3592}
3593EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3594
3595#endif
3596
3597/*
3598 * this is the entry point to schedule() from kernel preemption
3599 * off of irq context.
3600 * Note, that this is called and return with irqs disabled. This will
3601 * protect us against recursive calling from irq.
3602 */
3603asmlinkage __visible void __sched preempt_schedule_irq(void)
3604{
3605 enum ctx_state prev_state;
3606
3607
3608 BUG_ON(preempt_count() || !irqs_disabled());
3609
3610 prev_state = exception_enter();
3611
3612 do {
3613 preempt_disable();
3614 local_irq_enable();
3615 __schedule(true);
3616 local_irq_disable();
3617 sched_preempt_enable_no_resched();
3618 } while (need_resched());
3619
3620 exception_exit(prev_state);
3621}
3622
3623int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3624 void *key)
3625{
3626 return try_to_wake_up(curr->private, mode, wake_flags);
3627}
3628EXPORT_SYMBOL(default_wake_function);
3629
3630#ifdef CONFIG_RT_MUTEXES
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643void rt_mutex_setprio(struct task_struct *p, int prio)
3644{
3645 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
3646 const struct sched_class *prev_class;
3647 struct rq_flags rf;
3648 struct rq *rq;
3649
3650 BUG_ON(prio > MAX_PRIO);
3651
3652 rq = __task_rq_lock(p, &rf);
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666 if (unlikely(p == rq->idle)) {
3667 WARN_ON(p != rq->curr);
3668 WARN_ON(p->pi_blocked_on);
3669 goto out_unlock;
3670 }
3671
3672 trace_sched_pi_setprio(p, prio);
3673 oldprio = p->prio;
3674
3675 if (oldprio == prio)
3676 queue_flag &= ~DEQUEUE_MOVE;
3677
3678 prev_class = p->sched_class;
3679 queued = task_on_rq_queued(p);
3680 running = task_current(rq, p);
3681 if (queued)
3682 dequeue_task(rq, p, queue_flag);
3683 if (running)
3684 put_prev_task(rq, p);
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695 if (dl_prio(prio)) {
3696 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3697 if (!dl_prio(p->normal_prio) ||
3698 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3699 p->dl.dl_boosted = 1;
3700 queue_flag |= ENQUEUE_REPLENISH;
3701 } else
3702 p->dl.dl_boosted = 0;
3703 p->sched_class = &dl_sched_class;
3704 } else if (rt_prio(prio)) {
3705 if (dl_prio(oldprio))
3706 p->dl.dl_boosted = 0;
3707 if (oldprio < prio)
3708 queue_flag |= ENQUEUE_HEAD;
3709 p->sched_class = &rt_sched_class;
3710 } else {
3711 if (dl_prio(oldprio))
3712 p->dl.dl_boosted = 0;
3713 if (rt_prio(oldprio))
3714 p->rt.timeout = 0;
3715 p->sched_class = &fair_sched_class;
3716 }
3717
3718 p->prio = prio;
3719
3720 if (queued)
3721 enqueue_task(rq, p, queue_flag);
3722 if (running)
3723 set_curr_task(rq, p);
3724
3725 check_class_changed(rq, p, prev_class, oldprio);
3726out_unlock:
3727 preempt_disable();
3728 __task_rq_unlock(rq, &rf);
3729
3730 balance_callback(rq);
3731 preempt_enable();
3732}
3733#endif
3734
3735void set_user_nice(struct task_struct *p, long nice)
3736{
3737 bool queued, running;
3738 int old_prio, delta;
3739 struct rq_flags rf;
3740 struct rq *rq;
3741
3742 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3743 return;
3744
3745
3746
3747
3748 rq = task_rq_lock(p, &rf);
3749
3750
3751
3752
3753
3754
3755 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3756 p->static_prio = NICE_TO_PRIO(nice);
3757 goto out_unlock;
3758 }
3759 queued = task_on_rq_queued(p);
3760 running = task_current(rq, p);
3761 if (queued)
3762 dequeue_task(rq, p, DEQUEUE_SAVE);
3763 if (running)
3764 put_prev_task(rq, p);
3765
3766 p->static_prio = NICE_TO_PRIO(nice);
3767 set_load_weight(p);
3768 old_prio = p->prio;
3769 p->prio = effective_prio(p);
3770 delta = p->prio - old_prio;
3771
3772 if (queued) {
3773 enqueue_task(rq, p, ENQUEUE_RESTORE);
3774
3775
3776
3777
3778 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3779 resched_curr(rq);
3780 }
3781 if (running)
3782 set_curr_task(rq, p);
3783out_unlock:
3784 task_rq_unlock(rq, p, &rf);
3785}
3786EXPORT_SYMBOL(set_user_nice);
3787
3788
3789
3790
3791
3792
3793int can_nice(const struct task_struct *p, const int nice)
3794{
3795
3796 int nice_rlim = nice_to_rlimit(nice);
3797
3798 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3799 capable(CAP_SYS_NICE));
3800}
3801
3802#ifdef __ARCH_WANT_SYS_NICE
3803
3804
3805
3806
3807
3808
3809
3810
3811SYSCALL_DEFINE1(nice, int, increment)
3812{
3813 long nice, retval;
3814
3815
3816
3817
3818
3819
3820 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3821 nice = task_nice(current) + increment;
3822
3823 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3824 if (increment < 0 && !can_nice(current, nice))
3825 return -EPERM;
3826
3827 retval = security_task_setnice(current, nice);
3828 if (retval)
3829 return retval;
3830
3831 set_user_nice(current, nice);
3832 return 0;
3833}
3834
3835#endif
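
/*
 * For illustration: userspace usually reaches this path through nice(3)
 * or setpriority(2). Lowering priority is always allowed, while raising
 * it again is gated by can_nice(), i.e. CAP_SYS_NICE or RLIMIT_NICE
 * headroom. A rough sketch:
 *
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		errno = 0;
 *		if (nice(5) == -1 && errno)	// drop priority: always permitted
 *			perror("nice(+5)");
 *
 *		errno = 0;
 *		if (nice(-5) == -1 && errno)	// raise it back: needs privilege
 *			perror("nice(-5)");
 *		return 0;
 *	}
 */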
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845int task_prio(const struct task_struct *p)
3846{
3847 return p->prio - MAX_RT_PRIO;
3848}
3849
3850
3851
3852
3853
3854
3855
3856int idle_cpu(int cpu)
3857{
3858 struct rq *rq = cpu_rq(cpu);
3859
3860 if (rq->curr != rq->idle)
3861 return 0;
3862
3863 if (rq->nr_running)
3864 return 0;
3865
3866#ifdef CONFIG_SMP
3867 if (!llist_empty(&rq->wake_list))
3868 return 0;
3869#endif
3870
3871 return 1;
3872}
3873
3874
3875
3876
3877
3878
3879
3880struct task_struct *idle_task(int cpu)
3881{
3882 return cpu_rq(cpu)->idle;
3883}
3884
3885
3886
3887
3888
3889
3890
3891static struct task_struct *find_process_by_pid(pid_t pid)
3892{
3893 return pid ? find_task_by_vpid(pid) : current;
3894}
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904static void
3905__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3906{
3907 struct sched_dl_entity *dl_se = &p->dl;
3908
3909 dl_se->dl_runtime = attr->sched_runtime;
3910 dl_se->dl_deadline = attr->sched_deadline;
3911 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3912 dl_se->flags = attr->sched_flags;
3913 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934}
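
/*
 * Worked example for the bandwidth computation above, assuming the
 * 2^20 fixed-point scale used by to_ratio():
 *
 *	sched_runtime  =  10 ms =  10000000 ns
 *	sched_deadline =  30 ms
 *	sched_period   = 100 ms = 100000000 ns
 *
 *	dl_bw = (10000000 << 20) / 100000000 ~= 104857 ~= 0.1 * 2^20
 *
 * i.e. a 10% CPU reservation, which is what dl_overflow() later checks
 * against the root domain's total deadline bandwidth at admission time.
 */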
3935
3936
3937
3938
3939
3940#define SETPARAM_POLICY -1
3941
3942static void __setscheduler_params(struct task_struct *p,
3943 const struct sched_attr *attr)
3944{
3945 int policy = attr->sched_policy;
3946
3947 if (policy == SETPARAM_POLICY)
3948 policy = p->policy;
3949
3950 p->policy = policy;
3951
3952 if (dl_policy(policy))
3953 __setparam_dl(p, attr);
3954 else if (fair_policy(policy))
3955 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3956
3957
3958
3959
3960
3961
3962 p->rt_priority = attr->sched_priority;
3963 p->normal_prio = normal_prio(p);
3964 set_load_weight(p);
3965}
3966
3967
3968static void __setscheduler(struct rq *rq, struct task_struct *p,
3969 const struct sched_attr *attr, bool keep_boost)
3970{
3971 __setscheduler_params(p, attr);
3972
3973
3974
3975
3976
3977 if (keep_boost)
3978 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
3979 else
3980 p->prio = normal_prio(p);
3981
3982 if (dl_prio(p->prio))
3983 p->sched_class = &dl_sched_class;
3984 else if (rt_prio(p->prio))
3985 p->sched_class = &rt_sched_class;
3986 else
3987 p->sched_class = &fair_sched_class;
3988}
3989
3990static void
3991__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3992{
3993 struct sched_dl_entity *dl_se = &p->dl;
3994
3995 attr->sched_priority = p->rt_priority;
3996 attr->sched_runtime = dl_se->dl_runtime;
3997 attr->sched_deadline = dl_se->dl_deadline;
3998 attr->sched_period = dl_se->dl_period;
3999 attr->sched_flags = dl_se->flags;
4000}
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012static bool
4013__checkparam_dl(const struct sched_attr *attr)
4014{
4015
4016 if (attr->sched_deadline == 0)
4017 return false;
4018
4019
4020
4021
4022
4023 if (attr->sched_runtime < (1ULL << DL_SCALE))
4024 return false;
4025
4026
4027
4028
4029
4030 if (attr->sched_deadline & (1ULL << 63) ||
4031 attr->sched_period & (1ULL << 63))
4032 return false;
4033
4034
4035 if ((attr->sched_period != 0 &&
4036 attr->sched_period < attr->sched_deadline) ||
4037 attr->sched_deadline < attr->sched_runtime)
4038 return false;
4039
4040 return true;
4041}
4042
4043
4044
4045
4046static bool check_same_owner(struct task_struct *p)
4047{
4048 const struct cred *cred = current_cred(), *pcred;
4049 bool match;
4050
4051 rcu_read_lock();
4052 pcred = __task_cred(p);
4053 match = (uid_eq(cred->euid, pcred->euid) ||
4054 uid_eq(cred->euid, pcred->uid));
4055 rcu_read_unlock();
4056 return match;
4057}
4058
4059static bool dl_param_changed(struct task_struct *p,
4060 const struct sched_attr *attr)
4061{
4062 struct sched_dl_entity *dl_se = &p->dl;
4063
4064 if (dl_se->dl_runtime != attr->sched_runtime ||
4065 dl_se->dl_deadline != attr->sched_deadline ||
4066 dl_se->dl_period != attr->sched_period ||
4067 dl_se->flags != attr->sched_flags)
4068 return true;
4069
4070 return false;
4071}
4072
4073static int __sched_setscheduler(struct task_struct *p,
4074 const struct sched_attr *attr,
4075 bool user, bool pi)
4076{
4077 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4078 MAX_RT_PRIO - 1 - attr->sched_priority;
4079 int retval, oldprio, oldpolicy = -1, queued, running;
4080 int new_effective_prio, policy = attr->sched_policy;
4081 const struct sched_class *prev_class;
4082 struct rq_flags rf;
4083 int reset_on_fork;
4084 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
4085 struct rq *rq;
4086
4087
4088 BUG_ON(in_interrupt());
4089recheck:
4090
4091 if (policy < 0) {
4092 reset_on_fork = p->sched_reset_on_fork;
4093 policy = oldpolicy = p->policy;
4094 } else {
4095 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4096
4097 if (!valid_policy(policy))
4098 return -EINVAL;
4099 }
4100
4101 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
4102 return -EINVAL;
4103
4104
4105
4106
4107
4108
4109 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4110 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4111 return -EINVAL;
4112 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4113 (rt_policy(policy) != (attr->sched_priority != 0)))
4114 return -EINVAL;
4115
4116
4117
4118
4119 if (user && !capable(CAP_SYS_NICE)) {
4120 if (fair_policy(policy)) {
4121 if (attr->sched_nice < task_nice(p) &&
4122 !can_nice(p, attr->sched_nice))
4123 return -EPERM;
4124 }
4125
4126 if (rt_policy(policy)) {
4127 unsigned long rlim_rtprio =
4128 task_rlimit(p, RLIMIT_RTPRIO);
4129
4130
4131 if (policy != p->policy && !rlim_rtprio)
4132 return -EPERM;
4133
4134
4135 if (attr->sched_priority > p->rt_priority &&
4136 attr->sched_priority > rlim_rtprio)
4137 return -EPERM;
4138 }
4139
4140
4141
4142
4143
4144
4145
4146 if (dl_policy(policy))
4147 return -EPERM;
4148
4149
4150
4151
4152
4153 if (idle_policy(p->policy) && !idle_policy(policy)) {
4154 if (!can_nice(p, task_nice(p)))
4155 return -EPERM;
4156 }
4157
4158
4159 if (!check_same_owner(p))
4160 return -EPERM;
4161
4162
4163 if (p->sched_reset_on_fork && !reset_on_fork)
4164 return -EPERM;
4165 }
4166
4167 if (user) {
4168 retval = security_task_setscheduler(p);
4169 if (retval)
4170 return retval;
4171 }
4172
4173
4174
4175
4176
4177
4178
4179
4180 rq = task_rq_lock(p, &rf);
4181
4182
4183
4184
4185 if (p == rq->stop) {
4186 task_rq_unlock(rq, p, &rf);
4187 return -EINVAL;
4188 }
4189
4190
4191
4192
4193
4194 if (unlikely(policy == p->policy)) {
4195 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4196 goto change;
4197 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4198 goto change;
4199 if (dl_policy(policy) && dl_param_changed(p, attr))
4200 goto change;
4201
4202 p->sched_reset_on_fork = reset_on_fork;
4203 task_rq_unlock(rq, p, &rf);
4204 return 0;
4205 }
4206change:
4207
4208 if (user) {
4209#ifdef CONFIG_RT_GROUP_SCHED
4210
4211
4212
4213
4214 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4215 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4216 !task_group_is_autogroup(task_group(p))) {
4217 task_rq_unlock(rq, p, &rf);
4218 return -EPERM;
4219 }
4220#endif
4221#ifdef CONFIG_SMP
4222 if (dl_bandwidth_enabled() && dl_policy(policy)) {
4223 cpumask_t *span = rq->rd->span;
4224
4225
4226
4227
4228
4229
4230 if (!cpumask_subset(span, &p->cpus_allowed) ||
4231 rq->rd->dl_bw.bw == 0) {
4232 task_rq_unlock(rq, p, &rf);
4233 return -EPERM;
4234 }
4235 }
4236#endif
4237 }
4238
4239
4240 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4241 policy = oldpolicy = -1;
4242 task_rq_unlock(rq, p, &rf);
4243 goto recheck;
4244 }
4245
4246
4247
4248
4249
4250
4251 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
4252 task_rq_unlock(rq, p, &rf);
4253 return -EBUSY;
4254 }
4255
4256 p->sched_reset_on_fork = reset_on_fork;
4257 oldprio = p->prio;
4258
4259 if (pi) {
4260
4261
4262
4263
4264
4265
4266
4267 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
4268 if (new_effective_prio == oldprio)
4269 queue_flags &= ~DEQUEUE_MOVE;
4270 }
4271
4272 queued = task_on_rq_queued(p);
4273 running = task_current(rq, p);
4274 if (queued)
4275 dequeue_task(rq, p, queue_flags);
4276 if (running)
4277 put_prev_task(rq, p);
4278
4279 prev_class = p->sched_class;
4280 __setscheduler(rq, p, attr, pi);
4281
4282 if (queued) {
4283
4284
4285
4286
4287 if (oldprio < p->prio)
4288 queue_flags |= ENQUEUE_HEAD;
4289
4290 enqueue_task(rq, p, queue_flags);
4291 }
4292 if (running)
4293 set_curr_task(rq, p);
4294
4295 check_class_changed(rq, p, prev_class, oldprio);
4296 preempt_disable();
4297 task_rq_unlock(rq, p, &rf);
4298
4299 if (pi)
4300 rt_mutex_adjust_pi(p);
4301
4302
4303
4304
4305 balance_callback(rq);
4306 preempt_enable();
4307
4308 return 0;
4309}
4310
4311static int _sched_setscheduler(struct task_struct *p, int policy,
4312 const struct sched_param *param, bool check)
4313{
4314 struct sched_attr attr = {
4315 .sched_policy = policy,
4316 .sched_priority = param->sched_priority,
4317 .sched_nice = PRIO_TO_NICE(p->static_prio),
4318 };
4319
4320
4321 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4322 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4323 policy &= ~SCHED_RESET_ON_FORK;
4324 attr.sched_policy = policy;
4325 }
4326
4327 return __sched_setscheduler(p, &attr, check, true);
4328}
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339int sched_setscheduler(struct task_struct *p, int policy,
4340 const struct sched_param *param)
4341{
4342 return _sched_setscheduler(p, policy, param, true);
4343}
4344EXPORT_SYMBOL_GPL(sched_setscheduler);
4345
4346int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4347{
4348 return __sched_setscheduler(p, attr, true, true);
4349}
4350EXPORT_SYMBOL_GPL(sched_setattr);
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4366 const struct sched_param *param)
4367{
4368 return _sched_setscheduler(p, policy, param, false);
4369}
4370EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4371
4372static int
4373do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4374{
4375 struct sched_param lparam;
4376 struct task_struct *p;
4377 int retval;
4378
4379 if (!param || pid < 0)
4380 return -EINVAL;
4381 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4382 return -EFAULT;
4383
4384 rcu_read_lock();
4385 retval = -ESRCH;
4386 p = find_process_by_pid(pid);
4387 if (p != NULL)
4388 retval = sched_setscheduler(p, policy, &lparam);
4389 rcu_read_unlock();
4390
4391 return retval;
4392}
4393
4394
4395
4396
4397static int sched_copy_attr(struct sched_attr __user *uattr,
4398 struct sched_attr *attr)
4399{
4400 u32 size;
4401 int ret;
4402
4403 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4404 return -EFAULT;
4405
4406
4407
4408
4409 memset(attr, 0, sizeof(*attr));
4410
4411 ret = get_user(size, &uattr->size);
4412 if (ret)
4413 return ret;
4414
4415 if (size > PAGE_SIZE)
4416 goto err_size;
4417
4418 if (!size)
4419 size = SCHED_ATTR_SIZE_VER0;
4420
4421 if (size < SCHED_ATTR_SIZE_VER0)
4422 goto err_size;
4423
4424
4425
4426
4427
4428
4429
4430 if (size > sizeof(*attr)) {
4431 unsigned char __user *addr;
4432 unsigned char __user *end;
4433 unsigned char val;
4434
4435 addr = (void __user *)uattr + sizeof(*attr);
4436 end = (void __user *)uattr + size;
4437
4438 for (; addr < end; addr++) {
4439 ret = get_user(val, addr);
4440 if (ret)
4441 return ret;
4442 if (val)
4443 goto err_size;
4444 }
4445 size = sizeof(*attr);
4446 }
4447
4448 ret = copy_from_user(attr, uattr, size);
4449 if (ret)
4450 return -EFAULT;
4451
4452
4453
4454
4455
4456 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4457
4458 return 0;
4459
4460err_size:
4461 put_user(sizeof(*attr), &uattr->size);
4462 return -E2BIG;
4463}
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4474 struct sched_param __user *, param)
4475{
4476
4477 if (policy < 0)
4478 return -EINVAL;
4479
4480 return do_sched_setscheduler(pid, policy, param);
4481}
4482
4483
4484
4485
4486
4487
4488
4489
4490SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4491{
4492 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4493}
4494
4495
4496
4497
4498
4499
4500
4501SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4502 unsigned int, flags)
4503{
4504 struct sched_attr attr;
4505 struct task_struct *p;
4506 int retval;
4507
4508 if (!uattr || pid < 0 || flags)
4509 return -EINVAL;
4510
4511 retval = sched_copy_attr(uattr, &attr);
4512 if (retval)
4513 return retval;
4514
4515 if ((int)attr.sched_policy < 0)
4516 return -EINVAL;
4517
4518 rcu_read_lock();
4519 retval = -ESRCH;
4520 p = find_process_by_pid(pid);
4521 if (p != NULL)
4522 retval = sched_setattr(p, &attr);
4523 rcu_read_unlock();
4524
4525 return retval;
4526}
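
/*
 * For illustration: glibc provides no wrapper for this syscall, so
 * userspace typically calls it via syscall(2) with a hand-rolled struct
 * whose layout follows sched_setattr(2) ("my_sched_attr" below is such a
 * local copy). A minimal SCHED_DEADLINE sketch, 10ms of runtime every
 * 100ms with a 30ms deadline:
 *
 *	#define _GNU_SOURCE
 *	#include <stdint.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/sched.h>		// SCHED_DEADLINE
 *
 *	struct my_sched_attr {
 *		uint32_t size;
 *		uint32_t sched_policy;
 *		uint64_t sched_flags;
 *		int32_t  sched_nice;
 *		uint32_t sched_priority;
 *		uint64_t sched_runtime;
 *		uint64_t sched_deadline;
 *		uint64_t sched_period;
 *	};
 *
 *	int main(void)
 *	{
 *		struct my_sched_attr attr = {
 *			.size		= sizeof(attr),
 *			.sched_policy	= SCHED_DEADLINE,
 *			.sched_runtime	= 10 * 1000 * 1000,	// 10 ms
 *			.sched_deadline	= 30 * 1000 * 1000,	// 30 ms
 *			.sched_period	= 100 * 1000 * 1000,	// 100 ms
 *		};
 *
 *		// pid 0 == calling thread; flags must currently be 0
 *		return syscall(SYS_sched_setattr, 0, &attr, 0) ? 1 : 0;
 *	}
 */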
4527
4528
4529
4530
4531
4532
4533
4534
4535SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4536{
4537 struct task_struct *p;
4538 int retval;
4539
4540 if (pid < 0)
4541 return -EINVAL;
4542
4543 retval = -ESRCH;
4544 rcu_read_lock();
4545 p = find_process_by_pid(pid);
4546 if (p) {
4547 retval = security_task_getscheduler(p);
4548 if (!retval)
4549 retval = p->policy
4550 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4551 }
4552 rcu_read_unlock();
4553 return retval;
4554}
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4565{
4566 struct sched_param lp = { .sched_priority = 0 };
4567 struct task_struct *p;
4568 int retval;
4569
4570 if (!param || pid < 0)
4571 return -EINVAL;
4572
4573 rcu_read_lock();
4574 p = find_process_by_pid(pid);
4575 retval = -ESRCH;
4576 if (!p)
4577 goto out_unlock;
4578
4579 retval = security_task_getscheduler(p);
4580 if (retval)
4581 goto out_unlock;
4582
4583 if (task_has_rt_policy(p))
4584 lp.sched_priority = p->rt_priority;
4585 rcu_read_unlock();
4586
4587
4588
4589
4590 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4591
4592 return retval;
4593
4594out_unlock:
4595 rcu_read_unlock();
4596 return retval;
4597}
4598
4599static int sched_read_attr(struct sched_attr __user *uattr,
4600 struct sched_attr *attr,
4601 unsigned int usize)
4602{
4603 int ret;
4604
4605 if (!access_ok(VERIFY_WRITE, uattr, usize))
4606 return -EFAULT;
4607
4608
4609
4610
4611
4612
4613 if (usize < sizeof(*attr)) {
4614 unsigned char *addr;
4615 unsigned char *end;
4616
4617 addr = (void *)attr + usize;
4618 end = (void *)attr + sizeof(*attr);
4619
4620 for (; addr < end; addr++) {
4621 if (*addr)
4622 return -EFBIG;
4623 }
4624
4625 attr->size = usize;
4626 }
4627
4628 ret = copy_to_user(uattr, attr, attr->size);
4629 if (ret)
4630 return -EFAULT;
4631
4632 return 0;
4633}
4634
4635
4636
4637
4638
4639
4640
4641
4642SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4643 unsigned int, size, unsigned int, flags)
4644{
4645 struct sched_attr attr = {
4646 .size = sizeof(struct sched_attr),
4647 };
4648 struct task_struct *p;
4649 int retval;
4650
4651 if (!uattr || pid < 0 || size > PAGE_SIZE ||
4652 size < SCHED_ATTR_SIZE_VER0 || flags)
4653 return -EINVAL;
4654
4655 rcu_read_lock();
4656 p = find_process_by_pid(pid);
4657 retval = -ESRCH;
4658 if (!p)
4659 goto out_unlock;
4660
4661 retval = security_task_getscheduler(p);
4662 if (retval)
4663 goto out_unlock;
4664
4665 attr.sched_policy = p->policy;
4666 if (p->sched_reset_on_fork)
4667 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4668 if (task_has_dl_policy(p))
4669 __getparam_dl(p, &attr);
4670 else if (task_has_rt_policy(p))
4671 attr.sched_priority = p->rt_priority;
4672 else
4673 attr.sched_nice = task_nice(p);
4674
4675 rcu_read_unlock();
4676
4677 retval = sched_read_attr(uattr, &attr, size);
4678 return retval;
4679
4680out_unlock:
4681 rcu_read_unlock();
4682 return retval;
4683}
4684
4685long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4686{
4687 cpumask_var_t cpus_allowed, new_mask;
4688 struct task_struct *p;
4689 int retval;
4690
4691 rcu_read_lock();
4692
4693 p = find_process_by_pid(pid);
4694 if (!p) {
4695 rcu_read_unlock();
4696 return -ESRCH;
4697 }
4698
4699
4700 get_task_struct(p);
4701 rcu_read_unlock();
4702
4703 if (p->flags & PF_NO_SETAFFINITY) {
4704 retval = -EINVAL;
4705 goto out_put_task;
4706 }
4707 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4708 retval = -ENOMEM;
4709 goto out_put_task;
4710 }
4711 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4712 retval = -ENOMEM;
4713 goto out_free_cpus_allowed;
4714 }
4715 retval = -EPERM;
4716 if (!check_same_owner(p)) {
4717 rcu_read_lock();
4718 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4719 rcu_read_unlock();
4720 goto out_free_new_mask;
4721 }
4722 rcu_read_unlock();
4723 }
4724
4725 retval = security_task_setscheduler(p);
4726 if (retval)
4727 goto out_free_new_mask;
4728
4729
4730 cpuset_cpus_allowed(p, cpus_allowed);
4731 cpumask_and(new_mask, in_mask, cpus_allowed);
4732
4733
4734
4735
4736
4737
4738
4739#ifdef CONFIG_SMP
4740 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4741 rcu_read_lock();
4742 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4743 retval = -EBUSY;
4744 rcu_read_unlock();
4745 goto out_free_new_mask;
4746 }
4747 rcu_read_unlock();
4748 }
4749#endif
4750again:
4751 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4752
4753 if (!retval) {
4754 cpuset_cpus_allowed(p, cpus_allowed);
4755 if (!cpumask_subset(new_mask, cpus_allowed)) {
4756
4757
4758
4759
4760
4761 cpumask_copy(new_mask, cpus_allowed);
4762 goto again;
4763 }
4764 }
4765out_free_new_mask:
4766 free_cpumask_var(new_mask);
4767out_free_cpus_allowed:
4768 free_cpumask_var(cpus_allowed);
4769out_put_task:
4770 put_task_struct(p);
4771 return retval;
4772}
4773
4774static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4775 struct cpumask *new_mask)
4776{
4777 if (len < cpumask_size())
4778 cpumask_clear(new_mask);
4779 else if (len > cpumask_size())
4780 len = cpumask_size();
4781
4782 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4783}
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4794 unsigned long __user *, user_mask_ptr)
4795{
4796 cpumask_var_t new_mask;
4797 int retval;
4798
4799 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4800 return -ENOMEM;
4801
4802 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4803 if (retval == 0)
4804 retval = sched_setaffinity(pid, new_mask);
4805 free_cpumask_var(new_mask);
4806 return retval;
4807}
4808
4809long sched_getaffinity(pid_t pid, struct cpumask *mask)
4810{
4811 struct task_struct *p;
4812 unsigned long flags;
4813 int retval;
4814
4815 rcu_read_lock();
4816
4817 retval = -ESRCH;
4818 p = find_process_by_pid(pid);
4819 if (!p)
4820 goto out_unlock;
4821
4822 retval = security_task_getscheduler(p);
4823 if (retval)
4824 goto out_unlock;
4825
4826 raw_spin_lock_irqsave(&p->pi_lock, flags);
4827 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4828 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4829
4830out_unlock:
4831 rcu_read_unlock();
4832
4833 return retval;
4834}
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4846 unsigned long __user *, user_mask_ptr)
4847{
4848 int ret;
4849 cpumask_var_t mask;
4850
4851 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4852 return -EINVAL;
4853 if (len & (sizeof(unsigned long)-1))
4854 return -EINVAL;
4855
4856 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4857 return -ENOMEM;
4858
4859 ret = sched_getaffinity(pid, mask);
4860 if (ret == 0) {
4861 size_t retlen = min_t(size_t, len, cpumask_size());
4862
4863 if (copy_to_user(user_mask_ptr, mask, retlen))
4864 ret = -EFAULT;
4865 else
4866 ret = retlen;
4867 }
4868 free_cpumask_var(mask);
4869
4870 return ret;
4871}
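
/*
 * For illustration: userspace normally uses the glibc cpu_set_t helpers
 * on top of these two syscalls. A small sketch that pins the calling
 * thread to CPU 0 and reads the mask back:
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		cpu_set_t set;
 *
 *		CPU_ZERO(&set);
 *		CPU_SET(0, &set);			// allow CPU 0 only
 *		if (sched_setaffinity(0, sizeof(set), &set))
 *			perror("sched_setaffinity");
 *
 *		if (sched_getaffinity(0, sizeof(set), &set) == 0)
 *			printf("%d CPU(s) in the affinity mask\n",
 *			       CPU_COUNT(&set));
 *		return 0;
 *	}
 */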
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881SYSCALL_DEFINE0(sched_yield)
4882{
4883 struct rq *rq = this_rq_lock();
4884
4885 schedstat_inc(rq->yld_count);
4886 current->sched_class->yield_task(rq);
4887
4888
4889
4890
4891
4892 __release(rq->lock);
4893 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4894 do_raw_spin_unlock(&rq->lock);
4895 sched_preempt_enable_no_resched();
4896
4897 schedule();
4898
4899 return 0;
4900}
4901
4902#ifndef CONFIG_PREEMPT
4903int __sched _cond_resched(void)
4904{
4905 if (should_resched(0)) {
4906 preempt_schedule_common();
4907 return 1;
4908 }
4909 return 0;
4910}
4911EXPORT_SYMBOL(_cond_resched);
4912#endif
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922int __cond_resched_lock(spinlock_t *lock)
4923{
4924 int resched = should_resched(PREEMPT_LOCK_OFFSET);
4925 int ret = 0;
4926
4927 lockdep_assert_held(lock);
4928
4929 if (spin_needbreak(lock) || resched) {
4930 spin_unlock(lock);
4931 if (resched)
4932 preempt_schedule_common();
4933 else
4934 cpu_relax();
4935 ret = 1;
4936 spin_lock(lock);
4937 }
4938 return ret;
4939}
4940EXPORT_SYMBOL(__cond_resched_lock);
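
/*
 * For illustration: this is normally used through the cond_resched_lock()
 * wrapper to break up long loops that run under a spinlock ("my_lock",
 * "nr_items" and "process_item()" are made-up names):
 *
 *	spin_lock(&my_lock);
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		// Drop the lock and possibly reschedule if it is contended
 *		// or a reschedule is pending, then take it again.
 *		cond_resched_lock(&my_lock);
 *	}
 *	spin_unlock(&my_lock);
 */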
4941
4942int __sched __cond_resched_softirq(void)
4943{
4944 BUG_ON(!in_softirq());
4945
4946 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
4947 local_bh_enable();
4948 preempt_schedule_common();
4949 local_bh_disable();
4950 return 1;
4951 }
4952 return 0;
4953}
4954EXPORT_SYMBOL(__cond_resched_softirq);
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978void __sched yield(void)
4979{
4980 set_current_state(TASK_RUNNING);
4981 sys_sched_yield();
4982}
4983EXPORT_SYMBOL(yield);
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000int __sched yield_to(struct task_struct *p, bool preempt)
5001{
5002 struct task_struct *curr = current;
5003 struct rq *rq, *p_rq;
5004 unsigned long flags;
5005 int yielded = 0;
5006
5007 local_irq_save(flags);
5008 rq = this_rq();
5009
5010again:
5011 p_rq = task_rq(p);
5012
5013
5014
5015
5016 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5017 yielded = -ESRCH;
5018 goto out_irq;
5019 }
5020
5021 double_rq_lock(rq, p_rq);
5022 if (task_rq(p) != p_rq) {
5023 double_rq_unlock(rq, p_rq);
5024 goto again;
5025 }
5026
5027 if (!curr->sched_class->yield_to_task)
5028 goto out_unlock;
5029
5030 if (curr->sched_class != p->sched_class)
5031 goto out_unlock;
5032
5033 if (task_running(p_rq, p) || p->state)
5034 goto out_unlock;
5035
5036 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5037 if (yielded) {
5038 schedstat_inc(rq->yld_count);
5039
5040
5041
5042
5043 if (preempt && rq != p_rq)
5044 resched_curr(p_rq);
5045 }
5046
5047out_unlock:
5048 double_rq_unlock(rq, p_rq);
5049out_irq:
5050 local_irq_restore(flags);
5051
5052 if (yielded > 0)
5053 schedule();
5054
5055 return yielded;
5056}
5057EXPORT_SYMBOL_GPL(yield_to);
5058
5059
5060
5061
5062
5063long __sched io_schedule_timeout(long timeout)
5064{
5065 int old_iowait = current->in_iowait;
5066 struct rq *rq;
5067 long ret;
5068
5069 current->in_iowait = 1;
5070 blk_schedule_flush_plug(current);
5071
5072 delayacct_blkio_start();
5073 rq = raw_rq();
5074 atomic_inc(&rq->nr_iowait);
5075 ret = schedule_timeout(timeout);
5076 current->in_iowait = old_iowait;
5077 atomic_dec(&rq->nr_iowait);
5078 delayacct_blkio_end();
5079
5080 return ret;
5081}
5082EXPORT_SYMBOL(io_schedule_timeout);
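
/*
 * For illustration: block-I/O waiters are expected to sleep through
 * io_schedule()/io_schedule_timeout() rather than plain schedule(), so
 * that rq->nr_iowait (the basis of the iowait statistics earlier in this
 * file) is raised while they sleep. A rough sketch, with "my_io_done"
 * being a made-up flag set from the completion path:
 *
 *	set_current_state(TASK_UNINTERRUPTIBLE);
 *	if (!my_io_done)
 *		io_schedule_timeout(HZ);	// sleep at most 1s, counted as iowait
 *	__set_current_state(TASK_RUNNING);
 */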
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5093{
5094 int ret = -EINVAL;
5095
5096 switch (policy) {
5097 case SCHED_FIFO:
5098 case SCHED_RR:
5099 ret = MAX_USER_RT_PRIO-1;
5100 break;
5101 case SCHED_DEADLINE:
5102 case SCHED_NORMAL:
5103 case SCHED_BATCH:
5104 case SCHED_IDLE:
5105 ret = 0;
5106 break;
5107 }
5108 return ret;
5109}
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5120{
5121 int ret = -EINVAL;
5122
5123 switch (policy) {
5124 case SCHED_FIFO:
5125 case SCHED_RR:
5126 ret = 1;
5127 break;
5128 case SCHED_DEADLINE:
5129 case SCHED_NORMAL:
5130 case SCHED_BATCH:
5131 case SCHED_IDLE:
5132 ret = 0;
5133 }
5134 return ret;
5135}
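
/*
 * For illustration: userspace can query the valid static priority range
 * for a policy before calling sched_setscheduler(), e.g.:
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		printf("SCHED_FIFO priorities: %d..%d\n",
 *		       sched_get_priority_min(SCHED_FIFO),
 *		       sched_get_priority_max(SCHED_FIFO));
 *		return 0;
 *	}
 */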
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5149 struct timespec __user *, interval)
5150{
5151 struct task_struct *p;
5152 unsigned int time_slice;
5153 struct rq_flags rf;
5154 struct timespec t;
5155 struct rq *rq;
5156 int retval;
5157
5158 if (pid < 0)
5159 return -EINVAL;
5160
5161 retval = -ESRCH;
5162 rcu_read_lock();
5163 p = find_process_by_pid(pid);
5164 if (!p)
5165 goto out_unlock;
5166
5167 retval = security_task_getscheduler(p);
5168 if (retval)
5169 goto out_unlock;
5170
5171 rq = task_rq_lock(p, &rf);
5172 time_slice = 0;
5173 if (p->sched_class->get_rr_interval)
5174 time_slice = p->sched_class->get_rr_interval(rq, p);
5175 task_rq_unlock(rq, p, &rf);
5176
5177 rcu_read_unlock();
5178 jiffies_to_timespec(time_slice, &t);
5179 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5180 return retval;
5181
5182out_unlock:
5183 rcu_read_unlock();
5184 return retval;
5185}
5186
5187static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5188
5189void sched_show_task(struct task_struct *p)
5190{
5191 unsigned long free = 0;
5192 int ppid;
5193 unsigned long state = p->state;
5194
5195 if (!try_get_task_stack(p))
5196 return;
5197 if (state)
5198 state = __ffs(state) + 1;
5199 printk(KERN_INFO "%-15.15s %c", p->comm,
5200 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5201 if (state == TASK_RUNNING)
5202 printk(KERN_CONT " running task ");
5203#ifdef CONFIG_DEBUG_STACK_USAGE
5204 free = stack_not_used(p);
5205#endif
5206 ppid = 0;
5207 rcu_read_lock();
5208 if (pid_alive(p))
5209 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5210 rcu_read_unlock();
5211 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5212 task_pid_nr(p), ppid,
5213 (unsigned long)task_thread_info(p)->flags);
5214
5215 print_worker_info(KERN_INFO, p);
5216 show_stack(p, NULL);
5217 put_task_stack(p);
5218}
5219
5220void show_state_filter(unsigned long state_filter)
5221{
5222 struct task_struct *g, *p;
5223
5224#if BITS_PER_LONG == 32
5225 printk(KERN_INFO
5226 " task PC stack pid father\n");
5227#else
5228 printk(KERN_INFO
5229 " task PC stack pid father\n");
5230#endif
5231 rcu_read_lock();
5232 for_each_process_thread(g, p) {
5233
5234
5235
5236
5237
5238
5239
5240 touch_nmi_watchdog();
5241 touch_all_softlockup_watchdogs();
5242 if (!state_filter || (p->state & state_filter))
5243 sched_show_task(p);
5244 }
5245
5246#ifdef CONFIG_SCHED_DEBUG
5247 if (!state_filter)
5248 sysrq_sched_debug_show();
5249#endif
5250 rcu_read_unlock();
5251
5252
5253
5254 if (!state_filter)
5255 debug_show_all_locks();
5256}
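
/*
 * For illustration: this dump is what the magic-sysrq 't' handler emits,
 * so on a machine with sysrq enabled it can typically be triggered with
 *
 *	echo t > /proc/sysrq-trigger
 *
 * and then read back from the kernel log.
 */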
5257
5258void init_idle_bootup_task(struct task_struct *idle)
5259{
5260 idle->sched_class = &idle_sched_class;
5261}
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271void init_idle(struct task_struct *idle, int cpu)
5272{
5273 struct rq *rq = cpu_rq(cpu);
5274 unsigned long flags;
5275
5276 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5277 raw_spin_lock(&rq->lock);
5278
5279 __sched_fork(0, idle);
5280 idle->state = TASK_RUNNING;
5281 idle->se.exec_start = sched_clock();
5282
5283 kasan_unpoison_task_stack(idle);
5284
5285#ifdef CONFIG_SMP
5286
5287
5288
5289
5290
5291
5292 set_cpus_allowed_common(idle, cpumask_of(cpu));
5293#endif
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304 rcu_read_lock();
5305 __set_task_cpu(idle, cpu);
5306 rcu_read_unlock();
5307
5308 rq->curr = rq->idle = idle;
5309 idle->on_rq = TASK_ON_RQ_QUEUED;
5310#ifdef CONFIG_SMP
5311 idle->on_cpu = 1;
5312#endif
5313 raw_spin_unlock(&rq->lock);
5314 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5315
5316
5317 init_idle_preempt_count(idle, cpu);
5318
5319
5320
5321
5322 idle->sched_class = &idle_sched_class;
5323 ftrace_graph_init_idle_task(idle, cpu);
5324 vtime_init_idle(idle, cpu);
5325#ifdef CONFIG_SMP
5326 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5327#endif
5328}
5329
5330int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5331 const struct cpumask *trial)
5332{
5333 int ret = 1, trial_cpus;
5334 struct dl_bw *cur_dl_b;
5335 unsigned long flags;
5336
5337 if (!cpumask_weight(cur))
5338 return ret;
5339
5340 rcu_read_lock_sched();
5341 cur_dl_b = dl_bw_of(cpumask_any(cur));
5342 trial_cpus = cpumask_weight(trial);
5343
5344 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
5345 if (cur_dl_b->bw != -1 &&
5346 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
5347 ret = 0;
5348 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
5349 rcu_read_unlock_sched();
5350
5351 return ret;
5352}
5353
5354int task_can_attach(struct task_struct *p,
5355 const struct cpumask *cs_cpus_allowed)
5356{
5357 int ret = 0;
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368 if (p->flags & PF_NO_SETAFFINITY) {
5369 ret = -EINVAL;
5370 goto out;
5371 }
5372
5373#ifdef CONFIG_SMP
5374 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5375 cs_cpus_allowed)) {
5376 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
5377 cs_cpus_allowed);
5378 struct dl_bw *dl_b;
5379 bool overflow;
5380 int cpus;
5381 unsigned long flags;
5382
5383 rcu_read_lock_sched();
5384 dl_b = dl_bw_of(dest_cpu);
5385 raw_spin_lock_irqsave(&dl_b->lock, flags);
5386 cpus = dl_bw_cpus(dest_cpu);
5387 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
5388 if (overflow)
5389 ret = -EBUSY;
5390 else {
5391
5392
5393
5394
5395
5396
5397 __dl_add(dl_b, p->dl.dl_bw);
5398 }
5399 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5400 rcu_read_unlock_sched();
5401
5402 }
5403#endif
5404out:
5405 return ret;
5406}
5407
5408#ifdef CONFIG_SMP
5409
5410static bool sched_smp_initialized __read_mostly;
5411
5412#ifdef CONFIG_NUMA_BALANCING
5413
5414int migrate_task_to(struct task_struct *p, int target_cpu)
5415{
5416 struct migration_arg arg = { p, target_cpu };
5417 int curr_cpu = task_cpu(p);
5418
5419 if (curr_cpu == target_cpu)
5420 return 0;
5421
5422 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
5423 return -EINVAL;
5424
5425
5426
5427 trace_sched_move_numa(p, curr_cpu, target_cpu);
5428 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5429}
5430
5431
5432
5433
5434
5435void sched_setnuma(struct task_struct *p, int nid)
5436{
5437 bool queued, running;
5438 struct rq_flags rf;
5439 struct rq *rq;
5440
5441 rq = task_rq_lock(p, &rf);
5442 queued = task_on_rq_queued(p);
5443 running = task_current(rq, p);
5444
5445 if (queued)
5446 dequeue_task(rq, p, DEQUEUE_SAVE);
5447 if (running)
5448 put_prev_task(rq, p);
5449
5450 p->numa_preferred_nid = nid;
5451
5452 if (queued)
5453 enqueue_task(rq, p, ENQUEUE_RESTORE);
5454 if (running)
5455 set_curr_task(rq, p);
5456 task_rq_unlock(rq, p, &rf);
5457}
5458#endif
5459
5460#ifdef CONFIG_HOTPLUG_CPU
5461
5462
5463
5464
5465void idle_task_exit(void)
5466{
5467 struct mm_struct *mm = current->active_mm;
5468
5469 BUG_ON(cpu_online(smp_processor_id()));
5470
5471 if (mm != &init_mm) {
5472 switch_mm_irqs_off(mm, &init_mm, current);
5473 finish_arch_post_lock_switch();
5474 }
5475 mmdrop(mm);
5476}
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487static void calc_load_migrate(struct rq *rq)
5488{
5489 long delta = calc_load_fold_active(rq, 1);
5490 if (delta)
5491 atomic_long_add(delta, &calc_load_tasks);
5492}
5493
5494static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5495{
5496}
5497
5498static const struct sched_class fake_sched_class = {
5499 .put_prev_task = put_prev_task_fake,
5500};
5501
5502static struct task_struct fake_task = {
5503
5504
5505
5506 .prio = MAX_PRIO + 1,
5507 .sched_class = &fake_sched_class,
5508};
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518static void migrate_tasks(struct rq *dead_rq)
5519{
5520 struct rq *rq = dead_rq;
5521 struct task_struct *next, *stop = rq->stop;
5522 struct pin_cookie cookie;
5523 int dest_cpu;
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534 rq->stop = NULL;
5535
5536
5537
5538
5539
5540
5541 update_rq_clock(rq);
5542
5543 for (;;) {
5544
5545
5546
5547
5548 if (rq->nr_running == 1)
5549 break;
5550
5551
5552
5553
5554 cookie = lockdep_pin_lock(&rq->lock);
5555 next = pick_next_task(rq, &fake_task, cookie);
5556 BUG_ON(!next);
5557 next->sched_class->put_prev_task(rq, next);
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568 lockdep_unpin_lock(&rq->lock, cookie);
5569 raw_spin_unlock(&rq->lock);
5570 raw_spin_lock(&next->pi_lock);
5571 raw_spin_lock(&rq->lock);
5572
5573
5574
5575
5576
5577
5578 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5579 raw_spin_unlock(&next->pi_lock);
5580 continue;
5581 }
5582
5583
5584 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5585
5586 rq = __migrate_task(rq, next, dest_cpu);
5587 if (rq != dead_rq) {
5588 raw_spin_unlock(&rq->lock);
5589 rq = dead_rq;
5590 raw_spin_lock(&rq->lock);
5591 }
5592 raw_spin_unlock(&next->pi_lock);
5593 }
5594
5595 rq->stop = stop;
5596}
5597#endif
5598
5599static void set_rq_online(struct rq *rq)
5600{
5601 if (!rq->online) {
5602 const struct sched_class *class;
5603
5604 cpumask_set_cpu(rq->cpu, rq->rd->online);
5605 rq->online = 1;
5606
5607 for_each_class(class) {
5608 if (class->rq_online)
5609 class->rq_online(rq);
5610 }
5611 }
5612}
5613
5614static void set_rq_offline(struct rq *rq)
5615{
5616 if (rq->online) {
5617 const struct sched_class *class;
5618
5619 for_each_class(class) {
5620 if (class->rq_offline)
5621 class->rq_offline(rq);
5622 }
5623
5624 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5625 rq->online = 0;
5626 }
5627}
5628
5629static void set_cpu_rq_start_time(unsigned int cpu)
5630{
5631 struct rq *rq = cpu_rq(cpu);
5632
5633 rq->age_stamp = sched_clock_cpu(cpu);
5634}
5635
5636static cpumask_var_t sched_domains_tmpmask;
5637
5638#ifdef CONFIG_SCHED_DEBUG
5639
5640static __read_mostly int sched_debug_enabled;
5641
5642static int __init sched_debug_setup(char *str)
5643{
5644 sched_debug_enabled = 1;
5645
5646 return 0;
5647}
5648early_param("sched_debug", sched_debug_setup);
5649
5650static inline bool sched_debug(void)
5651{
5652 return sched_debug_enabled;
5653}
5654
5655static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5656 struct cpumask *groupmask)
5657{
5658 struct sched_group *group = sd->groups;
5659
5660 cpumask_clear(groupmask);
5661
5662 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5663
5664 if (!(sd->flags & SD_LOAD_BALANCE)) {
5665 printk("does not load-balance\n");
5666 if (sd->parent)
5667 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5668 " has parent");
5669 return -1;
5670 }
5671
5672 printk(KERN_CONT "span %*pbl level %s\n",
5673 cpumask_pr_args(sched_domain_span(sd)), sd->name);
5674
5675 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5676 printk(KERN_ERR "ERROR: domain->span does not contain "
5677 "CPU%d\n", cpu);
5678 }
5679 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5680 printk(KERN_ERR "ERROR: domain->groups does not contain"
5681 " CPU%d\n", cpu);
5682 }
5683
5684 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5685 do {
5686 if (!group) {
5687 printk("\n");
5688 printk(KERN_ERR "ERROR: group is NULL\n");
5689 break;
5690 }
5691
5692 if (!cpumask_weight(sched_group_cpus(group))) {
5693 printk(KERN_CONT "\n");
5694 printk(KERN_ERR "ERROR: empty group\n");
5695 break;
5696 }
5697
5698 if (!(sd->flags & SD_OVERLAP) &&
5699 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5700 printk(KERN_CONT "\n");
5701 printk(KERN_ERR "ERROR: repeated CPUs\n");
5702 break;
5703 }
5704
5705 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5706
5707 printk(KERN_CONT " %*pbl",
5708 cpumask_pr_args(sched_group_cpus(group)));
5709 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5710 printk(KERN_CONT " (cpu_capacity = %d)",
5711 group->sgc->capacity);
5712 }
5713
5714 group = group->next;
5715 } while (group != sd->groups);
5716 printk(KERN_CONT "\n");
5717
5718 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5719 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5720
5721 if (sd->parent &&
5722 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5723 printk(KERN_ERR "ERROR: parent span is not a superset "
5724 "of domain->span\n");
5725 return 0;
5726}
5727
5728static void sched_domain_debug(struct sched_domain *sd, int cpu)
5729{
5730 int level = 0;
5731
5732 if (!sched_debug_enabled)
5733 return;
5734
5735 if (!sd) {
5736 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5737 return;
5738 }
5739
5740 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5741
5742 for (;;) {
5743 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5744 break;
5745 level++;
5746 sd = sd->parent;
5747 if (!sd)
5748 break;
5749 }
5750}
5751#else
5752
5753# define sched_debug_enabled 0
5754# define sched_domain_debug(sd, cpu) do { } while (0)
5755static inline bool sched_debug(void)
5756{
5757 return false;
5758}
5759#endif
5760
5761static int sd_degenerate(struct sched_domain *sd)
5762{
5763 if (cpumask_weight(sched_domain_span(sd)) == 1)
5764 return 1;
5765
5766
5767 if (sd->flags & (SD_LOAD_BALANCE |
5768 SD_BALANCE_NEWIDLE |
5769 SD_BALANCE_FORK |
5770 SD_BALANCE_EXEC |
5771 SD_SHARE_CPUCAPACITY |
5772 SD_ASYM_CPUCAPACITY |
5773 SD_SHARE_PKG_RESOURCES |
5774 SD_SHARE_POWERDOMAIN)) {
5775 if (sd->groups != sd->groups->next)
5776 return 0;
5777 }
5778
5779
5780 if (sd->flags & (SD_WAKE_AFFINE))
5781 return 0;
5782
5783 return 1;
5784}
5785
5786static int
5787sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5788{
5789 unsigned long cflags = sd->flags, pflags = parent->flags;
5790
5791 if (sd_degenerate(parent))
5792 return 1;
5793
5794 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5795 return 0;
5796
5797
5798 if (parent->groups == parent->groups->next) {
5799 pflags &= ~(SD_LOAD_BALANCE |
5800 SD_BALANCE_NEWIDLE |
5801 SD_BALANCE_FORK |
5802 SD_BALANCE_EXEC |
5803 SD_ASYM_CPUCAPACITY |
5804 SD_SHARE_CPUCAPACITY |
5805 SD_SHARE_PKG_RESOURCES |
5806 SD_PREFER_SIBLING |
5807 SD_SHARE_POWERDOMAIN);
5808 if (nr_node_ids == 1)
5809 pflags &= ~SD_SERIALIZE;
5810 }
5811 if (~cflags & pflags)
5812 return 0;
5813
5814 return 1;
5815}
5816
5817static void free_rootdomain(struct rcu_head *rcu)
5818{
5819 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5820
5821 cpupri_cleanup(&rd->cpupri);
5822 cpudl_cleanup(&rd->cpudl);
5823 free_cpumask_var(rd->dlo_mask);
5824 free_cpumask_var(rd->rto_mask);
5825 free_cpumask_var(rd->online);
5826 free_cpumask_var(rd->span);
5827 kfree(rd);
5828}
5829
5830static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5831{
5832 struct root_domain *old_rd = NULL;
5833 unsigned long flags;
5834
5835 raw_spin_lock_irqsave(&rq->lock, flags);
5836
5837 if (rq->rd) {
5838 old_rd = rq->rd;
5839
5840 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5841 set_rq_offline(rq);
5842
5843 cpumask_clear_cpu(rq->cpu, old_rd->span);
5844
5845
5846
5847
5848
5849
5850 if (!atomic_dec_and_test(&old_rd->refcount))
5851 old_rd = NULL;
5852 }
5853
5854 atomic_inc(&rd->refcount);
5855 rq->rd = rd;
5856
5857 cpumask_set_cpu(rq->cpu, rd->span);
5858 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5859 set_rq_online(rq);
5860
5861 raw_spin_unlock_irqrestore(&rq->lock, flags);
5862
5863 if (old_rd)
5864 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5865}
5866
5867static int init_rootdomain(struct root_domain *rd)
5868{
5869 memset(rd, 0, sizeof(*rd));
5870
5871 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
5872 goto out;
5873 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
5874 goto free_span;
5875 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5876 goto free_online;
5877 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5878 goto free_dlo_mask;
5879
5880 init_dl_bw(&rd->dl_bw);
5881 if (cpudl_init(&rd->cpudl) != 0)
5882 goto free_dlo_mask;
5883
5884 if (cpupri_init(&rd->cpupri) != 0)
5885 goto free_rto_mask;
5886 return 0;
5887
5888free_rto_mask:
5889 free_cpumask_var(rd->rto_mask);
5890free_dlo_mask:
5891 free_cpumask_var(rd->dlo_mask);
5892free_online:
5893 free_cpumask_var(rd->online);
5894free_span:
5895 free_cpumask_var(rd->span);
5896out:
5897 return -ENOMEM;
5898}
5899
5900
5901
5902
5903
5904struct root_domain def_root_domain;
5905
5906static void init_defrootdomain(void)
5907{
5908 init_rootdomain(&def_root_domain);
5909
5910 atomic_set(&def_root_domain.refcount, 1);
5911}
5912
5913static struct root_domain *alloc_rootdomain(void)
5914{
5915 struct root_domain *rd;
5916
5917 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5918 if (!rd)
5919 return NULL;
5920
5921 if (init_rootdomain(rd) != 0) {
5922 kfree(rd);
5923 return NULL;
5924 }
5925
5926 return rd;
5927}
5928
5929static void free_sched_groups(struct sched_group *sg, int free_sgc)
5930{
5931 struct sched_group *tmp, *first;
5932
5933 if (!sg)
5934 return;
5935
5936 first = sg;
5937 do {
5938 tmp = sg->next;
5939
5940 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
5941 kfree(sg->sgc);
5942
5943 kfree(sg);
5944 sg = tmp;
5945 } while (sg != first);
5946}
5947
5948static void destroy_sched_domain(struct sched_domain *sd)
5949{
5950
5951
5952
5953
5954 if (sd->flags & SD_OVERLAP) {
5955 free_sched_groups(sd->groups, 1);
5956 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5957 kfree(sd->groups->sgc);
5958 kfree(sd->groups);
5959 }
5960 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
5961 kfree(sd->shared);
5962 kfree(sd);
5963}
5964
5965static void destroy_sched_domains_rcu(struct rcu_head *rcu)
5966{
5967 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5968
5969 while (sd) {
5970 struct sched_domain *parent = sd->parent;
5971 destroy_sched_domain(sd);
5972 sd = parent;
5973 }
5974}
5975
5976static void destroy_sched_domains(struct sched_domain *sd)
5977{
5978 if (sd)
5979 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
5980}
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5992DEFINE_PER_CPU(int, sd_llc_size);
5993DEFINE_PER_CPU(int, sd_llc_id);
5994DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
5995DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5996DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5997
5998static void update_top_cache_domain(int cpu)
5999{
6000 struct sched_domain_shared *sds = NULL;
6001 struct sched_domain *sd;
6002 int id = cpu;
6003 int size = 1;
6004
6005 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6006 if (sd) {
6007 id = cpumask_first(sched_domain_span(sd));
6008 size = cpumask_weight(sched_domain_span(sd));
6009 sds = sd->shared;
6010 }
6011
6012 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6013 per_cpu(sd_llc_size, cpu) = size;
6014 per_cpu(sd_llc_id, cpu) = id;
6015 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
6016
6017 sd = lowest_flag_domain(cpu, SD_NUMA);
6018 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
6019
6020 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
6021 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
6022}
6023
6024
6025
6026
6027
6028static void
6029cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6030{
6031 struct rq *rq = cpu_rq(cpu);
6032 struct sched_domain *tmp;
6033
6034
6035 for (tmp = sd; tmp; ) {
6036 struct sched_domain *parent = tmp->parent;
6037 if (!parent)
6038 break;
6039
6040 if (sd_parent_degenerate(tmp, parent)) {
6041 tmp->parent = parent->parent;
6042 if (parent->parent)
6043 parent->parent->child = tmp;
6044
6045
6046
6047
6048
6049 if (parent->flags & SD_PREFER_SIBLING)
6050 tmp->flags |= SD_PREFER_SIBLING;
6051 destroy_sched_domain(parent);
6052 } else
6053 tmp = tmp->parent;
6054 }
6055
6056 if (sd && sd_degenerate(sd)) {
6057 tmp = sd;
6058 sd = sd->parent;
6059 destroy_sched_domain(tmp);
6060 if (sd)
6061 sd->child = NULL;
6062 }
6063
6064 sched_domain_debug(sd, cpu);
6065
6066 rq_attach_root(rq, rd);
6067 tmp = rq->sd;
6068 rcu_assign_pointer(rq->sd, sd);
6069 destroy_sched_domains(tmp);
6070
6071 update_top_cache_domain(cpu);
6072}
6073
6074
6075static int __init isolated_cpu_setup(char *str)
6076{
6077 int ret;
6078
6079 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6080 ret = cpulist_parse(str, cpu_isolated_map);
6081 if (ret) {
6082 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
6083 return 0;
6084 }
6085 return 1;
6086}
6087__setup("isolcpus=", isolated_cpu_setup);
6088
6089struct s_data {
6090 struct sched_domain ** __percpu sd;
6091 struct root_domain *rd;
6092};
6093
6094enum s_alloc {
6095 sa_rootdomain,
6096 sa_sd,
6097 sa_sd_storage,
6098 sa_none,
6099};
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
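/*
 * Build the mask of an overlapping (SD_OVERLAP) group: the CPUs of the
 * domain span that are actually covered by their own sibling domain at
 * this topology level.  group_balance_cpu() picks the group's balance CPU
 * from this mask.
 */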
6114static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6115{
6116 const struct cpumask *span = sched_domain_span(sd);
6117 struct sd_data *sdd = sd->private;
6118 struct sched_domain *sibling;
6119 int i;
6120
6121 for_each_cpu(i, span) {
6122 sibling = *per_cpu_ptr(sdd->sd, i);
6123 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6124 continue;
6125
6126 cpumask_set_cpu(i, sched_group_mask(sg));
6127 }
6128}
6129
6130
6131
6132
6133
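/*
 * Return the canonical balance CPU of a group: the first CPU that is both
 * in the group's CPU mask and in its group mask (see build_group_mask()).
 */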
6134int group_balance_cpu(struct sched_group *sg)
6135{
6136 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6137}
6138
6139static int
6140build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6141{
6142 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6143 const struct cpumask *span = sched_domain_span(sd);
6144 struct cpumask *covered = sched_domains_tmpmask;
6145 struct sd_data *sdd = sd->private;
6146 struct sched_domain *sibling;
6147 int i;
6148
6149 cpumask_clear(covered);
6150
6151 for_each_cpu(i, span) {
6152 struct cpumask *sg_span;
6153
6154 if (cpumask_test_cpu(i, covered))
6155 continue;
6156
6157 sibling = *per_cpu_ptr(sdd->sd, i);
6158
6159
6160 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6161 continue;
6162
6163 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6164 GFP_KERNEL, cpu_to_node(cpu));
6165
6166 if (!sg)
6167 goto fail;
6168
6169 sg_span = sched_group_cpus(sg);
6170 if (sibling->child)
6171 cpumask_copy(sg_span, sched_domain_span(sibling->child));
6172 else
6173 cpumask_set_cpu(i, sg_span);
6174
6175 cpumask_or(covered, covered, sg_span);
6176
6177 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
6178 if (atomic_inc_return(&sg->sgc->ref) == 1)
6179 build_group_mask(sd, sg);
6180
6181
6182
6183
6184
6185
6186 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
6187
6188
6189
6190
6191
6192
6193 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6194 group_balance_cpu(sg) == cpu)
6195 groups = sg;
6196
6197 if (!first)
6198 first = sg;
6199 if (last)
6200 last->next = sg;
6201 last = sg;
6202 last->next = first;
6203 }
6204 sd->groups = groups;
6205
6206 return 0;
6207
6208fail:
6209 free_sched_groups(first, 0);
6210
6211 return -ENOMEM;
6212}
6213
6214static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6215{
6216 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6217 struct sched_domain *child = sd->child;
6218
6219 if (child)
6220 cpu = cpumask_first(sched_domain_span(child));
6221
6222 if (sg) {
6223 *sg = *per_cpu_ptr(sdd->sg, cpu);
6224 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
6225 atomic_set(&(*sg)->sgc->ref, 1);
6226 }
6227
6228 return cpu;
6229}
6230
6231
6232
6233
6234
6235
6236
6237
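/*
 * Build the circular list of sched groups covering this domain's span.
 * Only the first CPU of the span constructs the list (using the shared
 * sched_domains_tmpmask under sched_domains_mutex); every other CPU just
 * takes a reference on its per-CPU group via get_group().
 */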
6238static int
6239build_sched_groups(struct sched_domain *sd, int cpu)
6240{
6241 struct sched_group *first = NULL, *last = NULL;
6242 struct sd_data *sdd = sd->private;
6243 const struct cpumask *span = sched_domain_span(sd);
6244 struct cpumask *covered;
6245 int i;
6246
6247 get_group(cpu, sdd, &sd->groups);
6248 atomic_inc(&sd->groups->ref);
6249
6250 if (cpu != cpumask_first(span))
6251 return 0;
6252
6253 lockdep_assert_held(&sched_domains_mutex);
6254 covered = sched_domains_tmpmask;
6255
6256 cpumask_clear(covered);
6257
6258 for_each_cpu(i, span) {
6259 struct sched_group *sg;
6260 int group, j;
6261
6262 if (cpumask_test_cpu(i, covered))
6263 continue;
6264
6265 group = get_group(i, sdd, &sg);
6266 cpumask_setall(sched_group_mask(sg));
6267
6268 for_each_cpu(j, span) {
6269 if (get_group(j, sdd, NULL) != group)
6270 continue;
6271
6272 cpumask_set_cpu(j, covered);
6273 cpumask_set_cpu(j, sched_group_cpus(sg));
6274 }
6275
6276 if (!first)
6277 first = sg;
6278 if (last)
6279 last->next = sg;
6280 last = sg;
6281 }
6282 last->next = first;
6283
6284 return 0;
6285}
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
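/*
 * Set each group's weight to the number of CPUs it spans and, on the
 * balance CPU of the domain's first group only, recompute the group
 * capacity.
 */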
6297static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6298{
6299 struct sched_group *sg = sd->groups;
6300
6301 WARN_ON(!sg);
6302
6303 do {
6304 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6305 sg = sg->next;
6306 } while (sg != sd->groups);
6307
6308 if (cpu != group_balance_cpu(sg))
6309 return;
6310
6311 update_group_capacity(sd, cpu);
6312}
6313
6314
6315
6316
6317
6318
6319static int default_relax_domain_level = -1;
6320int sched_domain_level_max;
6321
6322static int __init setup_relax_domain_level(char *str)
6323{
6324 if (kstrtoint(str, 0, &default_relax_domain_level))
6325 pr_warn("Unable to set relax_domain_level\n");
6326
6327 return 1;
6328}
6329__setup("relax_domain_level=", setup_relax_domain_level);
6330
6331static void set_domain_attribute(struct sched_domain *sd,
6332 struct sched_domain_attr *attr)
6333{
6334 int request;
6335
6336 if (!attr || attr->relax_domain_level < 0) {
6337 if (default_relax_domain_level < 0)
6338 return;
6339 else
6340 request = default_relax_domain_level;
6341 } else
6342 request = attr->relax_domain_level;
	if (request < sd->level) {
		/* Turn off idle balance on this domain: */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	} else {
		/* Turn on idle balance on this domain: */
		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
6350}
6351
6352static void __sdt_free(const struct cpumask *cpu_map);
6353static int __sdt_alloc(const struct cpumask *cpu_map);
6354
6355static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6356 const struct cpumask *cpu_map)
6357{
6358 switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu); /* fall through */
	case sa_sd:
		free_percpu(d->sd); /* fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map); /* fall through */
6366 case sa_none:
6367 break;
6368 }
6369}
6370
6371static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6372 const struct cpumask *cpu_map)
6373{
6374 memset(d, 0, sizeof(*d));
6375
6376 if (__sdt_alloc(cpu_map))
6377 return sa_sd_storage;
6378 d->sd = alloc_percpu(struct sched_domain *);
6379 if (!d->sd)
6380 return sa_sd_storage;
6381 d->rd = alloc_rootdomain();
6382 if (!d->rd)
6383 return sa_sd;
6384 return sa_rootdomain;
6385}
6386
6387
6388
6389
6390
6391
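/*
 * NULL out the sd_data slots that the constructed topology now references
 * (the domain itself plus any shared data, group and group capacity that
 * picked up a reference) so that __sdt_free() does not free them.
 */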
6392static void claim_allocations(int cpu, struct sched_domain *sd)
6393{
6394 struct sd_data *sdd = sd->private;
6395
6396 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6397 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6398
6399 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
6400 *per_cpu_ptr(sdd->sds, cpu) = NULL;
6401
6402 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6403 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6404
6405 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
6406 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
6407}
6408
6409#ifdef CONFIG_NUMA
6410static int sched_domains_numa_levels;
6411enum numa_topology_type sched_numa_topology_type;
6412static int *sched_domains_numa_distance;
6413int sched_max_numa_distance;
6414static struct cpumask ***sched_domains_numa_masks;
6415static int sched_domains_curr_level;
6416#endif
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
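/*
 * Topology flags a level's ->sd_flags() callback may return; anything
 * else trips the WARN_ONCE() in sd_init() and is masked out.
 */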
6436#define TOPOLOGY_SD_FLAGS \
6437 (SD_SHARE_CPUCAPACITY | \
6438 SD_SHARE_PKG_RESOURCES | \
6439 SD_NUMA | \
6440 SD_ASYM_PACKING | \
6441 SD_ASYM_CPUCAPACITY | \
6442 SD_SHARE_POWERDOMAIN)
6443
6444static struct sched_domain *
6445sd_init(struct sched_domain_topology_level *tl,
6446 const struct cpumask *cpu_map,
6447 struct sched_domain *child, int cpu)
6448{
6449 struct sd_data *sdd = &tl->data;
6450 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6451 int sd_id, sd_weight, sd_flags = 0;
6452
6453#ifdef CONFIG_NUMA
6454
6455
6456
6457 sched_domains_curr_level = tl->numa_level;
6458#endif
6459
6460 sd_weight = cpumask_weight(tl->mask(cpu));
6461
6462 if (tl->sd_flags)
6463 sd_flags = (*tl->sd_flags)();
6464 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6465 "wrong sd_flags in topology description\n"))
6466 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6467
6468 *sd = (struct sched_domain){
6469 .min_interval = sd_weight,
6470 .max_interval = 2*sd_weight,
6471 .busy_factor = 32,
6472 .imbalance_pct = 125,
6473
6474 .cache_nice_tries = 0,
6475 .busy_idx = 0,
6476 .idle_idx = 0,
6477 .newidle_idx = 0,
6478 .wake_idx = 0,
6479 .forkexec_idx = 0,
6480
6481 .flags = 1*SD_LOAD_BALANCE
6482 | 1*SD_BALANCE_NEWIDLE
6483 | 1*SD_BALANCE_EXEC
6484 | 1*SD_BALANCE_FORK
6485 | 0*SD_BALANCE_WAKE
6486 | 1*SD_WAKE_AFFINE
6487 | 0*SD_SHARE_CPUCAPACITY
6488 | 0*SD_SHARE_PKG_RESOURCES
6489 | 0*SD_SERIALIZE
6490 | 0*SD_PREFER_SIBLING
6491 | 0*SD_NUMA
6492 | sd_flags
6493 ,
6494
6495 .last_balance = jiffies,
6496 .balance_interval = sd_weight,
6497 .smt_gain = 0,
6498 .max_newidle_lb_cost = 0,
6499 .next_decay_max_lb_cost = jiffies,
6500 .child = child,
6501#ifdef CONFIG_SCHED_DEBUG
6502 .name = tl->name,
6503#endif
6504 };
6505
6506 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6507 sd_id = cpumask_first(sched_domain_span(sd));
6508
6509
6510
6511
6512
6513 if (sd->flags & SD_ASYM_CPUCAPACITY) {
6514 struct sched_domain *t = sd;
6515
6516 for_each_lower_domain(t)
6517 t->flags |= SD_BALANCE_WAKE;
6518 }
6519
6520 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6521 sd->flags |= SD_PREFER_SIBLING;
6522 sd->imbalance_pct = 110;
6523 sd->smt_gain = 1178;
6524
6525 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6526 sd->imbalance_pct = 117;
6527 sd->cache_nice_tries = 1;
6528 sd->busy_idx = 2;
6529
6530#ifdef CONFIG_NUMA
6531 } else if (sd->flags & SD_NUMA) {
6532 sd->cache_nice_tries = 2;
6533 sd->busy_idx = 3;
6534 sd->idle_idx = 2;
6535
6536 sd->flags |= SD_SERIALIZE;
6537 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6538 sd->flags &= ~(SD_BALANCE_EXEC |
6539 SD_BALANCE_FORK |
6540 SD_WAKE_AFFINE);
6541 }
6542
6543#endif
6544 } else {
6545 sd->flags |= SD_PREFER_SIBLING;
6546 sd->cache_nice_tries = 1;
6547 sd->busy_idx = 2;
6548 sd->idle_idx = 1;
6549 }
6550
6551
6552
6553
6554
6555 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6556 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
6557 atomic_inc(&sd->shared->ref);
6558 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
6559 }
6560
6561 sd->private = sdd;
6562
6563 return sd;
6564}
6565
6566
6567
6568
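/*
 * Default topology, ordered bottom-up: SMT siblings, cores of one package
 * (MC), then the DIE level spanning the whole package/node.
 */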
6569static struct sched_domain_topology_level default_topology[] = {
6570#ifdef CONFIG_SCHED_SMT
6571 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6572#endif
6573#ifdef CONFIG_SCHED_MC
6574 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6575#endif
6576 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6577 { NULL, },
6578};
6579
6580static struct sched_domain_topology_level *sched_domain_topology =
6581 default_topology;
6582
6583#define for_each_sd_topology(tl) \
6584 for (tl = sched_domain_topology; tl->mask; tl++)
6585
6586void set_sched_topology(struct sched_domain_topology_level *tl)
6587{
6588 if (WARN_ON_ONCE(sched_smp_initialized))
6589 return;
6590
6591 sched_domain_topology = tl;
6592}
6593
6594#ifdef CONFIG_NUMA
6595
6596static const struct cpumask *sd_numa_mask(int cpu)
6597{
6598 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6599}
6600
6601static void sched_numa_warn(const char *str)
6602{
6603 static int done = false;
	int i, j;
6605
6606 if (done)
6607 return;
6608
6609 done = true;
6610
6611 printk(KERN_WARNING "ERROR: %s\n\n", str);
6612
6613 for (i = 0; i < nr_node_ids; i++) {
6614 printk(KERN_WARNING " ");
6615 for (j = 0; j < nr_node_ids; j++)
			printk(KERN_CONT "%02d ", node_distance(i, j));
6617 printk(KERN_CONT "\n");
6618 }
6619 printk(KERN_WARNING "\n");
6620}
6621
6622bool find_numa_distance(int distance)
6623{
6624 int i;
6625
6626 if (distance == node_distance(0, 0))
6627 return true;
6628
6629 for (i = 0; i < sched_domains_numa_levels; i++) {
6630 if (sched_domains_numa_distance[i] == distance)
6631 return true;
6632 }
6633
6634 return false;
6635}
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
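/*
 * Classify the NUMA topology: a single distance level is NUMA_DIRECT; if
 * two maximally distant nodes are bridged by an intermediate node it is
 * NUMA_GLUELESS_MESH; otherwise NUMA_BACKPLANE.
 */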
6656static void init_numa_topology_type(void)
6657{
6658 int a, b, c, n;
6659
6660 n = sched_max_numa_distance;
6661
6662 if (sched_domains_numa_levels <= 1) {
6663 sched_numa_topology_type = NUMA_DIRECT;
6664 return;
6665 }
6666
6667 for_each_online_node(a) {
6668 for_each_online_node(b) {
6669
6670 if (node_distance(a, b) < n)
6671 continue;
6672
6673
6674 for_each_online_node(c) {
6675 if (node_distance(a, c) < n &&
6676 node_distance(b, c) < n) {
6677 sched_numa_topology_type =
6678 NUMA_GLUELESS_MESH;
6679 return;
6680 }
6681 }
6682
6683 sched_numa_topology_type = NUMA_BACKPLANE;
6684 return;
6685 }
6686 }
6687}
6688
6689static void sched_init_numa(void)
6690{
6691 int next_distance, curr_distance = node_distance(0, 0);
6692 struct sched_domain_topology_level *tl;
6693 int level = 0;
6694 int i, j, k;
6695
6696 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6697 if (!sched_domains_numa_distance)
6698 return;
6699
6700
6701
6702
6703
6704
6705
6706
6707 next_distance = curr_distance;
6708 for (i = 0; i < nr_node_ids; i++) {
6709 for (j = 0; j < nr_node_ids; j++) {
6710 for (k = 0; k < nr_node_ids; k++) {
6711 int distance = node_distance(i, k);
6712
6713 if (distance > curr_distance &&
6714 (distance < next_distance ||
6715 next_distance == curr_distance))
6716 next_distance = distance;
6717
6718
6719
6720
6721
6722
6723 if (sched_debug() && node_distance(k, i) != distance)
6724 sched_numa_warn("Node-distance not symmetric");
6725
6726 if (sched_debug() && i && !find_numa_distance(distance))
6727 sched_numa_warn("Node-0 not representative");
6728 }
6729 if (next_distance != curr_distance) {
6730 sched_domains_numa_distance[level++] = next_distance;
6731 sched_domains_numa_levels = level;
6732 curr_distance = next_distance;
6733 } else break;
6734 }
6735
6736
6737
6738
6739 if (!sched_debug())
6740 break;
6741 }
6742
6743 if (!level)
6744 return;
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
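	/*
	 * 'level' now holds the number of unique node distances.  Keep
	 * sched_domains_numa_levels at zero until the masks below are fully
	 * allocated, so the hotplug helpers sched_domains_numa_masks_set()/
	 * _clear() never index a partially built array; it is restored to
	 * 'level' at the end.
	 */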
6763 sched_domains_numa_levels = 0;
6764
6765 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6766 if (!sched_domains_numa_masks)
6767 return;
6768
6769
6770
6771
6772
6773 for (i = 0; i < level; i++) {
6774 sched_domains_numa_masks[i] =
6775 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6776 if (!sched_domains_numa_masks[i])
6777 return;
6778
6779 for (j = 0; j < nr_node_ids; j++) {
6780 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6781 if (!mask)
6782 return;
6783
6784 sched_domains_numa_masks[i][j] = mask;
6785
6786 for_each_node(k) {
6787 if (node_distance(j, k) > sched_domains_numa_distance[i])
6788 continue;
6789
6790 cpumask_or(mask, mask, cpumask_of_node(k));
6791 }
6792 }
6793 }
6794
6795
6796 for (i = 0; sched_domain_topology[i].mask; i++);
6797
6798 tl = kzalloc((i + level + 1) *
6799 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6800 if (!tl)
6801 return;
6802
6803
6804
6805
6806 for (i = 0; sched_domain_topology[i].mask; i++)
6807 tl[i] = sched_domain_topology[i];
6808
6809
6810
6811
6812 for (j = 0; j < level; i++, j++) {
6813 tl[i] = (struct sched_domain_topology_level){
6814 .mask = sd_numa_mask,
6815 .sd_flags = cpu_numa_flags,
6816 .flags = SDTL_OVERLAP,
6817 .numa_level = j,
6818 SD_INIT_NAME(NUMA)
6819 };
6820 }
6821
6822 sched_domain_topology = tl;
6823
6824 sched_domains_numa_levels = level;
6825 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6826
6827 init_numa_topology_type();
6828}
6829
6830static void sched_domains_numa_masks_set(unsigned int cpu)
6831{
6832 int node = cpu_to_node(cpu);
6833 int i, j;
6834
6835 for (i = 0; i < sched_domains_numa_levels; i++) {
6836 for (j = 0; j < nr_node_ids; j++) {
6837 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6838 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6839 }
6840 }
6841}
6842
6843static void sched_domains_numa_masks_clear(unsigned int cpu)
6844{
6845 int i, j;
6846
6847 for (i = 0; i < sched_domains_numa_levels; i++) {
6848 for (j = 0; j < nr_node_ids; j++)
6849 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6850 }
6851}
6852
6853#else
6854static inline void sched_init_numa(void) { }
6855static void sched_domains_numa_masks_set(unsigned int cpu) { }
6856static void sched_domains_numa_masks_clear(unsigned int cpu) { }
6857#endif
6858
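/*
 * Allocate, for every topology level and every CPU in the map, the backing
 * sched_domain, sched_domain_shared, sched_group and sched_group_capacity
 * objects; __sdt_free() below is the matching teardown path.
 */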
6859static int __sdt_alloc(const struct cpumask *cpu_map)
6860{
6861 struct sched_domain_topology_level *tl;
6862 int j;
6863
6864 for_each_sd_topology(tl) {
6865 struct sd_data *sdd = &tl->data;
6866
6867 sdd->sd = alloc_percpu(struct sched_domain *);
6868 if (!sdd->sd)
6869 return -ENOMEM;
6870
6871 sdd->sds = alloc_percpu(struct sched_domain_shared *);
6872 if (!sdd->sds)
6873 return -ENOMEM;
6874
6875 sdd->sg = alloc_percpu(struct sched_group *);
6876 if (!sdd->sg)
6877 return -ENOMEM;
6878
6879 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
6880 if (!sdd->sgc)
6881 return -ENOMEM;
6882
6883 for_each_cpu(j, cpu_map) {
6884 struct sched_domain *sd;
6885 struct sched_domain_shared *sds;
6886 struct sched_group *sg;
6887 struct sched_group_capacity *sgc;
6888
6889 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6890 GFP_KERNEL, cpu_to_node(j));
6891 if (!sd)
6892 return -ENOMEM;
6893
6894 *per_cpu_ptr(sdd->sd, j) = sd;
6895
6896 sds = kzalloc_node(sizeof(struct sched_domain_shared),
6897 GFP_KERNEL, cpu_to_node(j));
6898 if (!sds)
6899 return -ENOMEM;
6900
6901 *per_cpu_ptr(sdd->sds, j) = sds;
6902
6903 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6904 GFP_KERNEL, cpu_to_node(j));
6905 if (!sg)
6906 return -ENOMEM;
6907
6908 sg->next = sg;
6909
6910 *per_cpu_ptr(sdd->sg, j) = sg;
6911
6912 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
6913 GFP_KERNEL, cpu_to_node(j));
6914 if (!sgc)
6915 return -ENOMEM;
6916
6917 *per_cpu_ptr(sdd->sgc, j) = sgc;
6918 }
6919 }
6920
6921 return 0;
6922}
6923
6924static void __sdt_free(const struct cpumask *cpu_map)
6925{
6926 struct sched_domain_topology_level *tl;
6927 int j;
6928
6929 for_each_sd_topology(tl) {
6930 struct sd_data *sdd = &tl->data;
6931
6932 for_each_cpu(j, cpu_map) {
6933 struct sched_domain *sd;
6934
6935 if (sdd->sd) {
6936 sd = *per_cpu_ptr(sdd->sd, j);
6937 if (sd && (sd->flags & SD_OVERLAP))
6938 free_sched_groups(sd->groups, 0);
6939 kfree(*per_cpu_ptr(sdd->sd, j));
6940 }
6941
6942 if (sdd->sds)
6943 kfree(*per_cpu_ptr(sdd->sds, j));
6944 if (sdd->sg)
6945 kfree(*per_cpu_ptr(sdd->sg, j));
6946 if (sdd->sgc)
6947 kfree(*per_cpu_ptr(sdd->sgc, j));
6948 }
6949 free_percpu(sdd->sd);
6950 sdd->sd = NULL;
6951 free_percpu(sdd->sds);
6952 sdd->sds = NULL;
6953 free_percpu(sdd->sg);
6954 sdd->sg = NULL;
6955 free_percpu(sdd->sgc);
6956 sdd->sgc = NULL;
6957 }
6958}
6959
6960struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6961 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6962 struct sched_domain *child, int cpu)
6963{
6964 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
6965
6966 if (child) {
6967 sd->level = child->level + 1;
6968 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6969 child->parent = sd;
6970
6971 if (!cpumask_subset(sched_domain_span(child),
6972 sched_domain_span(sd))) {
6973 pr_err("BUG: arch topology borken\n");
6974#ifdef CONFIG_SCHED_DEBUG
6975 pr_err(" the %s domain not a subset of the %s domain\n",
6976 child->name, sd->name);
6977#endif
6978
6979 cpumask_or(sched_domain_span(sd),
6980 sched_domain_span(sd),
6981 sched_domain_span(child));
6982 }
6983
6984 }
6985 set_domain_attribute(sd, attr);
6986
6987 return sd;
6988}
6989
6990
6991
6992
6993
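/*
 * Build sched domains for the given set of CPUs and attach them to the
 * individual runqueues: allocate state, build one domain per topology
 * level per CPU, construct the groups, initialize capacities, and finally
 * attach each CPU's hierarchy under rcu_read_lock().
 */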
6994static int build_sched_domains(const struct cpumask *cpu_map,
6995 struct sched_domain_attr *attr)
6996{
6997 enum s_alloc alloc_state;
6998 struct sched_domain *sd;
6999 struct s_data d;
7000 struct rq *rq = NULL;
7001 int i, ret = -ENOMEM;
7002
7003 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7004 if (alloc_state != sa_rootdomain)
7005 goto error;
7006
7007
7008 for_each_cpu(i, cpu_map) {
7009 struct sched_domain_topology_level *tl;
7010
7011 sd = NULL;
7012 for_each_sd_topology(tl) {
7013 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
7014 if (tl == sched_domain_topology)
7015 *per_cpu_ptr(d.sd, i) = sd;
7016 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7017 sd->flags |= SD_OVERLAP;
7018 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7019 break;
7020 }
7021 }
7022
7023
7024 for_each_cpu(i, cpu_map) {
7025 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7026 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7027 if (sd->flags & SD_OVERLAP) {
7028 if (build_overlap_sched_groups(sd, i))
7029 goto error;
7030 } else {
7031 if (build_sched_groups(sd, i))
7032 goto error;
7033 }
7034 }
7035 }
7036
7037
7038 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7039 if (!cpumask_test_cpu(i, cpu_map))
7040 continue;
7041
7042 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7043 claim_allocations(i, sd);
7044 init_sched_groups_capacity(i, sd);
7045 }
7046 }
7047
7048
7049 rcu_read_lock();
7050 for_each_cpu(i, cpu_map) {
7051 rq = cpu_rq(i);
7052 sd = *per_cpu_ptr(d.sd, i);
7053
7054
7055 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
7056 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
7057
7058 cpu_attach_domain(sd, d.rd, i);
7059 }
7060 rcu_read_unlock();
7061
7062 if (rq && sched_debug_enabled) {
7063 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
7064 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
7065 }
7066
7067 ret = 0;
7068error:
7069 __free_domain_allocs(&d, alloc_state, cpu_map);
7070 return ret;
7071}
7072
7073static cpumask_var_t *doms_cur;
7074static int ndoms_cur;
7075static struct sched_domain_attr *dattr_cur;
7076
7077
7078
7079
7080
7081
7082
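/*
 * Fallback domain span, used when allocating doms_cur fails and when
 * partition_sched_domains() is called with doms_new == NULL.
 */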
7083static cpumask_var_t fallback_doms;
7084
7085
7086
7087
7088
7089
7090int __weak arch_update_cpu_topology(void)
7091{
7092 return 0;
7093}
7094
7095cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7096{
7097 int i;
7098 cpumask_var_t *doms;
7099
7100 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7101 if (!doms)
7102 return NULL;
7103 for (i = 0; i < ndoms; i++) {
7104 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7105 free_sched_domains(doms, i);
7106 return NULL;
7107 }
7108 }
7109 return doms;
7110}
7111
7112void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7113{
7114 unsigned int i;
7115 for (i = 0; i < ndoms; i++)
7116 free_cpumask_var(doms[i]);
7117 kfree(doms);
7118}
7119
7120
7121
7122
7123
7124
7125static int init_sched_domains(const struct cpumask *cpu_map)
7126{
7127 int err;
7128
7129 arch_update_cpu_topology();
7130 ndoms_cur = 1;
7131 doms_cur = alloc_sched_domains(ndoms_cur);
7132 if (!doms_cur)
7133 doms_cur = &fallback_doms;
7134 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7135 err = build_sched_domains(doms_cur[0], NULL);
7136 register_sched_domain_sysctl();
7137
7138 return err;
7139}
7140
7141
7142
7143
7144
7145static void detach_destroy_domains(const struct cpumask *cpu_map)
7146{
7147 int i;
7148
7149 rcu_read_lock();
7150 for_each_cpu(i, cpu_map)
7151 cpu_attach_domain(NULL, &def_root_domain, i);
7152 rcu_read_unlock();
7153}
7154
7155
7156static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7157 struct sched_domain_attr *new, int idx_new)
7158{
7159 struct sched_domain_attr tmp;
7160
7161
7162 if (!new && !cur)
7163 return 1;
7164
7165 tmp = SD_ATTR_INIT;
7166 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7167 new ? (new + idx_new) : &tmp,
7168 sizeof(struct sched_domain_attr));
7169}
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
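/*
 * Re-partition the sched domains according to the new cpumasks: domains
 * matching an existing span with unchanged attributes are kept (unless the
 * arch reports a topology change), removed ones are detached and
 * destroyed, new ones are built.  A NULL doms_new rebuilds a single domain
 * spanning all active, non-isolated CPUs.  The whole operation is
 * serialized by sched_domains_mutex.
 */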
7197void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7198 struct sched_domain_attr *dattr_new)
7199{
7200 int i, j, n;
7201 int new_topology;
7202
7203 mutex_lock(&sched_domains_mutex);
7204
7205
7206 unregister_sched_domain_sysctl();
7207
7208
7209 new_topology = arch_update_cpu_topology();
7210
7211 n = doms_new ? ndoms_new : 0;
7212
7213
7214 for (i = 0; i < ndoms_cur; i++) {
7215 for (j = 0; j < n && !new_topology; j++) {
7216 if (cpumask_equal(doms_cur[i], doms_new[j])
7217 && dattrs_equal(dattr_cur, i, dattr_new, j))
7218 goto match1;
7219 }
7220
7221 detach_destroy_domains(doms_cur[i]);
7222match1:
7223 ;
7224 }
7225
7226 n = ndoms_cur;
7227 if (doms_new == NULL) {
7228 n = 0;
7229 doms_new = &fallback_doms;
7230 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7231 WARN_ON_ONCE(dattr_new);
7232 }
7233
7234
7235 for (i = 0; i < ndoms_new; i++) {
7236 for (j = 0; j < n && !new_topology; j++) {
7237 if (cpumask_equal(doms_new[i], doms_cur[j])
7238 && dattrs_equal(dattr_new, i, dattr_cur, j))
7239 goto match2;
7240 }
7241
7242 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7243match2:
7244 ;
7245 }
7246
7247
7248 if (doms_cur != &fallback_doms)
7249 free_sched_domains(doms_cur, ndoms_cur);
7250 kfree(dattr_cur);
7251 doms_cur = doms_new;
7252 dattr_cur = dattr_new;
7253 ndoms_cur = ndoms_new;
7254
7255 register_sched_domain_sysctl();
7256
7257 mutex_unlock(&sched_domains_mutex);
7258}
7259
7260static int num_cpus_frozen;
7261
7262
7263
7264
7265
7266
7267
7268
7269
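/*
 * During suspend/resume (cpuhp_tasks_frozen) cpuset is not consulted for
 * every CPU; a single fallback domain is kept until the last frozen CPU
 * comes back online, at which point the cpuset configuration is restored.
 */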
7270static void cpuset_cpu_active(void)
7271{
7272 if (cpuhp_tasks_frozen) {
7273
7274
7275
7276
7277
7278
7279 num_cpus_frozen--;
7280 if (likely(num_cpus_frozen)) {
7281 partition_sched_domains(1, NULL, NULL);
7282 return;
7283 }
7284
7285
7286
7287
7288
7289 }
7290 cpuset_update_active_cpus(true);
7291}
7292
7293static int cpuset_cpu_inactive(unsigned int cpu)
7294{
7295 unsigned long flags;
7296 struct dl_bw *dl_b;
7297 bool overflow;
7298 int cpus;
7299
7300 if (!cpuhp_tasks_frozen) {
7301 rcu_read_lock_sched();
7302 dl_b = dl_bw_of(cpu);
7303
7304 raw_spin_lock_irqsave(&dl_b->lock, flags);
7305 cpus = dl_bw_cpus(cpu);
7306 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7307 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7308
7309 rcu_read_unlock_sched();
7310
7311 if (overflow)
7312 return -EBUSY;
7313 cpuset_update_active_cpus(false);
7314 } else {
7315 num_cpus_frozen++;
7316 partition_sched_domains(1, NULL, NULL);
7317 }
7318 return 0;
7319}
7320
7321int sched_cpu_activate(unsigned int cpu)
7322{
7323 struct rq *rq = cpu_rq(cpu);
7324 unsigned long flags;
7325
7326 set_cpu_active(cpu, true);
7327
7328 if (sched_smp_initialized) {
7329 sched_domains_numa_masks_set(cpu);
7330 cpuset_cpu_active();
7331 }
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342 raw_spin_lock_irqsave(&rq->lock, flags);
7343 if (rq->rd) {
7344 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7345 set_rq_online(rq);
7346 }
7347 raw_spin_unlock_irqrestore(&rq->lock, flags);
7348
7349 update_max_interval();
7350
7351 return 0;
7352}
7353
7354int sched_cpu_deactivate(unsigned int cpu)
7355{
7356 int ret;
7357
7358 set_cpu_active(cpu, false);
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369 if (IS_ENABLED(CONFIG_PREEMPT))
7370 synchronize_rcu_mult(call_rcu, call_rcu_sched);
7371 else
7372 synchronize_rcu();
7373
7374 if (!sched_smp_initialized)
7375 return 0;
7376
7377 ret = cpuset_cpu_inactive(cpu);
7378 if (ret) {
7379 set_cpu_active(cpu, true);
7380 return ret;
7381 }
7382 sched_domains_numa_masks_clear(cpu);
7383 return 0;
7384}
7385
7386static void sched_rq_cpu_starting(unsigned int cpu)
7387{
7388 struct rq *rq = cpu_rq(cpu);
7389
7390 rq->calc_load_update = calc_load_update;
7391 update_max_interval();
7392}
7393
7394int sched_cpu_starting(unsigned int cpu)
7395{
7396 set_cpu_rq_start_time(cpu);
7397 sched_rq_cpu_starting(cpu);
7398 return 0;
7399}
7400
7401#ifdef CONFIG_HOTPLUG_CPU
7402int sched_cpu_dying(unsigned int cpu)
7403{
7404 struct rq *rq = cpu_rq(cpu);
7405 unsigned long flags;
7406
7407
7408 sched_ttwu_pending();
7409 raw_spin_lock_irqsave(&rq->lock, flags);
7410 if (rq->rd) {
7411 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7412 set_rq_offline(rq);
7413 }
7414 migrate_tasks(rq);
7415 BUG_ON(rq->nr_running != 1);
7416 raw_spin_unlock_irqrestore(&rq->lock, flags);
7417 calc_load_migrate(rq);
7418 update_max_interval();
7419 nohz_balance_exit_idle(cpu);
7420 hrtick_clear(rq);
7421 return 0;
7422}
7423#endif
7424
7425#ifdef CONFIG_SCHED_SMT
7426DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7427
7428static void sched_init_smt(void)
7429{
7430
7431
7432
7433
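	/*
	 * Enable the sched_smt_present static key when the boot CPU reports
	 * more than one SMT sibling.
	 */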
7434 if (cpumask_weight(cpu_smt_mask(0)) > 1)
7435 static_branch_enable(&sched_smt_present);
7436}
7437#else
7438static inline void sched_init_smt(void) { }
7439#endif
7440
7441void __init sched_init_smp(void)
7442{
7443 cpumask_var_t non_isolated_cpus;
7444
7445 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7446 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7447
7448 sched_init_numa();
7449
7450
7451
7452
7453
7454
7455 mutex_lock(&sched_domains_mutex);
7456 init_sched_domains(cpu_active_mask);
7457 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7458 if (cpumask_empty(non_isolated_cpus))
7459 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7460 mutex_unlock(&sched_domains_mutex);
7461
7462
7463 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7464 BUG();
7465 sched_init_granularity();
7466 free_cpumask_var(non_isolated_cpus);
7467
7468 init_sched_rt_class();
7469 init_sched_dl_class();
7470
7471 sched_init_smt();
7472
7473 sched_smp_initialized = true;
7474}
7475
7476static int __init migration_init(void)
7477{
7478 sched_rq_cpu_starting(smp_processor_id());
7479 return 0;
7480}
7481early_initcall(migration_init);
7482
7483#else
7484void __init sched_init_smp(void)
7485{
7486 sched_init_granularity();
7487}
7488#endif
7489
7490int in_sched_functions(unsigned long addr)
7491{
7492 return in_lock_functions(addr) ||
7493 (addr >= (unsigned long)__sched_text_start
7494 && addr < (unsigned long)__sched_text_end);
7495}
7496
7497#ifdef CONFIG_CGROUP_SCHED
7498
7499
7500
7501
7502struct task_group root_task_group;
7503LIST_HEAD(task_groups);
7504
7505
7506static struct kmem_cache *task_group_cache __read_mostly;
7507#endif
7508
7509DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7510DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
7511
7512#define WAIT_TABLE_BITS 8
7513#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
7514static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
7515
7516wait_queue_head_t *bit_waitqueue(void *word, int bit)
7517{
7518 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
7519 unsigned long val = (unsigned long)word << shift | bit;
7520
7521 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
7522}
7523EXPORT_SYMBOL(bit_waitqueue);
7524
7525void __init sched_init(void)
7526{
7527 int i, j;
7528 unsigned long alloc_size = 0, ptr;
7529
7530 for (i = 0; i < WAIT_TABLE_SIZE; i++)
7531 init_waitqueue_head(bit_wait_table + i);
7532
7533#ifdef CONFIG_FAIR_GROUP_SCHED
7534 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7535#endif
7536#ifdef CONFIG_RT_GROUP_SCHED
7537 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7538#endif
7539 if (alloc_size) {
7540 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7541
7542#ifdef CONFIG_FAIR_GROUP_SCHED
7543 root_task_group.se = (struct sched_entity **)ptr;
7544 ptr += nr_cpu_ids * sizeof(void **);
7545
7546 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7547 ptr += nr_cpu_ids * sizeof(void **);
7548
7549#endif
7550#ifdef CONFIG_RT_GROUP_SCHED
7551 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7552 ptr += nr_cpu_ids * sizeof(void **);
7553
7554 root_task_group.rt_rq = (struct rt_rq **)ptr;
7555 ptr += nr_cpu_ids * sizeof(void **);
7556
7557#endif
7558 }
7559#ifdef CONFIG_CPUMASK_OFFSTACK
7560 for_each_possible_cpu(i) {
7561 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7562 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7563 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
7564 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7565 }
7566#endif
7567
7568 init_rt_bandwidth(&def_rt_bandwidth,
7569 global_rt_period(), global_rt_runtime());
7570 init_dl_bandwidth(&def_dl_bandwidth,
7571 global_rt_period(), global_rt_runtime());
7572
7573#ifdef CONFIG_SMP
7574 init_defrootdomain();
7575#endif
7576
7577#ifdef CONFIG_RT_GROUP_SCHED
7578 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7579 global_rt_period(), global_rt_runtime());
7580#endif
7581
7582#ifdef CONFIG_CGROUP_SCHED
7583 task_group_cache = KMEM_CACHE(task_group, 0);
7584
7585 list_add(&root_task_group.list, &task_groups);
7586 INIT_LIST_HEAD(&root_task_group.children);
7587 INIT_LIST_HEAD(&root_task_group.siblings);
7588 autogroup_init(&init_task);
7589#endif
7590
7591 for_each_possible_cpu(i) {
7592 struct rq *rq;
7593
7594 rq = cpu_rq(i);
7595 raw_spin_lock_init(&rq->lock);
7596 rq->nr_running = 0;
7597 rq->calc_load_active = 0;
7598 rq->calc_load_update = jiffies + LOAD_FREQ;
7599 init_cfs_rq(&rq->cfs);
7600 init_rt_rq(&rq->rt);
7601 init_dl_rq(&rq->dl);
7602#ifdef CONFIG_FAIR_GROUP_SCHED
7603 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7604 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
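		/*
		 * The root task group gets ROOT_TASK_GROUP_LOAD shares and is
		 * wired up to each CPU's top-level cfs_rq here via
		 * init_tg_cfs_entry().
		 */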
7624 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
7625 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7626#endif
7627
7628 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7629#ifdef CONFIG_RT_GROUP_SCHED
7630 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7631#endif
7632
7633 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7634 rq->cpu_load[j] = 0;
7635
7636#ifdef CONFIG_SMP
7637 rq->sd = NULL;
7638 rq->rd = NULL;
7639 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
7640 rq->balance_callback = NULL;
7641 rq->active_balance = 0;
7642 rq->next_balance = jiffies;
7643 rq->push_cpu = 0;
7644 rq->cpu = i;
7645 rq->online = 0;
7646 rq->idle_stamp = 0;
7647 rq->avg_idle = 2*sysctl_sched_migration_cost;
7648 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
7649
7650 INIT_LIST_HEAD(&rq->cfs_tasks);
7651
7652 rq_attach_root(rq, &def_root_domain);
7653#ifdef CONFIG_NO_HZ_COMMON
7654 rq->last_load_update_tick = jiffies;
7655 rq->nohz_flags = 0;
7656#endif
7657#ifdef CONFIG_NO_HZ_FULL
7658 rq->last_sched_tick = 0;
7659#endif
7660#endif
7661 init_rq_hrtick(rq);
7662 atomic_set(&rq->nr_iowait, 0);
7663 }
7664
7665 set_load_weight(&init_task);
7666
7667
7668
7669
7670 atomic_inc(&init_mm.mm_count);
7671 enter_lazy_tlb(&init_mm, current);
7672
7673
7674
7675
7676
7677
7678
7679 init_idle(current, smp_processor_id());
7680
7681 calc_load_update = jiffies + LOAD_FREQ;
7682
7683#ifdef CONFIG_SMP
7684 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7685
7686 if (cpu_isolated_map == NULL)
7687 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7688 idle_thread_set_boot_cpu();
7689 set_cpu_rq_start_time(smp_processor_id());
7690#endif
7691 init_sched_fair_class();
7692
7693 init_schedstats();
7694
7695 scheduler_running = 1;
7696}
7697
7698#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7699static inline int preempt_count_equals(int preempt_offset)
7700{
7701 int nested = preempt_count() + rcu_preempt_depth();
7702
7703 return (nested == preempt_offset);
7704}
7705
7706void __might_sleep(const char *file, int line, int preempt_offset)
7707{
7708
7709
7710
7711
7712
7713 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
7714 "do not call blocking ops when !TASK_RUNNING; "
7715 "state=%lx set at [<%p>] %pS\n",
7716 current->state,
7717 (void *)current->task_state_change,
7718 (void *)current->task_state_change);
7719
7720 ___might_sleep(file, line, preempt_offset);
7721}
7722EXPORT_SYMBOL(__might_sleep);
7723
7724void ___might_sleep(const char *file, int line, int preempt_offset)
7725{
7726 static unsigned long prev_jiffy;
7727 unsigned long preempt_disable_ip;
7728
7729 rcu_sleep_check();
7730 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
7731 !is_idle_task(current)) ||
7732 system_state != SYSTEM_RUNNING || oops_in_progress)
7733 return;
7734 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7735 return;
7736 prev_jiffy = jiffies;
7737
7738
7739 preempt_disable_ip = get_preempt_disable_ip(current);
7740
7741 printk(KERN_ERR
7742 "BUG: sleeping function called from invalid context at %s:%d\n",
7743 file, line);
7744 printk(KERN_ERR
7745 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7746 in_atomic(), irqs_disabled(),
7747 current->pid, current->comm);
7748
7749 if (task_stack_end_corrupted(current))
7750 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7751
7752 debug_show_held_locks(current);
7753 if (irqs_disabled())
7754 print_irqtrace_events(current);
7755 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
7756 && !preempt_count_equals(preempt_offset)) {
7757 pr_err("Preemption disabled at:");
7758 print_ip_sym(preempt_disable_ip);
7759 pr_cont("\n");
7760 }
7761 dump_stack();
7762 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7763}
7764EXPORT_SYMBOL(___might_sleep);
7765#endif
7766
7767#ifdef CONFIG_MAGIC_SYSRQ
7768void normalize_rt_tasks(void)
7769{
7770 struct task_struct *g, *p;
7771 struct sched_attr attr = {
7772 .sched_policy = SCHED_NORMAL,
7773 };
7774
7775 read_lock(&tasklist_lock);
7776 for_each_process_thread(g, p) {
7777
7778
7779
7780 if (p->flags & PF_KTHREAD)
7781 continue;
7782
7783 p->se.exec_start = 0;
7784 schedstat_set(p->se.statistics.wait_start, 0);
7785 schedstat_set(p->se.statistics.sleep_start, 0);
7786 schedstat_set(p->se.statistics.block_start, 0);
7787
7788 if (!dl_task(p) && !rt_task(p)) {
7789
7790
7791
7792
7793 if (task_nice(p) < 0)
7794 set_user_nice(p, 0);
7795 continue;
7796 }
7797
7798 __sched_setscheduler(p, &attr, false, false);
7799 }
7800 read_unlock(&tasklist_lock);
7801}
7802
7803#endif
7804
7805#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824struct task_struct *curr_task(int cpu)
7825{
7826 return cpu_curr(cpu);
7827}
7828
7829#endif
7830
7831#ifdef CONFIG_IA64
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847void ia64_set_curr_task(int cpu, struct task_struct *p)
7848{
7849 cpu_curr(cpu) = p;
7850}
7851
7852#endif
7853
7854#ifdef CONFIG_CGROUP_SCHED
7855
7856static DEFINE_SPINLOCK(task_group_lock);
7857
7858static void sched_free_group(struct task_group *tg)
7859{
7860 free_fair_sched_group(tg);
7861 free_rt_sched_group(tg);
7862 autogroup_free(tg);
7863 kmem_cache_free(task_group_cache, tg);
7864}
7865
7866
7867struct task_group *sched_create_group(struct task_group *parent)
7868{
7869 struct task_group *tg;
7870
7871 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
7872 if (!tg)
7873 return ERR_PTR(-ENOMEM);
7874
7875 if (!alloc_fair_sched_group(tg, parent))
7876 goto err;
7877
7878 if (!alloc_rt_sched_group(tg, parent))
7879 goto err;
7880
7881 return tg;
7882
7883err:
7884 sched_free_group(tg);
7885 return ERR_PTR(-ENOMEM);
7886}
7887
7888void sched_online_group(struct task_group *tg, struct task_group *parent)
7889{
7890 unsigned long flags;
7891
7892 spin_lock_irqsave(&task_group_lock, flags);
7893 list_add_rcu(&tg->list, &task_groups);
7894
7895 WARN_ON(!parent);
7896
7897 tg->parent = parent;
7898 INIT_LIST_HEAD(&tg->children);
7899 list_add_rcu(&tg->siblings, &parent->children);
7900 spin_unlock_irqrestore(&task_group_lock, flags);
7901
7902 online_fair_sched_group(tg);
7903}
7904
7905
7906static void sched_free_group_rcu(struct rcu_head *rhp)
7907{
7908
7909 sched_free_group(container_of(rhp, struct task_group, rcu));
7910}
7911
7912void sched_destroy_group(struct task_group *tg)
7913{
7914
7915 call_rcu(&tg->rcu, sched_free_group_rcu);
7916}
7917
7918void sched_offline_group(struct task_group *tg)
7919{
7920 unsigned long flags;
7921
7922
7923 unregister_fair_sched_group(tg);
7924
7925 spin_lock_irqsave(&task_group_lock, flags);
7926 list_del_rcu(&tg->list);
7927 list_del_rcu(&tg->siblings);
7928 spin_unlock_irqrestore(&task_group_lock, flags);
7929}
7930
7931static void sched_change_group(struct task_struct *tsk, int type)
7932{
7933 struct task_group *tg;
7934
7935
7936
7937
7938
7939
7940 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
7941 struct task_group, css);
7942 tg = autogroup_task_group(tsk, tg);
7943 tsk->sched_task_group = tg;
7944
7945#ifdef CONFIG_FAIR_GROUP_SCHED
7946 if (tsk->sched_class->task_change_group)
7947 tsk->sched_class->task_change_group(tsk, type);
7948 else
7949#endif
7950 set_task_rq(tsk, task_cpu(tsk));
7951}
7952
7953
7954
7955
7956
7957
7958
7959
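/*
 * Move the task to its current cgroup's task group: dequeue it if it is
 * queued, switch the group with sched_change_group(), then requeue it and,
 * if it is the running task, reinstall it as current.
 */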
7960void sched_move_task(struct task_struct *tsk)
7961{
7962 int queued, running;
7963 struct rq_flags rf;
7964 struct rq *rq;
7965
7966 rq = task_rq_lock(tsk, &rf);
7967
7968 running = task_current(rq, tsk);
7969 queued = task_on_rq_queued(tsk);
7970
7971 if (queued)
7972 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7973 if (unlikely(running))
7974 put_prev_task(rq, tsk);
7975
7976 sched_change_group(tsk, TASK_MOVE_GROUP);
7977
7978 if (queued)
7979 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
7980 if (unlikely(running))
7981 set_curr_task(rq, tsk);
7982
7983 task_rq_unlock(rq, tsk, &rf);
7984}
7985#endif
7986
7987#ifdef CONFIG_RT_GROUP_SCHED
7988
7989
7990
7991static DEFINE_MUTEX(rt_constraints_mutex);
7992
7993
7994static inline int tg_has_rt_tasks(struct task_group *tg)
7995{
7996 struct task_struct *g, *p;
7997
7998
7999
8000
8001 if (task_group_is_autogroup(tg))
8002 return 0;
8003
8004 for_each_process_thread(g, p) {
8005 if (rt_task(p) && task_group(p) == tg)
8006 return 1;
8007 }
8008
8009 return 0;
8010}
8011
8012struct rt_schedulable_data {
8013 struct task_group *tg;
8014 u64 rt_period;
8015 u64 rt_runtime;
8016};
8017
8018static int tg_rt_schedulable(struct task_group *tg, void *data)
8019{
8020 struct rt_schedulable_data *d = data;
8021 struct task_group *child;
8022 unsigned long total, sum = 0;
8023 u64 period, runtime;
8024
8025 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8026 runtime = tg->rt_bandwidth.rt_runtime;
8027
8028 if (tg == d->tg) {
8029 period = d->rt_period;
8030 runtime = d->rt_runtime;
8031 }
8032
8033
8034
8035
8036 if (runtime > period && runtime != RUNTIME_INF)
8037 return -EINVAL;
8038
8039
8040
8041
8042 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8043 return -EBUSY;
8044
8045 total = to_ratio(period, runtime);
8046
8047
8048
8049
8050 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8051 return -EINVAL;
8052
8053
8054
8055
8056 list_for_each_entry_rcu(child, &tg->children, siblings) {
8057 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8058 runtime = child->rt_bandwidth.rt_runtime;
8059
8060 if (child == d->tg) {
8061 period = d->rt_period;
8062 runtime = d->rt_runtime;
8063 }
8064
8065 sum += to_ratio(period, runtime);
8066 }
8067
8068 if (sum > total)
8069 return -EINVAL;
8070
8071 return 0;
8072}
8073
8074static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8075{
8076 int ret;
8077
8078 struct rt_schedulable_data data = {
8079 .tg = tg,
8080 .rt_period = period,
8081 .rt_runtime = runtime,
8082 };
8083
8084 rcu_read_lock();
8085 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8086 rcu_read_unlock();
8087
8088 return ret;
8089}
8090
8091static int tg_set_rt_bandwidth(struct task_group *tg,
8092 u64 rt_period, u64 rt_runtime)
8093{
8094 int i, err = 0;
8095
8096
8097
8098
8099
8100 if (tg == &root_task_group && rt_runtime == 0)
8101 return -EINVAL;
8102
8103
8104 if (rt_period == 0)
8105 return -EINVAL;
8106
8107 mutex_lock(&rt_constraints_mutex);
8108 read_lock(&tasklist_lock);
8109 err = __rt_schedulable(tg, rt_period, rt_runtime);
8110 if (err)
8111 goto unlock;
8112
8113 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8114 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8115 tg->rt_bandwidth.rt_runtime = rt_runtime;
8116
8117 for_each_possible_cpu(i) {
8118 struct rt_rq *rt_rq = tg->rt_rq[i];
8119
8120 raw_spin_lock(&rt_rq->rt_runtime_lock);
8121 rt_rq->rt_runtime = rt_runtime;
8122 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8123 }
8124 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8125unlock:
8126 read_unlock(&tasklist_lock);
8127 mutex_unlock(&rt_constraints_mutex);
8128
8129 return err;
8130}
8131
8132static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8133{
8134 u64 rt_runtime, rt_period;
8135
8136 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8137 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8138 if (rt_runtime_us < 0)
8139 rt_runtime = RUNTIME_INF;
8140
8141 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8142}
8143
8144static long sched_group_rt_runtime(struct task_group *tg)
8145{
8146 u64 rt_runtime_us;
8147
8148 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8149 return -1;
8150
8151 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8152 do_div(rt_runtime_us, NSEC_PER_USEC);
8153 return rt_runtime_us;
8154}
8155
8156static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
8157{
8158 u64 rt_runtime, rt_period;
8159
8160 rt_period = rt_period_us * NSEC_PER_USEC;
8161 rt_runtime = tg->rt_bandwidth.rt_runtime;
8162
8163 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8164}
8165
8166static long sched_group_rt_period(struct task_group *tg)
8167{
8168 u64 rt_period_us;
8169
8170 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8171 do_div(rt_period_us, NSEC_PER_USEC);
8172 return rt_period_us;
8173}
8174#endif
8175
8176#ifdef CONFIG_RT_GROUP_SCHED
8177static int sched_rt_global_constraints(void)
8178{
8179 int ret = 0;
8180
8181 mutex_lock(&rt_constraints_mutex);
8182 read_lock(&tasklist_lock);
8183 ret = __rt_schedulable(NULL, 0, 0);
8184 read_unlock(&tasklist_lock);
8185 mutex_unlock(&rt_constraints_mutex);
8186
8187 return ret;
8188}
8189
8190static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8191{
8192
8193 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8194 return 0;
8195
8196 return 1;
8197}
8198
8199#else
8200static int sched_rt_global_constraints(void)
8201{
8202 unsigned long flags;
8203 int i;
8204
8205 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8206 for_each_possible_cpu(i) {
8207 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8208
8209 raw_spin_lock(&rt_rq->rt_runtime_lock);
8210 rt_rq->rt_runtime = global_rt_runtime();
8211 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8212 }
8213 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8214
8215 return 0;
8216}
8217#endif
8218
8219static int sched_dl_global_validate(void)
8220{
8221 u64 runtime = global_rt_runtime();
8222 u64 period = global_rt_period();
8223 u64 new_bw = to_ratio(period, runtime);
8224 struct dl_bw *dl_b;
8225 int cpu, ret = 0;
8226 unsigned long flags;
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
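	/*
	 * The new global -deadline bandwidth must not be smaller than what is
	 * already allocated on any CPU's root domain, otherwise admitted
	 * deadline tasks could no longer be served.
	 */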
8237 for_each_possible_cpu(cpu) {
8238 rcu_read_lock_sched();
8239 dl_b = dl_bw_of(cpu);
8240
8241 raw_spin_lock_irqsave(&dl_b->lock, flags);
8242 if (new_bw < dl_b->total_bw)
8243 ret = -EBUSY;
8244 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
8245
8246 rcu_read_unlock_sched();
8247
8248 if (ret)
8249 break;
8250 }
8251
8252 return ret;
8253}
8254
8255static void sched_dl_do_global(void)
8256{
8257 u64 new_bw = -1;
8258 struct dl_bw *dl_b;
8259 int cpu;
8260 unsigned long flags;
8261
8262 def_dl_bandwidth.dl_period = global_rt_period();
8263 def_dl_bandwidth.dl_runtime = global_rt_runtime();
8264
8265 if (global_rt_runtime() != RUNTIME_INF)
8266 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
8267
8268
8269
8270
8271 for_each_possible_cpu(cpu) {
8272 rcu_read_lock_sched();
8273 dl_b = dl_bw_of(cpu);
8274
8275 raw_spin_lock_irqsave(&dl_b->lock, flags);
8276 dl_b->bw = new_bw;
8277 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
8278
8279 rcu_read_unlock_sched();
8280 }
8281}
8282
8283static int sched_rt_global_validate(void)
8284{
8285 if (sysctl_sched_rt_period <= 0)
8286 return -EINVAL;
8287
8288 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
8289 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
8290 return -EINVAL;
8291
8292 return 0;
8293}
8294
8295static void sched_rt_do_global(void)
8296{
8297 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8298 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
8299}
8300
8301int sched_rt_handler(struct ctl_table *table, int write,
8302 void __user *buffer, size_t *lenp,
8303 loff_t *ppos)
8304{
8305 int old_period, old_runtime;
8306 static DEFINE_MUTEX(mutex);
8307 int ret;
8308
8309 mutex_lock(&mutex);
8310 old_period = sysctl_sched_rt_period;
8311 old_runtime = sysctl_sched_rt_runtime;
8312
8313 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8314
8315 if (!ret && write) {
8316 ret = sched_rt_global_validate();
8317 if (ret)
8318 goto undo;
8319
8320 ret = sched_dl_global_validate();
8321 if (ret)
8322 goto undo;
8323
8324 ret = sched_rt_global_constraints();
8325 if (ret)
8326 goto undo;
8327
8328 sched_rt_do_global();
8329 sched_dl_do_global();
8330 }
8331 if (0) {
8332undo:
8333 sysctl_sched_rt_period = old_period;
8334 sysctl_sched_rt_runtime = old_runtime;
8335 }
8336 mutex_unlock(&mutex);
8337
8338 return ret;
8339}
8340
8341int sched_rr_handler(struct ctl_table *table, int write,
8342 void __user *buffer, size_t *lenp,
8343 loff_t *ppos)
8344{
8345 int ret;
8346 static DEFINE_MUTEX(mutex);
8347
8348 mutex_lock(&mutex);
8349 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8350
8351
8352 if (!ret && write) {
8353 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
8354 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
8355 }
8356 mutex_unlock(&mutex);
8357 return ret;
8358}
8359
8360#ifdef CONFIG_CGROUP_SCHED
8361
8362static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8363{
8364 return css ? container_of(css, struct task_group, css) : NULL;
8365}
8366
8367static struct cgroup_subsys_state *
8368cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8369{
8370 struct task_group *parent = css_tg(parent_css);
8371 struct task_group *tg;
8372
8373 if (!parent) {
8374
8375 return &root_task_group.css;
8376 }
8377
8378 tg = sched_create_group(parent);
8379 if (IS_ERR(tg))
8380 return ERR_PTR(-ENOMEM);
8381
8382 sched_online_group(tg, parent);
8383
8384 return &tg->css;
8385}
8386
8387static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8388{
8389 struct task_group *tg = css_tg(css);
8390
8391 sched_offline_group(tg);
8392}
8393
8394static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8395{
8396 struct task_group *tg = css_tg(css);
8397
8398
8399
8400
8401 sched_free_group(tg);
8402}
8403
8404
8405
8406
8407
8408static void cpu_cgroup_fork(struct task_struct *task)
8409{
8410 struct rq_flags rf;
8411 struct rq *rq;
8412
8413 rq = task_rq_lock(task, &rf);
8414
8415 sched_change_group(task, TASK_SET_GROUP);
8416
8417 task_rq_unlock(rq, task, &rf);
8418}
8419
8420static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8421{
8422 struct task_struct *task;
8423 struct cgroup_subsys_state *css;
8424 int ret = 0;
8425
8426 cgroup_taskset_for_each(task, css, tset) {
8427#ifdef CONFIG_RT_GROUP_SCHED
8428 if (!sched_rt_can_attach(css_tg(css), task))
8429 return -EINVAL;
8430#else
8431
8432 if (task->sched_class != &fair_sched_class)
8433 return -EINVAL;
8434#endif
8435
8436
8437
8438
8439 raw_spin_lock_irq(&task->pi_lock);
8440
8441
8442
8443
8444
8445 if (task->state == TASK_NEW)
8446 ret = -EINVAL;
8447 raw_spin_unlock_irq(&task->pi_lock);
8448
8449 if (ret)
8450 break;
8451 }
8452 return ret;
8453}
8454
8455static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8456{
8457 struct task_struct *task;
8458 struct cgroup_subsys_state *css;
8459
8460 cgroup_taskset_for_each(task, css, tset)
8461 sched_move_task(task);
8462}
8463
8464#ifdef CONFIG_FAIR_GROUP_SCHED
8465static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8466 struct cftype *cftype, u64 shareval)
8467{
8468 return sched_group_set_shares(css_tg(css), scale_load(shareval));
8469}
8470
8471static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8472 struct cftype *cft)
8473{
8474 struct task_group *tg = css_tg(css);
8475
8476 return (u64) scale_load_down(tg->shares);
8477}
8478
8479#ifdef CONFIG_CFS_BANDWIDTH
8480static DEFINE_MUTEX(cfs_constraints_mutex);
8481
8482const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
8483const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
8484
8485static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8486
8487static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8488{
8489 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8490 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8491
8492 if (tg == &root_task_group)
8493 return -EINVAL;
8494
8495
8496
8497
8498
8499
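	/*
	 * Both the quota and the period must be at least min_cfs_quota_period
	 * so the group is guaranteed some runtime each period; the period is
	 * additionally capped at max_cfs_quota_period below.
	 */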
8500 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
8501 return -EINVAL;
8502
8503
8504
8505
8506
8507
8508 if (period > max_cfs_quota_period)
8509 return -EINVAL;
8510
8511
8512
8513
8514
8515 get_online_cpus();
8516 mutex_lock(&cfs_constraints_mutex);
8517 ret = __cfs_schedulable(tg, period, quota);
8518 if (ret)
8519 goto out_unlock;
8520
8521 runtime_enabled = quota != RUNTIME_INF;
8522 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
8523
8524
8525
8526
8527 if (runtime_enabled && !runtime_was_enabled)
8528 cfs_bandwidth_usage_inc();
8529 raw_spin_lock_irq(&cfs_b->lock);
8530 cfs_b->period = ns_to_ktime(period);
8531 cfs_b->quota = quota;
8532
8533 __refill_cfs_bandwidth_runtime(cfs_b);
8534
8535 if (runtime_enabled)
8536 start_cfs_bandwidth(cfs_b);
8537 raw_spin_unlock_irq(&cfs_b->lock);
8538
8539 for_each_online_cpu(i) {
8540 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
8541 struct rq *rq = cfs_rq->rq;
8542
8543 raw_spin_lock_irq(&rq->lock);
8544 cfs_rq->runtime_enabled = runtime_enabled;
8545 cfs_rq->runtime_remaining = 0;
8546
8547 if (cfs_rq->throttled)
8548 unthrottle_cfs_rq(cfs_rq);
8549 raw_spin_unlock_irq(&rq->lock);
8550 }
8551 if (runtime_was_enabled && !runtime_enabled)
8552 cfs_bandwidth_usage_dec();
8553out_unlock:
8554 mutex_unlock(&cfs_constraints_mutex);
8555 put_online_cpus();
8556
8557 return ret;
8558}

int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period;

	period = ktime_to_ns(tg->cfs_bandwidth.period);
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
	else
		quota = (u64)cfs_quota_us * NSEC_PER_USEC;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
		return -1;

	quota_us = tg->cfs_bandwidth.quota;
	do_div(quota_us, NSEC_PER_USEC);

	return quota_us;
}

int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
	u64 quota, period;

	period = (u64)cfs_period_us * NSEC_PER_USEC;
	quota = tg->cfs_bandwidth.quota;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

long tg_get_cfs_period(struct task_group *tg)
{
	u64 cfs_period_us;

	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
	do_div(cfs_period_us, NSEC_PER_USEC);

	return cfs_period_us;
}

static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	return tg_get_cfs_quota(css_tg(css));
}

static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, s64 cfs_quota_us)
{
	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}

static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return tg_get_cfs_period(css_tg(css));
}

static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 cfs_period_us)
{
	return tg_set_cfs_period(css_tg(css), cfs_period_us);
}

struct cfs_schedulable_data {
	struct task_group *tg;
	u64 period, quota;
};

/*
 * Normalize a group's quota/period pair into a comparable bandwidth
 * ratio; by this point the units are microseconds (see __cfs_schedulable()).
 */
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	/* tg_get_cfs_quota() reports an unlimited group as -1 */
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		/*
		 * Ensure max(child_quota) <= parent_quota; a child that
		 * sets no limit of its own inherits the parent's.
		 */
		if (quota == RUNTIME_INF)
			quota = parent_quota;
		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
			return -EINVAL;
	}
	cfs_b->hierarchical_quota = quota;

	return 0;
}

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}
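
/*
 * Worked example for the check above (illustrative, not from the original
 * source): a parent group limited to 50000us of quota per 100000us period
 * has a normalized bandwidth of half a CPU.  A child that then asks for
 * 80000us per 100000us exceeds that, so tg_cfs_schedulable_down() returns
 * -EINVAL; a child with no limit of its own (RUNTIME_INF) simply inherits
 * the parent's hierarchical_quota.
 */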

static int cpu_stats_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	return 0;
}
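
/*
 * Example output (illustrative, sample values made up): reading the
 * cpu.stat file backed by cpu_stats_show() yields three lines such as
 *
 *	nr_periods 2057
 *	nr_throttled 83
 *	throttled_time 1203728419
 *
 * where throttled_time is accumulated in nanoseconds.
 */
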
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	return sched_group_set_rt_runtime(css_tg(css), val);
}

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return sched_group_rt_runtime(css_tg(css));
}

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	return sched_group_set_rt_period(css_tg(css), rt_period_us);
}

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return sched_group_rt_period(css_tg(css));
}
#endif

static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "cfs_quota_us",
		.read_s64 = cpu_cfs_quota_read_s64,
		.write_s64 = cpu_cfs_quota_write_s64,
	},
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_cfs_period_read_u64,
		.write_u64 = cpu_cfs_period_write_u64,
	},
	{
		.name = "stat",
		.seq_show = cpu_stats_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
	{ }
};
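
/*
 * Illustrative sketch, not part of the original file: the cftype array
 * above is what creates the per-cgroup control files (cpu.shares,
 * cpu.cfs_quota_us, cpu.cfs_period_us, cpu.stat, cpu.rt_runtime_us,
 * cpu.rt_period_us, depending on the config options) for the cpu
 * controller.  Assuming a legacy-hierarchy mount at /sys/fs/cgroup/cpu
 * (a common but not guaranteed location) and a hypothetical group named
 * "mygroup", userspace could cap the group at half a CPU roughly like:
 *
 *	int fd = open("/sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "50000", 5);	(50ms quota against the 100ms default period)
 *		close(fd);
 *	}
 */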

struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc = cpu_cgroup_css_alloc,
	.css_released = cpu_cgroup_css_released,
	.css_free = cpu_cgroup_css_free,
	.fork = cpu_cgroup_fork,
	.can_attach = cpu_cgroup_can_attach,
	.attach = cpu_cgroup_attach,
	.legacy_cftypes = cpu_files,
	.early_init = true,
};

#endif /* CONFIG_CGROUP_SCHED */

void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

/*
 * Nice-level to load-weight table.  Each step in nice level scales the
 * weight by roughly 1.25, so moving a CPU-bound task one nice level up
 * or down changes its share of CPU time by about 10% relative to its
 * peers; the effect is multiplicative and therefore cumulative across
 * levels.  Nice 0 maps to the reference weight 1024 at index 20
 * (the table runs from nice -20 at index 0 to nice +19 at index 39).
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};
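
/*
 * Worked example (illustrative): adjacent entries above differ by a factor
 * of about 1.25, e.g. 1277/1024 ~= 1.247 and 1024/820 ~= 1.249.  Two
 * always-runnable tasks at nice 0 and nice +1 therefore split the CPU in
 * the ratio 1024:820, i.e. roughly 55%:45%.
 */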

/*
 * Precomputed inverses of sched_prio_to_weight[], approximately
 * 2^32 / weight for each entry.  Keeping the inverse around lets hot
 * paths replace a division by the weight with a multiplication and a
 * shift.
 */
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

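/*
 * Worked example (illustrative): each entry of sched_prio_to_wmult[]
 * equals 2^32 divided by the matching sched_prio_to_weight[] entry, e.g.
 * 2^32 / 1024 = 4194304 (index 20, nice 0) and 2^32 / 88761 ~= 48388
 * (index 0, nice -20).  A scaling such as delta / weight can then be
 * computed as (delta * wmult) >> 32 without an actual division.
 */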