/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

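/*
 * Advance rq->clock to the current sched_clock_cpu() time.  The caller
 * must hold rq->lock; the update is skipped while RQCF_ACT_SKIP is set.
 */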
void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_skip_update & RQCF_ACT_SKIP)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits.
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;

/*
 * this_rq_lock - lock this runqueue and disable interrupts
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}
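/*
 * __task_rq_lock - lock the rq @p resides on.
 */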
176struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
177 __acquires(rq->lock)
178{
179 struct rq *rq;
180
181 lockdep_assert_held(&p->pi_lock);
182
183 for (;;) {
184 rq = task_rq(p);
185 raw_spin_lock(&rq->lock);
186 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
187 rf->cookie = lockdep_pin_lock(&rq->lock);
188 return rq;
189 }
190 raw_spin_unlock(&rq->lock);
191
192 while (unlikely(task_on_rq_migrating(p)))
193 cpu_relax();
194 }
195}
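/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */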
200struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
201 __acquires(p->pi_lock)
202 __acquires(rq->lock)
203{
204 struct rq *rq;
205
206 for (;;) {
207 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
208 rq = task_rq(p);
209 raw_spin_lock(&rq->lock);
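		/*
		 * While task_rq(p) can change under us, the rq->lock we just
		 * acquired fully serializes against move_queued_task(): if we
		 * observed the old CPU, the old rq->lock orders us after the
		 * migration stores; if we observed the new CPU, acquiring its
		 * lock pairs with the barrier in __set_task_cpu(), so we must
		 * also observe task_on_rq_migrating().
		 */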
226 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
227 rf->cookie = lockdep_pin_lock(&rq->lock);
228 return rq;
229 }
230 raw_spin_unlock(&rq->lock);
231 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
232
233 while (unlikely(task_on_rq_migrating(p)))
234 cpu_relax();
235 }
236}
237
238#ifdef CONFIG_SCHED_HRTICK
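/*
 * Use HR-timers to deliver accurate preemption points.
 */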
243static void hrtick_clear(struct rq *rq)
244{
245 if (hrtimer_active(&rq->hrtick_timer))
246 hrtimer_cancel(&rq->hrtick_timer);
247}
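/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */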
253static enum hrtimer_restart hrtick(struct hrtimer *timer)
254{
255 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
256
257 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
258
259 raw_spin_lock(&rq->lock);
260 update_rq_clock(rq);
261 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
262 raw_spin_unlock(&rq->lock);
263
264 return HRTIMER_NORESTART;
265}
266
267#ifdef CONFIG_SMP
268
269static void __hrtick_restart(struct rq *rq)
270{
271 struct hrtimer *timer = &rq->hrtick_timer;
272
273 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
274}
275
276
277
278
279static void __hrtick_start(void *arg)
280{
281 struct rq *rq = arg;
282
283 raw_spin_lock(&rq->lock);
284 __hrtick_restart(rq);
285 rq->hrtick_csd_pending = 0;
286 raw_spin_unlock(&rq->lock);
287}
288
289
290
291
292
293
294void hrtick_start(struct rq *rq, u64 delay)
295{
296 struct hrtimer *timer = &rq->hrtick_timer;
297 ktime_t time;
298 s64 delta;
299
300
301
302
303
304 delta = max_t(s64, delay, 10000LL);
305 time = ktime_add_ns(timer->base->get_time(), delta);
306
307 hrtimer_set_expires(timer, time);
308
309 if (rq == this_rq()) {
310 __hrtick_restart(rq);
311 } else if (!rq->hrtick_csd_pending) {
312 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
313 rq->hrtick_csd_pending = 1;
314 }
315}
316
317#else
318
319
320
321
322
323void hrtick_start(struct rq *rq, u64 delay)
324{
325
326
327
328
329 delay = max_t(u64, delay, 10000LL);
330 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
331 HRTIMER_MODE_REL_PINNED);
332}
333#endif
334
335static void init_rq_hrtick(struct rq *rq)
336{
337#ifdef CONFIG_SMP
338 rq->hrtick_csd_pending = 0;
339
340 rq->hrtick_csd.flags = 0;
341 rq->hrtick_csd.func = __hrtick_start;
342 rq->hrtick_csd.info = rq;
343#endif
344
345 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
346 rq->hrtick_timer.function = hrtick;
347}
348#else
349static inline void hrtick_clear(struct rq *rq)
350{
351}
352
353static inline void init_rq_hrtick(struct rq *rq)
354{
355}
356#endif
357
358
359
360
361#define fetch_or(ptr, mask) \
362 ({ \
363 typeof(ptr) _ptr = (ptr); \
364 typeof(mask) _mask = (mask); \
365 typeof(*_ptr) _old, _val = *_ptr; \
366 \
367 for (;;) { \
368 _old = cmpxchg(_ptr, _val, _val | _mask); \
369 if (_old == _val) \
370 break; \
371 _val = _old; \
372 } \
373 _old; \
374})
375
376#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
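/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */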
382static bool set_nr_and_not_polling(struct task_struct *p)
383{
384 struct thread_info *ti = task_thread_info(p);
385 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
386}
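/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */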
394static bool set_nr_if_polling(struct task_struct *p)
395{
396 struct thread_info *ti = task_thread_info(p);
397 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
398
399 for (;;) {
400 if (!(val & _TIF_POLLING_NRFLAG))
401 return false;
402 if (val & _TIF_NEED_RESCHED)
403 return true;
404 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
405 if (old == val)
406 break;
407 val = old;
408 }
409 return true;
410}
411
412#else
413static bool set_nr_and_not_polling(struct task_struct *p)
414{
415 set_tsk_need_resched(p);
416 return true;
417}
418
419#ifdef CONFIG_SMP
420static bool set_nr_if_polling(struct task_struct *p)
421{
422 return false;
423}
424#endif
425#endif
426
427void wake_q_add(struct wake_q_head *head, struct task_struct *task)
428{
429 struct wake_q_node *node = &task->wake_q;
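	/*
	 * Atomically grab the task; if ->wake_q is already non-NULL the
	 * task is queued (by us or someone else) and will be woken by
	 * whoever queued it first, so there is nothing more to do.
	 */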
439 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
440 return;
441
442 get_task_struct(task);
443
444
445
446
447 *head->lastp = node;
448 head->lastp = &node->next;
449}
450
451void wake_up_q(struct wake_q_head *head)
452{
453 struct wake_q_node *node = head->first;
454
455 while (node != WAKE_Q_TAIL) {
456 struct task_struct *task;
457
458 task = container_of(node, struct task_struct, wake_q);
459 BUG_ON(!task);
460
461 node = node->next;
462 task->wake_q.next = NULL;
463
464
465
466
467
468 wake_up_process(task);
469 put_task_struct(task);
470 }
471}
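/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */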
480void resched_curr(struct rq *rq)
481{
482 struct task_struct *curr = rq->curr;
483 int cpu;
484
485 lockdep_assert_held(&rq->lock);
486
487 if (test_tsk_need_resched(curr))
488 return;
489
490 cpu = cpu_of(rq);
491
492 if (cpu == smp_processor_id()) {
493 set_tsk_need_resched(curr);
494 set_preempt_need_resched();
495 return;
496 }
497
498 if (set_nr_and_not_polling(curr))
499 smp_send_reschedule(cpu);
500 else
501 trace_sched_wake_idle_without_ipi(cpu);
502}
503
504void resched_cpu(int cpu)
505{
506 struct rq *rq = cpu_rq(cpu);
507 unsigned long flags;
508
509 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
510 return;
511 resched_curr(rq);
512 raw_spin_unlock_irqrestore(&rq->lock, flags);
513}
514
515#ifdef CONFIG_SMP
516#ifdef CONFIG_NO_HZ_COMMON
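/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu.  This is good for power-savings.
 *
 * We don't do a similar optimization for a completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be up to date wrt jiffies etc).
 */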
525int get_nohz_timer_target(void)
526{
527 int i, cpu = smp_processor_id();
528 struct sched_domain *sd;
529
530 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
531 return cpu;
532
533 rcu_read_lock();
534 for_each_domain(cpu, sd) {
535 for_each_cpu(i, sched_domain_span(sd)) {
536 if (cpu == i)
537 continue;
538
539 if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
540 cpu = i;
541 goto unlock;
542 }
543 }
544 }
545
546 if (!is_housekeeping_cpu(cpu))
547 cpu = housekeeping_any_cpu();
548unlock:
549 rcu_read_unlock();
550 return cpu;
551}
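/*
 * When add_timer_on() enqueues a timer into the timer wheel of an idle
 * CPU then this timer might expire before the next timer event which is
 * scheduled to wake up that CPU.  wake_up_idle_cpu() makes sure the CPU
 * is woken up so the next event is handled without waiting for the
 * (possibly far away) next tick.
 */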
562static void wake_up_idle_cpu(int cpu)
563{
564 struct rq *rq = cpu_rq(cpu);
565
566 if (cpu == smp_processor_id())
567 return;
568
569 if (set_nr_and_not_polling(rq->idle))
570 smp_send_reschedule(cpu);
571 else
572 trace_sched_wake_idle_without_ipi(cpu);
573}
574
575static bool wake_up_full_nohz_cpu(int cpu)
576{
577
578
579
580
581
582
583 if (tick_nohz_full_cpu(cpu)) {
584 if (cpu != smp_processor_id() ||
585 tick_nohz_tick_stopped())
586 tick_nohz_full_kick_cpu(cpu);
587 return true;
588 }
589
590 return false;
591}
592
593void wake_up_nohz_cpu(int cpu)
594{
595 if (!wake_up_full_nohz_cpu(cpu))
596 wake_up_idle_cpu(cpu);
597}
598
599static inline bool got_nohz_idle_kick(void)
600{
601 int cpu = smp_processor_id();
602
603 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
604 return false;
605
606 if (idle_cpu(cpu) && !need_resched())
607 return true;
608
609
610
611
612
613 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
614 return false;
615}
616
617#else
618
619static inline bool got_nohz_idle_kick(void)
620{
621 return false;
622}
623
624#endif
625
626#ifdef CONFIG_NO_HZ_FULL
627bool sched_can_stop_tick(struct rq *rq)
628{
629 int fifo_nr_running;
630
631
632 if (rq->dl.dl_nr_running)
633 return false;
634
635
636
637
638
639 if (rq->rt.rr_nr_running) {
640 if (rq->rt.rr_nr_running == 1)
641 return true;
642 else
643 return false;
644 }
645
646
647
648
649
650 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
651 if (fifo_nr_running)
652 return true;
653
654
655
656
657
658
659 if (rq->nr_running > 1)
660 return false;
661
662 return true;
663}
664#endif
665
666void sched_avg_update(struct rq *rq)
667{
668 s64 period = sched_avg_period();
669
670 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
671
672
673
674
675
676 asm("" : "+rm" (rq->age_stamp));
677 rq->age_stamp += period;
678 rq->rt_avg /= 2;
679 }
680}
681
682#endif
683
684#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
685 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
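/*
 * Iterate the task_group tree rooted at *from, calling @down when first
 * entering a node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or a sufficient equivalent.
 */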
692int walk_tg_tree_from(struct task_group *from,
693 tg_visitor down, tg_visitor up, void *data)
694{
695 struct task_group *parent, *child;
696 int ret;
697
698 parent = from;
699
700down:
701 ret = (*down)(parent, data);
702 if (ret)
703 goto out;
704 list_for_each_entry_rcu(child, &parent->children, siblings) {
705 parent = child;
706 goto down;
707
708up:
709 continue;
710 }
711 ret = (*up)(parent, data);
712 if (ret || parent == from)
713 goto out;
714
715 child = parent;
716 parent = parent->parent;
717 if (parent)
718 goto up;
719out:
720 return ret;
721}
722
723int tg_nop(struct task_group *tg, void *data)
724{
725 return 0;
726}
727#endif
728
729static void set_load_weight(struct task_struct *p)
730{
731 int prio = p->static_prio - MAX_RT_PRIO;
732 struct load_weight *load = &p->se.load;
733
734
735
736
737 if (idle_policy(p->policy)) {
738 load->weight = scale_load(WEIGHT_IDLEPRIO);
739 load->inv_weight = WMULT_IDLEPRIO;
740 return;
741 }
742
743 load->weight = scale_load(sched_prio_to_weight[prio]);
744 load->inv_weight = sched_prio_to_wmult[prio];
745}
746
747static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
748{
749 update_rq_clock(rq);
750 if (!(flags & ENQUEUE_RESTORE))
751 sched_info_queued(rq, p);
752 p->sched_class->enqueue_task(rq, p, flags);
753}
754
755static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
756{
757 update_rq_clock(rq);
758 if (!(flags & DEQUEUE_SAVE))
759 sched_info_dequeued(rq, p);
760 p->sched_class->dequeue_task(rq, p, flags);
761}
762
763void activate_task(struct rq *rq, struct task_struct *p, int flags)
764{
765 if (task_contributes_to_load(p))
766 rq->nr_uninterruptible--;
767
768 enqueue_task(rq, p, flags);
769}
770
771void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
772{
773 if (task_contributes_to_load(p))
774 rq->nr_uninterruptible++;
775
776 dequeue_task(rq, p, flags);
777}
778
779static void update_rq_clock_task(struct rq *rq, s64 delta)
780{
781
782
783
784
785#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
786 s64 steal = 0, irq_delta = 0;
787#endif
788#ifdef CONFIG_IRQ_TIME_ACCOUNTING
789 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
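	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run
	 * into this case when a previous update_rq_clock() happened inside
	 * a {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fits, so that
	 * the next update will consume the rest.  This ensures ->clock_task
	 * is monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time; a more accurate solution would be to update the irq_time
	 * using the current rq->clock timestamp, except that would require
	 * using atomic ops.
	 */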
806 if (irq_delta > delta)
807 irq_delta = delta;
808
809 rq->prev_irq_time += irq_delta;
810 delta -= irq_delta;
811#endif
812#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
814 steal = paravirt_steal_clock(cpu_of(rq));
815 steal -= rq->prev_steal_time_rq;
816
817 if (unlikely(steal > delta))
818 steal = delta;
819
820 rq->prev_steal_time_rq += steal;
821 delta -= steal;
822 }
823#endif
824
825 rq->clock_task += delta;
826
827#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
828 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
829 sched_rt_avg_update(rq, irq_delta + steal);
830#endif
831}
832
833void sched_set_stop_task(int cpu, struct task_struct *stop)
834{
835 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
836 struct task_struct *old_stop = cpu_rq(cpu)->stop;
837
838 if (stop) {
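		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about its
		 * meaning.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */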
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
848
849 stop->sched_class = &stop_sched_class;
850 }
851
852 cpu_rq(cpu)->stop = stop;
853
854 if (old_stop) {
855
856
857
858
859 old_stop->sched_class = &rt_sched_class;
860 }
861}
862
863
864
865
866static inline int __normal_prio(struct task_struct *p)
867{
868 return p->static_prio;
869}
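/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */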
878static inline int normal_prio(struct task_struct *p)
879{
880 int prio;
881
882 if (task_has_dl_policy(p))
883 prio = MAX_DL_PRIO-1;
884 else if (task_has_rt_policy(p))
885 prio = MAX_RT_PRIO-1 - p->rt_priority;
886 else
887 prio = __normal_prio(p);
888 return prio;
889}
890
891
892
893
894
895
896
897
898static int effective_prio(struct task_struct *p)
899{
900 p->normal_prio = normal_prio(p);
901
902
903
904
905
906 if (!rt_prio(p->prio))
907 return p->normal_prio;
908 return p->prio;
909}
910
911
912
913
914
915
916
917inline int task_curr(const struct task_struct *p)
918{
919 return cpu_curr(task_cpu(p)) == p;
920}
921
922
923
924
925
926
927
928
929static inline void check_class_changed(struct rq *rq, struct task_struct *p,
930 const struct sched_class *prev_class,
931 int oldprio)
932{
933 if (prev_class != p->sched_class) {
934 if (prev_class->switched_from)
935 prev_class->switched_from(rq, p);
936
937 p->sched_class->switched_to(rq, p);
938 } else if (oldprio != p->prio || dl_task(p))
939 p->sched_class->prio_changed(rq, p, oldprio);
940}
941
942void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
943{
944 const struct sched_class *class;
945
946 if (p->sched_class == rq->curr->sched_class) {
947 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
948 } else {
949 for_each_class(class) {
950 if (class == rq->curr->sched_class)
951 break;
952 if (class == p->sched_class) {
953 resched_curr(rq);
954 break;
955 }
956 }
957 }
958
959
960
961
962
963 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
964 rq_clock_skip_update(rq, true);
965}
966
967#ifdef CONFIG_SMP
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
988{
989 lockdep_assert_held(&rq->lock);
990
991 p->on_rq = TASK_ON_RQ_MIGRATING;
992 dequeue_task(rq, p, 0);
993 set_task_cpu(p, new_cpu);
994 raw_spin_unlock(&rq->lock);
995
996 rq = cpu_rq(new_cpu);
997
998 raw_spin_lock(&rq->lock);
999 BUG_ON(task_cpu(p) != new_cpu);
1000 enqueue_task(rq, p, 0);
1001 p->on_rq = TASK_ON_RQ_QUEUED;
1002 check_preempt_curr(rq, p, 0);
1003
1004 return rq;
1005}
1006
1007struct migration_arg {
1008 struct task_struct *task;
1009 int dest_cpu;
1010};
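/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */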
1021static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
1022{
1023 if (unlikely(!cpu_active(dest_cpu)))
1024 return rq;
1025
1026
1027 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1028 return rq;
1029
1030 rq = move_queued_task(rq, p, dest_cpu);
1031
1032 return rq;
1033}
1034
1035
1036
1037
1038
1039
1040static int migration_cpu_stop(void *data)
1041{
1042 struct migration_arg *arg = data;
1043 struct task_struct *p = arg->task;
1044 struct rq *rq = this_rq();
1045
1046
1047
1048
1049
1050 local_irq_disable();
1051
1052
1053
1054
1055
1056 sched_ttwu_pending();
1057
1058 raw_spin_lock(&p->pi_lock);
1059 raw_spin_lock(&rq->lock);
1060
1061
1062
1063
1064
1065 if (task_rq(p) == rq && task_on_rq_queued(p))
1066 rq = __migrate_task(rq, p, arg->dest_cpu);
1067 raw_spin_unlock(&rq->lock);
1068 raw_spin_unlock(&p->pi_lock);
1069
1070 local_irq_enable();
1071 return 0;
1072}
1073
1074
1075
1076
1077
1078void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1079{
1080 cpumask_copy(&p->cpus_allowed, new_mask);
1081 p->nr_cpus_allowed = cpumask_weight(new_mask);
1082}
1083
1084void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1085{
1086 struct rq *rq = task_rq(p);
1087 bool queued, running;
1088
1089 lockdep_assert_held(&p->pi_lock);
1090
1091 queued = task_on_rq_queued(p);
1092 running = task_current(rq, p);
1093
1094 if (queued) {
1095
1096
1097
1098
1099 lockdep_assert_held(&rq->lock);
1100 dequeue_task(rq, p, DEQUEUE_SAVE);
1101 }
1102 if (running)
1103 put_prev_task(rq, p);
1104
1105 p->sched_class->set_cpus_allowed(p, new_mask);
1106
1107 if (running)
1108 p->sched_class->set_curr_task(rq);
1109 if (queued)
1110 enqueue_task(rq, p, ENQUEUE_RESTORE);
1111}
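/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */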
1122static int __set_cpus_allowed_ptr(struct task_struct *p,
1123 const struct cpumask *new_mask, bool check)
1124{
1125 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1126 unsigned int dest_cpu;
1127 struct rq_flags rf;
1128 struct rq *rq;
1129 int ret = 0;
1130
1131 rq = task_rq_lock(p, &rf);
1132
1133 if (p->flags & PF_KTHREAD) {
1134
1135
1136
1137 cpu_valid_mask = cpu_online_mask;
1138 }
1139
1140
1141
1142
1143
1144 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1145 ret = -EINVAL;
1146 goto out;
1147 }
1148
1149 if (cpumask_equal(&p->cpus_allowed, new_mask))
1150 goto out;
1151
1152 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
1153 ret = -EINVAL;
1154 goto out;
1155 }
1156
1157 do_set_cpus_allowed(p, new_mask);
1158
1159 if (p->flags & PF_KTHREAD) {
1160
1161
1162
1163
1164 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1165 !cpumask_intersects(new_mask, cpu_active_mask) &&
1166 p->nr_cpus_allowed != 1);
1167 }
1168
1169
1170 if (cpumask_test_cpu(task_cpu(p), new_mask))
1171 goto out;
1172
1173 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1174 if (task_running(rq, p) || p->state == TASK_WAKING) {
1175 struct migration_arg arg = { p, dest_cpu };
1176
1177 task_rq_unlock(rq, p, &rf);
1178 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1179 tlb_migrate_finish(p->mm);
1180 return 0;
1181 } else if (task_on_rq_queued(p)) {
1182
1183
1184
1185
1186 lockdep_unpin_lock(&rq->lock, rf.cookie);
1187 rq = move_queued_task(rq, p, dest_cpu);
1188 lockdep_repin_lock(&rq->lock, rf.cookie);
1189 }
1190out:
1191 task_rq_unlock(rq, p, &rf);
1192
1193 return ret;
1194}
1195
1196int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1197{
1198 return __set_cpus_allowed_ptr(p, new_mask, false);
1199}
1200EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1201
1202void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1203{
1204#ifdef CONFIG_SCHED_DEBUG
1205
1206
1207
1208
1209 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1210 !p->on_rq);
1211
1212
1213
1214
1215
1216
1217 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1218 p->sched_class == &fair_sched_class &&
1219 (p->on_rq && !task_on_rq_migrating(p)));
1220
1221#ifdef CONFIG_LOCKDEP
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1233 lockdep_is_held(&task_rq(p)->lock)));
1234#endif
1235#endif
1236
1237 trace_sched_migrate_task(p, new_cpu);
1238
1239 if (task_cpu(p) != new_cpu) {
1240 if (p->sched_class->migrate_task_rq)
1241 p->sched_class->migrate_task_rq(p);
1242 p->se.nr_migrations++;
1243 perf_event_task_migrate(p);
1244 }
1245
1246 __set_task_cpu(p, new_cpu);
1247}
1248
1249static void __migrate_swap_task(struct task_struct *p, int cpu)
1250{
1251 if (task_on_rq_queued(p)) {
1252 struct rq *src_rq, *dst_rq;
1253
1254 src_rq = task_rq(p);
1255 dst_rq = cpu_rq(cpu);
1256
1257 p->on_rq = TASK_ON_RQ_MIGRATING;
1258 deactivate_task(src_rq, p, 0);
1259 set_task_cpu(p, cpu);
1260 activate_task(dst_rq, p, 0);
1261 p->on_rq = TASK_ON_RQ_QUEUED;
1262 check_preempt_curr(dst_rq, p, 0);
1263 } else {
1264
1265
1266
1267
1268
1269 p->wake_cpu = cpu;
1270 }
1271}
1272
1273struct migration_swap_arg {
1274 struct task_struct *src_task, *dst_task;
1275 int src_cpu, dst_cpu;
1276};
1277
1278static int migrate_swap_stop(void *data)
1279{
1280 struct migration_swap_arg *arg = data;
1281 struct rq *src_rq, *dst_rq;
1282 int ret = -EAGAIN;
1283
1284 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1285 return -EAGAIN;
1286
1287 src_rq = cpu_rq(arg->src_cpu);
1288 dst_rq = cpu_rq(arg->dst_cpu);
1289
1290 double_raw_lock(&arg->src_task->pi_lock,
1291 &arg->dst_task->pi_lock);
1292 double_rq_lock(src_rq, dst_rq);
1293
1294 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1295 goto unlock;
1296
1297 if (task_cpu(arg->src_task) != arg->src_cpu)
1298 goto unlock;
1299
1300 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1301 goto unlock;
1302
1303 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1304 goto unlock;
1305
1306 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1307 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1308
1309 ret = 0;
1310
1311unlock:
1312 double_rq_unlock(src_rq, dst_rq);
1313 raw_spin_unlock(&arg->dst_task->pi_lock);
1314 raw_spin_unlock(&arg->src_task->pi_lock);
1315
1316 return ret;
1317}
1318
1319
1320
1321
1322int migrate_swap(struct task_struct *cur, struct task_struct *p)
1323{
1324 struct migration_swap_arg arg;
1325 int ret = -EINVAL;
1326
1327 arg = (struct migration_swap_arg){
1328 .src_task = cur,
1329 .src_cpu = task_cpu(cur),
1330 .dst_task = p,
1331 .dst_cpu = task_cpu(p),
1332 };
1333
1334 if (arg.src_cpu == arg.dst_cpu)
1335 goto out;
1336
1337
1338
1339
1340
1341 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1342 goto out;
1343
1344 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1345 goto out;
1346
1347 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1348 goto out;
1349
1350 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1351 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1352
1353out:
1354 return ret;
1355}
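/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a long time. This function can't be
 * called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */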
1373unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1374{
1375 int running, queued;
1376 struct rq_flags rf;
1377 unsigned long ncsw;
1378 struct rq *rq;
1379
1380 for (;;) {
1381
1382
1383
1384
1385
1386
1387 rq = task_rq(p);
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400 while (task_running(rq, p)) {
1401 if (match_state && unlikely(p->state != match_state))
1402 return 0;
1403 cpu_relax();
1404 }
1405
1406
1407
1408
1409
1410
1411 rq = task_rq_lock(p, &rf);
1412 trace_sched_wait_task(p);
1413 running = task_running(rq, p);
1414 queued = task_on_rq_queued(p);
1415 ncsw = 0;
1416 if (!match_state || p->state == match_state)
1417 ncsw = p->nvcsw | LONG_MIN;
1418 task_rq_unlock(rq, p, &rf);
1419
1420
1421
1422
1423 if (unlikely(!ncsw))
1424 break;
1425
1426
1427
1428
1429
1430
1431
1432 if (unlikely(running)) {
1433 cpu_relax();
1434 continue;
1435 }
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446 if (unlikely(queued)) {
1447 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1448
1449 set_current_state(TASK_UNINTERRUPTIBLE);
1450 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1451 continue;
1452 }
1453
1454
1455
1456
1457
1458
1459 break;
1460 }
1461
1462 return ncsw;
1463}
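/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */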
1478void kick_process(struct task_struct *p)
1479{
1480 int cpu;
1481
1482 preempt_disable();
1483 cpu = task_cpu(p);
1484 if ((cpu != smp_processor_id()) && task_curr(p))
1485 smp_send_reschedule(cpu);
1486 preempt_enable();
1487}
1488EXPORT_SYMBOL_GPL(kick_process);
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512static int select_fallback_rq(int cpu, struct task_struct *p)
1513{
1514 int nid = cpu_to_node(cpu);
1515 const struct cpumask *nodemask = NULL;
1516 enum { cpuset, possible, fail } state = cpuset;
1517 int dest_cpu;
1518
1519
1520
1521
1522
1523
1524 if (nid != -1) {
1525 nodemask = cpumask_of_node(nid);
1526
1527
1528 for_each_cpu(dest_cpu, nodemask) {
1529 if (!cpu_active(dest_cpu))
1530 continue;
1531 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1532 return dest_cpu;
1533 }
1534 }
1535
1536 for (;;) {
1537
1538 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1539 if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1540 continue;
1541 if (!cpu_online(dest_cpu))
1542 continue;
1543 goto out;
1544 }
1545
1546
1547 switch (state) {
1548 case cpuset:
1549 if (IS_ENABLED(CONFIG_CPUSETS)) {
1550 cpuset_cpus_allowed_fallback(p);
1551 state = possible;
1552 break;
1553 }
1554
1555 case possible:
1556 do_set_cpus_allowed(p, cpu_possible_mask);
1557 state = fail;
1558 break;
1559
1560 case fail:
1561 BUG();
1562 break;
1563 }
1564 }
1565
1566out:
1567 if (state != cpuset) {
1568
1569
1570
1571
1572
1573 if (p->mm && printk_ratelimit()) {
1574 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1575 task_pid_nr(p), p->comm, cpu);
1576 }
1577 }
1578
1579 return dest_cpu;
1580}
1581
1582
1583
1584
1585static inline
1586int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1587{
1588 lockdep_assert_held(&p->pi_lock);
1589
1590 if (tsk_nr_cpus_allowed(p) > 1)
1591 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1592 else
1593 cpu = cpumask_any(tsk_cpus_allowed(p));
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1606 !cpu_online(cpu)))
1607 cpu = select_fallback_rq(task_cpu(p), p);
1608
1609 return cpu;
1610}
1611
1612static void update_avg(u64 *avg, u64 sample)
1613{
1614 s64 diff = sample - *avg;
1615 *avg += diff >> 3;
1616}
1617
1618#else
1619
1620static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1621 const struct cpumask *new_mask, bool check)
1622{
1623 return set_cpus_allowed_ptr(p, new_mask);
1624}
1625
1626#endif
1627
1628static void
1629ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1630{
1631#ifdef CONFIG_SCHEDSTATS
1632 struct rq *rq = this_rq();
1633
1634#ifdef CONFIG_SMP
1635 int this_cpu = smp_processor_id();
1636
1637 if (cpu == this_cpu) {
1638 schedstat_inc(rq, ttwu_local);
1639 schedstat_inc(p, se.statistics.nr_wakeups_local);
1640 } else {
1641 struct sched_domain *sd;
1642
1643 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1644 rcu_read_lock();
1645 for_each_domain(this_cpu, sd) {
1646 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1647 schedstat_inc(sd, ttwu_wake_remote);
1648 break;
1649 }
1650 }
1651 rcu_read_unlock();
1652 }
1653
1654 if (wake_flags & WF_MIGRATED)
1655 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1656
1657#endif
1658
1659 schedstat_inc(rq, ttwu_count);
1660 schedstat_inc(p, se.statistics.nr_wakeups);
1661
1662 if (wake_flags & WF_SYNC)
1663 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1664
1665#endif
1666}
1667
1668static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1669{
1670 activate_task(rq, p, en_flags);
1671 p->on_rq = TASK_ON_RQ_QUEUED;
1672
1673
1674 if (p->flags & PF_WQ_WORKER)
1675 wq_worker_waking_up(p, cpu_of(rq));
1676}
1677
1678
1679
1680
1681static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
1682 struct pin_cookie cookie)
1683{
1684 check_preempt_curr(rq, p, wake_flags);
1685 p->state = TASK_RUNNING;
1686 trace_sched_wakeup(p);
1687
1688#ifdef CONFIG_SMP
1689 if (p->sched_class->task_woken) {
1690
1691
1692
1693
1694 lockdep_unpin_lock(&rq->lock, cookie);
1695 p->sched_class->task_woken(rq, p);
1696 lockdep_repin_lock(&rq->lock, cookie);
1697 }
1698
1699 if (rq->idle_stamp) {
1700 u64 delta = rq_clock(rq) - rq->idle_stamp;
1701 u64 max = 2*rq->max_idle_balance_cost;
1702
1703 update_avg(&rq->avg_idle, delta);
1704
1705 if (rq->avg_idle > max)
1706 rq->avg_idle = max;
1707
1708 rq->idle_stamp = 0;
1709 }
1710#endif
1711}
1712
1713static void
1714ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1715 struct pin_cookie cookie)
1716{
1717 int en_flags = ENQUEUE_WAKEUP;
1718
1719 lockdep_assert_held(&rq->lock);
1720
1721#ifdef CONFIG_SMP
1722 if (p->sched_contributes_to_load)
1723 rq->nr_uninterruptible--;
1724
1725 if (wake_flags & WF_MIGRATED)
1726 en_flags |= ENQUEUE_MIGRATED;
1727#endif
1728
1729 ttwu_activate(rq, p, en_flags);
1730 ttwu_do_wakeup(rq, p, wake_flags, cookie);
1731}
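/*
 * Called in case the task @p isn't fully descheduled from its runqueue;
 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, as the task
 * is still ->on_rq.
 */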
1739static int ttwu_remote(struct task_struct *p, int wake_flags)
1740{
1741 struct rq_flags rf;
1742 struct rq *rq;
1743 int ret = 0;
1744
1745 rq = __task_rq_lock(p, &rf);
1746 if (task_on_rq_queued(p)) {
1747
1748 update_rq_clock(rq);
1749 ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
1750 ret = 1;
1751 }
1752 __task_rq_unlock(rq, &rf);
1753
1754 return ret;
1755}
1756
1757#ifdef CONFIG_SMP
1758void sched_ttwu_pending(void)
1759{
1760 struct rq *rq = this_rq();
1761 struct llist_node *llist = llist_del_all(&rq->wake_list);
1762 struct pin_cookie cookie;
1763 struct task_struct *p;
1764 unsigned long flags;
1765
1766 if (!llist)
1767 return;
1768
1769 raw_spin_lock_irqsave(&rq->lock, flags);
1770 cookie = lockdep_pin_lock(&rq->lock);
1771
1772 while (llist) {
1773 int wake_flags = 0;
1774
1775 p = llist_entry(llist, struct task_struct, wake_entry);
1776 llist = llist_next(llist);
1777
1778 if (p->sched_remote_wakeup)
1779 wake_flags = WF_MIGRATED;
1780
1781 ttwu_do_activate(rq, p, wake_flags, cookie);
1782 }
1783
1784 lockdep_unpin_lock(&rq->lock, cookie);
1785 raw_spin_unlock_irqrestore(&rq->lock, flags);
1786}
1787
1788void scheduler_ipi(void)
1789{
1790
1791
1792
1793
1794
1795 preempt_fold_need_resched();
1796
1797 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1798 return;
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813 irq_enter();
1814 sched_ttwu_pending();
1815
1816
1817
1818
1819 if (unlikely(got_nohz_idle_kick())) {
1820 this_rq()->idle_balance = 1;
1821 raise_softirq_irqoff(SCHED_SOFTIRQ);
1822 }
1823 irq_exit();
1824}
1825
1826static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1827{
1828 struct rq *rq = cpu_rq(cpu);
1829
1830 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1831
1832 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1833 if (!set_nr_if_polling(rq->idle))
1834 smp_send_reschedule(cpu);
1835 else
1836 trace_sched_wake_idle_without_ipi(cpu);
1837 }
1838}
1839
1840void wake_up_if_idle(int cpu)
1841{
1842 struct rq *rq = cpu_rq(cpu);
1843 unsigned long flags;
1844
1845 rcu_read_lock();
1846
1847 if (!is_idle_task(rcu_dereference(rq->curr)))
1848 goto out;
1849
1850 if (set_nr_if_polling(rq->idle)) {
1851 trace_sched_wake_idle_without_ipi(cpu);
1852 } else {
1853 raw_spin_lock_irqsave(&rq->lock, flags);
1854 if (is_idle_task(rq->curr))
1855 smp_send_reschedule(cpu);
1856
1857 raw_spin_unlock_irqrestore(&rq->lock, flags);
1858 }
1859
1860out:
1861 rcu_read_unlock();
1862}
1863
1864bool cpus_share_cache(int this_cpu, int that_cpu)
1865{
1866 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1867}
1868#endif
1869
1870static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1871{
1872 struct rq *rq = cpu_rq(cpu);
1873 struct pin_cookie cookie;
1874
1875#if defined(CONFIG_SMP)
1876 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1877 sched_clock_cpu(cpu);
1878 ttwu_queue_remote(p, cpu, wake_flags);
1879 return;
1880 }
1881#endif
1882
1883 raw_spin_lock(&rq->lock);
1884 cookie = lockdep_pin_lock(&rq->lock);
1885 ttwu_do_activate(rq, p, wake_flags, cookie);
1886 lockdep_unpin_lock(&rq->lock, cookie);
1887 raw_spin_unlock(&rq->lock);
1888}
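/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put @p on the run-queue if it's not already there.  The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */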
1996static int
1997try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1998{
1999 unsigned long flags;
2000 int cpu, success = 0;
2001
2002
2003
2004
2005
2006
2007
2008 smp_mb__before_spinlock();
2009 raw_spin_lock_irqsave(&p->pi_lock, flags);
2010 if (!(p->state & state))
2011 goto out;
2012
2013 trace_sched_waking(p);
2014
2015 success = 1;
2016 cpu = task_cpu(p);
2017
2018 if (p->on_rq && ttwu_remote(p, wake_flags))
2019 goto stat;
2020
2021#ifdef CONFIG_SMP
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039 smp_rmb();
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050 smp_cond_acquire(!p->on_cpu);
2051
2052 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2053 p->state = TASK_WAKING;
2054
2055 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2056 if (task_cpu(p) != cpu) {
2057 wake_flags |= WF_MIGRATED;
2058 set_task_cpu(p, cpu);
2059 }
2060#endif
2061
2062 ttwu_queue(p, cpu, wake_flags);
2063stat:
2064 if (schedstat_enabled())
2065 ttwu_stat(p, cpu, wake_flags);
2066out:
2067 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2068
2069 return success;
2070}
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
2081{
2082 struct rq *rq = task_rq(p);
2083
2084 if (WARN_ON_ONCE(rq != this_rq()) ||
2085 WARN_ON_ONCE(p == current))
2086 return;
2087
2088 lockdep_assert_held(&rq->lock);
2089
2090 if (!raw_spin_trylock(&p->pi_lock)) {
2091
2092
2093
2094
2095
2096
2097 lockdep_unpin_lock(&rq->lock, cookie);
2098 raw_spin_unlock(&rq->lock);
2099 raw_spin_lock(&p->pi_lock);
2100 raw_spin_lock(&rq->lock);
2101 lockdep_repin_lock(&rq->lock, cookie);
2102 }
2103
2104 if (!(p->state & TASK_NORMAL))
2105 goto out;
2106
2107 trace_sched_waking(p);
2108
2109 if (!task_on_rq_queued(p))
2110 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2111
2112 ttwu_do_wakeup(rq, p, 0, cookie);
2113 if (schedstat_enabled())
2114 ttwu_stat(p, smp_processor_id(), 0);
2115out:
2116 raw_spin_unlock(&p->pi_lock);
2117}
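/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of
 * runnable processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier
 * before changing the task state if and only if any tasks are woken up.
 */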
2131int wake_up_process(struct task_struct *p)
2132{
2133 return try_to_wake_up(p, TASK_NORMAL, 0);
2134}
2135EXPORT_SYMBOL(wake_up_process);
2136
2137int wake_up_state(struct task_struct *p, unsigned int state)
2138{
2139 return try_to_wake_up(p, state, 0);
2140}
2141
2142
2143
2144
2145void __dl_clear_params(struct task_struct *p)
2146{
2147 struct sched_dl_entity *dl_se = &p->dl;
2148
2149 dl_se->dl_runtime = 0;
2150 dl_se->dl_deadline = 0;
2151 dl_se->dl_period = 0;
2152 dl_se->flags = 0;
2153 dl_se->dl_bw = 0;
2154
2155 dl_se->dl_throttled = 0;
2156 dl_se->dl_yielded = 0;
2157}
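/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */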
2165static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2166{
2167 p->on_rq = 0;
2168
2169 p->se.on_rq = 0;
2170 p->se.exec_start = 0;
2171 p->se.sum_exec_runtime = 0;
2172 p->se.prev_sum_exec_runtime = 0;
2173 p->se.nr_migrations = 0;
2174 p->se.vruntime = 0;
2175 INIT_LIST_HEAD(&p->se.group_node);
2176
2177#ifdef CONFIG_FAIR_GROUP_SCHED
2178 p->se.cfs_rq = NULL;
2179#endif
2180
2181#ifdef CONFIG_SCHEDSTATS
2182
2183 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2184#endif
2185
2186 RB_CLEAR_NODE(&p->dl.rb_node);
2187 init_dl_task_timer(&p->dl);
2188 __dl_clear_params(p);
2189
2190 INIT_LIST_HEAD(&p->rt.run_list);
2191 p->rt.timeout = 0;
2192 p->rt.time_slice = sched_rr_timeslice;
2193 p->rt.on_rq = 0;
2194 p->rt.on_list = 0;
2195
2196#ifdef CONFIG_PREEMPT_NOTIFIERS
2197 INIT_HLIST_HEAD(&p->preempt_notifiers);
2198#endif
2199
2200#ifdef CONFIG_NUMA_BALANCING
2201 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
2202 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2203 p->mm->numa_scan_seq = 0;
2204 }
2205
2206 if (clone_flags & CLONE_VM)
2207 p->numa_preferred_nid = current->numa_preferred_nid;
2208 else
2209 p->numa_preferred_nid = -1;
2210
2211 p->node_stamp = 0ULL;
2212 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
2213 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2214 p->numa_work.next = &p->numa_work;
2215 p->numa_faults = NULL;
2216 p->last_task_numa_placement = 0;
2217 p->last_sum_exec_runtime = 0;
2218
2219 p->numa_group = NULL;
2220#endif
2221}
2222
2223DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2224
2225#ifdef CONFIG_NUMA_BALANCING
2226
2227void set_numabalancing_state(bool enabled)
2228{
2229 if (enabled)
2230 static_branch_enable(&sched_numa_balancing);
2231 else
2232 static_branch_disable(&sched_numa_balancing);
2233}
2234
2235#ifdef CONFIG_PROC_SYSCTL
2236int sysctl_numa_balancing(struct ctl_table *table, int write,
2237 void __user *buffer, size_t *lenp, loff_t *ppos)
2238{
2239 struct ctl_table t;
2240 int err;
2241 int state = static_branch_likely(&sched_numa_balancing);
2242
2243 if (write && !capable(CAP_SYS_ADMIN))
2244 return -EPERM;
2245
2246 t = *table;
2247 t.data = &state;
2248 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2249 if (err < 0)
2250 return err;
2251 if (write)
2252 set_numabalancing_state(state);
2253 return err;
2254}
2255#endif
2256#endif
2257
2258#ifdef CONFIG_SCHEDSTATS
2259
2260DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2261static bool __initdata __sched_schedstats = false;
2262
2263static void set_schedstats(bool enabled)
2264{
2265 if (enabled)
2266 static_branch_enable(&sched_schedstats);
2267 else
2268 static_branch_disable(&sched_schedstats);
2269}
2270
2271void force_schedstat_enabled(void)
2272{
2273 if (!schedstat_enabled()) {
2274 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2275 static_branch_enable(&sched_schedstats);
2276 }
2277}
2278
2279static int __init setup_schedstats(char *str)
2280{
2281 int ret = 0;
2282 if (!str)
2283 goto out;
2284
2285
2286
2287
2288
2289
2290 if (!strcmp(str, "enable")) {
2291 __sched_schedstats = true;
2292 ret = 1;
2293 } else if (!strcmp(str, "disable")) {
2294 __sched_schedstats = false;
2295 ret = 1;
2296 }
2297out:
2298 if (!ret)
2299 pr_warn("Unable to parse schedstats=\n");
2300
2301 return ret;
2302}
2303__setup("schedstats=", setup_schedstats);
2304
2305static void __init init_schedstats(void)
2306{
2307 set_schedstats(__sched_schedstats);
2308}
2309
2310#ifdef CONFIG_PROC_SYSCTL
2311int sysctl_schedstats(struct ctl_table *table, int write,
2312 void __user *buffer, size_t *lenp, loff_t *ppos)
2313{
2314 struct ctl_table t;
2315 int err;
2316 int state = static_branch_likely(&sched_schedstats);
2317
2318 if (write && !capable(CAP_SYS_ADMIN))
2319 return -EPERM;
2320
2321 t = *table;
2322 t.data = &state;
2323 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2324 if (err < 0)
2325 return err;
2326 if (write)
2327 set_schedstats(state);
2328 return err;
2329}
2330#endif
2331#else
2332static inline void init_schedstats(void) {}
2333#endif
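/*
 * fork()/clone()-time setup:
 */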
2338int sched_fork(unsigned long clone_flags, struct task_struct *p)
2339{
2340 unsigned long flags;
2341 int cpu = get_cpu();
2342
2343 __sched_fork(clone_flags, p);
2344
2345
2346
2347
2348
2349 p->state = TASK_RUNNING;
2350
2351
2352
2353
2354 p->prio = current->normal_prio;
2355
2356
2357
2358
2359 if (unlikely(p->sched_reset_on_fork)) {
2360 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2361 p->policy = SCHED_NORMAL;
2362 p->static_prio = NICE_TO_PRIO(0);
2363 p->rt_priority = 0;
2364 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2365 p->static_prio = NICE_TO_PRIO(0);
2366
2367 p->prio = p->normal_prio = __normal_prio(p);
2368 set_load_weight(p);
2369
2370
2371
2372
2373
2374 p->sched_reset_on_fork = 0;
2375 }
2376
2377 if (dl_prio(p->prio)) {
2378 put_cpu();
2379 return -EAGAIN;
2380 } else if (rt_prio(p->prio)) {
2381 p->sched_class = &rt_sched_class;
2382 } else {
2383 p->sched_class = &fair_sched_class;
2384 }
2385
2386 if (p->sched_class->task_fork)
2387 p->sched_class->task_fork(p);
2388
2389
2390
2391
2392
2393
2394
2395
2396 raw_spin_lock_irqsave(&p->pi_lock, flags);
2397 set_task_cpu(p, cpu);
2398 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2399
2400#ifdef CONFIG_SCHED_INFO
2401 if (likely(sched_info_on()))
2402 memset(&p->sched_info, 0, sizeof(p->sched_info));
2403#endif
2404#if defined(CONFIG_SMP)
2405 p->on_cpu = 0;
2406#endif
2407 init_task_preempt_count(p);
2408#ifdef CONFIG_SMP
2409 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2410 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2411#endif
2412
2413 put_cpu();
2414 return 0;
2415}
2416
2417unsigned long to_ratio(u64 period, u64 runtime)
2418{
2419 if (runtime == RUNTIME_INF)
2420 return 1ULL << 20;
2421
2422
2423
2424
2425
2426
2427 if (period == 0)
2428 return 0;
2429
2430 return div64_u64(runtime << 20, period);
2431}
2432
2433#ifdef CONFIG_SMP
2434inline struct dl_bw *dl_bw_of(int i)
2435{
2436 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2437 "sched RCU must be held");
2438 return &cpu_rq(i)->rd->dl_bw;
2439}
2440
2441static inline int dl_bw_cpus(int i)
2442{
2443 struct root_domain *rd = cpu_rq(i)->rd;
2444 int cpus = 0;
2445
2446 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2447 "sched RCU must be held");
2448 for_each_cpu_and(i, rd->span, cpu_active_mask)
2449 cpus++;
2450
2451 return cpus;
2452}
2453#else
2454inline struct dl_bw *dl_bw_of(int i)
2455{
2456 return &cpu_rq(i)->dl.dl_bw;
2457}
2458
2459static inline int dl_bw_cpus(int i)
2460{
2461 return 1;
2462}
2463#endif
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476static int dl_overflow(struct task_struct *p, int policy,
2477 const struct sched_attr *attr)
2478{
2479
2480 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2481 u64 period = attr->sched_period ?: attr->sched_deadline;
2482 u64 runtime = attr->sched_runtime;
2483 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2484 int cpus, err = -1;
2485
2486
2487 if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
2488 return 0;
2489
2490
2491
2492
2493
2494
2495 raw_spin_lock(&dl_b->lock);
2496 cpus = dl_bw_cpus(task_cpu(p));
2497 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2498 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2499 __dl_add(dl_b, new_bw);
2500 err = 0;
2501 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2502 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2503 __dl_clear(dl_b, p->dl.dl_bw);
2504 __dl_add(dl_b, new_bw);
2505 err = 0;
2506 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2507 __dl_clear(dl_b, p->dl.dl_bw);
2508 err = 0;
2509 }
2510 raw_spin_unlock(&dl_b->lock);
2511
2512 return err;
2513}
2514
2515extern void init_dl_bw(struct dl_bw *dl_b);
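/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */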
2524void wake_up_new_task(struct task_struct *p)
2525{
2526 struct rq_flags rf;
2527 struct rq *rq;
2528
2529
2530 init_entity_runnable_average(&p->se);
2531 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2532#ifdef CONFIG_SMP
2533
2534
2535
2536
2537
2538 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2539#endif
2540 rq = __task_rq_lock(p, &rf);
2541 post_init_entity_util_avg(&p->se);
2542
2543 activate_task(rq, p, 0);
2544 p->on_rq = TASK_ON_RQ_QUEUED;
2545 trace_sched_wakeup_new(p);
2546 check_preempt_curr(rq, p, WF_FORK);
2547#ifdef CONFIG_SMP
2548 if (p->sched_class->task_woken) {
2549
2550
2551
2552
2553 lockdep_unpin_lock(&rq->lock, rf.cookie);
2554 p->sched_class->task_woken(rq, p);
2555 lockdep_repin_lock(&rq->lock, rf.cookie);
2556 }
2557#endif
2558 task_rq_unlock(rq, p, &rf);
2559}
2560
2561#ifdef CONFIG_PREEMPT_NOTIFIERS
2562
2563static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
2564
2565void preempt_notifier_inc(void)
2566{
2567 static_key_slow_inc(&preempt_notifier_key);
2568}
2569EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2570
2571void preempt_notifier_dec(void)
2572{
2573 static_key_slow_dec(&preempt_notifier_key);
2574}
2575EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2576
2577
2578
2579
2580
2581void preempt_notifier_register(struct preempt_notifier *notifier)
2582{
2583 if (!static_key_false(&preempt_notifier_key))
2584 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2585
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2587}
2588EXPORT_SYMBOL_GPL(preempt_notifier_register);
2589
2590
2591
2592
2593
2594
2595
2596void preempt_notifier_unregister(struct preempt_notifier *notifier)
2597{
	hlist_del(&notifier->link);
2599}
2600EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2601
2602static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2603{
2604 struct preempt_notifier *notifier;
2605
2606 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2607 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2608}
2609
2610static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2611{
2612 if (static_key_false(&preempt_notifier_key))
2613 __fire_sched_in_preempt_notifiers(curr);
2614}
2615
2616static void
2617__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2618 struct task_struct *next)
2619{
2620 struct preempt_notifier *notifier;
2621
2622 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2623 notifier->ops->sched_out(notifier, next);
2624}
2625
2626static __always_inline void
2627fire_sched_out_preempt_notifiers(struct task_struct *curr,
2628 struct task_struct *next)
2629{
2630 if (static_key_false(&preempt_notifier_key))
2631 __fire_sched_out_preempt_notifiers(curr, next);
2632}
2633
2634#else
2635
2636static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2637{
2638}
2639
2640static inline void
2641fire_sched_out_preempt_notifiers(struct task_struct *curr,
2642 struct task_struct *next)
2643{
2644}
2645
2646#endif
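/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */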
2661static inline void
2662prepare_task_switch(struct rq *rq, struct task_struct *prev,
2663 struct task_struct *next)
2664{
2665 sched_info_switch(rq, prev, next);
2666 perf_event_task_sched_out(prev, next);
2667 fire_sched_out_preempt_notifiers(prev, next);
2668 prepare_lock_switch(rq, next);
2669 prepare_arch_switch(next);
2670}
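/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for details.)
 *
 * The context switch has flipped the stack from under us and restored the
 * local variables which were saved when this task called schedule() in the
 * past. prev == current is still correct but we need to recalculate this_rq
 * because prev may have moved to another CPU.
 */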
2691static struct rq *finish_task_switch(struct task_struct *prev)
2692 __releases(rq->lock)
2693{
2694 struct rq *rq = this_rq();
2695 struct mm_struct *mm = rq->prev_mm;
2696 long prev_state;
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2710 "corrupted preempt_count: %s/%d/0x%x\n",
2711 current->comm, current->pid, preempt_count()))
2712 preempt_count_set(FORK_PREEMPT_COUNT);
2713
2714 rq->prev_mm = NULL;
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727 prev_state = prev->state;
2728 vtime_task_switch(prev);
2729 perf_event_task_sched_in(prev, current);
2730 finish_lock_switch(rq, prev);
2731 finish_arch_post_lock_switch();
2732
2733 fire_sched_in_preempt_notifiers(current);
2734 if (mm)
2735 mmdrop(mm);
2736 if (unlikely(prev_state == TASK_DEAD)) {
2737 if (prev->sched_class->task_dead)
2738 prev->sched_class->task_dead(prev);
2739
2740
2741
2742
2743
2744 kprobe_flush_task(prev);
2745 put_task_struct(prev);
2746 }
2747
2748 tick_nohz_task_switch();
2749 return rq;
2750}
2751
2752#ifdef CONFIG_SMP
2753
2754
2755static void __balance_callback(struct rq *rq)
2756{
2757 struct callback_head *head, *next;
2758 void (*func)(struct rq *rq);
2759 unsigned long flags;
2760
2761 raw_spin_lock_irqsave(&rq->lock, flags);
2762 head = rq->balance_callback;
2763 rq->balance_callback = NULL;
2764 while (head) {
2765 func = (void (*)(struct rq *))head->func;
2766 next = head->next;
2767 head->next = NULL;
2768 head = next;
2769
2770 func(rq);
2771 }
2772 raw_spin_unlock_irqrestore(&rq->lock, flags);
2773}
2774
2775static inline void balance_callback(struct rq *rq)
2776{
2777 if (unlikely(rq->balance_callback))
2778 __balance_callback(rq);
2779}
2780
2781#else
2782
2783static inline void balance_callback(struct rq *rq)
2784{
2785}
2786
2787#endif
2788
2789
2790
2791
2792
2793asmlinkage __visible void schedule_tail(struct task_struct *prev)
2794 __releases(rq->lock)
2795{
2796 struct rq *rq;
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807 rq = finish_task_switch(prev);
2808 balance_callback(rq);
2809 preempt_enable();
2810
2811 if (current->set_child_tid)
2812 put_user(task_pid_vnr(current), current->set_child_tid);
2813}
2814
2815
2816
2817
2818static __always_inline struct rq *
2819context_switch(struct rq *rq, struct task_struct *prev,
2820 struct task_struct *next, struct pin_cookie cookie)
2821{
2822 struct mm_struct *mm, *oldmm;
2823
2824 prepare_task_switch(rq, prev, next);
2825
2826 mm = next->mm;
2827 oldmm = prev->active_mm;
2828
2829
2830
2831
2832
2833 arch_start_context_switch(prev);
2834
2835 if (!mm) {
2836 next->active_mm = oldmm;
2837 atomic_inc(&oldmm->mm_count);
2838 enter_lazy_tlb(oldmm, next);
2839 } else
2840 switch_mm_irqs_off(oldmm, mm, next);
2841
2842 if (!prev->mm) {
2843 prev->active_mm = NULL;
2844 rq->prev_mm = oldmm;
2845 }
2846
2847
2848
2849
2850
2851
2852 lockdep_unpin_lock(&rq->lock, cookie);
2853 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2854
2855
2856 switch_to(prev, next, prev);
2857 barrier();
2858
2859 return finish_task_switch(prev);
2860}
2861
2862
2863
2864
2865
2866
2867
2868unsigned long nr_running(void)
2869{
2870 unsigned long i, sum = 0;
2871
2872 for_each_online_cpu(i)
2873 sum += cpu_rq(i)->nr_running;
2874
2875 return sum;
2876}
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891bool single_task_running(void)
2892{
2893 return raw_rq()->nr_running == 1;
2894}
2895EXPORT_SYMBOL(single_task_running);
2896
2897unsigned long long nr_context_switches(void)
2898{
2899 int i;
2900 unsigned long long sum = 0;
2901
2902 for_each_possible_cpu(i)
2903 sum += cpu_rq(i)->nr_switches;
2904
2905 return sum;
2906}
2907
2908unsigned long nr_iowait(void)
2909{
2910 unsigned long i, sum = 0;
2911
2912 for_each_possible_cpu(i)
2913 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2914
2915 return sum;
2916}
2917
2918unsigned long nr_iowait_cpu(int cpu)
2919{
2920 struct rq *this = cpu_rq(cpu);
2921 return atomic_read(&this->nr_iowait);
2922}
2923
2924void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2925{
2926 struct rq *rq = this_rq();
2927 *nr_waiters = atomic_read(&rq->nr_iowait);
2928 *load = rq->load.weight;
2929}
2930
2931#ifdef CONFIG_SMP
2932
2933
2934
2935
2936
2937void sched_exec(void)
2938{
2939 struct task_struct *p = current;
2940 unsigned long flags;
2941 int dest_cpu;
2942
2943 raw_spin_lock_irqsave(&p->pi_lock, flags);
2944 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2945 if (dest_cpu == smp_processor_id())
2946 goto unlock;
2947
2948 if (likely(cpu_active(dest_cpu))) {
2949 struct migration_arg arg = { p, dest_cpu };
2950
2951 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2952 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2953 return;
2954 }
2955unlock:
2956 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2957}
2958
2959#endif
2960
2961DEFINE_PER_CPU(struct kernel_stat, kstat);
2962DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2963
2964EXPORT_PER_CPU_SYMBOL(kstat);
2965EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2966
2967
2968
2969
2970
2971
2972unsigned long long task_sched_runtime(struct task_struct *p)
2973{
2974 struct rq_flags rf;
2975 struct rq *rq;
2976 u64 ns;
2977
2978#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990 if (!p->on_cpu || !task_on_rq_queued(p))
2991 return p->se.sum_exec_runtime;
2992#endif
2993
2994 rq = task_rq_lock(p, &rf);
2995
2996
2997
2998
2999
3000 if (task_current(rq, p) && task_on_rq_queued(p)) {
3001 update_rq_clock(rq);
3002 p->sched_class->update_curr(rq);
3003 }
3004 ns = p->se.sum_exec_runtime;
3005 task_rq_unlock(rq, p, &rf);
3006
3007 return ns;
3008}
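/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */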
3014void scheduler_tick(void)
3015{
3016 int cpu = smp_processor_id();
3017 struct rq *rq = cpu_rq(cpu);
3018 struct task_struct *curr = rq->curr;
3019
3020 sched_clock_tick();
3021
3022 raw_spin_lock(&rq->lock);
3023 update_rq_clock(rq);
3024 curr->sched_class->task_tick(rq, curr, 0);
3025 cpu_load_update_active(rq);
3026 calc_global_load_tick(rq);
3027 raw_spin_unlock(&rq->lock);
3028
3029 perf_event_task_tick();
3030
3031#ifdef CONFIG_SMP
3032 rq->idle_balance = idle_cpu(cpu);
3033 trigger_load_balance(rq);
3034#endif
3035 rq_last_tick_reset(rq);
3036}
3037
3038#ifdef CONFIG_NO_HZ_FULL
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052u64 scheduler_tick_max_deferment(void)
3053{
3054 struct rq *rq = this_rq();
3055 unsigned long next, now = READ_ONCE(jiffies);
3056
3057 next = rq->last_sched_tick + HZ;
3058
3059 if (time_before_eq(next, now))
3060 return 0;
3061
3062 return jiffies_to_nsecs(next - now);
3063}
3064#endif
3065
3066#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3067 defined(CONFIG_PREEMPT_TRACER))
3068
3069
3070
3071
3072static inline void preempt_latency_start(int val)
3073{
3074 if (preempt_count() == val) {
3075 unsigned long ip = get_lock_parent_ip();
3076#ifdef CONFIG_DEBUG_PREEMPT
3077 current->preempt_disable_ip = ip;
3078#endif
3079 trace_preempt_off(CALLER_ADDR0, ip);
3080 }
3081}
3082
3083void preempt_count_add(int val)
3084{
3085#ifdef CONFIG_DEBUG_PREEMPT
3086
3087
3088
3089 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3090 return;
3091#endif
3092 __preempt_count_add(val);
3093#ifdef CONFIG_DEBUG_PREEMPT
3094
3095
3096
3097 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3098 PREEMPT_MASK - 10);
3099#endif
3100 preempt_latency_start(val);
3101}
3102EXPORT_SYMBOL(preempt_count_add);
3103NOKPROBE_SYMBOL(preempt_count_add);
3104
3105
3106
3107
3108
3109static inline void preempt_latency_stop(int val)
3110{
3111 if (preempt_count() == val)
3112 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3113}
3114
3115void preempt_count_sub(int val)
3116{
3117#ifdef CONFIG_DEBUG_PREEMPT
3118
3119
3120
3121 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3122 return;
3123
3124
3125
3126 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3127 !(preempt_count() & PREEMPT_MASK)))
3128 return;
3129#endif
3130
3131 preempt_latency_stop(val);
3132 __preempt_count_sub(val);
3133}
3134EXPORT_SYMBOL(preempt_count_sub);
3135NOKPROBE_SYMBOL(preempt_count_sub);
3136
3137#else
3138static inline void preempt_latency_start(int val) { }
3139static inline void preempt_latency_stop(int val) { }
3140#endif
3141
3142
3143
3144
3145static noinline void __schedule_bug(struct task_struct *prev)
3146{
3147 if (oops_in_progress)
3148 return;
3149
3150 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3151 prev->comm, prev->pid, preempt_count());
3152
3153 debug_show_held_locks(prev);
3154 print_modules();
3155 if (irqs_disabled())
3156 print_irqtrace_events(prev);
3157#ifdef CONFIG_DEBUG_PREEMPT
3158 if (in_atomic_preempt_off()) {
3159 pr_err("Preemption disabled at:");
3160 print_ip_sym(current->preempt_disable_ip);
3161 pr_cont("\n");
3162 }
3163#endif
3164 dump_stack();
3165 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3166}
3167
3168
3169
3170
3171static inline void schedule_debug(struct task_struct *prev)
3172{
3173#ifdef CONFIG_SCHED_STACK_END_CHECK
3174 if (task_stack_end_corrupted(prev))
3175 panic("corrupted stack end detected inside scheduler\n");
3176#endif
3177
3178 if (unlikely(in_atomic_preempt_off())) {
3179 __schedule_bug(prev);
3180 preempt_count_set(PREEMPT_DISABLED);
3181 }
3182 rcu_sleep_check();
3183
3184 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3185
3186 schedstat_inc(this_rq(), sched_count);
3187}
3188
3189
3190
3191
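/*
 * Pick up the highest-prio task to run next. Optimized for the common
 * case: if the previous task was a fair (CFS) task and every runnable
 * task on this runqueue belongs to CFS, ask the fair class directly
 * instead of iterating all scheduling classes; fall back to the full
 * class loop if the fair class requests a retry. An idle runqueue ends
 * up returning the idle task, so a NULL result is a bug.
 */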
3192static inline struct task_struct *
3193pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
3194{
3195 const struct sched_class *class = &fair_sched_class;
3196 struct task_struct *p;
3197
3198
3199
3200
3201
3202 if (likely(prev->sched_class == class &&
3203 rq->nr_running == rq->cfs.h_nr_running)) {
3204 p = fair_sched_class.pick_next_task(rq, prev, cookie);
3205 if (unlikely(p == RETRY_TASK))
3206 goto again;
3207
3208
3209 if (unlikely(!p))
3210 p = idle_sched_class.pick_next_task(rq, prev, cookie);
3211
3212 return p;
3213 }
3214
3215again:
3216 for_each_class(class) {
3217 p = class->pick_next_task(rq, prev, cookie);
3218 if (p) {
3219 if (unlikely(p == RETRY_TASK))
3220 goto again;
3221 return p;
3222 }
3223 }
3224
3225 BUG();
3226}
3227
3228
3266
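/*
 * __schedule() is the main scheduler function.
 *
 * The driving reasons to enter it are:
 *
 *  1. Explicit blocking: the current task sets its state to something
 *     other than TASK_RUNNING and calls schedule(); it is deactivated
 *     below unless a signal is already pending.
 *
 *  2. TIF_NEED_RESCHED: set by the tick or by the wakeup of a higher
 *     priority task, and acted upon at interrupt/userspace return or at
 *     the next preemption point (preempt_enable(), cond_resched(), ...).
 *
 *  3. Kernel preemption (CONFIG_PREEMPT), via preempt_schedule*() with
 *     @preempt = true, which never deactivates the current task.
 *
 * Must be entered with preemption disabled; when a task switch happens,
 * context_switch() drops the runqueue lock for us.
 */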
3267static void __sched notrace __schedule(bool preempt)
3268{
3269 struct task_struct *prev, *next;
3270 unsigned long *switch_count;
3271 struct pin_cookie cookie;
3272 struct rq *rq;
3273 int cpu;
3274
3275 cpu = smp_processor_id();
3276 rq = cpu_rq(cpu);
3277 prev = rq->curr;
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287 if (unlikely(prev->state == TASK_DEAD))
3288 preempt_enable_no_resched_notrace();
3289
3290 schedule_debug(prev);
3291
3292 if (sched_feat(HRTICK))
3293 hrtick_clear(rq);
3294
3295 local_irq_disable();
3296 rcu_note_context_switch();
3297
3298
3299
3300
3301
3302
3303 smp_mb__before_spinlock();
3304 raw_spin_lock(&rq->lock);
3305 cookie = lockdep_pin_lock(&rq->lock);
3306
3307 rq->clock_skip_update <<= 1;
3308
3309 switch_count = &prev->nivcsw;
3310 if (!preempt && prev->state) {
3311 if (unlikely(signal_pending_state(prev->state, prev))) {
3312 prev->state = TASK_RUNNING;
3313 } else {
3314 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3315 prev->on_rq = 0;
3316
3317
3318
3319
3320
3321
3322 if (prev->flags & PF_WQ_WORKER) {
3323 struct task_struct *to_wakeup;
3324
3325 to_wakeup = wq_worker_sleeping(prev);
3326 if (to_wakeup)
3327 try_to_wake_up_local(to_wakeup, cookie);
3328 }
3329 }
3330 switch_count = &prev->nvcsw;
3331 }
3332
3333 if (task_on_rq_queued(prev))
3334 update_rq_clock(rq);
3335
3336 next = pick_next_task(rq, prev, cookie);
3337 clear_tsk_need_resched(prev);
3338 clear_preempt_need_resched();
3339 rq->clock_skip_update = 0;
3340
3341 if (likely(prev != next)) {
3342 rq->nr_switches++;
3343 rq->curr = next;
3344 ++*switch_count;
3345
3346 trace_sched_switch(preempt, prev, next);
3347 rq = context_switch(rq, prev, next, cookie);
3348 } else {
3349 lockdep_unpin_lock(&rq->lock, cookie);
3350 raw_spin_unlock_irq(&rq->lock);
3351 }
3352
3353 balance_callback(rq);
3354}
3355STACK_FRAME_NON_STANDARD(__schedule);
3356
3357static inline void sched_submit_work(struct task_struct *tsk)
3358{
3359 if (!tsk->state || tsk_is_pi_blocked(tsk))
3360 return;
3361
3362
3363
3364
3365 if (blk_needs_flush_plug(tsk))
3366 blk_schedule_flush_plug(tsk);
3367}
3368
3369asmlinkage __visible void __sched schedule(void)
3370{
3371 struct task_struct *tsk = current;
3372
3373 sched_submit_work(tsk);
3374 do {
3375 preempt_disable();
3376 __schedule(false);
3377 sched_preempt_enable_no_resched();
3378 } while (need_resched());
3379}
3380EXPORT_SYMBOL(schedule);
3381
3382#ifdef CONFIG_CONTEXT_TRACKING
3383asmlinkage __visible void __sched schedule_user(void)
3384{
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395 enum ctx_state prev_state = exception_enter();
3396 schedule();
3397 exception_exit(prev_state);
3398}
3399#endif
3400
3401
3402
3403
3404
3405
3406void __sched schedule_preempt_disabled(void)
3407{
3408 sched_preempt_enable_no_resched();
3409 schedule();
3410 preempt_disable();
3411}
3412
3413static void __sched notrace preempt_schedule_common(void)
3414{
3415 do {
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429 preempt_disable_notrace();
3430 preempt_latency_start(1);
3431 __schedule(true);
3432 preempt_latency_stop(1);
3433 preempt_enable_no_resched_notrace();
3434
3435
3436
3437
3438
3439 } while (need_resched());
3440}
3441
3442#ifdef CONFIG_PREEMPT
3443
3444
3445
3446
3447
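/*
 * This is the entry point for kernel preemption, e.g. from
 * preempt_enable(), when need_resched is set. If we are currently not
 * preemptible (non-zero preempt count or IRQs disabled) it is a no-op.
 */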
3448asmlinkage __visible void __sched notrace preempt_schedule(void)
3449{
3450
3451
3452
3453
3454 if (likely(!preemptible()))
3455 return;
3456
3457 preempt_schedule_common();
3458}
3459NOKPROBE_SYMBOL(preempt_schedule);
3460EXPORT_SYMBOL(preempt_schedule);
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
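/*
 * preempt_schedule_notrace - preempt_schedule() variant that is safe to
 * call from function tracing and context tracking callbacks: it uses the
 * notrace preempt-count helpers to avoid recursing into the tracer and
 * wraps __schedule() in exception_enter()/exception_exit() so context
 * tracking sees a consistent state.
 */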
3476asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3477{
3478 enum ctx_state prev_ctx;
3479
3480 if (likely(!preemptible()))
3481 return;
3482
3483 do {
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497 preempt_disable_notrace();
3498 preempt_latency_start(1);
3499
3500
3501
3502
3503
3504 prev_ctx = exception_enter();
3505 __schedule(true);
3506 exception_exit(prev_ctx);
3507
3508 preempt_latency_stop(1);
3509 preempt_enable_no_resched_notrace();
3510 } while (need_resched());
3511}
3512EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3513
3514#endif
3515
3516
3517
3518
3519
3520
3521
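/*
 * This is the entry point for preemption off the interrupt/exception
 * return path. It must be called with IRQs disabled and a zero preempt
 * count (see the BUG_ON below); interrupts are re-enabled around each
 * __schedule() invocation.
 */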
3522asmlinkage __visible void __sched preempt_schedule_irq(void)
3523{
3524 enum ctx_state prev_state;
3525
3526
3527 BUG_ON(preempt_count() || !irqs_disabled());
3528
3529 prev_state = exception_enter();
3530
3531 do {
3532 preempt_disable();
3533 local_irq_enable();
3534 __schedule(true);
3535 local_irq_disable();
3536 sched_preempt_enable_no_resched();
3537 } while (need_resched());
3538
3539 exception_exit(prev_state);
3540}
3541
3542int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3543 void *key)
3544{
3545 return try_to_wake_up(curr->private, mode, wake_flags);
3546}
3547EXPORT_SYMBOL(default_wake_function);
3548
3549#ifdef CONFIG_RT_MUTEXES
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
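/*
 * rt_mutex_setprio - set the effective priority of a task as dictated by
 * rt-mutex priority inheritance.
 * @p:    task to boost or unboost
 * @prio: the new effective priority (deadline, RT or normal range)
 *
 * The task is dequeued, switched to the scheduling class matching the
 * new priority and requeued. Called by the PI code with @p's pi_lock
 * held.
 */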
3562void rt_mutex_setprio(struct task_struct *p, int prio)
3563{
3564 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
3565 const struct sched_class *prev_class;
3566 struct rq_flags rf;
3567 struct rq *rq;
3568
3569 BUG_ON(prio > MAX_PRIO);
3570
3571 rq = __task_rq_lock(p, &rf);
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585 if (unlikely(p == rq->idle)) {
3586 WARN_ON(p != rq->curr);
3587 WARN_ON(p->pi_blocked_on);
3588 goto out_unlock;
3589 }
3590
3591 trace_sched_pi_setprio(p, prio);
3592 oldprio = p->prio;
3593
3594 if (oldprio == prio)
3595 queue_flag &= ~DEQUEUE_MOVE;
3596
3597 prev_class = p->sched_class;
3598 queued = task_on_rq_queued(p);
3599 running = task_current(rq, p);
3600 if (queued)
3601 dequeue_task(rq, p, queue_flag);
3602 if (running)
3603 put_prev_task(rq, p);
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614 if (dl_prio(prio)) {
3615 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3616 if (!dl_prio(p->normal_prio) ||
3617 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3618 p->dl.dl_boosted = 1;
3619 queue_flag |= ENQUEUE_REPLENISH;
3620 } else
3621 p->dl.dl_boosted = 0;
3622 p->sched_class = &dl_sched_class;
3623 } else if (rt_prio(prio)) {
3624 if (dl_prio(oldprio))
3625 p->dl.dl_boosted = 0;
3626 if (oldprio < prio)
3627 queue_flag |= ENQUEUE_HEAD;
3628 p->sched_class = &rt_sched_class;
3629 } else {
3630 if (dl_prio(oldprio))
3631 p->dl.dl_boosted = 0;
3632 if (rt_prio(oldprio))
3633 p->rt.timeout = 0;
3634 p->sched_class = &fair_sched_class;
3635 }
3636
3637 p->prio = prio;
3638
3639 if (running)
3640 p->sched_class->set_curr_task(rq);
3641 if (queued)
3642 enqueue_task(rq, p, queue_flag);
3643
3644 check_class_changed(rq, p, prev_class, oldprio);
3645out_unlock:
3646 preempt_disable();
3647 __task_rq_unlock(rq, &rf);
3648
3649 balance_callback(rq);
3650 preempt_enable();
3651}
3652#endif
3653
3654void set_user_nice(struct task_struct *p, long nice)
3655{
3656 int old_prio, delta, queued;
3657 struct rq_flags rf;
3658 struct rq *rq;
3659
3660 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3661 return;
3662
3663
3664
3665
3666 rq = task_rq_lock(p, &rf);
3667
3668
3669
3670
3671
3672
3673 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3674 p->static_prio = NICE_TO_PRIO(nice);
3675 goto out_unlock;
3676 }
3677 queued = task_on_rq_queued(p);
3678 if (queued)
3679 dequeue_task(rq, p, DEQUEUE_SAVE);
3680
3681 p->static_prio = NICE_TO_PRIO(nice);
3682 set_load_weight(p);
3683 old_prio = p->prio;
3684 p->prio = effective_prio(p);
3685 delta = p->prio - old_prio;
3686
3687 if (queued) {
3688 enqueue_task(rq, p, ENQUEUE_RESTORE);
3689
3690
3691
3692
3693 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3694 resched_curr(rq);
3695 }
3696out_unlock:
3697 task_rq_unlock(rq, p, &rf);
3698}
3699EXPORT_SYMBOL(set_user_nice);
3700
3701
3702
3703
3704
3705
3706int can_nice(const struct task_struct *p, const int nice)
3707{
3708
3709 int nice_rlim = nice_to_rlimit(nice);
3710
3711 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3712 capable(CAP_SYS_NICE));
3713}
3714
3715#ifdef __ARCH_WANT_SYS_NICE
3716
3717
3718
3719
3720
3721
3722
3723
3724SYSCALL_DEFINE1(nice, int, increment)
3725{
3726 long nice, retval;
3727
3728
3729
3730
3731
3732
3733 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3734 nice = task_nice(current) + increment;
3735
3736 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3737 if (increment < 0 && !can_nice(current, nice))
3738 return -EPERM;
3739
3740 retval = security_task_setnice(current, nice);
3741 if (retval)
3742 return retval;
3743
3744 set_user_nice(current, nice);
3745 return 0;
3746}
3747
3748#endif
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758int task_prio(const struct task_struct *p)
3759{
3760 return p->prio - MAX_RT_PRIO;
3761}
3762
3763
3764
3765
3766
3767
3768
3769int idle_cpu(int cpu)
3770{
3771 struct rq *rq = cpu_rq(cpu);
3772
3773 if (rq->curr != rq->idle)
3774 return 0;
3775
3776 if (rq->nr_running)
3777 return 0;
3778
3779#ifdef CONFIG_SMP
3780 if (!llist_empty(&rq->wake_list))
3781 return 0;
3782#endif
3783
3784 return 1;
3785}
3786
3787
3788
3789
3790
3791
3792
3793struct task_struct *idle_task(int cpu)
3794{
3795 return cpu_rq(cpu)->idle;
3796}
3797
3798
3799
3800
3801
3802
3803
3804static struct task_struct *find_process_by_pid(pid_t pid)
3805{
3806 return pid ? find_task_by_vpid(pid) : current;
3807}
3808
3809
3810
3811
3812
3813
3814
3815
3816
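/*
 * __setparam_dl - copy the user-supplied SCHED_DEADLINE parameters from
 * @attr into @p's deadline entity. A zero sched_period means the period
 * defaults to the deadline, and dl_bw caches the runtime/period ratio
 * used for admission control.
 */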
3817static void
3818__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3819{
3820 struct sched_dl_entity *dl_se = &p->dl;
3821
3822 dl_se->dl_runtime = attr->sched_runtime;
3823 dl_se->dl_deadline = attr->sched_deadline;
3824 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3825 dl_se->flags = attr->sched_flags;
3826 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847}
3848
3849
3850
3851
3852
3853#define SETPARAM_POLICY -1
3854
3855static void __setscheduler_params(struct task_struct *p,
3856 const struct sched_attr *attr)
3857{
3858 int policy = attr->sched_policy;
3859
3860 if (policy == SETPARAM_POLICY)
3861 policy = p->policy;
3862
3863 p->policy = policy;
3864
3865 if (dl_policy(policy))
3866 __setparam_dl(p, attr);
3867 else if (fair_policy(policy))
3868 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3869
3870
3871
3872
3873
3874
3875 p->rt_priority = attr->sched_priority;
3876 p->normal_prio = normal_prio(p);
3877 set_load_weight(p);
3878}
3879
3880
3881static void __setscheduler(struct rq *rq, struct task_struct *p,
3882 const struct sched_attr *attr, bool keep_boost)
3883{
3884 __setscheduler_params(p, attr);
3885
3886
3887
3888
3889
3890 if (keep_boost)
3891 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
3892 else
3893 p->prio = normal_prio(p);
3894
3895 if (dl_prio(p->prio))
3896 p->sched_class = &dl_sched_class;
3897 else if (rt_prio(p->prio))
3898 p->sched_class = &rt_sched_class;
3899 else
3900 p->sched_class = &fair_sched_class;
3901}
3902
3903static void
3904__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3905{
3906 struct sched_dl_entity *dl_se = &p->dl;
3907
3908 attr->sched_priority = p->rt_priority;
3909 attr->sched_runtime = dl_se->dl_runtime;
3910 attr->sched_deadline = dl_se->dl_deadline;
3911 attr->sched_period = dl_se->dl_period;
3912 attr->sched_flags = dl_se->flags;
3913}
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
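/*
 * Sanity-check SCHED_DEADLINE parameters. The checks below enforce
 *
 *	runtime <= deadline <= period	(period defaults to deadline)
 *
 * plus a non-zero deadline, a runtime of at least 1ULL << DL_SCALE
 * nanoseconds, and deadline/period values with the sign bit clear, since
 * everything is kept in nanoseconds in s64-sized fields.
 *
 * Illustrative example (values are arbitrary, not a recommendation):
 * runtime = 10ms, deadline = 50ms, period = 100ms, all in nanoseconds,
 * passes; swapping runtime and deadline would fail the
 * deadline < runtime check.
 */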
3925static bool
3926__checkparam_dl(const struct sched_attr *attr)
3927{
3928
3929 if (attr->sched_deadline == 0)
3930 return false;
3931
3932
3933
3934
3935
3936 if (attr->sched_runtime < (1ULL << DL_SCALE))
3937 return false;
3938
3939
3940
3941
3942
3943 if (attr->sched_deadline & (1ULL << 63) ||
3944 attr->sched_period & (1ULL << 63))
3945 return false;
3946
3947
3948 if ((attr->sched_period != 0 &&
3949 attr->sched_period < attr->sched_deadline) ||
3950 attr->sched_deadline < attr->sched_runtime)
3951 return false;
3952
3953 return true;
3954}
3955
3956
3957
3958
3959static bool check_same_owner(struct task_struct *p)
3960{
3961 const struct cred *cred = current_cred(), *pcred;
3962 bool match;
3963
3964 rcu_read_lock();
3965 pcred = __task_cred(p);
3966 match = (uid_eq(cred->euid, pcred->euid) ||
3967 uid_eq(cred->euid, pcred->uid));
3968 rcu_read_unlock();
3969 return match;
3970}
3971
3972static bool dl_param_changed(struct task_struct *p,
3973 const struct sched_attr *attr)
3974{
3975 struct sched_dl_entity *dl_se = &p->dl;
3976
3977 if (dl_se->dl_runtime != attr->sched_runtime ||
3978 dl_se->dl_deadline != attr->sched_deadline ||
3979 dl_se->dl_period != attr->sched_period ||
3980 dl_se->flags != attr->sched_flags)
3981 return true;
3982
3983 return false;
3984}
3985
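/*
 * __sched_setscheduler - change the policy and/or RT priority of a task.
 * @user = true means the request came from a syscall and must pass the
 * permission and rlimit checks; @pi = true means rt-mutex priority
 * inheritance has to be taken into account. The task is dequeued,
 * switched to the new class/priority under the runqueue lock, requeued,
 * and any pending balance callbacks are run afterwards.
 */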
3986static int __sched_setscheduler(struct task_struct *p,
3987 const struct sched_attr *attr,
3988 bool user, bool pi)
3989{
3990 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3991 MAX_RT_PRIO - 1 - attr->sched_priority;
3992 int retval, oldprio, oldpolicy = -1, queued, running;
3993 int new_effective_prio, policy = attr->sched_policy;
3994 const struct sched_class *prev_class;
3995 struct rq_flags rf;
3996 int reset_on_fork;
3997 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
3998 struct rq *rq;
3999
4000
4001 BUG_ON(in_interrupt());
4002recheck:
4003
4004 if (policy < 0) {
4005 reset_on_fork = p->sched_reset_on_fork;
4006 policy = oldpolicy = p->policy;
4007 } else {
4008 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4009
4010 if (!valid_policy(policy))
4011 return -EINVAL;
4012 }
4013
4014 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
4015 return -EINVAL;
4016
4017
4018
4019
4020
4021
4022 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4023 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4024 return -EINVAL;
4025 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4026 (rt_policy(policy) != (attr->sched_priority != 0)))
4027 return -EINVAL;
4028
4029
4030
4031
4032 if (user && !capable(CAP_SYS_NICE)) {
4033 if (fair_policy(policy)) {
4034 if (attr->sched_nice < task_nice(p) &&
4035 !can_nice(p, attr->sched_nice))
4036 return -EPERM;
4037 }
4038
4039 if (rt_policy(policy)) {
4040 unsigned long rlim_rtprio =
4041 task_rlimit(p, RLIMIT_RTPRIO);
4042
4043
4044 if (policy != p->policy && !rlim_rtprio)
4045 return -EPERM;
4046
4047
4048 if (attr->sched_priority > p->rt_priority &&
4049 attr->sched_priority > rlim_rtprio)
4050 return -EPERM;
4051 }
4052
4053
4054
4055
4056
4057
4058
4059 if (dl_policy(policy))
4060 return -EPERM;
4061
4062
4063
4064
4065
4066 if (idle_policy(p->policy) && !idle_policy(policy)) {
4067 if (!can_nice(p, task_nice(p)))
4068 return -EPERM;
4069 }
4070
4071
4072 if (!check_same_owner(p))
4073 return -EPERM;
4074
4075
4076 if (p->sched_reset_on_fork && !reset_on_fork)
4077 return -EPERM;
4078 }
4079
4080 if (user) {
4081 retval = security_task_setscheduler(p);
4082 if (retval)
4083 return retval;
4084 }
4085
4086
4087
4088
4089
4090
4091
4092
4093 rq = task_rq_lock(p, &rf);
4094
4095
4096
4097
4098 if (p == rq->stop) {
4099 task_rq_unlock(rq, p, &rf);
4100 return -EINVAL;
4101 }
4102
4103
4104
4105
4106
4107 if (unlikely(policy == p->policy)) {
4108 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4109 goto change;
4110 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4111 goto change;
4112 if (dl_policy(policy) && dl_param_changed(p, attr))
4113 goto change;
4114
4115 p->sched_reset_on_fork = reset_on_fork;
4116 task_rq_unlock(rq, p, &rf);
4117 return 0;
4118 }
4119change:
4120
4121 if (user) {
4122#ifdef CONFIG_RT_GROUP_SCHED
4123
4124
4125
4126
4127 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4128 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4129 !task_group_is_autogroup(task_group(p))) {
4130 task_rq_unlock(rq, p, &rf);
4131 return -EPERM;
4132 }
4133#endif
4134#ifdef CONFIG_SMP
4135 if (dl_bandwidth_enabled() && dl_policy(policy)) {
4136 cpumask_t *span = rq->rd->span;
4137
4138
4139
4140
4141
4142
4143 if (!cpumask_subset(span, &p->cpus_allowed) ||
4144 rq->rd->dl_bw.bw == 0) {
4145 task_rq_unlock(rq, p, &rf);
4146 return -EPERM;
4147 }
4148 }
4149#endif
4150 }
4151
4152
4153 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4154 policy = oldpolicy = -1;
4155 task_rq_unlock(rq, p, &rf);
4156 goto recheck;
4157 }
4158
4159
4160
4161
4162
4163
4164 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
4165 task_rq_unlock(rq, p, &rf);
4166 return -EBUSY;
4167 }
4168
4169 p->sched_reset_on_fork = reset_on_fork;
4170 oldprio = p->prio;
4171
4172 if (pi) {
4173
4174
4175
4176
4177
4178
4179
4180 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
4181 if (new_effective_prio == oldprio)
4182 queue_flags &= ~DEQUEUE_MOVE;
4183 }
4184
4185 queued = task_on_rq_queued(p);
4186 running = task_current(rq, p);
4187 if (queued)
4188 dequeue_task(rq, p, queue_flags);
4189 if (running)
4190 put_prev_task(rq, p);
4191
4192 prev_class = p->sched_class;
4193 __setscheduler(rq, p, attr, pi);
4194
4195 if (running)
4196 p->sched_class->set_curr_task(rq);
4197 if (queued) {
4198
4199
4200
4201
4202 if (oldprio < p->prio)
4203 queue_flags |= ENQUEUE_HEAD;
4204
4205 enqueue_task(rq, p, queue_flags);
4206 }
4207
4208 check_class_changed(rq, p, prev_class, oldprio);
4209 preempt_disable();
4210 task_rq_unlock(rq, p, &rf);
4211
4212 if (pi)
4213 rt_mutex_adjust_pi(p);
4214
4215
4216
4217
4218 balance_callback(rq);
4219 preempt_enable();
4220
4221 return 0;
4222}
4223
4224static int _sched_setscheduler(struct task_struct *p, int policy,
4225 const struct sched_param *param, bool check)
4226{
4227 struct sched_attr attr = {
4228 .sched_policy = policy,
4229 .sched_priority = param->sched_priority,
4230 .sched_nice = PRIO_TO_NICE(p->static_prio),
4231 };
4232
4233
4234 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4235 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4236 policy &= ~SCHED_RESET_ON_FORK;
4237 attr.sched_policy = policy;
4238 }
4239
4240 return __sched_setscheduler(p, &attr, check, true);
4241}
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
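/*
 * sched_setscheduler - change the scheduling policy and/or RT priority
 * of a thread, with permission checks applied (see
 * sched_setscheduler_nocheck() for the unchecked in-kernel variant).
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success, an error code otherwise.
 *
 * Illustrative in-kernel use (the priority value is arbitrary):
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *
 *	sched_setscheduler(p, SCHED_FIFO, &sp);
 */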
4252int sched_setscheduler(struct task_struct *p, int policy,
4253 const struct sched_param *param)
4254{
4255 return _sched_setscheduler(p, policy, param, true);
4256}
4257EXPORT_SYMBOL_GPL(sched_setscheduler);
4258
4259int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4260{
4261 return __sched_setscheduler(p, attr, true, true);
4262}
4263EXPORT_SYMBOL_GPL(sched_setattr);
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4279 const struct sched_param *param)
4280{
4281 return _sched_setscheduler(p, policy, param, false);
4282}
4283EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4284
4285static int
4286do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4287{
4288 struct sched_param lparam;
4289 struct task_struct *p;
4290 int retval;
4291
4292 if (!param || pid < 0)
4293 return -EINVAL;
4294 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4295 return -EFAULT;
4296
4297 rcu_read_lock();
4298 retval = -ESRCH;
4299 p = find_process_by_pid(pid);
4300 if (p != NULL)
4301 retval = sched_setscheduler(p, policy, &lparam);
4302 rcu_read_unlock();
4303
4304 return retval;
4305}
4306
4307
4308
4309
4310static int sched_copy_attr(struct sched_attr __user *uattr,
4311 struct sched_attr *attr)
4312{
4313 u32 size;
4314 int ret;
4315
4316 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4317 return -EFAULT;
4318
4319
4320
4321
4322 memset(attr, 0, sizeof(*attr));
4323
4324 ret = get_user(size, &uattr->size);
4325 if (ret)
4326 return ret;
4327
4328 if (size > PAGE_SIZE)
4329 goto err_size;
4330
4331 if (!size)
4332 size = SCHED_ATTR_SIZE_VER0;
4333
4334 if (size < SCHED_ATTR_SIZE_VER0)
4335 goto err_size;
4336
4337
4338
4339
4340
4341
4342
4343 if (size > sizeof(*attr)) {
4344 unsigned char __user *addr;
4345 unsigned char __user *end;
4346 unsigned char val;
4347
4348 addr = (void __user *)uattr + sizeof(*attr);
4349 end = (void __user *)uattr + size;
4350
4351 for (; addr < end; addr++) {
4352 ret = get_user(val, addr);
4353 if (ret)
4354 return ret;
4355 if (val)
4356 goto err_size;
4357 }
4358 size = sizeof(*attr);
4359 }
4360
4361 ret = copy_from_user(attr, uattr, size);
4362 if (ret)
4363 return -EFAULT;
4364
4365
4366
4367
4368
4369 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4370
4371 return 0;
4372
4373err_size:
4374 put_user(sizeof(*attr), &uattr->size);
4375 return -E2BIG;
4376}
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4387 struct sched_param __user *, param)
4388{
4389
4390 if (policy < 0)
4391 return -EINVAL;
4392
4393 return do_sched_setscheduler(pid, policy, param);
4394}
4395
4396
4397
4398
4399
4400
4401
4402
4403SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4404{
4405 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4406}
4407
4408
4409
4410
4411
4412
4413
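/*
 * sys_sched_setattr - like sched_setscheduler(), but using the extended
 * sched_attr interface, which also carries the nice value and the
 * SCHED_DEADLINE runtime/deadline/period parameters. @flags must be 0.
 *
 * Illustrative userspace setup for a deadline task (values are arbitrary
 * and expressed in nanoseconds):
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	=  10 * 1000 * 1000,
 *		.sched_deadline	=  30 * 1000 * 1000,
 *		.sched_period	= 100 * 1000 * 1000,
 *	};
 *
 * passed to the syscall together with the target pid and flags == 0.
 */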
4414SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4415 unsigned int, flags)
4416{
4417 struct sched_attr attr;
4418 struct task_struct *p;
4419 int retval;
4420
4421 if (!uattr || pid < 0 || flags)
4422 return -EINVAL;
4423
4424 retval = sched_copy_attr(uattr, &attr);
4425 if (retval)
4426 return retval;
4427
4428 if ((int)attr.sched_policy < 0)
4429 return -EINVAL;
4430
4431 rcu_read_lock();
4432 retval = -ESRCH;
4433 p = find_process_by_pid(pid);
4434 if (p != NULL)
4435 retval = sched_setattr(p, &attr);
4436 rcu_read_unlock();
4437
4438 return retval;
4439}
4440
4441
4442
4443
4444
4445
4446
4447
4448SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4449{
4450 struct task_struct *p;
4451 int retval;
4452
4453 if (pid < 0)
4454 return -EINVAL;
4455
4456 retval = -ESRCH;
4457 rcu_read_lock();
4458 p = find_process_by_pid(pid);
4459 if (p) {
4460 retval = security_task_getscheduler(p);
4461 if (!retval)
4462 retval = p->policy
4463 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4464 }
4465 rcu_read_unlock();
4466 return retval;
4467}
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4478{
4479 struct sched_param lp = { .sched_priority = 0 };
4480 struct task_struct *p;
4481 int retval;
4482
4483 if (!param || pid < 0)
4484 return -EINVAL;
4485
4486 rcu_read_lock();
4487 p = find_process_by_pid(pid);
4488 retval = -ESRCH;
4489 if (!p)
4490 goto out_unlock;
4491
4492 retval = security_task_getscheduler(p);
4493 if (retval)
4494 goto out_unlock;
4495
4496 if (task_has_rt_policy(p))
4497 lp.sched_priority = p->rt_priority;
4498 rcu_read_unlock();
4499
4500
4501
4502
4503 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4504
4505 return retval;
4506
4507out_unlock:
4508 rcu_read_unlock();
4509 return retval;
4510}
4511
4512static int sched_read_attr(struct sched_attr __user *uattr,
4513 struct sched_attr *attr,
4514 unsigned int usize)
4515{
4516 int ret;
4517
4518 if (!access_ok(VERIFY_WRITE, uattr, usize))
4519 return -EFAULT;
4520
4521
4522
4523
4524
4525
4526 if (usize < sizeof(*attr)) {
4527 unsigned char *addr;
4528 unsigned char *end;
4529
4530 addr = (void *)attr + usize;
4531 end = (void *)attr + sizeof(*attr);
4532
4533 for (; addr < end; addr++) {
4534 if (*addr)
4535 return -EFBIG;
4536 }
4537
4538 attr->size = usize;
4539 }
4540
4541 ret = copy_to_user(uattr, attr, attr->size);
4542 if (ret)
4543 return -EFAULT;
4544
4545 return 0;
4546}
4547
4548
4549
4550
4551
4552
4553
4554
4555SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4556 unsigned int, size, unsigned int, flags)
4557{
4558 struct sched_attr attr = {
4559 .size = sizeof(struct sched_attr),
4560 };
4561 struct task_struct *p;
4562 int retval;
4563
4564 if (!uattr || pid < 0 || size > PAGE_SIZE ||
4565 size < SCHED_ATTR_SIZE_VER0 || flags)
4566 return -EINVAL;
4567
4568 rcu_read_lock();
4569 p = find_process_by_pid(pid);
4570 retval = -ESRCH;
4571 if (!p)
4572 goto out_unlock;
4573
4574 retval = security_task_getscheduler(p);
4575 if (retval)
4576 goto out_unlock;
4577
4578 attr.sched_policy = p->policy;
4579 if (p->sched_reset_on_fork)
4580 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4581 if (task_has_dl_policy(p))
4582 __getparam_dl(p, &attr);
4583 else if (task_has_rt_policy(p))
4584 attr.sched_priority = p->rt_priority;
4585 else
4586 attr.sched_nice = task_nice(p);
4587
4588 rcu_read_unlock();
4589
4590 retval = sched_read_attr(uattr, &attr, size);
4591 return retval;
4592
4593out_unlock:
4594 rcu_read_unlock();
4595 return retval;
4596}
4597
4598long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4599{
4600 cpumask_var_t cpus_allowed, new_mask;
4601 struct task_struct *p;
4602 int retval;
4603
4604 rcu_read_lock();
4605
4606 p = find_process_by_pid(pid);
4607 if (!p) {
4608 rcu_read_unlock();
4609 return -ESRCH;
4610 }
4611
4612
4613 get_task_struct(p);
4614 rcu_read_unlock();
4615
4616 if (p->flags & PF_NO_SETAFFINITY) {
4617 retval = -EINVAL;
4618 goto out_put_task;
4619 }
4620 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4621 retval = -ENOMEM;
4622 goto out_put_task;
4623 }
4624 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4625 retval = -ENOMEM;
4626 goto out_free_cpus_allowed;
4627 }
4628 retval = -EPERM;
4629 if (!check_same_owner(p)) {
4630 rcu_read_lock();
4631 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4632 rcu_read_unlock();
4633 goto out_free_new_mask;
4634 }
4635 rcu_read_unlock();
4636 }
4637
4638 retval = security_task_setscheduler(p);
4639 if (retval)
4640 goto out_free_new_mask;
4641
4642
4643 cpuset_cpus_allowed(p, cpus_allowed);
4644 cpumask_and(new_mask, in_mask, cpus_allowed);
4645
4646
4647
4648
4649
4650
4651
4652#ifdef CONFIG_SMP
4653 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4654 rcu_read_lock();
4655 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4656 retval = -EBUSY;
4657 rcu_read_unlock();
4658 goto out_free_new_mask;
4659 }
4660 rcu_read_unlock();
4661 }
4662#endif
4663again:
4664 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4665
4666 if (!retval) {
4667 cpuset_cpus_allowed(p, cpus_allowed);
4668 if (!cpumask_subset(new_mask, cpus_allowed)) {
4669
4670
4671
4672
4673
4674 cpumask_copy(new_mask, cpus_allowed);
4675 goto again;
4676 }
4677 }
4678out_free_new_mask:
4679 free_cpumask_var(new_mask);
4680out_free_cpus_allowed:
4681 free_cpumask_var(cpus_allowed);
4682out_put_task:
4683 put_task_struct(p);
4684 return retval;
4685}
4686
4687static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4688 struct cpumask *new_mask)
4689{
4690 if (len < cpumask_size())
4691 cpumask_clear(new_mask);
4692 else if (len > cpumask_size())
4693 len = cpumask_size();
4694
4695 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4696}
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4707 unsigned long __user *, user_mask_ptr)
4708{
4709 cpumask_var_t new_mask;
4710 int retval;
4711
4712 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4713 return -ENOMEM;
4714
4715 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4716 if (retval == 0)
4717 retval = sched_setaffinity(pid, new_mask);
4718 free_cpumask_var(new_mask);
4719 return retval;
4720}
4721
4722long sched_getaffinity(pid_t pid, struct cpumask *mask)
4723{
4724 struct task_struct *p;
4725 unsigned long flags;
4726 int retval;
4727
4728 rcu_read_lock();
4729
4730 retval = -ESRCH;
4731 p = find_process_by_pid(pid);
4732 if (!p)
4733 goto out_unlock;
4734
4735 retval = security_task_getscheduler(p);
4736 if (retval)
4737 goto out_unlock;
4738
4739 raw_spin_lock_irqsave(&p->pi_lock, flags);
4740 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4741 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4742
4743out_unlock:
4744 rcu_read_unlock();
4745
4746 return retval;
4747}
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4758 unsigned long __user *, user_mask_ptr)
4759{
4760 int ret;
4761 cpumask_var_t mask;
4762
4763 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4764 return -EINVAL;
4765 if (len & (sizeof(unsigned long)-1))
4766 return -EINVAL;
4767
4768 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4769 return -ENOMEM;
4770
4771 ret = sched_getaffinity(pid, mask);
4772 if (ret == 0) {
4773 size_t retlen = min_t(size_t, len, cpumask_size());
4774
4775 if (copy_to_user(user_mask_ptr, mask, retlen))
4776 ret = -EFAULT;
4777 else
4778 ret = retlen;
4779 }
4780 free_cpumask_var(mask);
4781
4782 return ret;
4783}
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793SYSCALL_DEFINE0(sched_yield)
4794{
4795 struct rq *rq = this_rq_lock();
4796
4797 schedstat_inc(rq, yld_count);
4798 current->sched_class->yield_task(rq);
4799
4800
4801
4802
4803
4804 __release(rq->lock);
4805 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4806 do_raw_spin_unlock(&rq->lock);
4807 sched_preempt_enable_no_resched();
4808
4809 schedule();
4810
4811 return 0;
4812}
4813
4814int __sched _cond_resched(void)
4815{
4816 if (should_resched(0)) {
4817 preempt_schedule_common();
4818 return 1;
4819 }
4820 return 0;
4821}
4822EXPORT_SYMBOL(_cond_resched);
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832int __cond_resched_lock(spinlock_t *lock)
4833{
4834 int resched = should_resched(PREEMPT_LOCK_OFFSET);
4835 int ret = 0;
4836
4837 lockdep_assert_held(lock);
4838
4839 if (spin_needbreak(lock) || resched) {
4840 spin_unlock(lock);
4841 if (resched)
4842 preempt_schedule_common();
4843 else
4844 cpu_relax();
4845 ret = 1;
4846 spin_lock(lock);
4847 }
4848 return ret;
4849}
4850EXPORT_SYMBOL(__cond_resched_lock);
4851
4852int __sched __cond_resched_softirq(void)
4853{
4854 BUG_ON(!in_softirq());
4855
4856 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
4857 local_bh_enable();
4858 preempt_schedule_common();
4859 local_bh_disable();
4860 return 1;
4861 }
4862 return 0;
4863}
4864EXPORT_SYMBOL(__cond_resched_softirq);
4865
4866
4887
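/*
 * yield - yield the current processor to other threads.
 *
 * This is a shortcut for kernel-space yielding: it marks the thread
 * runnable and calls sys_sched_yield(). Note that yield() gives no
 * guarantee about when (or whether) the calling thread runs again; if
 * you need to wait for something specific, use a proper waiting
 * primitive (wait_event(), completions, ...) instead of spinning on
 * yield().
 */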
4888void __sched yield(void)
4889{
4890 set_current_state(TASK_RUNNING);
4891 sys_sched_yield();
4892}
4893EXPORT_SYMBOL(yield);
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
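/*
 * yield_to - yield the current processor to another thread, hinting the
 * scheduler to run @p next on its runqueue.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * Return: a positive value if the target's class accepted the yield (we
 * then call schedule()), 0 if it did not, and -ESRCH when both runqueues
 * have only a single runnable task, i.e. there is nothing to yield to.
 */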
4910int __sched yield_to(struct task_struct *p, bool preempt)
4911{
4912 struct task_struct *curr = current;
4913 struct rq *rq, *p_rq;
4914 unsigned long flags;
4915 int yielded = 0;
4916
4917 local_irq_save(flags);
4918 rq = this_rq();
4919
4920again:
4921 p_rq = task_rq(p);
4922
4923
4924
4925
4926 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4927 yielded = -ESRCH;
4928 goto out_irq;
4929 }
4930
4931 double_rq_lock(rq, p_rq);
4932 if (task_rq(p) != p_rq) {
4933 double_rq_unlock(rq, p_rq);
4934 goto again;
4935 }
4936
4937 if (!curr->sched_class->yield_to_task)
4938 goto out_unlock;
4939
4940 if (curr->sched_class != p->sched_class)
4941 goto out_unlock;
4942
4943 if (task_running(p_rq, p) || p->state)
4944 goto out_unlock;
4945
4946 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4947 if (yielded) {
4948 schedstat_inc(rq, yld_count);
4949
4950
4951
4952
4953 if (preempt && rq != p_rq)
4954 resched_curr(p_rq);
4955 }
4956
4957out_unlock:
4958 double_rq_unlock(rq, p_rq);
4959out_irq:
4960 local_irq_restore(flags);
4961
4962 if (yielded > 0)
4963 schedule();
4964
4965 return yielded;
4966}
4967EXPORT_SYMBOL_GPL(yield_to);
4968
4969
4970
4971
4972
4973long __sched io_schedule_timeout(long timeout)
4974{
4975 int old_iowait = current->in_iowait;
4976 struct rq *rq;
4977 long ret;
4978
4979 current->in_iowait = 1;
4980 blk_schedule_flush_plug(current);
4981
4982 delayacct_blkio_start();
4983 rq = raw_rq();
4984 atomic_inc(&rq->nr_iowait);
4985 ret = schedule_timeout(timeout);
4986 current->in_iowait = old_iowait;
4987 atomic_dec(&rq->nr_iowait);
4988 delayacct_blkio_end();
4989
4990 return ret;
4991}
4992EXPORT_SYMBOL(io_schedule_timeout);
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5003{
5004 int ret = -EINVAL;
5005
5006 switch (policy) {
5007 case SCHED_FIFO:
5008 case SCHED_RR:
5009 ret = MAX_USER_RT_PRIO-1;
5010 break;
5011 case SCHED_DEADLINE:
5012 case SCHED_NORMAL:
5013 case SCHED_BATCH:
5014 case SCHED_IDLE:
5015 ret = 0;
5016 break;
5017 }
5018 return ret;
5019}
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5030{
5031 int ret = -EINVAL;
5032
5033 switch (policy) {
5034 case SCHED_FIFO:
5035 case SCHED_RR:
5036 ret = 1;
5037 break;
5038 case SCHED_DEADLINE:
5039 case SCHED_NORMAL:
5040 case SCHED_BATCH:
5041 case SCHED_IDLE:
5042 ret = 0;
5043 }
5044 return ret;
5045}
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5059 struct timespec __user *, interval)
5060{
5061 struct task_struct *p;
5062 unsigned int time_slice;
5063 struct rq_flags rf;
5064 struct timespec t;
5065 struct rq *rq;
5066 int retval;
5067
5068 if (pid < 0)
5069 return -EINVAL;
5070
5071 retval = -ESRCH;
5072 rcu_read_lock();
5073 p = find_process_by_pid(pid);
5074 if (!p)
5075 goto out_unlock;
5076
5077 retval = security_task_getscheduler(p);
5078 if (retval)
5079 goto out_unlock;
5080
5081 rq = task_rq_lock(p, &rf);
5082 time_slice = 0;
5083 if (p->sched_class->get_rr_interval)
5084 time_slice = p->sched_class->get_rr_interval(rq, p);
5085 task_rq_unlock(rq, p, &rf);
5086
5087 rcu_read_unlock();
5088 jiffies_to_timespec(time_slice, &t);
5089 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5090 return retval;
5091
5092out_unlock:
5093 rcu_read_unlock();
5094 return retval;
5095}
5096
5097static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5098
5099void sched_show_task(struct task_struct *p)
5100{
5101 unsigned long free = 0;
5102 int ppid;
5103 unsigned long state = p->state;
5104
5105 if (state)
5106 state = __ffs(state) + 1;
5107 printk(KERN_INFO "%-15.15s %c", p->comm,
5108 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5109#if BITS_PER_LONG == 32
5110 if (state == TASK_RUNNING)
5111 printk(KERN_CONT " running ");
5112 else
5113 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5114#else
5115 if (state == TASK_RUNNING)
5116 printk(KERN_CONT " running task ");
5117 else
5118 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5119#endif
5120#ifdef CONFIG_DEBUG_STACK_USAGE
5121 free = stack_not_used(p);
5122#endif
5123 ppid = 0;
5124 rcu_read_lock();
5125 if (pid_alive(p))
5126 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5127 rcu_read_unlock();
5128 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5129 task_pid_nr(p), ppid,
5130 (unsigned long)task_thread_info(p)->flags);
5131
5132 print_worker_info(KERN_INFO, p);
5133 show_stack(p, NULL);
5134}
5135
5136void show_state_filter(unsigned long state_filter)
5137{
5138 struct task_struct *g, *p;
5139
5140#if BITS_PER_LONG == 32
5141 printk(KERN_INFO
5142 " task PC stack pid father\n");
5143#else
5144 printk(KERN_INFO
5145 " task PC stack pid father\n");
5146#endif
5147 rcu_read_lock();
5148 for_each_process_thread(g, p) {
5149
5150
5151
5152
5153
5154
5155
5156 touch_nmi_watchdog();
5157 touch_all_softlockup_watchdogs();
5158 if (!state_filter || (p->state & state_filter))
5159 sched_show_task(p);
5160 }
5161
5162#ifdef CONFIG_SCHED_DEBUG
5163 if (!state_filter)
5164 sysrq_sched_debug_show();
5165#endif
5166 rcu_read_unlock();
5167
5168
5169
5170 if (!state_filter)
5171 debug_show_all_locks();
5172}
5173
5174void init_idle_bootup_task(struct task_struct *idle)
5175{
5176 idle->sched_class = &idle_sched_class;
5177}
5178
5179
5180
5181
5182
5183
5184
5185
5186
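/*
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * The task is (re)initialized with __sched_fork(), pinned to @cpu,
 * installed as rq->curr and rq->idle, and given the idle scheduling
 * class. This function does not set the idle thread's NEED_RESCHED flag;
 * that is the caller's responsibility.
 */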
5187void init_idle(struct task_struct *idle, int cpu)
5188{
5189 struct rq *rq = cpu_rq(cpu);
5190 unsigned long flags;
5191
5192 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5193 raw_spin_lock(&rq->lock);
5194
5195 __sched_fork(0, idle);
5196 idle->state = TASK_RUNNING;
5197 idle->se.exec_start = sched_clock();
5198
5199 kasan_unpoison_task_stack(idle);
5200
5201#ifdef CONFIG_SMP
5202
5203
5204
5205
5206
5207
5208 set_cpus_allowed_common(idle, cpumask_of(cpu));
5209#endif
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220 rcu_read_lock();
5221 __set_task_cpu(idle, cpu);
5222 rcu_read_unlock();
5223
5224 rq->curr = rq->idle = idle;
5225 idle->on_rq = TASK_ON_RQ_QUEUED;
5226#ifdef CONFIG_SMP
5227 idle->on_cpu = 1;
5228#endif
5229 raw_spin_unlock(&rq->lock);
5230 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5231
5232
5233 init_idle_preempt_count(idle, cpu);
5234
5235
5236
5237
5238 idle->sched_class = &idle_sched_class;
5239 ftrace_graph_init_idle_task(idle, cpu);
5240 vtime_init_idle(idle, cpu);
5241#ifdef CONFIG_SMP
5242 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5243#endif
5244}
5245
5246int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5247 const struct cpumask *trial)
5248{
5249 int ret = 1, trial_cpus;
5250 struct dl_bw *cur_dl_b;
5251 unsigned long flags;
5252
5253 if (!cpumask_weight(cur))
5254 return ret;
5255
5256 rcu_read_lock_sched();
5257 cur_dl_b = dl_bw_of(cpumask_any(cur));
5258 trial_cpus = cpumask_weight(trial);
5259
5260 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
5261 if (cur_dl_b->bw != -1 &&
5262 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
5263 ret = 0;
5264 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
5265 rcu_read_unlock_sched();
5266
5267 return ret;
5268}
5269
5270int task_can_attach(struct task_struct *p,
5271 const struct cpumask *cs_cpus_allowed)
5272{
5273 int ret = 0;
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284 if (p->flags & PF_NO_SETAFFINITY) {
5285 ret = -EINVAL;
5286 goto out;
5287 }
5288
5289#ifdef CONFIG_SMP
5290 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5291 cs_cpus_allowed)) {
5292 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
5293 cs_cpus_allowed);
5294 struct dl_bw *dl_b;
5295 bool overflow;
5296 int cpus;
5297 unsigned long flags;
5298
5299 rcu_read_lock_sched();
5300 dl_b = dl_bw_of(dest_cpu);
5301 raw_spin_lock_irqsave(&dl_b->lock, flags);
5302 cpus = dl_bw_cpus(dest_cpu);
5303 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
5304 if (overflow)
5305 ret = -EBUSY;
5306 else {
5307
5308
5309
5310
5311
5312
5313 __dl_add(dl_b, p->dl.dl_bw);
5314 }
5315 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5316 rcu_read_unlock_sched();
5317
5318 }
5319#endif
5320out:
5321 return ret;
5322}
5323
5324#ifdef CONFIG_SMP
5325
5326static bool sched_smp_initialized __read_mostly;
5327
5328#ifdef CONFIG_NUMA_BALANCING
5329
5330int migrate_task_to(struct task_struct *p, int target_cpu)
5331{
5332 struct migration_arg arg = { p, target_cpu };
5333 int curr_cpu = task_cpu(p);
5334
5335 if (curr_cpu == target_cpu)
5336 return 0;
5337
5338 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
5339 return -EINVAL;
5340
5341
5342
5343 trace_sched_move_numa(p, curr_cpu, target_cpu);
5344 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5345}
5346
5347
5348
5349
5350
5351void sched_setnuma(struct task_struct *p, int nid)
5352{
5353 bool queued, running;
5354 struct rq_flags rf;
5355 struct rq *rq;
5356
5357 rq = task_rq_lock(p, &rf);
5358 queued = task_on_rq_queued(p);
5359 running = task_current(rq, p);
5360
5361 if (queued)
5362 dequeue_task(rq, p, DEQUEUE_SAVE);
5363 if (running)
5364 put_prev_task(rq, p);
5365
5366 p->numa_preferred_nid = nid;
5367
5368 if (running)
5369 p->sched_class->set_curr_task(rq);
5370 if (queued)
5371 enqueue_task(rq, p, ENQUEUE_RESTORE);
5372 task_rq_unlock(rq, p, &rf);
5373}
5374#endif
5375
5376#ifdef CONFIG_HOTPLUG_CPU
5377
5378
5379
5380
5381void idle_task_exit(void)
5382{
5383 struct mm_struct *mm = current->active_mm;
5384
5385 BUG_ON(cpu_online(smp_processor_id()));
5386
5387 if (mm != &init_mm) {
5388 switch_mm_irqs_off(mm, &init_mm, current);
5389 finish_arch_post_lock_switch();
5390 }
5391 mmdrop(mm);
5392}
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403static void calc_load_migrate(struct rq *rq)
5404{
5405 long delta = calc_load_fold_active(rq, 1);
5406 if (delta)
5407 atomic_long_add(delta, &calc_load_tasks);
5408}
5409
5410static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5411{
5412}
5413
5414static const struct sched_class fake_sched_class = {
5415 .put_prev_task = put_prev_task_fake,
5416};
5417
5418static struct task_struct fake_task = {
5419
5420
5421
5422 .prio = MAX_PRIO + 1,
5423 .sched_class = &fake_sched_class,
5424};
5425
5426
5427
5428
5429
5430
5431
5432
5433
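/*
 * Migrate all tasks off the dead CPU's runqueue to other online CPUs.
 * Called from the CPU hotplug path with @dead_rq's lock held. rq->stop
 * is temporarily cleared so pick_next_task() below does not keep
 * returning the stopper thread, and each picked task is pushed to a
 * fallback CPU via __migrate_task().
 */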
5434static void migrate_tasks(struct rq *dead_rq)
5435{
5436 struct rq *rq = dead_rq;
5437 struct task_struct *next, *stop = rq->stop;
5438 struct pin_cookie cookie;
5439 int dest_cpu;
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450 rq->stop = NULL;
5451
5452
5453
5454
5455
5456
5457 update_rq_clock(rq);
5458
5459 for (;;) {
5460
5461
5462
5463
5464 if (rq->nr_running == 1)
5465 break;
5466
5467
5468
5469
5470 cookie = lockdep_pin_lock(&rq->lock);
5471 next = pick_next_task(rq, &fake_task, cookie);
5472 BUG_ON(!next);
5473 next->sched_class->put_prev_task(rq, next);
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484 lockdep_unpin_lock(&rq->lock, cookie);
5485 raw_spin_unlock(&rq->lock);
5486 raw_spin_lock(&next->pi_lock);
5487 raw_spin_lock(&rq->lock);
5488
5489
5490
5491
5492
5493
5494 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5495 raw_spin_unlock(&next->pi_lock);
5496 continue;
5497 }
5498
5499
5500 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5501
5502 rq = __migrate_task(rq, next, dest_cpu);
5503 if (rq != dead_rq) {
5504 raw_spin_unlock(&rq->lock);
5505 rq = dead_rq;
5506 raw_spin_lock(&rq->lock);
5507 }
5508 raw_spin_unlock(&next->pi_lock);
5509 }
5510
5511 rq->stop = stop;
5512}
5513#endif
5514
5515static void set_rq_online(struct rq *rq)
5516{
5517 if (!rq->online) {
5518 const struct sched_class *class;
5519
5520 cpumask_set_cpu(rq->cpu, rq->rd->online);
5521 rq->online = 1;
5522
5523 for_each_class(class) {
5524 if (class->rq_online)
5525 class->rq_online(rq);
5526 }
5527 }
5528}
5529
5530static void set_rq_offline(struct rq *rq)
5531{
5532 if (rq->online) {
5533 const struct sched_class *class;
5534
5535 for_each_class(class) {
5536 if (class->rq_offline)
5537 class->rq_offline(rq);
5538 }
5539
5540 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5541 rq->online = 0;
5542 }
5543}
5544
5545static void set_cpu_rq_start_time(unsigned int cpu)
5546{
5547 struct rq *rq = cpu_rq(cpu);
5548
5549 rq->age_stamp = sched_clock_cpu(cpu);
5550}
5551
5552static cpumask_var_t sched_domains_tmpmask;
5553
5554#ifdef CONFIG_SCHED_DEBUG
5555
5556static __read_mostly int sched_debug_enabled;
5557
5558static int __init sched_debug_setup(char *str)
5559{
5560 sched_debug_enabled = 1;
5561
5562 return 0;
5563}
5564early_param("sched_debug", sched_debug_setup);
5565
5566static inline bool sched_debug(void)
5567{
5568 return sched_debug_enabled;
5569}
5570
5571static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5572 struct cpumask *groupmask)
5573{
5574 struct sched_group *group = sd->groups;
5575
5576 cpumask_clear(groupmask);
5577
5578 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5579
5580 if (!(sd->flags & SD_LOAD_BALANCE)) {
5581 printk("does not load-balance\n");
5582 if (sd->parent)
5583 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5584 " has parent");
5585 return -1;
5586 }
5587
5588 printk(KERN_CONT "span %*pbl level %s\n",
5589 cpumask_pr_args(sched_domain_span(sd)), sd->name);
5590
5591 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5592 printk(KERN_ERR "ERROR: domain->span does not contain "
5593 "CPU%d\n", cpu);
5594 }
5595 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5596 printk(KERN_ERR "ERROR: domain->groups does not contain"
5597 " CPU%d\n", cpu);
5598 }
5599
5600 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5601 do {
5602 if (!group) {
5603 printk("\n");
5604 printk(KERN_ERR "ERROR: group is NULL\n");
5605 break;
5606 }
5607
5608 if (!cpumask_weight(sched_group_cpus(group))) {
5609 printk(KERN_CONT "\n");
5610 printk(KERN_ERR "ERROR: empty group\n");
5611 break;
5612 }
5613
5614 if (!(sd->flags & SD_OVERLAP) &&
5615 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5616 printk(KERN_CONT "\n");
5617 printk(KERN_ERR "ERROR: repeated CPUs\n");
5618 break;
5619 }
5620
5621 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5622
5623 printk(KERN_CONT " %*pbl",
5624 cpumask_pr_args(sched_group_cpus(group)));
5625 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5626 printk(KERN_CONT " (cpu_capacity = %d)",
5627 group->sgc->capacity);
5628 }
5629
5630 group = group->next;
5631 } while (group != sd->groups);
5632 printk(KERN_CONT "\n");
5633
5634 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5635 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5636
5637 if (sd->parent &&
5638 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5639 printk(KERN_ERR "ERROR: parent span is not a superset "
5640 "of domain->span\n");
5641 return 0;
5642}
5643
5644static void sched_domain_debug(struct sched_domain *sd, int cpu)
5645{
5646 int level = 0;
5647
5648 if (!sched_debug_enabled)
5649 return;
5650
5651 if (!sd) {
5652 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5653 return;
5654 }
5655
5656 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5657
5658 for (;;) {
5659 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5660 break;
5661 level++;
5662 sd = sd->parent;
5663 if (!sd)
5664 break;
5665 }
5666}
5667#else
5668# define sched_domain_debug(sd, cpu) do { } while (0)
5669static inline bool sched_debug(void)
5670{
5671 return false;
5672}
5673#endif
5674
5675static int sd_degenerate(struct sched_domain *sd)
5676{
5677 if (cpumask_weight(sched_domain_span(sd)) == 1)
5678 return 1;
5679
5680
5681 if (sd->flags & (SD_LOAD_BALANCE |
5682 SD_BALANCE_NEWIDLE |
5683 SD_BALANCE_FORK |
5684 SD_BALANCE_EXEC |
5685 SD_SHARE_CPUCAPACITY |
5686 SD_SHARE_PKG_RESOURCES |
5687 SD_SHARE_POWERDOMAIN)) {
5688 if (sd->groups != sd->groups->next)
5689 return 0;
5690 }
5691
5692
5693 if (sd->flags & (SD_WAKE_AFFINE))
5694 return 0;
5695
5696 return 1;
5697}
5698
5699static int
5700sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5701{
5702 unsigned long cflags = sd->flags, pflags = parent->flags;
5703
5704 if (sd_degenerate(parent))
5705 return 1;
5706
5707 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5708 return 0;
5709
5710
5711 if (parent->groups == parent->groups->next) {
5712 pflags &= ~(SD_LOAD_BALANCE |
5713 SD_BALANCE_NEWIDLE |
5714 SD_BALANCE_FORK |
5715 SD_BALANCE_EXEC |
5716 SD_SHARE_CPUCAPACITY |
5717 SD_SHARE_PKG_RESOURCES |
5718 SD_PREFER_SIBLING |
5719 SD_SHARE_POWERDOMAIN);
5720 if (nr_node_ids == 1)
5721 pflags &= ~SD_SERIALIZE;
5722 }
5723 if (~cflags & pflags)
5724 return 0;
5725
5726 return 1;
5727}
5728
5729static void free_rootdomain(struct rcu_head *rcu)
5730{
5731 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5732
5733 cpupri_cleanup(&rd->cpupri);
5734 cpudl_cleanup(&rd->cpudl);
5735 free_cpumask_var(rd->dlo_mask);
5736 free_cpumask_var(rd->rto_mask);
5737 free_cpumask_var(rd->online);
5738 free_cpumask_var(rd->span);
5739 kfree(rd);
5740}
5741
5742static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5743{
5744 struct root_domain *old_rd = NULL;
5745 unsigned long flags;
5746
5747 raw_spin_lock_irqsave(&rq->lock, flags);
5748
5749 if (rq->rd) {
5750 old_rd = rq->rd;
5751
5752 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5753 set_rq_offline(rq);
5754
5755 cpumask_clear_cpu(rq->cpu, old_rd->span);
5756
5757
5758
5759
5760
5761
5762 if (!atomic_dec_and_test(&old_rd->refcount))
5763 old_rd = NULL;
5764 }
5765
5766 atomic_inc(&rd->refcount);
5767 rq->rd = rd;
5768
5769 cpumask_set_cpu(rq->cpu, rd->span);
5770 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5771 set_rq_online(rq);
5772
5773 raw_spin_unlock_irqrestore(&rq->lock, flags);
5774
5775 if (old_rd)
5776 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5777}
5778
5779static int init_rootdomain(struct root_domain *rd)
5780{
5781 memset(rd, 0, sizeof(*rd));
5782
5783 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
5784 goto out;
5785 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
5786 goto free_span;
5787 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5788 goto free_online;
5789 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5790 goto free_dlo_mask;
5791
5792 init_dl_bw(&rd->dl_bw);
5793 if (cpudl_init(&rd->cpudl) != 0)
5794 goto free_dlo_mask;
5795
5796 if (cpupri_init(&rd->cpupri) != 0)
5797 goto free_rto_mask;
5798 return 0;
5799
5800free_rto_mask:
5801 free_cpumask_var(rd->rto_mask);
5802free_dlo_mask:
5803 free_cpumask_var(rd->dlo_mask);
5804free_online:
5805 free_cpumask_var(rd->online);
5806free_span:
5807 free_cpumask_var(rd->span);
5808out:
5809 return -ENOMEM;
5810}
5811
5812
5813
5814
5815
5816struct root_domain def_root_domain;
5817
5818static void init_defrootdomain(void)
5819{
5820 init_rootdomain(&def_root_domain);
5821
5822 atomic_set(&def_root_domain.refcount, 1);
5823}
5824
5825static struct root_domain *alloc_rootdomain(void)
5826{
5827 struct root_domain *rd;
5828
5829 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5830 if (!rd)
5831 return NULL;
5832
5833 if (init_rootdomain(rd) != 0) {
5834 kfree(rd);
5835 return NULL;
5836 }
5837
5838 return rd;
5839}
5840
5841static void free_sched_groups(struct sched_group *sg, int free_sgc)
5842{
5843 struct sched_group *tmp, *first;
5844
5845 if (!sg)
5846 return;
5847
5848 first = sg;
5849 do {
5850 tmp = sg->next;
5851
5852 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
5853 kfree(sg->sgc);
5854
5855 kfree(sg);
5856 sg = tmp;
5857 } while (sg != first);
5858}
5859
5860static void free_sched_domain(struct rcu_head *rcu)
5861{
5862 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5863
5864
5865
5866
5867
5868 if (sd->flags & SD_OVERLAP) {
5869 free_sched_groups(sd->groups, 1);
5870 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5871 kfree(sd->groups->sgc);
5872 kfree(sd->groups);
5873 }
5874 kfree(sd);
5875}
5876
5877static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5878{
5879 call_rcu(&sd->rcu, free_sched_domain);
5880}
5881
5882static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5883{
5884 for (; sd; sd = sd->parent)
5885 destroy_sched_domain(sd, cpu);
5886}
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5898DEFINE_PER_CPU(int, sd_llc_size);
5899DEFINE_PER_CPU(int, sd_llc_id);
5900DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5901DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5902DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5903
5904static void update_top_cache_domain(int cpu)
5905{
5906 struct sched_domain *sd;
5907 struct sched_domain *busy_sd = NULL;
5908 int id = cpu;
5909 int size = 1;
5910
5911 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5912 if (sd) {
5913 id = cpumask_first(sched_domain_span(sd));
5914 size = cpumask_weight(sched_domain_span(sd));
5915 busy_sd = sd->parent;
5916 }
5917 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5918
5919 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5920 per_cpu(sd_llc_size, cpu) = size;
5921 per_cpu(sd_llc_id, cpu) = id;
5922
5923 sd = lowest_flag_domain(cpu, SD_NUMA);
5924 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5925
5926 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
5927 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5928}
5929
5930
5931
5932
5933
5934static void
5935cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5936{
5937 struct rq *rq = cpu_rq(cpu);
5938 struct sched_domain *tmp;
5939
5940
5941 for (tmp = sd; tmp; ) {
5942 struct sched_domain *parent = tmp->parent;
5943 if (!parent)
5944 break;
5945
5946 if (sd_parent_degenerate(tmp, parent)) {
5947 tmp->parent = parent->parent;
5948 if (parent->parent)
5949 parent->parent->child = tmp;
5950
5951
5952
5953
5954
5955 if (parent->flags & SD_PREFER_SIBLING)
5956 tmp->flags |= SD_PREFER_SIBLING;
5957 destroy_sched_domain(parent, cpu);
5958 } else
5959 tmp = tmp->parent;
5960 }
5961
5962 if (sd && sd_degenerate(sd)) {
5963 tmp = sd;
5964 sd = sd->parent;
5965 destroy_sched_domain(tmp, cpu);
5966 if (sd)
5967 sd->child = NULL;
5968 }
5969
5970 sched_domain_debug(sd, cpu);
5971
5972 rq_attach_root(rq, rd);
5973 tmp = rq->sd;
5974 rcu_assign_pointer(rq->sd, sd);
5975 destroy_sched_domains(tmp, cpu);
5976
5977 update_top_cache_domain(cpu);
5978}
5979
5980
5981static int __init isolated_cpu_setup(char *str)
5982{
5983 int ret;
5984
5985 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5986 ret = cpulist_parse(str, cpu_isolated_map);
5987 if (ret) {
5988 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
5989 return 0;
5990 }
5991 return 1;
5992}
5993__setup("isolcpus=", isolated_cpu_setup);
5994
5995struct s_data {
5996 struct sched_domain ** __percpu sd;
5997 struct root_domain *rd;
5998};
5999
6000enum s_alloc {
6001 sa_rootdomain,
6002 sa_sd,
6003 sa_sd_storage,
6004 sa_none,
6005};
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6021{
6022 const struct cpumask *span = sched_domain_span(sd);
6023 struct sd_data *sdd = sd->private;
6024 struct sched_domain *sibling;
6025 int i;
6026
6027 for_each_cpu(i, span) {
6028 sibling = *per_cpu_ptr(sdd->sd, i);
6029 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6030 continue;
6031
6032 cpumask_set_cpu(i, sched_group_mask(sg));
6033 }
6034}
6035
/*
 * Return the canonical balance CPU for this group: the first CPU of the
 * group that is also set in the group's iteration mask.
 */
6040int group_balance_cpu(struct sched_group *sg)
6041{
6042 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6043}
6044
6045static int
6046build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6047{
6048 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6049 const struct cpumask *span = sched_domain_span(sd);
6050 struct cpumask *covered = sched_domains_tmpmask;
6051 struct sd_data *sdd = sd->private;
6052 struct sched_domain *sibling;
6053 int i;
6054
6055 cpumask_clear(covered);
6056
6057 for_each_cpu(i, span) {
6058 struct cpumask *sg_span;
6059
6060 if (cpumask_test_cpu(i, covered))
6061 continue;
6062
6063 sibling = *per_cpu_ptr(sdd->sd, i);

		/* See the comment near build_group_mask(). */
6066 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6067 continue;
6068
6069 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6070 GFP_KERNEL, cpu_to_node(cpu));
6071
6072 if (!sg)
6073 goto fail;
6074
6075 sg_span = sched_group_cpus(sg);
6076 if (sibling->child)
6077 cpumask_copy(sg_span, sched_domain_span(sibling->child));
6078 else
6079 cpumask_set_cpu(i, sg_span);
6080
6081 cpumask_or(covered, covered, sg_span);
6082
6083 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
6084 if (atomic_inc_return(&sg->sgc->ref) == 1)
6085 build_group_mask(sd, sg);

		/*
		 * Initialize sgc->capacity such that even if we mess up the
		 * domains and no possible iteration will get us here, we won't
		 * die on a /0 trap.
		 */
6092 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);

		/*
		 * Make sure the first group of this domain contains the
		 * canonical balance CPU. Otherwise the sched_domain iteration
		 * breaks. See update_group_capacity().
		 */
6099 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6100 group_balance_cpu(sg) == cpu)
6101 groups = sg;
6102
6103 if (!first)
6104 first = sg;
6105 if (last)
6106 last->next = sg;
6107 last = sg;
6108 last->next = first;
6109 }
6110 sd->groups = groups;
6111
6112 return 0;
6113
6114fail:
6115 free_sched_groups(first, 0);
6116
6117 return -ENOMEM;
6118}
6119
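/*
 * get_group() returns the group-identifying CPU for @cpu at this topology
 * level: the first CPU of @cpu's child domain span (or @cpu itself at the
 * lowest level). When @sg is non-NULL it also hands back the pre-allocated
 * sched_group and sched_group_capacity for that CPU and initialises the
 * group's reference count.
 */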
6120static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6121{
6122 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6123 struct sched_domain *child = sd->child;
6124
6125 if (child)
6126 cpu = cpumask_first(sched_domain_span(child));
6127
6128 if (sg) {
6129 *sg = *per_cpu_ptr(sdd->sg, cpu);
6130 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
6131 atomic_set(&(*sg)->sgc->ref, 1);
6132 }
6133
6134 return cpu;
6135}
6136
/*
 * build_sched_groups() will build a circular linked list of the groups
 * covered by the given span, and set each group's ->cpumask correctly.
 *
 * Assumes the sched_domain tree is fully constructed.
 */
6144static int
6145build_sched_groups(struct sched_domain *sd, int cpu)
6146{
6147 struct sched_group *first = NULL, *last = NULL;
6148 struct sd_data *sdd = sd->private;
6149 const struct cpumask *span = sched_domain_span(sd);
6150 struct cpumask *covered;
6151 int i;
6152
6153 get_group(cpu, sdd, &sd->groups);
6154 atomic_inc(&sd->groups->ref);
6155
6156 if (cpu != cpumask_first(span))
6157 return 0;
6158
6159 lockdep_assert_held(&sched_domains_mutex);
6160 covered = sched_domains_tmpmask;
6161
6162 cpumask_clear(covered);
6163
6164 for_each_cpu(i, span) {
6165 struct sched_group *sg;
6166 int group, j;
6167
6168 if (cpumask_test_cpu(i, covered))
6169 continue;
6170
6171 group = get_group(i, sdd, &sg);
6172 cpumask_setall(sched_group_mask(sg));
6173
6174 for_each_cpu(j, span) {
6175 if (get_group(j, sdd, NULL) != group)
6176 continue;
6177
6178 cpumask_set_cpu(j, covered);
6179 cpumask_set_cpu(j, sched_group_cpus(sg));
6180 }
6181
6182 if (!first)
6183 first = sg;
6184 if (last)
6185 last->next = sg;
6186 last = sg;
6187 }
6188 last->next = first;
6189
6190 return 0;
6191}
6192
/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of a sched group, which is used while
 * distributing load between the sched groups of a sched domain. Typically
 * cpu_capacity is the same for all groups in a domain unless the topology
 * is asymmetric; a group with more cpu_capacity will pick up more load
 * than a group with less.
 */
6203static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6204{
6205 struct sched_group *sg = sd->groups;
6206
6207 WARN_ON(!sg);
6208
6209 do {
6210 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6211 sg = sg->next;
6212 } while (sg != sd->groups);
6213
6214 if (cpu != group_balance_cpu(sg))
6215 return;
6216
6217 update_group_capacity(sd, cpu);
6218 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
6219}
6220
/*
 * Default "relax domain level": -1 means no request, in which case
 * set_domain_attribute() leaves the per-domain wake/newidle balance
 * flags alone unless a cpuset supplies its own value.
 */
6226static int default_relax_domain_level = -1;
6227int sched_domain_level_max;
6228
6229static int __init setup_relax_domain_level(char *str)
6230{
6231 if (kstrtoint(str, 0, &default_relax_domain_level))
6232 pr_warn("Unable to set relax_domain_level\n");
6233
6234 return 1;
6235}
6236__setup("relax_domain_level=", setup_relax_domain_level);
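
/*
 * Example: booting with "relax_domain_level=2" requests that
 * SD_BALANCE_WAKE/SD_BALANCE_NEWIDLE be cleared on every domain whose
 * level is greater than 2 (see set_domain_attribute() below), unless a
 * cpuset supplies its own relax_domain_level.
 */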
6237
6238static void set_domain_attribute(struct sched_domain *sd,
6239 struct sched_domain_attr *attr)
6240{
6241 int request;
6242
6243 if (!attr || attr->relax_domain_level < 0) {
6244 if (default_relax_domain_level < 0)
6245 return;
6246 else
6247 request = default_relax_domain_level;
6248 } else
6249 request = attr->relax_domain_level;
	if (request < sd->level) {
		/* Turn off idle balance on this domain: */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	} else {
		/* Turn on idle balance on this domain: */
		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
6257}
6258
6259static void __sdt_free(const struct cpumask *cpu_map);
6260static int __sdt_alloc(const struct cpumask *cpu_map);
6261
6262static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6263 const struct cpumask *cpu_map)
6264{
6265 switch (what) {
6266 case sa_rootdomain:
6267 if (!atomic_read(&d->rd->refcount))
6268 free_rootdomain(&d->rd->rcu);
6269 case sa_sd:
6270 free_percpu(d->sd);
6271 case sa_sd_storage:
6272 __sdt_free(cpu_map);
6273 case sa_none:
6274 break;
6275 }
6276}
6277
6278static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6279 const struct cpumask *cpu_map)
6280{
6281 memset(d, 0, sizeof(*d));
6282
6283 if (__sdt_alloc(cpu_map))
6284 return sa_sd_storage;
6285 d->sd = alloc_percpu(struct sched_domain *);
6286 if (!d->sd)
6287 return sa_sd_storage;
6288 d->rd = alloc_rootdomain();
6289 if (!d->rd)
6290 return sa_sd;
6291 return sa_rootdomain;
6292}
6293
/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structures, so that they are not freed again when the
 * per-CPU storage is torn down (see __sdt_free()).
 */
6299static void claim_allocations(int cpu, struct sched_domain *sd)
6300{
6301 struct sd_data *sdd = sd->private;
6302
6303 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6304 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6305
6306 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6307 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6308
6309 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
6310 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
6311}
6312
6313#ifdef CONFIG_NUMA
6314static int sched_domains_numa_levels;
6315enum numa_topology_type sched_numa_topology_type;
6316static int *sched_domains_numa_distance;
6317int sched_max_numa_distance;
6318static struct cpumask ***sched_domains_numa_masks;
6319static int sched_domains_curr_level;
6320#endif
6321
/*
 * SD flags allowed in topology descriptions.
 *
 * SD_SHARE_CPUCAPACITY   - describes SMT topologies
 * SD_SHARE_PKG_RESOURCES - describes shared caches
 * SD_NUMA                - describes NUMA topologies
 * SD_SHARE_POWERDOMAIN   - describes shared power domains
 *
 * Odd one out:
 * SD_ASYM_PACKING        - describes SMT quirks
 */
6333#define TOPOLOGY_SD_FLAGS \
6334 (SD_SHARE_CPUCAPACITY | \
6335 SD_SHARE_PKG_RESOURCES | \
6336 SD_NUMA | \
6337 SD_ASYM_PACKING | \
6338 SD_SHARE_POWERDOMAIN)
6339
6340static struct sched_domain *
6341sd_init(struct sched_domain_topology_level *tl, int cpu)
6342{
6343 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6344 int sd_weight, sd_flags = 0;
6345
6346#ifdef CONFIG_NUMA
	/*
	 * Pass the current NUMA level to sd_numa_mask() via this
	 * file-scope variable.
	 */
6350 sched_domains_curr_level = tl->numa_level;
6351#endif
6352
6353 sd_weight = cpumask_weight(tl->mask(cpu));
6354
6355 if (tl->sd_flags)
6356 sd_flags = (*tl->sd_flags)();
6357 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6358 "wrong sd_flags in topology description\n"))
6359 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6360
6361 *sd = (struct sched_domain){
6362 .min_interval = sd_weight,
6363 .max_interval = 2*sd_weight,
6364 .busy_factor = 32,
6365 .imbalance_pct = 125,
6366
6367 .cache_nice_tries = 0,
6368 .busy_idx = 0,
6369 .idle_idx = 0,
6370 .newidle_idx = 0,
6371 .wake_idx = 0,
6372 .forkexec_idx = 0,
6373
6374 .flags = 1*SD_LOAD_BALANCE
6375 | 1*SD_BALANCE_NEWIDLE
6376 | 1*SD_BALANCE_EXEC
6377 | 1*SD_BALANCE_FORK
6378 | 0*SD_BALANCE_WAKE
6379 | 1*SD_WAKE_AFFINE
6380 | 0*SD_SHARE_CPUCAPACITY
6381 | 0*SD_SHARE_PKG_RESOURCES
6382 | 0*SD_SERIALIZE
6383 | 0*SD_PREFER_SIBLING
6384 | 0*SD_NUMA
6385 | sd_flags
6386 ,
6387
6388 .last_balance = jiffies,
6389 .balance_interval = sd_weight,
6390 .smt_gain = 0,
6391 .max_newidle_lb_cost = 0,
6392 .next_decay_max_lb_cost = jiffies,
6393#ifdef CONFIG_SCHED_DEBUG
6394 .name = tl->name,
6395#endif
6396 };
6397
	/*
	 * Convert topological properties into behaviour.
	 */
6402 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6403 sd->flags |= SD_PREFER_SIBLING;
6404 sd->imbalance_pct = 110;
6405 sd->smt_gain = 1178;
6406
6407 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6408 sd->imbalance_pct = 117;
6409 sd->cache_nice_tries = 1;
6410 sd->busy_idx = 2;
6411
6412#ifdef CONFIG_NUMA
6413 } else if (sd->flags & SD_NUMA) {
6414 sd->cache_nice_tries = 2;
6415 sd->busy_idx = 3;
6416 sd->idle_idx = 2;
6417
6418 sd->flags |= SD_SERIALIZE;
6419 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6420 sd->flags &= ~(SD_BALANCE_EXEC |
6421 SD_BALANCE_FORK |
6422 SD_WAKE_AFFINE);
6423 }
6424
6425#endif
6426 } else {
6427 sd->flags |= SD_PREFER_SIBLING;
6428 sd->cache_nice_tries = 1;
6429 sd->busy_idx = 2;
6430 sd->idle_idx = 1;
6431 }
6432
6433 sd->private = &tl->data;
6434
6435 return sd;
6436}
6437
/*
 * Topology list, bottom-up.
 */
6441static struct sched_domain_topology_level default_topology[] = {
6442#ifdef CONFIG_SCHED_SMT
6443 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6444#endif
6445#ifdef CONFIG_SCHED_MC
6446 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6447#endif
6448 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6449 { NULL, },
6450};
6451
6452static struct sched_domain_topology_level *sched_domain_topology =
6453 default_topology;
6454
6455#define for_each_sd_topology(tl) \
6456 for (tl = sched_domain_topology; tl->mask; tl++)
6457
6458void set_sched_topology(struct sched_domain_topology_level *tl)
6459{
6460 sched_domain_topology = tl;
6461}
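
/*
 * Illustrative (hypothetical) use of set_sched_topology(): an architecture
 * can build a table shaped like default_topology[] above and install it
 * before the scheduler builds its domains:
 *
 *	static struct sched_domain_topology_level my_topology[] = {
 *	#ifdef CONFIG_SCHED_SMT
 *		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *	#endif
 *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *		{ NULL, },
 *	};
 *
 *	set_sched_topology(my_topology);
 */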
6462
6463#ifdef CONFIG_NUMA
6464
6465static const struct cpumask *sd_numa_mask(int cpu)
6466{
6467 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6468}
6469
6470static void sched_numa_warn(const char *str)
6471{
6472 static int done = false;
	int i, j;
6474
6475 if (done)
6476 return;
6477
6478 done = true;
6479
6480 printk(KERN_WARNING "ERROR: %s\n\n", str);
6481
6482 for (i = 0; i < nr_node_ids; i++) {
6483 printk(KERN_WARNING " ");
6484 for (j = 0; j < nr_node_ids; j++)
			printk(KERN_CONT "%02d ", node_distance(i, j));
6486 printk(KERN_CONT "\n");
6487 }
6488 printk(KERN_WARNING "\n");
6489}
6490
6491bool find_numa_distance(int distance)
6492{
6493 int i;
6494
6495 if (distance == node_distance(0, 0))
6496 return true;
6497
6498 for (i = 0; i < sched_domains_numa_levels; i++) {
6499 if (sched_domains_numa_distance[i] == distance)
6500 return true;
6501 }
6502
6503 return false;
6504}
6505
/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or it is not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes are connected via an intermediary node
 * NUMA_BACKPLANE: nodes can reach other nodes only through a backplane
 *
 * The difference between a glueless mesh topology and a backplane topology
 * lies in whether communication needs to go through a dedicated backplane
 * like a switch, or just through intermediary hops.
 *
 * With both intermediary hops and a backplane, the furthest nodes are not
 * directly reachable and traffic has to pass through intermediaries to
 * reach them.
 *
 * For a glueless mesh the intermediary hops are normal nodes; for a
 * backplane they are dedicated switches.
 */
6525static void init_numa_topology_type(void)
6526{
6527 int a, b, c, n;
6528
6529 n = sched_max_numa_distance;
6530
6531 if (sched_domains_numa_levels <= 1) {
6532 sched_numa_topology_type = NUMA_DIRECT;
6533 return;
6534 }
6535
6536 for_each_online_node(a) {
6537 for_each_online_node(b) {
			/* Find two nodes furthest removed from each other: */
			if (node_distance(a, b) < n)
				continue;

			/* Is there an intermediary node between a and b? */
6543 for_each_online_node(c) {
6544 if (node_distance(a, c) < n &&
6545 node_distance(b, c) < n) {
6546 sched_numa_topology_type =
6547 NUMA_GLUELESS_MESH;
6548 return;
6549 }
6550 }
6551
6552 sched_numa_topology_type = NUMA_BACKPLANE;
6553 return;
6554 }
6555 }
6556}
6557
6558static void sched_init_numa(void)
6559{
6560 int next_distance, curr_distance = node_distance(0, 0);
6561 struct sched_domain_topology_level *tl;
6562 int level = 0;
6563 int i, j, k;
6564
6565 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6566 if (!sched_domains_numa_distance)
6567 return;
6568

	/*
	 * De-duplicating selection over the node_distance() table: find the
	 * unique distances and record them, smallest first, in
	 * sched_domains_numa_distance[]; 'level' counts them.
	 *
	 * Assumes node 0's row contains every distance in the table (this
	 * is verified below when sched_debug() is enabled).
	 */
6576 next_distance = curr_distance;
6577 for (i = 0; i < nr_node_ids; i++) {
6578 for (j = 0; j < nr_node_ids; j++) {
6579 for (k = 0; k < nr_node_ids; k++) {
6580 int distance = node_distance(i, k);
6581
6582 if (distance > curr_distance &&
6583 (distance < next_distance ||
6584 next_distance == curr_distance))
6585 next_distance = distance;
6586
6587
6588
6589
6590
6591
6592 if (sched_debug() && node_distance(k, i) != distance)
6593 sched_numa_warn("Node-distance not symmetric");
6594
6595 if (sched_debug() && i && !find_numa_distance(distance))
6596 sched_numa_warn("Node-0 not representative");
6597 }
6598 if (next_distance != curr_distance) {
6599 sched_domains_numa_distance[level++] = next_distance;
6600 sched_domains_numa_levels = level;
6601 curr_distance = next_distance;
6602 } else break;
6603 }
6604
6605
6606
6607
6608 if (!sched_debug())
6609 break;
6610 }
6611
6612 if (!level)
6613 return;

	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i, i).
	 *
	 * The sched_domains_numa_distance[] array holds the actual distance
	 * values.
	 */

	/*
	 * Temporarily reset sched_domains_numa_levels to 0: if an allocation
	 * below fails, sched_domains_numa_masks[][] ends up with fewer than
	 * 'level' entries, and anything iterating it against the final level
	 * count would walk off the end. The level count is only published
	 * once every mask has been built, at the end of this function.
	 */
6632 sched_domains_numa_levels = 0;
6633
6634 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6635 if (!sched_domains_numa_masks)
6636 return;
6637
	/*
	 * Now for each level, construct a mask per node which contains all
	 * CPUs of nodes no further away than that level's distance.
	 */
6642 for (i = 0; i < level; i++) {
6643 sched_domains_numa_masks[i] =
6644 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6645 if (!sched_domains_numa_masks[i])
6646 return;
6647
6648 for (j = 0; j < nr_node_ids; j++) {
6649 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6650 if (!mask)
6651 return;
6652
6653 sched_domains_numa_masks[i][j] = mask;
6654
6655 for_each_node(k) {
6656 if (node_distance(j, k) > sched_domains_numa_distance[i])
6657 continue;
6658
6659 cpumask_or(mask, mask, cpumask_of_node(k));
6660 }
6661 }
6662 }
6663
	/* Compute the size of the current topology table: */
6665 for (i = 0; sched_domain_topology[i].mask; i++);
6666
6667 tl = kzalloc((i + level + 1) *
6668 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6669 if (!tl)
6670 return;
6671
	/*
	 * Copy the default topology bits..
	 */
6675 for (i = 0; sched_domain_topology[i].mask; i++)
6676 tl[i] = sched_domain_topology[i];
6677
	/*
	 * .. and append 'level' NUMA levels on top.
	 */
6681 for (j = 0; j < level; i++, j++) {
6682 tl[i] = (struct sched_domain_topology_level){
6683 .mask = sd_numa_mask,
6684 .sd_flags = cpu_numa_flags,
6685 .flags = SDTL_OVERLAP,
6686 .numa_level = j,
6687 SD_INIT_NAME(NUMA)
6688 };
6689 }
6690
6691 sched_domain_topology = tl;
6692
6693 sched_domains_numa_levels = level;
6694 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6695
6696 init_numa_topology_type();
6697}
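
/*
 * Worked example (hypothetical machine): with a node_distance() table of
 *
 *	10 21 31
 *	21 10 21
 *	31 21 10
 *
 * the scan above finds two unique remote distances, so
 * sched_domains_numa_distance[] = { 21, 31 } and level == 2. The level 0
 * masks cover all nodes within distance 21 of each node, the level 1
 * masks cover everything, and two NUMA topology levels are appended to
 * the topology table accordingly.
 */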
6698
6699static void sched_domains_numa_masks_set(unsigned int cpu)
6700{
6701 int node = cpu_to_node(cpu);
6702 int i, j;
6703
6704 for (i = 0; i < sched_domains_numa_levels; i++) {
6705 for (j = 0; j < nr_node_ids; j++) {
6706 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6707 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6708 }
6709 }
6710}
6711
6712static void sched_domains_numa_masks_clear(unsigned int cpu)
6713{
6714 int i, j;
6715
6716 for (i = 0; i < sched_domains_numa_levels; i++) {
6717 for (j = 0; j < nr_node_ids; j++)
6718 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6719 }
6720}
6721
6722#else
6723static inline void sched_init_numa(void) { }
6724static void sched_domains_numa_masks_set(unsigned int cpu) { }
6725static void sched_domains_numa_masks_clear(unsigned int cpu) { }
6726#endif
6727
6728static int __sdt_alloc(const struct cpumask *cpu_map)
6729{
6730 struct sched_domain_topology_level *tl;
6731 int j;
6732
6733 for_each_sd_topology(tl) {
6734 struct sd_data *sdd = &tl->data;
6735
6736 sdd->sd = alloc_percpu(struct sched_domain *);
6737 if (!sdd->sd)
6738 return -ENOMEM;
6739
6740 sdd->sg = alloc_percpu(struct sched_group *);
6741 if (!sdd->sg)
6742 return -ENOMEM;
6743
6744 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
6745 if (!sdd->sgc)
6746 return -ENOMEM;
6747
6748 for_each_cpu(j, cpu_map) {
6749 struct sched_domain *sd;
6750 struct sched_group *sg;
6751 struct sched_group_capacity *sgc;
6752
6753 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6754 GFP_KERNEL, cpu_to_node(j));
6755 if (!sd)
6756 return -ENOMEM;
6757
6758 *per_cpu_ptr(sdd->sd, j) = sd;
6759
6760 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6761 GFP_KERNEL, cpu_to_node(j));
6762 if (!sg)
6763 return -ENOMEM;
6764
6765 sg->next = sg;
6766
6767 *per_cpu_ptr(sdd->sg, j) = sg;
6768
6769 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
6770 GFP_KERNEL, cpu_to_node(j));
6771 if (!sgc)
6772 return -ENOMEM;
6773
6774 *per_cpu_ptr(sdd->sgc, j) = sgc;
6775 }
6776 }
6777
6778 return 0;
6779}
6780
6781static void __sdt_free(const struct cpumask *cpu_map)
6782{
6783 struct sched_domain_topology_level *tl;
6784 int j;
6785
6786 for_each_sd_topology(tl) {
6787 struct sd_data *sdd = &tl->data;
6788
6789 for_each_cpu(j, cpu_map) {
6790 struct sched_domain *sd;
6791
6792 if (sdd->sd) {
6793 sd = *per_cpu_ptr(sdd->sd, j);
6794 if (sd && (sd->flags & SD_OVERLAP))
6795 free_sched_groups(sd->groups, 0);
6796 kfree(*per_cpu_ptr(sdd->sd, j));
6797 }
6798
6799 if (sdd->sg)
6800 kfree(*per_cpu_ptr(sdd->sg, j));
6801 if (sdd->sgc)
6802 kfree(*per_cpu_ptr(sdd->sgc, j));
6803 }
6804 free_percpu(sdd->sd);
6805 sdd->sd = NULL;
6806 free_percpu(sdd->sg);
6807 sdd->sg = NULL;
6808 free_percpu(sdd->sgc);
6809 sdd->sgc = NULL;
6810 }
6811}
6812
6813struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6814 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6815 struct sched_domain *child, int cpu)
6816{
6817 struct sched_domain *sd = sd_init(tl, cpu);
6818 if (!sd)
6819 return child;
6820
6821 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6822 if (child) {
6823 sd->level = child->level + 1;
6824 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6825 child->parent = sd;
6826 sd->child = child;
6827
6828 if (!cpumask_subset(sched_domain_span(child),
6829 sched_domain_span(sd))) {
6830 pr_err("BUG: arch topology borken\n");
6831#ifdef CONFIG_SCHED_DEBUG
6832 pr_err(" the %s domain not a subset of the %s domain\n",
6833 child->name, sd->name);
6834#endif
6835
6836 cpumask_or(sched_domain_span(sd),
6837 sched_domain_span(sd),
6838 sched_domain_span(child));
6839 }
6840
6841 }
6842 set_domain_attribute(sd, attr);
6843
6844 return sd;
6845}
6846
/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs.
 */
6851static int build_sched_domains(const struct cpumask *cpu_map,
6852 struct sched_domain_attr *attr)
6853{
6854 enum s_alloc alloc_state;
6855 struct sched_domain *sd;
6856 struct s_data d;
6857 int i, ret = -ENOMEM;
6858
6859 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6860 if (alloc_state != sa_rootdomain)
6861 goto error;
6862
	/* Set up domains for the CPUs specified by cpu_map: */
6864 for_each_cpu(i, cpu_map) {
6865 struct sched_domain_topology_level *tl;
6866
6867 sd = NULL;
6868 for_each_sd_topology(tl) {
6869 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6870 if (tl == sched_domain_topology)
6871 *per_cpu_ptr(d.sd, i) = sd;
6872 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6873 sd->flags |= SD_OVERLAP;
6874 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6875 break;
6876 }
6877 }
6878
	/* Build the groups for the domains: */
6880 for_each_cpu(i, cpu_map) {
6881 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6882 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6883 if (sd->flags & SD_OVERLAP) {
6884 if (build_overlap_sched_groups(sd, i))
6885 goto error;
6886 } else {
6887 if (build_sched_groups(sd, i))
6888 goto error;
6889 }
6890 }
6891 }
6892
	/* Calculate CPU capacity for physical packages and nodes: */
6894 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6895 if (!cpumask_test_cpu(i, cpu_map))
6896 continue;
6897
6898 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6899 claim_allocations(i, sd);
6900 init_sched_groups_capacity(i, sd);
6901 }
6902 }
6903
	/* Attach the domains: */
6905 rcu_read_lock();
6906 for_each_cpu(i, cpu_map) {
6907 sd = *per_cpu_ptr(d.sd, i);
6908 cpu_attach_domain(sd, d.rd, i);
6909 }
6910 rcu_read_unlock();
6911
6912 ret = 0;
6913error:
6914 __free_domain_allocs(&d, alloc_state, cpu_map);
6915 return ret;
6916}
6917
6918static cpumask_var_t *doms_cur;
6919static int ndoms_cur;
6920static struct sched_domain_attr *dattr_cur;
6921
/*
 * Special case: if a kmalloc() of a doms_cur partition (array of
 * cpumasks) fails, fall back to a single sched domain, as determined
 * by the single cpumask fallback_doms.
 */
6928static cpumask_var_t fallback_doms;
6929
/*
 * arch_update_cpu_topology() lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
6935int __weak arch_update_cpu_topology(void)
6936{
6937 return 0;
6938}
6939
6940cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6941{
6942 int i;
6943 cpumask_var_t *doms;
6944
6945 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6946 if (!doms)
6947 return NULL;
6948 for (i = 0; i < ndoms; i++) {
6949 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6950 free_sched_domains(doms, i);
6951 return NULL;
6952 }
6953 }
6954 return doms;
6955}
6956
6957void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6958{
6959 unsigned int i;
6960 for (i = 0; i < ndoms; i++)
6961 free_cpumask_var(doms[i]);
6962 kfree(doms);
6963}
6964
/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated CPUs, but could be used to exclude
 * other special cases in the future.
 */
6970static int init_sched_domains(const struct cpumask *cpu_map)
6971{
6972 int err;
6973
6974 arch_update_cpu_topology();
6975 ndoms_cur = 1;
6976 doms_cur = alloc_sched_domains(ndoms_cur);
6977 if (!doms_cur)
6978 doms_cur = &fallback_doms;
6979 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6980 err = build_sched_domains(doms_cur[0], NULL);
6981 register_sched_domain_sysctl();
6982
6983 return err;
6984}
6985
/*
 * Detach sched domains from the group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */
6990static void detach_destroy_domains(const struct cpumask *cpu_map)
6991{
6992 int i;
6993
6994 rcu_read_lock();
6995 for_each_cpu(i, cpu_map)
6996 cpu_attach_domain(NULL, &def_root_domain, i);
6997 rcu_read_unlock();
6998}
6999
7000
7001static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7002 struct sched_domain_attr *new, int idx_new)
7003{
7004 struct sched_domain_attr tmp;
7005
7006
7007 if (!new && !cur)
7008 return 1;
7009
7010 tmp = SD_ATTR_INIT;
7011 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7012 new ? (new + idx_new) : &tmp,
7013 sizeof(struct sched_domain_attr));
7014}
7015
/*
 * Partition sched domains as specified by the 'ndoms_new' cpumasks in the
 * array doms_new[]. This compares doms_new[] to the current sched domain
 * partitioning, doms_cur[], destroys each deleted domain and builds each
 * new one.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. The
 * masks don't intersect (don't overlap); we set up one sched domain per
 * mask. CPUs not in any of the cpumasks will not be load balanced. If the
 * same cpumask appears both in the current 'doms_cur' domains and in the
 * new 'doms_new', we can leave it as it is.
 *
 * The passed in 'doms_new' should be allocated using alloc_sched_domains().
 * This routine takes ownership of it and will free_sched_domains() it when
 * done with it. If the caller failed the alloc call, it can pass in
 * doms_new == NULL && ndoms_new == 1, and partition_sched_domains() will
 * fall back to the single partition 'fallback_doms'; this also forces the
 * domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_active_mask minus the
 * isolated CPUs. ndoms_new == 0 is a special case for destroying existing
 * domains; it will not create the default domain.
 *
 * Call with the hotplug lock held.
 */
7042void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7043 struct sched_domain_attr *dattr_new)
7044{
7045 int i, j, n;
7046 int new_topology;
7047
7048 mutex_lock(&sched_domains_mutex);
7049
7050
7051 unregister_sched_domain_sysctl();
7052
7053
7054 new_topology = arch_update_cpu_topology();
7055
7056 n = doms_new ? ndoms_new : 0;
7057
7058
7059 for (i = 0; i < ndoms_cur; i++) {
7060 for (j = 0; j < n && !new_topology; j++) {
7061 if (cpumask_equal(doms_cur[i], doms_new[j])
7062 && dattrs_equal(dattr_cur, i, dattr_new, j))
7063 goto match1;
7064 }
7065
7066 detach_destroy_domains(doms_cur[i]);
7067match1:
7068 ;
7069 }
7070
7071 n = ndoms_cur;
7072 if (doms_new == NULL) {
7073 n = 0;
7074 doms_new = &fallback_doms;
7075 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7076 WARN_ON_ONCE(dattr_new);
7077 }
7078
7079
7080 for (i = 0; i < ndoms_new; i++) {
7081 for (j = 0; j < n && !new_topology; j++) {
7082 if (cpumask_equal(doms_new[i], doms_cur[j])
7083 && dattrs_equal(dattr_new, i, dattr_cur, j))
7084 goto match2;
7085 }
7086
7087 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7088match2:
7089 ;
7090 }
7091
7092
7093 if (doms_cur != &fallback_doms)
7094 free_sched_domains(doms_cur, ndoms_cur);
7095 kfree(dattr_cur);
7096 doms_cur = doms_new;
7097 dattr_cur = dattr_new;
7098 ndoms_cur = ndoms_new;
7099
7100 register_sched_domain_sysctl();
7101
7102 mutex_unlock(&sched_domains_mutex);
7103}
7104
7105static int num_cpus_frozen;
7106
/*
 * Update cpusets according to cpu_active mask. If cpusets are disabled,
 * cpuset_update_active_cpus() becomes a simple wrapper around
 * partition_sched_domains().
 *
 * During suspend/resume we do not touch cpusets at all, because we want
 * to restore them to their original state on resume anyway.
 */
7115static void cpuset_cpu_active(void)
7116{
7117 if (cpuhp_tasks_frozen) {
7118
7119
7120
7121
7122
7123
7124 num_cpus_frozen--;
7125 if (likely(num_cpus_frozen)) {
7126 partition_sched_domains(1, NULL, NULL);
7127 return;
7128 }
7129
7130
7131
7132
7133
7134 }
7135 cpuset_update_active_cpus(true);
7136}
7137
7138static int cpuset_cpu_inactive(unsigned int cpu)
7139{
7140 unsigned long flags;
7141 struct dl_bw *dl_b;
7142 bool overflow;
7143 int cpus;
7144
7145 if (!cpuhp_tasks_frozen) {
7146 rcu_read_lock_sched();
7147 dl_b = dl_bw_of(cpu);
7148
7149 raw_spin_lock_irqsave(&dl_b->lock, flags);
7150 cpus = dl_bw_cpus(cpu);
7151 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7152 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7153
7154 rcu_read_unlock_sched();
7155
7156 if (overflow)
7157 return -EBUSY;
7158 cpuset_update_active_cpus(false);
7159 } else {
7160 num_cpus_frozen++;
7161 partition_sched_domains(1, NULL, NULL);
7162 }
7163 return 0;
7164}
7165
7166int sched_cpu_activate(unsigned int cpu)
7167{
7168 struct rq *rq = cpu_rq(cpu);
7169 unsigned long flags;
7170
7171 set_cpu_active(cpu, true);
7172
7173 if (sched_smp_initialized) {
7174 sched_domains_numa_masks_set(cpu);
7175 cpuset_cpu_active();
7176 }
7177
	/*
	 * Put the rq online, if not already. This happens:
	 *
	 * 1) In the early boot process, because we build the real domains
	 *    after all CPUs have been brought up.
	 *
	 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
	 *    domains.
	 */
7187 raw_spin_lock_irqsave(&rq->lock, flags);
7188 if (rq->rd) {
7189 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7190 set_rq_online(rq);
7191 }
7192 raw_spin_unlock_irqrestore(&rq->lock, flags);
7193
7194 update_max_interval();
7195
7196 return 0;
7197}
7198
7199int sched_cpu_deactivate(unsigned int cpu)
7200{
7201 int ret;
7202
7203 set_cpu_active(cpu, false);
7204
	/*
	 * We've cleared cpu_active_mask; wait for all preempt-disabled and
	 * RCU users of this state to go away, so that all new such users
	 * will observe it.
	 *
	 * With CONFIG_PREEMPT, synchronize_rcu() alone does not imply
	 * synchronize_sched(), so wait for both.
	 */
7214 if (IS_ENABLED(CONFIG_PREEMPT))
7215 synchronize_rcu_mult(call_rcu, call_rcu_sched);
7216 else
7217 synchronize_rcu();
7218
7219 if (!sched_smp_initialized)
7220 return 0;
7221
7222 ret = cpuset_cpu_inactive(cpu);
7223 if (ret) {
7224 set_cpu_active(cpu, true);
7225 return ret;
7226 }
7227 sched_domains_numa_masks_clear(cpu);
7228 return 0;
7229}
7230
7231static void sched_rq_cpu_starting(unsigned int cpu)
7232{
7233 struct rq *rq = cpu_rq(cpu);
7234
7235 rq->calc_load_update = calc_load_update;
7236 account_reset_rq(rq);
7237 update_max_interval();
7238}
7239
7240int sched_cpu_starting(unsigned int cpu)
7241{
7242 set_cpu_rq_start_time(cpu);
7243 sched_rq_cpu_starting(cpu);
7244 return 0;
7245}
7246
7247#ifdef CONFIG_HOTPLUG_CPU
7248int sched_cpu_dying(unsigned int cpu)
7249{
7250 struct rq *rq = cpu_rq(cpu);
7251 unsigned long flags;
7252
7253
7254 sched_ttwu_pending();
7255 raw_spin_lock_irqsave(&rq->lock, flags);
7256 if (rq->rd) {
7257 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7258 set_rq_offline(rq);
7259 }
7260 migrate_tasks(rq);
7261 BUG_ON(rq->nr_running != 1);
7262 raw_spin_unlock_irqrestore(&rq->lock, flags);
7263 calc_load_migrate(rq);
7264 update_max_interval();
7265 nohz_balance_exit_idle(cpu);
7266 hrtick_clear(rq);
7267 return 0;
7268}
7269#endif
7270
7271void __init sched_init_smp(void)
7272{
7273 cpumask_var_t non_isolated_cpus;
7274
7275 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7276 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7277
7278 sched_init_numa();
7279
7280
7281
7282
7283
7284
7285 mutex_lock(&sched_domains_mutex);
7286 init_sched_domains(cpu_active_mask);
7287 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7288 if (cpumask_empty(non_isolated_cpus))
7289 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7290 mutex_unlock(&sched_domains_mutex);
7291
7292
7293 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7294 BUG();
7295 sched_init_granularity();
7296 free_cpumask_var(non_isolated_cpus);
7297
7298 init_sched_rt_class();
7299 init_sched_dl_class();
7300 sched_smp_initialized = true;
7301}
7302
7303static int __init migration_init(void)
7304{
7305 sched_rq_cpu_starting(smp_processor_id());
7306 return 0;
7307}
7308early_initcall(migration_init);
7309
7310#else
7311void __init sched_init_smp(void)
7312{
7313 sched_init_granularity();
7314}
7315#endif
7316
7317int in_sched_functions(unsigned long addr)
7318{
7319 return in_lock_functions(addr) ||
7320 (addr >= (unsigned long)__sched_text_start
7321 && addr < (unsigned long)__sched_text_end);
7322}
7323
7324#ifdef CONFIG_CGROUP_SCHED

/*
 * Default task group.
 * Every task in the system belongs to this group at bootup.
 */
struct task_group root_task_group;
LIST_HEAD(task_groups);

/* Cacheline-aligned slab cache for task_group: */
7333static struct kmem_cache *task_group_cache __read_mostly;
7334#endif
7335
7336DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7337
7338void __init sched_init(void)
7339{
7340 int i, j;
7341 unsigned long alloc_size = 0, ptr;
7342
7343#ifdef CONFIG_FAIR_GROUP_SCHED
7344 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7345#endif
7346#ifdef CONFIG_RT_GROUP_SCHED
7347 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7348#endif
7349 if (alloc_size) {
7350 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7351
7352#ifdef CONFIG_FAIR_GROUP_SCHED
7353 root_task_group.se = (struct sched_entity **)ptr;
7354 ptr += nr_cpu_ids * sizeof(void **);
7355
7356 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7357 ptr += nr_cpu_ids * sizeof(void **);
7358
7359#endif
7360#ifdef CONFIG_RT_GROUP_SCHED
7361 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7362 ptr += nr_cpu_ids * sizeof(void **);
7363
7364 root_task_group.rt_rq = (struct rt_rq **)ptr;
7365 ptr += nr_cpu_ids * sizeof(void **);
7366
7367#endif
7368 }
7369#ifdef CONFIG_CPUMASK_OFFSTACK
7370 for_each_possible_cpu(i) {
7371 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7372 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7373 }
7374#endif
7375
7376 init_rt_bandwidth(&def_rt_bandwidth,
7377 global_rt_period(), global_rt_runtime());
7378 init_dl_bandwidth(&def_dl_bandwidth,
7379 global_rt_period(), global_rt_runtime());
7380
7381#ifdef CONFIG_SMP
7382 init_defrootdomain();
7383#endif
7384
7385#ifdef CONFIG_RT_GROUP_SCHED
7386 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7387 global_rt_period(), global_rt_runtime());
7388#endif
7389
7390#ifdef CONFIG_CGROUP_SCHED
7391 task_group_cache = KMEM_CACHE(task_group, 0);
7392
7393 list_add(&root_task_group.list, &task_groups);
7394 INIT_LIST_HEAD(&root_task_group.children);
7395 INIT_LIST_HEAD(&root_task_group.siblings);
7396 autogroup_init(&init_task);
7397#endif
7398
7399 for_each_possible_cpu(i) {
7400 struct rq *rq;
7401
7402 rq = cpu_rq(i);
7403 raw_spin_lock_init(&rq->lock);
7404 rq->nr_running = 0;
7405 rq->calc_load_active = 0;
7406 rq->calc_load_update = jiffies + LOAD_FREQ;
7407 init_cfs_rq(&rq->cfs);
7408 init_rt_rq(&rq->rt);
7409 init_dl_rq(&rq->dl);
7410#ifdef CONFIG_FAIR_GROUP_SCHED
7411 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7412 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);

		/*
		 * How much CPU bandwidth does root_task_group get?
		 *
		 * In case of task groups formed through the cgroup filesystem,
		 * it gets 100% of the CPU resources in the system. This overall
		 * system CPU resource is divided among the tasks of
		 * root_task_group and its child task groups in a fair manner,
		 * based on each entity's (task or task group's) weight
		 * (se->load.weight).
		 *
		 * In other words, if root_task_group has 10 tasks of weight
		 * 1024 and two child groups A0 and A1 (of weight 1024 each),
		 * then A0's share of the CPU resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */
7432 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
7433 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7434#endif
7435
7436 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7437#ifdef CONFIG_RT_GROUP_SCHED
7438 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7439#endif
7440
7441 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7442 rq->cpu_load[j] = 0;
7443
7444#ifdef CONFIG_SMP
7445 rq->sd = NULL;
7446 rq->rd = NULL;
7447 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
7448 rq->balance_callback = NULL;
7449 rq->active_balance = 0;
7450 rq->next_balance = jiffies;
7451 rq->push_cpu = 0;
7452 rq->cpu = i;
7453 rq->online = 0;
7454 rq->idle_stamp = 0;
7455 rq->avg_idle = 2*sysctl_sched_migration_cost;
7456 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
7457
7458 INIT_LIST_HEAD(&rq->cfs_tasks);
7459
7460 rq_attach_root(rq, &def_root_domain);
7461#ifdef CONFIG_NO_HZ_COMMON
7462 rq->last_load_update_tick = jiffies;
7463 rq->nohz_flags = 0;
7464#endif
7465#ifdef CONFIG_NO_HZ_FULL
7466 rq->last_sched_tick = 0;
7467#endif
7468#endif
7469 init_rq_hrtick(rq);
7470 atomic_set(&rq->nr_iowait, 0);
7471 }
7472
7473 set_load_weight(&init_task);
7474
7475#ifdef CONFIG_PREEMPT_NOTIFIERS
7476 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7477#endif
7478
7479
7480
7481
7482 atomic_inc(&init_mm.mm_count);
7483 enter_lazy_tlb(&init_mm, current);
7484
7485
7486
7487
7488 current->sched_class = &fair_sched_class;
7489
	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
7496 init_idle(current, smp_processor_id());
7497
7498 calc_load_update = jiffies + LOAD_FREQ;
7499
7500#ifdef CONFIG_SMP
7501 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7502
7503 if (cpu_isolated_map == NULL)
7504 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7505 idle_thread_set_boot_cpu();
7506 set_cpu_rq_start_time(smp_processor_id());
7507#endif
7508 init_sched_fair_class();
7509
7510 init_schedstats();
7511
7512 scheduler_running = 1;
7513}
7514
7515#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7516static inline int preempt_count_equals(int preempt_offset)
7517{
7518 int nested = preempt_count() + rcu_preempt_depth();
7519
7520 return (nested == preempt_offset);
7521}
7522
7523void __might_sleep(const char *file, int line, int preempt_offset)
7524{
	/*
	 * Blocking primitives will set (and therefore destroy) current->state;
	 * since we exit with TASK_RUNNING, make sure we entered with it as
	 * well, otherwise the caller's state is silently lost.
	 */
7530 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
7531 "do not call blocking ops when !TASK_RUNNING; "
7532 "state=%lx set at [<%p>] %pS\n",
7533 current->state,
7534 (void *)current->task_state_change,
7535 (void *)current->task_state_change);
7536
7537 ___might_sleep(file, line, preempt_offset);
7538}
7539EXPORT_SYMBOL(__might_sleep);
7540
7541void ___might_sleep(const char *file, int line, int preempt_offset)
7542{
7543 static unsigned long prev_jiffy;
7544
7545 rcu_sleep_check();
7546 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
7547 !is_idle_task(current)) ||
7548 system_state != SYSTEM_RUNNING || oops_in_progress)
7549 return;
7550 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7551 return;
7552 prev_jiffy = jiffies;
7553
7554 printk(KERN_ERR
7555 "BUG: sleeping function called from invalid context at %s:%d\n",
7556 file, line);
7557 printk(KERN_ERR
7558 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7559 in_atomic(), irqs_disabled(),
7560 current->pid, current->comm);
7561
7562 if (task_stack_end_corrupted(current))
7563 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7564
7565 debug_show_held_locks(current);
7566 if (irqs_disabled())
7567 print_irqtrace_events(current);
7568#ifdef CONFIG_DEBUG_PREEMPT
7569 if (!preempt_count_equals(preempt_offset)) {
7570 pr_err("Preemption disabled at:");
7571 print_ip_sym(current->preempt_disable_ip);
7572 pr_cont("\n");
7573 }
7574#endif
7575 dump_stack();
7576}
7577EXPORT_SYMBOL(___might_sleep);
7578#endif
7579
7580#ifdef CONFIG_MAGIC_SYSRQ
7581void normalize_rt_tasks(void)
7582{
7583 struct task_struct *g, *p;
7584 struct sched_attr attr = {
7585 .sched_policy = SCHED_NORMAL,
7586 };
7587
7588 read_lock(&tasklist_lock);
7589 for_each_process_thread(g, p) {
7590
7591
7592
7593 if (p->flags & PF_KTHREAD)
7594 continue;
7595
7596 p->se.exec_start = 0;
7597#ifdef CONFIG_SCHEDSTATS
7598 p->se.statistics.wait_start = 0;
7599 p->se.statistics.sleep_start = 0;
7600 p->se.statistics.block_start = 0;
7601#endif
7602
7603 if (!dl_task(p) && !rt_task(p)) {
7604
7605
7606
7607
7608 if (task_nice(p) < 0)
7609 set_user_nice(p, 0);
7610 continue;
7611 }
7612
7613 __sched_setscheduler(p, &attr, false, false);
7614 }
7615 read_unlock(&tasklist_lock);
7616}
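
/*
 * normalize_rt_tasks() above is intended to be invoked from the magic
 * SysRq handler ('n' key): it demotes every user RT/deadline task back to
 * SCHED_NORMAL and resets negative nice values to 0, as a last-resort
 * recovery when runaway RT tasks have locked up the machine.
 */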
7617
7618#endif
7619
7620#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for IA64 MCA handling or kdb.
 *
 * They can only be called when the whole system has been stopped - every
 * CPU needs to be quiescent, and no scheduling activity can take place.
 * Using them for anything else would be a serious bug, and as a result
 * they aren't even visible under any other configuration.
 */

/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
7639struct task_struct *curr_task(int cpu)
7640{
7641 return cpu_curr(cpu);
7642}
7643
7644#endif
7645
7646#ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a CPU in a non-blocking manner. It must be
 * called with all CPUs synchronized and interrupts disabled; the caller must
 * save the original value of the current task (see curr_task() above) and
 * restore it before re-enabling interrupts and rescheduling.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7662void set_curr_task(int cpu, struct task_struct *p)
7663{
7664 cpu_curr(cpu) = p;
7665}
7666
7667#endif
7668
7669#ifdef CONFIG_CGROUP_SCHED
7670
7671static DEFINE_SPINLOCK(task_group_lock);
7672
7673static void sched_free_group(struct task_group *tg)
7674{
7675 free_fair_sched_group(tg);
7676 free_rt_sched_group(tg);
7677 autogroup_free(tg);
7678 kmem_cache_free(task_group_cache, tg);
7679}
7680
7681
7682struct task_group *sched_create_group(struct task_group *parent)
7683{
7684 struct task_group *tg;
7685
7686 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
7687 if (!tg)
7688 return ERR_PTR(-ENOMEM);
7689
7690 if (!alloc_fair_sched_group(tg, parent))
7691 goto err;
7692
7693 if (!alloc_rt_sched_group(tg, parent))
7694 goto err;
7695
7696 return tg;
7697
7698err:
7699 sched_free_group(tg);
7700 return ERR_PTR(-ENOMEM);
7701}
7702
7703void sched_online_group(struct task_group *tg, struct task_group *parent)
7704{
7705 unsigned long flags;
7706
7707 spin_lock_irqsave(&task_group_lock, flags);
7708 list_add_rcu(&tg->list, &task_groups);
7709
7710 WARN_ON(!parent);
7711
7712 tg->parent = parent;
7713 INIT_LIST_HEAD(&tg->children);
7714 list_add_rcu(&tg->siblings, &parent->children);
7715 spin_unlock_irqrestore(&task_group_lock, flags);
7716}
7717
7718
7719static void sched_free_group_rcu(struct rcu_head *rhp)
7720{
7721
7722 sched_free_group(container_of(rhp, struct task_group, rcu));
7723}
7724
7725void sched_destroy_group(struct task_group *tg)
7726{
7727
7728 call_rcu(&tg->rcu, sched_free_group_rcu);
7729}
7730
7731void sched_offline_group(struct task_group *tg)
7732{
7733 unsigned long flags;
7734
7735
7736 unregister_fair_sched_group(tg);
7737
7738 spin_lock_irqsave(&task_group_lock, flags);
7739 list_del_rcu(&tg->list);
7740 list_del_rcu(&tg->siblings);
7741 spin_unlock_irqrestore(&task_group_lock, flags);
7742}
7743
/*
 * Change a task's runqueue association when it moves between task groups:
 * re-read the task's cgroup, update tsk->sched_task_group and re-queue the
 * task on the new group's runqueues if it was queued or running.
 */
7749void sched_move_task(struct task_struct *tsk)
7750{
7751 struct task_group *tg;
7752 int queued, running;
7753 struct rq_flags rf;
7754 struct rq *rq;
7755
7756 rq = task_rq_lock(tsk, &rf);
7757
7758 running = task_current(rq, tsk);
7759 queued = task_on_rq_queued(tsk);
7760
7761 if (queued)
7762 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7763 if (unlikely(running))
7764 put_prev_task(rq, tsk);
7765
7766
7767
7768
7769
7770
7771 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
7772 struct task_group, css);
7773 tg = autogroup_task_group(tsk, tg);
7774 tsk->sched_task_group = tg;
7775
7776#ifdef CONFIG_FAIR_GROUP_SCHED
7777 if (tsk->sched_class->task_move_group)
7778 tsk->sched_class->task_move_group(tsk);
7779 else
7780#endif
7781 set_task_rq(tsk, task_cpu(tsk));
7782
7783 if (unlikely(running))
7784 tsk->sched_class->set_curr_task(rq);
7785 if (queued)
7786 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
7787
7788 task_rq_unlock(rq, tsk, &rf);
7789}
7790#endif
7791
7792#ifdef CONFIG_RT_GROUP_SCHED
7793
7794
7795
7796static DEFINE_MUTEX(rt_constraints_mutex);
7797
7798
7799static inline int tg_has_rt_tasks(struct task_group *tg)
7800{
7801 struct task_struct *g, *p;
7802
7803
7804
7805
7806 if (task_group_is_autogroup(tg))
7807 return 0;
7808
7809 for_each_process_thread(g, p) {
7810 if (rt_task(p) && task_group(p) == tg)
7811 return 1;
7812 }
7813
7814 return 0;
7815}
7816
7817struct rt_schedulable_data {
7818 struct task_group *tg;
7819 u64 rt_period;
7820 u64 rt_runtime;
7821};
7822
7823static int tg_rt_schedulable(struct task_group *tg, void *data)
7824{
7825 struct rt_schedulable_data *d = data;
7826 struct task_group *child;
7827 unsigned long total, sum = 0;
7828 u64 period, runtime;
7829
7830 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7831 runtime = tg->rt_bandwidth.rt_runtime;
7832
7833 if (tg == d->tg) {
7834 period = d->rt_period;
7835 runtime = d->rt_runtime;
7836 }
7837
7838
7839
7840
7841 if (runtime > period && runtime != RUNTIME_INF)
7842 return -EINVAL;
7843
7844
7845
7846
7847 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7848 return -EBUSY;
7849
7850 total = to_ratio(period, runtime);
7851
7852
7853
7854
7855 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7856 return -EINVAL;
7857
7858
7859
7860
7861 list_for_each_entry_rcu(child, &tg->children, siblings) {
7862 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7863 runtime = child->rt_bandwidth.rt_runtime;
7864
7865 if (child == d->tg) {
7866 period = d->rt_period;
7867 runtime = d->rt_runtime;
7868 }
7869
7870 sum += to_ratio(period, runtime);
7871 }
7872
7873 if (sum > total)
7874 return -EINVAL;
7875
7876 return 0;
7877}
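
/*
 * The invariant enforced above: a group may not promise more runtime than
 * its period, its children's runtime/period ratios must not add up to more
 * than its own, and no single ratio may exceed the global
 * rt_runtime/rt_period ratio. E.g. a group whose own ratio is 50% can host
 * two children at 25% each, but not a third.
 */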
7878
7879static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7880{
7881 int ret;
7882
7883 struct rt_schedulable_data data = {
7884 .tg = tg,
7885 .rt_period = period,
7886 .rt_runtime = runtime,
7887 };
7888
7889 rcu_read_lock();
7890 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7891 rcu_read_unlock();
7892
7893 return ret;
7894}
7895
7896static int tg_set_rt_bandwidth(struct task_group *tg,
7897 u64 rt_period, u64 rt_runtime)
7898{
7899 int i, err = 0;

	/*
	 * Disallowing RT runtime for the root group would prevent any RT
	 * task (including kernel threads) from ever running, so reject it.
	 */
	if (tg == &root_task_group && rt_runtime == 0)
		return -EINVAL;

	/* A zero period does not make any sense: */
7909 if (rt_period == 0)
7910 return -EINVAL;
7911
7912 mutex_lock(&rt_constraints_mutex);
7913 read_lock(&tasklist_lock);
7914 err = __rt_schedulable(tg, rt_period, rt_runtime);
7915 if (err)
7916 goto unlock;
7917
7918 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7919 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7920 tg->rt_bandwidth.rt_runtime = rt_runtime;
7921
7922 for_each_possible_cpu(i) {
7923 struct rt_rq *rt_rq = tg->rt_rq[i];
7924
7925 raw_spin_lock(&rt_rq->rt_runtime_lock);
7926 rt_rq->rt_runtime = rt_runtime;
7927 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7928 }
7929 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7930unlock:
7931 read_unlock(&tasklist_lock);
7932 mutex_unlock(&rt_constraints_mutex);
7933
7934 return err;
7935}
7936
7937static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7938{
7939 u64 rt_runtime, rt_period;
7940
7941 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7942 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7943 if (rt_runtime_us < 0)
7944 rt_runtime = RUNTIME_INF;
7945
7946 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7947}
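
/*
 * Example: writing 500000 to a group's cpu.rt_runtime_us, with
 * cpu.rt_period_us set to 1000000, allows that group's RT tasks 0.5s of
 * RT execution per 1s period on each runqueue; writing a negative value
 * maps to RUNTIME_INF, i.e. no group-level limit.
 */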
7948
7949static long sched_group_rt_runtime(struct task_group *tg)
7950{
7951 u64 rt_runtime_us;
7952
7953 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7954 return -1;
7955
7956 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7957 do_div(rt_runtime_us, NSEC_PER_USEC);
7958 return rt_runtime_us;
7959}
7960
7961static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
7962{
7963 u64 rt_runtime, rt_period;
7964
7965 rt_period = rt_period_us * NSEC_PER_USEC;
7966 rt_runtime = tg->rt_bandwidth.rt_runtime;
7967
7968 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7969}
7970
7971static long sched_group_rt_period(struct task_group *tg)
7972{
7973 u64 rt_period_us;
7974
7975 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7976 do_div(rt_period_us, NSEC_PER_USEC);
7977 return rt_period_us;
7978}
7979#endif
7980
7981#ifdef CONFIG_RT_GROUP_SCHED
7982static int sched_rt_global_constraints(void)
7983{
7984 int ret = 0;
7985
7986 mutex_lock(&rt_constraints_mutex);
7987 read_lock(&tasklist_lock);
7988 ret = __rt_schedulable(NULL, 0, 0);
7989 read_unlock(&tasklist_lock);
7990 mutex_unlock(&rt_constraints_mutex);
7991
7992 return ret;
7993}
7994
7995static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7996{
7997
7998 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7999 return 0;
8000
8001 return 1;
8002}
8003
8004#else
8005static int sched_rt_global_constraints(void)
8006{
8007 unsigned long flags;
8008 int i;
8009
8010 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8011 for_each_possible_cpu(i) {
8012 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8013
8014 raw_spin_lock(&rt_rq->rt_runtime_lock);
8015 rt_rq->rt_runtime = global_rt_runtime();
8016 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8017 }
8018 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8019
8020 return 0;
8021}
8022#endif
8023
8024static int sched_dl_global_validate(void)
8025{
8026 u64 runtime = global_rt_runtime();
8027 u64 period = global_rt_period();
8028 u64 new_bw = to_ratio(period, runtime);
8029 struct dl_bw *dl_b;
8030 int cpu, ret = 0;
8031 unsigned long flags;

	/*
	 * Check that the new bandwidth is not smaller than what -deadline
	 * tasks have already reserved in any of the root domains. Cycling
	 * over all possible CPUs is more than strictly necessary, but it is
	 * simpler than walking the root domains themselves.
	 */
8042 for_each_possible_cpu(cpu) {
8043 rcu_read_lock_sched();
8044 dl_b = dl_bw_of(cpu);
8045
8046 raw_spin_lock_irqsave(&dl_b->lock, flags);
8047 if (new_bw < dl_b->total_bw)
8048 ret = -EBUSY;
8049 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
8050
8051 rcu_read_unlock_sched();
8052
8053 if (ret)
8054 break;
8055 }
8056
8057 return ret;
8058}
8059
8060static void sched_dl_do_global(void)
8061{
8062 u64 new_bw = -1;
8063 struct dl_bw *dl_b;
8064 int cpu;
8065 unsigned long flags;
8066
8067 def_dl_bandwidth.dl_period = global_rt_period();
8068 def_dl_bandwidth.dl_runtime = global_rt_runtime();
8069
8070 if (global_rt_runtime() != RUNTIME_INF)
8071 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
8072
8073
8074
8075
8076 for_each_possible_cpu(cpu) {
8077 rcu_read_lock_sched();
8078 dl_b = dl_bw_of(cpu);
8079
8080 raw_spin_lock_irqsave(&dl_b->lock, flags);
8081 dl_b->bw = new_bw;
8082 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
8083
8084 rcu_read_unlock_sched();
8085 }
8086}
8087
8088static int sched_rt_global_validate(void)
8089{
8090 if (sysctl_sched_rt_period <= 0)
8091 return -EINVAL;
8092
8093 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
8094 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
8095 return -EINVAL;
8096
8097 return 0;
8098}
8099
8100static void sched_rt_do_global(void)
8101{
8102 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8103 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
8104}
8105
8106int sched_rt_handler(struct ctl_table *table, int write,
8107 void __user *buffer, size_t *lenp,
8108 loff_t *ppos)
8109{
8110 int old_period, old_runtime;
8111 static DEFINE_MUTEX(mutex);
8112 int ret;
8113
8114 mutex_lock(&mutex);
8115 old_period = sysctl_sched_rt_period;
8116 old_runtime = sysctl_sched_rt_runtime;
8117
8118 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8119
8120 if (!ret && write) {
8121 ret = sched_rt_global_validate();
8122 if (ret)
8123 goto undo;
8124
8125 ret = sched_dl_global_validate();
8126 if (ret)
8127 goto undo;
8128
8129 ret = sched_rt_global_constraints();
8130 if (ret)
8131 goto undo;
8132
8133 sched_rt_do_global();
8134 sched_dl_do_global();
8135 }
8136 if (0) {
8137undo:
8138 sysctl_sched_rt_period = old_period;
8139 sysctl_sched_rt_runtime = old_runtime;
8140 }
8141 mutex_unlock(&mutex);
8142
8143 return ret;
8144}
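
/*
 * sched_rt_handler() backs the sched_rt_period_us / sched_rt_runtime_us
 * sysctls: a write is validated (the runtime must not exceed the period
 * unless it is RUNTIME_INF, and it may not shrink below what -deadline
 * tasks have already reserved) and is rolled back on failure; only then
 * are the RT and deadline defaults updated.
 */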
8145
8146int sched_rr_handler(struct ctl_table *table, int write,
8147 void __user *buffer, size_t *lenp,
8148 loff_t *ppos)
8149{
8150 int ret;
8151 static DEFINE_MUTEX(mutex);
8152
8153 mutex_lock(&mutex);
8154 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8155
8156
8157 if (!ret && write) {
8158 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
8159 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
8160 }
8161 mutex_unlock(&mutex);
8162 return ret;
8163}
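
/*
 * Likewise, sched_rr_handler() backs the sched_rr_timeslice_ms sysctl:
 * a non-positive value restores the default RR_TIMESLICE, any other value
 * is converted from milliseconds to jiffies.
 */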
8164
8165#ifdef CONFIG_CGROUP_SCHED
8166
8167static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8168{
8169 return css ? container_of(css, struct task_group, css) : NULL;
8170}
8171
8172static struct cgroup_subsys_state *
8173cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8174{
8175 struct task_group *parent = css_tg(parent_css);
8176 struct task_group *tg;
8177
8178 if (!parent) {
8179
8180 return &root_task_group.css;
8181 }
8182
8183 tg = sched_create_group(parent);
8184 if (IS_ERR(tg))
8185 return ERR_PTR(-ENOMEM);
8186
8187 sched_online_group(tg, parent);
8188
8189 return &tg->css;
8190}
8191
8192static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8193{
8194 struct task_group *tg = css_tg(css);
8195
8196 sched_offline_group(tg);
8197}
8198
8199static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8200{
8201 struct task_group *tg = css_tg(css);
8202
8203
8204
8205
8206 sched_free_group(tg);
8207}
8208
8209static void cpu_cgroup_fork(struct task_struct *task)
8210{
8211 sched_move_task(task);
8212}
8213
8214static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8215{
8216 struct task_struct *task;
8217 struct cgroup_subsys_state *css;
8218
8219 cgroup_taskset_for_each(task, css, tset) {
8220#ifdef CONFIG_RT_GROUP_SCHED
8221 if (!sched_rt_can_attach(css_tg(css), task))
8222 return -EINVAL;
8223#else
8224
8225 if (task->sched_class != &fair_sched_class)
8226 return -EINVAL;
8227#endif
8228 }
8229 return 0;
8230}
8231
8232static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8233{
8234 struct task_struct *task;
8235 struct cgroup_subsys_state *css;
8236
8237 cgroup_taskset_for_each(task, css, tset)
8238 sched_move_task(task);
8239}
8240
8241#ifdef CONFIG_FAIR_GROUP_SCHED
8242static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8243 struct cftype *cftype, u64 shareval)
8244{
8245 return sched_group_set_shares(css_tg(css), scale_load(shareval));
8246}
8247
8248static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8249 struct cftype *cft)
8250{
8251 struct task_group *tg = css_tg(css);
8252
8253 return (u64) scale_load_down(tg->shares);
8254}
8255
8256#ifdef CONFIG_CFS_BANDWIDTH
8257static DEFINE_MUTEX(cfs_constraints_mutex);
8258
8259const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
8260const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
8261
8262static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8263
8264static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8265{
8266 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8267 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8268
8269 if (tg == &root_task_group)
8270 return -EINVAL;
8271
8272
8273
8274
8275
8276
8277 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
8278 return -EINVAL;
8279
8280
8281
8282
8283
8284
8285 if (period > max_cfs_quota_period)
8286 return -EINVAL;
8287
8288
8289
8290
8291
8292 get_online_cpus();
8293 mutex_lock(&cfs_constraints_mutex);
8294 ret = __cfs_schedulable(tg, period, quota);
8295 if (ret)
8296 goto out_unlock;
8297
8298 runtime_enabled = quota != RUNTIME_INF;
8299 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
8300
8301
8302
8303
8304 if (runtime_enabled && !runtime_was_enabled)
8305 cfs_bandwidth_usage_inc();
8306 raw_spin_lock_irq(&cfs_b->lock);
8307 cfs_b->period = ns_to_ktime(period);
8308 cfs_b->quota = quota;
8309
8310 __refill_cfs_bandwidth_runtime(cfs_b);
8311
8312 if (runtime_enabled)
8313 start_cfs_bandwidth(cfs_b);
8314 raw_spin_unlock_irq(&cfs_b->lock);
8315
8316 for_each_online_cpu(i) {
8317 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
8318 struct rq *rq = cfs_rq->rq;
8319
8320 raw_spin_lock_irq(&rq->lock);
8321 cfs_rq->runtime_enabled = runtime_enabled;
8322 cfs_rq->runtime_remaining = 0;
8323
8324 if (cfs_rq->throttled)
8325 unthrottle_cfs_rq(cfs_rq);
8326 raw_spin_unlock_irq(&rq->lock);
8327 }
8328 if (runtime_was_enabled && !runtime_enabled)
8329 cfs_bandwidth_usage_dec();
8330out_unlock:
8331 mutex_unlock(&cfs_constraints_mutex);
8332 put_online_cpus();
8333
8334 return ret;
8335}
8336
8337int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8338{
8339 u64 quota, period;
8340
8341 period = ktime_to_ns(tg->cfs_bandwidth.period);
8342 if (cfs_quota_us < 0)
8343 quota = RUNTIME_INF;
8344 else
8345 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
8346
8347 return tg_set_cfs_bandwidth(tg, period, quota);
8348}
8349
8350long tg_get_cfs_quota(struct task_group *tg)
8351{
8352 u64 quota_us;
8353
8354 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
8355 return -1;
8356
8357 quota_us = tg->cfs_bandwidth.quota;
8358 do_div(quota_us, NSEC_PER_USEC);
8359
8360 return quota_us;
8361}
8362
8363int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8364{
8365 u64 quota, period;
8366
8367 period = (u64)cfs_period_us * NSEC_PER_USEC;
8368 quota = tg->cfs_bandwidth.quota;
8369
8370 return tg_set_cfs_bandwidth(tg, period, quota);
8371}
8372
8373long tg_get_cfs_period(struct task_group *tg)
8374{
8375 u64 cfs_period_us;
8376
8377 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
8378 do_div(cfs_period_us, NSEC_PER_USEC);
8379
8380 return cfs_period_us;
8381}
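
/*
 * Example: cpu.cfs_quota_us = 50000 with cpu.cfs_period_us = 100000 caps
 * the group at half a CPU's worth of bandwidth per period (the quota is
 * shared by all CPUs the group runs on); a negative quota maps to
 * RUNTIME_INF, i.e. no cap.
 */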
8382
8383static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
8384 struct cftype *cft)
8385{
8386 return tg_get_cfs_quota(css_tg(css));
8387}
8388
8389static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
8390 struct cftype *cftype, s64 cfs_quota_us)
8391{
8392 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
8393}
8394
8395static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
8396 struct cftype *cft)
8397{
8398 return tg_get_cfs_period(css_tg(css));
8399}
8400
8401static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
8402 struct cftype *cftype, u64 cfs_period_us)
8403{
8404 return tg_set_cfs_period(css_tg(css), cfs_period_us);
8405}
8406
8407struct cfs_schedulable_data {
8408 struct task_group *tg;
8409 u64 period, quota;
8410};
8411
8412
8413
8414
8415
8416static u64 normalize_cfs_quota(struct task_group *tg,
8417 struct cfs_schedulable_data *d)
8418{
8419 u64 quota, period;
8420
8421 if (tg == d->tg) {
8422 period = d->period;
8423 quota = d->quota;
8424 } else {
8425 period = tg_get_cfs_period(tg);
8426 quota = tg_get_cfs_quota(tg);
8427 }
8428
8429
8430 if (quota == RUNTIME_INF || quota == -1)
8431 return RUNTIME_INF;
8432
8433 return to_ratio(period, quota);
8434}
8435
8436static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8437{
8438 struct cfs_schedulable_data *d = data;
8439 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8440 s64 quota = 0, parent_quota = -1;
8441
8442 if (!tg->parent) {
8443 quota = RUNTIME_INF;
8444 } else {
8445 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8446
8447 quota = normalize_cfs_quota(tg, d);
8448 parent_quota = parent_b->hierarchical_quota;

		/*
		 * Ensure max(child_quota) <= parent_quota; inherit the
		 * parent's quota when no limit is set on this group.
		 */
8454 if (quota == RUNTIME_INF)
8455 quota = parent_quota;
8456 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8457 return -EINVAL;
8458 }
8459 cfs_b->hierarchical_quota = quota;
8460
8461 return 0;
8462}
8463
8464static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
8465{
8466 int ret;
8467 struct cfs_schedulable_data data = {
8468 .tg = tg,
8469 .period = period,
8470 .quota = quota,
8471 };
8472
8473 if (quota != RUNTIME_INF) {
8474 do_div(data.period, NSEC_PER_USEC);
8475 do_div(data.quota, NSEC_PER_USEC);
8476 }
8477
8478 rcu_read_lock();
8479 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
8480 rcu_read_unlock();
8481
8482 return ret;
8483}
8484
8485static int cpu_stats_show(struct seq_file *sf, void *v)
8486{
8487 struct task_group *tg = css_tg(seq_css(sf));
8488 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8489
8490 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
8491 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
8492 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
8493
8494 return 0;
8495}
8496#endif
8497#endif
8498
8499#ifdef CONFIG_RT_GROUP_SCHED
8500static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
8501 struct cftype *cft, s64 val)
8502{
8503 return sched_group_set_rt_runtime(css_tg(css), val);
8504}
8505
8506static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
8507 struct cftype *cft)
8508{
8509 return sched_group_rt_runtime(css_tg(css));
8510}
8511
8512static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
8513 struct cftype *cftype, u64 rt_period_us)
8514{
8515 return sched_group_set_rt_period(css_tg(css), rt_period_us);
8516}
8517
8518static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
8519 struct cftype *cft)
8520{
8521 return sched_group_rt_period(css_tg(css));
8522}
8523#endif
8524
8525static struct cftype cpu_files[] = {
8526#ifdef CONFIG_FAIR_GROUP_SCHED
8527 {
8528 .name = "shares",
8529 .read_u64 = cpu_shares_read_u64,
8530 .write_u64 = cpu_shares_write_u64,
8531 },
8532#endif
8533#ifdef CONFIG_CFS_BANDWIDTH
8534 {
8535 .name = "cfs_quota_us",
8536 .read_s64 = cpu_cfs_quota_read_s64,
8537 .write_s64 = cpu_cfs_quota_write_s64,
8538 },
8539 {
8540 .name = "cfs_period_us",
8541 .read_u64 = cpu_cfs_period_read_u64,
8542 .write_u64 = cpu_cfs_period_write_u64,
8543 },
8544 {
8545 .name = "stat",
8546 .seq_show = cpu_stats_show,
8547 },
8548#endif
8549#ifdef CONFIG_RT_GROUP_SCHED
8550 {
8551 .name = "rt_runtime_us",
8552 .read_s64 = cpu_rt_runtime_read,
8553 .write_s64 = cpu_rt_runtime_write,
8554 },
8555 {
8556 .name = "rt_period_us",
8557 .read_u64 = cpu_rt_period_read_uint,
8558 .write_u64 = cpu_rt_period_write_uint,
8559 },
8560#endif
8561 { }
8562};
8563
8564struct cgroup_subsys cpu_cgrp_subsys = {
8565 .css_alloc = cpu_cgroup_css_alloc,
8566 .css_released = cpu_cgroup_css_released,
8567 .css_free = cpu_cgroup_css_free,
8568 .fork = cpu_cgroup_fork,
8569 .can_attach = cpu_cgroup_can_attach,
8570 .attach = cpu_cgroup_attach,
8571 .legacy_cftypes = cpu_files,
8572 .early_init = true,
8573};
8574
8575#endif
8576
8577void dump_cpu_task(int cpu)
8578{
8579 pr_info("Task dump for CPU %d:\n", cpu);
8580 sched_show_task(cpu_curr(cpu));
8581}
8582
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * going up one level costs ~10% CPU usage and going down one level gains
 * ~10%. To achieve that we use a multiplier of 1.25: if one task goes up
 * by ~10% and another goes down by ~10%, the relative distance between
 * them is ~25%.
 */
8595const int sched_prio_to_weight[40] = {
8596 88761, 71755, 56483, 46273, 36291,
8597 29154, 23254, 18705, 14949, 11916,
8598 9548, 7620, 6100, 4904, 3906,
8599 3121, 2501, 1991, 1586, 1277,
8600 1024, 820, 655, 526, 423,
8601 335, 272, 215, 172, 137,
8602 110, 87, 70, 56, 45,
8603 36, 29, 23, 18, 15,
8604};
8605
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array,
 * pre-calculated. E.g. nice 0: 2^32/1024 = 4194304.
 *
 * In cases where the weight does not change often, the pre-calculated
 * inverse speeds up arithmetic by turning divisions into multiplications:
 */
8613const u32 sched_prio_to_wmult[40] = {
8614 48388, 59856, 76040, 92818, 118348,
8615 147320, 184698, 229616, 287308, 360437,
8616 449829, 563644, 704093, 875809, 1099582,
8617 1376151, 1717300, 2157191, 2708050, 3363326,
8618 4194304, 5237765, 6557202, 8165337, 10153587,
8619 12820798, 15790321, 19976592, 24970740, 31350126,
8620 39045157, 49367440, 61356676, 76695844, 95443717,
8621 119304647, 148102320, 186737708, 238609294, 286331153,
8622};
8623