/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls.
 */
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76#include <linux/compiler.h>
77
78#include <asm/switch_to.h>
79#include <asm/tlb.h>
80#include <asm/irq_regs.h>
81#include <asm/mutex.h>
82#ifdef CONFIG_PARAVIRT
83#include <asm/paravirt.h>
84#endif
85
86#include "sched.h"
87#include "../workqueue_internal.h"
88#include "../smpboot.h"
89
90#define CREATE_TRACE_POINTS
91#include <trace/events/sched.h>
92
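/*
 * start_bandwidth_timer - arm a bandwidth period timer if it is not running.
 *
 * The loop below forwards the timer past the current time in whole
 * periods and then (re)starts it pinned to this CPU, honouring the
 * soft/hard expiry range already programmed into the hrtimer.
 *
 * Illustrative caller (a sketch, not taken from this file), assuming a
 * hypothetical bandwidth struct 'bw' with an hrtimer and a ktime_t period:
 *
 *	start_bandwidth_timer(&bw->period_timer, bw->period);
 */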
93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
94{
95 unsigned long delta;
96 ktime_t soft, hard, now;
97
98 for (;;) {
99 if (hrtimer_active(period_timer))
100 break;
101
102 now = hrtimer_cb_get_time(period_timer);
103 hrtimer_forward(period_timer, now, period);
104
105 soft = hrtimer_get_softexpires(period_timer);
106 hard = hrtimer_get_expires(period_timer);
107 delta = ktime_to_ns(ktime_sub(hard, soft));
108 __hrtimer_start_range_ns(period_timer, soft, delta,
109 HRTIMER_MODE_ABS_PINNED, 0);
110 }
111}
112
113DEFINE_MUTEX(sched_domains_mutex);
114DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
115
116static void update_rq_clock_task(struct rq *rq, s64 delta);
117
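/*
 * update_rq_clock - advance rq->clock by the time that elapsed on this
 * CPU's sched_clock since the last update. A pending RQCF_ACT_SKIP
 * request leaves the clock untouched; otherwise the delta is also fed to
 * update_rq_clock_task() so irq and paravirt steal time can be removed
 * from rq->clock_task.
 */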
118void update_rq_clock(struct rq *rq)
119{
120 s64 delta;
121
122 lockdep_assert_held(&rq->lock);
123
124 if (rq->clock_skip_update & RQCF_ACT_SKIP)
125 return;
126
127 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
128 if (delta < 0)
129 return;
130 rq->clock += delta;
131 update_rq_clock_task(rq, delta);
132}
133
134
135
136
137
138#define SCHED_FEAT(name, enabled) \
139 (1UL << __SCHED_FEAT_##name) * enabled |
140
141const_debug unsigned int sysctl_sched_features =
142#include "features.h"
143 0;
144
145#undef SCHED_FEAT
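/*
 * A sketch of the expansion above, assuming features.h contains e.g.
 * SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true): each entry becomes
 * "(1UL << __SCHED_FEAT_GENTLE_FAIR_SLEEPERS) * true |", so including
 * features.h ORs together the bits of all default-enabled features and
 * the trailing 0 terminates the expression.
 */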
146
147#ifdef CONFIG_SCHED_DEBUG
148#define SCHED_FEAT(name, enabled) \
149 #name ,
150
151static const char * const sched_feat_names[] = {
152#include "features.h"
153};
154
155#undef SCHED_FEAT
156
157static int sched_feat_show(struct seq_file *m, void *v)
158{
159 int i;
160
161 for (i = 0; i < __SCHED_FEAT_NR; i++) {
162 if (!(sysctl_sched_features & (1UL << i)))
163 seq_puts(m, "NO_");
164 seq_printf(m, "%s ", sched_feat_names[i]);
165 }
166 seq_puts(m, "\n");
167
168 return 0;
169}
170
171#ifdef HAVE_JUMP_LABEL
172
173#define jump_label_key__true STATIC_KEY_INIT_TRUE
174#define jump_label_key__false STATIC_KEY_INIT_FALSE
175
176#define SCHED_FEAT(name, enabled) \
177 jump_label_key__##enabled ,
178
179struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
180#include "features.h"
181};
182
183#undef SCHED_FEAT
184
185static void sched_feat_disable(int i)
186{
187 if (static_key_enabled(&sched_feat_keys[i]))
188 static_key_slow_dec(&sched_feat_keys[i]);
189}
190
191static void sched_feat_enable(int i)
192{
193 if (!static_key_enabled(&sched_feat_keys[i]))
194 static_key_slow_inc(&sched_feat_keys[i]);
195}
196#else
197static void sched_feat_disable(int i) { };
198static void sched_feat_enable(int i) { };
199#endif
200
201static int sched_feat_set(char *cmp)
202{
203 int i;
204 int neg = 0;
205
206 if (strncmp(cmp, "NO_", 3) == 0) {
207 neg = 1;
208 cmp += 3;
209 }
210
211 for (i = 0; i < __SCHED_FEAT_NR; i++) {
212 if (strcmp(cmp, sched_feat_names[i]) == 0) {
213 if (neg) {
214 sysctl_sched_features &= ~(1UL << i);
215 sched_feat_disable(i);
216 } else {
217 sysctl_sched_features |= (1UL << i);
218 sched_feat_enable(i);
219 }
220 break;
221 }
222 }
223
224 return i;
225}
226
227static ssize_t
228sched_feat_write(struct file *filp, const char __user *ubuf,
229 size_t cnt, loff_t *ppos)
230{
231 char buf[64];
232 char *cmp;
233 int i;
234 struct inode *inode;
235
236 if (cnt > 63)
237 cnt = 63;
238
239 if (copy_from_user(&buf, ubuf, cnt))
240 return -EFAULT;
241
242 buf[cnt] = 0;
243 cmp = strstrip(buf);
244
245
246 inode = file_inode(filp);
247 mutex_lock(&inode->i_mutex);
248 i = sched_feat_set(cmp);
249 mutex_unlock(&inode->i_mutex);
250 if (i == __SCHED_FEAT_NR)
251 return -EINVAL;
252
253 *ppos += cnt;
254
255 return cnt;
256}
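/*
 * Usage sketch, assuming debugfs is mounted at /sys/kernel/debug:
 *
 *	# cat /sys/kernel/debug/sched_features
 *	# echo NO_TTWU_QUEUE > /sys/kernel/debug/sched_features
 *
 * Writing a bare feature name sets it, a "NO_" prefix clears it, and an
 * unknown name makes the write fail with -EINVAL (see sched_feat_set()).
 */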
257
258static int sched_feat_open(struct inode *inode, struct file *filp)
259{
260 return single_open(filp, sched_feat_show, NULL);
261}
262
263static const struct file_operations sched_feat_fops = {
264 .open = sched_feat_open,
265 .write = sched_feat_write,
266 .read = seq_read,
267 .llseek = seq_lseek,
268 .release = single_release,
269};
270
271static __init int sched_init_debug(void)
272{
273 debugfs_create_file("sched_features", 0644, NULL, NULL,
274 &sched_feat_fops);
275
276 return 0;
277}
278late_initcall(sched_init_debug);
279#endif
280
281
282
283
284
285const_debug unsigned int sysctl_sched_nr_migrate = 32;
286
287
288
289
290
291
292
293const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
294
295
296
297
298
299unsigned int sysctl_sched_rt_period = 1000000;
300
301__read_mostly int scheduler_running;
302
303
304
305
306
307int sysctl_sched_rt_runtime = 950000;
308
309
310cpumask_var_t cpu_isolated_map;
311
312
313
314
315static struct rq *this_rq_lock(void)
316 __acquires(rq->lock)
317{
318 struct rq *rq;
319
320 local_irq_disable();
321 rq = this_rq();
322 raw_spin_lock(&rq->lock);
323
324 return rq;
325}
326
327#ifdef CONFIG_SCHED_HRTICK
328
329
330
331
332static void hrtick_clear(struct rq *rq)
333{
334 if (hrtimer_active(&rq->hrtick_timer))
335 hrtimer_cancel(&rq->hrtick_timer);
336}
337
338
339
340
341
342static enum hrtimer_restart hrtick(struct hrtimer *timer)
343{
344 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
345
346 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
347
348 raw_spin_lock(&rq->lock);
349 update_rq_clock(rq);
350 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
351 raw_spin_unlock(&rq->lock);
352
353 return HRTIMER_NORESTART;
354}
355
356#ifdef CONFIG_SMP
357
358static int __hrtick_restart(struct rq *rq)
359{
360 struct hrtimer *timer = &rq->hrtick_timer;
361 ktime_t time = hrtimer_get_softexpires(timer);
362
363 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
364}
365
366
367
368
369static void __hrtick_start(void *arg)
370{
371 struct rq *rq = arg;
372
373 raw_spin_lock(&rq->lock);
374 __hrtick_restart(rq);
375 rq->hrtick_csd_pending = 0;
376 raw_spin_unlock(&rq->lock);
377}
378
379
380
381
382
383
384void hrtick_start(struct rq *rq, u64 delay)
385{
386 struct hrtimer *timer = &rq->hrtick_timer;
387 ktime_t time;
388 s64 delta;
389
390
391
392
393
394 delta = max_t(s64, delay, 10000LL);
395 time = ktime_add_ns(timer->base->get_time(), delta);
396
397 hrtimer_set_expires(timer, time);
398
399 if (rq == this_rq()) {
400 __hrtick_restart(rq);
401 } else if (!rq->hrtick_csd_pending) {
402 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
403 rq->hrtick_csd_pending = 1;
404 }
405}
406
407static int
408hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
409{
410 int cpu = (int)(long)hcpu;
411
412 switch (action) {
413 case CPU_UP_CANCELED:
414 case CPU_UP_CANCELED_FROZEN:
415 case CPU_DOWN_PREPARE:
416 case CPU_DOWN_PREPARE_FROZEN:
417 case CPU_DEAD:
418 case CPU_DEAD_FROZEN:
419 hrtick_clear(cpu_rq(cpu));
420 return NOTIFY_OK;
421 }
422
423 return NOTIFY_DONE;
424}
425
426static __init void init_hrtick(void)
427{
428 hotcpu_notifier(hotplug_hrtick, 0);
429}
430#else
431
432
433
434
435
436void hrtick_start(struct rq *rq, u64 delay)
437{
438
439
440
441
442 delay = max_t(u64, delay, 10000LL);
443 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
444 HRTIMER_MODE_REL_PINNED, 0);
445}
446
447static inline void init_hrtick(void)
448{
449}
450#endif
451
452static void init_rq_hrtick(struct rq *rq)
453{
454#ifdef CONFIG_SMP
455 rq->hrtick_csd_pending = 0;
456
457 rq->hrtick_csd.flags = 0;
458 rq->hrtick_csd.func = __hrtick_start;
459 rq->hrtick_csd.info = rq;
460#endif
461
462 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
463 rq->hrtick_timer.function = hrtick;
464}
465#else
466static inline void hrtick_clear(struct rq *rq)
467{
468}
469
470static inline void init_rq_hrtick(struct rq *rq)
471{
472}
473
474static inline void init_hrtick(void)
475{
476}
477#endif
478
479
480
481
482#define fetch_or(ptr, val) \
483({ typeof(*(ptr)) __old, __val = *(ptr); \
484 for (;;) { \
485 __old = cmpxchg((ptr), __val, __val | (val)); \
486 if (__old == __val) \
487 break; \
488 __val = __old; \
489 } \
490 __old; \
491})
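/*
 * Example of the semantics (a sketch): with flags == 0x1,
 *
 *	old = fetch_or(&flags, 0x4);
 *
 * atomically turns flags into 0x5 and returns the previous value 0x1,
 * i.e. an atomic fetch-and-or built from a cmpxchg() retry loop.
 */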
492
493#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
494
495
496
497
498
499static bool set_nr_and_not_polling(struct task_struct *p)
500{
501 struct thread_info *ti = task_thread_info(p);
502 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
503}
504
505
506
507
508
509
510
511static bool set_nr_if_polling(struct task_struct *p)
512{
513 struct thread_info *ti = task_thread_info(p);
514 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
515
516 for (;;) {
517 if (!(val & _TIF_POLLING_NRFLAG))
518 return false;
519 if (val & _TIF_NEED_RESCHED)
520 return true;
521 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
522 if (old == val)
523 break;
524 val = old;
525 }
526 return true;
527}
528
529#else
530static bool set_nr_and_not_polling(struct task_struct *p)
531{
532 set_tsk_need_resched(p);
533 return true;
534}
535
536#ifdef CONFIG_SMP
537static bool set_nr_if_polling(struct task_struct *p)
538{
539 return false;
540}
541#endif
542#endif
543
544
545
546
547
548
549
550
551void resched_curr(struct rq *rq)
552{
553 struct task_struct *curr = rq->curr;
554 int cpu;
555
556 lockdep_assert_held(&rq->lock);
557
558 if (test_tsk_need_resched(curr))
559 return;
560
561 cpu = cpu_of(rq);
562
563 if (cpu == smp_processor_id()) {
564 set_tsk_need_resched(curr);
565 set_preempt_need_resched();
566 return;
567 }
568
569 if (set_nr_and_not_polling(curr))
570 smp_send_reschedule(cpu);
571 else
572 trace_sched_wake_idle_without_ipi(cpu);
573}
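/*
 * Note on resched_curr(): on the local CPU this reduces to setting
 * TIF_NEED_RESCHED and folding it into the preempt count; for a remote
 * CPU an IPI is only sent when the target is not already polling
 * TIF_NEED_RESCHED in its idle loop (set_nr_and_not_polling()), in which
 * case only the wake_idle_without_ipi tracepoint fires.
 */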
574
575void resched_cpu(int cpu)
576{
577 struct rq *rq = cpu_rq(cpu);
578 unsigned long flags;
579
580 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
581 return;
582 resched_curr(rq);
583 raw_spin_unlock_irqrestore(&rq->lock, flags);
584}
585
586#ifdef CONFIG_SMP
587#ifdef CONFIG_NO_HZ_COMMON
588
589
590
591
592
593
594
595
596int get_nohz_timer_target(int pinned)
597{
598 int cpu = smp_processor_id();
599 int i;
600 struct sched_domain *sd;
601
602 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
603 return cpu;
604
605 rcu_read_lock();
606 for_each_domain(cpu, sd) {
607 for_each_cpu(i, sched_domain_span(sd)) {
608 if (!idle_cpu(i)) {
609 cpu = i;
610 goto unlock;
611 }
612 }
613 }
614unlock:
615 rcu_read_unlock();
616 return cpu;
617}
618
619
620
621
622
623
624
625
626
627
628static void wake_up_idle_cpu(int cpu)
629{
630 struct rq *rq = cpu_rq(cpu);
631
632 if (cpu == smp_processor_id())
633 return;
634
635 if (set_nr_and_not_polling(rq->idle))
636 smp_send_reschedule(cpu);
637 else
638 trace_sched_wake_idle_without_ipi(cpu);
639}
640
641static bool wake_up_full_nohz_cpu(int cpu)
642{
643
644
645
646
647
648
649 if (tick_nohz_full_cpu(cpu)) {
650 if (cpu != smp_processor_id() ||
651 tick_nohz_tick_stopped())
652 tick_nohz_full_kick_cpu(cpu);
653 return true;
654 }
655
656 return false;
657}
658
659void wake_up_nohz_cpu(int cpu)
660{
661 if (!wake_up_full_nohz_cpu(cpu))
662 wake_up_idle_cpu(cpu);
663}
664
665static inline bool got_nohz_idle_kick(void)
666{
667 int cpu = smp_processor_id();
668
669 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
670 return false;
671
672 if (idle_cpu(cpu) && !need_resched())
673 return true;
674
675
676
677
678
679 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
680 return false;
681}
682
683#else
684
685static inline bool got_nohz_idle_kick(void)
686{
687 return false;
688}
689
690#endif
691
692#ifdef CONFIG_NO_HZ_FULL
693bool sched_can_stop_tick(void)
694{
695
696
697
698
699 if (current->policy == SCHED_FIFO)
700 return true;
701
702
703
704
705
706 if (current->policy == SCHED_RR) {
		struct sched_rt_entity *rt_se = &current->rt;
708
709 return rt_se->run_list.prev == rt_se->run_list.next;
710 }
711
712
713
714
715
716
717 if (this_rq()->nr_running > 1)
718 return false;
719
720 return true;
721}
722#endif
723
724void sched_avg_update(struct rq *rq)
725{
726 s64 period = sched_avg_period();
727
728 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
729
730
731
732
733
734 asm("" : "+rm" (rq->age_stamp));
735 rq->age_stamp += period;
736 rq->rt_avg /= 2;
737 }
738}
739
740#endif
741
742#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
743 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
744
745
746
747
748
749
750int walk_tg_tree_from(struct task_group *from,
751 tg_visitor down, tg_visitor up, void *data)
752{
753 struct task_group *parent, *child;
754 int ret;
755
756 parent = from;
757
758down:
759 ret = (*down)(parent, data);
760 if (ret)
761 goto out;
762 list_for_each_entry_rcu(child, &parent->children, siblings) {
763 parent = child;
764 goto down;
765
766up:
767 continue;
768 }
769 ret = (*up)(parent, data);
770 if (ret || parent == from)
771 goto out;
772
773 child = parent;
774 parent = parent->parent;
775 if (parent)
776 goto up;
777out:
778 return ret;
779}
780
781int tg_nop(struct task_group *tg, void *data)
782{
783 return 0;
784}
785#endif
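/*
 * Usage sketch with a hypothetical visitor: counting the task groups in
 * a subtree could use a "down" callback plus tg_nop() for "up":
 *
 *	static int count_tg(struct task_group *tg, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int n = 0;
 *	rcu_read_lock();
 *	walk_tg_tree_from(&root_task_group, count_tg, tg_nop, &n);
 *	rcu_read_unlock();
 */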
786
787static void set_load_weight(struct task_struct *p)
788{
789 int prio = p->static_prio - MAX_RT_PRIO;
790 struct load_weight *load = &p->se.load;
791
792
793
794
795 if (p->policy == SCHED_IDLE) {
796 load->weight = scale_load(WEIGHT_IDLEPRIO);
797 load->inv_weight = WMULT_IDLEPRIO;
798 return;
799 }
800
801 load->weight = scale_load(prio_to_weight[prio]);
802 load->inv_weight = prio_to_wmult[prio];
803}
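/*
 * For reference, prio_to_weight[] maps nice 0 to a weight of 1024 and
 * scales by roughly 1.25 per nice level (e.g. nice -1 -> 1277,
 * nice 1 -> 820), while SCHED_IDLE tasks get the tiny WEIGHT_IDLEPRIO
 * weight regardless of their nice value.
 */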
804
805static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
806{
807 update_rq_clock(rq);
808 sched_info_queued(rq, p);
809 p->sched_class->enqueue_task(rq, p, flags);
810}
811
812static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
813{
814 update_rq_clock(rq);
815 sched_info_dequeued(rq, p);
816 p->sched_class->dequeue_task(rq, p, flags);
817}
818
819void activate_task(struct rq *rq, struct task_struct *p, int flags)
820{
821 if (task_contributes_to_load(p))
822 rq->nr_uninterruptible--;
823
824 enqueue_task(rq, p, flags);
825}
826
827void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
828{
829 if (task_contributes_to_load(p))
830 rq->nr_uninterruptible++;
831
832 dequeue_task(rq, p, flags);
833}
834
835static void update_rq_clock_task(struct rq *rq, s64 delta)
836{
837
838
839
840
841#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
842 s64 steal = 0, irq_delta = 0;
843#endif
844#ifdef CONFIG_IRQ_TIME_ACCOUNTING
845 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862 if (irq_delta > delta)
863 irq_delta = delta;
864
865 rq->prev_irq_time += irq_delta;
866 delta -= irq_delta;
867#endif
868#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
870 steal = paravirt_steal_clock(cpu_of(rq));
871 steal -= rq->prev_steal_time_rq;
872
873 if (unlikely(steal > delta))
874 steal = delta;
875
876 rq->prev_steal_time_rq += steal;
877 delta -= steal;
878 }
879#endif
880
881 rq->clock_task += delta;
882
883#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
884 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
885 sched_rt_avg_update(rq, irq_delta + steal);
886#endif
887}
888
889void sched_set_stop_task(int cpu, struct task_struct *stop)
890{
891 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
892 struct task_struct *old_stop = cpu_rq(cpu)->stop;
893
894 if (stop) {
895
896
897
898
899
900
901
902
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
904
905 stop->sched_class = &stop_sched_class;
906 }
907
908 cpu_rq(cpu)->stop = stop;
909
910 if (old_stop) {
911
912
913
914
915 old_stop->sched_class = &rt_sched_class;
916 }
917}
918
919
920
921
922static inline int __normal_prio(struct task_struct *p)
923{
924 return p->static_prio;
925}
926
927
928
929
930
931
932
933
934static inline int normal_prio(struct task_struct *p)
935{
936 int prio;
937
938 if (task_has_dl_policy(p))
939 prio = MAX_DL_PRIO-1;
940 else if (task_has_rt_policy(p))
941 prio = MAX_RT_PRIO-1 - p->rt_priority;
942 else
943 prio = __normal_prio(p);
944 return prio;
945}
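/*
 * Worked example of the resulting prio values, assuming the usual
 * MAX_DL_PRIO == 0 and MAX_RT_PRIO == 100:
 *
 *	SCHED_DEADLINE task:			prio = -1
 *	SCHED_FIFO/RR, rt_priority 99:		prio = 0
 *	SCHED_FIFO/RR, rt_priority 1:		prio = 98
 *	SCHED_NORMAL/BATCH, nice 0:		prio = static_prio = 120
 */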
946
947
948
949
950
951
952
953
954static int effective_prio(struct task_struct *p)
955{
956 p->normal_prio = normal_prio(p);
957
958
959
960
961
962 if (!rt_prio(p->prio))
963 return p->normal_prio;
964 return p->prio;
965}
966
967
968
969
970
971
972
973inline int task_curr(const struct task_struct *p)
974{
975 return cpu_curr(task_cpu(p)) == p;
976}
977
978
979
980
981static inline void check_class_changed(struct rq *rq, struct task_struct *p,
982 const struct sched_class *prev_class,
983 int oldprio)
984{
985 if (prev_class != p->sched_class) {
986 if (prev_class->switched_from)
987 prev_class->switched_from(rq, p);
988
989 p->sched_class->switched_to(rq, p);
990 } else if (oldprio != p->prio || dl_task(p))
991 p->sched_class->prio_changed(rq, p, oldprio);
992}
993
994void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
995{
996 const struct sched_class *class;
997
998 if (p->sched_class == rq->curr->sched_class) {
999 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1000 } else {
1001 for_each_class(class) {
1002 if (class == rq->curr->sched_class)
1003 break;
1004 if (class == p->sched_class) {
1005 resched_curr(rq);
1006 break;
1007 }
1008 }
1009 }
1010
1011
1012
1013
1014
1015 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1016 rq_clock_skip_update(rq, true);
1017}
1018
1019#ifdef CONFIG_SMP
1020void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1021{
1022#ifdef CONFIG_SCHED_DEBUG
1023
1024
1025
1026
1027 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1028 !p->on_rq);
1029
1030#ifdef CONFIG_LOCKDEP
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1042 lockdep_is_held(&task_rq(p)->lock)));
1043#endif
1044#endif
1045
1046 trace_sched_migrate_task(p, new_cpu);
1047
1048 if (task_cpu(p) != new_cpu) {
1049 if (p->sched_class->migrate_task_rq)
1050 p->sched_class->migrate_task_rq(p, new_cpu);
1051 p->se.nr_migrations++;
1052 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1053 }
1054
1055 __set_task_cpu(p, new_cpu);
1056}
1057
1058static void __migrate_swap_task(struct task_struct *p, int cpu)
1059{
1060 if (task_on_rq_queued(p)) {
1061 struct rq *src_rq, *dst_rq;
1062
1063 src_rq = task_rq(p);
1064 dst_rq = cpu_rq(cpu);
1065
1066 deactivate_task(src_rq, p, 0);
1067 set_task_cpu(p, cpu);
1068 activate_task(dst_rq, p, 0);
1069 check_preempt_curr(dst_rq, p, 0);
1070 } else {
1071
1072
1073
1074
1075
1076 p->wake_cpu = cpu;
1077 }
1078}
1079
1080struct migration_swap_arg {
1081 struct task_struct *src_task, *dst_task;
1082 int src_cpu, dst_cpu;
1083};
1084
1085static int migrate_swap_stop(void *data)
1086{
1087 struct migration_swap_arg *arg = data;
1088 struct rq *src_rq, *dst_rq;
1089 int ret = -EAGAIN;
1090
1091 src_rq = cpu_rq(arg->src_cpu);
1092 dst_rq = cpu_rq(arg->dst_cpu);
1093
1094 double_raw_lock(&arg->src_task->pi_lock,
1095 &arg->dst_task->pi_lock);
1096 double_rq_lock(src_rq, dst_rq);
1097 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1098 goto unlock;
1099
1100 if (task_cpu(arg->src_task) != arg->src_cpu)
1101 goto unlock;
1102
1103 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1104 goto unlock;
1105
1106 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1107 goto unlock;
1108
1109 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1110 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1111
1112 ret = 0;
1113
1114unlock:
1115 double_rq_unlock(src_rq, dst_rq);
1116 raw_spin_unlock(&arg->dst_task->pi_lock);
1117 raw_spin_unlock(&arg->src_task->pi_lock);
1118
1119 return ret;
1120}
1121
1122
1123
1124
1125int migrate_swap(struct task_struct *cur, struct task_struct *p)
1126{
1127 struct migration_swap_arg arg;
1128 int ret = -EINVAL;
1129
1130 arg = (struct migration_swap_arg){
1131 .src_task = cur,
1132 .src_cpu = task_cpu(cur),
1133 .dst_task = p,
1134 .dst_cpu = task_cpu(p),
1135 };
1136
1137 if (arg.src_cpu == arg.dst_cpu)
1138 goto out;
1139
1140
1141
1142
1143
1144 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1145 goto out;
1146
1147 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1148 goto out;
1149
1150 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1151 goto out;
1152
1153 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1154 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1155
1156out:
1157 return ret;
1158}
1159
1160struct migration_arg {
1161 struct task_struct *task;
1162 int dest_cpu;
1163};
1164
1165static int migration_cpu_stop(void *data);
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1184{
1185 unsigned long flags;
1186 int running, queued;
1187 unsigned long ncsw;
1188 struct rq *rq;
1189
1190 for (;;) {
1191
1192
1193
1194
1195
1196
1197 rq = task_rq(p);
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210 while (task_running(rq, p)) {
1211 if (match_state && unlikely(p->state != match_state))
1212 return 0;
1213 cpu_relax();
1214 }
1215
1216
1217
1218
1219
1220
1221 rq = task_rq_lock(p, &flags);
1222 trace_sched_wait_task(p);
1223 running = task_running(rq, p);
1224 queued = task_on_rq_queued(p);
1225 ncsw = 0;
1226 if (!match_state || p->state == match_state)
1227 ncsw = p->nvcsw | LONG_MIN;
1228 task_rq_unlock(rq, p, &flags);
1229
1230
1231
1232
1233 if (unlikely(!ncsw))
1234 break;
1235
1236
1237
1238
1239
1240
1241
1242 if (unlikely(running)) {
1243 cpu_relax();
1244 continue;
1245 }
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256 if (unlikely(queued)) {
1257 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1258
1259 set_current_state(TASK_UNINTERRUPTIBLE);
1260 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1261 continue;
1262 }
1263
1264
1265
1266
1267
1268
1269 break;
1270 }
1271
1272 return ncsw;
1273}
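/*
 * Note on the return value: success is reported as p->nvcsw with
 * LONG_MIN OR-ed in, which is guaranteed to be non-zero even when the
 * voluntary switch count itself is 0. A caller that gets the same
 * non-zero value from two calls therefore knows the task was not
 * scheduled in between.
 */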
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288void kick_process(struct task_struct *p)
1289{
1290 int cpu;
1291
1292 preempt_disable();
1293 cpu = task_cpu(p);
1294 if ((cpu != smp_processor_id()) && task_curr(p))
1295 smp_send_reschedule(cpu);
1296 preempt_enable();
1297}
1298EXPORT_SYMBOL_GPL(kick_process);
1299#endif
1300
1301#ifdef CONFIG_SMP
1302
1303
1304
1305static int select_fallback_rq(int cpu, struct task_struct *p)
1306{
1307 int nid = cpu_to_node(cpu);
1308 const struct cpumask *nodemask = NULL;
1309 enum { cpuset, possible, fail } state = cpuset;
1310 int dest_cpu;
1311
1312
1313
1314
1315
1316
1317 if (nid != -1) {
1318 nodemask = cpumask_of_node(nid);
1319
1320
1321 for_each_cpu(dest_cpu, nodemask) {
1322 if (!cpu_online(dest_cpu))
1323 continue;
1324 if (!cpu_active(dest_cpu))
1325 continue;
1326 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1327 return dest_cpu;
1328 }
1329 }
1330
1331 for (;;) {
1332
1333 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1334 if (!cpu_online(dest_cpu))
1335 continue;
1336 if (!cpu_active(dest_cpu))
1337 continue;
1338 goto out;
1339 }
1340
1341 switch (state) {
1342 case cpuset:
1343
1344 cpuset_cpus_allowed_fallback(p);
1345 state = possible;
1346 break;
1347
1348 case possible:
1349 do_set_cpus_allowed(p, cpu_possible_mask);
1350 state = fail;
1351 break;
1352
1353 case fail:
1354 BUG();
1355 break;
1356 }
1357 }
1358
1359out:
1360 if (state != cpuset) {
1361
1362
1363
1364
1365
1366 if (p->mm && printk_ratelimit()) {
1367 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1368 task_pid_nr(p), p->comm, cpu);
1369 }
1370 }
1371
1372 return dest_cpu;
1373}
1374
1375
1376
1377
1378static inline
1379int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1380{
1381 if (p->nr_cpus_allowed > 1)
1382 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1395 !cpu_online(cpu)))
1396 cpu = select_fallback_rq(task_cpu(p), p);
1397
1398 return cpu;
1399}
1400
1401static void update_avg(u64 *avg, u64 sample)
1402{
1403 s64 diff = sample - *avg;
1404 *avg += diff >> 3;
1405}
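/*
 * update_avg() keeps a simple exponential moving average:
 * avg += (sample - avg) / 8. E.g. with *avg == 1000 and sample == 1800
 * the new value is 1100, so one eighth of the error is folded in per
 * sample (used below for rq->avg_idle).
 */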
1406#endif
1407
1408static void
1409ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1410{
1411#ifdef CONFIG_SCHEDSTATS
1412 struct rq *rq = this_rq();
1413
1414#ifdef CONFIG_SMP
1415 int this_cpu = smp_processor_id();
1416
1417 if (cpu == this_cpu) {
1418 schedstat_inc(rq, ttwu_local);
1419 schedstat_inc(p, se.statistics.nr_wakeups_local);
1420 } else {
1421 struct sched_domain *sd;
1422
1423 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1424 rcu_read_lock();
1425 for_each_domain(this_cpu, sd) {
1426 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1427 schedstat_inc(sd, ttwu_wake_remote);
1428 break;
1429 }
1430 }
1431 rcu_read_unlock();
1432 }
1433
1434 if (wake_flags & WF_MIGRATED)
1435 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1436
1437#endif
1438
1439 schedstat_inc(rq, ttwu_count);
1440 schedstat_inc(p, se.statistics.nr_wakeups);
1441
1442 if (wake_flags & WF_SYNC)
1443 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1444
1445#endif
1446}
1447
1448static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1449{
1450 activate_task(rq, p, en_flags);
1451 p->on_rq = TASK_ON_RQ_QUEUED;
1452
1453
1454 if (p->flags & PF_WQ_WORKER)
1455 wq_worker_waking_up(p, cpu_of(rq));
1456}
1457
1458
1459
1460
1461static void
1462ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1463{
1464 check_preempt_curr(rq, p, wake_flags);
1465 trace_sched_wakeup(p, true);
1466
1467 p->state = TASK_RUNNING;
1468#ifdef CONFIG_SMP
1469 if (p->sched_class->task_woken)
1470 p->sched_class->task_woken(rq, p);
1471
1472 if (rq->idle_stamp) {
1473 u64 delta = rq_clock(rq) - rq->idle_stamp;
1474 u64 max = 2*rq->max_idle_balance_cost;
1475
1476 update_avg(&rq->avg_idle, delta);
1477
1478 if (rq->avg_idle > max)
1479 rq->avg_idle = max;
1480
1481 rq->idle_stamp = 0;
1482 }
1483#endif
1484}
1485
1486static void
1487ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1488{
1489#ifdef CONFIG_SMP
1490 if (p->sched_contributes_to_load)
1491 rq->nr_uninterruptible--;
1492#endif
1493
1494 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1495 ttwu_do_wakeup(rq, p, wake_flags);
1496}
1497
1498
1499
1500
1501
1502
1503
1504static int ttwu_remote(struct task_struct *p, int wake_flags)
1505{
1506 struct rq *rq;
1507 int ret = 0;
1508
1509 rq = __task_rq_lock(p);
1510 if (task_on_rq_queued(p)) {
1511
1512 update_rq_clock(rq);
1513 ttwu_do_wakeup(rq, p, wake_flags);
1514 ret = 1;
1515 }
1516 __task_rq_unlock(rq);
1517
1518 return ret;
1519}
1520
1521#ifdef CONFIG_SMP
1522void sched_ttwu_pending(void)
1523{
1524 struct rq *rq = this_rq();
1525 struct llist_node *llist = llist_del_all(&rq->wake_list);
1526 struct task_struct *p;
1527 unsigned long flags;
1528
1529 if (!llist)
1530 return;
1531
1532 raw_spin_lock_irqsave(&rq->lock, flags);
1533
1534 while (llist) {
1535 p = llist_entry(llist, struct task_struct, wake_entry);
1536 llist = llist_next(llist);
1537 ttwu_do_activate(rq, p, 0);
1538 }
1539
1540 raw_spin_unlock_irqrestore(&rq->lock, flags);
1541}
1542
1543void scheduler_ipi(void)
1544{
1545
1546
1547
1548
1549
1550 preempt_fold_need_resched();
1551
1552 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1553 return;
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568 irq_enter();
1569 sched_ttwu_pending();
1570
1571
1572
1573
1574 if (unlikely(got_nohz_idle_kick())) {
1575 this_rq()->idle_balance = 1;
1576 raise_softirq_irqoff(SCHED_SOFTIRQ);
1577 }
1578 irq_exit();
1579}
1580
1581static void ttwu_queue_remote(struct task_struct *p, int cpu)
1582{
1583 struct rq *rq = cpu_rq(cpu);
1584
1585 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1586 if (!set_nr_if_polling(rq->idle))
1587 smp_send_reschedule(cpu);
1588 else
1589 trace_sched_wake_idle_without_ipi(cpu);
1590 }
1591}
1592
1593void wake_up_if_idle(int cpu)
1594{
1595 struct rq *rq = cpu_rq(cpu);
1596 unsigned long flags;
1597
1598 rcu_read_lock();
1599
1600 if (!is_idle_task(rcu_dereference(rq->curr)))
1601 goto out;
1602
1603 if (set_nr_if_polling(rq->idle)) {
1604 trace_sched_wake_idle_without_ipi(cpu);
1605 } else {
1606 raw_spin_lock_irqsave(&rq->lock, flags);
1607 if (is_idle_task(rq->curr))
1608 smp_send_reschedule(cpu);
1609
1610 raw_spin_unlock_irqrestore(&rq->lock, flags);
1611 }
1612
1613out:
1614 rcu_read_unlock();
1615}
1616
1617bool cpus_share_cache(int this_cpu, int that_cpu)
1618{
1619 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1620}
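/*
 * cpus_share_cache() compares the ids of the highest sched domains that
 * share a last-level cache (sd_llc_id). ttwu_queue() below uses it to
 * decide whether a remote wakeup should be queued on the target's
 * wake_list and signalled by IPI instead of taking the remote rq->lock
 * directly.
 */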
1621#endif
1622
1623static void ttwu_queue(struct task_struct *p, int cpu)
1624{
1625 struct rq *rq = cpu_rq(cpu);
1626
1627#if defined(CONFIG_SMP)
1628 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1629 sched_clock_cpu(cpu);
1630 ttwu_queue_remote(p, cpu);
1631 return;
1632 }
1633#endif
1634
1635 raw_spin_lock(&rq->lock);
1636 ttwu_do_activate(rq, p, 0);
1637 raw_spin_unlock(&rq->lock);
1638}
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655static int
1656try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1657{
1658 unsigned long flags;
1659 int cpu, success = 0;
1660
1661
1662
1663
1664
1665
1666
1667 smp_mb__before_spinlock();
1668 raw_spin_lock_irqsave(&p->pi_lock, flags);
1669 if (!(p->state & state))
1670 goto out;
1671
1672 success = 1;
1673 cpu = task_cpu(p);
1674
1675 if (p->on_rq && ttwu_remote(p, wake_flags))
1676 goto stat;
1677
1678#ifdef CONFIG_SMP
1679
1680
1681
1682
1683 while (p->on_cpu)
1684 cpu_relax();
1685
1686
1687
1688 smp_rmb();
1689
1690 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1691 p->state = TASK_WAKING;
1692
1693 if (p->sched_class->task_waking)
1694 p->sched_class->task_waking(p);
1695
1696 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1697 if (task_cpu(p) != cpu) {
1698 wake_flags |= WF_MIGRATED;
1699 set_task_cpu(p, cpu);
1700 }
1701#endif
1702
1703 ttwu_queue(p, cpu);
1704stat:
1705 ttwu_stat(p, cpu, wake_flags);
1706out:
1707 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1708
1709 return success;
1710}
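/*
 * try_to_wake_up() in short: under p->pi_lock, bail out unless p->state
 * matches @state; otherwise either do the cheap on-runqueue wakeup via
 * ttwu_remote(), or wait for the task to finish being scheduled out,
 * pick a CPU with select_task_rq() and queue the activation through
 * ttwu_queue(). Returns 1 if the task was woken, 0 if it was not in the
 * requested state (e.g. already running).
 */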
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720static void try_to_wake_up_local(struct task_struct *p)
1721{
1722 struct rq *rq = task_rq(p);
1723
1724 if (WARN_ON_ONCE(rq != this_rq()) ||
1725 WARN_ON_ONCE(p == current))
1726 return;
1727
1728 lockdep_assert_held(&rq->lock);
1729
1730 if (!raw_spin_trylock(&p->pi_lock)) {
1731 raw_spin_unlock(&rq->lock);
1732 raw_spin_lock(&p->pi_lock);
1733 raw_spin_lock(&rq->lock);
1734 }
1735
1736 if (!(p->state & TASK_NORMAL))
1737 goto out;
1738
1739 if (!task_on_rq_queued(p))
1740 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1741
1742 ttwu_do_wakeup(rq, p, 0);
1743 ttwu_stat(p, smp_processor_id(), 0);
1744out:
1745 raw_spin_unlock(&p->pi_lock);
1746}
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760int wake_up_process(struct task_struct *p)
1761{
1762 WARN_ON(task_is_stopped_or_traced(p));
1763 return try_to_wake_up(p, TASK_NORMAL, 0);
1764}
1765EXPORT_SYMBOL(wake_up_process);
1766
1767int wake_up_state(struct task_struct *p, unsigned int state)
1768{
1769 return try_to_wake_up(p, state, 0);
1770}
1771
1772
1773
1774
1775void __dl_clear_params(struct task_struct *p)
1776{
1777 struct sched_dl_entity *dl_se = &p->dl;
1778
1779 dl_se->dl_runtime = 0;
1780 dl_se->dl_deadline = 0;
1781 dl_se->dl_period = 0;
1782 dl_se->flags = 0;
1783 dl_se->dl_bw = 0;
1784
1785 dl_se->dl_throttled = 0;
1786 dl_se->dl_new = 1;
1787 dl_se->dl_yielded = 0;
1788}
1789
1790
1791
1792
1793
1794
1795
1796static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1797{
1798 p->on_rq = 0;
1799
1800 p->se.on_rq = 0;
1801 p->se.exec_start = 0;
1802 p->se.sum_exec_runtime = 0;
1803 p->se.prev_sum_exec_runtime = 0;
1804 p->se.nr_migrations = 0;
1805 p->se.vruntime = 0;
1806#ifdef CONFIG_SMP
1807 p->se.avg.decay_count = 0;
1808#endif
1809 INIT_LIST_HEAD(&p->se.group_node);
1810
1811#ifdef CONFIG_SCHEDSTATS
1812 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1813#endif
1814
1815 RB_CLEAR_NODE(&p->dl.rb_node);
1816 init_dl_task_timer(&p->dl);
1817 __dl_clear_params(p);
1818
1819 INIT_LIST_HEAD(&p->rt.run_list);
1820
1821#ifdef CONFIG_PREEMPT_NOTIFIERS
1822 INIT_HLIST_HEAD(&p->preempt_notifiers);
1823#endif
1824
1825#ifdef CONFIG_NUMA_BALANCING
1826 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1827 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1828 p->mm->numa_scan_seq = 0;
1829 }
1830
1831 if (clone_flags & CLONE_VM)
1832 p->numa_preferred_nid = current->numa_preferred_nid;
1833 else
1834 p->numa_preferred_nid = -1;
1835
1836 p->node_stamp = 0ULL;
1837 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1838 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1839 p->numa_work.next = &p->numa_work;
1840 p->numa_faults = NULL;
1841 p->last_task_numa_placement = 0;
1842 p->last_sum_exec_runtime = 0;
1843
1844 p->numa_group = NULL;
1845#endif
1846}
1847
1848#ifdef CONFIG_NUMA_BALANCING
1849#ifdef CONFIG_SCHED_DEBUG
1850void set_numabalancing_state(bool enabled)
1851{
1852 if (enabled)
1853 sched_feat_set("NUMA");
1854 else
1855 sched_feat_set("NO_NUMA");
1856}
1857#else
1858__read_mostly bool numabalancing_enabled;
1859
1860void set_numabalancing_state(bool enabled)
1861{
1862 numabalancing_enabled = enabled;
1863}
1864#endif
1865
1866#ifdef CONFIG_PROC_SYSCTL
1867int sysctl_numa_balancing(struct ctl_table *table, int write,
1868 void __user *buffer, size_t *lenp, loff_t *ppos)
1869{
1870 struct ctl_table t;
1871 int err;
1872 int state = numabalancing_enabled;
1873
1874 if (write && !capable(CAP_SYS_ADMIN))
1875 return -EPERM;
1876
1877 t = *table;
1878 t.data = &state;
1879 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1880 if (err < 0)
1881 return err;
1882 if (write)
1883 set_numabalancing_state(state);
1884 return err;
1885}
1886#endif
1887#endif
1888
1889
1890
1891
1892int sched_fork(unsigned long clone_flags, struct task_struct *p)
1893{
1894 unsigned long flags;
1895 int cpu = get_cpu();
1896
1897 __sched_fork(clone_flags, p);
1898
1899
1900
1901
1902
1903 p->state = TASK_RUNNING;
1904
1905
1906
1907
1908 p->prio = current->normal_prio;
1909
1910
1911
1912
1913 if (unlikely(p->sched_reset_on_fork)) {
1914 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1915 p->policy = SCHED_NORMAL;
1916 p->static_prio = NICE_TO_PRIO(0);
1917 p->rt_priority = 0;
1918 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1919 p->static_prio = NICE_TO_PRIO(0);
1920
1921 p->prio = p->normal_prio = __normal_prio(p);
1922 set_load_weight(p);
1923
1924
1925
1926
1927
1928 p->sched_reset_on_fork = 0;
1929 }
1930
1931 if (dl_prio(p->prio)) {
1932 put_cpu();
1933 return -EAGAIN;
1934 } else if (rt_prio(p->prio)) {
1935 p->sched_class = &rt_sched_class;
1936 } else {
1937 p->sched_class = &fair_sched_class;
1938 }
1939
1940 if (p->sched_class->task_fork)
1941 p->sched_class->task_fork(p);
1942
1943
1944
1945
1946
1947
1948
1949
1950 raw_spin_lock_irqsave(&p->pi_lock, flags);
1951 set_task_cpu(p, cpu);
1952 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1953
1954#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1955 if (likely(sched_info_on()))
1956 memset(&p->sched_info, 0, sizeof(p->sched_info));
1957#endif
1958#if defined(CONFIG_SMP)
1959 p->on_cpu = 0;
1960#endif
1961 init_task_preempt_count(p);
1962#ifdef CONFIG_SMP
1963 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1964 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1965#endif
1966
1967 put_cpu();
1968 return 0;
1969}
1970
1971unsigned long to_ratio(u64 period, u64 runtime)
1972{
1973 if (runtime == RUNTIME_INF)
1974 return 1ULL << 20;
1975
1976
1977
1978
1979
1980
1981 if (period == 0)
1982 return 0;
1983
1984 return div64_u64(runtime << 20, period);
1985}
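/*
 * Example (a sketch): runtime = 10 ms out of a period = 100 ms yields
 * (10 << 20) / 100 ~= 104857, i.e. roughly 0.1 in the 20-bit fixed
 * point format used for deadline bandwidth accounting, while
 * RUNTIME_INF maps to exactly 1 << 20 (a ratio of 1.0).
 */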
1986
1987#ifdef CONFIG_SMP
1988inline struct dl_bw *dl_bw_of(int i)
1989{
1990 rcu_lockdep_assert(rcu_read_lock_sched_held(),
1991 "sched RCU must be held");
1992 return &cpu_rq(i)->rd->dl_bw;
1993}
1994
1995static inline int dl_bw_cpus(int i)
1996{
1997 struct root_domain *rd = cpu_rq(i)->rd;
1998 int cpus = 0;
1999
2000 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2001 "sched RCU must be held");
2002 for_each_cpu_and(i, rd->span, cpu_active_mask)
2003 cpus++;
2004
2005 return cpus;
2006}
2007#else
2008inline struct dl_bw *dl_bw_of(int i)
2009{
2010 return &cpu_rq(i)->dl.dl_bw;
2011}
2012
2013static inline int dl_bw_cpus(int i)
2014{
2015 return 1;
2016}
2017#endif
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030static int dl_overflow(struct task_struct *p, int policy,
2031 const struct sched_attr *attr)
2032{
2033
2034 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
2035 u64 period = attr->sched_period ?: attr->sched_deadline;
2036 u64 runtime = attr->sched_runtime;
2037 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
2038 int cpus, err = -1;
2039
2040 if (new_bw == p->dl.dl_bw)
2041 return 0;
2042
2043
2044
2045
2046
2047
2048 raw_spin_lock(&dl_b->lock);
2049 cpus = dl_bw_cpus(task_cpu(p));
2050 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2051 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2052 __dl_add(dl_b, new_bw);
2053 err = 0;
2054 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2055 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2056 __dl_clear(dl_b, p->dl.dl_bw);
2057 __dl_add(dl_b, new_bw);
2058 err = 0;
2059 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2060 __dl_clear(dl_b, p->dl.dl_bw);
2061 err = 0;
2062 }
2063 raw_spin_unlock(&dl_b->lock);
2064
2065 return err;
2066}
2067
2068extern void init_dl_bw(struct dl_bw *dl_b);
2069
2070
2071
2072
2073
2074
2075
2076
2077void wake_up_new_task(struct task_struct *p)
2078{
2079 unsigned long flags;
2080 struct rq *rq;
2081
2082 raw_spin_lock_irqsave(&p->pi_lock, flags);
2083#ifdef CONFIG_SMP
2084
2085
2086
2087
2088
2089 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2090#endif
2091
2092
2093 init_task_runnable_average(p);
2094 rq = __task_rq_lock(p);
2095 activate_task(rq, p, 0);
2096 p->on_rq = TASK_ON_RQ_QUEUED;
2097 trace_sched_wakeup_new(p, true);
2098 check_preempt_curr(rq, p, WF_FORK);
2099#ifdef CONFIG_SMP
2100 if (p->sched_class->task_woken)
2101 p->sched_class->task_woken(rq, p);
2102#endif
2103 task_rq_unlock(rq, p, &flags);
2104}
2105
2106#ifdef CONFIG_PREEMPT_NOTIFIERS
2107
2108
2109
2110
2111
2112void preempt_notifier_register(struct preempt_notifier *notifier)
2113{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2115}
2116EXPORT_SYMBOL_GPL(preempt_notifier_register);
2117
2118
2119
2120
2121
2122
2123
2124void preempt_notifier_unregister(struct preempt_notifier *notifier)
2125{
	hlist_del(&notifier->link);
2127}
2128EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2129
2130static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2131{
2132 struct preempt_notifier *notifier;
2133
2134 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2135 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2136}
2137
2138static void
2139fire_sched_out_preempt_notifiers(struct task_struct *curr,
2140 struct task_struct *next)
2141{
2142 struct preempt_notifier *notifier;
2143
2144 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2145 notifier->ops->sched_out(notifier, next);
2146}
2147
2148#else
2149
2150static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2151{
2152}
2153
2154static void
2155fire_sched_out_preempt_notifiers(struct task_struct *curr,
2156 struct task_struct *next)
2157{
2158}
2159
2160#endif
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175static inline void
2176prepare_task_switch(struct rq *rq, struct task_struct *prev,
2177 struct task_struct *next)
2178{
2179 trace_sched_switch(prev, next);
2180 sched_info_switch(rq, prev, next);
2181 perf_event_task_sched_out(prev, next);
2182 fire_sched_out_preempt_notifiers(prev, next);
2183 prepare_lock_switch(rq, next);
2184 prepare_arch_switch(next);
2185}
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206static struct rq *finish_task_switch(struct task_struct *prev)
2207 __releases(rq->lock)
2208{
2209 struct rq *rq = this_rq();
2210 struct mm_struct *mm = rq->prev_mm;
2211 long prev_state;
2212
2213 rq->prev_mm = NULL;
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226 prev_state = prev->state;
2227 vtime_task_switch(prev);
2228 finish_arch_switch(prev);
2229 perf_event_task_sched_in(prev, current);
2230 finish_lock_switch(rq, prev);
2231 finish_arch_post_lock_switch();
2232
2233 fire_sched_in_preempt_notifiers(current);
2234 if (mm)
2235 mmdrop(mm);
2236 if (unlikely(prev_state == TASK_DEAD)) {
2237 if (prev->sched_class->task_dead)
2238 prev->sched_class->task_dead(prev);
2239
2240
2241
2242
2243
2244 kprobe_flush_task(prev);
2245 put_task_struct(prev);
2246 }
2247
2248 tick_nohz_task_switch(current);
2249 return rq;
2250}
2251
2252#ifdef CONFIG_SMP
2253
2254
2255static inline void post_schedule(struct rq *rq)
2256{
2257 if (rq->post_schedule) {
2258 unsigned long flags;
2259
2260 raw_spin_lock_irqsave(&rq->lock, flags);
2261 if (rq->curr->sched_class->post_schedule)
2262 rq->curr->sched_class->post_schedule(rq);
2263 raw_spin_unlock_irqrestore(&rq->lock, flags);
2264
2265 rq->post_schedule = 0;
2266 }
2267}
2268
2269#else
2270
2271static inline void post_schedule(struct rq *rq)
2272{
2273}
2274
2275#endif
2276
2277
2278
2279
2280
2281asmlinkage __visible void schedule_tail(struct task_struct *prev)
2282 __releases(rq->lock)
2283{
2284 struct rq *rq;
2285
2286
2287 preempt_disable();
2288 rq = finish_task_switch(prev);
2289 post_schedule(rq);
2290 preempt_enable();
2291
2292 if (current->set_child_tid)
2293 put_user(task_pid_vnr(current), current->set_child_tid);
2294}
2295
2296
2297
2298
2299static inline struct rq *
2300context_switch(struct rq *rq, struct task_struct *prev,
2301 struct task_struct *next)
2302{
2303 struct mm_struct *mm, *oldmm;
2304
2305 prepare_task_switch(rq, prev, next);
2306
2307 mm = next->mm;
2308 oldmm = prev->active_mm;
2309
2310
2311
2312
2313
2314 arch_start_context_switch(prev);
2315
2316 if (!mm) {
2317 next->active_mm = oldmm;
2318 atomic_inc(&oldmm->mm_count);
2319 enter_lazy_tlb(oldmm, next);
2320 } else
2321 switch_mm(oldmm, mm, next);
2322
2323 if (!prev->mm) {
2324 prev->active_mm = NULL;
2325 rq->prev_mm = oldmm;
2326 }
2327
2328
2329
2330
2331
2332
2333 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2334
2335 context_tracking_task_switch(prev, next);
2336
2337 switch_to(prev, next, prev);
2338 barrier();
2339
2340 return finish_task_switch(prev);
2341}
2342
2343
2344
2345
2346
2347
2348
2349unsigned long nr_running(void)
2350{
2351 unsigned long i, sum = 0;
2352
2353 for_each_online_cpu(i)
2354 sum += cpu_rq(i)->nr_running;
2355
2356 return sum;
2357}
2358
2359
2360
2361
2362bool single_task_running(void)
2363{
2364 if (cpu_rq(smp_processor_id())->nr_running == 1)
2365 return true;
2366 else
2367 return false;
2368}
2369EXPORT_SYMBOL(single_task_running);
2370
2371unsigned long long nr_context_switches(void)
2372{
2373 int i;
2374 unsigned long long sum = 0;
2375
2376 for_each_possible_cpu(i)
2377 sum += cpu_rq(i)->nr_switches;
2378
2379 return sum;
2380}
2381
2382unsigned long nr_iowait(void)
2383{
2384 unsigned long i, sum = 0;
2385
2386 for_each_possible_cpu(i)
2387 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2388
2389 return sum;
2390}
2391
2392unsigned long nr_iowait_cpu(int cpu)
2393{
2394 struct rq *this = cpu_rq(cpu);
2395 return atomic_read(&this->nr_iowait);
2396}
2397
2398void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2399{
2400 struct rq *this = this_rq();
2401 *nr_waiters = atomic_read(&this->nr_iowait);
2402 *load = this->cpu_load[0];
2403}
2404
2405#ifdef CONFIG_SMP
2406
2407
2408
2409
2410
2411void sched_exec(void)
2412{
2413 struct task_struct *p = current;
2414 unsigned long flags;
2415 int dest_cpu;
2416
2417 raw_spin_lock_irqsave(&p->pi_lock, flags);
2418 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2419 if (dest_cpu == smp_processor_id())
2420 goto unlock;
2421
2422 if (likely(cpu_active(dest_cpu))) {
2423 struct migration_arg arg = { p, dest_cpu };
2424
2425 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2426 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2427 return;
2428 }
2429unlock:
2430 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2431}
2432
2433#endif
2434
2435DEFINE_PER_CPU(struct kernel_stat, kstat);
2436DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2437
2438EXPORT_PER_CPU_SYMBOL(kstat);
2439EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2440
2441
2442
2443
2444
2445
2446unsigned long long task_sched_runtime(struct task_struct *p)
2447{
2448 unsigned long flags;
2449 struct rq *rq;
2450 u64 ns;
2451
2452#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464 if (!p->on_cpu || !task_on_rq_queued(p))
2465 return p->se.sum_exec_runtime;
2466#endif
2467
2468 rq = task_rq_lock(p, &flags);
2469
2470
2471
2472
2473
2474 if (task_current(rq, p) && task_on_rq_queued(p)) {
2475 update_rq_clock(rq);
2476 p->sched_class->update_curr(rq);
2477 }
2478 ns = p->se.sum_exec_runtime;
2479 task_rq_unlock(rq, p, &flags);
2480
2481 return ns;
2482}
2483
2484
2485
2486
2487
2488void scheduler_tick(void)
2489{
2490 int cpu = smp_processor_id();
2491 struct rq *rq = cpu_rq(cpu);
2492 struct task_struct *curr = rq->curr;
2493
2494 sched_clock_tick();
2495
2496 raw_spin_lock(&rq->lock);
2497 update_rq_clock(rq);
2498 curr->sched_class->task_tick(rq, curr, 0);
2499 update_cpu_load_active(rq);
2500 raw_spin_unlock(&rq->lock);
2501
2502 perf_event_task_tick();
2503
2504#ifdef CONFIG_SMP
2505 rq->idle_balance = idle_cpu(cpu);
2506 trigger_load_balance(rq);
2507#endif
2508 rq_last_tick_reset(rq);
2509}
2510
2511#ifdef CONFIG_NO_HZ_FULL
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525u64 scheduler_tick_max_deferment(void)
2526{
2527 struct rq *rq = this_rq();
2528 unsigned long next, now = ACCESS_ONCE(jiffies);
2529
2530 next = rq->last_sched_tick + HZ;
2531
2532 if (time_before_eq(next, now))
2533 return 0;
2534
2535 return jiffies_to_nsecs(next - now);
2536}
2537#endif
2538
2539notrace unsigned long get_parent_ip(unsigned long addr)
2540{
2541 if (in_lock_functions(addr)) {
2542 addr = CALLER_ADDR2;
2543 if (in_lock_functions(addr))
2544 addr = CALLER_ADDR3;
2545 }
2546 return addr;
2547}
2548
2549#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2550 defined(CONFIG_PREEMPT_TRACER))
2551
2552void preempt_count_add(int val)
2553{
2554#ifdef CONFIG_DEBUG_PREEMPT
2555
2556
2557
2558 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2559 return;
2560#endif
2561 __preempt_count_add(val);
2562#ifdef CONFIG_DEBUG_PREEMPT
2563
2564
2565
2566 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2567 PREEMPT_MASK - 10);
2568#endif
2569 if (preempt_count() == val) {
2570 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2571#ifdef CONFIG_DEBUG_PREEMPT
2572 current->preempt_disable_ip = ip;
2573#endif
2574 trace_preempt_off(CALLER_ADDR0, ip);
2575 }
2576}
2577EXPORT_SYMBOL(preempt_count_add);
2578NOKPROBE_SYMBOL(preempt_count_add);
2579
2580void preempt_count_sub(int val)
2581{
2582#ifdef CONFIG_DEBUG_PREEMPT
2583
2584
2585
2586 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2587 return;
2588
2589
2590
2591 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2592 !(preempt_count() & PREEMPT_MASK)))
2593 return;
2594#endif
2595
2596 if (preempt_count() == val)
2597 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2598 __preempt_count_sub(val);
2599}
2600EXPORT_SYMBOL(preempt_count_sub);
2601NOKPROBE_SYMBOL(preempt_count_sub);
2602
2603#endif
2604
2605
2606
2607
2608static noinline void __schedule_bug(struct task_struct *prev)
2609{
2610 if (oops_in_progress)
2611 return;
2612
2613 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2614 prev->comm, prev->pid, preempt_count());
2615
2616 debug_show_held_locks(prev);
2617 print_modules();
2618 if (irqs_disabled())
2619 print_irqtrace_events(prev);
2620#ifdef CONFIG_DEBUG_PREEMPT
2621 if (in_atomic_preempt_off()) {
2622 pr_err("Preemption disabled at:");
2623 print_ip_sym(current->preempt_disable_ip);
2624 pr_cont("\n");
2625 }
2626#endif
2627 dump_stack();
2628 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2629}
2630
2631
2632
2633
2634static inline void schedule_debug(struct task_struct *prev)
2635{
2636#ifdef CONFIG_SCHED_STACK_END_CHECK
2637 BUG_ON(unlikely(task_stack_end_corrupted(prev)));
2638#endif
2639
2640
2641
2642
2643
2644 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2645 __schedule_bug(prev);
2646 rcu_sleep_check();
2647
2648 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2649
2650 schedstat_inc(this_rq(), sched_count);
2651}
2652
2653
2654
2655
2656static inline struct task_struct *
2657pick_next_task(struct rq *rq, struct task_struct *prev)
2658{
2659 const struct sched_class *class = &fair_sched_class;
2660 struct task_struct *p;
2661
2662
2663
2664
2665
2666 if (likely(prev->sched_class == class &&
2667 rq->nr_running == rq->cfs.h_nr_running)) {
2668 p = fair_sched_class.pick_next_task(rq, prev);
2669 if (unlikely(p == RETRY_TASK))
2670 goto again;
2671
2672
2673 if (unlikely(!p))
2674 p = idle_sched_class.pick_next_task(rq, prev);
2675
2676 return p;
2677 }
2678
2679again:
2680 for_each_class(class) {
2681 p = class->pick_next_task(rq, prev);
2682 if (p) {
2683 if (unlikely(p == RETRY_TASK))
2684 goto again;
2685 return p;
2686 }
2687 }
2688
2689 BUG();
2690}
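/*
 * The fast path above is an optimization: if the previous task was a
 * fair task and all runnable tasks belong to the fair class
 * (rq->nr_running == rq->cfs.h_nr_running), only the fair and idle
 * classes need to be consulted. Otherwise every class is tried in
 * priority order, and a RETRY_TASK result restarts the walk.
 */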
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733static void __sched __schedule(void)
2734{
2735 struct task_struct *prev, *next;
2736 unsigned long *switch_count;
2737 struct rq *rq;
2738 int cpu;
2739
2740 preempt_disable();
2741 cpu = smp_processor_id();
2742 rq = cpu_rq(cpu);
2743 rcu_note_context_switch();
2744 prev = rq->curr;
2745
2746 schedule_debug(prev);
2747
2748 if (sched_feat(HRTICK))
2749 hrtick_clear(rq);
2750
2751
2752
2753
2754
2755
2756 smp_mb__before_spinlock();
2757 raw_spin_lock_irq(&rq->lock);
2758
2759 rq->clock_skip_update <<= 1;
2760
2761 switch_count = &prev->nivcsw;
2762 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2763 if (unlikely(signal_pending_state(prev->state, prev))) {
2764 prev->state = TASK_RUNNING;
2765 } else {
2766 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2767 prev->on_rq = 0;
2768
2769
2770
2771
2772
2773
2774 if (prev->flags & PF_WQ_WORKER) {
2775 struct task_struct *to_wakeup;
2776
2777 to_wakeup = wq_worker_sleeping(prev, cpu);
2778 if (to_wakeup)
2779 try_to_wake_up_local(to_wakeup);
2780 }
2781 }
2782 switch_count = &prev->nvcsw;
2783 }
2784
2785 if (task_on_rq_queued(prev))
2786 update_rq_clock(rq);
2787
2788 next = pick_next_task(rq, prev);
2789 clear_tsk_need_resched(prev);
2790 clear_preempt_need_resched();
2791 rq->clock_skip_update = 0;
2792
2793 if (likely(prev != next)) {
2794 rq->nr_switches++;
2795 rq->curr = next;
2796 ++*switch_count;
2797
2798 rq = context_switch(rq, prev, next);
2799 cpu = cpu_of(rq);
2800 } else
2801 raw_spin_unlock_irq(&rq->lock);
2802
2803 post_schedule(rq);
2804
2805 sched_preempt_enable_no_resched();
2806}
2807
2808static inline void sched_submit_work(struct task_struct *tsk)
2809{
2810 if (!tsk->state || tsk_is_pi_blocked(tsk))
2811 return;
2812
2813
2814
2815
2816 if (blk_needs_flush_plug(tsk))
2817 blk_schedule_flush_plug(tsk);
2818}
2819
2820asmlinkage __visible void __sched schedule(void)
2821{
2822 struct task_struct *tsk = current;
2823
2824 sched_submit_work(tsk);
2825 do {
2826 __schedule();
2827 } while (need_resched());
2828}
2829EXPORT_SYMBOL(schedule);
2830
2831#ifdef CONFIG_CONTEXT_TRACKING
2832asmlinkage __visible void __sched schedule_user(void)
2833{
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844 enum ctx_state prev_state = exception_enter();
2845 schedule();
2846 exception_exit(prev_state);
2847}
2848#endif
2849
2850
2851
2852
2853
2854
2855void __sched schedule_preempt_disabled(void)
2856{
2857 sched_preempt_enable_no_resched();
2858 schedule();
2859 preempt_disable();
2860}
2861
2862static void __sched notrace preempt_schedule_common(void)
2863{
2864 do {
2865 __preempt_count_add(PREEMPT_ACTIVE);
2866 __schedule();
2867 __preempt_count_sub(PREEMPT_ACTIVE);
2868
2869
2870
2871
2872
2873 barrier();
2874 } while (need_resched());
2875}
2876
2877#ifdef CONFIG_PREEMPT
2878
2879
2880
2881
2882
2883asmlinkage __visible void __sched notrace preempt_schedule(void)
2884{
2885
2886
2887
2888
2889 if (likely(!preemptible()))
2890 return;
2891
2892 preempt_schedule_common();
2893}
2894NOKPROBE_SYMBOL(preempt_schedule);
2895EXPORT_SYMBOL(preempt_schedule);
2896
2897#ifdef CONFIG_CONTEXT_TRACKING
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2913{
2914 enum ctx_state prev_ctx;
2915
2916 if (likely(!preemptible()))
2917 return;
2918
2919 do {
2920 __preempt_count_add(PREEMPT_ACTIVE);
2921
2922
2923
2924
2925
2926 prev_ctx = exception_enter();
2927 __schedule();
2928 exception_exit(prev_ctx);
2929
2930 __preempt_count_sub(PREEMPT_ACTIVE);
2931 barrier();
2932 } while (need_resched());
2933}
2934EXPORT_SYMBOL_GPL(preempt_schedule_context);
2935#endif
2936
2937#endif
2938
2939
2940
2941
2942
2943
2944
2945asmlinkage __visible void __sched preempt_schedule_irq(void)
2946{
2947 enum ctx_state prev_state;
2948
2949
2950 BUG_ON(preempt_count() || !irqs_disabled());
2951
2952 prev_state = exception_enter();
2953
2954 do {
2955 __preempt_count_add(PREEMPT_ACTIVE);
2956 local_irq_enable();
2957 __schedule();
2958 local_irq_disable();
2959 __preempt_count_sub(PREEMPT_ACTIVE);
2960
2961
2962
2963
2964
2965 barrier();
2966 } while (need_resched());
2967
2968 exception_exit(prev_state);
2969}
2970
2971int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2972 void *key)
2973{
2974 return try_to_wake_up(curr->private, mode, wake_flags);
2975}
2976EXPORT_SYMBOL(default_wake_function);
2977
2978#ifdef CONFIG_RT_MUTEXES
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991void rt_mutex_setprio(struct task_struct *p, int prio)
2992{
2993 int oldprio, queued, running, enqueue_flag = 0;
2994 struct rq *rq;
2995 const struct sched_class *prev_class;
2996
2997 BUG_ON(prio > MAX_PRIO);
2998
2999 rq = __task_rq_lock(p);
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013 if (unlikely(p == rq->idle)) {
3014 WARN_ON(p != rq->curr);
3015 WARN_ON(p->pi_blocked_on);
3016 goto out_unlock;
3017 }
3018
3019 trace_sched_pi_setprio(p, prio);
3020 oldprio = p->prio;
3021 prev_class = p->sched_class;
3022 queued = task_on_rq_queued(p);
3023 running = task_current(rq, p);
3024 if (queued)
3025 dequeue_task(rq, p, 0);
3026 if (running)
3027 put_prev_task(rq, p);
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038 if (dl_prio(prio)) {
3039 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3040 if (!dl_prio(p->normal_prio) ||
3041 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3042 p->dl.dl_boosted = 1;
3043 p->dl.dl_throttled = 0;
3044 enqueue_flag = ENQUEUE_REPLENISH;
3045 } else
3046 p->dl.dl_boosted = 0;
3047 p->sched_class = &dl_sched_class;
3048 } else if (rt_prio(prio)) {
3049 if (dl_prio(oldprio))
3050 p->dl.dl_boosted = 0;
3051 if (oldprio < prio)
3052 enqueue_flag = ENQUEUE_HEAD;
3053 p->sched_class = &rt_sched_class;
3054 } else {
3055 if (dl_prio(oldprio))
3056 p->dl.dl_boosted = 0;
3057 if (rt_prio(oldprio))
3058 p->rt.timeout = 0;
3059 p->sched_class = &fair_sched_class;
3060 }
3061
3062 p->prio = prio;
3063
3064 if (running)
3065 p->sched_class->set_curr_task(rq);
3066 if (queued)
3067 enqueue_task(rq, p, enqueue_flag);
3068
3069 check_class_changed(rq, p, prev_class, oldprio);
3070out_unlock:
3071 __task_rq_unlock(rq);
3072}
3073#endif
3074
3075void set_user_nice(struct task_struct *p, long nice)
3076{
3077 int old_prio, delta, queued;
3078 unsigned long flags;
3079 struct rq *rq;
3080
3081 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3082 return;
3083
3084
3085
3086
3087 rq = task_rq_lock(p, &flags);
3088
3089
3090
3091
3092
3093
3094 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3095 p->static_prio = NICE_TO_PRIO(nice);
3096 goto out_unlock;
3097 }
3098 queued = task_on_rq_queued(p);
3099 if (queued)
3100 dequeue_task(rq, p, 0);
3101
3102 p->static_prio = NICE_TO_PRIO(nice);
3103 set_load_weight(p);
3104 old_prio = p->prio;
3105 p->prio = effective_prio(p);
3106 delta = p->prio - old_prio;
3107
3108 if (queued) {
3109 enqueue_task(rq, p, 0);
3110
3111
3112
3113
3114 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3115 resched_curr(rq);
3116 }
3117out_unlock:
3118 task_rq_unlock(rq, p, &flags);
3119}
3120EXPORT_SYMBOL(set_user_nice);
3121
3122
3123
3124
3125
3126
3127int can_nice(const struct task_struct *p, const int nice)
3128{
3129
3130 int nice_rlim = nice_to_rlimit(nice);
3131
3132 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3133 capable(CAP_SYS_NICE));
3134}
3135
3136#ifdef __ARCH_WANT_SYS_NICE
3137
3138
3139
3140
3141
3142
3143
3144
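/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */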
3145SYSCALL_DEFINE1(nice, int, increment)
3146{
3147 long nice, retval;
3148
3149
3150
3151
3152
3153
3154 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3155 nice = task_nice(current) + increment;
3156
3157 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3158 if (increment < 0 && !can_nice(current, nice))
3159 return -EPERM;
3160
3161 retval = security_task_setnice(current, nice);
3162 if (retval)
3163 return retval;
3164
3165 set_user_nice(current, nice);
3166 return 0;
3167}
3168
3169#endif
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179int task_prio(const struct task_struct *p)
3180{
3181 return p->prio - MAX_RT_PRIO;
3182}
3183
3184
3185
3186
3187
3188
3189
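/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */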
3190int idle_cpu(int cpu)
3191{
3192 struct rq *rq = cpu_rq(cpu);
3193
3194 if (rq->curr != rq->idle)
3195 return 0;
3196
3197 if (rq->nr_running)
3198 return 0;
3199
3200#ifdef CONFIG_SMP
3201 if (!llist_empty(&rq->wake_list))
3202 return 0;
3203#endif
3204
3205 return 1;
3206}
3207
3208
3209
3210
3211
3212
3213
3214struct task_struct *idle_task(int cpu)
3215{
3216 return cpu_rq(cpu)->idle;
3217}
3218
3219
3220
3221
3222
3223
3224
3225static struct task_struct *find_process_by_pid(pid_t pid)
3226{
3227 return pid ? find_task_by_vpid(pid) : current;
3228}
3229
3230
3231
3232
3233
3234
3235
3236
3237
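/*
 * Copy the SCHED_DEADLINE parameters from @attr into the task's
 * sched_dl_entity and compute its bandwidth (dl_bw) as runtime/period;
 * a zero sched_period defaults to the deadline.
 */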
3238static void
3239__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3240{
3241 struct sched_dl_entity *dl_se = &p->dl;
3242
3243 dl_se->dl_runtime = attr->sched_runtime;
3244 dl_se->dl_deadline = attr->sched_deadline;
3245 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3246 dl_se->flags = attr->sched_flags;
3247 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268}
3269
3270
3271
3272
3273
3274#define SETPARAM_POLICY -1
3275
3276static void __setscheduler_params(struct task_struct *p,
3277 const struct sched_attr *attr)
3278{
3279 int policy = attr->sched_policy;
3280
3281 if (policy == SETPARAM_POLICY)
3282 policy = p->policy;
3283
3284 p->policy = policy;
3285
3286 if (dl_policy(policy))
3287 __setparam_dl(p, attr);
3288 else if (fair_policy(policy))
3289 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3290
3291
3292
3293
3294
3295
3296 p->rt_priority = attr->sched_priority;
3297 p->normal_prio = normal_prio(p);
3298 set_load_weight(p);
3299}
3300
3301
3302static void __setscheduler(struct rq *rq, struct task_struct *p,
3303 const struct sched_attr *attr, bool keep_boost)
3304{
3305 __setscheduler_params(p, attr);
3306
3307
3308
3309
3310
3311 if (keep_boost)
3312 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
3313 else
3314 p->prio = normal_prio(p);
3315
3316 if (dl_prio(p->prio))
3317 p->sched_class = &dl_sched_class;
3318 else if (rt_prio(p->prio))
3319 p->sched_class = &rt_sched_class;
3320 else
3321 p->sched_class = &fair_sched_class;
3322}
3323
3324static void
3325__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3326{
3327 struct sched_dl_entity *dl_se = &p->dl;
3328
3329 attr->sched_priority = p->rt_priority;
3330 attr->sched_runtime = dl_se->dl_runtime;
3331 attr->sched_deadline = dl_se->dl_deadline;
3332 attr->sched_period = dl_se->dl_period;
3333 attr->sched_flags = dl_se->flags;
3334}
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346static bool
3347__checkparam_dl(const struct sched_attr *attr)
3348{
3349
3350 if (attr->sched_deadline == 0)
3351 return false;
3352
3353
3354
3355
3356
3357 if (attr->sched_runtime < (1ULL << DL_SCALE))
3358 return false;
3359
3360
3361
3362
3363
3364 if (attr->sched_deadline & (1ULL << 63) ||
3365 attr->sched_period & (1ULL << 63))
3366 return false;
3367
3368
3369 if ((attr->sched_period != 0 &&
3370 attr->sched_period < attr->sched_deadline) ||
3371 attr->sched_deadline < attr->sched_runtime)
3372 return false;
3373
3374 return true;
3375}
3376
3377
3378
3379
3380static bool check_same_owner(struct task_struct *p)
3381{
3382 const struct cred *cred = current_cred(), *pcred;
3383 bool match;
3384
3385 rcu_read_lock();
3386 pcred = __task_cred(p);
3387 match = (uid_eq(cred->euid, pcred->euid) ||
3388 uid_eq(cred->euid, pcred->uid));
3389 rcu_read_unlock();
3390 return match;
3391}
3392
3393static bool dl_param_changed(struct task_struct *p,
3394 const struct sched_attr *attr)
3395{
3396 struct sched_dl_entity *dl_se = &p->dl;
3397
3398 if (dl_se->dl_runtime != attr->sched_runtime ||
3399 dl_se->dl_deadline != attr->sched_deadline ||
3400 dl_se->dl_period != attr->sched_period ||
3401 dl_se->flags != attr->sched_flags)
3402 return true;
3403
3404 return false;
3405}
3406
3407static int __sched_setscheduler(struct task_struct *p,
3408 const struct sched_attr *attr,
3409 bool user)
3410{
3411 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3412 MAX_RT_PRIO - 1 - attr->sched_priority;
3413 int retval, oldprio, oldpolicy = -1, queued, running;
3414 int new_effective_prio, policy = attr->sched_policy;
3415 unsigned long flags;
3416 const struct sched_class *prev_class;
3417 struct rq *rq;
3418 int reset_on_fork;
3419
3420
3421 BUG_ON(in_interrupt());
3422recheck:
3423
3424 if (policy < 0) {
3425 reset_on_fork = p->sched_reset_on_fork;
3426 policy = oldpolicy = p->policy;
3427 } else {
3428 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3429
3430 if (policy != SCHED_DEADLINE &&
3431 policy != SCHED_FIFO && policy != SCHED_RR &&
3432 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3433 policy != SCHED_IDLE)
3434 return -EINVAL;
3435 }
3436
3437 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3438 return -EINVAL;
3439
3440
3441
3442
3443
3444
3445 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3446 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3447 return -EINVAL;
3448 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3449 (rt_policy(policy) != (attr->sched_priority != 0)))
3450 return -EINVAL;
3451
3452
3453
3454
3455 if (user && !capable(CAP_SYS_NICE)) {
3456 if (fair_policy(policy)) {
3457 if (attr->sched_nice < task_nice(p) &&
3458 !can_nice(p, attr->sched_nice))
3459 return -EPERM;
3460 }
3461
3462 if (rt_policy(policy)) {
3463 unsigned long rlim_rtprio =
3464 task_rlimit(p, RLIMIT_RTPRIO);
3465
3466
3467 if (policy != p->policy && !rlim_rtprio)
3468 return -EPERM;
3469
3470
3471 if (attr->sched_priority > p->rt_priority &&
3472 attr->sched_priority > rlim_rtprio)
3473 return -EPERM;
3474 }
3475
3476
3477
3478
3479
3480
3481
3482 if (dl_policy(policy))
3483 return -EPERM;
3484
3485
3486
3487
3488
3489 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3490 if (!can_nice(p, task_nice(p)))
3491 return -EPERM;
3492 }
3493
3494
3495 if (!check_same_owner(p))
3496 return -EPERM;
3497
3498
3499 if (p->sched_reset_on_fork && !reset_on_fork)
3500 return -EPERM;
3501 }
3502
3503 if (user) {
3504 retval = security_task_setscheduler(p);
3505 if (retval)
3506 return retval;
3507 }
3508
3509
3510
3511
3512
3513
3514
3515
3516 rq = task_rq_lock(p, &flags);
3517
3518
3519
3520
3521 if (p == rq->stop) {
3522 task_rq_unlock(rq, p, &flags);
3523 return -EINVAL;
3524 }
3525
3526
3527
3528
3529
3530 if (unlikely(policy == p->policy)) {
3531 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3532 goto change;
3533 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3534 goto change;
3535 if (dl_policy(policy) && dl_param_changed(p, attr))
3536 goto change;
3537
3538 p->sched_reset_on_fork = reset_on_fork;
3539 task_rq_unlock(rq, p, &flags);
3540 return 0;
3541 }
3542change:
3543
3544 if (user) {
3545#ifdef CONFIG_RT_GROUP_SCHED
3546
3547
3548
3549
3550 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3551 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3552 !task_group_is_autogroup(task_group(p))) {
3553 task_rq_unlock(rq, p, &flags);
3554 return -EPERM;
3555 }
3556#endif
3557#ifdef CONFIG_SMP
3558 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3559 cpumask_t *span = rq->rd->span;
3560
3561
3562
3563
3564
3565
3566 if (!cpumask_subset(span, &p->cpus_allowed) ||
3567 rq->rd->dl_bw.bw == 0) {
3568 task_rq_unlock(rq, p, &flags);
3569 return -EPERM;
3570 }
3571 }
3572#endif
3573 }
3574
3575
3576 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3577 policy = oldpolicy = -1;
3578 task_rq_unlock(rq, p, &flags);
3579 goto recheck;
3580 }
3581
3582
3583
3584
3585
3586
3587 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3588 task_rq_unlock(rq, p, &flags);
3589 return -EBUSY;
3590 }
3591
3592 p->sched_reset_on_fork = reset_on_fork;
3593 oldprio = p->prio;
3594
3595
3596
3597
3598
3599
3600
3601
3602 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
3603 if (new_effective_prio == oldprio) {
3604 __setscheduler_params(p, attr);
3605 task_rq_unlock(rq, p, &flags);
3606 return 0;
3607 }
3608
3609 queued = task_on_rq_queued(p);
3610 running = task_current(rq, p);
3611 if (queued)
3612 dequeue_task(rq, p, 0);
3613 if (running)
3614 put_prev_task(rq, p);
3615
3616 prev_class = p->sched_class;
3617 __setscheduler(rq, p, attr, true);
3618
3619 if (running)
3620 p->sched_class->set_curr_task(rq);
3621 if (queued) {
3622
3623
3624
3625
3626 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3627 }
3628
3629 check_class_changed(rq, p, prev_class, oldprio);
3630 task_rq_unlock(rq, p, &flags);
3631
3632 rt_mutex_adjust_pi(p);
3633
3634 return 0;
3635}
3636
3637static int _sched_setscheduler(struct task_struct *p, int policy,
3638 const struct sched_param *param, bool check)
3639{
3640 struct sched_attr attr = {
3641 .sched_policy = policy,
3642 .sched_priority = param->sched_priority,
3643 .sched_nice = PRIO_TO_NICE(p->static_prio),
3644 };
3645
3646
3647 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
3648 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3649 policy &= ~SCHED_RESET_ON_FORK;
3650 attr.sched_policy = policy;
3651 }
3652
3653 return __sched_setscheduler(p, &attr, check);
3654}
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665int sched_setscheduler(struct task_struct *p, int policy,
3666 const struct sched_param *param)
3667{
3668 return _sched_setscheduler(p, policy, param, true);
3669}
3670EXPORT_SYMBOL_GPL(sched_setscheduler);
3671
3672int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3673{
3674 return __sched_setscheduler(p, attr, true);
3675}
3676EXPORT_SYMBOL_GPL(sched_setattr);
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3692 const struct sched_param *param)
3693{
3694 return _sched_setscheduler(p, policy, param, false);
3695}
3696
3697static int
3698do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3699{
3700 struct sched_param lparam;
3701 struct task_struct *p;
3702 int retval;
3703
3704 if (!param || pid < 0)
3705 return -EINVAL;
3706 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3707 return -EFAULT;
3708
3709 rcu_read_lock();
3710 retval = -ESRCH;
3711 p = find_process_by_pid(pid);
3712 if (p != NULL)
3713 retval = sched_setscheduler(p, policy, &lparam);
3714 rcu_read_unlock();
3715
3716 return retval;
3717}
3718
3719
3720
3721
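/*
 * Copy a struct sched_attr from user space, coping with user structures
 * that are smaller or larger than the kernel's (extensible ABI).
 * Mimics kernel/events/core.c:perf_copy_attr().
 */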
3722static int sched_copy_attr(struct sched_attr __user *uattr,
3723 struct sched_attr *attr)
3724{
3725 u32 size;
3726 int ret;
3727
3728 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3729 return -EFAULT;
3730
3731
3732
3733
3734 memset(attr, 0, sizeof(*attr));
3735
3736 ret = get_user(size, &uattr->size);
3737 if (ret)
3738 return ret;
3739
3740 if (size > PAGE_SIZE)
3741 goto err_size;
3742
3743 if (!size)
3744 size = SCHED_ATTR_SIZE_VER0;
3745
3746 if (size < SCHED_ATTR_SIZE_VER0)
3747 goto err_size;
3748
3749
3750
3751
3752
3753
3754
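	/*
	 * If user space handed us a bigger structure than the kernel
	 * knows about, make sure all the unknown trailing bytes are
	 * zero - i.e. new user space must not rely on kernel feature
	 * extensions we don't know about yet.
	 */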
3755 if (size > sizeof(*attr)) {
3756 unsigned char __user *addr;
3757 unsigned char __user *end;
3758 unsigned char val;
3759
3760 addr = (void __user *)uattr + sizeof(*attr);
3761 end = (void __user *)uattr + size;
3762
3763 for (; addr < end; addr++) {
3764 ret = get_user(val, addr);
3765 if (ret)
3766 return ret;
3767 if (val)
3768 goto err_size;
3769 }
3770 size = sizeof(*attr);
3771 }
3772
3773 ret = copy_from_user(attr, uattr, size);
3774 if (ret)
3775 return -EFAULT;
3776
3777
3778
3779
3780
3781 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3782
3783 return 0;
3784
3785err_size:
3786 put_user(sizeof(*attr), &uattr->size);
3787 return -E2BIG;
3788}
3789
3790
3791
3792
3793
3794
3795
3796
3797
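/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */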
3798SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3799 struct sched_param __user *, param)
3800{
3801
3802 if (policy < 0)
3803 return -EINVAL;
3804
3805 return do_sched_setscheduler(pid, policy, param);
3806}
3807
3808
3809
3810
3811
3812
3813
3814
3815SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3816{
3817 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
3818}
3819
3820
3821
3822
3823
3824
3825
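/**
 * sys_sched_setattr - like sched_setscheduler(), but with the extended
 * sched_attr structure.
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension, must be zero.
 */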
3826SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3827 unsigned int, flags)
3828{
3829 struct sched_attr attr;
3830 struct task_struct *p;
3831 int retval;
3832
3833 if (!uattr || pid < 0 || flags)
3834 return -EINVAL;
3835
3836 retval = sched_copy_attr(uattr, &attr);
3837 if (retval)
3838 return retval;
3839
3840 if ((int)attr.sched_policy < 0)
3841 return -EINVAL;
3842
3843 rcu_read_lock();
3844 retval = -ESRCH;
3845 p = find_process_by_pid(pid);
3846 if (p != NULL)
3847 retval = sched_setattr(p, &attr);
3848 rcu_read_unlock();
3849
3850 return retval;
3851}
3852
3853
3854
3855
3856
3857
3858
3859
3860SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3861{
3862 struct task_struct *p;
3863 int retval;
3864
3865 if (pid < 0)
3866 return -EINVAL;
3867
3868 retval = -ESRCH;
3869 rcu_read_lock();
3870 p = find_process_by_pid(pid);
3871 if (p) {
3872 retval = security_task_getscheduler(p);
3873 if (!retval)
3874 retval = p->policy
3875 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3876 }
3877 rcu_read_unlock();
3878 return retval;
3879}
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3890{
3891 struct sched_param lp = { .sched_priority = 0 };
3892 struct task_struct *p;
3893 int retval;
3894
3895 if (!param || pid < 0)
3896 return -EINVAL;
3897
3898 rcu_read_lock();
3899 p = find_process_by_pid(pid);
3900 retval = -ESRCH;
3901 if (!p)
3902 goto out_unlock;
3903
3904 retval = security_task_getscheduler(p);
3905 if (retval)
3906 goto out_unlock;
3907
3908 if (task_has_rt_policy(p))
3909 lp.sched_priority = p->rt_priority;
3910 rcu_read_unlock();
3911
3912
3913
3914
3915 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3916
3917 return retval;
3918
3919out_unlock:
3920 rcu_read_unlock();
3921 return retval;
3922}
3923
3924static int sched_read_attr(struct sched_attr __user *uattr,
3925 struct sched_attr *attr,
3926 unsigned int usize)
3927{
3928 int ret;
3929
3930 if (!access_ok(VERIFY_WRITE, uattr, usize))
3931 return -EFAULT;
3932
3933
3934
3935
3936
3937
3938 if (usize < sizeof(*attr)) {
3939 unsigned char *addr;
3940 unsigned char *end;
3941
3942 addr = (void *)attr + usize;
3943 end = (void *)attr + sizeof(*attr);
3944
3945 for (; addr < end; addr++) {
3946 if (*addr)
3947 return -EFBIG;
3948 }
3949
3950 attr->size = usize;
3951 }
3952
3953 ret = copy_to_user(uattr, attr, attr->size);
3954 if (ret)
3955 return -EFAULT;
3956
3957 return 0;
3958}
3959
3960
3961
3962
3963
3964
3965
3966
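/**
 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @size: sizeof(attr) for fwd/bwd compatibility.
 * @flags: for future extension, must be zero.
 */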
3967SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3968 unsigned int, size, unsigned int, flags)
3969{
3970 struct sched_attr attr = {
3971 .size = sizeof(struct sched_attr),
3972 };
3973 struct task_struct *p;
3974 int retval;
3975
3976 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3977 size < SCHED_ATTR_SIZE_VER0 || flags)
3978 return -EINVAL;
3979
3980 rcu_read_lock();
3981 p = find_process_by_pid(pid);
3982 retval = -ESRCH;
3983 if (!p)
3984 goto out_unlock;
3985
3986 retval = security_task_getscheduler(p);
3987 if (retval)
3988 goto out_unlock;
3989
3990 attr.sched_policy = p->policy;
3991 if (p->sched_reset_on_fork)
3992 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3993 if (task_has_dl_policy(p))
3994 __getparam_dl(p, &attr);
3995 else if (task_has_rt_policy(p))
3996 attr.sched_priority = p->rt_priority;
3997 else
3998 attr.sched_nice = task_nice(p);
3999
4000 rcu_read_unlock();
4001
4002 retval = sched_read_attr(uattr, &attr, size);
4003 return retval;
4004
4005out_unlock:
4006 rcu_read_unlock();
4007 return retval;
4008}
4009
4010long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4011{
4012 cpumask_var_t cpus_allowed, new_mask;
4013 struct task_struct *p;
4014 int retval;
4015
4016 rcu_read_lock();
4017
4018 p = find_process_by_pid(pid);
4019 if (!p) {
4020 rcu_read_unlock();
4021 return -ESRCH;
4022 }
4023
4024
4025 get_task_struct(p);
4026 rcu_read_unlock();
4027
4028 if (p->flags & PF_NO_SETAFFINITY) {
4029 retval = -EINVAL;
4030 goto out_put_task;
4031 }
4032 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4033 retval = -ENOMEM;
4034 goto out_put_task;
4035 }
4036 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4037 retval = -ENOMEM;
4038 goto out_free_cpus_allowed;
4039 }
4040 retval = -EPERM;
4041 if (!check_same_owner(p)) {
4042 rcu_read_lock();
4043 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4044 rcu_read_unlock();
4045 goto out_free_new_mask;
4046 }
4047 rcu_read_unlock();
4048 }
4049
4050 retval = security_task_setscheduler(p);
4051 if (retval)
4052 goto out_free_new_mask;
4053
4054
4055 cpuset_cpus_allowed(p, cpus_allowed);
4056 cpumask_and(new_mask, in_mask, cpus_allowed);
4057
4058
4059
4060
4061
4062
4063
4064#ifdef CONFIG_SMP
4065 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4066 rcu_read_lock();
4067 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4068 retval = -EBUSY;
4069 rcu_read_unlock();
4070 goto out_free_new_mask;
4071 }
4072 rcu_read_unlock();
4073 }
4074#endif
4075again:
4076 retval = set_cpus_allowed_ptr(p, new_mask);
4077
4078 if (!retval) {
4079 cpuset_cpus_allowed(p, cpus_allowed);
4080 if (!cpumask_subset(new_mask, cpus_allowed)) {
4081
4082
4083
4084
4085
4086 cpumask_copy(new_mask, cpus_allowed);
4087 goto again;
4088 }
4089 }
4090out_free_new_mask:
4091 free_cpumask_var(new_mask);
4092out_free_cpus_allowed:
4093 free_cpumask_var(cpus_allowed);
4094out_put_task:
4095 put_task_struct(p);
4096 return retval;
4097}
4098
4099static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4100 struct cpumask *new_mask)
4101{
4102 if (len < cpumask_size())
4103 cpumask_clear(new_mask);
4104 else if (len > cpumask_size())
4105 len = cpumask_size();
4106
4107 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4108}
4109
4110
4111
4112
4113
4114
4115
4116
4117
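/**
 * sys_sched_setaffinity - set the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new cpu mask
 *
 * Return: 0 on success. An error code otherwise.
 */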
4118SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4119 unsigned long __user *, user_mask_ptr)
4120{
4121 cpumask_var_t new_mask;
4122 int retval;
4123
4124 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4125 return -ENOMEM;
4126
4127 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4128 if (retval == 0)
4129 retval = sched_setaffinity(pid, new_mask);
4130 free_cpumask_var(new_mask);
4131 return retval;
4132}
4133
4134long sched_getaffinity(pid_t pid, struct cpumask *mask)
4135{
4136 struct task_struct *p;
4137 unsigned long flags;
4138 int retval;
4139
4140 rcu_read_lock();
4141
4142 retval = -ESRCH;
4143 p = find_process_by_pid(pid);
4144 if (!p)
4145 goto out_unlock;
4146
4147 retval = security_task_getscheduler(p);
4148 if (retval)
4149 goto out_unlock;
4150
4151 raw_spin_lock_irqsave(&p->pi_lock, flags);
4152 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4153 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4154
4155out_unlock:
4156 rcu_read_unlock();
4157
4158 return retval;
4159}
4160
4161
4162
4163
4164
4165
4166
4167
4168
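/**
 * sys_sched_getaffinity - get the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current cpu mask
 *
 * Return: the number of bytes copied to @user_mask_ptr on success,
 * a negative error code otherwise.
 */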
4169SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4170 unsigned long __user *, user_mask_ptr)
4171{
4172 int ret;
4173 cpumask_var_t mask;
4174
4175 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4176 return -EINVAL;
4177 if (len & (sizeof(unsigned long)-1))
4178 return -EINVAL;
4179
4180 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4181 return -ENOMEM;
4182
4183 ret = sched_getaffinity(pid, mask);
4184 if (ret == 0) {
4185 size_t retlen = min_t(size_t, len, cpumask_size());
4186
4187 if (copy_to_user(user_mask_ptr, mask, retlen))
4188 ret = -EFAULT;
4189 else
4190 ret = retlen;
4191 }
4192 free_cpumask_var(mask);
4193
4194 return ret;
4195}
4196
4197
4198
4199
4200
4201
4202
4203
4204
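/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */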
4205SYSCALL_DEFINE0(sched_yield)
4206{
4207 struct rq *rq = this_rq_lock();
4208
4209 schedstat_inc(rq, yld_count);
4210 current->sched_class->yield_task(rq);
4211
4212
4213
4214
4215
4216 __release(rq->lock);
4217 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4218 do_raw_spin_unlock(&rq->lock);
4219 sched_preempt_enable_no_resched();
4220
4221 schedule();
4222
4223 return 0;
4224}
4225
4226int __sched _cond_resched(void)
4227{
4228 if (should_resched()) {
4229 preempt_schedule_common();
4230 return 1;
4231 }
4232 return 0;
4233}
4234EXPORT_SYMBOL(_cond_resched);
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244int __cond_resched_lock(spinlock_t *lock)
4245{
4246 int resched = should_resched();
4247 int ret = 0;
4248
4249 lockdep_assert_held(lock);
4250
4251 if (spin_needbreak(lock) || resched) {
4252 spin_unlock(lock);
4253 if (resched)
4254 preempt_schedule_common();
4255 else
4256 cpu_relax();
4257 ret = 1;
4258 spin_lock(lock);
4259 }
4260 return ret;
4261}
4262EXPORT_SYMBOL(__cond_resched_lock);
4263
4264int __sched __cond_resched_softirq(void)
4265{
4266 BUG_ON(!in_softirq());
4267
4268 if (should_resched()) {
4269 local_bh_enable();
4270 preempt_schedule_common();
4271 local_bh_disable();
4272 return 1;
4273 }
4274 return 0;
4275}
4276EXPORT_SYMBOL(__cond_resched_softirq);
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
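/**
 * yield - yield the current processor to other threads.
 *
 * Heavily discouraged: the scheduler gives no guarantee about when (or
 * whether) the yielding task will run again. Use wait/wake primitives,
 * cond_resched() or yield_to() instead where possible.
 */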
4300void __sched yield(void)
4301{
4302 set_current_state(TASK_RUNNING);
4303 sys_sched_yield();
4304}
4305EXPORT_SYMBOL(yield);
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
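/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * Return: true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */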
4322int __sched yield_to(struct task_struct *p, bool preempt)
4323{
4324 struct task_struct *curr = current;
4325 struct rq *rq, *p_rq;
4326 unsigned long flags;
4327 int yielded = 0;
4328
4329 local_irq_save(flags);
4330 rq = this_rq();
4331
4332again:
4333 p_rq = task_rq(p);
4334
4335
4336
4337
4338 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4339 yielded = -ESRCH;
4340 goto out_irq;
4341 }
4342
4343 double_rq_lock(rq, p_rq);
4344 if (task_rq(p) != p_rq) {
4345 double_rq_unlock(rq, p_rq);
4346 goto again;
4347 }
4348
4349 if (!curr->sched_class->yield_to_task)
4350 goto out_unlock;
4351
4352 if (curr->sched_class != p->sched_class)
4353 goto out_unlock;
4354
4355 if (task_running(p_rq, p) || p->state)
4356 goto out_unlock;
4357
4358 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4359 if (yielded) {
4360 schedstat_inc(rq, yld_count);
4361
4362
4363
4364
4365 if (preempt && rq != p_rq)
4366 resched_curr(p_rq);
4367 }
4368
4369out_unlock:
4370 double_rq_unlock(rq, p_rq);
4371out_irq:
4372 local_irq_restore(flags);
4373
4374 if (yielded > 0)
4375 schedule();
4376
4377 return yielded;
4378}
4379EXPORT_SYMBOL_GPL(yield_to);
4380
4381
4382
4383
4384
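/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */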
4385long __sched io_schedule_timeout(long timeout)
4386{
4387 int old_iowait = current->in_iowait;
4388 struct rq *rq;
4389 long ret;
4390
4391 current->in_iowait = 1;
4392 blk_schedule_flush_plug(current);
4393
4394 delayacct_blkio_start();
4395 rq = raw_rq();
4396 atomic_inc(&rq->nr_iowait);
4397 ret = schedule_timeout(timeout);
4398 current->in_iowait = old_iowait;
4399 atomic_dec(&rq->nr_iowait);
4400 delayacct_blkio_end();
4401
4402 return ret;
4403}
4404EXPORT_SYMBOL(io_schedule_timeout);
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4415{
4416 int ret = -EINVAL;
4417
4418 switch (policy) {
4419 case SCHED_FIFO:
4420 case SCHED_RR:
4421 ret = MAX_USER_RT_PRIO-1;
4422 break;
4423 case SCHED_DEADLINE:
4424 case SCHED_NORMAL:
4425 case SCHED_BATCH:
4426 case SCHED_IDLE:
4427 ret = 0;
4428 break;
4429 }
4430 return ret;
4431}
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4442{
4443 int ret = -EINVAL;
4444
4445 switch (policy) {
4446 case SCHED_FIFO:
4447 case SCHED_RR:
4448 ret = 1;
4449 break;
4450 case SCHED_DEADLINE:
4451 case SCHED_NORMAL:
4452 case SCHED_BATCH:
4453 case SCHED_IDLE:
4454 ret = 0;
4455 }
4456 return ret;
4457}
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4471 struct timespec __user *, interval)
4472{
4473 struct task_struct *p;
4474 unsigned int time_slice;
4475 unsigned long flags;
4476 struct rq *rq;
4477 int retval;
4478 struct timespec t;
4479
4480 if (pid < 0)
4481 return -EINVAL;
4482
4483 retval = -ESRCH;
4484 rcu_read_lock();
4485 p = find_process_by_pid(pid);
4486 if (!p)
4487 goto out_unlock;
4488
4489 retval = security_task_getscheduler(p);
4490 if (retval)
4491 goto out_unlock;
4492
4493 rq = task_rq_lock(p, &flags);
4494 time_slice = 0;
4495 if (p->sched_class->get_rr_interval)
4496 time_slice = p->sched_class->get_rr_interval(rq, p);
4497 task_rq_unlock(rq, p, &flags);
4498
4499 rcu_read_unlock();
4500 jiffies_to_timespec(time_slice, &t);
4501 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4502 return retval;
4503
4504out_unlock:
4505 rcu_read_unlock();
4506 return retval;
4507}
4508
4509static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4510
4511void sched_show_task(struct task_struct *p)
4512{
4513 unsigned long free = 0;
4514 int ppid;
4515 unsigned long state = p->state;
4516
4517 if (state)
4518 state = __ffs(state) + 1;
4519 printk(KERN_INFO "%-15.15s %c", p->comm,
4520 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4521#if BITS_PER_LONG == 32
4522 if (state == TASK_RUNNING)
4523 printk(KERN_CONT " running ");
4524 else
4525 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4526#else
4527 if (state == TASK_RUNNING)
4528 printk(KERN_CONT " running task ");
4529 else
4530 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4531#endif
4532#ifdef CONFIG_DEBUG_STACK_USAGE
4533 free = stack_not_used(p);
4534#endif
4535 ppid = 0;
4536 rcu_read_lock();
4537 if (pid_alive(p))
4538 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4539 rcu_read_unlock();
4540 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4541 task_pid_nr(p), ppid,
4542 (unsigned long)task_thread_info(p)->flags);
4543
4544 print_worker_info(KERN_INFO, p);
4545 show_stack(p, NULL);
4546}
4547
4548void show_state_filter(unsigned long state_filter)
4549{
4550 struct task_struct *g, *p;
4551
#if BITS_PER_LONG == 32
	printk(KERN_INFO
		"  task                PC stack   pid father\n");
#else
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
#endif
4559 rcu_read_lock();
4560 for_each_process_thread(g, p) {
4561
4562
4563
4564
4565 touch_nmi_watchdog();
4566 if (!state_filter || (p->state & state_filter))
4567 sched_show_task(p);
4568 }
4569
4570 touch_all_softlockup_watchdogs();
4571
4572#ifdef CONFIG_SCHED_DEBUG
4573 sysrq_sched_debug_show();
4574#endif
4575 rcu_read_unlock();
4576
4577
4578
4579 if (!state_filter)
4580 debug_show_all_locks();
4581}
4582
4583void init_idle_bootup_task(struct task_struct *idle)
4584{
4585 idle->sched_class = &idle_sched_class;
4586}
4587
4588
4589
4590
4591
4592
4593
4594
4595
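/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */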
4596void init_idle(struct task_struct *idle, int cpu)
4597{
4598 struct rq *rq = cpu_rq(cpu);
4599 unsigned long flags;
4600
4601 raw_spin_lock_irqsave(&rq->lock, flags);
4602
4603 __sched_fork(0, idle);
4604 idle->state = TASK_RUNNING;
4605 idle->se.exec_start = sched_clock();
4606
4607 do_set_cpus_allowed(idle, cpumask_of(cpu));
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618 rcu_read_lock();
4619 __set_task_cpu(idle, cpu);
4620 rcu_read_unlock();
4621
4622 rq->curr = rq->idle = idle;
4623 idle->on_rq = TASK_ON_RQ_QUEUED;
4624#if defined(CONFIG_SMP)
4625 idle->on_cpu = 1;
4626#endif
4627 raw_spin_unlock_irqrestore(&rq->lock, flags);
4628
4629
4630 init_idle_preempt_count(idle, cpu);
4631
4632
4633
4634
4635 idle->sched_class = &idle_sched_class;
4636 ftrace_graph_init_idle_task(idle, cpu);
4637 vtime_init_idle(idle, cpu);
4638#if defined(CONFIG_SMP)
4639 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4640#endif
4641}
4642
4643int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4644 const struct cpumask *trial)
4645{
4646 int ret = 1, trial_cpus;
4647 struct dl_bw *cur_dl_b;
4648 unsigned long flags;
4649
4650 if (!cpumask_weight(cur))
4651 return ret;
4652
4653 rcu_read_lock_sched();
4654 cur_dl_b = dl_bw_of(cpumask_any(cur));
4655 trial_cpus = cpumask_weight(trial);
4656
4657 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
4658 if (cur_dl_b->bw != -1 &&
4659 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
4660 ret = 0;
4661 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
4662 rcu_read_unlock_sched();
4663
4664 return ret;
4665}
4666
4667int task_can_attach(struct task_struct *p,
4668 const struct cpumask *cs_cpus_allowed)
4669{
4670 int ret = 0;
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681 if (p->flags & PF_NO_SETAFFINITY) {
4682 ret = -EINVAL;
4683 goto out;
4684 }
4685
4686#ifdef CONFIG_SMP
4687 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
4688 cs_cpus_allowed)) {
4689 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
4690 cs_cpus_allowed);
4691 struct dl_bw *dl_b;
4692 bool overflow;
4693 int cpus;
4694 unsigned long flags;
4695
4696 rcu_read_lock_sched();
4697 dl_b = dl_bw_of(dest_cpu);
4698 raw_spin_lock_irqsave(&dl_b->lock, flags);
4699 cpus = dl_bw_cpus(dest_cpu);
4700 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
4701 if (overflow)
4702 ret = -EBUSY;
4703 else {
4704
4705
4706
4707
4708
4709
4710 __dl_add(dl_b, p->dl.dl_bw);
4711 }
4712 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
4713 rcu_read_unlock_sched();
4714
4715 }
4716#endif
4717out:
4718 return ret;
4719}
4720
4721#ifdef CONFIG_SMP
4722
4723
4724
4725
4726
4727static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4728{
4729 struct rq *rq = task_rq(p);
4730
4731 lockdep_assert_held(&rq->lock);
4732
4733 dequeue_task(rq, p, 0);
4734 p->on_rq = TASK_ON_RQ_MIGRATING;
4735 set_task_cpu(p, new_cpu);
4736 raw_spin_unlock(&rq->lock);
4737
4738 rq = cpu_rq(new_cpu);
4739
4740 raw_spin_lock(&rq->lock);
4741 BUG_ON(task_cpu(p) != new_cpu);
4742 p->on_rq = TASK_ON_RQ_QUEUED;
4743 enqueue_task(rq, p, 0);
4744 check_preempt_curr(rq, p, 0);
4745
4746 return rq;
4747}
4748
4749void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4750{
4751 if (p->sched_class->set_cpus_allowed)
4752 p->sched_class->set_cpus_allowed(p, new_mask);
4753
4754 cpumask_copy(&p->cpus_allowed, new_mask);
4755 p->nr_cpus_allowed = cpumask_weight(new_mask);
4756}
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
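/*
 * Change a given task's CPU affinity. Migrate the thread to a proper
 * CPU and schedule it away if the CPU it's executing on is removed
 * from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The call is
 * not atomic; no spinlocks may be held.
 */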
4781int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4782{
4783 unsigned long flags;
4784 struct rq *rq;
4785 unsigned int dest_cpu;
4786 int ret = 0;
4787
4788 rq = task_rq_lock(p, &flags);
4789
4790 if (cpumask_equal(&p->cpus_allowed, new_mask))
4791 goto out;
4792
4793 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4794 ret = -EINVAL;
4795 goto out;
4796 }
4797
4798 do_set_cpus_allowed(p, new_mask);
4799
4800
4801 if (cpumask_test_cpu(task_cpu(p), new_mask))
4802 goto out;
4803
4804 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4805 if (task_running(rq, p) || p->state == TASK_WAKING) {
4806 struct migration_arg arg = { p, dest_cpu };
4807
4808 task_rq_unlock(rq, p, &flags);
4809 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4810 tlb_migrate_finish(p->mm);
4811 return 0;
4812 } else if (task_on_rq_queued(p))
4813 rq = move_queued_task(p, dest_cpu);
4814out:
4815 task_rq_unlock(rq, p, &flags);
4816
4817 return ret;
4818}
4819EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
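/*
 * Move (not current) task off this cpu, onto the destination cpu. This
 * is used when the task can no longer run here (affinity change, CPU
 * going down) or when rebalancing it somewhere else.
 *
 * Returns non-zero if the task was successfully migrated.
 */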
4832static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4833{
4834 struct rq *rq;
4835 int ret = 0;
4836
4837 if (unlikely(!cpu_active(dest_cpu)))
4838 return ret;
4839
4840 rq = cpu_rq(src_cpu);
4841
4842 raw_spin_lock(&p->pi_lock);
4843 raw_spin_lock(&rq->lock);
4844
4845 if (task_cpu(p) != src_cpu)
4846 goto done;
4847
4848
4849 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4850 goto fail;
4851
4852
4853
4854
4855
4856 if (task_on_rq_queued(p))
4857 rq = move_queued_task(p, dest_cpu);
4858done:
4859 ret = 1;
4860fail:
4861 raw_spin_unlock(&rq->lock);
4862 raw_spin_unlock(&p->pi_lock);
4863 return ret;
4864}
4865
4866#ifdef CONFIG_NUMA_BALANCING
4867
4868int migrate_task_to(struct task_struct *p, int target_cpu)
4869{
4870 struct migration_arg arg = { p, target_cpu };
4871 int curr_cpu = task_cpu(p);
4872
4873 if (curr_cpu == target_cpu)
4874 return 0;
4875
4876 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4877 return -EINVAL;
4878
4879
4880
4881 trace_sched_move_numa(p, curr_cpu, target_cpu);
4882 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4883}
4884
4885
4886
4887
4888
4889void sched_setnuma(struct task_struct *p, int nid)
4890{
4891 struct rq *rq;
4892 unsigned long flags;
4893 bool queued, running;
4894
4895 rq = task_rq_lock(p, &flags);
4896 queued = task_on_rq_queued(p);
4897 running = task_current(rq, p);
4898
4899 if (queued)
4900 dequeue_task(rq, p, 0);
4901 if (running)
4902 put_prev_task(rq, p);
4903
4904 p->numa_preferred_nid = nid;
4905
4906 if (running)
4907 p->sched_class->set_curr_task(rq);
4908 if (queued)
4909 enqueue_task(rq, p, 0);
4910 task_rq_unlock(rq, p, &flags);
4911}
4912#endif
4913
4914
4915
4916
4917
4918
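/*
 * migration_cpu_stop - this will be executed by a highprio stopper
 * thread and performs thread migration by bumping the thread off its
 * CPU then 'pushing' it onto another runqueue.
 */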
4919static int migration_cpu_stop(void *data)
4920{
4921 struct migration_arg *arg = data;
4922
4923
4924
4925
4926
4927 local_irq_disable();
4928
4929
4930
4931
4932
4933 sched_ttwu_pending();
4934 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4935 local_irq_enable();
4936 return 0;
4937}
4938
4939#ifdef CONFIG_HOTPLUG_CPU
4940
4941
4942
4943
4944
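/*
 * Ensure that the idle task is using init_mm right before its cpu goes
 * offline.
 */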
4945void idle_task_exit(void)
4946{
4947 struct mm_struct *mm = current->active_mm;
4948
4949 BUG_ON(cpu_online(smp_processor_id()));
4950
4951 if (mm != &init_mm) {
4952 switch_mm(mm, &init_mm, current);
4953 finish_arch_post_lock_switch();
4954 }
4955 mmdrop(mm);
4956}
4957
4958
4959
4960
4961
4962
4963
4964
4965static void calc_load_migrate(struct rq *rq)
4966{
4967 long delta = calc_load_fold_active(rq);
4968 if (delta)
4969 atomic_long_add(delta, &calc_load_tasks);
4970}
4971
4972static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4973{
4974}
4975
4976static const struct sched_class fake_sched_class = {
4977 .put_prev_task = put_prev_task_fake,
4978};
4979
4980static struct task_struct fake_task = {
4981
4982
4983
4984 .prio = MAX_PRIO + 1,
4985 .sched_class = &fake_sched_class,
4986};
4987
4988
4989
4990
4991
4992
4993
4994
4995
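/*
 * Migrate all tasks from the rq; sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible: we hold the required locks anyway
 * to keep lock validation happy.
 */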
4996static void migrate_tasks(unsigned int dead_cpu)
4997{
4998 struct rq *rq = cpu_rq(dead_cpu);
4999 struct task_struct *next, *stop = rq->stop;
5000 int dest_cpu;
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011 rq->stop = NULL;
5012
5013
5014
5015
5016
5017
5018 update_rq_clock(rq);
5019
5020 for ( ; ; ) {
5021
5022
5023
5024
5025 if (rq->nr_running == 1)
5026 break;
5027
5028 next = pick_next_task(rq, &fake_task);
5029 BUG_ON(!next);
5030 next->sched_class->put_prev_task(rq, next);
5031
5032
5033 dest_cpu = select_fallback_rq(dead_cpu, next);
5034 raw_spin_unlock(&rq->lock);
5035
5036 __migrate_task(next, dead_cpu, dest_cpu);
5037
5038 raw_spin_lock(&rq->lock);
5039 }
5040
5041 rq->stop = stop;
5042}
5043
5044#endif
5045
5046#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5047
5048static struct ctl_table sd_ctl_dir[] = {
5049 {
5050 .procname = "sched_domain",
5051 .mode = 0555,
5052 },
5053 {}
5054};
5055
5056static struct ctl_table sd_ctl_root[] = {
5057 {
5058 .procname = "kernel",
5059 .mode = 0555,
5060 .child = sd_ctl_dir,
5061 },
5062 {}
5063};
5064
5065static struct ctl_table *sd_alloc_ctl_entry(int n)
5066{
5067 struct ctl_table *entry =
5068 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5069
5070 return entry;
5071}
5072
5073static void sd_free_ctl_entry(struct ctl_table **tablep)
5074{
5075 struct ctl_table *entry;
5076
5077
5078
5079
5080
5081
5082
5083 for (entry = *tablep; entry->mode; entry++) {
5084 if (entry->child)
5085 sd_free_ctl_entry(&entry->child);
5086 if (entry->proc_handler == NULL)
5087 kfree(entry->procname);
5088 }
5089
5090 kfree(*tablep);
5091 *tablep = NULL;
5092}
5093
5094static int min_load_idx = 0;
5095static int max_load_idx = CPU_LOAD_IDX_MAX-1;
5096
5097static void
5098set_table_entry(struct ctl_table *entry,
5099 const char *procname, void *data, int maxlen,
5100 umode_t mode, proc_handler *proc_handler,
5101 bool load_idx)
5102{
5103 entry->procname = procname;
5104 entry->data = data;
5105 entry->maxlen = maxlen;
5106 entry->mode = mode;
5107 entry->proc_handler = proc_handler;
5108
5109 if (load_idx) {
5110 entry->extra1 = &min_load_idx;
5111 entry->extra2 = &max_load_idx;
5112 }
5113}
5114
5115static struct ctl_table *
5116sd_alloc_ctl_domain_table(struct sched_domain *sd)
5117{
5118 struct ctl_table *table = sd_alloc_ctl_entry(14);
5119
5120 if (table == NULL)
5121 return NULL;
5122
5123 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5124 sizeof(long), 0644, proc_doulongvec_minmax, false);
5125 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5126 sizeof(long), 0644, proc_doulongvec_minmax, false);
5127 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5128 sizeof(int), 0644, proc_dointvec_minmax, true);
5129 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5130 sizeof(int), 0644, proc_dointvec_minmax, true);
5131 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5132 sizeof(int), 0644, proc_dointvec_minmax, true);
5133 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5134 sizeof(int), 0644, proc_dointvec_minmax, true);
5135 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5136 sizeof(int), 0644, proc_dointvec_minmax, true);
5137 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5138 sizeof(int), 0644, proc_dointvec_minmax, false);
5139 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5140 sizeof(int), 0644, proc_dointvec_minmax, false);
5141 set_table_entry(&table[9], "cache_nice_tries",
5142 &sd->cache_nice_tries,
5143 sizeof(int), 0644, proc_dointvec_minmax, false);
5144 set_table_entry(&table[10], "flags", &sd->flags,
5145 sizeof(int), 0644, proc_dointvec_minmax, false);
5146 set_table_entry(&table[11], "max_newidle_lb_cost",
5147 &sd->max_newidle_lb_cost,
5148 sizeof(long), 0644, proc_doulongvec_minmax, false);
5149 set_table_entry(&table[12], "name", sd->name,
5150 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5151
5152
5153 return table;
5154}
5155
5156static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5157{
5158 struct ctl_table *entry, *table;
5159 struct sched_domain *sd;
5160 int domain_num = 0, i;
5161 char buf[32];
5162
5163 for_each_domain(cpu, sd)
5164 domain_num++;
5165 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5166 if (table == NULL)
5167 return NULL;
5168
5169 i = 0;
5170 for_each_domain(cpu, sd) {
5171 snprintf(buf, 32, "domain%d", i);
5172 entry->procname = kstrdup(buf, GFP_KERNEL);
5173 entry->mode = 0555;
5174 entry->child = sd_alloc_ctl_domain_table(sd);
5175 entry++;
5176 i++;
5177 }
5178 return table;
5179}
5180
5181static struct ctl_table_header *sd_sysctl_header;
5182static void register_sched_domain_sysctl(void)
5183{
5184 int i, cpu_num = num_possible_cpus();
5185 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5186 char buf[32];
5187
5188 WARN_ON(sd_ctl_dir[0].child);
5189 sd_ctl_dir[0].child = entry;
5190
5191 if (entry == NULL)
5192 return;
5193
5194 for_each_possible_cpu(i) {
5195 snprintf(buf, 32, "cpu%d", i);
5196 entry->procname = kstrdup(buf, GFP_KERNEL);
5197 entry->mode = 0555;
5198 entry->child = sd_alloc_ctl_cpu_table(i);
5199 entry++;
5200 }
5201
5202 WARN_ON(sd_sysctl_header);
5203 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5204}
5205
5206
5207static void unregister_sched_domain_sysctl(void)
5208{
5209 if (sd_sysctl_header)
5210 unregister_sysctl_table(sd_sysctl_header);
5211 sd_sysctl_header = NULL;
5212 if (sd_ctl_dir[0].child)
5213 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5214}
5215#else
5216static void register_sched_domain_sysctl(void)
5217{
5218}
5219static void unregister_sched_domain_sysctl(void)
5220{
5221}
5222#endif
5223
5224static void set_rq_online(struct rq *rq)
5225{
5226 if (!rq->online) {
5227 const struct sched_class *class;
5228
5229 cpumask_set_cpu(rq->cpu, rq->rd->online);
5230 rq->online = 1;
5231
5232 for_each_class(class) {
5233 if (class->rq_online)
5234 class->rq_online(rq);
5235 }
5236 }
5237}
5238
5239static void set_rq_offline(struct rq *rq)
5240{
5241 if (rq->online) {
5242 const struct sched_class *class;
5243
5244 for_each_class(class) {
5245 if (class->rq_offline)
5246 class->rq_offline(rq);
5247 }
5248
5249 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5250 rq->online = 0;
5251 }
5252}
5253
5254
5255
5256
5257
5258static int
5259migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5260{
5261 int cpu = (long)hcpu;
5262 unsigned long flags;
5263 struct rq *rq = cpu_rq(cpu);
5264
5265 switch (action & ~CPU_TASKS_FROZEN) {
5266
5267 case CPU_UP_PREPARE:
5268 rq->calc_load_update = calc_load_update;
5269 break;
5270
5271 case CPU_ONLINE:
5272
5273 raw_spin_lock_irqsave(&rq->lock, flags);
5274 if (rq->rd) {
5275 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5276
5277 set_rq_online(rq);
5278 }
5279 raw_spin_unlock_irqrestore(&rq->lock, flags);
5280 break;
5281
5282#ifdef CONFIG_HOTPLUG_CPU
5283 case CPU_DYING:
5284 sched_ttwu_pending();
5285
5286 raw_spin_lock_irqsave(&rq->lock, flags);
5287 if (rq->rd) {
5288 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5289 set_rq_offline(rq);
5290 }
5291 migrate_tasks(cpu);
5292 BUG_ON(rq->nr_running != 1);
5293 raw_spin_unlock_irqrestore(&rq->lock, flags);
5294 break;
5295
5296 case CPU_DEAD:
5297 calc_load_migrate(rq);
5298 break;
5299#endif
5300 }
5301
5302 update_max_interval();
5303
5304 return NOTIFY_OK;
5305}
5306
5307
5308
5309
5310
5311
5312static struct notifier_block migration_notifier = {
5313 .notifier_call = migration_call,
5314 .priority = CPU_PRI_MIGRATION,
5315};
5316
static void set_cpu_rq_start_time(void)
5318{
5319 int cpu = smp_processor_id();
5320 struct rq *rq = cpu_rq(cpu);
5321 rq->age_stamp = sched_clock_cpu(cpu);
5322}
5323
5324static int sched_cpu_active(struct notifier_block *nfb,
5325 unsigned long action, void *hcpu)
5326{
5327 switch (action & ~CPU_TASKS_FROZEN) {
5328 case CPU_STARTING:
5329 set_cpu_rq_start_time();
5330 return NOTIFY_OK;
5331 case CPU_DOWN_FAILED:
5332 set_cpu_active((long)hcpu, true);
5333 return NOTIFY_OK;
5334 default:
5335 return NOTIFY_DONE;
5336 }
5337}
5338
5339static int sched_cpu_inactive(struct notifier_block *nfb,
5340 unsigned long action, void *hcpu)
5341{
5342 switch (action & ~CPU_TASKS_FROZEN) {
5343 case CPU_DOWN_PREPARE:
5344 set_cpu_active((long)hcpu, false);
5345 return NOTIFY_OK;
5346 default:
5347 return NOTIFY_DONE;
5348 }
5349}
5350
5351static int __init migration_init(void)
5352{
5353 void *cpu = (void *)(long)smp_processor_id();
5354 int err;
5355
5356
5357 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5358 BUG_ON(err == NOTIFY_BAD);
5359 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5360 register_cpu_notifier(&migration_notifier);
5361
5362
5363 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5364 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5365
5366 return 0;
5367}
5368early_initcall(migration_init);
5369#endif
5370
5371#ifdef CONFIG_SMP
5372
5373static cpumask_var_t sched_domains_tmpmask;
5374
5375#ifdef CONFIG_SCHED_DEBUG
5376
5377static __read_mostly int sched_debug_enabled;
5378
5379static int __init sched_debug_setup(char *str)
5380{
5381 sched_debug_enabled = 1;
5382
5383 return 0;
5384}
5385early_param("sched_debug", sched_debug_setup);
5386
5387static inline bool sched_debug(void)
5388{
5389 return sched_debug_enabled;
5390}
5391
5392static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5393 struct cpumask *groupmask)
5394{
5395 struct sched_group *group = sd->groups;
5396
5397 cpumask_clear(groupmask);
5398
5399 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5400
5401 if (!(sd->flags & SD_LOAD_BALANCE)) {
5402 printk("does not load-balance\n");
5403 if (sd->parent)
5404 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5405 " has parent");
5406 return -1;
5407 }
5408
5409 printk(KERN_CONT "span %*pbl level %s\n",
5410 cpumask_pr_args(sched_domain_span(sd)), sd->name);
5411
5412 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5413 printk(KERN_ERR "ERROR: domain->span does not contain "
5414 "CPU%d\n", cpu);
5415 }
5416 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5417 printk(KERN_ERR "ERROR: domain->groups does not contain"
5418 " CPU%d\n", cpu);
5419 }
5420
5421 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5422 do {
5423 if (!group) {
5424 printk("\n");
5425 printk(KERN_ERR "ERROR: group is NULL\n");
5426 break;
5427 }
5428
5429 if (!cpumask_weight(sched_group_cpus(group))) {
5430 printk(KERN_CONT "\n");
5431 printk(KERN_ERR "ERROR: empty group\n");
5432 break;
5433 }
5434
5435 if (!(sd->flags & SD_OVERLAP) &&
5436 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5437 printk(KERN_CONT "\n");
5438 printk(KERN_ERR "ERROR: repeated CPUs\n");
5439 break;
5440 }
5441
5442 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5443
5444 printk(KERN_CONT " %*pbl",
5445 cpumask_pr_args(sched_group_cpus(group)));
5446 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5447 printk(KERN_CONT " (cpu_capacity = %d)",
5448 group->sgc->capacity);
5449 }
5450
5451 group = group->next;
5452 } while (group != sd->groups);
5453 printk(KERN_CONT "\n");
5454
5455 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5456 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5457
5458 if (sd->parent &&
5459 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5460 printk(KERN_ERR "ERROR: parent span is not a superset "
5461 "of domain->span\n");
5462 return 0;
5463}
5464
5465static void sched_domain_debug(struct sched_domain *sd, int cpu)
5466{
5467 int level = 0;
5468
5469 if (!sched_debug_enabled)
5470 return;
5471
5472 if (!sd) {
5473 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5474 return;
5475 }
5476
5477 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5478
5479 for (;;) {
5480 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5481 break;
5482 level++;
5483 sd = sd->parent;
5484 if (!sd)
5485 break;
5486 }
5487}
5488#else
5489# define sched_domain_debug(sd, cpu) do { } while (0)
5490static inline bool sched_debug(void)
5491{
5492 return false;
5493}
5494#endif
5495
5496static int sd_degenerate(struct sched_domain *sd)
5497{
5498 if (cpumask_weight(sched_domain_span(sd)) == 1)
5499 return 1;
5500
5501
5502 if (sd->flags & (SD_LOAD_BALANCE |
5503 SD_BALANCE_NEWIDLE |
5504 SD_BALANCE_FORK |
5505 SD_BALANCE_EXEC |
5506 SD_SHARE_CPUCAPACITY |
5507 SD_SHARE_PKG_RESOURCES |
5508 SD_SHARE_POWERDOMAIN)) {
5509 if (sd->groups != sd->groups->next)
5510 return 0;
5511 }
5512
5513
5514 if (sd->flags & (SD_WAKE_AFFINE))
5515 return 0;
5516
5517 return 1;
5518}
5519
5520static int
5521sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5522{
5523 unsigned long cflags = sd->flags, pflags = parent->flags;
5524
5525 if (sd_degenerate(parent))
5526 return 1;
5527
5528 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5529 return 0;
5530
5531
5532 if (parent->groups == parent->groups->next) {
5533 pflags &= ~(SD_LOAD_BALANCE |
5534 SD_BALANCE_NEWIDLE |
5535 SD_BALANCE_FORK |
5536 SD_BALANCE_EXEC |
5537 SD_SHARE_CPUCAPACITY |
5538 SD_SHARE_PKG_RESOURCES |
5539 SD_PREFER_SIBLING |
5540 SD_SHARE_POWERDOMAIN);
5541 if (nr_node_ids == 1)
5542 pflags &= ~SD_SERIALIZE;
5543 }
5544 if (~cflags & pflags)
5545 return 0;
5546
5547 return 1;
5548}
5549
5550static void free_rootdomain(struct rcu_head *rcu)
5551{
5552 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5553
5554 cpupri_cleanup(&rd->cpupri);
5555 cpudl_cleanup(&rd->cpudl);
5556 free_cpumask_var(rd->dlo_mask);
5557 free_cpumask_var(rd->rto_mask);
5558 free_cpumask_var(rd->online);
5559 free_cpumask_var(rd->span);
5560 kfree(rd);
5561}
5562
5563static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5564{
5565 struct root_domain *old_rd = NULL;
5566 unsigned long flags;
5567
5568 raw_spin_lock_irqsave(&rq->lock, flags);
5569
5570 if (rq->rd) {
5571 old_rd = rq->rd;
5572
5573 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5574 set_rq_offline(rq);
5575
5576 cpumask_clear_cpu(rq->cpu, old_rd->span);
5577
5578
5579
5580
5581
5582
5583 if (!atomic_dec_and_test(&old_rd->refcount))
5584 old_rd = NULL;
5585 }
5586
5587 atomic_inc(&rd->refcount);
5588 rq->rd = rd;
5589
5590 cpumask_set_cpu(rq->cpu, rd->span);
5591 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5592 set_rq_online(rq);
5593
5594 raw_spin_unlock_irqrestore(&rq->lock, flags);
5595
5596 if (old_rd)
5597 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5598}
5599
5600static int init_rootdomain(struct root_domain *rd)
5601{
5602 memset(rd, 0, sizeof(*rd));
5603
5604 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5605 goto out;
5606 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5607 goto free_span;
5608 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5609 goto free_online;
5610 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5611 goto free_dlo_mask;
5612
5613 init_dl_bw(&rd->dl_bw);
5614 if (cpudl_init(&rd->cpudl) != 0)
5615 goto free_dlo_mask;
5616
5617 if (cpupri_init(&rd->cpupri) != 0)
5618 goto free_rto_mask;
5619 return 0;
5620
5621free_rto_mask:
5622 free_cpumask_var(rd->rto_mask);
5623free_dlo_mask:
5624 free_cpumask_var(rd->dlo_mask);
5625free_online:
5626 free_cpumask_var(rd->online);
5627free_span:
5628 free_cpumask_var(rd->span);
5629out:
5630 return -ENOMEM;
5631}
5632
5633
5634
5635
5636
5637struct root_domain def_root_domain;
5638
5639static void init_defrootdomain(void)
5640{
5641 init_rootdomain(&def_root_domain);
5642
5643 atomic_set(&def_root_domain.refcount, 1);
5644}
5645
5646static struct root_domain *alloc_rootdomain(void)
5647{
5648 struct root_domain *rd;
5649
5650 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5651 if (!rd)
5652 return NULL;
5653
5654 if (init_rootdomain(rd) != 0) {
5655 kfree(rd);
5656 return NULL;
5657 }
5658
5659 return rd;
5660}
5661
5662static void free_sched_groups(struct sched_group *sg, int free_sgc)
5663{
5664 struct sched_group *tmp, *first;
5665
5666 if (!sg)
5667 return;
5668
5669 first = sg;
5670 do {
5671 tmp = sg->next;
5672
5673 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
5674 kfree(sg->sgc);
5675
5676 kfree(sg);
5677 sg = tmp;
5678 } while (sg != first);
5679}
5680
5681static void free_sched_domain(struct rcu_head *rcu)
5682{
5683 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5684
5685
5686
5687
5688
5689 if (sd->flags & SD_OVERLAP) {
5690 free_sched_groups(sd->groups, 1);
5691 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5692 kfree(sd->groups->sgc);
5693 kfree(sd->groups);
5694 }
5695 kfree(sd);
5696}
5697
5698static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5699{
5700 call_rcu(&sd->rcu, free_sched_domain);
5701}
5702
5703static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5704{
5705 for (; sd; sd = sd->parent)
5706 destroy_sched_domain(sd, cpu);
5707}
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
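/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (the Last Level Cache domain): this lets
 * select_idle_sibling() avoid pointer chasing. Also keep a unique ID
 * per LLC domain (the first cpu in its span) so cpus_share_cache() can
 * quickly tell whether two CPUs share a cache domain.
 */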
5718DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5719DEFINE_PER_CPU(int, sd_llc_size);
5720DEFINE_PER_CPU(int, sd_llc_id);
5721DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5722DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5723DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5724
5725static void update_top_cache_domain(int cpu)
5726{
5727 struct sched_domain *sd;
5728 struct sched_domain *busy_sd = NULL;
5729 int id = cpu;
5730 int size = 1;
5731
5732 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5733 if (sd) {
5734 id = cpumask_first(sched_domain_span(sd));
5735 size = cpumask_weight(sched_domain_span(sd));
5736 busy_sd = sd->parent;
5737 }
5738 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5739
5740 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5741 per_cpu(sd_llc_size, cpu) = size;
5742 per_cpu(sd_llc_id, cpu) = id;
5743
5744 sd = lowest_flag_domain(cpu, SD_NUMA);
5745 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5746
5747 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
5748 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5749}
5750
5751
5752
5753
5754
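/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */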
5755static void
5756cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5757{
5758 struct rq *rq = cpu_rq(cpu);
5759 struct sched_domain *tmp;
5760
5761
5762 for (tmp = sd; tmp; ) {
5763 struct sched_domain *parent = tmp->parent;
5764 if (!parent)
5765 break;
5766
5767 if (sd_parent_degenerate(tmp, parent)) {
5768 tmp->parent = parent->parent;
5769 if (parent->parent)
5770 parent->parent->child = tmp;
5771
5772
5773
5774
5775
5776 if (parent->flags & SD_PREFER_SIBLING)
5777 tmp->flags |= SD_PREFER_SIBLING;
5778 destroy_sched_domain(parent, cpu);
5779 } else
5780 tmp = tmp->parent;
5781 }
5782
5783 if (sd && sd_degenerate(sd)) {
5784 tmp = sd;
5785 sd = sd->parent;
5786 destroy_sched_domain(tmp, cpu);
5787 if (sd)
5788 sd->child = NULL;
5789 }
5790
5791 sched_domain_debug(sd, cpu);
5792
5793 rq_attach_root(rq, rd);
5794 tmp = rq->sd;
5795 rcu_assign_pointer(rq->sd, sd);
5796 destroy_sched_domains(tmp, cpu);
5797
5798 update_top_cache_domain(cpu);
5799}
5800
5801
5802static int __init isolated_cpu_setup(char *str)
5803{
5804 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5805 cpulist_parse(str, cpu_isolated_map);
5806 return 1;
5807}
5808
5809__setup("isolcpus=", isolated_cpu_setup);
5810
5811struct s_data {
5812 struct sched_domain ** __percpu sd;
5813 struct root_domain *rd;
5814};
5815
5816enum s_alloc {
5817 sa_rootdomain,
5818 sa_sd,
5819 sa_sd_storage,
5820 sa_none,
5821};
5822
/*
 * Build an iteration mask that can exclude certain CPUs from the upwards
 * domain traversal.
 *
 * Asymmetric node setups can result in situations where the domain tree
 * is of unequal depth; make sure to skip domains that already cover the
 * entire range.
 *
 * In that case build_sched_domains() will have terminated the iteration
 * early and our sibling sd spans will be empty.  Domains should always
 * include the CPU they're built on, so check that.
 */
5836static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5837{
5838 const struct cpumask *span = sched_domain_span(sd);
5839 struct sd_data *sdd = sd->private;
5840 struct sched_domain *sibling;
5841 int i;
5842
5843 for_each_cpu(i, span) {
5844 sibling = *per_cpu_ptr(sdd->sd, i);
5845 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5846 continue;
5847
5848 cpumask_set_cpu(i, sched_group_mask(sg));
5849 }
5850}
5851

/*
 * Return the canonical balance CPU for this group: the first CPU of
 * the group that is also set in the group's balance mask.
 */
5856int group_balance_cpu(struct sched_group *sg)
5857{
5858 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5859}
5860
5861static int
5862build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5863{
5864 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5865 const struct cpumask *span = sched_domain_span(sd);
5866 struct cpumask *covered = sched_domains_tmpmask;
5867 struct sd_data *sdd = sd->private;
5868 struct sched_domain *sibling;
5869 int i;
5870
5871 cpumask_clear(covered);
5872
5873 for_each_cpu(i, span) {
5874 struct cpumask *sg_span;
5875
5876 if (cpumask_test_cpu(i, covered))
5877 continue;
5878
5879 sibling = *per_cpu_ptr(sdd->sd, i);
5880
5881
5882 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5883 continue;
5884
5885 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5886 GFP_KERNEL, cpu_to_node(cpu));
5887
5888 if (!sg)
5889 goto fail;
5890
5891 sg_span = sched_group_cpus(sg);
5892 if (sibling->child)
5893 cpumask_copy(sg_span, sched_domain_span(sibling->child));
5894 else
5895 cpumask_set_cpu(i, sg_span);
5896
5897 cpumask_or(covered, covered, sg_span);
5898
5899 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
5900 if (atomic_inc_return(&sg->sgc->ref) == 1)
5901 build_group_mask(sd, sg);
5902
		/*
		 * Initialize sgc->capacity such that even if we mess up the
		 * domains and no possible iteration will get us here, we
		 * won't end up with a zero capacity.
		 */
5908 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
5909
		/*
		 * Make sure the first group of this domain contains the
		 * canonical balance CPU.  Otherwise the sched_domain
		 * iteration breaks; see update_sg_lb_stats().
		 */
5915 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5916 group_balance_cpu(sg) == cpu)
5917 groups = sg;
5918
5919 if (!first)
5920 first = sg;
5921 if (last)
5922 last->next = sg;
5923 last = sg;
5924 last->next = first;
5925 }
5926 sd->groups = groups;
5927
5928 return 0;
5929
5930fail:
5931 free_sched_groups(first, 0);
5932
5933 return -ENOMEM;
5934}
5935
5936static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5937{
5938 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5939 struct sched_domain *child = sd->child;
5940
5941 if (child)
5942 cpu = cpumask_first(sched_domain_span(child));
5943
5944 if (sg) {
5945 *sg = *per_cpu_ptr(sdd->sg, cpu);
5946 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
5947 atomic_set(&(*sg)->sgc->ref, 1);
5948 }
5949
5950 return cpu;
5951}
5952
/*
 * build_sched_groups() builds a circular linked list of the groups
 * covered by the given span, and sets each group's ->cpumask correctly
 * and ->cpu_capacity to zero.
 *
 * Assumes the sched_domain tree is fully constructed.
 */
5960static int
5961build_sched_groups(struct sched_domain *sd, int cpu)
5962{
5963 struct sched_group *first = NULL, *last = NULL;
5964 struct sd_data *sdd = sd->private;
5965 const struct cpumask *span = sched_domain_span(sd);
5966 struct cpumask *covered;
5967 int i;
5968
5969 get_group(cpu, sdd, &sd->groups);
5970 atomic_inc(&sd->groups->ref);
5971
5972 if (cpu != cpumask_first(span))
5973 return 0;
5974
5975 lockdep_assert_held(&sched_domains_mutex);
5976 covered = sched_domains_tmpmask;
5977
5978 cpumask_clear(covered);
5979
5980 for_each_cpu(i, span) {
5981 struct sched_group *sg;
5982 int group, j;
5983
5984 if (cpumask_test_cpu(i, covered))
5985 continue;
5986
5987 group = get_group(i, sdd, &sg);
5988 cpumask_setall(sched_group_mask(sg));
5989
5990 for_each_cpu(j, span) {
5991 if (get_group(j, sdd, NULL) != group)
5992 continue;
5993
5994 cpumask_set_cpu(j, covered);
5995 cpumask_set_cpu(j, sched_group_cpus(sg));
5996 }
5997
5998 if (!first)
5999 first = sg;
6000 if (last)
6001 last->next = sg;
6002 last = sg;
6003 }
6004 last->next = first;
6005
6006 return 0;
6007}
6008
/*
 * Initialize sched groups' cpu_capacity.
 *
 * cpu_capacity indicates the capacity of a sched group, which is used
 * while distributing the load between different sched groups in a
 * sched domain.  Typically cpu_capacity for all the groups in a sched
 * domain will be the same unless there are asymmetries in the topology.
 */
6019static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6020{
6021 struct sched_group *sg = sd->groups;
6022
6023 WARN_ON(!sg);
6024
6025 do {
6026 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6027 sg = sg->next;
6028 } while (sg != sd->groups);
6029
6030 if (cpu != group_balance_cpu(sg))
6031 return;
6032
6033 update_group_capacity(sd, cpu);
6034 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
6035}
6036
/*
 * Default relaxation level for SD_BALANCE_NEWIDLE/SD_BALANCE_WAKE.
 * It can be set with the "relax_domain_level=" boot parameter and is
 * applied per domain by set_domain_attribute() below.
 */
6042static int default_relax_domain_level = -1;
6043int sched_domain_level_max;
6044
6045static int __init setup_relax_domain_level(char *str)
6046{
6047 if (kstrtoint(str, 0, &default_relax_domain_level))
6048 pr_warn("Unable to set relax_domain_level\n");
6049
6050 return 1;
6051}
6052__setup("relax_domain_level=", setup_relax_domain_level);
6053
6054static void set_domain_attribute(struct sched_domain *sd,
6055 struct sched_domain_attr *attr)
6056{
6057 int request;
6058
6059 if (!attr || attr->relax_domain_level < 0) {
6060 if (default_relax_domain_level < 0)
6061 return;
6062 else
6063 request = default_relax_domain_level;
6064 } else
6065 request = attr->relax_domain_level;
6066 if (request < sd->level) {
6067
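		/* Turn off idle/wakeup balancing on this domain: */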
6068 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6069 } else {
6070
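		/* Turn on idle/wakeup balancing on this domain: */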
6071 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6072 }
6073}
6074
6075static void __sdt_free(const struct cpumask *cpu_map);
6076static int __sdt_alloc(const struct cpumask *cpu_map);
6077
6078static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6079 const struct cpumask *cpu_map)
6080{
6081 switch (what) {
6082 case sa_rootdomain:
6083 if (!atomic_read(&d->rd->refcount))
6084 free_rootdomain(&d->rd->rcu);
6085 case sa_sd:
6086 free_percpu(d->sd);
6087 case sa_sd_storage:
6088 __sdt_free(cpu_map);
6089 case sa_none:
6090 break;
6091 }
6092}
6093
6094static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6095 const struct cpumask *cpu_map)
6096{
6097 memset(d, 0, sizeof(*d));
6098
6099 if (__sdt_alloc(cpu_map))
6100 return sa_sd_storage;
6101 d->sd = alloc_percpu(struct sched_domain *);
6102 if (!d->sd)
6103 return sa_sd_storage;
6104 d->rd = alloc_rootdomain();
6105 if (!d->rd)
6106 return sa_sd;
6107 return sa_rootdomain;
6108}
6109
/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structures, so that the subsequent __free_domain_allocs()
 * will not free the data we're still using.
 */
6115static void claim_allocations(int cpu, struct sched_domain *sd)
6116{
6117 struct sd_data *sdd = sd->private;
6118
6119 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6120 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6121
6122 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6123 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6124
6125 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
6126 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
6127}
6128
6129#ifdef CONFIG_NUMA
6130static int sched_domains_numa_levels;
6131enum numa_topology_type sched_numa_topology_type;
6132static int *sched_domains_numa_distance;
6133int sched_max_numa_distance;
6134static struct cpumask ***sched_domains_numa_masks;
6135static int sched_domains_curr_level;
6136#endif
6137
/*
 * SD_flags allowed in topology descriptions.
 *
 * SD_SHARE_CPUCAPACITY   - describes SMT topologies
 * SD_SHARE_PKG_RESOURCES - describes shared caches
 * SD_NUMA                - describes NUMA topologies
 * SD_SHARE_POWERDOMAIN   - describes shared power domains
 *
 * Odd one out:
 * SD_ASYM_PACKING        - describes SMT quirks
 */
6149#define TOPOLOGY_SD_FLAGS \
6150 (SD_SHARE_CPUCAPACITY | \
6151 SD_SHARE_PKG_RESOURCES | \
6152 SD_NUMA | \
6153 SD_ASYM_PACKING | \
6154 SD_SHARE_POWERDOMAIN)
6155
6156static struct sched_domain *
6157sd_init(struct sched_domain_topology_level *tl, int cpu)
6158{
6159 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6160 int sd_weight, sd_flags = 0;
6161
6162#ifdef CONFIG_NUMA
6163
6164
6165
6166 sched_domains_curr_level = tl->numa_level;
6167#endif
6168
6169 sd_weight = cpumask_weight(tl->mask(cpu));
6170
6171 if (tl->sd_flags)
6172 sd_flags = (*tl->sd_flags)();
6173 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6174 "wrong sd_flags in topology description\n"))
6175 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6176
6177 *sd = (struct sched_domain){
6178 .min_interval = sd_weight,
6179 .max_interval = 2*sd_weight,
6180 .busy_factor = 32,
6181 .imbalance_pct = 125,
6182
6183 .cache_nice_tries = 0,
6184 .busy_idx = 0,
6185 .idle_idx = 0,
6186 .newidle_idx = 0,
6187 .wake_idx = 0,
6188 .forkexec_idx = 0,
6189
6190 .flags = 1*SD_LOAD_BALANCE
6191 | 1*SD_BALANCE_NEWIDLE
6192 | 1*SD_BALANCE_EXEC
6193 | 1*SD_BALANCE_FORK
6194 | 0*SD_BALANCE_WAKE
6195 | 1*SD_WAKE_AFFINE
6196 | 0*SD_SHARE_CPUCAPACITY
6197 | 0*SD_SHARE_PKG_RESOURCES
6198 | 0*SD_SERIALIZE
6199 | 0*SD_PREFER_SIBLING
6200 | 0*SD_NUMA
6201 | sd_flags
6202 ,
6203
6204 .last_balance = jiffies,
6205 .balance_interval = sd_weight,
6206 .smt_gain = 0,
6207 .max_newidle_lb_cost = 0,
6208 .next_decay_max_lb_cost = jiffies,
6209#ifdef CONFIG_SCHED_DEBUG
6210 .name = tl->name,
6211#endif
6212 };
6213
6214
6215
6216
6217
6218 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6219 sd->flags |= SD_PREFER_SIBLING;
6220 sd->imbalance_pct = 110;
6221 sd->smt_gain = 1178;
6222
6223 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6224 sd->imbalance_pct = 117;
6225 sd->cache_nice_tries = 1;
6226 sd->busy_idx = 2;
6227
6228#ifdef CONFIG_NUMA
6229 } else if (sd->flags & SD_NUMA) {
6230 sd->cache_nice_tries = 2;
6231 sd->busy_idx = 3;
6232 sd->idle_idx = 2;
6233
6234 sd->flags |= SD_SERIALIZE;
6235 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6236 sd->flags &= ~(SD_BALANCE_EXEC |
6237 SD_BALANCE_FORK |
6238 SD_WAKE_AFFINE);
6239 }
6240
6241#endif
6242 } else {
6243 sd->flags |= SD_PREFER_SIBLING;
6244 sd->cache_nice_tries = 1;
6245 sd->busy_idx = 2;
6246 sd->idle_idx = 1;
6247 }
6248
6249 sd->private = &tl->data;
6250
6251 return sd;
6252}
6253
6254
6255
6256
6257static struct sched_domain_topology_level default_topology[] = {
6258#ifdef CONFIG_SCHED_SMT
6259 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6260#endif
6261#ifdef CONFIG_SCHED_MC
6262 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6263#endif
6264 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6265 { NULL, },
6266};
6267
6268struct sched_domain_topology_level *sched_domain_topology = default_topology;
6269
6270#define for_each_sd_topology(tl) \
6271 for (tl = sched_domain_topology; tl->mask; tl++)
6272
6273void set_sched_topology(struct sched_domain_topology_level *tl)
6274{
6275 sched_domain_topology = tl;
6276}
6277
6278#ifdef CONFIG_NUMA
6279
6280static const struct cpumask *sd_numa_mask(int cpu)
6281{
6282 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6283}
6284
6285static void sched_numa_warn(const char *str)
6286{
6287 static int done = false;
6288 int i,j;
6289
6290 if (done)
6291 return;
6292
6293 done = true;
6294
6295 printk(KERN_WARNING "ERROR: %s\n\n", str);
6296
6297 for (i = 0; i < nr_node_ids; i++) {
6298 printk(KERN_WARNING " ");
6299 for (j = 0; j < nr_node_ids; j++)
6300 printk(KERN_CONT "%02d ", node_distance(i,j));
6301 printk(KERN_CONT "\n");
6302 }
6303 printk(KERN_WARNING "\n");
6304}
6305
6306bool find_numa_distance(int distance)
6307{
6308 int i;
6309
6310 if (distance == node_distance(0, 0))
6311 return true;
6312
6313 for (i = 0; i < sched_domains_numa_levels; i++) {
6314 if (sched_domains_numa_distance[i] == distance)
6315 return true;
6316 }
6317
6318 return false;
6319}
6320
/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
 * NUMA_BACKPLANE: nodes can reach other nodes only through backplane
 *		   controllers
 *
 * The difference between a glueless mesh and a backplane topology lies
 * in whether communication between not directly connected nodes goes
 * through intermediary nodes (where programs could run), or through
 * backplane controllers.  This affects placement of programs.
 *
 * The type of topology can be discerned with the following tests:
 * - If the maximum distance between any nodes is 1 hop, the system
 *   is directly connected.
 * - If for two nodes A and B, located N > 1 hops away from each other,
 *   there is an intermediary node C, which is < N hops away from both
 *   nodes A and B, the system is a glueless mesh.
 */
6340static void init_numa_topology_type(void)
6341{
6342 int a, b, c, n;
6343
6344 n = sched_max_numa_distance;
6345
6346 if (n <= 1)
6347 sched_numa_topology_type = NUMA_DIRECT;
6348
6349 for_each_online_node(a) {
6350 for_each_online_node(b) {
6351
6352 if (node_distance(a, b) < n)
6353 continue;
6354
6355
6356 for_each_online_node(c) {
6357 if (node_distance(a, c) < n &&
6358 node_distance(b, c) < n) {
6359 sched_numa_topology_type =
6360 NUMA_GLUELESS_MESH;
6361 return;
6362 }
6363 }
6364
6365 sched_numa_topology_type = NUMA_BACKPLANE;
6366 return;
6367 }
6368 }
6369}
6370
6371static void sched_init_numa(void)
6372{
6373 int next_distance, curr_distance = node_distance(0, 0);
6374 struct sched_domain_topology_level *tl;
6375 int level = 0;
6376 int i, j, k;
6377
6378 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6379 if (!sched_domains_numa_distance)
6380 return;
6381
6382
6383
6384
6385
6386
6387
6388
6389 next_distance = curr_distance;
6390 for (i = 0; i < nr_node_ids; i++) {
6391 for (j = 0; j < nr_node_ids; j++) {
6392 for (k = 0; k < nr_node_ids; k++) {
6393 int distance = node_distance(i, k);
6394
6395 if (distance > curr_distance &&
6396 (distance < next_distance ||
6397 next_distance == curr_distance))
6398 next_distance = distance;
6399
6400
6401
6402
6403
6404
6405 if (sched_debug() && node_distance(k, i) != distance)
6406 sched_numa_warn("Node-distance not symmetric");
6407
6408 if (sched_debug() && i && !find_numa_distance(distance))
6409 sched_numa_warn("Node-0 not representative");
6410 }
6411 if (next_distance != curr_distance) {
6412 sched_domains_numa_distance[level++] = next_distance;
6413 sched_domains_numa_levels = level;
6414 curr_distance = next_distance;
6415 } else break;
6416 }
6417
6418
6419
6420
6421 if (!sched_debug())
6422 break;
6423 }
6424
6425 if (!level)
6426 return;
6427
	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i,i).
	 *
	 * The sched_domains_numa_distance[] array holds the actual distance
	 * values.
	 */

	/*
	 * Temporarily reset sched_domains_numa_levels to 0: if any of the
	 * allocations below fails we return early, and code iterating over
	 * sched_domains_numa_masks[][] must not assume that 'level' entries
	 * exist.  It is set back to 'level' only once every allocation has
	 * succeeded.
	 */
6445 sched_domains_numa_levels = 0;
6446
6447 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6448 if (!sched_domains_numa_masks)
6449 return;
6450
6451
6452
6453
6454
6455 for (i = 0; i < level; i++) {
6456 sched_domains_numa_masks[i] =
6457 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6458 if (!sched_domains_numa_masks[i])
6459 return;
6460
6461 for (j = 0; j < nr_node_ids; j++) {
6462 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6463 if (!mask)
6464 return;
6465
6466 sched_domains_numa_masks[i][j] = mask;
6467
6468 for (k = 0; k < nr_node_ids; k++) {
6469 if (node_distance(j, k) > sched_domains_numa_distance[i])
6470 continue;
6471
6472 cpumask_or(mask, mask, cpumask_of_node(k));
6473 }
6474 }
6475 }
6476
6477
6478 for (i = 0; sched_domain_topology[i].mask; i++);
6479
6480 tl = kzalloc((i + level + 1) *
6481 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6482 if (!tl)
6483 return;
6484
6485
6486
6487
6488 for (i = 0; sched_domain_topology[i].mask; i++)
6489 tl[i] = sched_domain_topology[i];
6490
6491
6492
6493
6494 for (j = 0; j < level; i++, j++) {
6495 tl[i] = (struct sched_domain_topology_level){
6496 .mask = sd_numa_mask,
6497 .sd_flags = cpu_numa_flags,
6498 .flags = SDTL_OVERLAP,
6499 .numa_level = j,
6500 SD_INIT_NAME(NUMA)
6501 };
6502 }
6503
6504 sched_domain_topology = tl;
6505
6506 sched_domains_numa_levels = level;
6507 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6508
6509 init_numa_topology_type();
6510}
6511
6512static void sched_domains_numa_masks_set(int cpu)
6513{
6514 int i, j;
6515 int node = cpu_to_node(cpu);
6516
6517 for (i = 0; i < sched_domains_numa_levels; i++) {
6518 for (j = 0; j < nr_node_ids; j++) {
6519 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6520 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6521 }
6522 }
6523}
6524
6525static void sched_domains_numa_masks_clear(int cpu)
6526{
6527 int i, j;
6528 for (i = 0; i < sched_domains_numa_levels; i++) {
6529 for (j = 0; j < nr_node_ids; j++)
6530 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6531 }
6532}
6533
6534
6535
6536
6537
6538static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6539 unsigned long action,
6540 void *hcpu)
6541{
6542 int cpu = (long)hcpu;
6543
6544 switch (action & ~CPU_TASKS_FROZEN) {
6545 case CPU_ONLINE:
6546 sched_domains_numa_masks_set(cpu);
6547 break;
6548
6549 case CPU_DEAD:
6550 sched_domains_numa_masks_clear(cpu);
6551 break;
6552
6553 default:
6554 return NOTIFY_DONE;
6555 }
6556
6557 return NOTIFY_OK;
6558}
6559#else
6560static inline void sched_init_numa(void)
6561{
6562}
6563
6564static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6565 unsigned long action,
6566 void *hcpu)
6567{
6568 return 0;
6569}
6570#endif
6571
6572static int __sdt_alloc(const struct cpumask *cpu_map)
6573{
6574 struct sched_domain_topology_level *tl;
6575 int j;
6576
6577 for_each_sd_topology(tl) {
6578 struct sd_data *sdd = &tl->data;
6579
6580 sdd->sd = alloc_percpu(struct sched_domain *);
6581 if (!sdd->sd)
6582 return -ENOMEM;
6583
6584 sdd->sg = alloc_percpu(struct sched_group *);
6585 if (!sdd->sg)
6586 return -ENOMEM;
6587
6588 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
6589 if (!sdd->sgc)
6590 return -ENOMEM;
6591
6592 for_each_cpu(j, cpu_map) {
6593 struct sched_domain *sd;
6594 struct sched_group *sg;
6595 struct sched_group_capacity *sgc;
6596
6597 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6598 GFP_KERNEL, cpu_to_node(j));
6599 if (!sd)
6600 return -ENOMEM;
6601
6602 *per_cpu_ptr(sdd->sd, j) = sd;
6603
6604 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6605 GFP_KERNEL, cpu_to_node(j));
6606 if (!sg)
6607 return -ENOMEM;
6608
6609 sg->next = sg;
6610
6611 *per_cpu_ptr(sdd->sg, j) = sg;
6612
6613 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
6614 GFP_KERNEL, cpu_to_node(j));
6615 if (!sgc)
6616 return -ENOMEM;
6617
6618 *per_cpu_ptr(sdd->sgc, j) = sgc;
6619 }
6620 }
6621
6622 return 0;
6623}
6624
6625static void __sdt_free(const struct cpumask *cpu_map)
6626{
6627 struct sched_domain_topology_level *tl;
6628 int j;
6629
6630 for_each_sd_topology(tl) {
6631 struct sd_data *sdd = &tl->data;
6632
6633 for_each_cpu(j, cpu_map) {
6634 struct sched_domain *sd;
6635
6636 if (sdd->sd) {
6637 sd = *per_cpu_ptr(sdd->sd, j);
6638 if (sd && (sd->flags & SD_OVERLAP))
6639 free_sched_groups(sd->groups, 0);
6640 kfree(*per_cpu_ptr(sdd->sd, j));
6641 }
6642
6643 if (sdd->sg)
6644 kfree(*per_cpu_ptr(sdd->sg, j));
6645 if (sdd->sgc)
6646 kfree(*per_cpu_ptr(sdd->sgc, j));
6647 }
6648 free_percpu(sdd->sd);
6649 sdd->sd = NULL;
6650 free_percpu(sdd->sg);
6651 sdd->sg = NULL;
6652 free_percpu(sdd->sgc);
6653 sdd->sgc = NULL;
6654 }
6655}
6656
6657struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6658 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6659 struct sched_domain *child, int cpu)
6660{
6661 struct sched_domain *sd = sd_init(tl, cpu);
6662 if (!sd)
6663 return child;
6664
6665 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6666 if (child) {
6667 sd->level = child->level + 1;
6668 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6669 child->parent = sd;
6670 sd->child = child;
6671
6672 if (!cpumask_subset(sched_domain_span(child),
6673 sched_domain_span(sd))) {
6674 pr_err("BUG: arch topology borken\n");
6675#ifdef CONFIG_SCHED_DEBUG
6676 pr_err(" the %s domain not a subset of the %s domain\n",
6677 child->name, sd->name);
6678#endif
6679
6680 cpumask_or(sched_domain_span(sd),
6681 sched_domain_span(sd),
6682 sched_domain_span(child));
6683 }
6684
6685 }
6686 set_domain_attribute(sd, attr);
6687
6688 return sd;
6689}
6690
6691
6692
6693
6694
6695static int build_sched_domains(const struct cpumask *cpu_map,
6696 struct sched_domain_attr *attr)
6697{
6698 enum s_alloc alloc_state;
6699 struct sched_domain *sd;
6700 struct s_data d;
6701 int i, ret = -ENOMEM;
6702
6703 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6704 if (alloc_state != sa_rootdomain)
6705 goto error;
6706
6707
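	/* Set up domains for the CPUs specified by cpu_map: */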
6708 for_each_cpu(i, cpu_map) {
6709 struct sched_domain_topology_level *tl;
6710
6711 sd = NULL;
6712 for_each_sd_topology(tl) {
6713 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6714 if (tl == sched_domain_topology)
6715 *per_cpu_ptr(d.sd, i) = sd;
6716 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6717 sd->flags |= SD_OVERLAP;
6718 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6719 break;
6720 }
6721 }
6722
6723
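	/* Build the groups for the domains: */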
6724 for_each_cpu(i, cpu_map) {
6725 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6726 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6727 if (sd->flags & SD_OVERLAP) {
6728 if (build_overlap_sched_groups(sd, i))
6729 goto error;
6730 } else {
6731 if (build_sched_groups(sd, i))
6732 goto error;
6733 }
6734 }
6735 }
6736
6737
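	/* Calculate CPU capacity for physical packages and nodes: */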
6738 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6739 if (!cpumask_test_cpu(i, cpu_map))
6740 continue;
6741
6742 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6743 claim_allocations(i, sd);
6744 init_sched_groups_capacity(i, sd);
6745 }
6746 }
6747
6748
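	/* Attach the domains: */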
6749 rcu_read_lock();
6750 for_each_cpu(i, cpu_map) {
6751 sd = *per_cpu_ptr(d.sd, i);
6752 cpu_attach_domain(sd, d.rd, i);
6753 }
6754 rcu_read_unlock();
6755
6756 ret = 0;
6757error:
6758 __free_domain_allocs(&d, alloc_state, cpu_map);
6759 return ret;
6760}
6761
6762static cpumask_var_t *doms_cur;
6763static int ndoms_cur;
6764static struct sched_domain_attr *dattr_cur;
6765
/*
 * Special case: if a kmalloc() of a doms_cur partition (array of
 * cpumasks) fails, fall back to a single sched domain, as determined
 * by the single cpumask fallback_doms.
 */
6772static cpumask_var_t fallback_doms;
6773
/*
 * arch_update_cpu_topology() lets virtualized architectures update the
 * CPU core maps.  It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
6779int __weak arch_update_cpu_topology(void)
6780{
6781 return 0;
6782}
6783
6784cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6785{
6786 int i;
6787 cpumask_var_t *doms;
6788
6789 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6790 if (!doms)
6791 return NULL;
6792 for (i = 0; i < ndoms; i++) {
6793 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6794 free_sched_domains(doms, i);
6795 return NULL;
6796 }
6797 }
6798 return doms;
6799}
6800
6801void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6802{
6803 unsigned int i;
6804 for (i = 0; i < ndoms; i++)
6805 free_cpumask_var(doms[i]);
6806 kfree(doms);
6807}
6808
6809
6810
6811
6812
6813
6814static int init_sched_domains(const struct cpumask *cpu_map)
6815{
6816 int err;
6817
6818 arch_update_cpu_topology();
6819 ndoms_cur = 1;
6820 doms_cur = alloc_sched_domains(ndoms_cur);
6821 if (!doms_cur)
6822 doms_cur = &fallback_doms;
6823 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6824 err = build_sched_domains(doms_cur[0], NULL);
6825 register_sched_domain_sysctl();
6826
6827 return err;
6828}
6829
6830
6831
6832
6833
6834static void detach_destroy_domains(const struct cpumask *cpu_map)
6835{
6836 int i;
6837
6838 rcu_read_lock();
6839 for_each_cpu(i, cpu_map)
6840 cpu_attach_domain(NULL, &def_root_domain, i);
6841 rcu_read_unlock();
6842}
6843
6844
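/* Compare two sched_domain_attr entries; NULL means the default attributes. */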
6845static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6846 struct sched_domain_attr *new, int idx_new)
6847{
6848 struct sched_domain_attr tmp;
6849
6850
6851 if (!new && !cur)
6852 return 1;
6853
6854 tmp = SD_ATTR_INIT;
6855 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6856 new ? (new + idx_new) : &tmp,
6857 sizeof(struct sched_domain_attr));
6858}
6859
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks.  This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap); we should set up one
 * sched domain for each mask.  CPUs not in any of the cpumasks will
 * not be load balanced.  If the same cpumask appears both in the
 * current 'doms_cur' domain partitioning and also in the new
 * 'doms_new', we can leave it unchanged.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains().  This routine takes ownership of it and will
 * free_sched_domains() it when done with it.  If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with the hotplug lock held.
 */
6886void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6887 struct sched_domain_attr *dattr_new)
6888{
6889 int i, j, n;
6890 int new_topology;
6891
6892 mutex_lock(&sched_domains_mutex);
6893
6894
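	/* Always unregister the sysctl entries; they are re-registered below. */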
6895 unregister_sched_domain_sysctl();
6896
6897
6898 new_topology = arch_update_cpu_topology();
6899
6900 n = doms_new ? ndoms_new : 0;
6901
6902
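	/* Destroy domains that are in doms_cur but not in the new set: */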
6903 for (i = 0; i < ndoms_cur; i++) {
6904 for (j = 0; j < n && !new_topology; j++) {
6905 if (cpumask_equal(doms_cur[i], doms_new[j])
6906 && dattrs_equal(dattr_cur, i, dattr_new, j))
6907 goto match1;
6908 }
6909
6910 detach_destroy_domains(doms_cur[i]);
6911match1:
6912 ;
6913 }
6914
6915 n = ndoms_cur;
6916 if (doms_new == NULL) {
6917 n = 0;
6918 doms_new = &fallback_doms;
6919 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6920 WARN_ON_ONCE(dattr_new);
6921 }
6922
6923
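	/* Build new domains that were not already present: */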
6924 for (i = 0; i < ndoms_new; i++) {
6925 for (j = 0; j < n && !new_topology; j++) {
6926 if (cpumask_equal(doms_new[i], doms_cur[j])
6927 && dattrs_equal(dattr_new, i, dattr_cur, j))
6928 goto match2;
6929 }
6930
6931 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6932match2:
6933 ;
6934 }
6935
6936
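	/* Remember the new sched domain partitioning: */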
6937 if (doms_cur != &fallback_doms)
6938 free_sched_domains(doms_cur, ndoms_cur);
6939 kfree(dattr_cur);
6940 doms_cur = doms_new;
6941 dattr_cur = dattr_new;
6942 ndoms_cur = ndoms_new;
6943
6944 register_sched_domain_sysctl();
6945
6946 mutex_unlock(&sched_domains_mutex);
6947}
6948
6949static int num_cpus_frozen;
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6960 void *hcpu)
6961{
6962 switch (action) {
6963 case CPU_ONLINE_FROZEN:
6964 case CPU_DOWN_FAILED_FROZEN:
6965
6966
6967
6968
6969
6970
6971
6972 num_cpus_frozen--;
6973 if (likely(num_cpus_frozen)) {
6974 partition_sched_domains(1, NULL, NULL);
6975 break;
6976 }
6977
6978
6979
6980
6981
6982
6983
6984 case CPU_ONLINE:
6985 cpuset_update_active_cpus(true);
6986 break;
6987 default:
6988 return NOTIFY_DONE;
6989 }
6990 return NOTIFY_OK;
6991}
6992
6993static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6994 void *hcpu)
6995{
6996 unsigned long flags;
6997 long cpu = (long)hcpu;
6998 struct dl_bw *dl_b;
6999 bool overflow;
7000 int cpus;
7001
7002 switch (action) {
7003 case CPU_DOWN_PREPARE:
7004 rcu_read_lock_sched();
7005 dl_b = dl_bw_of(cpu);
7006
7007 raw_spin_lock_irqsave(&dl_b->lock, flags);
7008 cpus = dl_bw_cpus(cpu);
7009 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7010 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7011
7012 rcu_read_unlock_sched();
7013
7014 if (overflow)
7015 return notifier_from_errno(-EBUSY);
7016 cpuset_update_active_cpus(false);
7017 break;
7018 case CPU_DOWN_PREPARE_FROZEN:
7019 num_cpus_frozen++;
7020 partition_sched_domains(1, NULL, NULL);
7021 break;
7022 default:
7023 return NOTIFY_DONE;
7024 }
7025 return NOTIFY_OK;
7026}
7027
7028void __init sched_init_smp(void)
7029{
7030 cpumask_var_t non_isolated_cpus;
7031
7032 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7033 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7034
7035 sched_init_numa();
7036
7037
7038
7039
7040
7041
7042 mutex_lock(&sched_domains_mutex);
7043 init_sched_domains(cpu_active_mask);
7044 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7045 if (cpumask_empty(non_isolated_cpus))
7046 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7047 mutex_unlock(&sched_domains_mutex);
7048
7049 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
7050 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7051 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7052
7053 init_hrtick();
7054
7055
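	/* Move the init task off any isolated CPUs: */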
7056 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7057 BUG();
7058 sched_init_granularity();
7059 free_cpumask_var(non_isolated_cpus);
7060
7061 init_sched_rt_class();
7062 init_sched_dl_class();
7063}
7064#else
7065void __init sched_init_smp(void)
7066{
7067 sched_init_granularity();
7068}
7069#endif
7070
7071const_debug unsigned int sysctl_timer_migration = 1;
7072
7073int in_sched_functions(unsigned long addr)
7074{
7075 return in_lock_functions(addr) ||
7076 (addr >= (unsigned long)__sched_text_start
7077 && addr < (unsigned long)__sched_text_end);
7078}
7079
7080#ifdef CONFIG_CGROUP_SCHED

/*
 * Default task group.
 * Every task in the system belongs to this group at bootup.
 */
7085struct task_group root_task_group;
7086LIST_HEAD(task_groups);
7087#endif
7088
7089DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7090
7091void __init sched_init(void)
7092{
7093 int i, j;
7094 unsigned long alloc_size = 0, ptr;
7095
7096#ifdef CONFIG_FAIR_GROUP_SCHED
7097 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7098#endif
7099#ifdef CONFIG_RT_GROUP_SCHED
7100 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7101#endif
7102 if (alloc_size) {
7103 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7104
7105#ifdef CONFIG_FAIR_GROUP_SCHED
7106 root_task_group.se = (struct sched_entity **)ptr;
7107 ptr += nr_cpu_ids * sizeof(void **);
7108
7109 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7110 ptr += nr_cpu_ids * sizeof(void **);
7111
7112#endif
7113#ifdef CONFIG_RT_GROUP_SCHED
7114 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7115 ptr += nr_cpu_ids * sizeof(void **);
7116
7117 root_task_group.rt_rq = (struct rt_rq **)ptr;
7118 ptr += nr_cpu_ids * sizeof(void **);
7119
7120#endif
7121 }
7122#ifdef CONFIG_CPUMASK_OFFSTACK
7123 for_each_possible_cpu(i) {
7124 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7125 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7126 }
7127#endif
7128
7129 init_rt_bandwidth(&def_rt_bandwidth,
7130 global_rt_period(), global_rt_runtime());
7131 init_dl_bandwidth(&def_dl_bandwidth,
7132 global_rt_period(), global_rt_runtime());
7133
7134#ifdef CONFIG_SMP
7135 init_defrootdomain();
7136#endif
7137
7138#ifdef CONFIG_RT_GROUP_SCHED
7139 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7140 global_rt_period(), global_rt_runtime());
7141#endif
7142
7143#ifdef CONFIG_CGROUP_SCHED
7144 list_add(&root_task_group.list, &task_groups);
7145 INIT_LIST_HEAD(&root_task_group.children);
7146 INIT_LIST_HEAD(&root_task_group.siblings);
7147 autogroup_init(&init_task);
7148
7149#endif
7150
7151 for_each_possible_cpu(i) {
7152 struct rq *rq;
7153
7154 rq = cpu_rq(i);
7155 raw_spin_lock_init(&rq->lock);
7156 rq->nr_running = 0;
7157 rq->calc_load_active = 0;
7158 rq->calc_load_update = jiffies + LOAD_FREQ;
7159 init_cfs_rq(&rq->cfs);
7160 init_rt_rq(&rq->rt);
7161 init_dl_rq(&rq->dl);
7162#ifdef CONFIG_FAIR_GROUP_SCHED
7163 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7164 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7165
		/*
		 * How much CPU bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed through the cgroup filesystem,
		 * it gets 100% of the CPU resources in the system.  This
		 * overall system CPU resource is divided among the tasks of
		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task's or task-group's) weight
		 * (se->load.weight).
		 *
		 * In other words, if root_task_group has 10 tasks of weight
		 * 1024 and two child groups A0 and A1 (of weight 1024 each),
		 * then A0's share of the CPU resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */
7184 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
7185 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7186#endif
7187
7188 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7189#ifdef CONFIG_RT_GROUP_SCHED
7190 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7191#endif
7192
7193 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7194 rq->cpu_load[j] = 0;
7195
7196 rq->last_load_update_tick = jiffies;
7197
7198#ifdef CONFIG_SMP
7199 rq->sd = NULL;
7200 rq->rd = NULL;
7201 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
7202 rq->post_schedule = 0;
7203 rq->active_balance = 0;
7204 rq->next_balance = jiffies;
7205 rq->push_cpu = 0;
7206 rq->cpu = i;
7207 rq->online = 0;
7208 rq->idle_stamp = 0;
7209 rq->avg_idle = 2*sysctl_sched_migration_cost;
7210 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
7211
7212 INIT_LIST_HEAD(&rq->cfs_tasks);
7213
7214 rq_attach_root(rq, &def_root_domain);
7215#ifdef CONFIG_NO_HZ_COMMON
7216 rq->nohz_flags = 0;
7217#endif
7218#ifdef CONFIG_NO_HZ_FULL
7219 rq->last_sched_tick = 0;
7220#endif
7221#endif
7222 init_rq_hrtick(rq);
7223 atomic_set(&rq->nr_iowait, 0);
7224 }
7225
7226 set_load_weight(&init_task);
7227
7228#ifdef CONFIG_PREEMPT_NOTIFIERS
7229 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7230#endif
7231
7232
7233
7234
7235 atomic_inc(&init_mm.mm_count);
7236 enter_lazy_tlb(&init_mm, current);
7237
7238
7239
7240
7241 current->sched_class = &fair_sched_class;
7242
7243
7244
7245
7246
7247
7248
7249 init_idle(current, smp_processor_id());
7250
7251 calc_load_update = jiffies + LOAD_FREQ;
7252
7253#ifdef CONFIG_SMP
7254 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7255
7256 if (cpu_isolated_map == NULL)
7257 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7258 idle_thread_set_boot_cpu();
7259 set_cpu_rq_start_time();
7260#endif
7261 init_sched_fair_class();
7262
7263 scheduler_running = 1;
7264}
7265
7266#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7267static inline int preempt_count_equals(int preempt_offset)
7268{
7269 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7270
7271 return (nested == preempt_offset);
7272}
7273
7274void __might_sleep(const char *file, int line, int preempt_offset)
7275{
7276
7277
7278
7279
7280
7281 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
7282 "do not call blocking ops when !TASK_RUNNING; "
7283 "state=%lx set at [<%p>] %pS\n",
7284 current->state,
7285 (void *)current->task_state_change,
7286 (void *)current->task_state_change);
7287
7288 ___might_sleep(file, line, preempt_offset);
7289}
7290EXPORT_SYMBOL(__might_sleep);
7291
7292void ___might_sleep(const char *file, int line, int preempt_offset)
7293{
7294 static unsigned long prev_jiffy;
7295
7296 rcu_sleep_check();
7297 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
7298 !is_idle_task(current)) ||
7299 system_state != SYSTEM_RUNNING || oops_in_progress)
7300 return;
7301 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7302 return;
7303 prev_jiffy = jiffies;
7304
7305 printk(KERN_ERR
7306 "BUG: sleeping function called from invalid context at %s:%d\n",
7307 file, line);
7308 printk(KERN_ERR
7309 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7310 in_atomic(), irqs_disabled(),
7311 current->pid, current->comm);
7312
7313 if (task_stack_end_corrupted(current))
7314 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7315
7316 debug_show_held_locks(current);
7317 if (irqs_disabled())
7318 print_irqtrace_events(current);
7319#ifdef CONFIG_DEBUG_PREEMPT
7320 if (!preempt_count_equals(preempt_offset)) {
7321 pr_err("Preemption disabled at:");
7322 print_ip_sym(current->preempt_disable_ip);
7323 pr_cont("\n");
7324 }
7325#endif
7326 dump_stack();
7327}
7328EXPORT_SYMBOL(___might_sleep);
7329#endif
7330
7331#ifdef CONFIG_MAGIC_SYSRQ
7332static void normalize_task(struct rq *rq, struct task_struct *p)
7333{
7334 const struct sched_class *prev_class = p->sched_class;
7335 struct sched_attr attr = {
7336 .sched_policy = SCHED_NORMAL,
7337 };
7338 int old_prio = p->prio;
7339 int queued;
7340
7341 queued = task_on_rq_queued(p);
7342 if (queued)
7343 dequeue_task(rq, p, 0);
7344 __setscheduler(rq, p, &attr, false);
7345 if (queued) {
7346 enqueue_task(rq, p, 0);
7347 resched_curr(rq);
7348 }
7349
7350 check_class_changed(rq, p, prev_class, old_prio);
7351}
7352
7353void normalize_rt_tasks(void)
7354{
7355 struct task_struct *g, *p;
7356 unsigned long flags;
7357 struct rq *rq;
7358
7359 read_lock(&tasklist_lock);
7360 for_each_process_thread(g, p) {
7361
7362
7363
7364 if (p->flags & PF_KTHREAD)
7365 continue;
7366
7367 p->se.exec_start = 0;
7368#ifdef CONFIG_SCHEDSTATS
7369 p->se.statistics.wait_start = 0;
7370 p->se.statistics.sleep_start = 0;
7371 p->se.statistics.block_start = 0;
7372#endif
7373
7374 if (!dl_task(p) && !rt_task(p)) {
7375
7376
7377
7378
7379 if (task_nice(p) < 0)
7380 set_user_nice(p, 0);
7381 continue;
7382 }
7383
7384 rq = task_rq_lock(p, &flags);
7385 normalize_task(rq, p);
7386 task_rq_unlock(rq, p, &flags);
7387 }
7388 read_unlock(&tasklist_lock);
7389}
7390
7391#endif
7392
7393#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place.  Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/*
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7412struct task_struct *curr_task(int cpu)
7413{
7414 return cpu_curr(cpu);
7415}
7416
7417#endif
7418
7419#ifdef CONFIG_IA64
/*
 * set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * This function must only be used when non-maskable interrupts are
 * serviced on a separate stack.  It allows the architecture to switch
 * the notion of the current task on a CPU in a non-blocking manner.
 * It must be called with all CPUs synchronized and interrupts disabled;
 * the caller must save the original value of the current task (see
 * curr_task() above) and restore it before re-enabling interrupts and
 * restarting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7435void set_curr_task(int cpu, struct task_struct *p)
7436{
7437 cpu_curr(cpu) = p;
7438}
7439
7440#endif
7441
7442#ifdef CONFIG_CGROUP_SCHED
7443
7444static DEFINE_SPINLOCK(task_group_lock);
7445
7446static void free_sched_group(struct task_group *tg)
7447{
7448 free_fair_sched_group(tg);
7449 free_rt_sched_group(tg);
7450 autogroup_free(tg);
7451 kfree(tg);
7452}
7453
7454
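/* Allocate and initialize a new task group (runqueues etc.): */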
7455struct task_group *sched_create_group(struct task_group *parent)
7456{
7457 struct task_group *tg;
7458
7459 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7460 if (!tg)
7461 return ERR_PTR(-ENOMEM);
7462
7463 if (!alloc_fair_sched_group(tg, parent))
7464 goto err;
7465
7466 if (!alloc_rt_sched_group(tg, parent))
7467 goto err;
7468
7469 return tg;
7470
7471err:
7472 free_sched_group(tg);
7473 return ERR_PTR(-ENOMEM);
7474}
7475
7476void sched_online_group(struct task_group *tg, struct task_group *parent)
7477{
7478 unsigned long flags;
7479
7480 spin_lock_irqsave(&task_group_lock, flags);
7481 list_add_rcu(&tg->list, &task_groups);
7482
7483 WARN_ON(!parent);
7484
7485 tg->parent = parent;
7486 INIT_LIST_HEAD(&tg->children);
7487 list_add_rcu(&tg->siblings, &parent->children);
7488 spin_unlock_irqrestore(&task_group_lock, flags);
7489}
7490
7491
7492static void free_sched_group_rcu(struct rcu_head *rhp)
7493{
7494
7495 free_sched_group(container_of(rhp, struct task_group, rcu));
7496}
7497
7498
7499void sched_destroy_group(struct task_group *tg)
7500{
7501
7502 call_rcu(&tg->rcu, free_sched_group_rcu);
7503}
7504
7505void sched_offline_group(struct task_group *tg)
7506{
7507 unsigned long flags;
7508 int i;
7509
7510
7511 for_each_possible_cpu(i)
7512 unregister_fair_sched_group(tg, i);
7513
7514 spin_lock_irqsave(&task_group_lock, flags);
7515 list_del_rcu(&tg->list);
7516 list_del_rcu(&tg->siblings);
7517 spin_unlock_irqrestore(&task_group_lock, flags);
7518}
7519

/*
 * Change a task's runqueue association when it moves between task
 * groups (cgroup migration, autogroup changes): dequeue it, switch
 * sched_task_group and the per-CPU runqueue pointers, then requeue it.
 */
7525void sched_move_task(struct task_struct *tsk)
7526{
7527 struct task_group *tg;
7528 int queued, running;
7529 unsigned long flags;
7530 struct rq *rq;
7531
7532 rq = task_rq_lock(tsk, &flags);
7533
7534 running = task_current(rq, tsk);
7535 queued = task_on_rq_queued(tsk);
7536
7537 if (queued)
7538 dequeue_task(rq, tsk, 0);
7539 if (unlikely(running))
7540 put_prev_task(rq, tsk);
7541
7542
7543
7544
7545
7546
7547 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
7548 struct task_group, css);
7549 tg = autogroup_task_group(tsk, tg);
7550 tsk->sched_task_group = tg;
7551
7552#ifdef CONFIG_FAIR_GROUP_SCHED
7553 if (tsk->sched_class->task_move_group)
7554 tsk->sched_class->task_move_group(tsk, queued);
7555 else
7556#endif
7557 set_task_rq(tsk, task_cpu(tsk));
7558
7559 if (unlikely(running))
7560 tsk->sched_class->set_curr_task(rq);
7561 if (queued)
7562 enqueue_task(rq, tsk, 0);
7563
7564 task_rq_unlock(rq, tsk, &flags);
7565}
7566#endif
7567
7568#ifdef CONFIG_RT_GROUP_SCHED
7569
7570
7571
7572static DEFINE_MUTEX(rt_constraints_mutex);
7573
7574
7575static inline int tg_has_rt_tasks(struct task_group *tg)
7576{
7577 struct task_struct *g, *p;
7578
7579
7580
7581
7582 if (task_group_is_autogroup(tg))
7583 return 0;
7584
7585 for_each_process_thread(g, p) {
7586 if (rt_task(p) && task_group(p) == tg)
7587 return 1;
7588 }
7589
7590 return 0;
7591}
7592
7593struct rt_schedulable_data {
7594 struct task_group *tg;
7595 u64 rt_period;
7596 u64 rt_runtime;
7597};
7598
7599static int tg_rt_schedulable(struct task_group *tg, void *data)
7600{
7601 struct rt_schedulable_data *d = data;
7602 struct task_group *child;
7603 unsigned long total, sum = 0;
7604 u64 period, runtime;
7605
7606 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7607 runtime = tg->rt_bandwidth.rt_runtime;
7608
7609 if (tg == d->tg) {
7610 period = d->rt_period;
7611 runtime = d->rt_runtime;
7612 }
7613
7614
7615
7616
7617 if (runtime > period && runtime != RUNTIME_INF)
7618 return -EINVAL;
7619
7620
7621
7622
7623 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7624 return -EBUSY;
7625
7626 total = to_ratio(period, runtime);
7627
7628
7629
7630
7631 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7632 return -EINVAL;
7633
7634
7635
7636
7637 list_for_each_entry_rcu(child, &tg->children, siblings) {
7638 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7639 runtime = child->rt_bandwidth.rt_runtime;
7640
7641 if (child == d->tg) {
7642 period = d->rt_period;
7643 runtime = d->rt_runtime;
7644 }
7645
7646 sum += to_ratio(period, runtime);
7647 }
7648
7649 if (sum > total)
7650 return -EINVAL;
7651
7652 return 0;
7653}
7654
7655static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7656{
7657 int ret;
7658
7659 struct rt_schedulable_data data = {
7660 .tg = tg,
7661 .rt_period = period,
7662 .rt_runtime = runtime,
7663 };
7664
7665 rcu_read_lock();
7666 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7667 rcu_read_unlock();
7668
7669 return ret;
7670}
7671
7672static int tg_set_rt_bandwidth(struct task_group *tg,
7673 u64 rt_period, u64 rt_runtime)
7674{
7675 int i, err = 0;
7676
7677
7678
7679
7680
7681 if (tg == &root_task_group && rt_runtime == 0)
7682 return -EINVAL;
7683
7684
7685 if (rt_period == 0)
7686 return -EINVAL;
7687
7688 mutex_lock(&rt_constraints_mutex);
7689 read_lock(&tasklist_lock);
7690 err = __rt_schedulable(tg, rt_period, rt_runtime);
7691 if (err)
7692 goto unlock;
7693
7694 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7695 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7696 tg->rt_bandwidth.rt_runtime = rt_runtime;
7697
7698 for_each_possible_cpu(i) {
7699 struct rt_rq *rt_rq = tg->rt_rq[i];
7700
7701 raw_spin_lock(&rt_rq->rt_runtime_lock);
7702 rt_rq->rt_runtime = rt_runtime;
7703 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7704 }
7705 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7706unlock:
7707 read_unlock(&tasklist_lock);
7708 mutex_unlock(&rt_constraints_mutex);
7709
7710 return err;
7711}
7712
7713static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7714{
7715 u64 rt_runtime, rt_period;
7716
7717 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7718 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7719 if (rt_runtime_us < 0)
7720 rt_runtime = RUNTIME_INF;
7721
7722 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7723}
7724
7725static long sched_group_rt_runtime(struct task_group *tg)
7726{
7727 u64 rt_runtime_us;
7728
7729 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7730 return -1;
7731
7732 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7733 do_div(rt_runtime_us, NSEC_PER_USEC);
7734 return rt_runtime_us;
7735}
7736
7737static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7738{
7739 u64 rt_runtime, rt_period;
7740
7741 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7742 rt_runtime = tg->rt_bandwidth.rt_runtime;
7743
7744 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7745}
7746
7747static long sched_group_rt_period(struct task_group *tg)
7748{
7749 u64 rt_period_us;
7750
7751 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7752 do_div(rt_period_us, NSEC_PER_USEC);
7753 return rt_period_us;
7754}
7755#endif
7756
7757#ifdef CONFIG_RT_GROUP_SCHED
7758static int sched_rt_global_constraints(void)
7759{
7760 int ret = 0;
7761
7762 mutex_lock(&rt_constraints_mutex);
7763 read_lock(&tasklist_lock);
7764 ret = __rt_schedulable(NULL, 0, 0);
7765 read_unlock(&tasklist_lock);
7766 mutex_unlock(&rt_constraints_mutex);
7767
7768 return ret;
7769}
7770
7771static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7772{
7773
7774 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7775 return 0;
7776
7777 return 1;
7778}
7779
7780#else
7781static int sched_rt_global_constraints(void)
7782{
7783 unsigned long flags;
7784 int i, ret = 0;
7785
7786 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7787 for_each_possible_cpu(i) {
7788 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7789
7790 raw_spin_lock(&rt_rq->rt_runtime_lock);
7791 rt_rq->rt_runtime = global_rt_runtime();
7792 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7793 }
7794 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7795
7796 return ret;
7797}
7798#endif
7799
7800static int sched_dl_global_validate(void)
7801{
7802 u64 runtime = global_rt_runtime();
7803 u64 period = global_rt_period();
7804 u64 new_bw = to_ratio(period, runtime);
7805 struct dl_bw *dl_b;
7806 int cpu, ret = 0;
7807 unsigned long flags;
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818 for_each_possible_cpu(cpu) {
7819 rcu_read_lock_sched();
7820 dl_b = dl_bw_of(cpu);
7821
7822 raw_spin_lock_irqsave(&dl_b->lock, flags);
7823 if (new_bw < dl_b->total_bw)
7824 ret = -EBUSY;
7825 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7826
7827 rcu_read_unlock_sched();
7828
7829 if (ret)
7830 break;
7831 }
7832
7833 return ret;
7834}
7835
7836static void sched_dl_do_global(void)
7837{
7838 u64 new_bw = -1;
7839 struct dl_bw *dl_b;
7840 int cpu;
7841 unsigned long flags;
7842
7843 def_dl_bandwidth.dl_period = global_rt_period();
7844 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7845
7846 if (global_rt_runtime() != RUNTIME_INF)
7847 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7848
7849
7850
7851
7852 for_each_possible_cpu(cpu) {
7853 rcu_read_lock_sched();
7854 dl_b = dl_bw_of(cpu);
7855
7856 raw_spin_lock_irqsave(&dl_b->lock, flags);
7857 dl_b->bw = new_bw;
7858 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7859
7860 rcu_read_unlock_sched();
7861 }
7862}
7863
7864static int sched_rt_global_validate(void)
7865{
7866 if (sysctl_sched_rt_period <= 0)
7867 return -EINVAL;
7868
7869 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7870 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7871 return -EINVAL;
7872
7873 return 0;
7874}
7875
7876static void sched_rt_do_global(void)
7877{
7878 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7879 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7880}
7881
7882int sched_rt_handler(struct ctl_table *table, int write,
7883 void __user *buffer, size_t *lenp,
7884 loff_t *ppos)
7885{
7886 int old_period, old_runtime;
7887 static DEFINE_MUTEX(mutex);
7888 int ret;
7889
7890 mutex_lock(&mutex);
7891 old_period = sysctl_sched_rt_period;
7892 old_runtime = sysctl_sched_rt_runtime;
7893
7894 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7895
7896 if (!ret && write) {
7897 ret = sched_rt_global_validate();
7898 if (ret)
7899 goto undo;
7900
7901 ret = sched_dl_global_validate();
7902 if (ret)
7903 goto undo;
7904
7905 ret = sched_rt_global_constraints();
7906 if (ret)
7907 goto undo;
7908
7909 sched_rt_do_global();
7910 sched_dl_do_global();
7911 }
7912 if (0) {
7913undo:
7914 sysctl_sched_rt_period = old_period;
7915 sysctl_sched_rt_runtime = old_runtime;
7916 }
7917 mutex_unlock(&mutex);
7918
7919 return ret;
7920}
7921
7922int sched_rr_handler(struct ctl_table *table, int write,
7923 void __user *buffer, size_t *lenp,
7924 loff_t *ppos)
7925{
7926 int ret;
7927 static DEFINE_MUTEX(mutex);
7928
7929 mutex_lock(&mutex);
7930 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7931
7932
7933 if (!ret && write) {
7934 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7935 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7936 }
7937 mutex_unlock(&mutex);
7938 return ret;
7939}
7940
7941#ifdef CONFIG_CGROUP_SCHED
7942
7943static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7944{
7945 return css ? container_of(css, struct task_group, css) : NULL;
7946}
7947
7948static struct cgroup_subsys_state *
7949cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7950{
7951 struct task_group *parent = css_tg(parent_css);
7952 struct task_group *tg;
7953
7954 if (!parent) {
7955
7956 return &root_task_group.css;
7957 }
7958
7959 tg = sched_create_group(parent);
7960 if (IS_ERR(tg))
7961 return ERR_PTR(-ENOMEM);
7962
7963 return &tg->css;
7964}
7965
7966static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7967{
7968 struct task_group *tg = css_tg(css);
7969 struct task_group *parent = css_tg(css->parent);
7970
7971 if (parent)
7972 sched_online_group(tg, parent);
7973 return 0;
7974}
7975
7976static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7977{
7978 struct task_group *tg = css_tg(css);
7979
7980 sched_destroy_group(tg);
7981}
7982
7983static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7984{
7985 struct task_group *tg = css_tg(css);
7986
7987 sched_offline_group(tg);
7988}
7989
7990static void cpu_cgroup_fork(struct task_struct *task)
7991{
7992 sched_move_task(task);
7993}
7994
7995static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7996 struct cgroup_taskset *tset)
7997{
7998 struct task_struct *task;
7999
8000 cgroup_taskset_for_each(task, tset) {
8001#ifdef CONFIG_RT_GROUP_SCHED
8002 if (!sched_rt_can_attach(css_tg(css), task))
8003 return -EINVAL;
8004#else
8005
8006 if (task->sched_class != &fair_sched_class)
8007 return -EINVAL;
8008#endif
8009 }
8010 return 0;
8011}
8012
8013static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
8014 struct cgroup_taskset *tset)
8015{
8016 struct task_struct *task;
8017
8018 cgroup_taskset_for_each(task, tset)
8019 sched_move_task(task);
8020}
8021
8022static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
8023 struct cgroup_subsys_state *old_css,
8024 struct task_struct *task)
8025{
8026
8027
8028
8029
8030
8031 if (!(task->flags & PF_EXITING))
8032 return;
8033
8034 sched_move_task(task);
8035}
8036
8037#ifdef CONFIG_FAIR_GROUP_SCHED
8038static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8039 struct cftype *cftype, u64 shareval)
8040{
8041 return sched_group_set_shares(css_tg(css), scale_load(shareval));
8042}
8043
8044static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8045 struct cftype *cft)
8046{
8047 struct task_group *tg = css_tg(css);
8048
8049 return (u64) scale_load_down(tg->shares);
8050}
8051
8052#ifdef CONFIG_CFS_BANDWIDTH
8053static DEFINE_MUTEX(cfs_constraints_mutex);
8054
8055const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
8056const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
8057
8058static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8059
8060static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8061{
8062 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8063 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8064
8065 if (tg == &root_task_group)
8066 return -EINVAL;
8067
8068
8069
8070
8071
8072
8073 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
8074 return -EINVAL;
8075
8076
8077
8078
8079
8080
8081 if (period > max_cfs_quota_period)
8082 return -EINVAL;
8083
8084
8085
8086
8087
8088 get_online_cpus();
8089 mutex_lock(&cfs_constraints_mutex);
8090 ret = __cfs_schedulable(tg, period, quota);
8091 if (ret)
8092 goto out_unlock;
8093
8094 runtime_enabled = quota != RUNTIME_INF;
8095 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
8096
8097
8098
8099
8100 if (runtime_enabled && !runtime_was_enabled)
8101 cfs_bandwidth_usage_inc();
8102 raw_spin_lock_irq(&cfs_b->lock);
8103 cfs_b->period = ns_to_ktime(period);
8104 cfs_b->quota = quota;
8105
8106 __refill_cfs_bandwidth_runtime(cfs_b);
8107
8108 if (runtime_enabled && cfs_b->timer_active) {
8109
8110 __start_cfs_bandwidth(cfs_b, true);
8111 }
8112 raw_spin_unlock_irq(&cfs_b->lock);
8113
8114 for_each_online_cpu(i) {
8115 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
8116 struct rq *rq = cfs_rq->rq;
8117
8118 raw_spin_lock_irq(&rq->lock);
8119 cfs_rq->runtime_enabled = runtime_enabled;
8120 cfs_rq->runtime_remaining = 0;
8121
8122 if (cfs_rq->throttled)
8123 unthrottle_cfs_rq(cfs_rq);
8124 raw_spin_unlock_irq(&rq->lock);
8125 }
8126 if (runtime_was_enabled && !runtime_enabled)
8127 cfs_bandwidth_usage_dec();
8128out_unlock:
8129 mutex_unlock(&cfs_constraints_mutex);
8130 put_online_cpus();
8131
8132 return ret;
8133}
8134
8135int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8136{
8137 u64 quota, period;
8138
8139 period = ktime_to_ns(tg->cfs_bandwidth.period);
8140 if (cfs_quota_us < 0)
8141 quota = RUNTIME_INF;
8142 else
8143 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
8144
8145 return tg_set_cfs_bandwidth(tg, period, quota);
8146}
8147
8148long tg_get_cfs_quota(struct task_group *tg)
8149{
8150 u64 quota_us;
8151
8152 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
8153 return -1;
8154
8155 quota_us = tg->cfs_bandwidth.quota;
8156 do_div(quota_us, NSEC_PER_USEC);
8157
8158 return quota_us;
8159}
8160
8161int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8162{
8163 u64 quota, period;
8164
8165 period = (u64)cfs_period_us * NSEC_PER_USEC;
8166 quota = tg->cfs_bandwidth.quota;
8167
8168 return tg_set_cfs_bandwidth(tg, period, quota);
8169}
8170
8171long tg_get_cfs_period(struct task_group *tg)
8172{
8173 u64 cfs_period_us;
8174
8175 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
8176 do_div(cfs_period_us, NSEC_PER_USEC);
8177
8178 return cfs_period_us;
8179}
8180
8181static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
8182 struct cftype *cft)
8183{
8184 return tg_get_cfs_quota(css_tg(css));
8185}
8186
8187static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
8188 struct cftype *cftype, s64 cfs_quota_us)
8189{
8190 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
8191}
8192
8193static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
8194 struct cftype *cft)
8195{
8196 return tg_get_cfs_period(css_tg(css));
8197}
8198
8199static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
8200 struct cftype *cftype, u64 cfs_period_us)
8201{
8202 return tg_set_cfs_period(css_tg(css), cfs_period_us);
8203}
8204
8205struct cfs_schedulable_data {
8206 struct task_group *tg;
8207 u64 period, quota;
8208};
8209
8210
8211
8212
8213
8214static u64 normalize_cfs_quota(struct task_group *tg,
8215 struct cfs_schedulable_data *d)
8216{
8217 u64 quota, period;
8218
8219 if (tg == d->tg) {
8220 period = d->period;
8221 quota = d->quota;
8222 } else {
8223 period = tg_get_cfs_period(tg);
8224 quota = tg_get_cfs_quota(tg);
8225 }
8226
8227
8228 if (quota == RUNTIME_INF || quota == -1)
8229 return RUNTIME_INF;
8230
8231 return to_ratio(period, quota);
8232}
8233
8234static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8235{
8236 struct cfs_schedulable_data *d = data;
8237 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8238 s64 quota = 0, parent_quota = -1;
8239
8240 if (!tg->parent) {
8241 quota = RUNTIME_INF;
8242 } else {
8243 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8244
8245 quota = normalize_cfs_quota(tg, d);
8246 parent_quota = parent_b->hierarchical_quota;
8247
8248
8249
8250
8251
8252 if (quota == RUNTIME_INF)
8253 quota = parent_quota;
8254 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8255 return -EINVAL;
8256 }
8257 cfs_b->hierarchical_quota = quota;
8258
8259 return 0;
8260}
8261
8262static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
8263{
8264 int ret;
8265 struct cfs_schedulable_data data = {
8266 .tg = tg,
8267 .period = period,
8268 .quota = quota,
8269 };
8270
8271 if (quota != RUNTIME_INF) {
8272 do_div(data.period, NSEC_PER_USEC);
8273 do_div(data.quota, NSEC_PER_USEC);
8274 }
8275
8276 rcu_read_lock();
8277 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
8278 rcu_read_unlock();
8279
8280 return ret;
8281}
8282
8283static int cpu_stats_show(struct seq_file *sf, void *v)
8284{
8285 struct task_group *tg = css_tg(seq_css(sf));
8286 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8287
8288 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
8289 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
8290 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
8291
8292 return 0;
8293}
8294#endif
8295#endif
8296
8297#ifdef CONFIG_RT_GROUP_SCHED
8298static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
8299 struct cftype *cft, s64 val)
8300{
8301 return sched_group_set_rt_runtime(css_tg(css), val);
8302}
8303
8304static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
8305 struct cftype *cft)
8306{
8307 return sched_group_rt_runtime(css_tg(css));
8308}
8309
8310static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
8311 struct cftype *cftype, u64 rt_period_us)
8312{
8313 return sched_group_set_rt_period(css_tg(css), rt_period_us);
8314}
8315
8316static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
8317 struct cftype *cft)
8318{
8319 return sched_group_rt_period(css_tg(css));
8320}
8321#endif
8322
8323static struct cftype cpu_files[] = {
8324#ifdef CONFIG_FAIR_GROUP_SCHED
8325 {
8326 .name = "shares",
8327 .read_u64 = cpu_shares_read_u64,
8328 .write_u64 = cpu_shares_write_u64,
8329 },
8330#endif
8331#ifdef CONFIG_CFS_BANDWIDTH
8332 {
8333 .name = "cfs_quota_us",
8334 .read_s64 = cpu_cfs_quota_read_s64,
8335 .write_s64 = cpu_cfs_quota_write_s64,
8336 },
8337 {
8338 .name = "cfs_period_us",
8339 .read_u64 = cpu_cfs_period_read_u64,
8340 .write_u64 = cpu_cfs_period_write_u64,
8341 },
8342 {
8343 .name = "stat",
8344 .seq_show = cpu_stats_show,
8345 },
8346#endif
8347#ifdef CONFIG_RT_GROUP_SCHED
8348 {
8349 .name = "rt_runtime_us",
8350 .read_s64 = cpu_rt_runtime_read,
8351 .write_s64 = cpu_rt_runtime_write,
8352 },
8353 {
8354 .name = "rt_period_us",
8355 .read_u64 = cpu_rt_period_read_uint,
8356 .write_u64 = cpu_rt_period_write_uint,
8357 },
8358#endif
8359 { }
8360};
8361
8362struct cgroup_subsys cpu_cgrp_subsys = {
8363 .css_alloc = cpu_cgroup_css_alloc,
8364 .css_free = cpu_cgroup_css_free,
8365 .css_online = cpu_cgroup_css_online,
8366 .css_offline = cpu_cgroup_css_offline,
8367 .fork = cpu_cgroup_fork,
8368 .can_attach = cpu_cgroup_can_attach,
8369 .attach = cpu_cgroup_attach,
8370 .exit = cpu_cgroup_exit,
8371 .legacy_cftypes = cpu_files,
8372 .early_init = 1,
8373};
8374
8375#endif
8376
8377void dump_cpu_task(int cpu)
8378{
8379 pr_info("Task dump for CPU %d:\n", cpu);
8380 sched_show_task(cpu_curr(cpu));
8381}
8382