/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled) \
144 #name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154 int i;
155
156 for (i = 0; i < __SCHED_FEAT_NR; i++) {
157 if (!(sysctl_sched_features & (1UL << i)))
158 seq_puts(m, "NO_");
159 seq_printf(m, "%s ", sched_feat_names[i]);
160 }
161 seq_puts(m, "\n");
162
163 return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled) \
172 jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
180static void sched_feat_disable(int i)
181{
182 if (static_key_enabled(&sched_feat_keys[i]))
183 static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188 if (!static_key_enabled(&sched_feat_keys[i]))
189 static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif
195
196static int sched_feat_set(char *cmp)
197{
198 int i;
199 int neg = 0;
200
201 if (strncmp(cmp, "NO_", 3) == 0) {
202 neg = 1;
203 cmp += 3;
204 }
205
206 for (i = 0; i < __SCHED_FEAT_NR; i++) {
207 if (strcmp(cmp, sched_feat_names[i]) == 0) {
208 if (neg) {
209 sysctl_sched_features &= ~(1UL << i);
210 sched_feat_disable(i);
211 } else {
212 sysctl_sched_features |= (1UL << i);
213 sched_feat_enable(i);
214 }
215 break;
216 }
217 }
218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
240 if (i == __SCHED_FEAT_NR)
241 return -EINVAL;
242
243 *ppos += cnt;
244
245 return cnt;
246}
247
248static int sched_feat_open(struct inode *inode, struct file *filp)
249{
250 return single_open(filp, sched_feat_show, NULL);
251}
252
253static const struct file_operations sched_feat_fops = {
254 .open = sched_feat_open,
255 .write = sched_feat_write,
256 .read = seq_read,
257 .llseek = seq_lseek,
258 .release = single_release,
259};
260
261static __init int sched_init_debug(void)
262{
263 debugfs_create_file("sched_features", 0644, NULL, NULL,
264 &sched_feat_fops);
265
266 return 0;
267}
268late_initcall(sched_init_debug);
269#endif

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
304static inline struct rq *__task_rq_lock(struct task_struct *p)
305 __acquires(rq->lock)
306{
307 struct rq *rq;
308
309 lockdep_assert_held(&p->pi_lock);
310
311 for (;;) {
312 rq = task_rq(p);
313 raw_spin_lock(&rq->lock);
314 if (likely(rq == task_rq(p)))
315 return rq;
316 raw_spin_unlock(&rq->lock);
317 }
318}
319
/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
323static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
324 __acquires(p->pi_lock)
325 __acquires(rq->lock)
326{
327 struct rq *rq;
328
329 for (;;) {
330 raw_spin_lock_irqsave(&p->pi_lock, *flags);
331 rq = task_rq(p);
332 raw_spin_lock(&rq->lock);
333 if (likely(rq == task_rq(p)))
334 return rq;
335 raw_spin_unlock(&rq->lock);
336 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
337 }
338}
339
340static void __task_rq_unlock(struct rq *rq)
341 __releases(rq->lock)
342{
343 raw_spin_unlock(&rq->lock);
344}
345
346static inline void
347task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
348 __releases(rq->lock)
349 __releases(p->pi_lock)
350{
351 raw_spin_unlock(&rq->lock);
352 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
353}
354
/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
358static struct rq *this_rq_lock(void)
359 __acquires(rq->lock)
360{
361 struct rq *rq;
362
363 local_irq_disable();
364 rq = this_rq();
365 raw_spin_lock(&rq->lock);
366
367 return rq;
368}
369
370#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

375static void hrtick_clear(struct rq *rq)
376{
377 if (hrtimer_active(&rq->hrtick_timer))
378 hrtimer_cancel(&rq->hrtick_timer);
379}
380
/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
385static enum hrtimer_restart hrtick(struct hrtimer *timer)
386{
387 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
388
389 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
390
391 raw_spin_lock(&rq->lock);
392 update_rq_clock(rq);
393 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
394 raw_spin_unlock(&rq->lock);
395
396 return HRTIMER_NORESTART;
397}
398
399#ifdef CONFIG_SMP
400
401static int __hrtick_restart(struct rq *rq)
402{
403 struct hrtimer *timer = &rq->hrtick_timer;
404 ktime_t time = hrtimer_get_softexpires(timer);
405
406 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
407}
408
409
410
411
412static void __hrtick_start(void *arg)
413{
414 struct rq *rq = arg;
415
416 raw_spin_lock(&rq->lock);
417 __hrtick_restart(rq);
418 rq->hrtick_csd_pending = 0;
419 raw_spin_unlock(&rq->lock);
420}
421
422
423
424
425
426
427void hrtick_start(struct rq *rq, u64 delay)
428{
429 struct hrtimer *timer = &rq->hrtick_timer;
430 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
431
432 hrtimer_set_expires(timer, time);
433
434 if (rq == this_rq()) {
435 __hrtick_restart(rq);
436 } else if (!rq->hrtick_csd_pending) {
437 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
438 rq->hrtick_csd_pending = 1;
439 }
440}
441
442static int
443hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
444{
445 int cpu = (int)(long)hcpu;
446
447 switch (action) {
448 case CPU_UP_CANCELED:
449 case CPU_UP_CANCELED_FROZEN:
450 case CPU_DOWN_PREPARE:
451 case CPU_DOWN_PREPARE_FROZEN:
452 case CPU_DEAD:
453 case CPU_DEAD_FROZEN:
454 hrtick_clear(cpu_rq(cpu));
455 return NOTIFY_OK;
456 }
457
458 return NOTIFY_DONE;
459}
460
461static __init void init_hrtick(void)
462{
463 hotcpu_notifier(hotplug_hrtick, 0);
464}
465#else
466
467
468
469
470
471void hrtick_start(struct rq *rq, u64 delay)
472{
473 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
474 HRTIMER_MODE_REL_PINNED, 0);
475}
476
477static inline void init_hrtick(void)
478{
479}
480#endif
481
482static void init_rq_hrtick(struct rq *rq)
483{
484#ifdef CONFIG_SMP
485 rq->hrtick_csd_pending = 0;
486
487 rq->hrtick_csd.flags = 0;
488 rq->hrtick_csd.func = __hrtick_start;
489 rq->hrtick_csd.info = rq;
490#endif
491
492 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
493 rq->hrtick_timer.function = hrtick;
494}
495#else
496static inline void hrtick_clear(struct rq *rq)
497{
498}
499
500static inline void init_rq_hrtick(struct rq *rq)
501{
502}
503
504static inline void init_hrtick(void)
505{
506}
507#endif

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means setting the need_resched flag; on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p)
518{
519 int cpu;
520
521 assert_raw_spin_locked(&task_rq(p)->lock);
522
523 if (test_tsk_need_resched(p))
524 return;
525
526 set_tsk_need_resched(p);
527
528 cpu = task_cpu(p);
529 if (cpu == smp_processor_id())
530 return;
531
532
533 smp_mb();
534 if (!tsk_is_polling(p))
535 smp_send_reschedule(cpu);
536}
537
538void resched_cpu(int cpu)
539{
540 struct rq *rq = cpu_rq(cpu);
541 unsigned long flags;
542
543 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
544 return;
545 resched_task(cpu_curr(cpu));
546 raw_spin_unlock_irqrestore(&rq->lock, flags);
547}
548
549#ifdef CONFIG_NO_HZ_COMMON
550
551
552
553
554
555
556
557
558int get_nohz_timer_target(void)
559{
560 int cpu = smp_processor_id();
561 int i;
562 struct sched_domain *sd;
563
564 rcu_read_lock();
565 for_each_domain(cpu, sd) {
566 for_each_cpu(i, sched_domain_span(sd)) {
567 if (!idle_cpu(i)) {
568 cpu = i;
569 goto unlock;
570 }
571 }
572 }
573unlock:
574 rcu_read_unlock();
575 return cpu;
576}
577
578
579
580
581
582
583
584
585
586
587static void wake_up_idle_cpu(int cpu)
588{
589 struct rq *rq = cpu_rq(cpu);
590
591 if (cpu == smp_processor_id())
592 return;
593
594
595
596
597
598
599
600
601 if (rq->curr != rq->idle)
602 return;
603
604
605
606
607
608
609 set_tsk_need_resched(rq->idle);
610
611
612 smp_mb();
613 if (!tsk_is_polling(rq->idle))
614 smp_send_reschedule(cpu);
615}
616
617static bool wake_up_full_nohz_cpu(int cpu)
618{
619 if (tick_nohz_full_cpu(cpu)) {
620 if (cpu != smp_processor_id() ||
621 tick_nohz_tick_stopped())
622 smp_send_reschedule(cpu);
623 return true;
624 }
625
626 return false;
627}
628
629void wake_up_nohz_cpu(int cpu)
630{
631 if (!wake_up_full_nohz_cpu(cpu))
632 wake_up_idle_cpu(cpu);
633}
634
635static inline bool got_nohz_idle_kick(void)
636{
637 int cpu = smp_processor_id();
638
639 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
640 return false;
641
642 if (idle_cpu(cpu) && !need_resched())
643 return true;
644
645
646
647
648
649 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
650 return false;
651}
652
653#else
654
655static inline bool got_nohz_idle_kick(void)
656{
657 return false;
658}
659
660#endif
661
662#ifdef CONFIG_NO_HZ_FULL
663bool sched_can_stop_tick(void)
664{
665 struct rq *rq;
666
667 rq = this_rq();
668
669
670 smp_rmb();
671
672
673 if (rq->nr_running > 1)
674 return false;
675
676 return true;
677}
678#endif
679
680void sched_avg_update(struct rq *rq)
681{
682 s64 period = sched_avg_period();
683
684 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
 /*
  * Inline assembly required to prevent the compiler
  * optimising this loop into a divmod call.
  * See __iter_div_u64_rem() for another example of this.
  */
690 asm("" : "+rm" (rq->age_stamp));
691 rq->age_stamp += period;
692 rq->rt_avg /= 2;
693 }
694}
695
696#else
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif
703
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
705 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
706
707
708
709
710
711
712int walk_tg_tree_from(struct task_group *from,
713 tg_visitor down, tg_visitor up, void *data)
714{
715 struct task_group *parent, *child;
716 int ret;
717
718 parent = from;
719
720down:
721 ret = (*down)(parent, data);
722 if (ret)
723 goto out;
724 list_for_each_entry_rcu(child, &parent->children, siblings) {
725 parent = child;
726 goto down;
727
728up:
729 continue;
730 }
731 ret = (*up)(parent, data);
732 if (ret || parent == from)
733 goto out;
734
735 child = parent;
736 parent = parent->parent;
737 if (parent)
738 goto up;
739out:
740 return ret;
741}
742
743int tg_nop(struct task_group *tg, void *data)
744{
745 return 0;
746}
747#endif
748
749static void set_load_weight(struct task_struct *p)
750{
751 int prio = p->static_prio - MAX_RT_PRIO;
752 struct load_weight *load = &p->se.load;

 /*
  * SCHED_IDLE tasks get minimal weight:
  */
757 if (p->policy == SCHED_IDLE) {
758 load->weight = scale_load(WEIGHT_IDLEPRIO);
759 load->inv_weight = WMULT_IDLEPRIO;
760 return;
761 }
762
763 load->weight = scale_load(prio_to_weight[prio]);
764 load->inv_weight = prio_to_wmult[prio];
765}
766
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{
769 update_rq_clock(rq);
770 sched_info_queued(p);
771 p->sched_class->enqueue_task(rq, p, flags);
772}
773
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{
776 update_rq_clock(rq);
777 sched_info_dequeued(p);
778 p->sched_class->dequeue_task(rq, p, flags);
779}
780
781void activate_task(struct rq *rq, struct task_struct *p, int flags)
782{
783 if (task_contributes_to_load(p))
784 rq->nr_uninterruptible--;
785
786 enqueue_task(rq, p, flags);
787}
788
789void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
790{
791 if (task_contributes_to_load(p))
792 rq->nr_uninterruptible++;
793
794 dequeue_task(rq, p, flags);
795}
796
797static void update_rq_clock_task(struct rq *rq, s64 delta)
798{
799
800
801
802
803#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
804 s64 steal = 0, irq_delta = 0;
805#endif
806#ifdef CONFIG_IRQ_TIME_ACCOUNTING
807 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824 if (irq_delta > delta)
825 irq_delta = delta;
826
827 rq->prev_irq_time += irq_delta;
828 delta -= irq_delta;
829#endif
830#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 if (static_key_false((&paravirt_steal_rq_enabled))) {
832 u64 st;
833
834 steal = paravirt_steal_clock(cpu_of(rq));
835 steal -= rq->prev_steal_time_rq;
836
837 if (unlikely(steal > delta))
838 steal = delta;
839
840 st = steal_ticks(steal);
841 steal = st * TICK_NSEC;
842
843 rq->prev_steal_time_rq += steal;
844
845 delta -= steal;
846 }
847#endif
848
849 rq->clock_task += delta;
850
851#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
852 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
853 sched_rt_avg_update(rq, irq_delta + steal);
854#endif
855}
856
857void sched_set_stop_task(int cpu, struct task_struct *stop)
858{
859 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
860 struct task_struct *old_stop = cpu_rq(cpu)->stop;
861
862 if (stop) {
863
864
865
866
867
868
869
870
 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
872
873 stop->sched_class = &stop_sched_class;
874 }
875
876 cpu_rq(cpu)->stop = stop;
877
878 if (old_stop) {
879
880
881
882
883 old_stop->sched_class = &rt_sched_class;
884 }
885}
886
887
888
889
890static inline int __normal_prio(struct task_struct *p)
891{
892 return p->static_prio;
893}
894
895
896
897
898
899
900
901
902static inline int normal_prio(struct task_struct *p)
903{
904 int prio;
905
906 if (task_has_rt_policy(p))
907 prio = MAX_RT_PRIO-1 - p->rt_priority;
908 else
909 prio = __normal_prio(p);
910 return prio;
911}
912
913
914
915
916
917
918
919
920static int effective_prio(struct task_struct *p)
921{
922 p->normal_prio = normal_prio(p);
923
924
925
926
927
928 if (!rt_prio(p->prio))
929 return p->normal_prio;
930 return p->prio;
931}
932
933
934
935
936
937
938
939inline int task_curr(const struct task_struct *p)
940{
941 return cpu_curr(task_cpu(p)) == p;
942}
943
944static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 const struct sched_class *prev_class,
946 int oldprio)
947{
948 if (prev_class != p->sched_class) {
949 if (prev_class->switched_from)
950 prev_class->switched_from(rq, p);
951 p->sched_class->switched_to(rq, p);
952 } else if (oldprio != p->prio)
953 p->sched_class->prio_changed(rq, p, oldprio);
954}
955
956void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
957{
958 const struct sched_class *class;
959
960 if (p->sched_class == rq->curr->sched_class) {
961 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
962 } else {
963 for_each_class(class) {
964 if (class == rq->curr->sched_class)
965 break;
966 if (class == p->sched_class) {
967 resched_task(rq->curr);
968 break;
969 }
970 }
971 }
972
973
974
975
976
977 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
978 rq->skip_clock_update = 1;
979}
980
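/* Notifier chain invoked whenever a task is migrated to a different CPU. */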
981static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
982
983void register_task_migration_notifier(struct notifier_block *n)
984{
985 atomic_notifier_chain_register(&task_migration_notifier, n);
986}
987
988#ifdef CONFIG_SMP
989void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
990{
991#ifdef CONFIG_SCHED_DEBUG
992
993
994
995
996 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
997 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
998
999#ifdef CONFIG_LOCKDEP
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1011 lockdep_is_held(&task_rq(p)->lock)));
1012#endif
1013#endif
1014
1015 trace_sched_migrate_task(p, new_cpu);
1016
1017 if (task_cpu(p) != new_cpu) {
1018 struct task_migration_notifier tmn;
1019
1020 if (p->sched_class->migrate_task_rq)
1021 p->sched_class->migrate_task_rq(p, new_cpu);
1022 p->se.nr_migrations++;
1023 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1024
1025 tmn.task = p;
1026 tmn.from_cpu = task_cpu(p);
1027 tmn.to_cpu = new_cpu;
1028
1029 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
1030 }
1031
1032 __set_task_cpu(p, new_cpu);
1033}
1034
1035struct migration_arg {
1036 struct task_struct *task;
1037 int dest_cpu;
1038};
1039
1040static int migration_cpu_stop(void *data);
1041
/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a long time. This function can't be
 * called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
1058unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1059{
1060 unsigned long flags;
1061 int running, on_rq;
1062 unsigned long ncsw;
1063 struct rq *rq;
1064
1065 for (;;) {
1066
1067
1068
1069
1070
1071
1072 rq = task_rq(p);
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085 while (task_running(rq, p)) {
1086 if (match_state && unlikely(p->state != match_state))
1087 return 0;
1088 cpu_relax();
1089 }
1090
1091
1092
1093
1094
1095
1096 rq = task_rq_lock(p, &flags);
1097 trace_sched_wait_task(p);
1098 running = task_running(rq, p);
1099 on_rq = p->on_rq;
1100 ncsw = 0;
1101 if (!match_state || p->state == match_state)
1102 ncsw = p->nvcsw | LONG_MIN;
1103 task_rq_unlock(rq, p, &flags);
1104
1105
1106
1107
1108 if (unlikely(!ncsw))
1109 break;
1110
1111
1112
1113
1114
1115
1116
1117 if (unlikely(running)) {
1118 cpu_relax();
1119 continue;
1120 }
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131 if (unlikely(on_rq)) {
1132 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1133
1134 set_current_state(TASK_UNINTERRUPTIBLE);
1135 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1136 continue;
1137 }
1138
1139
1140
1141
1142
1143
1144 break;
1145 }
1146
1147 return ncsw;
1148}
1149
/*
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
1163void kick_process(struct task_struct *p)
1164{
1165 int cpu;
1166
1167 preempt_disable();
1168 cpu = task_cpu(p);
1169 if ((cpu != smp_processor_id()) && task_curr(p))
1170 smp_send_reschedule(cpu);
1171 preempt_enable();
1172}
1173EXPORT_SYMBOL_GPL(kick_process);
1174#endif
1175
1176#ifdef CONFIG_SMP
1177
1178
1179
1180static int select_fallback_rq(int cpu, struct task_struct *p)
1181{
1182 int nid = cpu_to_node(cpu);
1183 const struct cpumask *nodemask = NULL;
1184 enum { cpuset, possible, fail } state = cpuset;
1185 int dest_cpu;
1186
1187
1188
1189
1190
1191
1192 if (nid != -1) {
1193 nodemask = cpumask_of_node(nid);
1194
1195
1196 for_each_cpu(dest_cpu, nodemask) {
1197 if (!cpu_online(dest_cpu))
1198 continue;
1199 if (!cpu_active(dest_cpu))
1200 continue;
1201 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1202 return dest_cpu;
1203 }
1204 }
1205
1206 for (;;) {
1207
1208 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1209 if (!cpu_online(dest_cpu))
1210 continue;
1211 if (!cpu_active(dest_cpu))
1212 continue;
1213 goto out;
1214 }
1215
1216 switch (state) {
1217 case cpuset:
1218
1219 cpuset_cpus_allowed_fallback(p);
1220 state = possible;
1221 break;
1222
1223 case possible:
1224 do_set_cpus_allowed(p, cpu_possible_mask);
1225 state = fail;
1226 break;
1227
1228 case fail:
1229 BUG();
1230 break;
1231 }
1232 }
1233
1234out:
1235 if (state != cpuset) {
1236
1237
1238
1239
1240
1241 if (p->mm && printk_ratelimit()) {
1242 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1243 task_pid_nr(p), p->comm, cpu);
1244 }
1245 }
1246
1247 return dest_cpu;
1248}
1249
1250
1251
1252
1253static inline
1254int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1255{
1256 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1269 !cpu_online(cpu)))
1270 cpu = select_fallback_rq(task_cpu(p), p);
1271
1272 return cpu;
1273}
1274
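/* Exponentially decaying average: fold in 1/8th of the new sample. */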
1275static void update_avg(u64 *avg, u64 sample)
1276{
1277 s64 diff = sample - *avg;
1278 *avg += diff >> 3;
1279}
1280#endif
1281
1282static void
1283ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1284{
1285#ifdef CONFIG_SCHEDSTATS
1286 struct rq *rq = this_rq();
1287
1288#ifdef CONFIG_SMP
1289 int this_cpu = smp_processor_id();
1290
1291 if (cpu == this_cpu) {
1292 schedstat_inc(rq, ttwu_local);
1293 schedstat_inc(p, se.statistics.nr_wakeups_local);
1294 } else {
1295 struct sched_domain *sd;
1296
1297 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1298 rcu_read_lock();
1299 for_each_domain(this_cpu, sd) {
1300 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1301 schedstat_inc(sd, ttwu_wake_remote);
1302 break;
1303 }
1304 }
1305 rcu_read_unlock();
1306 }
1307
1308 if (wake_flags & WF_MIGRATED)
1309 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1310
1311#endif
1312
1313 schedstat_inc(rq, ttwu_count);
1314 schedstat_inc(p, se.statistics.nr_wakeups);
1315
1316 if (wake_flags & WF_SYNC)
1317 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1318
1319#endif
1320}
1321
1322static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1323{
1324 activate_task(rq, p, en_flags);
1325 p->on_rq = 1;
1326
1327
1328 if (p->flags & PF_WQ_WORKER)
1329 wq_worker_waking_up(p, cpu_of(rq));
1330}
1331
1332
1333
1334
1335static void
1336ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1337{
1338 check_preempt_curr(rq, p, wake_flags);
1339 trace_sched_wakeup(p, true);
1340
1341 p->state = TASK_RUNNING;
1342#ifdef CONFIG_SMP
1343 if (p->sched_class->task_woken)
1344 p->sched_class->task_woken(rq, p);
1345
1346 if (rq->idle_stamp) {
1347 u64 delta = rq_clock(rq) - rq->idle_stamp;
1348 u64 max = 2*sysctl_sched_migration_cost;
1349
1350 if (delta > max)
1351 rq->avg_idle = max;
1352 else
1353 update_avg(&rq->avg_idle, delta);
1354 rq->idle_stamp = 0;
1355 }
1356#endif
1357}
1358
1359static void
1360ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1361{
1362#ifdef CONFIG_SMP
1363 if (p->sched_contributes_to_load)
1364 rq->nr_uninterruptible--;
1365#endif
1366
1367 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1368 ttwu_do_wakeup(rq, p, wake_flags);
1369}
1370
1371
1372
1373
1374
1375
1376
1377static int ttwu_remote(struct task_struct *p, int wake_flags)
1378{
1379 struct rq *rq;
1380 int ret = 0;
1381
1382 rq = __task_rq_lock(p);
1383 if (p->on_rq) {
1384
1385 update_rq_clock(rq);
1386 ttwu_do_wakeup(rq, p, wake_flags);
1387 ret = 1;
1388 }
1389 __task_rq_unlock(rq);
1390
1391 return ret;
1392}
1393
1394#ifdef CONFIG_SMP
1395static void sched_ttwu_pending(void)
1396{
1397 struct rq *rq = this_rq();
1398 struct llist_node *llist = llist_del_all(&rq->wake_list);
1399 struct task_struct *p;
1400
1401 raw_spin_lock(&rq->lock);
1402
1403 while (llist) {
1404 p = llist_entry(llist, struct task_struct, wake_entry);
1405 llist = llist_next(llist);
1406 ttwu_do_activate(rq, p, 0);
1407 }
1408
1409 raw_spin_unlock(&rq->lock);
1410}
1411
1412void scheduler_ipi(void)
1413{
1414 if (llist_empty(&this_rq()->wake_list)
1415 && !tick_nohz_full_cpu(smp_processor_id())
1416 && !got_nohz_idle_kick())
1417 return;
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432 irq_enter();
1433 tick_nohz_full_check();
1434 sched_ttwu_pending();
1435
1436
1437
1438
1439 if (unlikely(got_nohz_idle_kick())) {
1440 this_rq()->idle_balance = 1;
1441 raise_softirq_irqoff(SCHED_SOFTIRQ);
1442 }
1443 irq_exit();
1444}
1445
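/*
 * Queue a remote wakeup: add @p to the target CPU's wake_list and send a
 * reschedule IPI if the list was previously empty, so that scheduler_ipi()
 * on that CPU processes the pending wakeups.
 */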
1446static void ttwu_queue_remote(struct task_struct *p, int cpu)
1447{
1448 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1449 smp_send_reschedule(cpu);
1450}
1451
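/* Two CPUs share a cache if they belong to the same last-level-cache domain. */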
1452bool cpus_share_cache(int this_cpu, int that_cpu)
1453{
1454 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1455}
1456#endif
1457
1458static void ttwu_queue(struct task_struct *p, int cpu)
1459{
1460 struct rq *rq = cpu_rq(cpu);
1461
1462#if defined(CONFIG_SMP)
1463 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1464 sched_clock_cpu(cpu);
1465 ttwu_queue_remote(p, cpu);
1466 return;
1467 }
1468#endif
1469
1470 raw_spin_lock(&rq->lock);
1471 ttwu_do_activate(rq, p, 0);
1472 raw_spin_unlock(&rq->lock);
1473}
1474
/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
1490static int
1491try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1492{
1493 unsigned long flags;
1494 int cpu, success = 0;
1495
1496
1497
1498
1499
1500
1501
1502 smp_mb__before_spinlock();
1503 raw_spin_lock_irqsave(&p->pi_lock, flags);
1504 if (!(p->state & state))
1505 goto out;
1506
1507 success = 1;
1508 cpu = task_cpu(p);
1509
1510 if (p->on_rq && ttwu_remote(p, wake_flags))
1511 goto stat;
1512
1513#ifdef CONFIG_SMP
1514
1515
1516
1517
1518 while (p->on_cpu)
1519 cpu_relax();
1520
1521
1522
1523 smp_rmb();
1524
1525 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1526 p->state = TASK_WAKING;
1527
1528 if (p->sched_class->task_waking)
1529 p->sched_class->task_waking(p);
1530
1531 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1532 if (task_cpu(p) != cpu) {
1533 wake_flags |= WF_MIGRATED;
1534 set_task_cpu(p, cpu);
1535 }
1536#endif
1537
1538 ttwu_queue(p, cpu);
1539stat:
1540 ttwu_stat(p, cpu, wake_flags);
1541out:
1542 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1543
1544 return success;
1545}
1546
/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */
1555static void try_to_wake_up_local(struct task_struct *p)
1556{
1557 struct rq *rq = task_rq(p);
1558
1559 if (WARN_ON_ONCE(rq != this_rq()) ||
1560 WARN_ON_ONCE(p == current))
1561 return;
1562
1563 lockdep_assert_held(&rq->lock);
1564
1565 if (!raw_spin_trylock(&p->pi_lock)) {
1566 raw_spin_unlock(&rq->lock);
1567 raw_spin_lock(&p->pi_lock);
1568 raw_spin_lock(&rq->lock);
1569 }
1570
1571 if (!(p->state & TASK_NORMAL))
1572 goto out;
1573
1574 if (!p->on_rq)
1575 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1576
1577 ttwu_do_wakeup(rq, p, 0);
1578 ttwu_stat(p, smp_processor_id(), 0);
1579out:
1580 raw_spin_unlock(&p->pi_lock);
1581}
1582
/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
1595int wake_up_process(struct task_struct *p)
1596{
1597 WARN_ON(task_is_stopped_or_traced(p));
1598 return try_to_wake_up(p, TASK_NORMAL, 0);
1599}
1600EXPORT_SYMBOL(wake_up_process);
1601
1602int wake_up_state(struct task_struct *p, unsigned int state)
1603{
1604 return try_to_wake_up(p, state, 0);
1605}
1606
/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
1613static void __sched_fork(struct task_struct *p)
1614{
1615 p->on_rq = 0;
1616
1617 p->se.on_rq = 0;
1618 p->se.exec_start = 0;
1619 p->se.sum_exec_runtime = 0;
1620 p->se.prev_sum_exec_runtime = 0;
1621 p->se.nr_migrations = 0;
1622 p->se.vruntime = 0;
1623 INIT_LIST_HEAD(&p->se.group_node);
1624
1625#ifdef CONFIG_SCHEDSTATS
1626 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1627#endif
1628
1629 INIT_LIST_HEAD(&p->rt.run_list);
1630
1631#ifdef CONFIG_PREEMPT_NOTIFIERS
1632 INIT_HLIST_HEAD(&p->preempt_notifiers);
1633#endif
1634
1635#ifdef CONFIG_NUMA_BALANCING
1636 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1637 p->mm->numa_next_scan = jiffies;
1638 p->mm->numa_next_reset = jiffies;
1639 p->mm->numa_scan_seq = 0;
1640 }
1641
1642 p->node_stamp = 0ULL;
1643 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1644 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1645 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1646 p->numa_work.next = &p->numa_work;
1647#endif
1648}
1649
1650#ifdef CONFIG_NUMA_BALANCING
1651#ifdef CONFIG_SCHED_DEBUG
1652void set_numabalancing_state(bool enabled)
1653{
1654 if (enabled)
1655 sched_feat_set("NUMA");
1656 else
1657 sched_feat_set("NO_NUMA");
1658}
1659#else
1660__read_mostly bool numabalancing_enabled;
1661
1662void set_numabalancing_state(bool enabled)
1663{
1664 numabalancing_enabled = enabled;
1665}
1666#endif
1667#endif
1668
/*
 * fork()/clone()-time setup:
 */
1672void sched_fork(struct task_struct *p)
1673{
1674 unsigned long flags;
1675 int cpu = get_cpu();
1676
1677 __sched_fork(p);
1678
1679
1680
1681
1682
1683 p->state = TASK_RUNNING;
1684
1685
1686
1687
1688 p->prio = current->normal_prio;
1689
1690
1691
1692
1693 if (unlikely(p->sched_reset_on_fork)) {
1694 if (task_has_rt_policy(p)) {
1695 p->policy = SCHED_NORMAL;
1696 p->static_prio = NICE_TO_PRIO(0);
1697 p->rt_priority = 0;
1698 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1699 p->static_prio = NICE_TO_PRIO(0);
1700
1701 p->prio = p->normal_prio = __normal_prio(p);
1702 set_load_weight(p);
1703
1704
1705
1706
1707
1708 p->sched_reset_on_fork = 0;
1709 }
1710
1711 if (!rt_prio(p->prio))
1712 p->sched_class = &fair_sched_class;
1713
1714 if (p->sched_class->task_fork)
1715 p->sched_class->task_fork(p);
1716
1717
1718
1719
1720
1721
1722
1723
1724 raw_spin_lock_irqsave(&p->pi_lock, flags);
1725 set_task_cpu(p, cpu);
1726 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1727
1728#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1729 if (likely(sched_info_on()))
1730 memset(&p->sched_info, 0, sizeof(p->sched_info));
1731#endif
1732#if defined(CONFIG_SMP)
1733 p->on_cpu = 0;
1734#endif
1735#ifdef CONFIG_PREEMPT_COUNT
1736
1737 task_thread_info(p)->preempt_count = 1;
1738#endif
1739#ifdef CONFIG_SMP
1740 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1741#endif
1742
1743 put_cpu();
1744}
1745
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
1753void wake_up_new_task(struct task_struct *p)
1754{
1755 unsigned long flags;
1756 struct rq *rq;
1757
1758 raw_spin_lock_irqsave(&p->pi_lock, flags);
1759#ifdef CONFIG_SMP
1760
1761
1762
1763
1764
1765 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1766#endif
1767
1768
1769 init_task_runnable_average(p);
1770 rq = __task_rq_lock(p);
1771 activate_task(rq, p, 0);
1772 p->on_rq = 1;
1773 trace_sched_wakeup_new(p, true);
1774 check_preempt_curr(rq, p, WF_FORK);
1775#ifdef CONFIG_SMP
1776 if (p->sched_class->task_woken)
1777 p->sched_class->task_woken(rq, p);
1778#endif
1779 task_rq_unlock(rq, p, &flags);
1780}
1781
1782#ifdef CONFIG_PREEMPT_NOTIFIERS
1783
1784
1785
1786
1787
1788void preempt_notifier_register(struct preempt_notifier *notifier)
1789{
 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1791}
1792EXPORT_SYMBOL_GPL(preempt_notifier_register);
1793
1794
1795
1796
1797
1798
1799
1800void preempt_notifier_unregister(struct preempt_notifier *notifier)
1801{
 hlist_del(&notifier->link);
1803}
1804EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1805
1806static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1807{
1808 struct preempt_notifier *notifier;
1809
1810 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1811 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1812}
1813
1814static void
1815fire_sched_out_preempt_notifiers(struct task_struct *curr,
1816 struct task_struct *next)
1817{
1818 struct preempt_notifier *notifier;
1819
1820 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1821 notifier->ops->sched_out(notifier, next);
1822}
1823
1824#else
1825
1826static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1827{
1828}
1829
1830static void
1831fire_sched_out_preempt_notifiers(struct task_struct *curr,
1832 struct task_struct *next)
1833{
1834}
1835
1836#endif
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851static inline void
1852prepare_task_switch(struct rq *rq, struct task_struct *prev,
1853 struct task_struct *next)
1854{
1855 trace_sched_switch(prev, next);
1856 sched_info_switch(prev, next);
1857 perf_event_task_sched_out(prev, next);
1858 fire_sched_out_preempt_notifiers(prev, next);
1859 prepare_lock_switch(rq, next);
1860 prepare_arch_switch(next);
1861}
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1879 __releases(rq->lock)
1880{
1881 struct mm_struct *mm = rq->prev_mm;
1882 long prev_state;
1883
1884 rq->prev_mm = NULL;
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897 prev_state = prev->state;
1898 vtime_task_switch(prev);
1899 finish_arch_switch(prev);
1900 perf_event_task_sched_in(prev, current);
1901 finish_lock_switch(rq, prev);
1902 finish_arch_post_lock_switch();
1903
1904 fire_sched_in_preempt_notifiers(current);
1905 if (mm)
1906 mmdrop(mm);
1907 if (unlikely(prev_state == TASK_DEAD)) {
1908
1909
1910
1911
1912 kprobe_flush_task(prev);
1913 put_task_struct(prev);
1914 }
1915
1916 tick_nohz_task_switch(current);
1917}
1918
1919#ifdef CONFIG_SMP
1920
1921
1922static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1923{
1924 if (prev->sched_class->pre_schedule)
1925 prev->sched_class->pre_schedule(rq, prev);
1926}
1927
1928
1929static inline void post_schedule(struct rq *rq)
1930{
1931 if (rq->post_schedule) {
1932 unsigned long flags;
1933
1934 raw_spin_lock_irqsave(&rq->lock, flags);
1935 if (rq->curr->sched_class->post_schedule)
1936 rq->curr->sched_class->post_schedule(rq);
1937 raw_spin_unlock_irqrestore(&rq->lock, flags);
1938
1939 rq->post_schedule = 0;
1940 }
1941}
1942
1943#else
1944
1945static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1946{
1947}
1948
1949static inline void post_schedule(struct rq *rq)
1950{
1951}
1952
1953#endif
1954
1955
1956
1957
1958
1959asmlinkage void schedule_tail(struct task_struct *prev)
1960 __releases(rq->lock)
1961{
1962 struct rq *rq = this_rq();
1963
1964 finish_task_switch(rq, prev);
1965
1966
1967
1968
1969
1970 post_schedule(rq);
1971
1972#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1973
1974 preempt_enable();
1975#endif
1976 if (current->set_child_tid)
1977 put_user(task_pid_vnr(current), current->set_child_tid);
1978}
1979
1980
1981
1982
1983
1984static inline void
1985context_switch(struct rq *rq, struct task_struct *prev,
1986 struct task_struct *next)
1987{
1988 struct mm_struct *mm, *oldmm;
1989
1990 prepare_task_switch(rq, prev, next);
1991
1992 mm = next->mm;
1993 oldmm = prev->active_mm;
1994
1995
1996
1997
1998
1999 arch_start_context_switch(prev);
2000
2001 if (!mm) {
2002 next->active_mm = oldmm;
2003 atomic_inc(&oldmm->mm_count);
2004 enter_lazy_tlb(oldmm, next);
2005 } else
2006 switch_mm(oldmm, mm, next);
2007
2008 if (!prev->mm) {
2009 prev->active_mm = NULL;
2010 rq->prev_mm = oldmm;
2011 }
2012
2013
2014
2015
2016
2017
2018#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2019 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2020#endif
2021
2022 context_tracking_task_switch(prev, next);
2023
2024 switch_to(prev, next, prev);
2025
2026 barrier();
2027
2028
2029
2030
2031
2032 finish_task_switch(this_rq(), prev);
2033}
2034
/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */
2041unsigned long nr_running(void)
2042{
2043 unsigned long i, sum = 0;
2044
2045 for_each_online_cpu(i)
2046 sum += cpu_rq(i)->nr_running;
2047
2048 return sum;
2049}
2050
2051unsigned long long nr_context_switches(void)
2052{
2053 int i;
2054 unsigned long long sum = 0;
2055
2056 for_each_possible_cpu(i)
2057 sum += cpu_rq(i)->nr_switches;
2058
2059 return sum;
2060}
2061
2062unsigned long nr_iowait(void)
2063{
2064 unsigned long i, sum = 0;
2065
2066 for_each_possible_cpu(i)
2067 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2068
2069 return sum;
2070}
2071
2072unsigned long nr_iowait_cpu(int cpu)
2073{
2074 struct rq *this = cpu_rq(cpu);
2075 return atomic_read(&this->nr_iowait);
2076}
2077
2078#ifdef CONFIG_SMP
2079
2080
2081
2082
2083
2084void sched_exec(void)
2085{
2086 struct task_struct *p = current;
2087 unsigned long flags;
2088 int dest_cpu;
2089
2090 raw_spin_lock_irqsave(&p->pi_lock, flags);
2091 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2092 if (dest_cpu == smp_processor_id())
2093 goto unlock;
2094
2095 if (likely(cpu_active(dest_cpu))) {
2096 struct migration_arg arg = { p, dest_cpu };
2097
2098 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2099 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2100 return;
2101 }
2102unlock:
2103 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2104}
2105
2106#endif
2107
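/* Per-CPU kernel statistics and cpustat data, used by the CPU time accounting code. */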
2108DEFINE_PER_CPU(struct kernel_stat, kstat);
2109DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2110
2111EXPORT_PER_CPU_SYMBOL(kstat);
2112EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2113
2114
2115
2116
2117
2118
2119
2120static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2121{
2122 u64 ns = 0;
2123
2124 if (task_current(rq, p)) {
2125 update_rq_clock(rq);
2126 ns = rq_clock_task(rq) - p->se.exec_start;
2127 if ((s64)ns < 0)
2128 ns = 0;
2129 }
2130
2131 return ns;
2132}
2133
2134unsigned long long task_delta_exec(struct task_struct *p)
2135{
2136 unsigned long flags;
2137 struct rq *rq;
2138 u64 ns = 0;
2139
2140 rq = task_rq_lock(p, &flags);
2141 ns = do_task_delta_exec(p, rq);
2142 task_rq_unlock(rq, p, &flags);
2143
2144 return ns;
2145}
2146
2147
2148
2149
2150
2151
2152unsigned long long task_sched_runtime(struct task_struct *p)
2153{
2154 unsigned long flags;
2155 struct rq *rq;
2156 u64 ns = 0;
2157
2158 rq = task_rq_lock(p, &flags);
2159 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2160 task_rq_unlock(rq, p, &flags);
2161
2162 return ns;
2163}

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
2169void scheduler_tick(void)
2170{
2171 int cpu = smp_processor_id();
2172 struct rq *rq = cpu_rq(cpu);
2173 struct task_struct *curr = rq->curr;
2174
2175 sched_clock_tick();
2176
2177 raw_spin_lock(&rq->lock);
2178 update_rq_clock(rq);
2179 curr->sched_class->task_tick(rq, curr, 0);
2180 update_cpu_load_active(rq);
2181 raw_spin_unlock(&rq->lock);
2182
2183 perf_event_task_tick();
2184
2185#ifdef CONFIG_SMP
2186 rq->idle_balance = idle_cpu(cpu);
2187 trigger_load_balance(rq, cpu);
2188#endif
2189 rq_last_tick_reset(rq);
2190}
2191
2192#ifdef CONFIG_NO_HZ_FULL
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206u64 scheduler_tick_max_deferment(void)
2207{
2208 struct rq *rq = this_rq();
2209 unsigned long next, now = ACCESS_ONCE(jiffies);
2210
2211 next = rq->last_sched_tick + HZ;
2212
2213 if (time_before_eq(next, now))
2214 return 0;
2215
2216 return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
2217}
2218#endif
2219
2220notrace unsigned long get_parent_ip(unsigned long addr)
2221{
2222 if (in_lock_functions(addr)) {
2223 addr = CALLER_ADDR2;
2224 if (in_lock_functions(addr))
2225 addr = CALLER_ADDR3;
2226 }
2227 return addr;
2228}
2229
2230#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2231 defined(CONFIG_PREEMPT_TRACER))
2232
2233void __kprobes add_preempt_count(int val)
2234{
2235#ifdef CONFIG_DEBUG_PREEMPT
2236
2237
2238
2239 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2240 return;
2241#endif
2242 preempt_count() += val;
2243#ifdef CONFIG_DEBUG_PREEMPT
2244
2245
2246
2247 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2248 PREEMPT_MASK - 10);
2249#endif
2250 if (preempt_count() == val)
2251 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2252}
2253EXPORT_SYMBOL(add_preempt_count);
2254
2255void __kprobes sub_preempt_count(int val)
2256{
2257#ifdef CONFIG_DEBUG_PREEMPT
2258
2259
2260
2261 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2262 return;
2263
2264
2265
2266 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2267 !(preempt_count() & PREEMPT_MASK)))
2268 return;
2269#endif
2270
2271 if (preempt_count() == val)
2272 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2273 preempt_count() -= val;
2274}
2275EXPORT_SYMBOL(sub_preempt_count);
2276
2277#endif
2278
2279
2280
2281
2282static noinline void __schedule_bug(struct task_struct *prev)
2283{
2284 if (oops_in_progress)
2285 return;
2286
2287 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2288 prev->comm, prev->pid, preempt_count());
2289
2290 debug_show_held_locks(prev);
2291 print_modules();
2292 if (irqs_disabled())
2293 print_irqtrace_events(prev);
2294 dump_stack();
2295 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2296}
2297
2298
2299
2300
2301static inline void schedule_debug(struct task_struct *prev)
2302{
2303
2304
2305
2306
2307
2308 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
2309 __schedule_bug(prev);
2310 rcu_sleep_check();
2311
2312 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2313
2314 schedstat_inc(this_rq(), sched_count);
2315}
2316
2317static void put_prev_task(struct rq *rq, struct task_struct *prev)
2318{
2319 if (prev->on_rq || rq->skip_clock_update < 0)
2320 update_rq_clock(rq);
2321 prev->sched_class->put_prev_task(rq, prev);
2322}
2323
2324
2325
2326
2327static inline struct task_struct *
2328pick_next_task(struct rq *rq)
2329{
2330 const struct sched_class *class;
2331 struct task_struct *p;
2332
2333
2334
2335
2336
2337 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2338 p = fair_sched_class.pick_next_task(rq);
2339 if (likely(p))
2340 return p;
2341 }
2342
2343 for_each_class(class) {
2344 p = class->pick_next_task(rq);
2345 if (p)
2346 return p;
2347 }
2348
2349 BUG();
2350}
2351
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 */
2389static void __sched __schedule(void)
2390{
2391 struct task_struct *prev, *next;
2392 unsigned long *switch_count;
2393 struct rq *rq;
2394 int cpu;
2395
2396need_resched:
2397 preempt_disable();
2398 cpu = smp_processor_id();
2399 rq = cpu_rq(cpu);
2400 rcu_note_context_switch(cpu);
2401 prev = rq->curr;
2402
2403 schedule_debug(prev);
2404
2405 if (sched_feat(HRTICK))
2406 hrtick_clear(rq);
2407
2408
2409
2410
2411
2412
2413 smp_mb__before_spinlock();
2414 raw_spin_lock_irq(&rq->lock);
2415
2416 switch_count = &prev->nivcsw;
2417 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2418 if (unlikely(signal_pending_state(prev->state, prev))) {
2419 prev->state = TASK_RUNNING;
2420 } else {
2421 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2422 prev->on_rq = 0;
2423
2424
2425
2426
2427
2428
2429 if (prev->flags & PF_WQ_WORKER) {
2430 struct task_struct *to_wakeup;
2431
2432 to_wakeup = wq_worker_sleeping(prev, cpu);
2433 if (to_wakeup)
2434 try_to_wake_up_local(to_wakeup);
2435 }
2436 }
2437 switch_count = &prev->nvcsw;
2438 }
2439
2440 pre_schedule(rq, prev);
2441
2442 if (unlikely(!rq->nr_running))
2443 idle_balance(cpu, rq);
2444
2445 put_prev_task(rq, prev);
2446 next = pick_next_task(rq);
2447 clear_tsk_need_resched(prev);
2448 rq->skip_clock_update = 0;
2449
2450 if (likely(prev != next)) {
2451 rq->nr_switches++;
2452 rq->curr = next;
2453 ++*switch_count;
2454
2455 context_switch(rq, prev, next);
2456
2457
2458
2459
2460
2461
2462 cpu = smp_processor_id();
2463 rq = cpu_rq(cpu);
2464 } else
2465 raw_spin_unlock_irq(&rq->lock);
2466
2467 post_schedule(rq);
2468
2469 sched_preempt_enable_no_resched();
2470 if (need_resched())
2471 goto need_resched;
2472}
2473
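/*
 * If the task has plugged block I/O pending, flush it before it sleeps so
 * that it does not deadlock waiting for I/O it queued itself.
 */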
2474static inline void sched_submit_work(struct task_struct *tsk)
2475{
2476 if (!tsk->state || tsk_is_pi_blocked(tsk))
2477 return;
2478
2479
2480
2481
2482 if (blk_needs_flush_plug(tsk))
2483 blk_schedule_flush_plug(tsk);
2484}
2485
2486asmlinkage void __sched schedule(void)
2487{
2488 struct task_struct *tsk = current;
2489
2490 sched_submit_work(tsk);
2491 __schedule();
2492}
2493EXPORT_SYMBOL(schedule);
2494
2495#ifdef CONFIG_CONTEXT_TRACKING
2496asmlinkage void __sched schedule_user(void)
2497{
2498
2499
2500
2501
2502
2503
2504 user_exit();
2505 schedule();
2506 user_enter();
2507}
2508#endif
2509
2510
2511
2512
2513
2514
2515void __sched schedule_preempt_disabled(void)
2516{
2517 sched_preempt_enable_no_resched();
2518 schedule();
2519 preempt_disable();
2520}
2521
2522#ifdef CONFIG_PREEMPT
2523
2524
2525
2526
2527
2528asmlinkage void __sched notrace preempt_schedule(void)
2529{
2530 struct thread_info *ti = current_thread_info();
2531
2532
2533
2534
2535
2536 if (likely(ti->preempt_count || irqs_disabled()))
2537 return;
2538
2539 do {
2540 add_preempt_count_notrace(PREEMPT_ACTIVE);
2541 __schedule();
2542 sub_preempt_count_notrace(PREEMPT_ACTIVE);
2543
2544
2545
2546
2547
2548 barrier();
2549 } while (need_resched());
2550}
2551EXPORT_SYMBOL(preempt_schedule);
2552
2553
2554
2555
2556
2557
2558
2559asmlinkage void __sched preempt_schedule_irq(void)
2560{
2561 struct thread_info *ti = current_thread_info();
2562 enum ctx_state prev_state;
2563
2564
2565 BUG_ON(ti->preempt_count || !irqs_disabled());
2566
2567 prev_state = exception_enter();
2568
2569 do {
2570 add_preempt_count(PREEMPT_ACTIVE);
2571 local_irq_enable();
2572 __schedule();
2573 local_irq_disable();
2574 sub_preempt_count(PREEMPT_ACTIVE);
2575
2576
2577
2578
2579
2580 barrier();
2581 } while (need_resched());
2582
2583 exception_exit(prev_state);
2584}
2585
2586#endif
2587
2588int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2589 void *key)
2590{
2591 return try_to_wake_up(curr->private, mode, wake_flags);
2592}
2593EXPORT_SYMBOL(default_wake_function);
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2605 int nr_exclusive, int wake_flags, void *key)
2606{
2607 wait_queue_t *curr, *next;
2608
2609 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2610 unsigned flags = curr->flags;
2611
2612 if (curr->func(curr, mode, wake_flags, key) &&
2613 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2614 break;
2615 }
2616}
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628void __wake_up(wait_queue_head_t *q, unsigned int mode,
2629 int nr_exclusive, void *key)
2630{
2631 unsigned long flags;
2632
2633 spin_lock_irqsave(&q->lock, flags);
2634 __wake_up_common(q, mode, nr_exclusive, 0, key);
2635 spin_unlock_irqrestore(&q->lock, flags);
2636}
2637EXPORT_SYMBOL(__wake_up);
2638
2639
2640
2641
2642void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2643{
2644 __wake_up_common(q, mode, nr, 0, NULL);
2645}
2646EXPORT_SYMBOL_GPL(__wake_up_locked);
2647
2648void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2649{
2650 __wake_up_common(q, mode, 1, 0, key);
2651}
2652EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2672 int nr_exclusive, void *key)
2673{
2674 unsigned long flags;
2675 int wake_flags = WF_SYNC;
2676
2677 if (unlikely(!q))
2678 return;
2679
2680 if (unlikely(!nr_exclusive))
2681 wake_flags = 0;
2682
2683 spin_lock_irqsave(&q->lock, flags);
2684 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2685 spin_unlock_irqrestore(&q->lock, flags);
2686}
2687EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2688
2689
2690
2691
2692void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2693{
2694 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2695}
2696EXPORT_SYMBOL_GPL(__wake_up_sync);
2697
/**
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion. Threads will be
 * awakened in the same order in which they were queued.
 *
 * See also complete_all(), wait_for_completion() and related routines.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
2710void complete(struct completion *x)
2711{
2712 unsigned long flags;
2713
2714 spin_lock_irqsave(&x->wait.lock, flags);
2715 x->done++;
2716 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2717 spin_unlock_irqrestore(&x->wait.lock, flags);
2718}
2719EXPORT_SYMBOL(complete);
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730void complete_all(struct completion *x)
2731{
2732 unsigned long flags;
2733
2734 spin_lock_irqsave(&x->wait.lock, flags);
2735 x->done += UINT_MAX/2;
2736 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2737 spin_unlock_irqrestore(&x->wait.lock, flags);
2738}
2739EXPORT_SYMBOL(complete_all);
2740
2741static inline long __sched
2742do_wait_for_common(struct completion *x,
2743 long (*action)(long), long timeout, int state)
2744{
2745 if (!x->done) {
2746 DECLARE_WAITQUEUE(wait, current);
2747
2748 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2749 do {
2750 if (signal_pending_state(state, current)) {
2751 timeout = -ERESTARTSYS;
2752 break;
2753 }
2754 __set_current_state(state);
2755 spin_unlock_irq(&x->wait.lock);
2756 timeout = action(timeout);
2757 spin_lock_irq(&x->wait.lock);
2758 } while (!x->done && timeout);
2759 __remove_wait_queue(&x->wait, &wait);
2760 if (!x->done)
2761 return timeout;
2762 }
2763 x->done--;
2764 return timeout ?: 1;
2765}
2766
2767static inline long __sched
2768__wait_for_common(struct completion *x,
2769 long (*action)(long), long timeout, int state)
2770{
2771 might_sleep();
2772
2773 spin_lock_irq(&x->wait.lock);
2774 timeout = do_wait_for_common(x, action, timeout, state);
2775 spin_unlock_irq(&x->wait.lock);
2776 return timeout;
2777}
2778
2779static long __sched
2780wait_for_common(struct completion *x, long timeout, int state)
2781{
2782 return __wait_for_common(x, schedule_timeout, timeout, state);
2783}
2784
2785static long __sched
2786wait_for_common_io(struct completion *x, long timeout, int state)
2787{
2788 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2789}
2790
/**
 * wait_for_completion: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout.
 *
 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
 * and interrupt capability. Also see complete().
 */
2801void __sched wait_for_completion(struct completion *x)
2802{
2803 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2804}
2805EXPORT_SYMBOL(wait_for_completion);
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819unsigned long __sched
2820wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2821{
2822 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2823}
2824EXPORT_SYMBOL(wait_for_completion_timeout);
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834void __sched wait_for_completion_io(struct completion *x)
2835{
2836 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2837}
2838EXPORT_SYMBOL(wait_for_completion_io);
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852unsigned long __sched
2853wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2854{
2855 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2856}
2857EXPORT_SYMBOL(wait_for_completion_io_timeout);
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868int __sched wait_for_completion_interruptible(struct completion *x)
2869{
2870 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2871 if (t == -ERESTARTSYS)
2872 return t;
2873 return 0;
2874}
2875EXPORT_SYMBOL(wait_for_completion_interruptible);
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888long __sched
2889wait_for_completion_interruptible_timeout(struct completion *x,
2890 unsigned long timeout)
2891{
2892 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2893}
2894EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905int __sched wait_for_completion_killable(struct completion *x)
2906{
2907 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2908 if (t == -ERESTARTSYS)
2909 return t;
2910 return 0;
2911}
2912EXPORT_SYMBOL(wait_for_completion_killable);
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926long __sched
2927wait_for_completion_killable_timeout(struct completion *x,
2928 unsigned long timeout)
2929{
2930 return wait_for_common(x, timeout, TASK_KILLABLE);
2931}
2932EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946bool try_wait_for_completion(struct completion *x)
2947{
2948 unsigned long flags;
2949 int ret = 1;
2950
2951 spin_lock_irqsave(&x->wait.lock, flags);
2952 if (!x->done)
2953 ret = 0;
2954 else
2955 x->done--;
2956 spin_unlock_irqrestore(&x->wait.lock, flags);
2957 return ret;
2958}
2959EXPORT_SYMBOL(try_wait_for_completion);
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969bool completion_done(struct completion *x)
2970{
2971 unsigned long flags;
2972 int ret = 1;
2973
2974 spin_lock_irqsave(&x->wait.lock, flags);
2975 if (!x->done)
2976 ret = 0;
2977 spin_unlock_irqrestore(&x->wait.lock, flags);
2978 return ret;
2979}
2980EXPORT_SYMBOL(completion_done);
2981
2982static long __sched
2983sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2984{
2985 unsigned long flags;
2986 wait_queue_t wait;
2987
2988 init_waitqueue_entry(&wait, current);
2989
2990 __set_current_state(state);
2991
2992 spin_lock_irqsave(&q->lock, flags);
2993 __add_wait_queue(q, &wait);
2994 spin_unlock(&q->lock);
2995 timeout = schedule_timeout(timeout);
2996 spin_lock_irq(&q->lock);
2997 __remove_wait_queue(q, &wait);
2998 spin_unlock_irqrestore(&q->lock, flags);
2999
3000 return timeout;
3001}
3002
3003void __sched interruptible_sleep_on(wait_queue_head_t *q)
3004{
3005 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3006}
3007EXPORT_SYMBOL(interruptible_sleep_on);
3008
3009long __sched
3010interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3011{
3012 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3013}
3014EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3015
3016void __sched sleep_on(wait_queue_head_t *q)
3017{
3018 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3019}
3020EXPORT_SYMBOL(sleep_on);
3021
3022long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3023{
3024 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3025}
3026EXPORT_SYMBOL(sleep_on_timeout);
3027
3028#ifdef CONFIG_RT_MUTEXES

/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */
3040void rt_mutex_setprio(struct task_struct *p, int prio)
3041{
3042 int oldprio, on_rq, running;
3043 struct rq *rq;
3044 const struct sched_class *prev_class;
3045
3046 BUG_ON(prio < 0 || prio > MAX_PRIO);
3047
3048 rq = __task_rq_lock(p);
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062 if (unlikely(p == rq->idle)) {
3063 WARN_ON(p != rq->curr);
3064 WARN_ON(p->pi_blocked_on);
3065 goto out_unlock;
3066 }
3067
3068 trace_sched_pi_setprio(p, prio);
3069 oldprio = p->prio;
3070 prev_class = p->sched_class;
3071 on_rq = p->on_rq;
3072 running = task_current(rq, p);
3073 if (on_rq)
3074 dequeue_task(rq, p, 0);
3075 if (running)
3076 p->sched_class->put_prev_task(rq, p);
3077
3078 if (rt_prio(prio))
3079 p->sched_class = &rt_sched_class;
3080 else
3081 p->sched_class = &fair_sched_class;
3082
3083 p->prio = prio;
3084
3085 if (running)
3086 p->sched_class->set_curr_task(rq);
3087 if (on_rq)
3088 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3089
3090 check_class_changed(rq, p, prev_class, oldprio);
3091out_unlock:
3092 __task_rq_unlock(rq);
3093}
3094#endif

3095void set_user_nice(struct task_struct *p, long nice)
3096{
3097 int old_prio, delta, on_rq;
3098 unsigned long flags;
3099 struct rq *rq;
3100
3101 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3102 return;
3103
3104
3105
3106
3107 rq = task_rq_lock(p, &flags);
3108
3109
3110
3111
3112
3113
3114 if (task_has_rt_policy(p)) {
3115 p->static_prio = NICE_TO_PRIO(nice);
3116 goto out_unlock;
3117 }
3118 on_rq = p->on_rq;
3119 if (on_rq)
3120 dequeue_task(rq, p, 0);
3121
3122 p->static_prio = NICE_TO_PRIO(nice);
3123 set_load_weight(p);
3124 old_prio = p->prio;
3125 p->prio = effective_prio(p);
3126 delta = p->prio - old_prio;
3127
3128 if (on_rq) {
3129 enqueue_task(rq, p, 0);
3130
3131
3132
3133
3134 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3135 resched_task(rq->curr);
3136 }
3137out_unlock:
3138 task_rq_unlock(rq, p, &flags);
3139}
3140EXPORT_SYMBOL(set_user_nice);
3141
3142
3143
3144
3145
3146
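/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 *
 * The comparison converts the nice value [19,-20] into the rlimit-style
 * value [1,40] that RLIMIT_NICE is expressed in.
 */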
3147int can_nice(const struct task_struct *p, const int nice)
3148{
3149
3150 int nice_rlim = 20 - nice;
3151
3152 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3153 capable(CAP_SYS_NICE));
3154}
3155
3156#ifdef __ARCH_WANT_SYS_NICE
3157
3158
3159
3160
3161
3162
3163
3164
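/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */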
3165SYSCALL_DEFINE1(nice, int, increment)
3166{
3167 long nice, retval;
3168
3169
3170
3171
3172
3173
3174 if (increment < -40)
3175 increment = -40;
3176 if (increment > 40)
3177 increment = 40;
3178
3179 nice = TASK_NICE(current) + increment;
3180 if (nice < -20)
3181 nice = -20;
3182 if (nice > 19)
3183 nice = 19;
3184
3185 if (increment < 0 && !can_nice(current, nice))
3186 return -EPERM;
3187
3188 retval = security_task_setnice(current, nice);
3189 if (retval)
3190 return retval;
3191
3192 set_user_nice(current, nice);
3193 return 0;
3194}
3195
3196#endif
3197
3198
3199
3200
3201
3202
3203
3204
3205
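/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The task's effective priority rebased by -MAX_RT_PRIO, which is
 * the value reported to userspace in /proc. RT tasks yield negative values.
 */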
3206int task_prio(const struct task_struct *p)
3207{
3208 return p->prio - MAX_RT_PRIO;
3209}
3210
3211
3212
3213
3214
3215
3216
3217int task_nice(const struct task_struct *p)
3218{
3219 return TASK_NICE(p);
3220}
3221EXPORT_SYMBOL(task_nice);
3222
3223
3224
3225
3226
3227
3228
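/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle, 0 otherwise.
 */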
3229int idle_cpu(int cpu)
3230{
3231 struct rq *rq = cpu_rq(cpu);
3232
3233 if (rq->curr != rq->idle)
3234 return 0;
3235
3236 if (rq->nr_running)
3237 return 0;
3238
3239#ifdef CONFIG_SMP
3240 if (!llist_empty(&rq->wake_list))
3241 return 0;
3242#endif
3243
3244 return 1;
3245}
3246
3247
3248
3249
3250
3251
3252
3253struct task_struct *idle_task(int cpu)
3254{
3255 return cpu_rq(cpu)->idle;
3256}
3257
3258
3259
3260
3261
3262
3263
3264static struct task_struct *find_process_by_pid(pid_t pid)
3265{
3266 return pid ? find_task_by_vpid(pid) : current;
3267}
3268
3269
3270static void
3271__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3272{
3273 p->policy = policy;
3274 p->rt_priority = prio;
3275 p->normal_prio = normal_prio(p);
3276
3277 p->prio = rt_mutex_getprio(p);
3278 if (rt_prio(p->prio))
3279 p->sched_class = &rt_sched_class;
3280 else
3281 p->sched_class = &fair_sched_class;
3282 set_load_weight(p);
3283}
3284
3285
3286
3287
3288static bool check_same_owner(struct task_struct *p)
3289{
3290 const struct cred *cred = current_cred(), *pcred;
3291 bool match;
3292
3293 rcu_read_lock();
3294 pcred = __task_cred(p);
3295 match = (uid_eq(cred->euid, pcred->euid) ||
3296 uid_eq(cred->euid, pcred->uid));
3297 rcu_read_unlock();
3298 return match;
3299}
3300
3301static int __sched_setscheduler(struct task_struct *p, int policy,
3302 const struct sched_param *param, bool user)
3303{
3304 int retval, oldprio, oldpolicy = -1, on_rq, running;
3305 unsigned long flags;
3306 const struct sched_class *prev_class;
3307 struct rq *rq;
3308 int reset_on_fork;
3309
3310
3311 BUG_ON(in_interrupt());
3312recheck:
3313
3314 if (policy < 0) {
3315 reset_on_fork = p->sched_reset_on_fork;
3316 policy = oldpolicy = p->policy;
3317 } else {
3318 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3319 policy &= ~SCHED_RESET_ON_FORK;
3320
3321 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3322 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3323 policy != SCHED_IDLE)
3324 return -EINVAL;
3325 }
3326
3327
3328
3329
3330
3331
3332 if (param->sched_priority < 0 ||
3333 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3334 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3335 return -EINVAL;
3336 if (rt_policy(policy) != (param->sched_priority != 0))
3337 return -EINVAL;
3338
3339
3340
3341
3342 if (user && !capable(CAP_SYS_NICE)) {
3343 if (rt_policy(policy)) {
3344 unsigned long rlim_rtprio =
3345 task_rlimit(p, RLIMIT_RTPRIO);
3346
3347
3348 if (policy != p->policy && !rlim_rtprio)
3349 return -EPERM;
3350
3351
3352 if (param->sched_priority > p->rt_priority &&
3353 param->sched_priority > rlim_rtprio)
3354 return -EPERM;
3355 }
3356
3357
3358
3359
3360
3361 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3362 if (!can_nice(p, TASK_NICE(p)))
3363 return -EPERM;
3364 }
3365
3366
3367 if (!check_same_owner(p))
3368 return -EPERM;
3369
3370
3371 if (p->sched_reset_on_fork && !reset_on_fork)
3372 return -EPERM;
3373 }
3374
3375 if (user) {
3376 retval = security_task_setscheduler(p);
3377 if (retval)
3378 return retval;
3379 }
3380
3381
3382
3383
3384
3385
3386
3387
3388 rq = task_rq_lock(p, &flags);
3389
3390
3391
3392
3393 if (p == rq->stop) {
3394 task_rq_unlock(rq, p, &flags);
3395 return -EINVAL;
3396 }
3397
3398
3399
3400
3401 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3402 param->sched_priority == p->rt_priority))) {
3403 task_rq_unlock(rq, p, &flags);
3404 return 0;
3405 }
3406
3407#ifdef CONFIG_RT_GROUP_SCHED
3408 if (user) {
3409
3410
3411
3412
3413 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3414 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3415 !task_group_is_autogroup(task_group(p))) {
3416 task_rq_unlock(rq, p, &flags);
3417 return -EPERM;
3418 }
3419 }
3420#endif
3421
3422
3423 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3424 policy = oldpolicy = -1;
3425 task_rq_unlock(rq, p, &flags);
3426 goto recheck;
3427 }
3428 on_rq = p->on_rq;
3429 running = task_current(rq, p);
3430 if (on_rq)
3431 dequeue_task(rq, p, 0);
3432 if (running)
3433 p->sched_class->put_prev_task(rq, p);
3434
3435 p->sched_reset_on_fork = reset_on_fork;
3436
3437 oldprio = p->prio;
3438 prev_class = p->sched_class;
3439 __setscheduler(rq, p, policy, param->sched_priority);
3440
3441 if (running)
3442 p->sched_class->set_curr_task(rq);
3443 if (on_rq)
3444 enqueue_task(rq, p, 0);
3445
3446 check_class_changed(rq, p, prev_class, oldprio);
3447 task_rq_unlock(rq, p, &flags);
3448
3449 rt_mutex_adjust_pi(p);
3450
3451 return 0;
3452}
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
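/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 *
 * Illustrative (hypothetical) in-kernel usage switching a kthread to
 * SCHED_FIFO; 'my_thread' is a placeholder task pointer:
 *
 *	struct sched_param sp = { .sched_priority = 10 };
 *
 *	if (sched_setscheduler(my_thread, SCHED_FIFO, &sp))
 *		pr_warn("could not make my_thread SCHED_FIFO\n");
 */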
3464int sched_setscheduler(struct task_struct *p, int policy,
3465 const struct sched_param *param)
3466{
3467 return __sched_setscheduler(p, policy, param, true);
3468}
3469EXPORT_SYMBOL_GPL(sched_setscheduler);
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
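/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority
 * of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the current
 * context has permission. For example, this is needed in stop_machine():
 * we create temporary high priority worker threads, but our caller might
 * not have that capability.
 *
 * Return: 0 on success. An error code otherwise.
 */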
3484int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3485 const struct sched_param *param)
3486{
3487 return __sched_setscheduler(p, policy, param, false);
3488}
3489
3490static int
3491do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3492{
3493 struct sched_param lparam;
3494 struct task_struct *p;
3495 int retval;
3496
3497 if (!param || pid < 0)
3498 return -EINVAL;
3499 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3500 return -EFAULT;
3501
3502 rcu_read_lock();
3503 retval = -ESRCH;
3504 p = find_process_by_pid(pid);
3505 if (p != NULL)
3506 retval = sched_setscheduler(p, policy, &lparam);
3507 rcu_read_unlock();
3508
3509 return retval;
3510}
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3521 struct sched_param __user *, param)
3522{
3523
3524 if (policy < 0)
3525 return -EINVAL;
3526
3527 return do_sched_setscheduler(pid, policy, param);
3528}
3529
3530
3531
3532
3533
3534
3535
3536
3537SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3538{
3539 return do_sched_setscheduler(pid, -1, param);
3540}
3541
3542
3543
3544
3545
3546
3547
3548
3549SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3550{
3551 struct task_struct *p;
3552 int retval;
3553
3554 if (pid < 0)
3555 return -EINVAL;
3556
3557 retval = -ESRCH;
3558 rcu_read_lock();
3559 p = find_process_by_pid(pid);
3560 if (p) {
3561 retval = security_task_getscheduler(p);
3562 if (!retval)
3563 retval = p->policy
3564 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3565 }
3566 rcu_read_unlock();
3567 return retval;
3568}
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3579{
3580 struct sched_param lp;
3581 struct task_struct *p;
3582 int retval;
3583
3584 if (!param || pid < 0)
3585 return -EINVAL;
3586
3587 rcu_read_lock();
3588 p = find_process_by_pid(pid);
3589 retval = -ESRCH;
3590 if (!p)
3591 goto out_unlock;
3592
3593 retval = security_task_getscheduler(p);
3594 if (retval)
3595 goto out_unlock;
3596
3597 lp.sched_priority = p->rt_priority;
3598 rcu_read_unlock();
3599
3600
3601
3602
3603 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3604
3605 return retval;
3606
3607out_unlock:
3608 rcu_read_unlock();
3609 return retval;
3610}
3611
3612long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3613{
3614 cpumask_var_t cpus_allowed, new_mask;
3615 struct task_struct *p;
3616 int retval;
3617
3618 get_online_cpus();
3619 rcu_read_lock();
3620
3621 p = find_process_by_pid(pid);
3622 if (!p) {
3623 rcu_read_unlock();
3624 put_online_cpus();
3625 return -ESRCH;
3626 }
3627
3628
3629 get_task_struct(p);
3630 rcu_read_unlock();
3631
3632 if (p->flags & PF_NO_SETAFFINITY) {
3633 retval = -EINVAL;
3634 goto out_put_task;
3635 }
3636 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
3637 retval = -ENOMEM;
3638 goto out_put_task;
3639 }
3640 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
3641 retval = -ENOMEM;
3642 goto out_free_cpus_allowed;
3643 }
3644 retval = -EPERM;
3645 if (!check_same_owner(p)) {
3646 rcu_read_lock();
3647 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3648 rcu_read_unlock();
3649 goto out_unlock;
3650 }
3651 rcu_read_unlock();
3652 }
3653
3654 retval = security_task_setscheduler(p);
3655 if (retval)
3656 goto out_unlock;
3657
3658 cpuset_cpus_allowed(p, cpus_allowed);
3659 cpumask_and(new_mask, in_mask, cpus_allowed);
3660again:
3661 retval = set_cpus_allowed_ptr(p, new_mask);
3662
3663 if (!retval) {
3664 cpuset_cpus_allowed(p, cpus_allowed);
3665 if (!cpumask_subset(new_mask, cpus_allowed)) {
3666
3667
3668
3669
3670
3671 cpumask_copy(new_mask, cpus_allowed);
3672 goto again;
3673 }
3674 }
3675out_unlock:
3676 free_cpumask_var(new_mask);
3677out_free_cpus_allowed:
3678 free_cpumask_var(cpus_allowed);
3679out_put_task:
3680 put_task_struct(p);
3681 put_online_cpus();
3682 return retval;
3683}
3684
3685static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3686 struct cpumask *new_mask)
3687{
3688 if (len < cpumask_size())
3689 cpumask_clear(new_mask);
3690 else if (len > cpumask_size())
3691 len = cpumask_size();
3692
3693 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3694}
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3705 unsigned long __user *, user_mask_ptr)
3706{
3707 cpumask_var_t new_mask;
3708 int retval;
3709
3710 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3711 return -ENOMEM;
3712
3713 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
3714 if (retval == 0)
3715 retval = sched_setaffinity(pid, new_mask);
3716 free_cpumask_var(new_mask);
3717 return retval;
3718}
3719
3720long sched_getaffinity(pid_t pid, struct cpumask *mask)
3721{
3722 struct task_struct *p;
3723 unsigned long flags;
3724 int retval;
3725
3726 get_online_cpus();
3727 rcu_read_lock();
3728
3729 retval = -ESRCH;
3730 p = find_process_by_pid(pid);
3731 if (!p)
3732 goto out_unlock;
3733
3734 retval = security_task_getscheduler(p);
3735 if (retval)
3736 goto out_unlock;
3737
3738 raw_spin_lock_irqsave(&p->pi_lock, flags);
3739 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
3740 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3741
3742out_unlock:
3743 rcu_read_unlock();
3744 put_online_cpus();
3745
3746 return retval;
3747}
3748
3749
3750
3751
3752
3753
3754
3755
3756
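/**
 * sys_sched_getaffinity - get the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current cpu mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */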
3757SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3758 unsigned long __user *, user_mask_ptr)
3759{
3760 int ret;
3761 cpumask_var_t mask;
3762
3763 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
3764 return -EINVAL;
3765 if (len & (sizeof(unsigned long)-1))
3766 return -EINVAL;
3767
3768 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
3769 return -ENOMEM;
3770
3771 ret = sched_getaffinity(pid, mask);
3772 if (ret == 0) {
3773 size_t retlen = min_t(size_t, len, cpumask_size());
3774
3775 if (copy_to_user(user_mask_ptr, mask, retlen))
3776 ret = -EFAULT;
3777 else
3778 ret = retlen;
3779 }
3780 free_cpumask_var(mask);
3781
3782 return ret;
3783}
3784
3785
3786
3787
3788
3789
3790
3791
3792
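/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */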
3793SYSCALL_DEFINE0(sched_yield)
3794{
3795 struct rq *rq = this_rq_lock();
3796
3797 schedstat_inc(rq, yld_count);
3798 current->sched_class->yield_task(rq);
3799
3800
3801
3802
3803
3804 __release(rq->lock);
3805 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3806 do_raw_spin_unlock(&rq->lock);
3807 sched_preempt_enable_no_resched();
3808
3809 schedule();
3810
3811 return 0;
3812}
3813
3814static inline int should_resched(void)
3815{
3816 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3817}
3818
3819static void __cond_resched(void)
3820{
3821 add_preempt_count(PREEMPT_ACTIVE);
3822 __schedule();
3823 sub_preempt_count(PREEMPT_ACTIVE);
3824}
3825
3826int __sched _cond_resched(void)
3827{
3828 if (should_resched()) {
3829 __cond_resched();
3830 return 1;
3831 }
3832 return 0;
3833}
3834EXPORT_SYMBOL(_cond_resched);
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844int __cond_resched_lock(spinlock_t *lock)
3845{
3846 int resched = should_resched();
3847 int ret = 0;
3848
3849 lockdep_assert_held(lock);
3850
3851 if (spin_needbreak(lock) || resched) {
3852 spin_unlock(lock);
3853 if (resched)
3854 __cond_resched();
3855 else
3856 cpu_relax();
3857 ret = 1;
3858 spin_lock(lock);
3859 }
3860 return ret;
3861}
3862EXPORT_SYMBOL(__cond_resched_lock);
3863
3864int __sched __cond_resched_softirq(void)
3865{
3866 BUG_ON(!in_softirq());
3867
3868 if (should_resched()) {
3869 local_bh_enable();
3870 __cond_resched();
3871 local_bh_disable();
3872 return 1;
3873 }
3874 return 0;
3875}
3876EXPORT_SYMBOL(__cond_resched_softirq);
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
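/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run; if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 *	while (!event)
 *		yield();
 *
 * where one assumes that yield() will let 'the other' process run and make
 * event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */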
3900void __sched yield(void)
3901{
3902 set_current_state(TASK_RUNNING);
3903 sys_sched_yield();
3904}
3905EXPORT_SYMBOL(yield);
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
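/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */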
3922bool __sched yield_to(struct task_struct *p, bool preempt)
3923{
3924 struct task_struct *curr = current;
3925 struct rq *rq, *p_rq;
3926 unsigned long flags;
3927 int yielded = 0;
3928
3929 local_irq_save(flags);
3930 rq = this_rq();
3931
3932again:
3933 p_rq = task_rq(p);
3934
3935
3936
3937
3938 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
3939 yielded = -ESRCH;
3940 goto out_irq;
3941 }
3942
3943 double_rq_lock(rq, p_rq);
3944 while (task_rq(p) != p_rq) {
3945 double_rq_unlock(rq, p_rq);
3946 goto again;
3947 }
3948
3949 if (!curr->sched_class->yield_to_task)
3950 goto out_unlock;
3951
3952 if (curr->sched_class != p->sched_class)
3953 goto out_unlock;
3954
3955 if (task_running(p_rq, p) || p->state)
3956 goto out_unlock;
3957
3958 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
3959 if (yielded) {
3960 schedstat_inc(rq, yld_count);
3961
3962
3963
3964
3965 if (preempt && rq != p_rq)
3966 resched_task(p_rq->curr);
3967 }
3968
3969out_unlock:
3970 double_rq_unlock(rq, p_rq);
3971out_irq:
3972 local_irq_restore(flags);
3973
3974 if (yielded > 0)
3975 schedule();
3976
3977 return yielded;
3978}
3979EXPORT_SYMBOL_GPL(yield_to);
3980
3981
3982
3983
3984
3985void __sched io_schedule(void)
3986{
3987 struct rq *rq = raw_rq();
3988
3989 delayacct_blkio_start();
3990 atomic_inc(&rq->nr_iowait);
3991 blk_flush_plug(current);
3992 current->in_iowait = 1;
3993 schedule();
3994 current->in_iowait = 0;
3995 atomic_dec(&rq->nr_iowait);
3996 delayacct_blkio_end();
3997}
3998EXPORT_SYMBOL(io_schedule);
3999
4000long __sched io_schedule_timeout(long timeout)
4001{
4002 struct rq *rq = raw_rq();
4003 long ret;
4004
4005 delayacct_blkio_start();
4006 atomic_inc(&rq->nr_iowait);
4007 blk_flush_plug(current);
4008 current->in_iowait = 1;
4009 ret = schedule_timeout(timeout);
4010 current->in_iowait = 0;
4011 atomic_dec(&rq->nr_iowait);
4012 delayacct_blkio_end();
4013 return ret;
4014}
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4025{
4026 int ret = -EINVAL;
4027
4028 switch (policy) {
4029 case SCHED_FIFO:
4030 case SCHED_RR:
4031 ret = MAX_USER_RT_PRIO-1;
4032 break;
4033 case SCHED_NORMAL:
4034 case SCHED_BATCH:
4035 case SCHED_IDLE:
4036 ret = 0;
4037 break;
4038 }
4039 return ret;
4040}
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4051{
4052 int ret = -EINVAL;
4053
4054 switch (policy) {
4055 case SCHED_FIFO:
4056 case SCHED_RR:
4057 ret = 1;
4058 break;
4059 case SCHED_NORMAL:
4060 case SCHED_BATCH:
4061 case SCHED_IDLE:
4062 ret = 0;
4063 }
4064 return ret;
4065}
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
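/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * This syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
 * an error code.
 */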
4078SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4079 struct timespec __user *, interval)
4080{
4081 struct task_struct *p;
4082 unsigned int time_slice;
4083 unsigned long flags;
4084 struct rq *rq;
4085 int retval;
4086 struct timespec t;
4087
4088 if (pid < 0)
4089 return -EINVAL;
4090
4091 retval = -ESRCH;
4092 rcu_read_lock();
4093 p = find_process_by_pid(pid);
4094 if (!p)
4095 goto out_unlock;
4096
4097 retval = security_task_getscheduler(p);
4098 if (retval)
4099 goto out_unlock;
4100
4101 rq = task_rq_lock(p, &flags);
4102 time_slice = p->sched_class->get_rr_interval(rq, p);
4103 task_rq_unlock(rq, p, &flags);
4104
4105 rcu_read_unlock();
4106 jiffies_to_timespec(time_slice, &t);
4107 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4108 return retval;
4109
4110out_unlock:
4111 rcu_read_unlock();
4112 return retval;
4113}
4114
4115static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4116
4117void sched_show_task(struct task_struct *p)
4118{
4119 unsigned long free = 0;
4120 int ppid;
4121 unsigned state;
4122
4123 state = p->state ? __ffs(p->state) + 1 : 0;
4124 printk(KERN_INFO "%-15.15s %c", p->comm,
4125 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4126#if BITS_PER_LONG == 32
4127 if (state == TASK_RUNNING)
4128 printk(KERN_CONT " running ");
4129 else
4130 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4131#else
4132 if (state == TASK_RUNNING)
4133 printk(KERN_CONT " running task ");
4134 else
4135 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4136#endif
4137#ifdef CONFIG_DEBUG_STACK_USAGE
4138 free = stack_not_used(p);
4139#endif
4140 rcu_read_lock();
4141 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4142 rcu_read_unlock();
4143 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4144 task_pid_nr(p), ppid,
4145 (unsigned long)task_thread_info(p)->flags);
4146
4147 print_worker_info(KERN_INFO, p);
4148 show_stack(p, NULL);
4149}
4150
4151void show_state_filter(unsigned long state_filter)
4152{
4153 struct task_struct *g, *p;
4154
4155#if BITS_PER_LONG == 32
4156 printk(KERN_INFO
4157 " task PC stack pid father\n");
4158#else
4159 printk(KERN_INFO
4160 " task PC stack pid father\n");
4161#endif
4162 rcu_read_lock();
4163 do_each_thread(g, p) {
4164
4165
4166
4167
4168 touch_nmi_watchdog();
4169 if (!state_filter || (p->state & state_filter))
4170 sched_show_task(p);
4171 } while_each_thread(g, p);
4172
4173 touch_all_softlockup_watchdogs();
4174
4175#ifdef CONFIG_SCHED_DEBUG
4176 sysrq_sched_debug_show();
4177#endif
4178 rcu_read_unlock();
4179
4180
4181
4182 if (!state_filter)
4183 debug_show_all_locks();
4184}
4185
4186void init_idle_bootup_task(struct task_struct *idle)
4187{
4188 idle->sched_class = &idle_sched_class;
4189}
4190
4191
4192
4193
4194
4195
4196
4197
4198
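/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */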
4199void init_idle(struct task_struct *idle, int cpu)
4200{
4201 struct rq *rq = cpu_rq(cpu);
4202 unsigned long flags;
4203
4204 raw_spin_lock_irqsave(&rq->lock, flags);
4205
4206 __sched_fork(idle);
4207 idle->state = TASK_RUNNING;
4208 idle->se.exec_start = sched_clock();
4209
4210 do_set_cpus_allowed(idle, cpumask_of(cpu));
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221 rcu_read_lock();
4222 __set_task_cpu(idle, cpu);
4223 rcu_read_unlock();
4224
4225 rq->curr = rq->idle = idle;
4226#if defined(CONFIG_SMP)
4227 idle->on_cpu = 1;
4228#endif
4229 raw_spin_unlock_irqrestore(&rq->lock, flags);
4230
4231
4232 task_thread_info(idle)->preempt_count = 0;
4233
4234
4235
4236
4237 idle->sched_class = &idle_sched_class;
4238 ftrace_graph_init_idle_task(idle, cpu);
4239 vtime_init_idle(idle, cpu);
4240#if defined(CONFIG_SMP)
4241 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4242#endif
4243}
4244
4245#ifdef CONFIG_SMP
4246void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4247{
4248 if (p->sched_class && p->sched_class->set_cpus_allowed)
4249 p->sched_class->set_cpus_allowed(p, new_mask);
4250
4251 cpumask_copy(&p->cpus_allowed, new_mask);
4252 p->nr_cpus_allowed = cpumask_weight(new_mask);
4253}
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
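/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) the stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU).
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) the stopper completes, stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task; the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 *
 * Illustrative (hypothetical) usage pinning a kernel thread to CPU 2;
 * 'my_task' is a placeholder:
 *
 *	set_cpus_allowed_ptr(my_task, cpumask_of(2));
 */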
4278int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4279{
4280 unsigned long flags;
4281 struct rq *rq;
4282 unsigned int dest_cpu;
4283 int ret = 0;
4284
4285 rq = task_rq_lock(p, &flags);
4286
4287 if (cpumask_equal(&p->cpus_allowed, new_mask))
4288 goto out;
4289
4290 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4291 ret = -EINVAL;
4292 goto out;
4293 }
4294
4295 do_set_cpus_allowed(p, new_mask);
4296
4297
4298 if (cpumask_test_cpu(task_cpu(p), new_mask))
4299 goto out;
4300
4301 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4302 if (p->on_rq) {
4303 struct migration_arg arg = { p, dest_cpu };
4304
4305 task_rq_unlock(rq, p, &flags);
4306 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4307 tlb_migrate_finish(p->mm);
4308 return 0;
4309 }
4310out:
4311 task_rq_unlock(rq, p, &flags);
4312
4313 return ret;
4314}
4315EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4329{
4330 struct rq *rq_dest, *rq_src;
4331 int ret = 0;
4332
4333 if (unlikely(!cpu_active(dest_cpu)))
4334 return ret;
4335
4336 rq_src = cpu_rq(src_cpu);
4337 rq_dest = cpu_rq(dest_cpu);
4338
4339 raw_spin_lock(&p->pi_lock);
4340 double_rq_lock(rq_src, rq_dest);
4341
4342 if (task_cpu(p) != src_cpu)
4343 goto done;
4344
4345 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4346 goto fail;
4347
4348
4349
4350
4351
4352 if (p->on_rq) {
4353 dequeue_task(rq_src, p, 0);
4354 set_task_cpu(p, dest_cpu);
4355 enqueue_task(rq_dest, p, 0);
4356 check_preempt_curr(rq_dest, p, 0);
4357 }
4358done:
4359 ret = 1;
4360fail:
4361 double_rq_unlock(rq_src, rq_dest);
4362 raw_spin_unlock(&p->pi_lock);
4363 return ret;
4364}
4365
4366
4367
4368
4369
4370
4371static int migration_cpu_stop(void *data)
4372{
4373 struct migration_arg *arg = data;
4374
4375
4376
4377
4378
4379 local_irq_disable();
4380 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4381 local_irq_enable();
4382 return 0;
4383}
4384
4385#ifdef CONFIG_HOTPLUG_CPU
4386
4387
4388
4389
4390
4391void idle_task_exit(void)
4392{
4393 struct mm_struct *mm = current->active_mm;
4394
4395 BUG_ON(cpu_online(smp_processor_id()));
4396
4397 if (mm != &init_mm)
4398 switch_mm(mm, &init_mm, current);
4399 mmdrop(mm);
4400}
4401
4402
4403
4404
4405
4406
4407
4408
4409static void calc_load_migrate(struct rq *rq)
4410{
4411 long delta = calc_load_fold_active(rq);
4412 if (delta)
4413 atomic_long_add(delta, &calc_load_tasks);
4414}
4415
4416
4417
4418
4419
4420
4421
4422
4423
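/*
 * Migrate all tasks from the rq; sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held. Even though we're in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
 */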
4424static void migrate_tasks(unsigned int dead_cpu)
4425{
4426 struct rq *rq = cpu_rq(dead_cpu);
4427 struct task_struct *next, *stop = rq->stop;
4428 int dest_cpu;
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439 rq->stop = NULL;
4440
4441
4442
4443
4444
4445
4446 update_rq_clock(rq);
4447
4448 for ( ; ; ) {
4449
4450
4451
4452
4453 if (rq->nr_running == 1)
4454 break;
4455
4456 next = pick_next_task(rq);
4457 BUG_ON(!next);
4458 next->sched_class->put_prev_task(rq, next);
4459
4460
4461 dest_cpu = select_fallback_rq(dead_cpu, next);
4462 raw_spin_unlock(&rq->lock);
4463
4464 __migrate_task(next, dead_cpu, dest_cpu);
4465
4466 raw_spin_lock(&rq->lock);
4467 }
4468
4469 rq->stop = stop;
4470}
4471
4472#endif
4473
4474#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4475
4476static struct ctl_table sd_ctl_dir[] = {
4477 {
4478 .procname = "sched_domain",
4479 .mode = 0555,
4480 },
4481 {}
4482};
4483
4484static struct ctl_table sd_ctl_root[] = {
4485 {
4486 .procname = "kernel",
4487 .mode = 0555,
4488 .child = sd_ctl_dir,
4489 },
4490 {}
4491};
4492
4493static struct ctl_table *sd_alloc_ctl_entry(int n)
4494{
4495 struct ctl_table *entry =
4496 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4497
4498 return entry;
4499}
4500
4501static void sd_free_ctl_entry(struct ctl_table **tablep)
4502{
4503 struct ctl_table *entry;
4504
4505
4506
4507
4508
4509
4510
4511 for (entry = *tablep; entry->mode; entry++) {
4512 if (entry->child)
4513 sd_free_ctl_entry(&entry->child);
4514 if (entry->proc_handler == NULL)
4515 kfree(entry->procname);
4516 }
4517
4518 kfree(*tablep);
4519 *tablep = NULL;
4520}
4521
4522static int min_load_idx = 0;
4523static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4524
4525static void
4526set_table_entry(struct ctl_table *entry,
4527 const char *procname, void *data, int maxlen,
4528 umode_t mode, proc_handler *proc_handler,
4529 bool load_idx)
4530{
4531 entry->procname = procname;
4532 entry->data = data;
4533 entry->maxlen = maxlen;
4534 entry->mode = mode;
4535 entry->proc_handler = proc_handler;
4536
4537 if (load_idx) {
4538 entry->extra1 = &min_load_idx;
4539 entry->extra2 = &max_load_idx;
4540 }
4541}
4542
4543static struct ctl_table *
4544sd_alloc_ctl_domain_table(struct sched_domain *sd)
4545{
4546 struct ctl_table *table = sd_alloc_ctl_entry(13);
4547
4548 if (table == NULL)
4549 return NULL;
4550
4551 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4552 sizeof(long), 0644, proc_doulongvec_minmax, false);
4553 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4554 sizeof(long), 0644, proc_doulongvec_minmax, false);
4555 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4556 sizeof(int), 0644, proc_dointvec_minmax, true);
4557 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4558 sizeof(int), 0644, proc_dointvec_minmax, true);
4559 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4560 sizeof(int), 0644, proc_dointvec_minmax, true);
4561 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4562 sizeof(int), 0644, proc_dointvec_minmax, true);
4563 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4564 sizeof(int), 0644, proc_dointvec_minmax, true);
4565 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4566 sizeof(int), 0644, proc_dointvec_minmax, false);
4567 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4568 sizeof(int), 0644, proc_dointvec_minmax, false);
4569 set_table_entry(&table[9], "cache_nice_tries",
4570 &sd->cache_nice_tries,
4571 sizeof(int), 0644, proc_dointvec_minmax, false);
4572 set_table_entry(&table[10], "flags", &sd->flags,
4573 sizeof(int), 0644, proc_dointvec_minmax, false);
4574 set_table_entry(&table[11], "name", sd->name,
4575 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4576
4577
4578 return table;
4579}
4580
4581static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4582{
4583 struct ctl_table *entry, *table;
4584 struct sched_domain *sd;
4585 int domain_num = 0, i;
4586 char buf[32];
4587
4588 for_each_domain(cpu, sd)
4589 domain_num++;
4590 entry = table = sd_alloc_ctl_entry(domain_num + 1);
4591 if (table == NULL)
4592 return NULL;
4593
4594 i = 0;
4595 for_each_domain(cpu, sd) {
4596 snprintf(buf, 32, "domain%d", i);
4597 entry->procname = kstrdup(buf, GFP_KERNEL);
4598 entry->mode = 0555;
4599 entry->child = sd_alloc_ctl_domain_table(sd);
4600 entry++;
4601 i++;
4602 }
4603 return table;
4604}
4605
4606static struct ctl_table_header *sd_sysctl_header;
4607static void register_sched_domain_sysctl(void)
4608{
4609 int i, cpu_num = num_possible_cpus();
4610 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4611 char buf[32];
4612
4613 WARN_ON(sd_ctl_dir[0].child);
4614 sd_ctl_dir[0].child = entry;
4615
4616 if (entry == NULL)
4617 return;
4618
4619 for_each_possible_cpu(i) {
4620 snprintf(buf, 32, "cpu%d", i);
4621 entry->procname = kstrdup(buf, GFP_KERNEL);
4622 entry->mode = 0555;
4623 entry->child = sd_alloc_ctl_cpu_table(i);
4624 entry++;
4625 }
4626
4627 WARN_ON(sd_sysctl_header);
4628 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4629}
4630
4631
4632static void unregister_sched_domain_sysctl(void)
4633{
4634 if (sd_sysctl_header)
4635 unregister_sysctl_table(sd_sysctl_header);
4636 sd_sysctl_header = NULL;
4637 if (sd_ctl_dir[0].child)
4638 sd_free_ctl_entry(&sd_ctl_dir[0].child);
4639}
4640#else
4641static void register_sched_domain_sysctl(void)
4642{
4643}
4644static void unregister_sched_domain_sysctl(void)
4645{
4646}
4647#endif
4648
4649static void set_rq_online(struct rq *rq)
4650{
4651 if (!rq->online) {
4652 const struct sched_class *class;
4653
4654 cpumask_set_cpu(rq->cpu, rq->rd->online);
4655 rq->online = 1;
4656
4657 for_each_class(class) {
4658 if (class->rq_online)
4659 class->rq_online(rq);
4660 }
4661 }
4662}
4663
4664static void set_rq_offline(struct rq *rq)
4665{
4666 if (rq->online) {
4667 const struct sched_class *class;
4668
4669 for_each_class(class) {
4670 if (class->rq_offline)
4671 class->rq_offline(rq);
4672 }
4673
4674 cpumask_clear_cpu(rq->cpu, rq->rd->online);
4675 rq->online = 0;
4676 }
4677}
4678
4679
4680
4681
4682
4683static int
4684migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4685{
4686 int cpu = (long)hcpu;
4687 unsigned long flags;
4688 struct rq *rq = cpu_rq(cpu);
4689
4690 switch (action & ~CPU_TASKS_FROZEN) {
4691
4692 case CPU_UP_PREPARE:
4693 rq->calc_load_update = calc_load_update;
4694 break;
4695
4696 case CPU_ONLINE:
4697
4698 raw_spin_lock_irqsave(&rq->lock, flags);
4699 if (rq->rd) {
4700 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
4701
4702 set_rq_online(rq);
4703 }
4704 raw_spin_unlock_irqrestore(&rq->lock, flags);
4705 break;
4706
4707#ifdef CONFIG_HOTPLUG_CPU
4708 case CPU_DYING:
4709 sched_ttwu_pending();
4710
4711 raw_spin_lock_irqsave(&rq->lock, flags);
4712 if (rq->rd) {
4713 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
4714 set_rq_offline(rq);
4715 }
4716 migrate_tasks(cpu);
4717 BUG_ON(rq->nr_running != 1);
4718 raw_spin_unlock_irqrestore(&rq->lock, flags);
4719 break;
4720
4721 case CPU_DEAD:
4722 calc_load_migrate(rq);
4723 break;
4724#endif
4725 }
4726
4727 update_max_interval();
4728
4729 return NOTIFY_OK;
4730}
4731
4732
4733
4734
4735
4736
4737static struct notifier_block migration_notifier = {
4738 .notifier_call = migration_call,
4739 .priority = CPU_PRI_MIGRATION,
4740};
4741
4742static int sched_cpu_active(struct notifier_block *nfb,
4743 unsigned long action, void *hcpu)
4744{
4745 switch (action & ~CPU_TASKS_FROZEN) {
4746 case CPU_STARTING:
4747 case CPU_DOWN_FAILED:
4748 set_cpu_active((long)hcpu, true);
4749 return NOTIFY_OK;
4750 default:
4751 return NOTIFY_DONE;
4752 }
4753}
4754
4755static int sched_cpu_inactive(struct notifier_block *nfb,
4756 unsigned long action, void *hcpu)
4757{
4758 switch (action & ~CPU_TASKS_FROZEN) {
4759 case CPU_DOWN_PREPARE:
4760 set_cpu_active((long)hcpu, false);
4761 return NOTIFY_OK;
4762 default:
4763 return NOTIFY_DONE;
4764 }
4765}
4766
4767static int __init migration_init(void)
4768{
4769 void *cpu = (void *)(long)smp_processor_id();
4770 int err;
4771
4772
4773 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4774 BUG_ON(err == NOTIFY_BAD);
4775 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4776 register_cpu_notifier(&migration_notifier);
4777
4778
4779 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
4780 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
4781
4782 return 0;
4783}
4784early_initcall(migration_init);
4785#endif
4786
4787#ifdef CONFIG_SMP
4788
4789static cpumask_var_t sched_domains_tmpmask;
4790
4791#ifdef CONFIG_SCHED_DEBUG
4792
4793static __read_mostly int sched_debug_enabled;
4794
4795static int __init sched_debug_setup(char *str)
4796{
4797 sched_debug_enabled = 1;
4798
4799 return 0;
4800}
4801early_param("sched_debug", sched_debug_setup);
4802
4803static inline bool sched_debug(void)
4804{
4805 return sched_debug_enabled;
4806}
4807
4808static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
4809 struct cpumask *groupmask)
4810{
4811 struct sched_group *group = sd->groups;
4812 char str[256];
4813
4814 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
4815 cpumask_clear(groupmask);
4816
4817 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
4818
4819 if (!(sd->flags & SD_LOAD_BALANCE)) {
4820 printk("does not load-balance\n");
4821 if (sd->parent)
4822 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
4823 " has parent");
4824 return -1;
4825 }
4826
4827 printk(KERN_CONT "span %s level %s\n", str, sd->name);
4828
4829 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
4830 printk(KERN_ERR "ERROR: domain->span does not contain "
4831 "CPU%d\n", cpu);
4832 }
4833 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
4834 printk(KERN_ERR "ERROR: domain->groups does not contain"
4835 " CPU%d\n", cpu);
4836 }
4837
4838 printk(KERN_DEBUG "%*s groups:", level + 1, "");
4839 do {
4840 if (!group) {
4841 printk("\n");
4842 printk(KERN_ERR "ERROR: group is NULL\n");
4843 break;
4844 }
4845
4846
4847
4848
4849
4850
4851 if (!group->sgp->power_orig) {
4852 printk(KERN_CONT "\n");
4853 printk(KERN_ERR "ERROR: domain->cpu_power not "
4854 "set\n");
4855 break;
4856 }
4857
4858 if (!cpumask_weight(sched_group_cpus(group))) {
4859 printk(KERN_CONT "\n");
4860 printk(KERN_ERR "ERROR: empty group\n");
4861 break;
4862 }
4863
4864 if (!(sd->flags & SD_OVERLAP) &&
4865 cpumask_intersects(groupmask, sched_group_cpus(group))) {
4866 printk(KERN_CONT "\n");
4867 printk(KERN_ERR "ERROR: repeated CPUs\n");
4868 break;
4869 }
4870
4871 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
4872
4873 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
4874
4875 printk(KERN_CONT " %s", str);
4876 if (group->sgp->power != SCHED_POWER_SCALE) {
4877 printk(KERN_CONT " (cpu_power = %d)",
4878 group->sgp->power);
4879 }
4880
4881 group = group->next;
4882 } while (group != sd->groups);
4883 printk(KERN_CONT "\n");
4884
4885 if (!cpumask_equal(sched_domain_span(sd), groupmask))
4886 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4887
4888 if (sd->parent &&
4889 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
4890 printk(KERN_ERR "ERROR: parent span is not a superset "
4891 "of domain->span\n");
4892 return 0;
4893}
4894
4895static void sched_domain_debug(struct sched_domain *sd, int cpu)
4896{
4897 int level = 0;
4898
4899 if (!sched_debug_enabled)
4900 return;
4901
4902 if (!sd) {
4903 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4904 return;
4905 }
4906
4907 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4908
4909 for (;;) {
4910 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
4911 break;
4912 level++;
4913 sd = sd->parent;
4914 if (!sd)
4915 break;
4916 }
4917}
4918#else
4919# define sched_domain_debug(sd, cpu) do { } while (0)
4920static inline bool sched_debug(void)
4921{
4922 return false;
4923}
4924#endif
4925
4926static int sd_degenerate(struct sched_domain *sd)
4927{
4928 if (cpumask_weight(sched_domain_span(sd)) == 1)
4929 return 1;
4930
4931
4932 if (sd->flags & (SD_LOAD_BALANCE |
4933 SD_BALANCE_NEWIDLE |
4934 SD_BALANCE_FORK |
4935 SD_BALANCE_EXEC |
4936 SD_SHARE_CPUPOWER |
4937 SD_SHARE_PKG_RESOURCES)) {
4938 if (sd->groups != sd->groups->next)
4939 return 0;
4940 }
4941
4942
4943 if (sd->flags & (SD_WAKE_AFFINE))
4944 return 0;
4945
4946 return 1;
4947}
4948
4949static int
4950sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4951{
4952 unsigned long cflags = sd->flags, pflags = parent->flags;
4953
4954 if (sd_degenerate(parent))
4955 return 1;
4956
4957 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
4958 return 0;
4959
4960
4961 if (parent->groups == parent->groups->next) {
4962 pflags &= ~(SD_LOAD_BALANCE |
4963 SD_BALANCE_NEWIDLE |
4964 SD_BALANCE_FORK |
4965 SD_BALANCE_EXEC |
4966 SD_SHARE_CPUPOWER |
4967 SD_SHARE_PKG_RESOURCES);
4968 if (nr_node_ids == 1)
4969 pflags &= ~SD_SERIALIZE;
4970 }
4971 if (~cflags & pflags)
4972 return 0;
4973
4974 return 1;
4975}
4976
4977static void free_rootdomain(struct rcu_head *rcu)
4978{
4979 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4980
4981 cpupri_cleanup(&rd->cpupri);
4982 free_cpumask_var(rd->rto_mask);
4983 free_cpumask_var(rd->online);
4984 free_cpumask_var(rd->span);
4985 kfree(rd);
4986}
4987
4988static void rq_attach_root(struct rq *rq, struct root_domain *rd)
4989{
4990 struct root_domain *old_rd = NULL;
4991 unsigned long flags;
4992
4993 raw_spin_lock_irqsave(&rq->lock, flags);
4994
4995 if (rq->rd) {
4996 old_rd = rq->rd;
4997
4998 if (cpumask_test_cpu(rq->cpu, old_rd->online))
4999 set_rq_offline(rq);
5000
5001 cpumask_clear_cpu(rq->cpu, old_rd->span);
5002
5003
5004
5005
5006
5007
5008 if (!atomic_dec_and_test(&old_rd->refcount))
5009 old_rd = NULL;
5010 }
5011
5012 atomic_inc(&rd->refcount);
5013 rq->rd = rd;
5014
5015 cpumask_set_cpu(rq->cpu, rd->span);
5016 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5017 set_rq_online(rq);
5018
5019 raw_spin_unlock_irqrestore(&rq->lock, flags);
5020
5021 if (old_rd)
5022 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5023}
5024
5025static int init_rootdomain(struct root_domain *rd)
5026{
5027 memset(rd, 0, sizeof(*rd));
5028
5029 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5030 goto out;
5031 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5032 goto free_span;
5033 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5034 goto free_online;
5035
5036 if (cpupri_init(&rd->cpupri) != 0)
5037 goto free_rto_mask;
5038 return 0;
5039
5040free_rto_mask:
5041 free_cpumask_var(rd->rto_mask);
5042free_online:
5043 free_cpumask_var(rd->online);
5044free_span:
5045 free_cpumask_var(rd->span);
5046out:
5047 return -ENOMEM;
5048}
5049
5050
5051
5052
5053
5054struct root_domain def_root_domain;
5055
5056static void init_defrootdomain(void)
5057{
5058 init_rootdomain(&def_root_domain);
5059
5060 atomic_set(&def_root_domain.refcount, 1);
5061}
5062
5063static struct root_domain *alloc_rootdomain(void)
5064{
5065 struct root_domain *rd;
5066
5067 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5068 if (!rd)
5069 return NULL;
5070
5071 if (init_rootdomain(rd) != 0) {
5072 kfree(rd);
5073 return NULL;
5074 }
5075
5076 return rd;
5077}
5078
5079static void free_sched_groups(struct sched_group *sg, int free_sgp)
5080{
5081 struct sched_group *tmp, *first;
5082
5083 if (!sg)
5084 return;
5085
5086 first = sg;
5087 do {
5088 tmp = sg->next;
5089
5090 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5091 kfree(sg->sgp);
5092
5093 kfree(sg);
5094 sg = tmp;
5095 } while (sg != first);
5096}
5097
5098static void free_sched_domain(struct rcu_head *rcu)
5099{
5100 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5101
5102
5103
5104
5105
5106 if (sd->flags & SD_OVERLAP) {
5107 free_sched_groups(sd->groups, 1);
5108 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5109 kfree(sd->groups->sgp);
5110 kfree(sd->groups);
5111 }
5112 kfree(sd);
5113}
5114
5115static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5116{
5117 call_rcu(&sd->rcu, free_sched_domain);
5118}
5119
5120static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5121{
5122 for (; sd; sd = sd->parent)
5123 destroy_sched_domain(sd, cpu);
5124}
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5136DEFINE_PER_CPU(int, sd_llc_id);
5137
5138static void update_top_cache_domain(int cpu)
5139{
5140 struct sched_domain *sd;
5141 int id = cpu;
5142
5143 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5144 if (sd)
5145 id = cpumask_first(sched_domain_span(sd));
5146
5147 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5148 per_cpu(sd_llc_id, cpu) = id;
5149}
5150
5151
5152
5153
5154
5155static void
5156cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5157{
5158 struct rq *rq = cpu_rq(cpu);
5159 struct sched_domain *tmp;
5160
5161
5162 for (tmp = sd; tmp; ) {
5163 struct sched_domain *parent = tmp->parent;
5164 if (!parent)
5165 break;
5166
5167 if (sd_parent_degenerate(tmp, parent)) {
5168 tmp->parent = parent->parent;
5169 if (parent->parent)
5170 parent->parent->child = tmp;
5171 destroy_sched_domain(parent, cpu);
5172 } else
5173 tmp = tmp->parent;
5174 }
5175
5176 if (sd && sd_degenerate(sd)) {
5177 tmp = sd;
5178 sd = sd->parent;
5179 destroy_sched_domain(tmp, cpu);
5180 if (sd)
5181 sd->child = NULL;
5182 }
5183
5184 sched_domain_debug(sd, cpu);
5185
5186 rq_attach_root(rq, rd);
5187 tmp = rq->sd;
5188 rcu_assign_pointer(rq->sd, sd);
5189 destroy_sched_domains(tmp, cpu);
5190
5191 update_top_cache_domain(cpu);
5192}
5193
5194
5195static cpumask_var_t cpu_isolated_map;
5196
5197
5198static int __init isolated_cpu_setup(char *str)
5199{
5200 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5201 cpulist_parse(str, cpu_isolated_map);
5202 return 1;
5203}
5204
5205__setup("isolcpus=", isolated_cpu_setup);
5206
5207static const struct cpumask *cpu_cpu_mask(int cpu)
5208{
5209 return cpumask_of_node(cpu_to_node(cpu));
5210}
5211
5212struct sd_data {
5213 struct sched_domain **__percpu sd;
5214 struct sched_group **__percpu sg;
5215 struct sched_group_power **__percpu sgp;
5216};
5217
5218struct s_data {
5219 struct sched_domain ** __percpu sd;
5220 struct root_domain *rd;
5221};
5222
5223enum s_alloc {
5224 sa_rootdomain,
5225 sa_sd,
5226 sa_sd_storage,
5227 sa_none,
5228};
5229
5230struct sched_domain_topology_level;
5231
5232typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5233typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5234
5235#define SDTL_OVERLAP 0x01
5236
5237struct sched_domain_topology_level {
5238 sched_domain_init_f init;
5239 sched_domain_mask_f mask;
5240 int flags;
5241 int numa_level;
5242 struct sd_data data;
5243};
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
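/*
 * Build an iteration mask that can exclude certain CPUs from the upwards
 * domain traversal.
 *
 * Asymmetric node setups can result in situations where the domain tree is
 * of unequal depth; make sure to skip domains that already cover the entire
 * range.
 *
 * In that case build_sched_domains() will have terminated the iteration
 * early and our sibling sd spans will be empty. Domains should always
 * include the CPU they're built on, so check that.
 */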
5258static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5259{
5260 const struct cpumask *span = sched_domain_span(sd);
5261 struct sd_data *sdd = sd->private;
5262 struct sched_domain *sibling;
5263 int i;
5264
5265 for_each_cpu(i, span) {
5266 sibling = *per_cpu_ptr(sdd->sd, i);
5267 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5268 continue;
5269
5270 cpumask_set_cpu(i, sched_group_mask(sg));
5271 }
5272}
5273
5274
5275
5276
5277
5278int group_balance_cpu(struct sched_group *sg)
5279{
5280 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5281}
5282
5283static int
5284build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5285{
5286 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5287 const struct cpumask *span = sched_domain_span(sd);
5288 struct cpumask *covered = sched_domains_tmpmask;
5289 struct sd_data *sdd = sd->private;
5290 struct sched_domain *child;
5291 int i;
5292
5293 cpumask_clear(covered);
5294
5295 for_each_cpu(i, span) {
5296 struct cpumask *sg_span;
5297
5298 if (cpumask_test_cpu(i, covered))
5299 continue;
5300
5301 child = *per_cpu_ptr(sdd->sd, i);
5302
5303
5304 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5305 continue;
5306
5307 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5308 GFP_KERNEL, cpu_to_node(cpu));
5309
5310 if (!sg)
5311 goto fail;
5312
5313 sg_span = sched_group_cpus(sg);
5314 if (child->child) {
5315 child = child->child;
5316 cpumask_copy(sg_span, sched_domain_span(child));
5317 } else
5318 cpumask_set_cpu(i, sg_span);
5319
5320 cpumask_or(covered, covered, sg_span);
5321
5322 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5323 if (atomic_inc_return(&sg->sgp->ref) == 1)
5324 build_group_mask(sd, sg);
5325
5326
5327
5328
5329
5330
5331 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5332
5333
5334
5335
5336
5337
5338 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5339 group_balance_cpu(sg) == cpu)
5340 groups = sg;
5341
5342 if (!first)
5343 first = sg;
5344 if (last)
5345 last->next = sg;
5346 last = sg;
5347 last->next = first;
5348 }
5349 sd->groups = groups;
5350
5351 return 0;
5352
5353fail:
5354 free_sched_groups(first, 0);
5355
5356 return -ENOMEM;
5357}
5358
5359static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5360{
5361 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5362 struct sched_domain *child = sd->child;
5363
5364 if (child)
5365 cpu = cpumask_first(sched_domain_span(child));
5366
5367 if (sg) {
5368 *sg = *per_cpu_ptr(sdd->sg, cpu);
5369 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5370 atomic_set(&(*sg)->sgp->ref, 1);
5371 }
5372
5373 return cpu;
5374}
5375
5376
5377
5378
5379
5380
5381
5382
5383static int
5384build_sched_groups(struct sched_domain *sd, int cpu)
5385{
5386 struct sched_group *first = NULL, *last = NULL;
5387 struct sd_data *sdd = sd->private;
5388 const struct cpumask *span = sched_domain_span(sd);
5389 struct cpumask *covered;
5390 int i;
5391
5392 get_group(cpu, sdd, &sd->groups);
5393 atomic_inc(&sd->groups->ref);
5394
5395 if (cpu != cpumask_first(span))
5396 return 0;
5397
5398 lockdep_assert_held(&sched_domains_mutex);
5399 covered = sched_domains_tmpmask;
5400
5401 cpumask_clear(covered);
5402
5403 for_each_cpu(i, span) {
5404 struct sched_group *sg;
5405 int group, j;
5406
5407 if (cpumask_test_cpu(i, covered))
5408 continue;
5409
5410 group = get_group(i, sdd, &sg);
5411 cpumask_clear(sched_group_cpus(sg));
5412 sg->sgp->power = 0;
5413 cpumask_setall(sched_group_mask(sg));
5414
5415 for_each_cpu(j, span) {
5416 if (get_group(j, sdd, NULL) != group)
5417 continue;
5418
5419 cpumask_set_cpu(j, covered);
5420 cpumask_set_cpu(j, sched_group_cpus(sg));
5421 }
5422
5423 if (!first)
5424 first = sg;
5425 if (last)
5426 last->next = sg;
5427 last = sg;
5428 }
5429 last->next = first;
5430
5431 return 0;
5432}
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
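/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be the same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_power will pick up more load than the group
 * having less cpu_power.
 */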
5444static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5445{
5446 struct sched_group *sg = sd->groups;
5447
5448 WARN_ON(!sg);
5449
5450 do {
5451 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5452 sg = sg->next;
5453 } while (sg != sd->groups);
5454
5455 if (cpu != group_balance_cpu(sg))
5456 return;
5457
5458 update_group_power(sd, cpu);
5459 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5460}
5461
5462int __weak arch_sd_sibling_asym_packing(void)
5463{
5464 return 0*SD_ASYM_PACKING;
5465}
5466
5467
5468
5469
5470
5471
5472#ifdef CONFIG_SCHED_DEBUG
5473# define SD_INIT_NAME(sd, type) sd->name = #type
5474#else
5475# define SD_INIT_NAME(sd, type) do { } while (0)
5476#endif
5477
5478#define SD_INIT_FUNC(type) \
5479static noinline struct sched_domain * \
5480sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5481{ \
5482 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5483 *sd = SD_##type##_INIT; \
5484 SD_INIT_NAME(sd, type); \
5485 sd->private = &tl->data; \
5486 return sd; \
5487}
5488
5489SD_INIT_FUNC(CPU)
5490#ifdef CONFIG_SCHED_SMT
5491 SD_INIT_FUNC(SIBLING)
5492#endif
5493#ifdef CONFIG_SCHED_MC
5494 SD_INIT_FUNC(MC)
5495#endif
5496#ifdef CONFIG_SCHED_BOOK
5497 SD_INIT_FUNC(BOOK)
5498#endif
5499
5500static int default_relax_domain_level = -1;
5501int sched_domain_level_max;
5502
5503static int __init setup_relax_domain_level(char *str)
5504{
5505 if (kstrtoint(str, 0, &default_relax_domain_level))
5506 pr_warn("Unable to set relax_domain_level\n");
5507
5508 return 1;
5509}
5510__setup("relax_domain_level=", setup_relax_domain_level);
5511
5512static void set_domain_attribute(struct sched_domain *sd,
5513 struct sched_domain_attr *attr)
5514{
5515 int request;
5516
5517 if (!attr || attr->relax_domain_level < 0) {
5518 if (default_relax_domain_level < 0)
5519 return;
5520 else
5521 request = default_relax_domain_level;
5522 } else
5523 request = attr->relax_domain_level;
5524 if (request < sd->level) {
5525
5526 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5527 } else {
5528
5529 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5530 }
5531}
5532
5533static void __sdt_free(const struct cpumask *cpu_map);
5534static int __sdt_alloc(const struct cpumask *cpu_map);
5535
5536static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5537 const struct cpumask *cpu_map)
5538{
5539 switch (what) {
5540 case sa_rootdomain:
5541 if (!atomic_read(&d->rd->refcount))
5542 free_rootdomain(&d->rd->rcu);
5543 case sa_sd:
5544 free_percpu(d->sd);
5545 case sa_sd_storage:
5546 __sdt_free(cpu_map);
5547 case sa_none:
5548 break;
5549 }
5550}
5551
5552static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5553 const struct cpumask *cpu_map)
5554{
5555 memset(d, 0, sizeof(*d));
5556
5557 if (__sdt_alloc(cpu_map))
5558 return sa_sd_storage;
5559 d->sd = alloc_percpu(struct sched_domain *);
5560 if (!d->sd)
5561 return sa_sd_storage;
5562 d->rd = alloc_rootdomain();
5563 if (!d->rd)
5564 return sa_sd;
5565 return sa_rootdomain;
5566}
5567
5568
5569
5570
5571
5572
5573static void claim_allocations(int cpu, struct sched_domain *sd)
5574{
5575 struct sd_data *sdd = sd->private;
5576
5577 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5578 *per_cpu_ptr(sdd->sd, cpu) = NULL;
5579
5580 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5581 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5582
5583 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5584 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5585}
5586
5587#ifdef CONFIG_SCHED_SMT
5588static const struct cpumask *cpu_smt_mask(int cpu)
5589{
5590 return topology_thread_cpumask(cpu);
5591}
5592#endif
5593
5594
5595
5596
5597static struct sched_domain_topology_level default_topology[] = {
5598#ifdef CONFIG_SCHED_SMT
5599 { sd_init_SIBLING, cpu_smt_mask, },
5600#endif
5601#ifdef CONFIG_SCHED_MC
5602 { sd_init_MC, cpu_coregroup_mask, },
5603#endif
5604#ifdef CONFIG_SCHED_BOOK
5605 { sd_init_BOOK, cpu_book_mask, },
5606#endif
5607 { sd_init_CPU, cpu_cpu_mask, },
5608 { NULL, },
5609};
5610
5611static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5612
5613#define for_each_sd_topology(tl) \
5614 for (tl = sched_domain_topology; tl->init; tl++)
5615
5616#ifdef CONFIG_NUMA
5617
5618static int sched_domains_numa_levels;
5619static int *sched_domains_numa_distance;
5620static struct cpumask ***sched_domains_numa_masks;
5621static int sched_domains_curr_level;
5622
5623static inline int sd_local_flags(int level)
5624{
5625 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5626 return 0;
5627
5628 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5629}
5630
5631static struct sched_domain *
5632sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5633{
5634 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5635 int level = tl->numa_level;
5636 int sd_weight = cpumask_weight(
5637 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
5638
5639 *sd = (struct sched_domain){
5640 .min_interval = sd_weight,
5641 .max_interval = 2*sd_weight,
5642 .busy_factor = 32,
5643 .imbalance_pct = 125,
5644 .cache_nice_tries = 2,
5645 .busy_idx = 3,
5646 .idle_idx = 2,
5647 .newidle_idx = 0,
5648 .wake_idx = 0,
5649 .forkexec_idx = 0,
5650
5651 .flags = 1*SD_LOAD_BALANCE
5652 | 1*SD_BALANCE_NEWIDLE
5653 | 0*SD_BALANCE_EXEC
5654 | 0*SD_BALANCE_FORK
5655 | 0*SD_BALANCE_WAKE
5656 | 0*SD_WAKE_AFFINE
5657 | 0*SD_SHARE_CPUPOWER
5658 | 0*SD_SHARE_PKG_RESOURCES
5659 | 1*SD_SERIALIZE
5660 | 0*SD_PREFER_SIBLING
5661 | sd_local_flags(level)
5662 ,
5663 .last_balance = jiffies,
5664 .balance_interval = sd_weight,
5665 };
5666 SD_INIT_NAME(sd, NUMA);
5667 sd->private = &tl->data;
5668
5669
5670
5671
5672 sched_domains_curr_level = tl->numa_level;
5673
5674 return sd;
5675}
5676
5677static const struct cpumask *sd_numa_mask(int cpu)
5678{
5679 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
5680}
5681
5682static void sched_numa_warn(const char *str)
5683{
5684 static int done = false;
5685 int i,j;
5686
5687 if (done)
5688 return;
5689
5690 done = true;
5691
5692 printk(KERN_WARNING "ERROR: %s\n\n", str);
5693
5694 for (i = 0; i < nr_node_ids; i++) {
5695 printk(KERN_WARNING " ");
5696 for (j = 0; j < nr_node_ids; j++)
5697 printk(KERN_CONT "%02d ", node_distance(i,j));
5698 printk(KERN_CONT "\n");
5699 }
5700 printk(KERN_WARNING "\n");
5701}
5702
5703static bool find_numa_distance(int distance)
5704{
5705 int i;
5706
5707 if (distance == node_distance(0, 0))
5708 return true;
5709
5710 for (i = 0; i < sched_domains_numa_levels; i++) {
5711 if (sched_domains_numa_distance[i] == distance)
5712 return true;
5713 }
5714
5715 return false;
5716}
5717
5718static void sched_init_numa(void)
5719{
5720 int next_distance, curr_distance = node_distance(0, 0);
5721 struct sched_domain_topology_level *tl;
5722 int level = 0;
5723 int i, j, k;
5724
5725 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
5726 if (!sched_domains_numa_distance)
5727 return;
5728
5729
5730
5731
5732
5733
5734
5735
5736 next_distance = curr_distance;
5737 for (i = 0; i < nr_node_ids; i++) {
5738 for (j = 0; j < nr_node_ids; j++) {
5739 for (k = 0; k < nr_node_ids; k++) {
5740 int distance = node_distance(i, k);
5741
5742 if (distance > curr_distance &&
5743 (distance < next_distance ||
5744 next_distance == curr_distance))
5745 next_distance = distance;
5746
5747
5748
5749
5750
5751
5752 if (sched_debug() && node_distance(k, i) != distance)
5753 sched_numa_warn("Node-distance not symmetric");
5754
5755 if (sched_debug() && i && !find_numa_distance(distance))
5756 sched_numa_warn("Node-0 not representative");
5757 }
5758 if (next_distance != curr_distance) {
5759 sched_domains_numa_distance[level++] = next_distance;
5760 sched_domains_numa_levels = level;
5761 curr_distance = next_distance;
5762 } else break;
5763 }
5764
5765
5766
5767
5768 if (!sched_debug())
5769 break;
5770 }
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788 sched_domains_numa_levels = 0;
5789
5790 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
5791 if (!sched_domains_numa_masks)
5792 return;
5793
5794
5795
5796
5797
5798 for (i = 0; i < level; i++) {
5799 sched_domains_numa_masks[i] =
5800 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
5801 if (!sched_domains_numa_masks[i])
5802 return;
5803
5804 for (j = 0; j < nr_node_ids; j++) {
5805 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
5806 if (!mask)
5807 return;
5808
5809 sched_domains_numa_masks[i][j] = mask;
5810
5811 for (k = 0; k < nr_node_ids; k++) {
5812 if (node_distance(j, k) > sched_domains_numa_distance[i])
5813 continue;
5814
5815 cpumask_or(mask, mask, cpumask_of_node(k));
5816 }
5817 }
5818 }
5819
5820 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
5821 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
5822 if (!tl)
5823 return;
5824
	/*
	 * Copy the default topology bits..
	 */
5828 for (i = 0; default_topology[i].init; i++)
5829 tl[i] = default_topology[i];
5830
	/*
	 * .. and append 'level' NUMA topology levels.
	 */
5834 for (j = 0; j < level; i++, j++) {
5835 tl[i] = (struct sched_domain_topology_level){
5836 .init = sd_numa_init,
5837 .mask = sd_numa_mask,
5838 .flags = SDTL_OVERLAP,
5839 .numa_level = j,
5840 };
5841 }
5842
5843 sched_domain_topology = tl;
5844
5845 sched_domains_numa_levels = level;
5846}
5847
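/*
 * Add a newly onlined CPU to every per-node mask [level][node] whose node
 * lies within that level's distance of the CPU's node.
 */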
5848static void sched_domains_numa_masks_set(int cpu)
5849{
5850 int i, j;
5851 int node = cpu_to_node(cpu);
5852
5853 for (i = 0; i < sched_domains_numa_levels; i++) {
5854 for (j = 0; j < nr_node_ids; j++) {
5855 if (node_distance(j, node) <= sched_domains_numa_distance[i])
5856 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
5857 }
5858 }
5859}
5860
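/*
 * Remove an offlined CPU from all of the NUMA-level masks.
 */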
5861static void sched_domains_numa_masks_clear(int cpu)
5862{
5863 int i, j;
5864 for (i = 0; i < sched_domains_numa_levels; i++) {
5865 for (j = 0; j < nr_node_ids; j++)
5866 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
5867 }
5868}
5869
/*
 * Update sched_domains_numa_masks[][] as CPUs come online or go offline.
 */
5874static int sched_domains_numa_masks_update(struct notifier_block *nfb,
5875 unsigned long action,
5876 void *hcpu)
5877{
5878 int cpu = (long)hcpu;
5879
5880 switch (action & ~CPU_TASKS_FROZEN) {
5881 case CPU_ONLINE:
5882 sched_domains_numa_masks_set(cpu);
5883 break;
5884
5885 case CPU_DEAD:
5886 sched_domains_numa_masks_clear(cpu);
5887 break;
5888
5889 default:
5890 return NOTIFY_DONE;
5891 }
5892
5893 return NOTIFY_OK;
5894}
5895#else
5896static inline void sched_init_numa(void)
5897{
5898}
5899
5900static int sched_domains_numa_masks_update(struct notifier_block *nfb,
5901 unsigned long action,
5902 void *hcpu)
5903{
5904 return 0;
5905}
5906#endif
5907
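/*
 * Allocate the per-CPU sched_domain, sched_group and sched_group_power
 * storage for every topology level, for each CPU in cpu_map.
 */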
5908static int __sdt_alloc(const struct cpumask *cpu_map)
5909{
5910 struct sched_domain_topology_level *tl;
5911 int j;
5912
5913 for_each_sd_topology(tl) {
5914 struct sd_data *sdd = &tl->data;
5915
5916 sdd->sd = alloc_percpu(struct sched_domain *);
5917 if (!sdd->sd)
5918 return -ENOMEM;
5919
5920 sdd->sg = alloc_percpu(struct sched_group *);
5921 if (!sdd->sg)
5922 return -ENOMEM;
5923
5924 sdd->sgp = alloc_percpu(struct sched_group_power *);
5925 if (!sdd->sgp)
5926 return -ENOMEM;
5927
5928 for_each_cpu(j, cpu_map) {
5929 struct sched_domain *sd;
5930 struct sched_group *sg;
5931 struct sched_group_power *sgp;
5932
5933 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
5934 GFP_KERNEL, cpu_to_node(j));
5935 if (!sd)
5936 return -ENOMEM;
5937
5938 *per_cpu_ptr(sdd->sd, j) = sd;
5939
5940 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5941 GFP_KERNEL, cpu_to_node(j));
5942 if (!sg)
5943 return -ENOMEM;
5944
5945 sg->next = sg;
5946
5947 *per_cpu_ptr(sdd->sg, j) = sg;
5948
5949 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
5950 GFP_KERNEL, cpu_to_node(j));
5951 if (!sgp)
5952 return -ENOMEM;
5953
5954 *per_cpu_ptr(sdd->sgp, j) = sgp;
5955 }
5956 }
5957
5958 return 0;
5959}
5960
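/*
 * Tear down everything set up by __sdt_alloc(): the per-CPU domains, groups
 * and group-power structures for every topology level, plus the percpu
 * arrays themselves.
 */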
5961static void __sdt_free(const struct cpumask *cpu_map)
5962{
5963 struct sched_domain_topology_level *tl;
5964 int j;
5965
5966 for_each_sd_topology(tl) {
5967 struct sd_data *sdd = &tl->data;
5968
5969 for_each_cpu(j, cpu_map) {
5970 struct sched_domain *sd;
5971
5972 if (sdd->sd) {
5973 sd = *per_cpu_ptr(sdd->sd, j);
5974 if (sd && (sd->flags & SD_OVERLAP))
5975 free_sched_groups(sd->groups, 0);
5976 kfree(*per_cpu_ptr(sdd->sd, j));
5977 }
5978
5979 if (sdd->sg)
5980 kfree(*per_cpu_ptr(sdd->sg, j));
5981 if (sdd->sgp)
5982 kfree(*per_cpu_ptr(sdd->sgp, j));
5983 }
5984 free_percpu(sdd->sd);
5985 sdd->sd = NULL;
5986 free_percpu(sdd->sg);
5987 sdd->sg = NULL;
5988 free_percpu(sdd->sgp);
5989 sdd->sgp = NULL;
5990 }
5991}
5992
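/*
 * Instantiate a single sched_domain for topology level 'tl' on CPU 'cpu',
 * restrict its span to cpu_map, and link it above 'child' in the domain
 * hierarchy.
 */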
5993struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
5994 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
5995 struct sched_domain *child, int cpu)
5996{
5997 struct sched_domain *sd = tl->init(tl, cpu);
5998 if (!sd)
5999 return child;
6000
6001 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6002 if (child) {
6003 sd->level = child->level + 1;
6004 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6005 child->parent = sd;
6006 sd->child = child;
6007 }
6008 set_domain_attribute(sd, attr);
6009
6010 return sd;
6011}
6012
/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs.
 */
6017static int build_sched_domains(const struct cpumask *cpu_map,
6018 struct sched_domain_attr *attr)
6019{
6020 enum s_alloc alloc_state;
6021 struct sched_domain *sd;
6022 struct s_data d;
6023 int i, ret = -ENOMEM;
6024
6025 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6026 if (alloc_state != sa_rootdomain)
6027 goto error;
6028
	/* Set up domains for the CPUs specified by cpu_map: */
6030 for_each_cpu(i, cpu_map) {
6031 struct sched_domain_topology_level *tl;
6032
6033 sd = NULL;
6034 for_each_sd_topology(tl) {
6035 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6036 if (tl == sched_domain_topology)
6037 *per_cpu_ptr(d.sd, i) = sd;
6038 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6039 sd->flags |= SD_OVERLAP;
6040 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6041 break;
6042 }
6043 }
6044
	/* Build the groups for the domains: */
6046 for_each_cpu(i, cpu_map) {
6047 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6048 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6049 if (sd->flags & SD_OVERLAP) {
6050 if (build_overlap_sched_groups(sd, i))
6051 goto error;
6052 } else {
6053 if (build_sched_groups(sd, i))
6054 goto error;
6055 }
6056 }
6057 }
6058
	/* Calculate CPU power for physical packages and nodes: */
6060 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6061 if (!cpumask_test_cpu(i, cpu_map))
6062 continue;
6063
6064 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6065 claim_allocations(i, sd);
6066 init_sched_groups_power(i, sd);
6067 }
6068 }
6069
	/* Attach the domains: */
6071 rcu_read_lock();
6072 for_each_cpu(i, cpu_map) {
6073 sd = *per_cpu_ptr(d.sd, i);
6074 cpu_attach_domain(sd, d.rd, i);
6075 }
6076 rcu_read_unlock();
6077
6078 ret = 0;
6079error:
6080 __free_domain_allocs(&d, alloc_state, cpu_map);
6081 return ret;
6082}
6083
static cpumask_var_t *doms_cur;	/* current sched domains */
static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur; /* attributes of domains in 'doms_cur' */
6087
/*
 * Special case: if allocation of the doms_cur array of cpumasks fails, fall
 * back to a single sched domain spanning this one cpumask.
 */
6094static cpumask_var_t fallback_doms;
6095
/*
 * arch_update_cpu_topology() lets virtualized architectures update the CPU
 * core maps.  It is supposed to return 1 if the topology changed or 0 if it
 * stayed the same.
 */
6101int __attribute__((weak)) arch_update_cpu_topology(void)
6102{
6103 return 0;
6104}
6105
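/*
 * Allocate an array of 'ndoms' cpumasks for use with partition_sched_domains().
 */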
6106cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6107{
6108 int i;
6109 cpumask_var_t *doms;
6110
6111 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6112 if (!doms)
6113 return NULL;
6114 for (i = 0; i < ndoms; i++) {
6115 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6116 free_sched_domains(doms, i);
6117 return NULL;
6118 }
6119 }
6120 return doms;
6121}
6122
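/*
 * Free an array of cpumasks previously allocated with alloc_sched_domains().
 */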
6123void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6124{
6125 unsigned int i;
6126 for (i = 0; i < ndoms; i++)
6127 free_cpumask_var(doms[i]);
6128 kfree(doms);
6129}
6130
/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 * For now this just excludes isolated CPUs, but it could be used to exclude
 * other special cases in the future.
 */
6136static int init_sched_domains(const struct cpumask *cpu_map)
6137{
6138 int err;
6139
6140 arch_update_cpu_topology();
6141 ndoms_cur = 1;
6142 doms_cur = alloc_sched_domains(ndoms_cur);
6143 if (!doms_cur)
6144 doms_cur = &fallback_doms;
6145 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6146 err = build_sched_domains(doms_cur[0], NULL);
6147 register_sched_domain_sysctl();
6148
6149 return err;
6150}
6151
/*
 * Detach sched domains from a group of CPUs specified in cpu_map;
 * these CPUs will now be attached to the NULL domain.
 */
6156static void detach_destroy_domains(const struct cpumask *cpu_map)
6157{
6158 int i;
6159
6160 rcu_read_lock();
6161 for_each_cpu(i, cpu_map)
6162 cpu_attach_domain(NULL, &def_root_domain, i);
6163 rcu_read_unlock();
6164}
6165
/* Handle NULL as "default" attributes. */
6167static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6168 struct sched_domain_attr *new, int idx_new)
6169{
6170 struct sched_domain_attr tmp;
6171
	/* Fast path: both NULL means "default", hence equal. */
6173 if (!new && !cur)
6174 return 1;
6175
6176 tmp = SD_ATTR_INIT;
6177 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6178 new ? (new + idx_new) : &tmp,
6179 sizeof(struct sched_domain_attr));
6180}
6181
/*
 * Partition sched domains as specified by the 'ndoms_new' cpumasks in the
 * array doms_new[].  This compares doms_new[] against the current
 * partitioning, doms_cur[], destroys each deleted domain and builds each new
 * one.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.  The
 * masks must not intersect; one sched domain is set up for each mask, and
 * CPUs not covered by any mask are not load balanced.  A mask that appears
 * in both doms_cur[] and doms_new[] with equal attributes is left alone.
 *
 * The passed-in 'doms_new' should be allocated with alloc_sched_domains();
 * this routine takes ownership of it and frees it when done.  If the
 * caller's allocation failed, it can pass doms_new == NULL and
 * ndoms_new == 1, in which case the single fallback partition
 * 'fallback_doms' is used and the domains are forcibly rebuilt.
 *
 * Call with the hotplug lock held.
 */
6208void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6209 struct sched_domain_attr *dattr_new)
6210{
6211 int i, j, n;
6212 int new_topology;
6213
6214 mutex_lock(&sched_domains_mutex);
6215
	/* Always unregister in case we don't destroy any domains: */
6217 unregister_sched_domain_sysctl();
6218
	/* Let the architecture update the CPU core mappings: */
6220 new_topology = arch_update_cpu_topology();
6221
6222 n = doms_new ? ndoms_new : 0;
6223
	/* Destroy deleted domains: */
6225 for (i = 0; i < ndoms_cur; i++) {
6226 for (j = 0; j < n && !new_topology; j++) {
6227 if (cpumask_equal(doms_cur[i], doms_new[j])
6228 && dattrs_equal(dattr_cur, i, dattr_new, j))
6229 goto match1;
6230 }
6231
6232 detach_destroy_domains(doms_cur[i]);
6233match1:
6234 ;
6235 }
6236
6237 if (doms_new == NULL) {
6238 ndoms_cur = 0;
6239 doms_new = &fallback_doms;
6240 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6241 WARN_ON_ONCE(dattr_new);
6242 }
6243
	/* Build new domains: */
6245 for (i = 0; i < ndoms_new; i++) {
6246 for (j = 0; j < ndoms_cur && !new_topology; j++) {
6247 if (cpumask_equal(doms_new[i], doms_cur[j])
6248 && dattrs_equal(dattr_new, i, dattr_cur, j))
6249 goto match2;
6250 }
6251
6252 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6253match2:
6254 ;
6255 }
6256
	/* Remember the new sched domains: */
6258 if (doms_cur != &fallback_doms)
6259 free_sched_domains(doms_cur, ndoms_cur);
6260 kfree(dattr_cur);
6261 doms_cur = doms_new;
6262 dattr_cur = dattr_new;
6263 ndoms_cur = ndoms_new;
6264
6265 register_sched_domain_sysctl();
6266
6267 mutex_unlock(&sched_domains_mutex);
6268}
6269
static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */

/*
 * Update cpusets according to the cpu_active mask.  If cpusets are disabled,
 * cpuset_update_active_cpus() becomes a simple wrapper around
 * partition_sched_domains().
 *
 * If we come here as part of a suspend/resume, don't touch cpusets because
 * we want to restore them back to their original state upon resume anyway.
 */
6280static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6281 void *hcpu)
6282{
6283 switch (action) {
6284 case CPU_ONLINE_FROZEN:
6285 case CPU_DOWN_FAILED_FROZEN:

		/*
		 * num_cpus_frozen tracks how many CPUs are involved in the
		 * suspend/resume sequence.  As long as this is not the last
		 * online operation of the resume, just build a single sched
		 * domain and ignore cpusets.
		 */
6293 num_cpus_frozen--;
6294 if (likely(num_cpus_frozen)) {
6295 partition_sched_domains(1, NULL, NULL);
6296 break;
6297 }
6298
		/*
		 * This is the last CPU online operation of the resume, so
		 * fall through and restore the original sched domains by
		 * taking the cpuset configuration into account.
		 */
6305 case CPU_ONLINE:
6306 case CPU_DOWN_FAILED:
6307 cpuset_update_active_cpus(true);
6308 break;
6309 default:
6310 return NOTIFY_DONE;
6311 }
6312 return NOTIFY_OK;
6313}
6314
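/*
 * When a CPU goes down, rebuild the sched domains: via cpusets during normal
 * hot-unplug, or as a single flattened domain while suspending.
 */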
6315static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6316 void *hcpu)
6317{
6318 switch (action) {
6319 case CPU_DOWN_PREPARE:
6320 cpuset_update_active_cpus(false);
6321 break;
6322 case CPU_DOWN_PREPARE_FROZEN:
6323 num_cpus_frozen++;
6324 partition_sched_domains(1, NULL, NULL);
6325 break;
6326 default:
6327 return NOTIFY_DONE;
6328 }
6329 return NOTIFY_OK;
6330}
6331
6332void __init sched_init_smp(void)
6333{
6334 cpumask_var_t non_isolated_cpus;
6335
6336 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6337 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6338
6339 sched_init_numa();
6340
6341 get_online_cpus();
6342 mutex_lock(&sched_domains_mutex);
6343 init_sched_domains(cpu_active_mask);
6344 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6345 if (cpumask_empty(non_isolated_cpus))
6346 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6347 mutex_unlock(&sched_domains_mutex);
6348 put_online_cpus();
6349
6350 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6351 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6352 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6353
6354 init_hrtick();
6355
	/* Move init over to a non-isolated CPU: */
6357 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6358 BUG();
6359 sched_init_granularity();
6360 free_cpumask_var(non_isolated_cpus);
6361
6362 init_sched_rt_class();
6363}
6364#else
6365void __init sched_init_smp(void)
6366{
6367 sched_init_granularity();
6368}
6369#endif
6370
6371const_debug unsigned int sysctl_timer_migration = 1;
6372
6373int in_sched_functions(unsigned long addr)
6374{
6375 return in_lock_functions(addr) ||
6376 (addr >= (unsigned long)__sched_text_start
6377 && addr < (unsigned long)__sched_text_end);
6378}
6379
6380#ifdef CONFIG_CGROUP_SCHED
/*
 * Default task group: every task in the system belongs to this group at
 * bootup.
 */
6385struct task_group root_task_group;
6386LIST_HEAD(task_groups);
6387#endif
6388
6389DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6390
6391void __init sched_init(void)
6392{
6393 int i, j;
6394 unsigned long alloc_size = 0, ptr;
6395
6396#ifdef CONFIG_FAIR_GROUP_SCHED
6397 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6398#endif
6399#ifdef CONFIG_RT_GROUP_SCHED
6400 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6401#endif
6402#ifdef CONFIG_CPUMASK_OFFSTACK
6403 alloc_size += num_possible_cpus() * cpumask_size();
6404#endif
6405 if (alloc_size) {
6406 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6407
6408#ifdef CONFIG_FAIR_GROUP_SCHED
6409 root_task_group.se = (struct sched_entity **)ptr;
6410 ptr += nr_cpu_ids * sizeof(void **);
6411
6412 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6413 ptr += nr_cpu_ids * sizeof(void **);
6414
6415#endif
6416#ifdef CONFIG_RT_GROUP_SCHED
6417 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6418 ptr += nr_cpu_ids * sizeof(void **);
6419
6420 root_task_group.rt_rq = (struct rt_rq **)ptr;
6421 ptr += nr_cpu_ids * sizeof(void **);
6422
6423#endif
6424#ifdef CONFIG_CPUMASK_OFFSTACK
6425 for_each_possible_cpu(i) {
6426 per_cpu(load_balance_mask, i) = (void *)ptr;
6427 ptr += cpumask_size();
6428 }
6429#endif
6430 }
6431
6432#ifdef CONFIG_SMP
6433 init_defrootdomain();
6434#endif
6435
6436 init_rt_bandwidth(&def_rt_bandwidth,
6437 global_rt_period(), global_rt_runtime());
6438
6439#ifdef CONFIG_RT_GROUP_SCHED
6440 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6441 global_rt_period(), global_rt_runtime());
6442#endif
6443
6444#ifdef CONFIG_CGROUP_SCHED
6445 list_add(&root_task_group.list, &task_groups);
6446 INIT_LIST_HEAD(&root_task_group.children);
6447 INIT_LIST_HEAD(&root_task_group.siblings);
6448 autogroup_init(&init_task);
6449
6450#endif
6451
6452 for_each_possible_cpu(i) {
6453 struct rq *rq;
6454
6455 rq = cpu_rq(i);
6456 raw_spin_lock_init(&rq->lock);
6457 rq->nr_running = 0;
6458 rq->calc_load_active = 0;
6459 rq->calc_load_update = jiffies + LOAD_FREQ;
6460 init_cfs_rq(&rq->cfs);
6461 init_rt_rq(&rq->rt, rq);
6462#ifdef CONFIG_FAIR_GROUP_SCHED
6463 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6464 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		/*
		 * How much CPU bandwidth does root_task_group get?
		 *
		 * For task groups formed through the cgroup filesystem,
		 * root_task_group gets 100% of the CPU resources of the
		 * system.  That is then divided among root_task_group's own
		 * tasks and its child task groups in a fair manner, based on
		 * each entity's weight (se->load.weight).
		 *
		 * For example, if root_task_group has 10 tasks of weight 1024
		 * and two child groups A0 and A1 of weight 1024 each, A0's
		 * share of the CPU is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * This is achieved by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */
6484 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6485 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6486#endif
6487
6488 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6489#ifdef CONFIG_RT_GROUP_SCHED
6490 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6491 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6492#endif
6493
6494 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6495 rq->cpu_load[j] = 0;
6496
6497 rq->last_load_update_tick = jiffies;
6498
6499#ifdef CONFIG_SMP
6500 rq->sd = NULL;
6501 rq->rd = NULL;
6502 rq->cpu_power = SCHED_POWER_SCALE;
6503 rq->post_schedule = 0;
6504 rq->active_balance = 0;
6505 rq->next_balance = jiffies;
6506 rq->push_cpu = 0;
6507 rq->cpu = i;
6508 rq->online = 0;
6509 rq->idle_stamp = 0;
6510 rq->avg_idle = 2*sysctl_sched_migration_cost;
6511
6512 INIT_LIST_HEAD(&rq->cfs_tasks);
6513
6514 rq_attach_root(rq, &def_root_domain);
6515#ifdef CONFIG_NO_HZ_COMMON
6516 rq->nohz_flags = 0;
6517#endif
6518#ifdef CONFIG_NO_HZ_FULL
6519 rq->last_sched_tick = 0;
6520#endif
6521#endif
6522 init_rq_hrtick(rq);
6523 atomic_set(&rq->nr_iowait, 0);
6524 }
6525
6526 set_load_weight(&init_task);
6527
6528#ifdef CONFIG_PREEMPT_NOTIFIERS
6529 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6530#endif
6531
6532#ifdef CONFIG_RT_MUTEXES
6533 plist_head_init(&init_task.pi_waiters);
6534#endif
6535

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
6539 atomic_inc(&init_mm.mm_count);
6540 enter_lazy_tlb(&init_mm, current);
6541
	/*
	 * Make us the idle thread.  Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be, but
	 * because we are the idle thread, we just pick up running again when
	 * this runqueue becomes "idle".
	 */
6548 init_idle(current, smp_processor_id());
6549
6550 calc_load_update = jiffies + LOAD_FREQ;
6551
	/*
	 * During early bootup we pretend to be a normal task:
	 */
6555 current->sched_class = &fair_sched_class;
6556
6557#ifdef CONFIG_SMP
6558 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6559
6560 if (cpu_isolated_map == NULL)
6561 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6562 idle_thread_set_boot_cpu();
6563#endif
6564 init_sched_fair_class();
6565
6566 scheduler_running = 1;
6567}
6568
6569#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6570static inline int preempt_count_equals(int preempt_offset)
6571{
6572 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6573
6574 return (nested == preempt_offset);
6575}
6576
6577void __might_sleep(const char *file, int line, int preempt_offset)
6578{
6579 static unsigned long prev_jiffy;
6580
6581 rcu_sleep_check();
6582 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
6583 system_state != SYSTEM_RUNNING || oops_in_progress)
6584 return;
6585 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6586 return;
6587 prev_jiffy = jiffies;
6588
6589 printk(KERN_ERR
6590 "BUG: sleeping function called from invalid context at %s:%d\n",
6591 file, line);
6592 printk(KERN_ERR
6593 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6594 in_atomic(), irqs_disabled(),
6595 current->pid, current->comm);
6596
6597 debug_show_held_locks(current);
6598 if (irqs_disabled())
6599 print_irqtrace_events(current);
6600 dump_stack();
6601}
6602EXPORT_SYMBOL(__might_sleep);
6603#endif
6604
6605#ifdef CONFIG_MAGIC_SYSRQ
6606static void normalize_task(struct rq *rq, struct task_struct *p)
6607{
6608 const struct sched_class *prev_class = p->sched_class;
6609 int old_prio = p->prio;
6610 int on_rq;
6611
6612 on_rq = p->on_rq;
6613 if (on_rq)
6614 dequeue_task(rq, p, 0);
6615 __setscheduler(rq, p, SCHED_NORMAL, 0);
6616 if (on_rq) {
6617 enqueue_task(rq, p, 0);
6618 resched_task(rq->curr);
6619 }
6620
6621 check_class_changed(rq, p, prev_class, old_prio);
6622}
6623
6624void normalize_rt_tasks(void)
6625{
6626 struct task_struct *g, *p;
6627 unsigned long flags;
6628 struct rq *rq;
6629
6630 read_lock_irqsave(&tasklist_lock, flags);
6631 do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
6635 if (!p->mm)
6636 continue;
6637
6638 p->se.exec_start = 0;
6639#ifdef CONFIG_SCHEDSTATS
6640 p->se.statistics.wait_start = 0;
6641 p->se.statistics.sleep_start = 0;
6642 p->se.statistics.block_start = 0;
6643#endif
6644
6645 if (!rt_task(p)) {
			/*
			 * Renice negative-nice-level userspace
			 * tasks back to 0:
			 */
6650 if (TASK_NICE(p) < 0 && p->mm)
6651 set_user_nice(p, 0);
6652 continue;
6653 }
6654
6655 raw_spin_lock(&p->pi_lock);
6656 rq = __task_rq_lock(p);
6657
6658 normalize_task(rq, p);
6659
6660 __task_rq_unlock(rq);
6661 raw_spin_unlock(&p->pi_lock);
6662 } while_each_thread(g, p);
6663
6664 read_unlock_irqrestore(&tasklist_lock, flags);
6665}
6666
6667#endif
6668
6669#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for IA64 MCA handling or kdb.
 *
 * They can only be called when the whole system has been stopped - every CPU
 * needs to be quiescent, and no scheduling activity can take place.  Using
 * them for anything else would be a serious bug, and as a result they are
 * not even visible under any other configuration.
 */

/**
 * curr_task - return the current task for a given CPU.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu.
 */
6688struct task_struct *curr_task(int cpu)
6689{
6690 return cpu_curr(cpu);
6691}
6692
6693#endif
6694
6695#ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack.  It allows the architecture to switch
 * the notion of the current task on a CPU in a non-blocking manner.  It must
 * be called with all CPUs synchronized and interrupts disabled; the caller
 * must save the original value of the current task (see curr_task() above)
 * and restore it before re-enabling interrupts and restarting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6711void set_curr_task(int cpu, struct task_struct *p)
6712{
6713 cpu_curr(cpu) = p;
6714}
6715
6716#endif
6717
6718#ifdef CONFIG_CGROUP_SCHED
6719
6720static DEFINE_SPINLOCK(task_group_lock);
6721
6722static void free_sched_group(struct task_group *tg)
6723{
6724 free_fair_sched_group(tg);
6725 free_rt_sched_group(tg);
6726 autogroup_free(tg);
6727 kfree(tg);
6728}
6729
/* Allocate runqueue structures etc. for a new task group. */
6731struct task_group *sched_create_group(struct task_group *parent)
6732{
6733 struct task_group *tg;
6734
6735 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
6736 if (!tg)
6737 return ERR_PTR(-ENOMEM);
6738
6739 if (!alloc_fair_sched_group(tg, parent))
6740 goto err;
6741
6742 if (!alloc_rt_sched_group(tg, parent))
6743 goto err;
6744
6745 return tg;
6746
6747err:
6748 free_sched_group(tg);
6749 return ERR_PTR(-ENOMEM);
6750}
6751
6752void sched_online_group(struct task_group *tg, struct task_group *parent)
6753{
6754 unsigned long flags;
6755
6756 spin_lock_irqsave(&task_group_lock, flags);
6757 list_add_rcu(&tg->list, &task_groups);
6758
6759 WARN_ON(!parent);
6760
6761 tg->parent = parent;
6762 INIT_LIST_HEAD(&tg->children);
6763 list_add_rcu(&tg->siblings, &parent->children);
6764 spin_unlock_irqrestore(&task_group_lock, flags);
6765}
6766
/* RCU callback to free the structures associated with a task group. */
6768static void free_sched_group_rcu(struct rcu_head *rhp)
6769{
6770
6771 free_sched_group(container_of(rhp, struct task_group, rcu));
6772}
6773
/* Destroy the runqueue structures etc. associated with a task group. */
6775void sched_destroy_group(struct task_group *tg)
6776{
	/* Wait for possible concurrent references to cfs_rqs to complete: */
6778 call_rcu(&tg->rcu, free_sched_group_rcu);
6779}
6780
6781void sched_offline_group(struct task_group *tg)
6782{
6783 unsigned long flags;
6784 int i;
6785
	/* End participation in shares distribution: */
6787 for_each_possible_cpu(i)
6788 unregister_fair_sched_group(tg, i);
6789
6790 spin_lock_irqsave(&task_group_lock, flags);
6791 list_del_rcu(&tg->list);
6792 list_del_rcu(&tg->siblings);
6793 spin_unlock_irqrestore(&task_group_lock, flags);
6794}
6795
/*
 * Change a task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now.  This function just updates tsk->se.cfs_rq and tsk->se.parent to
 * reflect the new group.
 */
6801void sched_move_task(struct task_struct *tsk)
6802{
6803 struct task_group *tg;
6804 int on_rq, running;
6805 unsigned long flags;
6806 struct rq *rq;
6807
6808 rq = task_rq_lock(tsk, &flags);
6809
6810 running = task_current(rq, tsk);
6811 on_rq = tsk->on_rq;
6812
6813 if (on_rq)
6814 dequeue_task(rq, tsk, 0);
6815 if (unlikely(running))
6816 tsk->sched_class->put_prev_task(rq, tsk);
6817
6818 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
6819 lockdep_is_held(&tsk->sighand->siglock)),
6820 struct task_group, css);
6821 tg = autogroup_task_group(tsk, tg);
6822 tsk->sched_task_group = tg;
6823
6824#ifdef CONFIG_FAIR_GROUP_SCHED
6825 if (tsk->sched_class->task_move_group)
6826 tsk->sched_class->task_move_group(tsk, on_rq);
6827 else
6828#endif
6829 set_task_rq(tsk, task_cpu(tsk));
6830
6831 if (unlikely(running))
6832 tsk->sched_class->set_curr_task(rq);
6833 if (on_rq)
6834 enqueue_task(rq, tsk, 0);
6835
6836 task_rq_unlock(rq, tsk, &flags);
6837}
6838#endif
6839
6840#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
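/*
 * Express runtime/period as a fixed-point ratio with a 20-bit fractional
 * part; RUNTIME_INF maps to a full 1.0 (1 << 20).
 */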
6841static unsigned long to_ratio(u64 period, u64 runtime)
6842{
6843 if (runtime == RUNTIME_INF)
6844 return 1ULL << 20;
6845
6846 return div64_u64(runtime << 20, period);
6847}
6848#endif
6849
6850#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real-time bandwidth constraints remain schedulable.
 */
6854static DEFINE_MUTEX(rt_constraints_mutex);
6855
/* Must be called with tasklist_lock held. */
6857static inline int tg_has_rt_tasks(struct task_group *tg)
6858{
6859 struct task_struct *g, *p;
6860
6861 do_each_thread(g, p) {
6862 if (rt_task(p) && task_rq(p)->rt.tg == tg)
6863 return 1;
6864 } while_each_thread(g, p);
6865
6866 return 0;
6867}
6868
6869struct rt_schedulable_data {
6870 struct task_group *tg;
6871 u64 rt_period;
6872 u64 rt_runtime;
6873};
6874
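/*
 * walk_tg_tree() callback: check whether 'tg' can accommodate the
 * period/runtime proposed in 'data' without exceeding the global limit or
 * being exceeded by the sum of its children's allocations.
 */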
6875static int tg_rt_schedulable(struct task_group *tg, void *data)
6876{
6877 struct rt_schedulable_data *d = data;
6878 struct task_group *child;
6879 unsigned long total, sum = 0;
6880 u64 period, runtime;
6881
6882 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6883 runtime = tg->rt_bandwidth.rt_runtime;
6884
6885 if (tg == d->tg) {
6886 period = d->rt_period;
6887 runtime = d->rt_runtime;
6888 }
6889
	/*
	 * Cannot have more runtime than the period.
	 */
6893 if (runtime > period && runtime != RUNTIME_INF)
6894 return -EINVAL;
6895
	/*
	 * Ensure we don't starve existing RT tasks.
	 */
6899 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
6900 return -EBUSY;
6901
6902 total = to_ratio(period, runtime);
6903
	/*
	 * Nobody can have more than the global setting allows.
	 */
6907 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
6908 return -EINVAL;
6909
	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
6913 list_for_each_entry_rcu(child, &tg->children, siblings) {
6914 period = ktime_to_ns(child->rt_bandwidth.rt_period);
6915 runtime = child->rt_bandwidth.rt_runtime;
6916
6917 if (child == d->tg) {
6918 period = d->rt_period;
6919 runtime = d->rt_runtime;
6920 }
6921
6922 sum += to_ratio(period, runtime);
6923 }
6924
6925 if (sum > total)
6926 return -EINVAL;
6927
6928 return 0;
6929}
6930
6931static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
6932{
6933 int ret;
6934
6935 struct rt_schedulable_data data = {
6936 .tg = tg,
6937 .rt_period = period,
6938 .rt_runtime = runtime,
6939 };
6940
6941 rcu_read_lock();
6942 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
6943 rcu_read_unlock();
6944
6945 return ret;
6946}
6947
6948static int tg_set_rt_bandwidth(struct task_group *tg,
6949 u64 rt_period, u64 rt_runtime)
6950{
6951 int i, err = 0;
6952
6953 mutex_lock(&rt_constraints_mutex);
6954 read_lock(&tasklist_lock);
6955 err = __rt_schedulable(tg, rt_period, rt_runtime);
6956 if (err)
6957 goto unlock;
6958
6959 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6960 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
6961 tg->rt_bandwidth.rt_runtime = rt_runtime;
6962
6963 for_each_possible_cpu(i) {
6964 struct rt_rq *rt_rq = tg->rt_rq[i];
6965
6966 raw_spin_lock(&rt_rq->rt_runtime_lock);
6967 rt_rq->rt_runtime = rt_runtime;
6968 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6969 }
6970 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6971unlock:
6972 read_unlock(&tasklist_lock);
6973 mutex_unlock(&rt_constraints_mutex);
6974
6975 return err;
6976}
6977
6978static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
6979{
6980 u64 rt_runtime, rt_period;
6981
6982 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6983 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
6984 if (rt_runtime_us < 0)
6985 rt_runtime = RUNTIME_INF;
6986
6987 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6988}
6989
6990static long sched_group_rt_runtime(struct task_group *tg)
6991{
6992 u64 rt_runtime_us;
6993
6994 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
6995 return -1;
6996
6997 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
6998 do_div(rt_runtime_us, NSEC_PER_USEC);
6999 return rt_runtime_us;
7000}
7001
7002static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7003{
7004 u64 rt_runtime, rt_period;
7005
7006 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7007 rt_runtime = tg->rt_bandwidth.rt_runtime;
7008
7009 if (rt_period == 0)
7010 return -EINVAL;
7011
7012 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7013}
7014
7015static long sched_group_rt_period(struct task_group *tg)
7016{
7017 u64 rt_period_us;
7018
7019 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7020 do_div(rt_period_us, NSEC_PER_USEC);
7021 return rt_period_us;
7022}
7023
7024static int sched_rt_global_constraints(void)
7025{
7026 u64 runtime, period;
7027 int ret = 0;
7028
7029 if (sysctl_sched_rt_period <= 0)
7030 return -EINVAL;
7031
7032 runtime = global_rt_runtime();
7033 period = global_rt_period();
7034
	/*
	 * Sanity check on the sysctl variables.
	 */
7038 if (runtime > period && runtime != RUNTIME_INF)
7039 return -EINVAL;
7040
7041 mutex_lock(&rt_constraints_mutex);
7042 read_lock(&tasklist_lock);
7043 ret = __rt_schedulable(NULL, 0, 0);
7044 read_unlock(&tasklist_lock);
7045 mutex_unlock(&rt_constraints_mutex);
7046
7047 return ret;
7048}
7049
7050static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7051{
	/* Don't accept realtime tasks when there is no way for them to run. */
7053 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7054 return 0;
7055
7056 return 1;
7057}
7058
7059#else
7060static int sched_rt_global_constraints(void)
7061{
7062 unsigned long flags;
7063 int i;
7064
7065 if (sysctl_sched_rt_period <= 0)
7066 return -EINVAL;
7067
	/*
	 * There are always some RT tasks in the root group
	 * -- migration, kstopmachine etc.
	 */
7072 if (sysctl_sched_rt_runtime == 0)
7073 return -EBUSY;
7074
7075 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7076 for_each_possible_cpu(i) {
7077 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7078
7079 raw_spin_lock(&rt_rq->rt_runtime_lock);
7080 rt_rq->rt_runtime = global_rt_runtime();
7081 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7082 }
7083 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7084
7085 return 0;
7086}
7087#endif
7088
7089int sched_rr_handler(struct ctl_table *table, int write,
7090 void __user *buffer, size_t *lenp,
7091 loff_t *ppos)
7092{
7093 int ret;
7094 static DEFINE_MUTEX(mutex);
7095
7096 mutex_lock(&mutex);
7097 ret = proc_dointvec(table, write, buffer, lenp, ppos);

	/* Make sure that internally we keep jiffies. */
	/* Also, writing zero resets the timeslice to the default. */
7100 if (!ret && write) {
7101 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7102 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7103 }
7104 mutex_unlock(&mutex);
7105 return ret;
7106}
7107
7108int sched_rt_handler(struct ctl_table *table, int write,
7109 void __user *buffer, size_t *lenp,
7110 loff_t *ppos)
7111{
7112 int ret;
7113 int old_period, old_runtime;
7114 static DEFINE_MUTEX(mutex);
7115
7116 mutex_lock(&mutex);
7117 old_period = sysctl_sched_rt_period;
7118 old_runtime = sysctl_sched_rt_runtime;
7119
7120 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7121
7122 if (!ret && write) {
7123 ret = sched_rt_global_constraints();
7124 if (ret) {
7125 sysctl_sched_rt_period = old_period;
7126 sysctl_sched_rt_runtime = old_runtime;
7127 } else {
7128 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7129 def_rt_bandwidth.rt_period =
7130 ns_to_ktime(global_rt_period());
7131 }
7132 }
7133 mutex_unlock(&mutex);
7134
7135 return ret;
7136}
7137
7138#ifdef CONFIG_CGROUP_SCHED
7139
/* Return the task_group corresponding to a cgroup. */
7141static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7142{
7143 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7144 struct task_group, css);
7145}
7146
7147static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7148{
7149 struct task_group *tg, *parent;
7150
7151 if (!cgrp->parent) {
		/* This is early initialization for the top cgroup. */
7153 return &root_task_group.css;
7154 }
7155
7156 parent = cgroup_tg(cgrp->parent);
7157 tg = sched_create_group(parent);
7158 if (IS_ERR(tg))
7159 return ERR_PTR(-ENOMEM);
7160
7161 return &tg->css;
7162}
7163
7164static int cpu_cgroup_css_online(struct cgroup *cgrp)
7165{
7166 struct task_group *tg = cgroup_tg(cgrp);
7167 struct task_group *parent;
7168
7169 if (!cgrp->parent)
7170 return 0;
7171
7172 parent = cgroup_tg(cgrp->parent);
7173 sched_online_group(tg, parent);
7174 return 0;
7175}
7176
7177static void cpu_cgroup_css_free(struct cgroup *cgrp)
7178{
7179 struct task_group *tg = cgroup_tg(cgrp);
7180
7181 sched_destroy_group(tg);
7182}
7183
7184static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7185{
7186 struct task_group *tg = cgroup_tg(cgrp);
7187
7188 sched_offline_group(tg);
7189}
7190
7191static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7192 struct cgroup_taskset *tset)
7193{
7194 struct task_struct *task;
7195
7196 cgroup_taskset_for_each(task, cgrp, tset) {
7197#ifdef CONFIG_RT_GROUP_SCHED
7198 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7199 return -EINVAL;
7200#else
		/* We don't support RT tasks being in separate groups. */
7202 if (task->sched_class != &fair_sched_class)
7203 return -EINVAL;
7204#endif
7205 }
7206 return 0;
7207}
7208
7209static void cpu_cgroup_attach(struct cgroup *cgrp,
7210 struct cgroup_taskset *tset)
7211{
7212 struct task_struct *task;
7213
7214 cgroup_taskset_for_each(task, cgrp, tset)
7215 sched_move_task(task);
7216}
7217
7218static void
7219cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7220 struct task_struct *task)
7221{
	/*
	 * cgroup_exit() is called in the copy_process() failure path.
	 * Ignore this case since the task hasn't run yet; this avoids
	 * trying to poke a half-freed task state from generic code.
	 */
7227 if (!(task->flags & PF_EXITING))
7228 return;
7229
7230 sched_move_task(task);
7231}
7232
7233#ifdef CONFIG_FAIR_GROUP_SCHED
7234static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7235 u64 shareval)
7236{
7237 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7238}
7239
7240static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7241{
7242 struct task_group *tg = cgroup_tg(cgrp);
7243
7244 return (u64) scale_load_down(tg->shares);
7245}
7246
7247#ifdef CONFIG_CFS_BANDWIDTH
7248static DEFINE_MUTEX(cfs_constraints_mutex);
7249
7250const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7251const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
7252
7253static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7254
7255static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7256{
7257 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7258 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7259
7260 if (tg == &root_task_group)
7261 return -EINVAL;
7262
	/*
	 * Ensure we have some amount of bandwidth every period.  This is to
	 * prevent reaching a state of large arrears when throttled via
	 * entity_tick(), which would result in prolonged exit starvation.
	 */
7268 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7269 return -EINVAL;
7270
	/*
	 * Likewise, bound things on the other side by preventing insanely
	 * large periods.  This also allows us to normalize when computing
	 * quota feasibility.
	 */
7276 if (period > max_cfs_quota_period)
7277 return -EINVAL;
7278
7279 mutex_lock(&cfs_constraints_mutex);
7280 ret = __cfs_schedulable(tg, period, quota);
7281 if (ret)
7282 goto out_unlock;
7283
7284 runtime_enabled = quota != RUNTIME_INF;
7285 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7286 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7287 raw_spin_lock_irq(&cfs_b->lock);
7288 cfs_b->period = ns_to_ktime(period);
7289 cfs_b->quota = quota;
7290
7291 __refill_cfs_bandwidth_runtime(cfs_b);
	/* Restart the period timer (if active) to handle the new period expiry: */
7293 if (runtime_enabled && cfs_b->timer_active) {
		/* Force a reprogram: */
7295 cfs_b->timer_active = 0;
7296 __start_cfs_bandwidth(cfs_b);
7297 }
7298 raw_spin_unlock_irq(&cfs_b->lock);
7299
7300 for_each_possible_cpu(i) {
7301 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7302 struct rq *rq = cfs_rq->rq;
7303
7304 raw_spin_lock_irq(&rq->lock);
7305 cfs_rq->runtime_enabled = runtime_enabled;
7306 cfs_rq->runtime_remaining = 0;
7307
7308 if (cfs_rq->throttled)
7309 unthrottle_cfs_rq(cfs_rq);
7310 raw_spin_unlock_irq(&rq->lock);
7311 }
7312out_unlock:
7313 mutex_unlock(&cfs_constraints_mutex);
7314
7315 return ret;
7316}
7317
7318int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7319{
7320 u64 quota, period;
7321
7322 period = ktime_to_ns(tg->cfs_bandwidth.period);
7323 if (cfs_quota_us < 0)
7324 quota = RUNTIME_INF;
7325 else
7326 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7327
7328 return tg_set_cfs_bandwidth(tg, period, quota);
7329}
7330
7331long tg_get_cfs_quota(struct task_group *tg)
7332{
7333 u64 quota_us;
7334
7335 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7336 return -1;
7337
7338 quota_us = tg->cfs_bandwidth.quota;
7339 do_div(quota_us, NSEC_PER_USEC);
7340
7341 return quota_us;
7342}
7343
7344int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7345{
7346 u64 quota, period;
7347
7348 period = (u64)cfs_period_us * NSEC_PER_USEC;
7349 quota = tg->cfs_bandwidth.quota;
7350
7351 return tg_set_cfs_bandwidth(tg, period, quota);
7352}
7353
7354long tg_get_cfs_period(struct task_group *tg)
7355{
7356 u64 cfs_period_us;
7357
7358 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7359 do_div(cfs_period_us, NSEC_PER_USEC);
7360
7361 return cfs_period_us;
7362}
7363
7364static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7365{
7366 return tg_get_cfs_quota(cgroup_tg(cgrp));
7367}
7368
7369static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7370 s64 cfs_quota_us)
7371{
7372 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7373}
7374
7375static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7376{
7377 return tg_get_cfs_period(cgroup_tg(cgrp));
7378}
7379
7380static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7381 u64 cfs_period_us)
7382{
7383 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7384}
7385
7386struct cfs_schedulable_data {
7387 struct task_group *tg;
7388 u64 period, quota;
7389};
7390
/*
 * Normalize a group's quota/period to quota/max_period.
 * Note: units are usecs.
 */
7395static u64 normalize_cfs_quota(struct task_group *tg,
7396 struct cfs_schedulable_data *d)
7397{
7398 u64 quota, period;
7399
7400 if (tg == d->tg) {
7401 period = d->period;
7402 quota = d->quota;
7403 } else {
7404 period = tg_get_cfs_period(tg);
7405 quota = tg_get_cfs_quota(tg);
7406 }
7407
	/* Note: these two conditions should typically be equivalent. */
7409 if (quota == RUNTIME_INF || quota == -1)
7410 return RUNTIME_INF;
7411
7412 return to_ratio(period, quota);
7413}
7414
7415static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7416{
7417 struct cfs_schedulable_data *d = data;
7418 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7419 s64 quota = 0, parent_quota = -1;
7420
7421 if (!tg->parent) {
7422 quota = RUNTIME_INF;
7423 } else {
7424 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7425
7426 quota = normalize_cfs_quota(tg, d);
7427 parent_quota = parent_b->hierarchal_quota;
7428
		/*
		 * Ensure max(child_quota) <= parent_quota, and inherit the
		 * parent's quota when no limit is set.
		 */
7433 if (quota == RUNTIME_INF)
7434 quota = parent_quota;
7435 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7436 return -EINVAL;
7437 }
7438 cfs_b->hierarchal_quota = quota;
7439
7440 return 0;
7441}
7442
7443static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7444{
7445 int ret;
7446 struct cfs_schedulable_data data = {
7447 .tg = tg,
7448 .period = period,
7449 .quota = quota,
7450 };
7451
7452 if (quota != RUNTIME_INF) {
7453 do_div(data.period, NSEC_PER_USEC);
7454 do_div(data.quota, NSEC_PER_USEC);
7455 }
7456
7457 rcu_read_lock();
7458 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7459 rcu_read_unlock();
7460
7461 return ret;
7462}
7463
7464static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7465 struct cgroup_map_cb *cb)
7466{
7467 struct task_group *tg = cgroup_tg(cgrp);
7468 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7469
7470 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7471 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7472 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7473
7474 return 0;
7475}
7476#endif
7477#endif
7478
7479#ifdef CONFIG_RT_GROUP_SCHED
7480static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7481 s64 val)
7482{
7483 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7484}
7485
7486static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7487{
7488 return sched_group_rt_runtime(cgroup_tg(cgrp));
7489}
7490
7491static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7492 u64 rt_period_us)
7493{
7494 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7495}
7496
7497static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
7498{
7499 return sched_group_rt_period(cgroup_tg(cgrp));
7500}
7501#endif
7502
7503static struct cftype cpu_files[] = {
7504#ifdef CONFIG_FAIR_GROUP_SCHED
7505 {
7506 .name = "shares",
7507 .read_u64 = cpu_shares_read_u64,
7508 .write_u64 = cpu_shares_write_u64,
7509 },
7510#endif
7511#ifdef CONFIG_CFS_BANDWIDTH
7512 {
7513 .name = "cfs_quota_us",
7514 .read_s64 = cpu_cfs_quota_read_s64,
7515 .write_s64 = cpu_cfs_quota_write_s64,
7516 },
7517 {
7518 .name = "cfs_period_us",
7519 .read_u64 = cpu_cfs_period_read_u64,
7520 .write_u64 = cpu_cfs_period_write_u64,
7521 },
7522 {
7523 .name = "stat",
7524 .read_map = cpu_stats_show,
7525 },
7526#endif
7527#ifdef CONFIG_RT_GROUP_SCHED
7528 {
7529 .name = "rt_runtime_us",
7530 .read_s64 = cpu_rt_runtime_read,
7531 .write_s64 = cpu_rt_runtime_write,
7532 },
7533 {
7534 .name = "rt_period_us",
7535 .read_u64 = cpu_rt_period_read_uint,
7536 .write_u64 = cpu_rt_period_write_uint,
7537 },
7538#endif
7539 { }
7540};
7541
7542struct cgroup_subsys cpu_cgroup_subsys = {
7543 .name = "cpu",
7544 .css_alloc = cpu_cgroup_css_alloc,
7545 .css_free = cpu_cgroup_css_free,
7546 .css_online = cpu_cgroup_css_online,
7547 .css_offline = cpu_cgroup_css_offline,
7548 .can_attach = cpu_cgroup_can_attach,
7549 .attach = cpu_cgroup_attach,
7550 .exit = cpu_cgroup_exit,
7551 .subsys_id = cpu_cgroup_subsys_id,
7552 .base_cftypes = cpu_files,
7553 .early_init = 1,
7554};
7555
7556#endif
7557
7558void dump_cpu_task(int cpu)
7559{
7560 pr_info("Task dump for CPU %d:\n", cpu);
7561 sched_show_task(cpu_curr(cpu));
7562}
7563