/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 */
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76
77#include <asm/switch_to.h>
78#include <asm/tlb.h>
79#include <asm/irq_regs.h>
80#include <asm/mutex.h>
81#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h>
83#endif
84
85#include "sched.h"
86#include "../workqueue_internal.h"
87#include "../smpboot.h"
88
89#define CREATE_TRACE_POINTS
90#include <trace/events/sched.h>
91
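/*
 * Arm the bandwidth period timer: unless it is already active, keep
 * forwarding it by whole periods past the current time and (re)start it.
 */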
92void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
93{
94 unsigned long delta;
95 ktime_t soft, hard, now;
96
97 for (;;) {
98 if (hrtimer_active(period_timer))
99 break;
100
101 now = hrtimer_cb_get_time(period_timer);
102 hrtimer_forward(period_timer, now, period);
103
104 soft = hrtimer_get_softexpires(period_timer);
105 hard = hrtimer_get_expires(period_timer);
106 delta = ktime_to_ns(ktime_sub(hard, soft));
107 __hrtimer_start_range_ns(period_timer, soft, delta,
108 HRTIMER_MODE_ABS_PINNED, 0);
109 }
110}
111
112DEFINE_MUTEX(sched_domains_mutex);
113DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
114
115static void update_rq_clock_task(struct rq *rq, s64 delta);
116
117void update_rq_clock(struct rq *rq)
118{
119 s64 delta;
120
121 if (rq->skip_clock_update > 0)
122 return;
123
124 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
125 rq->clock += delta;
126 update_rq_clock_task(rq, delta);
127}
128
129
130
131
132
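/*
 * Debugging: various feature bits.
 *
 * The SCHED_FEAT() entries in "features.h" are expanded below into the
 * default sysctl_sched_features bitmask and, under CONFIG_SCHED_DEBUG,
 * into the feature name table and per-feature static keys.
 */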
133#define SCHED_FEAT(name, enabled) \
134 (1UL << __SCHED_FEAT_##name) * enabled |
135
136const_debug unsigned int sysctl_sched_features =
137#include "features.h"
138 0;
139
140#undef SCHED_FEAT
141
142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled) \
144 #name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154 int i;
155
156 for (i = 0; i < __SCHED_FEAT_NR; i++) {
157 if (!(sysctl_sched_features & (1UL << i)))
158 seq_puts(m, "NO_");
159 seq_printf(m, "%s ", sched_feat_names[i]);
160 }
161 seq_puts(m, "\n");
162
163 return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled) \
172 jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
180static void sched_feat_disable(int i)
181{
182 if (static_key_enabled(&sched_feat_keys[i]))
183 static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188 if (!static_key_enabled(&sched_feat_keys[i]))
189 static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif
195
196static int sched_feat_set(char *cmp)
197{
198 int i;
199 int neg = 0;
200
201 if (strncmp(cmp, "NO_", 3) == 0) {
202 neg = 1;
203 cmp += 3;
204 }
205
206 for (i = 0; i < __SCHED_FEAT_NR; i++) {
207 if (strcmp(cmp, sched_feat_names[i]) == 0) {
208 if (neg) {
209 sysctl_sched_features &= ~(1UL << i);
210 sched_feat_disable(i);
211 } else {
212 sysctl_sched_features |= (1UL << i);
213 sched_feat_enable(i);
214 }
215 break;
216 }
217 }
218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
240 if (i == __SCHED_FEAT_NR)
241 return -EINVAL;
242
243 *ppos += cnt;
244
245 return cnt;
246}
247
248static int sched_feat_open(struct inode *inode, struct file *filp)
249{
250 return single_open(filp, sched_feat_show, NULL);
251}
252
253static const struct file_operations sched_feat_fops = {
254 .open = sched_feat_open,
255 .write = sched_feat_write,
256 .read = seq_read,
257 .llseek = seq_lseek,
258 .release = single_release,
259};
260
261static __init int sched_init_debug(void)
262{
263 debugfs_create_file("sched_features", 0644, NULL, NULL,
264 &sched_feat_fops);
265
266 return 0;
267}
268late_initcall(sched_init_debug);
269#endif
270
271
272
273
274
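/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */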
275const_debug unsigned int sysctl_sched_nr_migrate = 32;
276
277
278
279
280
281
282
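/*
 * Period over which we average the RT time consumption, measured in ms.
 *
 * default: 1s
 */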
283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
284
285
286
287
288
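/*
 * Period over which we measure -rt task CPU usage in us.
 * default: 1s
 */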
289unsigned int sysctl_sched_rt_period = 1000000;
290
291__read_mostly int scheduler_running;
292
293
294
295
296
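/*
 * Part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */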
297int sysctl_sched_rt_runtime = 950000;
298
299
300
301
302
303
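/*
 * __task_rq_lock - lock the rq @p resides on.  The caller must hold
 * p->pi_lock.
 */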
304static inline struct rq *__task_rq_lock(struct task_struct *p)
305 __acquires(rq->lock)
306{
307 struct rq *rq;
308
309 lockdep_assert_held(&p->pi_lock);
310
311 for (;;) {
312 rq = task_rq(p);
313 raw_spin_lock(&rq->lock);
314 if (likely(rq == task_rq(p)))
315 return rq;
316 raw_spin_unlock(&rq->lock);
317 }
318}
319
320
321
322
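/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */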
323static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
324 __acquires(p->pi_lock)
325 __acquires(rq->lock)
326{
327 struct rq *rq;
328
329 for (;;) {
330 raw_spin_lock_irqsave(&p->pi_lock, *flags);
331 rq = task_rq(p);
332 raw_spin_lock(&rq->lock);
333 if (likely(rq == task_rq(p)))
334 return rq;
335 raw_spin_unlock(&rq->lock);
336 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
337 }
338}
339
340static void __task_rq_unlock(struct rq *rq)
341 __releases(rq->lock)
342{
343 raw_spin_unlock(&rq->lock);
344}
345
346static inline void
347task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
348 __releases(rq->lock)
349 __releases(p->pi_lock)
350{
351 raw_spin_unlock(&rq->lock);
352 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
353}
354
355
356
357
358static struct rq *this_rq_lock(void)
359 __acquires(rq->lock)
360{
361 struct rq *rq;
362
363 local_irq_disable();
364 rq = this_rq();
365 raw_spin_lock(&rq->lock);
366
367 return rq;
368}
369
370#ifdef CONFIG_SCHED_HRTICK
371
372
373
374
375static void hrtick_clear(struct rq *rq)
376{
377 if (hrtimer_active(&rq->hrtick_timer))
378 hrtimer_cancel(&rq->hrtick_timer);
379}
380
381
382
383
384
385static enum hrtimer_restart hrtick(struct hrtimer *timer)
386{
387 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
388
389 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
390
391 raw_spin_lock(&rq->lock);
392 update_rq_clock(rq);
393 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
394 raw_spin_unlock(&rq->lock);
395
396 return HRTIMER_NORESTART;
397}
398
399#ifdef CONFIG_SMP
400
401static int __hrtick_restart(struct rq *rq)
402{
403 struct hrtimer *timer = &rq->hrtick_timer;
404 ktime_t time = hrtimer_get_softexpires(timer);
405
406 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
407}
408
409
410
411
412static void __hrtick_start(void *arg)
413{
414 struct rq *rq = arg;
415
416 raw_spin_lock(&rq->lock);
417 __hrtick_restart(rq);
418 rq->hrtick_csd_pending = 0;
419 raw_spin_unlock(&rq->lock);
420}
421
422
423
424
425
426
427void hrtick_start(struct rq *rq, u64 delay)
428{
429 struct hrtimer *timer = &rq->hrtick_timer;
430 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
431
432 hrtimer_set_expires(timer, time);
433
434 if (rq == this_rq()) {
435 __hrtick_restart(rq);
436 } else if (!rq->hrtick_csd_pending) {
437 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
438 rq->hrtick_csd_pending = 1;
439 }
440}
441
442static int
443hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
444{
445 int cpu = (int)(long)hcpu;
446
447 switch (action) {
448 case CPU_UP_CANCELED:
449 case CPU_UP_CANCELED_FROZEN:
450 case CPU_DOWN_PREPARE:
451 case CPU_DOWN_PREPARE_FROZEN:
452 case CPU_DEAD:
453 case CPU_DEAD_FROZEN:
454 hrtick_clear(cpu_rq(cpu));
455 return NOTIFY_OK;
456 }
457
458 return NOTIFY_DONE;
459}
460
461static __init void init_hrtick(void)
462{
463 hotcpu_notifier(hotplug_hrtick, 0);
464}
465#else
466
467
468
469
470
471void hrtick_start(struct rq *rq, u64 delay)
472{
473 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
474 HRTIMER_MODE_REL_PINNED, 0);
475}
476
477static inline void init_hrtick(void)
478{
479}
480#endif
481
482static void init_rq_hrtick(struct rq *rq)
483{
484#ifdef CONFIG_SMP
485 rq->hrtick_csd_pending = 0;
486
487 rq->hrtick_csd.flags = 0;
488 rq->hrtick_csd.func = __hrtick_start;
489 rq->hrtick_csd.info = rq;
490#endif
491
492 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
493 rq->hrtick_timer.function = hrtick;
494}
495#else
496static inline void hrtick_clear(struct rq *rq)
497{
498}
499
500static inline void init_rq_hrtick(struct rq *rq)
501{
502}
503
504static inline void init_hrtick(void)
505{
506}
507#endif
508
509
510
511
512
513
514
515
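/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */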
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p)
518{
519 int cpu;
520
521 assert_raw_spin_locked(&task_rq(p)->lock);
522
523 if (test_tsk_need_resched(p))
524 return;
525
526 set_tsk_need_resched(p);
527
528 cpu = task_cpu(p);
529 if (cpu == smp_processor_id())
530 return;
531
532
533 smp_mb();
534 if (!tsk_is_polling(p))
535 smp_send_reschedule(cpu);
536}
537
538void resched_cpu(int cpu)
539{
540 struct rq *rq = cpu_rq(cpu);
541 unsigned long flags;
542
543 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
544 return;
545 resched_task(cpu_curr(cpu));
546 raw_spin_unlock_irqrestore(&rq->lock, flags);
547}
548
549#ifdef CONFIG_NO_HZ_COMMON
550
551
552
553
554
555
556
557
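/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * We don't do a similar optimization for a completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be up to date wrt jiffies etc).
 */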
558int get_nohz_timer_target(void)
559{
560 int cpu = smp_processor_id();
561 int i;
562 struct sched_domain *sd;
563
564 rcu_read_lock();
565 for_each_domain(cpu, sd) {
566 for_each_cpu(i, sched_domain_span(sd)) {
567 if (!idle_cpu(i)) {
568 cpu = i;
569 goto unlock;
570 }
571 }
572 }
573unlock:
574 rcu_read_unlock();
575 return cpu;
576}
577
578
579
580
581
582
583
584
585
586
587static void wake_up_idle_cpu(int cpu)
588{
589 struct rq *rq = cpu_rq(cpu);
590
591 if (cpu == smp_processor_id())
592 return;
593
594
595
596
597
598
599
600
601 if (rq->curr != rq->idle)
602 return;
603
604
605
606
607
608
609 set_tsk_need_resched(rq->idle);
610
611
612 smp_mb();
613 if (!tsk_is_polling(rq->idle))
614 smp_send_reschedule(cpu);
615}
616
617static bool wake_up_full_nohz_cpu(int cpu)
618{
619 if (tick_nohz_full_cpu(cpu)) {
620 if (cpu != smp_processor_id() ||
621 tick_nohz_tick_stopped())
622 smp_send_reschedule(cpu);
623 return true;
624 }
625
626 return false;
627}
628
629void wake_up_nohz_cpu(int cpu)
630{
631 if (!wake_up_full_nohz_cpu(cpu))
632 wake_up_idle_cpu(cpu);
633}
634
635static inline bool got_nohz_idle_kick(void)
636{
637 int cpu = smp_processor_id();
638
639 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
640 return false;
641
642 if (idle_cpu(cpu) && !need_resched())
643 return true;
644
645
646
647
648
649 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
650 return false;
651}
652
653#else
654
655static inline bool got_nohz_idle_kick(void)
656{
657 return false;
658}
659
660#endif
661
662#ifdef CONFIG_NO_HZ_FULL
663bool sched_can_stop_tick(void)
664{
665 struct rq *rq;
666
667 rq = this_rq();
668
669
670 smp_rmb();
671
672
673 if (rq->nr_running > 1)
674 return false;
675
676 return true;
677}
678#endif
679
680void sched_avg_update(struct rq *rq)
681{
682 s64 period = sched_avg_period();
683
684 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
685
686
687
688
689
690 asm("" : "+rm" (rq->age_stamp));
691 rq->age_stamp += period;
692 rq->rt_avg /= 2;
693 }
694}
695
696#else
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif
703
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
705 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
706
707
708
709
710
711
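/*
 * Iterate the task_group tree rooted at *from, calling @down when first
 * entering a node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */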
712int walk_tg_tree_from(struct task_group *from,
713 tg_visitor down, tg_visitor up, void *data)
714{
715 struct task_group *parent, *child;
716 int ret;
717
718 parent = from;
719
720down:
721 ret = (*down)(parent, data);
722 if (ret)
723 goto out;
724 list_for_each_entry_rcu(child, &parent->children, siblings) {
725 parent = child;
726 goto down;
727
728up:
729 continue;
730 }
731 ret = (*up)(parent, data);
732 if (ret || parent == from)
733 goto out;
734
735 child = parent;
736 parent = parent->parent;
737 if (parent)
738 goto up;
739out:
740 return ret;
741}
742
743int tg_nop(struct task_group *tg, void *data)
744{
745 return 0;
746}
747#endif
748
749static void set_load_weight(struct task_struct *p)
750{
751 int prio = p->static_prio - MAX_RT_PRIO;
752 struct load_weight *load = &p->se.load;
753
754
755
756
757 if (p->policy == SCHED_IDLE) {
758 load->weight = scale_load(WEIGHT_IDLEPRIO);
759 load->inv_weight = WMULT_IDLEPRIO;
760 return;
761 }
762
763 load->weight = scale_load(prio_to_weight[prio]);
764 load->inv_weight = prio_to_wmult[prio];
765}
766
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{
769 update_rq_clock(rq);
770 sched_info_queued(p);
771 p->sched_class->enqueue_task(rq, p, flags);
772}
773
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{
776 update_rq_clock(rq);
777 sched_info_dequeued(p);
778 p->sched_class->dequeue_task(rq, p, flags);
779}
780
781void activate_task(struct rq *rq, struct task_struct *p, int flags)
782{
783 if (task_contributes_to_load(p))
784 rq->nr_uninterruptible--;
785
786 enqueue_task(rq, p, flags);
787}
788
789void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
790{
791 if (task_contributes_to_load(p))
792 rq->nr_uninterruptible++;
793
794 dequeue_task(rq, p, flags);
795}
796
797static void update_rq_clock_task(struct rq *rq, s64 delta)
798{
799
800
801
802
803#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
804 s64 steal = 0, irq_delta = 0;
805#endif
806#ifdef CONFIG_IRQ_TIME_ACCOUNTING
807 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824 if (irq_delta > delta)
825 irq_delta = delta;
826
827 rq->prev_irq_time += irq_delta;
828 delta -= irq_delta;
829#endif
830#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
832 u64 st;
833
834 steal = paravirt_steal_clock(cpu_of(rq));
835 steal -= rq->prev_steal_time_rq;
836
837 if (unlikely(steal > delta))
838 steal = delta;
839
840 st = steal_ticks(steal);
841 steal = st * TICK_NSEC;
842
843 rq->prev_steal_time_rq += steal;
844
845 delta -= steal;
846 }
847#endif
848
849 rq->clock_task += delta;
850
851#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
852 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
853 sched_rt_avg_update(rq, irq_delta + steal);
854#endif
855}
856
857void sched_set_stop_task(int cpu, struct task_struct *stop)
858{
859 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
860 struct task_struct *old_stop = cpu_rq(cpu)->stop;
861
862 if (stop) {
863
864
865
866
867
868
869
870
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
872
873 stop->sched_class = &stop_sched_class;
874 }
875
876 cpu_rq(cpu)->stop = stop;
877
878 if (old_stop) {
879
880
881
882
883 old_stop->sched_class = &rt_sched_class;
884 }
885}
886
887
888
889
890static inline int __normal_prio(struct task_struct *p)
891{
892 return p->static_prio;
893}
894
895
896
897
898
899
900
901
902static inline int normal_prio(struct task_struct *p)
903{
904 int prio;
905
906 if (task_has_rt_policy(p))
907 prio = MAX_RT_PRIO-1 - p->rt_priority;
908 else
909 prio = __normal_prio(p);
910 return prio;
911}
912
913
914
915
916
917
918
919
920static int effective_prio(struct task_struct *p)
921{
922 p->normal_prio = normal_prio(p);
923
924
925
926
927
928 if (!rt_prio(p->prio))
929 return p->normal_prio;
930 return p->prio;
931}
932
933
934
935
936
937
938
939inline int task_curr(const struct task_struct *p)
940{
941 return cpu_curr(task_cpu(p)) == p;
942}
943
944static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 const struct sched_class *prev_class,
946 int oldprio)
947{
948 if (prev_class != p->sched_class) {
949 if (prev_class->switched_from)
950 prev_class->switched_from(rq, p);
951 p->sched_class->switched_to(rq, p);
952 } else if (oldprio != p->prio)
953 p->sched_class->prio_changed(rq, p, oldprio);
954}
955
956void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
957{
958 const struct sched_class *class;
959
960 if (p->sched_class == rq->curr->sched_class) {
961 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
962 } else {
963 for_each_class(class) {
964 if (class == rq->curr->sched_class)
965 break;
966 if (class == p->sched_class) {
967 resched_task(rq->curr);
968 break;
969 }
970 }
971 }
972
973
974
975
976
977 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
978 rq->skip_clock_update = 1;
979}
980
981#ifdef CONFIG_SMP
982void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
983{
984#ifdef CONFIG_SCHED_DEBUG
985
986
987
988
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
991
992#ifdef CONFIG_LOCKDEP
993
994
995
996
997
998
999
1000
1001
1002
1003 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1004 lockdep_is_held(&task_rq(p)->lock)));
1005#endif
1006#endif
1007
1008 trace_sched_migrate_task(p, new_cpu);
1009
1010 if (task_cpu(p) != new_cpu) {
1011 if (p->sched_class->migrate_task_rq)
1012 p->sched_class->migrate_task_rq(p, new_cpu);
1013 p->se.nr_migrations++;
1014 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1015 }
1016
1017 __set_task_cpu(p, new_cpu);
1018}
1019
1020struct migration_arg {
1021 struct task_struct *task;
1022 int dest_cpu;
1023};
1024
1025static int migration_cpu_stop(void *data);
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
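/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 */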
1043unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1044{
1045 unsigned long flags;
1046 int running, on_rq;
1047 unsigned long ncsw;
1048 struct rq *rq;
1049
1050 for (;;) {
1051
1052
1053
1054
1055
1056
1057 rq = task_rq(p);
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070 while (task_running(rq, p)) {
1071 if (match_state && unlikely(p->state != match_state))
1072 return 0;
1073 cpu_relax();
1074 }
1075
1076
1077
1078
1079
1080
1081 rq = task_rq_lock(p, &flags);
1082 trace_sched_wait_task(p);
1083 running = task_running(rq, p);
1084 on_rq = p->on_rq;
1085 ncsw = 0;
1086 if (!match_state || p->state == match_state)
1087 ncsw = p->nvcsw | LONG_MIN;
1088 task_rq_unlock(rq, p, &flags);
1089
1090
1091
1092
1093 if (unlikely(!ncsw))
1094 break;
1095
1096
1097
1098
1099
1100
1101
1102 if (unlikely(running)) {
1103 cpu_relax();
1104 continue;
1105 }
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116 if (unlikely(on_rq)) {
1117 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1118
1119 set_current_state(TASK_UNINTERRUPTIBLE);
1120 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1121 continue;
1122 }
1123
1124
1125
1126
1127
1128
1129 break;
1130 }
1131
1132 return ncsw;
1133}
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
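/*
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay (to get signals handled).
 */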
1148void kick_process(struct task_struct *p)
1149{
1150 int cpu;
1151
1152 preempt_disable();
1153 cpu = task_cpu(p);
1154 if ((cpu != smp_processor_id()) && task_curr(p))
1155 smp_send_reschedule(cpu);
1156 preempt_enable();
1157}
1158EXPORT_SYMBOL_GPL(kick_process);
1159#endif
1160
1161#ifdef CONFIG_SMP
1162
1163
1164
1165static int select_fallback_rq(int cpu, struct task_struct *p)
1166{
1167 int nid = cpu_to_node(cpu);
1168 const struct cpumask *nodemask = NULL;
1169 enum { cpuset, possible, fail } state = cpuset;
1170 int dest_cpu;
1171
1172
1173
1174
1175
1176
1177 if (nid != -1) {
1178 nodemask = cpumask_of_node(nid);
1179
1180
1181 for_each_cpu(dest_cpu, nodemask) {
1182 if (!cpu_online(dest_cpu))
1183 continue;
1184 if (!cpu_active(dest_cpu))
1185 continue;
1186 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1187 return dest_cpu;
1188 }
1189 }
1190
1191 for (;;) {
1192
1193 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1194 if (!cpu_online(dest_cpu))
1195 continue;
1196 if (!cpu_active(dest_cpu))
1197 continue;
1198 goto out;
1199 }
1200
1201 switch (state) {
1202 case cpuset:
1203
1204 cpuset_cpus_allowed_fallback(p);
1205 state = possible;
1206 break;
1207
1208 case possible:
1209 do_set_cpus_allowed(p, cpu_possible_mask);
1210 state = fail;
1211 break;
1212
1213 case fail:
1214 BUG();
1215 break;
1216 }
1217 }
1218
1219out:
1220 if (state != cpuset) {
1221
1222
1223
1224
1225
1226 if (p->mm && printk_ratelimit()) {
1227 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1228 task_pid_nr(p), p->comm, cpu);
1229 }
1230 }
1231
1232 return dest_cpu;
1233}
1234
1235
1236
1237
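/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */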
1238static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1240{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1254 !cpu_online(cpu)))
1255 cpu = select_fallback_rq(task_cpu(p), p);
1256
1257 return cpu;
1258}
1259
1260static void update_avg(u64 *avg, u64 sample)
1261{
1262 s64 diff = sample - *avg;
1263 *avg += diff >> 3;
1264}
1265#endif
1266
1267static void
1268ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1269{
1270#ifdef CONFIG_SCHEDSTATS
1271 struct rq *rq = this_rq();
1272
1273#ifdef CONFIG_SMP
1274 int this_cpu = smp_processor_id();
1275
1276 if (cpu == this_cpu) {
1277 schedstat_inc(rq, ttwu_local);
1278 schedstat_inc(p, se.statistics.nr_wakeups_local);
1279 } else {
1280 struct sched_domain *sd;
1281
1282 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1283 rcu_read_lock();
1284 for_each_domain(this_cpu, sd) {
1285 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1286 schedstat_inc(sd, ttwu_wake_remote);
1287 break;
1288 }
1289 }
1290 rcu_read_unlock();
1291 }
1292
1293 if (wake_flags & WF_MIGRATED)
1294 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1295
1296#endif
1297
1298 schedstat_inc(rq, ttwu_count);
1299 schedstat_inc(p, se.statistics.nr_wakeups);
1300
1301 if (wake_flags & WF_SYNC)
1302 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1303
1304#endif
1305}
1306
1307static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1308{
1309 activate_task(rq, p, en_flags);
1310 p->on_rq = 1;
1311
1312
1313 if (p->flags & PF_WQ_WORKER)
1314 wq_worker_waking_up(p, cpu_of(rq));
1315}
1316
1317
1318
1319
1320static void
1321ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1322{
1323 check_preempt_curr(rq, p, wake_flags);
1324 trace_sched_wakeup(p, true);
1325
1326 p->state = TASK_RUNNING;
1327#ifdef CONFIG_SMP
1328 if (p->sched_class->task_woken)
1329 p->sched_class->task_woken(rq, p);
1330
1331 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost;
1334
1335 if (delta > max)
1336 rq->avg_idle = max;
1337 else
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0;
1340 }
1341#endif
1342}
1343
1344static void
1345ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1346{
1347#ifdef CONFIG_SMP
1348 if (p->sched_contributes_to_load)
1349 rq->nr_uninterruptible--;
1350#endif
1351
1352 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1353 ttwu_do_wakeup(rq, p, wake_flags);
1354}
1355
1356
1357
1358
1359
1360
1361
1362static int ttwu_remote(struct task_struct *p, int wake_flags)
1363{
1364 struct rq *rq;
1365 int ret = 0;
1366
1367 rq = __task_rq_lock(p);
1368 if (p->on_rq) {
1369
1370 update_rq_clock(rq);
1371 ttwu_do_wakeup(rq, p, wake_flags);
1372 ret = 1;
1373 }
1374 __task_rq_unlock(rq);
1375
1376 return ret;
1377}
1378
1379#ifdef CONFIG_SMP
1380static void sched_ttwu_pending(void)
1381{
1382 struct rq *rq = this_rq();
1383 struct llist_node *llist = llist_del_all(&rq->wake_list);
1384 struct task_struct *p;
1385
1386 raw_spin_lock(&rq->lock);
1387
1388 while (llist) {
1389 p = llist_entry(llist, struct task_struct, wake_entry);
1390 llist = llist_next(llist);
1391 ttwu_do_activate(rq, p, 0);
1392 }
1393
1394 raw_spin_unlock(&rq->lock);
1395}
1396
1397void scheduler_ipi(void)
1398{
1399 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick())
1402 return;
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417 irq_enter();
1418 tick_nohz_full_check();
1419 sched_ttwu_pending();
1420
1421
1422
1423
1424 if (unlikely(got_nohz_idle_kick())) {
1425 this_rq()->idle_balance = 1;
1426 raise_softirq_irqoff(SCHED_SOFTIRQ);
1427 }
1428 irq_exit();
1429}
1430
1431static void ttwu_queue_remote(struct task_struct *p, int cpu)
1432{
1433 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1434 smp_send_reschedule(cpu);
1435}
1436
1437bool cpus_share_cache(int this_cpu, int that_cpu)
1438{
1439 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1440}
1441#endif
1442
1443static void ttwu_queue(struct task_struct *p, int cpu)
1444{
1445 struct rq *rq = cpu_rq(cpu);
1446
1447#if defined(CONFIG_SMP)
1448 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1449 sched_clock_cpu(cpu);
1450 ttwu_queue_remote(p, cpu);
1451 return;
1452 }
1453#endif
1454
1455 raw_spin_lock(&rq->lock);
1456 ttwu_do_activate(rq, p, 0);
1457 raw_spin_unlock(&rq->lock);
1458}
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
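/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there.  The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running
 * or its state did not match @state.
 */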
1475static int
1476try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1477{
1478 unsigned long flags;
1479 int cpu, success = 0;
1480
1481
1482
1483
1484
1485
1486
1487 smp_mb__before_spinlock();
1488 raw_spin_lock_irqsave(&p->pi_lock, flags);
1489 if (!(p->state & state))
1490 goto out;
1491
1492 success = 1;
1493 cpu = task_cpu(p);
1494
1495 if (p->on_rq && ttwu_remote(p, wake_flags))
1496 goto stat;
1497
1498#ifdef CONFIG_SMP
1499
1500
1501
1502
1503 while (p->on_cpu)
1504 cpu_relax();
1505
1506
1507
1508 smp_rmb();
1509
1510 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1511 p->state = TASK_WAKING;
1512
1513 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p);
1515
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu);
1520 }
1521#endif
1522
1523 ttwu_queue(p, cpu);
1524stat:
1525 ttwu_stat(p, cpu, wake_flags);
1526out:
1527 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1528
1529 return success;
1530}
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540static void try_to_wake_up_local(struct task_struct *p)
1541{
1542 struct rq *rq = task_rq(p);
1543
1544 if (WARN_ON_ONCE(rq != this_rq()) ||
1545 WARN_ON_ONCE(p == current))
1546 return;
1547
1548 lockdep_assert_held(&rq->lock);
1549
1550 if (!raw_spin_trylock(&p->pi_lock)) {
1551 raw_spin_unlock(&rq->lock);
1552 raw_spin_lock(&p->pi_lock);
1553 raw_spin_lock(&rq->lock);
1554 }
1555
1556 if (!(p->state & TASK_NORMAL))
1557 goto out;
1558
1559 if (!p->on_rq)
1560 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1561
1562 ttwu_do_wakeup(rq, p, 0);
1563 ttwu_stat(p, smp_processor_id(), 0);
1564out:
1565 raw_spin_unlock(&p->pi_lock);
1566}
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
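/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of
 * runnable processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 */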
1580int wake_up_process(struct task_struct *p)
1581{
1582 WARN_ON(task_is_stopped_or_traced(p));
1583 return try_to_wake_up(p, TASK_NORMAL, 0);
1584}
1585EXPORT_SYMBOL(wake_up_process);
1586
1587int wake_up_state(struct task_struct *p, unsigned int state)
1588{
1589 return try_to_wake_up(p, state, 0);
1590}
1591
1592
1593
1594
1595
1596
1597
1598static void __sched_fork(struct task_struct *p)
1599{
1600 p->on_rq = 0;
1601
1602 p->se.on_rq = 0;
1603 p->se.exec_start = 0;
1604 p->se.sum_exec_runtime = 0;
1605 p->se.prev_sum_exec_runtime = 0;
1606 p->se.nr_migrations = 0;
1607 p->se.vruntime = 0;
1608 INIT_LIST_HEAD(&p->se.group_node);
1609
1610#ifdef CONFIG_SCHEDSTATS
1611 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1612#endif
1613
1614 INIT_LIST_HEAD(&p->rt.run_list);
1615
1616#ifdef CONFIG_PREEMPT_NOTIFIERS
1617 INIT_HLIST_HEAD(&p->preempt_notifiers);
1618#endif
1619
1620#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies;
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0;
1625 }
1626
1627 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work;
1632#endif
1633}
1634
1635#ifdef CONFIG_NUMA_BALANCING
1636#ifdef CONFIG_SCHED_DEBUG
1637void set_numabalancing_state(bool enabled)
1638{
1639 if (enabled)
1640 sched_feat_set("NUMA");
1641 else
1642 sched_feat_set("NO_NUMA");
1643}
1644#else
1645__read_mostly bool numabalancing_enabled;
1646
1647void set_numabalancing_state(bool enabled)
1648{
1649 numabalancing_enabled = enabled;
1650}
1651#endif
1652#endif
1653
1654
1655
1656
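/*
 * fork()/clone()-time setup:
 */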
1657void sched_fork(struct task_struct *p)
1658{
1659 unsigned long flags;
1660 int cpu = get_cpu();
1661
1662 __sched_fork(p);
1663
1664
1665
1666
1667
1668 p->state = TASK_RUNNING;
1669
1670
1671
1672
1673 p->prio = current->normal_prio;
1674
1675
1676
1677
1678 if (unlikely(p->sched_reset_on_fork)) {
1679 if (task_has_rt_policy(p)) {
1680 p->policy = SCHED_NORMAL;
1681 p->static_prio = NICE_TO_PRIO(0);
1682 p->rt_priority = 0;
1683 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1684 p->static_prio = NICE_TO_PRIO(0);
1685
1686 p->prio = p->normal_prio = __normal_prio(p);
1687 set_load_weight(p);
1688
1689
1690
1691
1692
1693 p->sched_reset_on_fork = 0;
1694 }
1695
1696 if (!rt_prio(p->prio))
1697 p->sched_class = &fair_sched_class;
1698
1699 if (p->sched_class->task_fork)
1700 p->sched_class->task_fork(p);
1701
1702
1703
1704
1705
1706
1707
1708
1709 raw_spin_lock_irqsave(&p->pi_lock, flags);
1710 set_task_cpu(p, cpu);
1711 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1712
1713#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1714 if (likely(sched_info_on()))
1715 memset(&p->sched_info, 0, sizeof(p->sched_info));
1716#endif
1717#if defined(CONFIG_SMP)
1718 p->on_cpu = 0;
1719#endif
1720#ifdef CONFIG_PREEMPT_COUNT
1721
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif
1727
1728 put_cpu();
1729}
1730
1731
1732
1733
1734
1735
1736
1737
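/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */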
1738void wake_up_new_task(struct task_struct *p)
1739{
1740 unsigned long flags;
1741 struct rq *rq;
1742
1743 raw_spin_lock_irqsave(&p->pi_lock, flags);
1744#ifdef CONFIG_SMP
1745
1746
1747
1748
1749
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1751#endif
1752
1753
1754 init_task_runnable_average(p);
1755 rq = __task_rq_lock(p);
1756 activate_task(rq, p, 0);
1757 p->on_rq = 1;
1758 trace_sched_wakeup_new(p, true);
1759 check_preempt_curr(rq, p, WF_FORK);
1760#ifdef CONFIG_SMP
1761 if (p->sched_class->task_woken)
1762 p->sched_class->task_woken(rq, p);
1763#endif
1764 task_rq_unlock(rq, p, &flags);
1765}
1766
1767#ifdef CONFIG_PREEMPT_NOTIFIERS
1768
1769
1770
1771
1772
1773void preempt_notifier_register(struct preempt_notifier *notifier)
1774{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
1776}
1777EXPORT_SYMBOL_GPL(preempt_notifier_register);
1778
1779
1780
1781
1782
1783
1784
1785void preempt_notifier_unregister(struct preempt_notifier *notifier)
1786{
	hlist_del(&notifier->link);
1788}
1789EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1790
1791static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1792{
1793 struct preempt_notifier *notifier;
1794
1795 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1796 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1797}
1798
1799static void
1800fire_sched_out_preempt_notifiers(struct task_struct *curr,
1801 struct task_struct *next)
1802{
1803 struct preempt_notifier *notifier;
1804
1805 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1806 notifier->ops->sched_out(notifier, next);
1807}
1808
1809#else
1810
1811static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1812{
1813}
1814
1815static void
1816fire_sched_out_preempt_notifiers(struct task_struct *curr,
1817 struct task_struct *next)
1818{
1819}
1820
1821#endif
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836static inline void
1837prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next)
1839{
1840 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next);
1842 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next);
1845 prepare_arch_switch(next);
1846}
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
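/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 */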
1863static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1864 __releases(rq->lock)
1865{
1866 struct mm_struct *mm = rq->prev_mm;
1867 long prev_state;
1868
1869 rq->prev_mm = NULL;
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882 prev_state = prev->state;
1883 vtime_task_switch(prev);
1884 finish_arch_switch(prev);
1885 perf_event_task_sched_in(prev, current);
1886 finish_lock_switch(rq, prev);
1887 finish_arch_post_lock_switch();
1888
1889 fire_sched_in_preempt_notifiers(current);
1890 if (mm)
1891 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) {
1893
1894
1895
1896
1897 kprobe_flush_task(prev);
1898 put_task_struct(prev);
1899 }
1900
1901 tick_nohz_task_switch(current);
1902}
1903
1904#ifdef CONFIG_SMP
1905
1906
1907static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1908{
1909 if (prev->sched_class->pre_schedule)
1910 prev->sched_class->pre_schedule(rq, prev);
1911}
1912
1913
1914static inline void post_schedule(struct rq *rq)
1915{
1916 if (rq->post_schedule) {
1917 unsigned long flags;
1918
1919 raw_spin_lock_irqsave(&rq->lock, flags);
1920 if (rq->curr->sched_class->post_schedule)
1921 rq->curr->sched_class->post_schedule(rq);
1922 raw_spin_unlock_irqrestore(&rq->lock, flags);
1923
1924 rq->post_schedule = 0;
1925 }
1926}
1927
1928#else
1929
1930static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1931{
1932}
1933
1934static inline void post_schedule(struct rq *rq)
1935{
1936}
1937
1938#endif
1939
1940
1941
1942
1943
1944asmlinkage void schedule_tail(struct task_struct *prev)
1945 __releases(rq->lock)
1946{
1947 struct rq *rq = this_rq();
1948
1949 finish_task_switch(rq, prev);
1950
1951
1952
1953
1954
1955 post_schedule(rq);
1956
1957#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1958
1959 preempt_enable();
1960#endif
1961 if (current->set_child_tid)
1962 put_user(task_pid_vnr(current), current->set_child_tid);
1963}
1964
1965
1966
1967
1968
1969static inline void
1970context_switch(struct rq *rq, struct task_struct *prev,
1971 struct task_struct *next)
1972{
1973 struct mm_struct *mm, *oldmm;
1974
1975 prepare_task_switch(rq, prev, next);
1976
1977 mm = next->mm;
1978 oldmm = prev->active_mm;
1979
1980
1981
1982
1983
1984 arch_start_context_switch(prev);
1985
1986 if (!mm) {
1987 next->active_mm = oldmm;
1988 atomic_inc(&oldmm->mm_count);
1989 enter_lazy_tlb(oldmm, next);
1990 } else
1991 switch_mm(oldmm, mm, next);
1992
1993 if (!prev->mm) {
1994 prev->active_mm = NULL;
1995 rq->prev_mm = oldmm;
1996 }
1997
1998
1999
2000
2001
2002
2003#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2004 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2005#endif
2006
2007 context_tracking_task_switch(prev, next);
2008
2009 switch_to(prev, next, prev);
2010
2011 barrier();
2012
2013
2014
2015
2016
2017 finish_task_switch(this_rq(), prev);
2018}
2019
2020
2021
2022
2023
2024
2025
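/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */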
2026unsigned long nr_running(void)
2027{
2028 unsigned long i, sum = 0;
2029
2030 for_each_online_cpu(i)
2031 sum += cpu_rq(i)->nr_running;
2032
2033 return sum;
2034}
2035
2036unsigned long long nr_context_switches(void)
2037{
2038 int i;
2039 unsigned long long sum = 0;
2040
2041 for_each_possible_cpu(i)
2042 sum += cpu_rq(i)->nr_switches;
2043
2044 return sum;
2045}
2046
2047unsigned long nr_iowait(void)
2048{
2049 unsigned long i, sum = 0;
2050
2051 for_each_possible_cpu(i)
2052 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2053
2054 return sum;
2055}
2056
2057unsigned long nr_iowait_cpu(int cpu)
2058{
2059 struct rq *this = cpu_rq(cpu);
2060 return atomic_read(&this->nr_iowait);
2061}
2062
2063#ifdef CONFIG_SMP
2064
2065
2066
2067
2068
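/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */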
2069void sched_exec(void)
2070{
2071 struct task_struct *p = current;
2072 unsigned long flags;
2073 int dest_cpu;
2074
2075 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id())
2078 goto unlock;
2079
2080 if (likely(cpu_active(dest_cpu))) {
2081 struct migration_arg arg = { p, dest_cpu };
2082
2083 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2084 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2085 return;
2086 }
2087unlock:
2088 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2089}
2090
2091#endif
2092
2093DEFINE_PER_CPU(struct kernel_stat, kstat);
2094DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2095
2096EXPORT_PER_CPU_SYMBOL(kstat);
2097EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2098
2099
2100
2101
2102
2103
2104
2105static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2106{
2107 u64 ns = 0;
2108
2109 if (task_current(rq, p)) {
2110 update_rq_clock(rq);
2111 ns = rq_clock_task(rq) - p->se.exec_start;
2112 if ((s64)ns < 0)
2113 ns = 0;
2114 }
2115
2116 return ns;
2117}
2118
2119unsigned long long task_delta_exec(struct task_struct *p)
2120{
2121 unsigned long flags;
2122 struct rq *rq;
2123 u64 ns = 0;
2124
2125 rq = task_rq_lock(p, &flags);
2126 ns = do_task_delta_exec(p, rq);
2127 task_rq_unlock(rq, p, &flags);
2128
2129 return ns;
2130}
2131
2132
2133
2134
2135
2136
2137unsigned long long task_sched_runtime(struct task_struct *p)
2138{
2139 unsigned long flags;
2140 struct rq *rq;
2141 u64 ns = 0;
2142
2143 rq = task_rq_lock(p, &flags);
2144 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2145 task_rq_unlock(rq, p, &flags);
2146
2147 return ns;
2148}
2149
2150
2151
2152
2153
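/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */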
2154void scheduler_tick(void)
2155{
2156 int cpu = smp_processor_id();
2157 struct rq *rq = cpu_rq(cpu);
2158 struct task_struct *curr = rq->curr;
2159
2160 sched_clock_tick();
2161
2162 raw_spin_lock(&rq->lock);
2163 update_rq_clock(rq);
2164 curr->sched_class->task_tick(rq, curr, 0);
2165 update_cpu_load_active(rq);
2166 raw_spin_unlock(&rq->lock);
2167
2168 perf_event_task_tick();
2169
2170#ifdef CONFIG_SMP
2171 rq->idle_balance = idle_cpu(cpu);
2172 trigger_load_balance(rq, cpu);
2173#endif
2174 rq_last_tick_reset(rq);
2175}
2176
2177#ifdef CONFIG_NO_HZ_FULL
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191u64 scheduler_tick_max_deferment(void)
2192{
2193 struct rq *rq = this_rq();
2194 unsigned long next, now = ACCESS_ONCE(jiffies);
2195
2196 next = rq->last_sched_tick + HZ;
2197
2198 if (time_before_eq(next, now))
2199 return 0;
2200
2201 return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
2202}
2203#endif
2204
2205notrace unsigned long get_parent_ip(unsigned long addr)
2206{
2207 if (in_lock_functions(addr)) {
2208 addr = CALLER_ADDR2;
2209 if (in_lock_functions(addr))
2210 addr = CALLER_ADDR3;
2211 }
2212 return addr;
2213}
2214
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER))
2217
2218void __kprobes add_preempt_count(int val)
2219{
2220#ifdef CONFIG_DEBUG_PREEMPT
2221
2222
2223
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return;
2226#endif
2227 preempt_count() += val;
2228#ifdef CONFIG_DEBUG_PREEMPT
2229
2230
2231
2232 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2233 PREEMPT_MASK - 10);
2234#endif
2235 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237}
2238EXPORT_SYMBOL(add_preempt_count);
2239
2240void __kprobes sub_preempt_count(int val)
2241{
2242#ifdef CONFIG_DEBUG_PREEMPT
2243
2244
2245
2246 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2247 return;
2248
2249
2250
2251 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2252 !(preempt_count() & PREEMPT_MASK)))
2253 return;
2254#endif
2255
2256 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val;
2259}
2260EXPORT_SYMBOL(sub_preempt_count);
2261
2262#endif
2263
2264
2265
2266
2267static noinline void __schedule_bug(struct task_struct *prev)
2268{
2269 if (oops_in_progress)
2270 return;
2271
2272 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2273 prev->comm, prev->pid, preempt_count());
2274
2275 debug_show_held_locks(prev);
2276 print_modules();
2277 if (irqs_disabled())
2278 print_irqtrace_events(prev);
2279 dump_stack();
2280 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2281}
2282
2283
2284
2285
2286static inline void schedule_debug(struct task_struct *prev)
2287{
2288
2289
2290
2291
2292
2293 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
2294 __schedule_bug(prev);
2295 rcu_sleep_check();
2296
2297 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2298
2299 schedstat_inc(this_rq(), sched_count);
2300}
2301
2302static void put_prev_task(struct rq *rq, struct task_struct *prev)
2303{
2304 if (prev->on_rq || rq->skip_clock_update < 0)
2305 update_rq_clock(rq);
2306 prev->sched_class->put_prev_task(rq, prev);
2307}
2308
2309
2310
2311
2312static inline struct task_struct *
2313pick_next_task(struct rq *rq)
2314{
2315 const struct sched_class *class;
2316 struct task_struct *p;
2317
2318
2319
2320
2321
2322 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2323 p = fair_sched_class.pick_next_task(rq);
2324 if (likely(p))
2325 return p;
2326 }
2327
2328 for_each_class(class) {
2329 p = class->pick_next_task(rq);
2330 if (p)
2331 return p;
2332 }
2333
2334 BUG();
2335}
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
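/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. The TIF_NEED_RESCHED flag checked on interrupt and userspace return
 *      paths, which causes preemption of the running task.
 *
 *   3. Wakeups don't really enter schedule() themselves: they put a task on
 *      the runqueue and, if it should preempt the current task, set
 *      TIF_NEED_RESCHED so schedule() is called at the nearest opportunity.
 */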
2374static void __sched __schedule(void)
2375{
2376 struct task_struct *prev, *next;
2377 unsigned long *switch_count;
2378 struct rq *rq;
2379 int cpu;
2380
2381need_resched:
2382 preempt_disable();
2383 cpu = smp_processor_id();
2384 rq = cpu_rq(cpu);
2385 rcu_note_context_switch(cpu);
2386 prev = rq->curr;
2387
2388 schedule_debug(prev);
2389
2390 if (sched_feat(HRTICK))
2391 hrtick_clear(rq);
2392
2393
2394
2395
2396
2397
2398 smp_mb__before_spinlock();
2399 raw_spin_lock_irq(&rq->lock);
2400
2401 switch_count = &prev->nivcsw;
2402 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2403 if (unlikely(signal_pending_state(prev->state, prev))) {
2404 prev->state = TASK_RUNNING;
2405 } else {
2406 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2407 prev->on_rq = 0;
2408
2409
2410
2411
2412
2413
2414 if (prev->flags & PF_WQ_WORKER) {
2415 struct task_struct *to_wakeup;
2416
2417 to_wakeup = wq_worker_sleeping(prev, cpu);
2418 if (to_wakeup)
2419 try_to_wake_up_local(to_wakeup);
2420 }
2421 }
2422 switch_count = &prev->nvcsw;
2423 }
2424
2425 pre_schedule(rq, prev);
2426
2427 if (unlikely(!rq->nr_running))
2428 idle_balance(cpu, rq);
2429
2430 put_prev_task(rq, prev);
2431 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev);
2433 rq->skip_clock_update = 0;
2434
2435 if (likely(prev != next)) {
2436 rq->nr_switches++;
2437 rq->curr = next;
2438 ++*switch_count;
2439
2440 context_switch(rq, prev, next);
2441
2442
2443
2444
2445
2446
2447 cpu = smp_processor_id();
2448 rq = cpu_rq(cpu);
2449 } else
2450 raw_spin_unlock_irq(&rq->lock);
2451
2452 post_schedule(rq);
2453
2454 sched_preempt_enable_no_resched();
2455 if (need_resched())
2456 goto need_resched;
2457}
2458
2459static inline void sched_submit_work(struct task_struct *tsk)
2460{
2461 if (!tsk->state || tsk_is_pi_blocked(tsk))
2462 return;
2463
2464
2465
2466
2467 if (blk_needs_flush_plug(tsk))
2468 blk_schedule_flush_plug(tsk);
2469}
2470
2471asmlinkage void __sched schedule(void)
2472{
2473 struct task_struct *tsk = current;
2474
2475 sched_submit_work(tsk);
2476 __schedule();
2477}
2478EXPORT_SYMBOL(schedule);
2479
2480#ifdef CONFIG_CONTEXT_TRACKING
2481asmlinkage void __sched schedule_user(void)
2482{
2483
2484
2485
2486
2487
2488
2489 user_exit();
2490 schedule();
2491 user_enter();
2492}
2493#endif
2494
2495
2496
2497
2498
2499
2500void __sched schedule_preempt_disabled(void)
2501{
2502 sched_preempt_enable_no_resched();
2503 schedule();
2504 preempt_disable();
2505}
2506
2507#ifdef CONFIG_PREEMPT
2508
2509
2510
2511
2512
2513asmlinkage void __sched notrace preempt_schedule(void)
2514{
2515
2516
2517
2518
2519 if (likely(!preemptible()))
2520 return;
2521
2522 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE);
2524 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE);
2526
2527
2528
2529
2530
2531 barrier();
2532 } while (need_resched());
2533}
2534EXPORT_SYMBOL(preempt_schedule);
2535
2536
2537
2538
2539
2540
2541
2542asmlinkage void __sched preempt_schedule_irq(void)
2543{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state;
2546
2547
2548 BUG_ON(ti->preempt_count || !irqs_disabled());
2549
2550 prev_state = exception_enter();
2551
2552 do {
2553 add_preempt_count(PREEMPT_ACTIVE);
2554 local_irq_enable();
2555 __schedule();
2556 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE);
2558
2559
2560
2561
2562
2563 barrier();
2564 } while (need_resched());
2565
2566 exception_exit(prev_state);
2567}
2568
2569#endif
2570
2571int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2572 void *key)
2573{
2574 return try_to_wake_up(curr->private, mode, wake_flags);
2575}
2576EXPORT_SYMBOL(default_wake_function);
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
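/*
 * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */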
2587static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2588 int nr_exclusive, int wake_flags, void *key)
2589{
2590 wait_queue_t *curr, *next;
2591
2592 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2593 unsigned flags = curr->flags;
2594
2595 if (curr->func(curr, mode, wake_flags, key) &&
2596 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2597 break;
2598 }
2599}
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611void __wake_up(wait_queue_head_t *q, unsigned int mode,
2612 int nr_exclusive, void *key)
2613{
2614 unsigned long flags;
2615
2616 spin_lock_irqsave(&q->lock, flags);
2617 __wake_up_common(q, mode, nr_exclusive, 0, key);
2618 spin_unlock_irqrestore(&q->lock, flags);
2619}
2620EXPORT_SYMBOL(__wake_up);
2621
2622
2623
2624
2625void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2626{
2627 __wake_up_common(q, mode, nr, 0, NULL);
2628}
2629EXPORT_SYMBOL_GPL(__wake_up_locked);
2630
2631void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2632{
2633 __wake_up_common(q, mode, 1, 0, key);
2634}
2635EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2655 int nr_exclusive, void *key)
2656{
2657 unsigned long flags;
2658 int wake_flags = WF_SYNC;
2659
2660 if (unlikely(!q))
2661 return;
2662
2663 if (unlikely(nr_exclusive != 1))
2664 wake_flags = 0;
2665
2666 spin_lock_irqsave(&q->lock, flags);
2667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2668 spin_unlock_irqrestore(&q->lock, flags);
2669}
2670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2671
2672
2673
2674
2675void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2676{
2677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2678}
2679EXPORT_SYMBOL_GPL(__wake_up_sync);
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
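/**
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion.  Threads will
 * be awakened in the same order in which they were queued.
 *
 * See also complete_all(), wait_for_completion() and related routines.
 */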
2693void complete(struct completion *x)
2694{
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&x->wait.lock, flags);
2698 x->done++;
2699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2700 spin_unlock_irqrestore(&x->wait.lock, flags);
2701}
2702EXPORT_SYMBOL(complete);
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713void complete_all(struct completion *x)
2714{
2715 unsigned long flags;
2716
2717 spin_lock_irqsave(&x->wait.lock, flags);
2718 x->done += UINT_MAX/2;
2719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2720 spin_unlock_irqrestore(&x->wait.lock, flags);
2721}
2722EXPORT_SYMBOL(complete_all);
2723
2724static inline long __sched
2725do_wait_for_common(struct completion *x,
2726 long (*action)(long), long timeout, int state)
2727{
2728 if (!x->done) {
2729 DECLARE_WAITQUEUE(wait, current);
2730
2731 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2732 do {
2733 if (signal_pending_state(state, current)) {
2734 timeout = -ERESTARTSYS;
2735 break;
2736 }
2737 __set_current_state(state);
2738 spin_unlock_irq(&x->wait.lock);
2739 timeout = action(timeout);
2740 spin_lock_irq(&x->wait.lock);
2741 } while (!x->done && timeout);
2742 __remove_wait_queue(&x->wait, &wait);
2743 if (!x->done)
2744 return timeout;
2745 }
2746 x->done--;
2747 return timeout ?: 1;
2748}
2749
2750static inline long __sched
2751__wait_for_common(struct completion *x,
2752 long (*action)(long), long timeout, int state)
2753{
2754 might_sleep();
2755
2756 spin_lock_irq(&x->wait.lock);
2757 timeout = do_wait_for_common(x, action, timeout, state);
2758 spin_unlock_irq(&x->wait.lock);
2759 return timeout;
2760}
2761
2762static long __sched
2763wait_for_common(struct completion *x, long timeout, int state)
2764{
2765 return __wait_for_common(x, schedule_timeout, timeout, state);
2766}
2767
2768static long __sched
2769wait_for_common_io(struct completion *x, long timeout, int state)
2770{
2771 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2772}
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
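/**
 * wait_for_completion: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task.  It is NOT
 * interruptible and there is no timeout.
 */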
2784void __sched wait_for_completion(struct completion *x)
2785{
2786 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2787}
2788EXPORT_SYMBOL(wait_for_completion);
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2804{
2805 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2806}
2807EXPORT_SYMBOL(wait_for_completion_timeout);
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817void __sched wait_for_completion_io(struct completion *x)
2818{
2819 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2820}
2821EXPORT_SYMBOL(wait_for_completion_io);
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2837{
2838 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2839}
2840EXPORT_SYMBOL(wait_for_completion_io_timeout);
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851int __sched wait_for_completion_interruptible(struct completion *x)
2852{
2853 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2854 if (t == -ERESTARTSYS)
2855 return t;
2856 return 0;
2857}
2858EXPORT_SYMBOL(wait_for_completion_interruptible);
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x,
2873 unsigned long timeout)
2874{
2875 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2876}
2877EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888int __sched wait_for_completion_killable(struct completion *x)
2889{
2890 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2891 if (t == -ERESTARTSYS)
2892 return t;
2893 return 0;
2894}
2895EXPORT_SYMBOL(wait_for_completion_killable);
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909long __sched
2910wait_for_completion_killable_timeout(struct completion *x,
2911 unsigned long timeout)
2912{
2913 return wait_for_common(x, timeout, TASK_KILLABLE);
2914}
2915EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929bool try_wait_for_completion(struct completion *x)
2930{
2931 unsigned long flags;
2932 int ret = 1;
2933
2934 spin_lock_irqsave(&x->wait.lock, flags);
2935 if (!x->done)
2936 ret = 0;
2937 else
2938 x->done--;
2939 spin_unlock_irqrestore(&x->wait.lock, flags);
2940 return ret;
2941}
2942EXPORT_SYMBOL(try_wait_for_completion);
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952bool completion_done(struct completion *x)
2953{
2954 unsigned long flags;
2955 int ret = 1;
2956
2957 spin_lock_irqsave(&x->wait.lock, flags);
2958 if (!x->done)
2959 ret = 0;
2960 spin_unlock_irqrestore(&x->wait.lock, flags);
2961 return ret;
2962}
2963EXPORT_SYMBOL(completion_done);
2964
2965static long __sched
2966sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2967{
2968 unsigned long flags;
2969 wait_queue_t wait;
2970
2971 init_waitqueue_entry(&wait, current);
2972
2973 __set_current_state(state);
2974
2975 spin_lock_irqsave(&q->lock, flags);
2976 __add_wait_queue(q, &wait);
2977 spin_unlock(&q->lock);
2978 timeout = schedule_timeout(timeout);
2979 spin_lock_irq(&q->lock);
2980 __remove_wait_queue(q, &wait);
2981 spin_unlock_irqrestore(&q->lock, flags);
2982
2983 return timeout;
2984}
2985
2986void __sched interruptible_sleep_on(wait_queue_head_t *q)
2987{
2988 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2989}
2990EXPORT_SYMBOL(interruptible_sleep_on);
2991
2992long __sched
2993interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2994{
2995 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
2996}
2997EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2998
2999void __sched sleep_on(wait_queue_head_t *q)
3000{
3001 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3002}
3003EXPORT_SYMBOL(sleep_on);
3004
3005long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3006{
3007 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3008}
3009EXPORT_SYMBOL(sleep_on_timeout);
3010
3011#ifdef CONFIG_RT_MUTEXES
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
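/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task.  It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */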
3023void rt_mutex_setprio(struct task_struct *p, int prio)
3024{
3025 int oldprio, on_rq, running;
3026 struct rq *rq;
3027 const struct sched_class *prev_class;
3028
3029 BUG_ON(prio < 0 || prio > MAX_PRIO);
3030
3031 rq = __task_rq_lock(p);
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045 if (unlikely(p == rq->idle)) {
3046 WARN_ON(p != rq->curr);
3047 WARN_ON(p->pi_blocked_on);
3048 goto out_unlock;
3049 }
3050
3051 trace_sched_pi_setprio(p, prio);
3052 oldprio = p->prio;
3053 prev_class = p->sched_class;
3054 on_rq = p->on_rq;
3055 running = task_current(rq, p);
3056 if (on_rq)
3057 dequeue_task(rq, p, 0);
3058 if (running)
3059 p->sched_class->put_prev_task(rq, p);
3060
3061 if (rt_prio(prio))
3062 p->sched_class = &rt_sched_class;
3063 else
3064 p->sched_class = &fair_sched_class;
3065
3066 p->prio = prio;
3067
3068 if (running)
3069 p->sched_class->set_curr_task(rq);
3070 if (on_rq)
3071 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3072
3073 check_class_changed(rq, p, prev_class, oldprio);
3074out_unlock:
3075 __task_rq_unlock(rq);
3076}
3077#endif
3078void set_user_nice(struct task_struct *p, long nice)
3079{
3080 int old_prio, delta, on_rq;
3081 unsigned long flags;
3082 struct rq *rq;
3083
3084 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3085 return;
3086
3087
3088
3089
3090 rq = task_rq_lock(p, &flags);
3091
3092
3093
3094
3095
3096
3097 if (task_has_rt_policy(p)) {
3098 p->static_prio = NICE_TO_PRIO(nice);
3099 goto out_unlock;
3100 }
3101 on_rq = p->on_rq;
3102 if (on_rq)
3103 dequeue_task(rq, p, 0);
3104
3105 p->static_prio = NICE_TO_PRIO(nice);
3106 set_load_weight(p);
3107 old_prio = p->prio;
3108 p->prio = effective_prio(p);
3109 delta = p->prio - old_prio;
3110
3111 if (on_rq) {
3112 enqueue_task(rq, p, 0);
3113
3114
3115
3116
3117 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3118 resched_task(rq->curr);
3119 }
3120out_unlock:
3121 task_rq_unlock(rq, p, &flags);
3122}
3123EXPORT_SYMBOL(set_user_nice);
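
/*
 * Illustrative sketch (not part of this file): a kernel thread lowering
 * its own priority to background-work levels.  The thread function and
 * do_background_work() are hypothetical names.
 *
 *	static int my_background_thread(void *unused)
 *	{
 *		set_user_nice(current, 19);	// weakest CFS weight
 *		while (!kthread_should_stop())
 *			do_background_work();
 *		return 0;
 *	}
 */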
3124
3125
/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */
3130int can_nice(const struct task_struct *p, const int nice)
3131{
	/* convert nice value [19,-20] to rlimit style value [1,40] */
3133 int nice_rlim = 20 - nice;
3134
3135 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3136 capable(CAP_SYS_NICE));
3137}
3138
3139#ifdef __ARCH_WANT_SYS_NICE
3140
/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
3148SYSCALL_DEFINE1(nice, int, increment)
3149{
3150 long nice, retval;
3151
	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
3157 if (increment < -40)
3158 increment = -40;
3159 if (increment > 40)
3160 increment = 40;
3161
3162 nice = TASK_NICE(current) + increment;
3163 if (nice < -20)
3164 nice = -20;
3165 if (nice > 19)
3166 nice = 19;
3167
3168 if (increment < 0 && !can_nice(current, nice))
3169 return -EPERM;
3170
3171 retval = security_task_setnice(current, nice);
3172 if (retval)
3173 return retval;
3174
3175 set_user_nice(current, nice);
3176 return 0;
3177}
3178
3179#endif
3180
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the priority value as seen by users in /proc: RT tasks show
 * up as negative values, all other tasks as 0..39 (their nice value
 * plus 20).
 */
3189int task_prio(const struct task_struct *p)
3190{
3191 return p->prio - MAX_RT_PRIO;
3192}
3193
3194
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 */
3200int task_nice(const struct task_struct *p)
3201{
3202 return TASK_NICE(p);
3203}
3204EXPORT_SYMBOL(task_nice);
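
/*
 * Worked example of the mapping implemented by task_prio()/task_nice(),
 * assuming the usual MAX_RT_PRIO of 100 and NICE_TO_PRIO(n) == 120 + n:
 *
 *	nice -20  ->  p->static_prio 100  ->  task_prio()  0
 *	nice   0  ->  p->static_prio 120  ->  task_prio() 20
 *	nice  19  ->  p->static_prio 139  ->  task_prio() 39
 *
 * An RT task with rt_priority 50 has p->prio == 49, so task_prio()
 * reports -51.
 */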
3205
3206
/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 */
3212int idle_cpu(int cpu)
3213{
3214 struct rq *rq = cpu_rq(cpu);
3215
3216 if (rq->curr != rq->idle)
3217 return 0;
3218
3219 if (rq->nr_running)
3220 return 0;
3221
3222#ifdef CONFIG_SMP
3223 if (!llist_empty(&rq->wake_list))
3224 return 0;
3225#endif
3226
3227 return 1;
3228}
3229
3230
/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 */
3236struct task_struct *idle_task(int cpu)
3237{
3238 return cpu_rq(cpu)->idle;
3239}
3240
3241
/*
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question (0 means current).
 *
 * The caller must hold rcu_read_lock() for the lookup path.
 */
3247static struct task_struct *find_process_by_pid(pid_t pid)
3248{
3249 return pid ? find_task_by_vpid(pid) : current;
3250}
3251
/* Actually do priority change: must hold rq lock. */
3253static void
3254__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3255{
3256 p->policy = policy;
3257 p->rt_priority = prio;
3258 p->normal_prio = normal_prio(p);
3259
3260 p->prio = rt_mutex_getprio(p);
3261 if (rt_prio(p->prio))
3262 p->sched_class = &rt_sched_class;
3263 else
3264 p->sched_class = &fair_sched_class;
3265 set_load_weight(p);
3266}
3267
/*
 * check the target process has a UID that matches the current process's
 */
3271static bool check_same_owner(struct task_struct *p)
3272{
3273 const struct cred *cred = current_cred(), *pcred;
3274 bool match;
3275
3276 rcu_read_lock();
3277 pcred = __task_cred(p);
3278 match = (uid_eq(cred->euid, pcred->euid) ||
3279 uid_eq(cred->euid, pcred->uid));
3280 rcu_read_unlock();
3281 return match;
3282}
3283
3284static int __sched_setscheduler(struct task_struct *p, int policy,
3285 const struct sched_param *param, bool user)
3286{
3287 int retval, oldprio, oldpolicy = -1, on_rq, running;
3288 unsigned long flags;
3289 const struct sched_class *prev_class;
3290 struct rq *rq;
3291 int reset_on_fork;
3292
3293
3294 BUG_ON(in_interrupt());
3295recheck:
3296
3297 if (policy < 0) {
3298 reset_on_fork = p->sched_reset_on_fork;
3299 policy = oldpolicy = p->policy;
3300 } else {
3301 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3302 policy &= ~SCHED_RESET_ON_FORK;
3303
3304 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3305 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3306 policy != SCHED_IDLE)
3307 return -EINVAL;
3308 }
3309
3310
	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
3315 if (param->sched_priority < 0 ||
3316 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3317 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3318 return -EINVAL;
3319 if (rt_policy(policy) != (param->sched_priority != 0))
3320 return -EINVAL;
3321
	/*
	 * Allow unprivileged RT tasks to decrease priority:
	 */
3325 if (user && !capable(CAP_SYS_NICE)) {
3326 if (rt_policy(policy)) {
3327 unsigned long rlim_rtprio =
3328 task_rlimit(p, RLIMIT_RTPRIO);
3329
3330
3331 if (policy != p->policy && !rlim_rtprio)
3332 return -EPERM;
3333
3334
3335 if (param->sched_priority > p->rt_priority &&
3336 param->sched_priority > rlim_rtprio)
3337 return -EPERM;
3338 }
3339
3340
3341
3342
3343
3344 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3345 if (!can_nice(p, TASK_NICE(p)))
3346 return -EPERM;
3347 }
3348
3349
3350 if (!check_same_owner(p))
3351 return -EPERM;
3352
3353
3354 if (p->sched_reset_on_fork && !reset_on_fork)
3355 return -EPERM;
3356 }
3357
3358 if (user) {
3359 retval = security_task_setscheduler(p);
3360 if (retval)
3361 return retval;
3362 }
3363
3364
	/*
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.  It also makes sure no PI-waiters
	 * arrive (or leave) while we are changing the priority of
	 * the task.
	 */
3371 rq = task_rq_lock(p, &flags);
3372
3373
3374
3375
3376 if (p == rq->stop) {
3377 task_rq_unlock(rq, p, &flags);
3378 return -EINVAL;
3379 }
3380
3381
3382
3383
3384 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3385 param->sched_priority == p->rt_priority))) {
3386 task_rq_unlock(rq, p, &flags);
3387 return 0;
3388 }
3389
3390#ifdef CONFIG_RT_GROUP_SCHED
3391 if (user) {
3392
3393
3394
3395
3396 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3397 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3398 !task_group_is_autogroup(task_group(p))) {
3399 task_rq_unlock(rq, p, &flags);
3400 return -EPERM;
3401 }
3402 }
3403#endif
3404
3405
3406 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3407 policy = oldpolicy = -1;
3408 task_rq_unlock(rq, p, &flags);
3409 goto recheck;
3410 }
3411 on_rq = p->on_rq;
3412 running = task_current(rq, p);
3413 if (on_rq)
3414 dequeue_task(rq, p, 0);
3415 if (running)
3416 p->sched_class->put_prev_task(rq, p);
3417
3418 p->sched_reset_on_fork = reset_on_fork;
3419
3420 oldprio = p->prio;
3421 prev_class = p->sched_class;
3422 __setscheduler(rq, p, policy, param->sched_priority);
3423
3424 if (running)
3425 p->sched_class->set_curr_task(rq);
3426 if (on_rq)
3427 enqueue_task(rq, p, 0);
3428
3429 check_class_changed(rq, p, prev_class, oldprio);
3430 task_rq_unlock(rq, p, &flags);
3431
3432 rt_mutex_adjust_pi(p);
3433
3434 return 0;
3435}
3436
3437
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success, a negative error code otherwise.
 *
 * NOTE that the task may be already dead.
 */
3447int sched_setscheduler(struct task_struct *p, int policy,
3448 const struct sched_param *param)
3449{
3450 return __sched_setscheduler(p, policy, param, true);
3451}
3452EXPORT_SYMBOL_GPL(sched_setscheduler);
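
/*
 * Illustrative sketch (not part of this file): promoting a kernel
 * thread to SCHED_FIFO.  "my_task" is a hypothetical task pointer;
 * callers that cannot rely on their own credentials (e.g. stop_machine)
 * use the _nocheck variant below instead.
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(my_task, SCHED_FIFO, &sp))
 *		pr_warn("could not switch task to SCHED_FIFO\n");
 */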
3453
/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission.  For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
3467int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3468 const struct sched_param *param)
3469{
3470 return __sched_setscheduler(p, policy, param, false);
3471}
3472
3473static int
3474do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3475{
3476 struct sched_param lparam;
3477 struct task_struct *p;
3478 int retval;
3479
3480 if (!param || pid < 0)
3481 return -EINVAL;
3482 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3483 return -EFAULT;
3484
3485 rcu_read_lock();
3486 retval = -ESRCH;
3487 p = find_process_by_pid(pid);
3488 if (p != NULL)
3489 retval = sched_setscheduler(p, policy, &lparam);
3490 rcu_read_unlock();
3491
3492 return retval;
3493}
3494
3495
/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
3503SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3504 struct sched_param __user *, param)
3505{
	/* negative values for policy are not valid */
3507 if (policy < 0)
3508 return -EINVAL;
3509
3510 return do_sched_setscheduler(pid, policy, param);
3511}
3512
3513
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
3520SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3521{
3522 return do_sched_setscheduler(pid, -1, param);
3523}
3524
3525
/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative
 * error code.
 */
3532SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3533{
3534 struct task_struct *p;
3535 int retval;
3536
3537 if (pid < 0)
3538 return -EINVAL;
3539
3540 retval = -ESRCH;
3541 rcu_read_lock();
3542 p = find_process_by_pid(pid);
3543 if (p) {
3544 retval = security_task_getscheduler(p);
3545 if (!retval)
3546 retval = p->policy
3547 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3548 }
3549 rcu_read_unlock();
3550 return retval;
3551}
3552
3553
/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, a
 * negative error code.
 */
3561SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3562{
3563 struct sched_param lp;
3564 struct task_struct *p;
3565 int retval;
3566
3567 if (!param || pid < 0)
3568 return -EINVAL;
3569
3570 rcu_read_lock();
3571 p = find_process_by_pid(pid);
3572 retval = -ESRCH;
3573 if (!p)
3574 goto out_unlock;
3575
3576 retval = security_task_getscheduler(p);
3577 if (retval)
3578 goto out_unlock;
3579
3580 lp.sched_priority = p->rt_priority;
3581 rcu_read_unlock();
3582
	/*
	 * This one might sleep, we cannot do it with a spinlock held ...
	 */
3586 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3587
3588 return retval;
3589
3590out_unlock:
3591 rcu_read_unlock();
3592 return retval;
3593}
3594
3595long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3596{
3597 cpumask_var_t cpus_allowed, new_mask;
3598 struct task_struct *p;
3599 int retval;
3600
3601 get_online_cpus();
3602 rcu_read_lock();
3603
3604 p = find_process_by_pid(pid);
3605 if (!p) {
3606 rcu_read_unlock();
3607 put_online_cpus();
3608 return -ESRCH;
3609 }
3610
3611
3612 get_task_struct(p);
3613 rcu_read_unlock();
3614
3615 if (p->flags & PF_NO_SETAFFINITY) {
3616 retval = -EINVAL;
3617 goto out_put_task;
3618 }
3619 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
3620 retval = -ENOMEM;
3621 goto out_put_task;
3622 }
3623 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
3624 retval = -ENOMEM;
3625 goto out_free_cpus_allowed;
3626 }
3627 retval = -EPERM;
3628 if (!check_same_owner(p)) {
3629 rcu_read_lock();
3630 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3631 rcu_read_unlock();
3632 goto out_unlock;
3633 }
3634 rcu_read_unlock();
3635 }
3636
3637 retval = security_task_setscheduler(p);
3638 if (retval)
3639 goto out_unlock;
3640
3641 cpuset_cpus_allowed(p, cpus_allowed);
3642 cpumask_and(new_mask, in_mask, cpus_allowed);
3643again:
3644 retval = set_cpus_allowed_ptr(p, new_mask);
3645
3646 if (!retval) {
3647 cpuset_cpus_allowed(p, cpus_allowed);
3648 if (!cpumask_subset(new_mask, cpus_allowed)) {
3649
3650
3651
3652
3653
3654 cpumask_copy(new_mask, cpus_allowed);
3655 goto again;
3656 }
3657 }
3658out_unlock:
3659 free_cpumask_var(new_mask);
3660out_free_cpus_allowed:
3661 free_cpumask_var(cpus_allowed);
3662out_put_task:
3663 put_task_struct(p);
3664 put_online_cpus();
3665 return retval;
3666}
3667
3668static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3669 struct cpumask *new_mask)
3670{
3671 if (len < cpumask_size())
3672 cpumask_clear(new_mask);
3673 else if (len > cpumask_size())
3674 len = cpumask_size();
3675
3676 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3677}
3678
3679
/**
 * sys_sched_setaffinity - set the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new cpu mask
 *
 * Return: 0 on success. An error code otherwise.
 */
3687SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3688 unsigned long __user *, user_mask_ptr)
3689{
3690 cpumask_var_t new_mask;
3691 int retval;
3692
3693 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3694 return -ENOMEM;
3695
3696 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
3697 if (retval == 0)
3698 retval = sched_setaffinity(pid, new_mask);
3699 free_cpumask_var(new_mask);
3700 return retval;
3701}
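
/*
 * Illustrative sketch (not part of this file): how the syscall above is
 * typically reached from userspace through the glibc wrapper.  The mask
 * size and CPU number are made up for the example.
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(2, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set))	// pid 0 == caller
 *		perror("sched_setaffinity");
 */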
3702
3703long sched_getaffinity(pid_t pid, struct cpumask *mask)
3704{
3705 struct task_struct *p;
3706 unsigned long flags;
3707 int retval;
3708
3709 get_online_cpus();
3710 rcu_read_lock();
3711
3712 retval = -ESRCH;
3713 p = find_process_by_pid(pid);
3714 if (!p)
3715 goto out_unlock;
3716
3717 retval = security_task_getscheduler(p);
3718 if (retval)
3719 goto out_unlock;
3720
3721 raw_spin_lock_irqsave(&p->pi_lock, flags);
3722 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
3723 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3724
3725out_unlock:
3726 rcu_read_unlock();
3727 put_online_cpus();
3728
3729 return retval;
3730}
3731
3732
/**
 * sys_sched_getaffinity - get the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current cpu mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */
3740SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3741 unsigned long __user *, user_mask_ptr)
3742{
3743 int ret;
3744 cpumask_var_t mask;
3745
3746 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
3747 return -EINVAL;
3748 if (len & (sizeof(unsigned long)-1))
3749 return -EINVAL;
3750
3751 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
3752 return -ENOMEM;
3753
3754 ret = sched_getaffinity(pid, mask);
3755 if (ret == 0) {
3756 size_t retlen = min_t(size_t, len, cpumask_size());
3757
3758 if (copy_to_user(user_mask_ptr, mask, retlen))
3759 ret = -EFAULT;
3760 else
3761 ret = retlen;
3762 }
3763 free_cpumask_var(mask);
3764
3765 return ret;
3766}
3767
3768
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */
3776SYSCALL_DEFINE0(sched_yield)
3777{
3778 struct rq *rq = this_rq_lock();
3779
3780 schedstat_inc(rq, yld_count);
3781 current->sched_class->yield_task(rq);
3782
	/*
	 * Since we are going to call schedule() anyway, there's
	 * no need to preempt or enable interrupts:
	 */
3787 __release(rq->lock);
3788 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3789 do_raw_spin_unlock(&rq->lock);
3790 sched_preempt_enable_no_resched();
3791
3792 schedule();
3793
3794 return 0;
3795}
3796
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void)
3803{
3804 add_preempt_count(PREEMPT_ACTIVE);
3805 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE);
3807}
3808
3809int __sched _cond_resched(void)
3810{
3811 if (should_resched()) {
3812 __cond_resched();
3813 return 1;
3814 }
3815 return 0;
3816}
3817EXPORT_SYMBOL(_cond_resched);
3818
3819
/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 */
3827int __cond_resched_lock(spinlock_t *lock)
3828{
3829 int resched = should_resched();
3830 int ret = 0;
3831
3832 lockdep_assert_held(lock);
3833
3834 if (spin_needbreak(lock) || resched) {
3835 spin_unlock(lock);
3836 if (resched)
3837 __cond_resched();
3838 else
3839 cpu_relax();
3840 ret = 1;
3841 spin_lock(lock);
3842 }
3843 return ret;
3844}
3845EXPORT_SYMBOL(__cond_resched_lock);
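
/*
 * Illustrative sketch (not part of this file): breaking up a long scan
 * done under a spinlock with cond_resched_lock(), which wraps
 * __cond_resched_lock().  "my_lock", nr_items and process_item() are
 * hypothetical; process_item() must tolerate the lock being dropped
 * and retaken between iterations.
 *
 *	spin_lock(&my_lock);
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		cond_resched_lock(&my_lock);	// may drop and retake my_lock
 *	}
 *	spin_unlock(&my_lock);
 */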
3846
3847int __sched __cond_resched_softirq(void)
3848{
3849 BUG_ON(!in_softirq());
3850
3851 if (should_resched()) {
3852 local_bh_enable();
3853 __cond_resched();
3854 local_bh_disable();
3855 return 1;
3856 }
3857 return 0;
3858}
3859EXPORT_SYMBOL(__cond_resched_softirq);
3860
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run; if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 * while (!event)
 *	yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
3883void __sched yield(void)
3884{
3885 set_current_state(TASK_RUNNING);
3886 sys_sched_yield();
3887}
3888EXPORT_SYMBOL(yield);
3889
3890
/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */
3905bool __sched yield_to(struct task_struct *p, bool preempt)
3906{
3907 struct task_struct *curr = current;
3908 struct rq *rq, *p_rq;
3909 unsigned long flags;
3910 int yielded = 0;
3911
3912 local_irq_save(flags);
3913 rq = this_rq();
3914
3915again:
3916 p_rq = task_rq(p);
3917
	/*
	 * If we're the only runnable task on the rq and target rq also
	 * has only one task, there's absolutely no point in yielding.
	 */
3921 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
3922 yielded = -ESRCH;
3923 goto out_irq;
3924 }
3925
3926 double_rq_lock(rq, p_rq);
3927 while (task_rq(p) != p_rq) {
3928 double_rq_unlock(rq, p_rq);
3929 goto again;
3930 }
3931
3932 if (!curr->sched_class->yield_to_task)
3933 goto out_unlock;
3934
3935 if (curr->sched_class != p->sched_class)
3936 goto out_unlock;
3937
3938 if (task_running(p_rq, p) || p->state)
3939 goto out_unlock;
3940
3941 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
3942 if (yielded) {
3943 schedstat_inc(rq, yld_count);
3944
3945
3946
3947
3948 if (preempt && rq != p_rq)
3949 resched_task(p_rq->curr);
3950 }
3951
3952out_unlock:
3953 double_rq_unlock(rq, p_rq);
3954out_irq:
3955 local_irq_restore(flags);
3956
3957 if (yielded > 0)
3958 schedule();
3959
3960 return yielded;
3961}
3962EXPORT_SYMBOL_GPL(yield_to);
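
/*
 * Illustrative sketch (not part of this file): the typical caller is a
 * hypervisor (e.g. pause-loop handling) that detected its vcpu task is
 * spinning on a lock held by another vcpu and wants to boost the lock
 * holder.  "holder" is a hypothetical task pointer the caller must keep
 * valid, for instance via get_task_struct().
 *
 *	get_task_struct(holder);
 *	ret = yield_to(holder, true);	// <= 0: nothing was boosted
 *	put_task_struct(holder);
 */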
3963
/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */
3968void __sched io_schedule(void)
3969{
3970 struct rq *rq = raw_rq();
3971
3972 delayacct_blkio_start();
3973 atomic_inc(&rq->nr_iowait);
3974 blk_flush_plug(current);
3975 current->in_iowait = 1;
3976 schedule();
3977 current->in_iowait = 0;
3978 atomic_dec(&rq->nr_iowait);
3979 delayacct_blkio_end();
3980}
3981EXPORT_SYMBOL(io_schedule);
3982
3983long __sched io_schedule_timeout(long timeout)
3984{
3985 struct rq *rq = raw_rq();
3986 long ret;
3987
3988 delayacct_blkio_start();
3989 atomic_inc(&rq->nr_iowait);
3990 blk_flush_plug(current);
3991 current->in_iowait = 1;
3992 ret = schedule_timeout(timeout);
3993 current->in_iowait = 0;
3994 atomic_dec(&rq->nr_iowait);
3995 delayacct_blkio_end();
3996 return ret;
3997}
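
/*
 * Illustrative sketch (not part of this file): waiting for an I/O event
 * with iowait accounting, as a driver might do.  "my_wq" and "io_done"
 * are hypothetical; most code uses the higher level wait_event() style
 * helpers rather than calling io_schedule() directly.
 *
 *	DEFINE_WAIT(wait);
 *
 *	prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
 *	if (!io_done)
 *		io_schedule();		// accounted as iowait, not idle
 *	finish_wait(&my_wq, &wait);
 */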
3998
3999
/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
4007SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4008{
4009 int ret = -EINVAL;
4010
4011 switch (policy) {
4012 case SCHED_FIFO:
4013 case SCHED_RR:
4014 ret = MAX_USER_RT_PRIO-1;
4015 break;
4016 case SCHED_NORMAL:
4017 case SCHED_BATCH:
4018 case SCHED_IDLE:
4019 ret = 0;
4020 break;
4021 }
4022 return ret;
4023}
4024
4025
/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
4033SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4034{
4035 int ret = -EINVAL;
4036
4037 switch (policy) {
4038 case SCHED_FIFO:
4039 case SCHED_RR:
4040 ret = 1;
4041 break;
4042 case SCHED_NORMAL:
4043 case SCHED_BATCH:
4044 case SCHED_IDLE:
4045 ret = 0;
4046 }
4047 return ret;
4048}
4049
4050
/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
 * an error code.
 */
4061SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4062 struct timespec __user *, interval)
4063{
4064 struct task_struct *p;
4065 unsigned int time_slice;
4066 unsigned long flags;
4067 struct rq *rq;
4068 int retval;
4069 struct timespec t;
4070
4071 if (pid < 0)
4072 return -EINVAL;
4073
4074 retval = -ESRCH;
4075 rcu_read_lock();
4076 p = find_process_by_pid(pid);
4077 if (!p)
4078 goto out_unlock;
4079
4080 retval = security_task_getscheduler(p);
4081 if (retval)
4082 goto out_unlock;
4083
4084 rq = task_rq_lock(p, &flags);
4085 time_slice = p->sched_class->get_rr_interval(rq, p);
4086 task_rq_unlock(rq, p, &flags);
4087
4088 rcu_read_unlock();
4089 jiffies_to_timespec(time_slice, &t);
4090 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4091 return retval;
4092
4093out_unlock:
4094 rcu_read_unlock();
4095 return retval;
4096}
4097
4098static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4099
4100void sched_show_task(struct task_struct *p)
4101{
4102 unsigned long free = 0;
4103 int ppid;
4104 unsigned state;
4105
4106 state = p->state ? __ffs(p->state) + 1 : 0;
4107 printk(KERN_INFO "%-15.15s %c", p->comm,
4108 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4109#if BITS_PER_LONG == 32
4110 if (state == TASK_RUNNING)
4111 printk(KERN_CONT " running ");
4112 else
4113 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4114#else
4115 if (state == TASK_RUNNING)
4116 printk(KERN_CONT " running task ");
4117 else
4118 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4119#endif
4120#ifdef CONFIG_DEBUG_STACK_USAGE
4121 free = stack_not_used(p);
4122#endif
4123 rcu_read_lock();
4124 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4125 rcu_read_unlock();
4126 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4127 task_pid_nr(p), ppid,
4128 (unsigned long)task_thread_info(p)->flags);
4129
4130 print_worker_info(KERN_INFO, p);
4131 show_stack(p, NULL);
4132}
4133
4134void show_state_filter(unsigned long state_filter)
4135{
4136 struct task_struct *g, *p;
4137
4138#if BITS_PER_LONG == 32
4139 printk(KERN_INFO
4140 " task PC stack pid father\n");
4141#else
4142 printk(KERN_INFO
4143 " task PC stack pid father\n");
4144#endif
4145 rcu_read_lock();
4146 do_each_thread(g, p) {
4147
4148
4149
4150
4151 touch_nmi_watchdog();
4152 if (!state_filter || (p->state & state_filter))
4153 sched_show_task(p);
4154 } while_each_thread(g, p);
4155
4156 touch_all_softlockup_watchdogs();
4157
4158#ifdef CONFIG_SCHED_DEBUG
4159 sysrq_sched_debug_show();
4160#endif
4161 rcu_read_unlock();
4162
4163
4164
4165 if (!state_filter)
4166 debug_show_all_locks();
4167}
4168
4169void init_idle_bootup_task(struct task_struct *idle)
4170{
4171 idle->sched_class = &idle_sched_class;
4172}
4173
4174
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
4182void init_idle(struct task_struct *idle, int cpu)
4183{
4184 struct rq *rq = cpu_rq(cpu);
4185 unsigned long flags;
4186
4187 raw_spin_lock_irqsave(&rq->lock, flags);
4188
4189 __sched_fork(idle);
4190 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock();
4192
4193 do_set_cpus_allowed(idle, cpumask_of(cpu));
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204 rcu_read_lock();
4205 __set_task_cpu(idle, cpu);
4206 rcu_read_unlock();
4207
4208 rq->curr = rq->idle = idle;
4209#if defined(CONFIG_SMP)
4210 idle->on_cpu = 1;
4211#endif
4212 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213
4214
4215 task_thread_info(idle)->preempt_count = 0;
4216
4217
4218
4219
4220 idle->sched_class = &idle_sched_class;
4221 ftrace_graph_init_idle_task(idle, cpu);
4222 vtime_init_idle(idle, cpu);
4223#if defined(CONFIG_SMP)
4224 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4225#endif
4226}
4227
4228#ifdef CONFIG_SMP
4229void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4230{
4231 if (p->sched_class && p->sched_class->set_cpus_allowed)
4232 p->sched_class->set_cpus_allowed(p, new_mask);
4233
4234 cpumask_copy(&p->cpus_allowed, new_mask);
4235 p->nr_cpus_allowed = cpumask_weight(new_mask);
4236}
4237
4238
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) the stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) the stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
4261int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4262{
4263 unsigned long flags;
4264 struct rq *rq;
4265 unsigned int dest_cpu;
4266 int ret = 0;
4267
4268 rq = task_rq_lock(p, &flags);
4269
4270 if (cpumask_equal(&p->cpus_allowed, new_mask))
4271 goto out;
4272
4273 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4274 ret = -EINVAL;
4275 goto out;
4276 }
4277
4278 do_set_cpus_allowed(p, new_mask);
4279
4280
4281 if (cpumask_test_cpu(task_cpu(p), new_mask))
4282 goto out;
4283
4284 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4285 if (p->on_rq) {
4286 struct migration_arg arg = { p, dest_cpu };
4287
4288 task_rq_unlock(rq, p, &flags);
4289 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4290 tlb_migrate_finish(p->mm);
4291 return 0;
4292 }
4293out:
4294 task_rq_unlock(rq, p, &flags);
4295
4296 return ret;
4297}
4298EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
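
/*
 * Illustrative sketch (not part of this file): restricting a kernel
 * thread to one CPU after it has started running.  "tsk" and the CPU
 * number are hypothetical; kthread_bind() is the usual choice when the
 * thread has not run yet.
 *
 *	if (set_cpus_allowed_ptr(tsk, cpumask_of(3)))
 *		pr_warn("CPU 3 is not in the active mask\n");
 */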
4299
/*
 * Move (not current) task off this cpu, onto the destination cpu. We're
 * doing this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're attempting to
 * rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if task was successfully migrated.
 */
4311static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4312{
4313 struct rq *rq_dest, *rq_src;
4314 int ret = 0;
4315
4316 if (unlikely(!cpu_active(dest_cpu)))
4317 return ret;
4318
4319 rq_src = cpu_rq(src_cpu);
4320 rq_dest = cpu_rq(dest_cpu);
4321
4322 raw_spin_lock(&p->pi_lock);
4323 double_rq_lock(rq_src, rq_dest);
4324
4325 if (task_cpu(p) != src_cpu)
4326 goto done;
4327
4328 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4329 goto fail;
4330
4331
4332
4333
4334
4335 if (p->on_rq) {
4336 dequeue_task(rq_src, p, 0);
4337 set_task_cpu(p, dest_cpu);
4338 enqueue_task(rq_dest, p, 0);
4339 check_preempt_curr(rq_dest, p, 0);
4340 }
4341done:
4342 ret = 1;
4343fail:
4344 double_rq_unlock(rq_src, rq_dest);
4345 raw_spin_unlock(&p->pi_lock);
4346 return ret;
4347}
4348
/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping the thread off CPU then
 * 'pushing' onto another runqueue.
 */
4354static int migration_cpu_stop(void *data)
4355{
4356 struct migration_arg *arg = data;
4357
4358
4359
4360
4361
4362 local_irq_disable();
4363 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4364 local_irq_enable();
4365 return 0;
4366}
4367
4368#ifdef CONFIG_HOTPLUG_CPU
4369
/*
 * Ensures that the idle task is using init_mm right before its cpu goes
 * offline.
 */
4374void idle_task_exit(void)
4375{
4376 struct mm_struct *mm = current->active_mm;
4377
4378 BUG_ON(cpu_online(smp_processor_id()));
4379
4380 if (mm != &init_mm)
4381 switch_mm(mm, &init_mm, current);
4382 mmdrop(mm);
4383}
4384
4385
/*
 * Since this CPU is going 'away' for a while, fold any nr_active delta we
 * might have into the global calc_load_tasks count, so the load average
 * is not skewed by the runqueue that is being taken down.
 */
4392static void calc_load_migrate(struct rq *rq)
4393{
4394 long delta = calc_load_fold_active(rq);
4395 if (delta)
4396 atomic_long_add(delta, &calc_load_tasks);
4397}
4398
4399
/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible; we hold the required locks anyway
 * because of lock validation efforts.
 */
4407static void migrate_tasks(unsigned int dead_cpu)
4408{
4409 struct rq *rq = cpu_rq(dead_cpu);
4410 struct task_struct *next, *stop = rq->stop;
4411 int dest_cpu;
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422 rq->stop = NULL;
4423
4424
4425
4426
4427
4428
4429 update_rq_clock(rq);
4430
4431 for ( ; ; ) {
4432
4433
4434
4435
4436 if (rq->nr_running == 1)
4437 break;
4438
4439 next = pick_next_task(rq);
4440 BUG_ON(!next);
4441 next->sched_class->put_prev_task(rq, next);
4442
4443
4444 dest_cpu = select_fallback_rq(dead_cpu, next);
4445 raw_spin_unlock(&rq->lock);
4446
4447 __migrate_task(next, dead_cpu, dest_cpu);
4448
4449 raw_spin_lock(&rq->lock);
4450 }
4451
4452 rq->stop = stop;
4453}
4454
4455#endif
4456
4457#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4458
4459static struct ctl_table sd_ctl_dir[] = {
4460 {
4461 .procname = "sched_domain",
4462 .mode = 0555,
4463 },
4464 {}
4465};
4466
4467static struct ctl_table sd_ctl_root[] = {
4468 {
4469 .procname = "kernel",
4470 .mode = 0555,
4471 .child = sd_ctl_dir,
4472 },
4473 {}
4474};
4475
4476static struct ctl_table *sd_alloc_ctl_entry(int n)
4477{
4478 struct ctl_table *entry =
4479 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4480
4481 return entry;
4482}
4483
4484static void sd_free_ctl_entry(struct ctl_table **tablep)
4485{
4486 struct ctl_table *entry;
4487
4488
4489
4490
4491
4492
4493
4494 for (entry = *tablep; entry->mode; entry++) {
4495 if (entry->child)
4496 sd_free_ctl_entry(&entry->child);
4497 if (entry->proc_handler == NULL)
4498 kfree(entry->procname);
4499 }
4500
4501 kfree(*tablep);
4502 *tablep = NULL;
4503}
4504
4505static int min_load_idx = 0;
4506static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4507
4508static void
4509set_table_entry(struct ctl_table *entry,
4510 const char *procname, void *data, int maxlen,
4511 umode_t mode, proc_handler *proc_handler,
4512 bool load_idx)
4513{
4514 entry->procname = procname;
4515 entry->data = data;
4516 entry->maxlen = maxlen;
4517 entry->mode = mode;
4518 entry->proc_handler = proc_handler;
4519
4520 if (load_idx) {
4521 entry->extra1 = &min_load_idx;
4522 entry->extra2 = &max_load_idx;
4523 }
4524}
4525
4526static struct ctl_table *
4527sd_alloc_ctl_domain_table(struct sched_domain *sd)
4528{
4529 struct ctl_table *table = sd_alloc_ctl_entry(13);
4530
4531 if (table == NULL)
4532 return NULL;
4533
4534 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4535 sizeof(long), 0644, proc_doulongvec_minmax, false);
4536 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4537 sizeof(long), 0644, proc_doulongvec_minmax, false);
4538 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4539 sizeof(int), 0644, proc_dointvec_minmax, true);
4540 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4541 sizeof(int), 0644, proc_dointvec_minmax, true);
4542 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4543 sizeof(int), 0644, proc_dointvec_minmax, true);
4544 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4545 sizeof(int), 0644, proc_dointvec_minmax, true);
4546 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4547 sizeof(int), 0644, proc_dointvec_minmax, true);
4548 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4549 sizeof(int), 0644, proc_dointvec_minmax, false);
4550 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4551 sizeof(int), 0644, proc_dointvec_minmax, false);
4552 set_table_entry(&table[9], "cache_nice_tries",
4553 &sd->cache_nice_tries,
4554 sizeof(int), 0644, proc_dointvec_minmax, false);
4555 set_table_entry(&table[10], "flags", &sd->flags,
4556 sizeof(int), 0644, proc_dointvec_minmax, false);
4557 set_table_entry(&table[11], "name", sd->name,
4558 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4559
4560
4561 return table;
4562}
4563
4564static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4565{
4566 struct ctl_table *entry, *table;
4567 struct sched_domain *sd;
4568 int domain_num = 0, i;
4569 char buf[32];
4570
4571 for_each_domain(cpu, sd)
4572 domain_num++;
4573 entry = table = sd_alloc_ctl_entry(domain_num + 1);
4574 if (table == NULL)
4575 return NULL;
4576
4577 i = 0;
4578 for_each_domain(cpu, sd) {
4579 snprintf(buf, 32, "domain%d", i);
4580 entry->procname = kstrdup(buf, GFP_KERNEL);
4581 entry->mode = 0555;
4582 entry->child = sd_alloc_ctl_domain_table(sd);
4583 entry++;
4584 i++;
4585 }
4586 return table;
4587}
4588
4589static struct ctl_table_header *sd_sysctl_header;
4590static void register_sched_domain_sysctl(void)
4591{
4592 int i, cpu_num = num_possible_cpus();
4593 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4594 char buf[32];
4595
4596 WARN_ON(sd_ctl_dir[0].child);
4597 sd_ctl_dir[0].child = entry;
4598
4599 if (entry == NULL)
4600 return;
4601
4602 for_each_possible_cpu(i) {
4603 snprintf(buf, 32, "cpu%d", i);
4604 entry->procname = kstrdup(buf, GFP_KERNEL);
4605 entry->mode = 0555;
4606 entry->child = sd_alloc_ctl_cpu_table(i);
4607 entry++;
4608 }
4609
4610 WARN_ON(sd_sysctl_header);
4611 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4612}
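
/*
 * The table built above shows up under /proc/sys/kernel/sched_domain.
 * For example (illustrative paths only, the exact layout depends on the
 * machine's domain hierarchy), CPU0's tunables would be reachable as:
 *
 *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
 *	/proc/sys/kernel/sched_domain/cpu0/domain0/imbalance_pct
 *	/proc/sys/kernel/sched_domain/cpu0/domain1/...
 */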
4613
4614
4615static void unregister_sched_domain_sysctl(void)
4616{
4617 if (sd_sysctl_header)
4618 unregister_sysctl_table(sd_sysctl_header);
4619 sd_sysctl_header = NULL;
4620 if (sd_ctl_dir[0].child)
4621 sd_free_ctl_entry(&sd_ctl_dir[0].child);
4622}
4623#else
4624static void register_sched_domain_sysctl(void)
4625{
4626}
4627static void unregister_sched_domain_sysctl(void)
4628{
4629}
4630#endif
4631
4632static void set_rq_online(struct rq *rq)
4633{
4634 if (!rq->online) {
4635 const struct sched_class *class;
4636
4637 cpumask_set_cpu(rq->cpu, rq->rd->online);
4638 rq->online = 1;
4639
4640 for_each_class(class) {
4641 if (class->rq_online)
4642 class->rq_online(rq);
4643 }
4644 }
4645}
4646
4647static void set_rq_offline(struct rq *rq)
4648{
4649 if (rq->online) {
4650 const struct sched_class *class;
4651
4652 for_each_class(class) {
4653 if (class->rq_offline)
4654 class->rq_offline(rq);
4655 }
4656
4657 cpumask_clear_cpu(rq->cpu, rq->rd->online);
4658 rq->online = 0;
4659 }
4660}
4661
4662
4663
4664
4665
4666static int
4667migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4668{
4669 int cpu = (long)hcpu;
4670 unsigned long flags;
4671 struct rq *rq = cpu_rq(cpu);
4672
4673 switch (action & ~CPU_TASKS_FROZEN) {
4674
4675 case CPU_UP_PREPARE:
4676 rq->calc_load_update = calc_load_update;
4677 break;
4678
4679 case CPU_ONLINE:
4680
4681 raw_spin_lock_irqsave(&rq->lock, flags);
4682 if (rq->rd) {
4683 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
4684
4685 set_rq_online(rq);
4686 }
4687 raw_spin_unlock_irqrestore(&rq->lock, flags);
4688 break;
4689
4690#ifdef CONFIG_HOTPLUG_CPU
4691 case CPU_DYING:
4692 sched_ttwu_pending();
4693
4694 raw_spin_lock_irqsave(&rq->lock, flags);
4695 if (rq->rd) {
4696 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
4697 set_rq_offline(rq);
4698 }
4699 migrate_tasks(cpu);
4700 BUG_ON(rq->nr_running != 1);
4701 raw_spin_unlock_irqrestore(&rq->lock, flags);
4702 break;
4703
4704 case CPU_DEAD:
4705 calc_load_migrate(rq);
4706 break;
4707#endif
4708 }
4709
4710 update_max_interval();
4711
4712 return NOTIFY_OK;
4713}
4714
4715
4716
4717
4718
4719
4720static struct notifier_block migration_notifier = {
4721 .notifier_call = migration_call,
4722 .priority = CPU_PRI_MIGRATION,
4723};
4724
4725static int sched_cpu_active(struct notifier_block *nfb,
4726 unsigned long action, void *hcpu)
4727{
4728 switch (action & ~CPU_TASKS_FROZEN) {
4729 case CPU_STARTING:
4730 case CPU_DOWN_FAILED:
4731 set_cpu_active((long)hcpu, true);
4732 return NOTIFY_OK;
4733 default:
4734 return NOTIFY_DONE;
4735 }
4736}
4737
4738static int sched_cpu_inactive(struct notifier_block *nfb,
4739 unsigned long action, void *hcpu)
4740{
4741 switch (action & ~CPU_TASKS_FROZEN) {
4742 case CPU_DOWN_PREPARE:
4743 set_cpu_active((long)hcpu, false);
4744 return NOTIFY_OK;
4745 default:
4746 return NOTIFY_DONE;
4747 }
4748}
4749
4750static int __init migration_init(void)
4751{
4752 void *cpu = (void *)(long)smp_processor_id();
4753 int err;
4754
4755
4756 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4757 BUG_ON(err == NOTIFY_BAD);
4758 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4759 register_cpu_notifier(&migration_notifier);
4760
4761
4762 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
4763 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
4764
4765 return 0;
4766}
4767early_initcall(migration_init);
4768#endif
4769
4770#ifdef CONFIG_SMP
4771
4772static cpumask_var_t sched_domains_tmpmask;
4773
4774#ifdef CONFIG_SCHED_DEBUG
4775
4776static __read_mostly int sched_debug_enabled;
4777
4778static int __init sched_debug_setup(char *str)
4779{
4780 sched_debug_enabled = 1;
4781
4782 return 0;
4783}
4784early_param("sched_debug", sched_debug_setup);
4785
4786static inline bool sched_debug(void)
4787{
4788 return sched_debug_enabled;
4789}
4790
4791static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
4792 struct cpumask *groupmask)
4793{
4794 struct sched_group *group = sd->groups;
4795 char str[256];
4796
4797 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
4798 cpumask_clear(groupmask);
4799
4800 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
4801
4802 if (!(sd->flags & SD_LOAD_BALANCE)) {
4803 printk("does not load-balance\n");
4804 if (sd->parent)
4805 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
4806 " has parent");
4807 return -1;
4808 }
4809
4810 printk(KERN_CONT "span %s level %s\n", str, sd->name);
4811
4812 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
4813 printk(KERN_ERR "ERROR: domain->span does not contain "
4814 "CPU%d\n", cpu);
4815 }
4816 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
4817 printk(KERN_ERR "ERROR: domain->groups does not contain"
4818 " CPU%d\n", cpu);
4819 }
4820
4821 printk(KERN_DEBUG "%*s groups:", level + 1, "");
4822 do {
4823 if (!group) {
4824 printk("\n");
4825 printk(KERN_ERR "ERROR: group is NULL\n");
4826 break;
4827 }
4828
4829
4830
4831
4832
4833
4834 if (!group->sgp->power_orig) {
4835 printk(KERN_CONT "\n");
4836 printk(KERN_ERR "ERROR: domain->cpu_power not "
4837 "set\n");
4838 break;
4839 }
4840
4841 if (!cpumask_weight(sched_group_cpus(group))) {
4842 printk(KERN_CONT "\n");
4843 printk(KERN_ERR "ERROR: empty group\n");
4844 break;
4845 }
4846
4847 if (!(sd->flags & SD_OVERLAP) &&
4848 cpumask_intersects(groupmask, sched_group_cpus(group))) {
4849 printk(KERN_CONT "\n");
4850 printk(KERN_ERR "ERROR: repeated CPUs\n");
4851 break;
4852 }
4853
4854 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
4855
4856 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
4857
4858 printk(KERN_CONT " %s", str);
4859 if (group->sgp->power != SCHED_POWER_SCALE) {
4860 printk(KERN_CONT " (cpu_power = %d)",
4861 group->sgp->power);
4862 }
4863
4864 group = group->next;
4865 } while (group != sd->groups);
4866 printk(KERN_CONT "\n");
4867
4868 if (!cpumask_equal(sched_domain_span(sd), groupmask))
4869 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4870
4871 if (sd->parent &&
4872 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
4873 printk(KERN_ERR "ERROR: parent span is not a superset "
4874 "of domain->span\n");
4875 return 0;
4876}
4877
4878static void sched_domain_debug(struct sched_domain *sd, int cpu)
4879{
4880 int level = 0;
4881
4882 if (!sched_debug_enabled)
4883 return;
4884
4885 if (!sd) {
4886 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4887 return;
4888 }
4889
4890 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4891
4892 for (;;) {
4893 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
4894 break;
4895 level++;
4896 sd = sd->parent;
4897 if (!sd)
4898 break;
4899 }
4900}
4901#else
4902# define sched_domain_debug(sd, cpu) do { } while (0)
4903static inline bool sched_debug(void)
4904{
4905 return false;
4906}
4907#endif
4908
4909static int sd_degenerate(struct sched_domain *sd)
4910{
4911 if (cpumask_weight(sched_domain_span(sd)) == 1)
4912 return 1;
4913
4914
4915 if (sd->flags & (SD_LOAD_BALANCE |
4916 SD_BALANCE_NEWIDLE |
4917 SD_BALANCE_FORK |
4918 SD_BALANCE_EXEC |
4919 SD_SHARE_CPUPOWER |
4920 SD_SHARE_PKG_RESOURCES)) {
4921 if (sd->groups != sd->groups->next)
4922 return 0;
4923 }
4924
4925
4926 if (sd->flags & (SD_WAKE_AFFINE))
4927 return 0;
4928
4929 return 1;
4930}
4931
4932static int
4933sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4934{
4935 unsigned long cflags = sd->flags, pflags = parent->flags;
4936
4937 if (sd_degenerate(parent))
4938 return 1;
4939
4940 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
4941 return 0;
4942
4943
4944 if (parent->groups == parent->groups->next) {
4945 pflags &= ~(SD_LOAD_BALANCE |
4946 SD_BALANCE_NEWIDLE |
4947 SD_BALANCE_FORK |
4948 SD_BALANCE_EXEC |
4949 SD_SHARE_CPUPOWER |
4950 SD_SHARE_PKG_RESOURCES |
4951 SD_PREFER_SIBLING);
4952 if (nr_node_ids == 1)
4953 pflags &= ~SD_SERIALIZE;
4954 }
4955 if (~cflags & pflags)
4956 return 0;
4957
4958 return 1;
4959}
4960
4961static void free_rootdomain(struct rcu_head *rcu)
4962{
4963 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4964
4965 cpupri_cleanup(&rd->cpupri);
4966 free_cpumask_var(rd->rto_mask);
4967 free_cpumask_var(rd->online);
4968 free_cpumask_var(rd->span);
4969 kfree(rd);
4970}
4971
4972static void rq_attach_root(struct rq *rq, struct root_domain *rd)
4973{
4974 struct root_domain *old_rd = NULL;
4975 unsigned long flags;
4976
4977 raw_spin_lock_irqsave(&rq->lock, flags);
4978
4979 if (rq->rd) {
4980 old_rd = rq->rd;
4981
4982 if (cpumask_test_cpu(rq->cpu, old_rd->online))
4983 set_rq_offline(rq);
4984
4985 cpumask_clear_cpu(rq->cpu, old_rd->span);
4986
4987
4988
4989
4990
4991
4992 if (!atomic_dec_and_test(&old_rd->refcount))
4993 old_rd = NULL;
4994 }
4995
4996 atomic_inc(&rd->refcount);
4997 rq->rd = rd;
4998
4999 cpumask_set_cpu(rq->cpu, rd->span);
5000 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5001 set_rq_online(rq);
5002
5003 raw_spin_unlock_irqrestore(&rq->lock, flags);
5004
5005 if (old_rd)
5006 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5007}
5008
5009static int init_rootdomain(struct root_domain *rd)
5010{
5011 memset(rd, 0, sizeof(*rd));
5012
5013 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5014 goto out;
5015 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5016 goto free_span;
5017 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5018 goto free_online;
5019
5020 if (cpupri_init(&rd->cpupri) != 0)
5021 goto free_rto_mask;
5022 return 0;
5023
5024free_rto_mask:
5025 free_cpumask_var(rd->rto_mask);
5026free_online:
5027 free_cpumask_var(rd->online);
5028free_span:
5029 free_cpumask_var(rd->span);
5030out:
5031 return -ENOMEM;
5032}
5033
/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
5038struct root_domain def_root_domain;
5039
5040static void init_defrootdomain(void)
5041{
5042 init_rootdomain(&def_root_domain);
5043
5044 atomic_set(&def_root_domain.refcount, 1);
5045}
5046
5047static struct root_domain *alloc_rootdomain(void)
5048{
5049 struct root_domain *rd;
5050
5051 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5052 if (!rd)
5053 return NULL;
5054
5055 if (init_rootdomain(rd) != 0) {
5056 kfree(rd);
5057 return NULL;
5058 }
5059
5060 return rd;
5061}
5062
5063static void free_sched_groups(struct sched_group *sg, int free_sgp)
5064{
5065 struct sched_group *tmp, *first;
5066
5067 if (!sg)
5068 return;
5069
5070 first = sg;
5071 do {
5072 tmp = sg->next;
5073
5074 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5075 kfree(sg->sgp);
5076
5077 kfree(sg);
5078 sg = tmp;
5079 } while (sg != first);
5080}
5081
5082static void free_sched_domain(struct rcu_head *rcu)
5083{
5084 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5085
5086
5087
5088
5089
5090 if (sd->flags & SD_OVERLAP) {
5091 free_sched_groups(sd->groups, 1);
5092 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5093 kfree(sd->groups->sgp);
5094 kfree(sd->groups);
5095 }
5096 kfree(sd);
5097}
5098
5099static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5100{
5101 call_rcu(&sd->rcu, free_sched_domain);
5102}
5103
5104static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5105{
5106 for (; sd; sd = sd->parent)
5107 destroy_sched_domain(sd, cpu);
5108}
5109
5110
/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain), as this
 * allows us to avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain); this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
 */
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id);
5122
5123static void update_top_cache_domain(int cpu)
5124{
5125 struct sched_domain *sd;
5126 int id = cpu;
5127 int size = 1;
5128
5129 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5130 if (sd) {
5131 id = cpumask_first(sched_domain_span(sd));
5132 size = cpumask_weight(sched_domain_span(sd));
5133 }
5134
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id;
5138}
5139
5140
5141
5142
5143
5144static void
5145cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5146{
5147 struct rq *rq = cpu_rq(cpu);
5148 struct sched_domain *tmp;
5149
5150
5151 for (tmp = sd; tmp; ) {
5152 struct sched_domain *parent = tmp->parent;
5153 if (!parent)
5154 break;
5155
5156 if (sd_parent_degenerate(tmp, parent)) {
5157 tmp->parent = parent->parent;
5158 if (parent->parent)
5159 parent->parent->child = tmp;
5160
5161
5162
5163
5164
5165 if (parent->flags & SD_PREFER_SIBLING)
5166 tmp->flags |= SD_PREFER_SIBLING;
5167 destroy_sched_domain(parent, cpu);
5168 } else
5169 tmp = tmp->parent;
5170 }
5171
5172 if (sd && sd_degenerate(sd)) {
5173 tmp = sd;
5174 sd = sd->parent;
5175 destroy_sched_domain(tmp, cpu);
5176 if (sd)
5177 sd->child = NULL;
5178 }
5179
5180 sched_domain_debug(sd, cpu);
5181
5182 rq_attach_root(rq, rd);
5183 tmp = rq->sd;
5184 rcu_assign_pointer(rq->sd, sd);
5185 destroy_sched_domains(tmp, cpu);
5186
5187 update_top_cache_domain(cpu);
5188}
5189
5190
5191static cpumask_var_t cpu_isolated_map;
5192
5193
5194static int __init isolated_cpu_setup(char *str)
5195{
5196 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5197 cpulist_parse(str, cpu_isolated_map);
5198 return 1;
5199}
5200
5201__setup("isolcpus=", isolated_cpu_setup);
5202
5203static const struct cpumask *cpu_cpu_mask(int cpu)
5204{
5205 return cpumask_of_node(cpu_to_node(cpu));
5206}
5207
5208struct sd_data {
5209 struct sched_domain **__percpu sd;
5210 struct sched_group **__percpu sg;
5211 struct sched_group_power **__percpu sgp;
5212};
5213
5214struct s_data {
5215 struct sched_domain ** __percpu sd;
5216 struct root_domain *rd;
5217};
5218
5219enum s_alloc {
5220 sa_rootdomain,
5221 sa_sd,
5222 sa_sd_storage,
5223 sa_none,
5224};
5225
5226struct sched_domain_topology_level;
5227
5228typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5229typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5230
5231#define SDTL_OVERLAP 0x01
5232
5233struct sched_domain_topology_level {
5234 sched_domain_init_f init;
5235 sched_domain_mask_f mask;
5236 int flags;
5237 int numa_level;
5238 struct sd_data data;
5239};
5240
5241
/*
 * Build an iteration mask that can exclude certain CPUs from the upwards
 * domain traversal.
 *
 * Asymmetric node setups can result in situations where the domain tree
 * is of unequal depth; make sure to skip domains that already cover the
 * entire range.
 *
 * In that case build_sched_domains() will have terminated the iteration
 * early and our sibling sd spans will be empty. Domains should always
 * include the cpu they're built on, so check that.
 */
5254static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5255{
5256 const struct cpumask *span = sched_domain_span(sd);
5257 struct sd_data *sdd = sd->private;
5258 struct sched_domain *sibling;
5259 int i;
5260
5261 for_each_cpu(i, span) {
5262 sibling = *per_cpu_ptr(sdd->sd, i);
5263 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5264 continue;
5265
5266 cpumask_set_cpu(i, sched_group_mask(sg));
5267 }
5268}
5269
5270
/*
 * Return the canonical balance cpu for this group, this is the first cpu
 * of this group that's also in the iteration mask.
 */
5274int group_balance_cpu(struct sched_group *sg)
5275{
5276 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5277}
5278
5279static int
5280build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5281{
5282 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5283 const struct cpumask *span = sched_domain_span(sd);
5284 struct cpumask *covered = sched_domains_tmpmask;
5285 struct sd_data *sdd = sd->private;
5286 struct sched_domain *child;
5287 int i;
5288
5289 cpumask_clear(covered);
5290
5291 for_each_cpu(i, span) {
5292 struct cpumask *sg_span;
5293
5294 if (cpumask_test_cpu(i, covered))
5295 continue;
5296
5297 child = *per_cpu_ptr(sdd->sd, i);
5298
5299
5300 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5301 continue;
5302
5303 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5304 GFP_KERNEL, cpu_to_node(cpu));
5305
5306 if (!sg)
5307 goto fail;
5308
5309 sg_span = sched_group_cpus(sg);
5310 if (child->child) {
5311 child = child->child;
5312 cpumask_copy(sg_span, sched_domain_span(child));
5313 } else
5314 cpumask_set_cpu(i, sg_span);
5315
5316 cpumask_or(covered, covered, sg_span);
5317
5318 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5319 if (atomic_inc_return(&sg->sgp->ref) == 1)
5320 build_group_mask(sd, sg);
5321
5322
5323
5324
5325
5326
5327 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5328
5329
5330
5331
5332
5333
5334 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5335 group_balance_cpu(sg) == cpu)
5336 groups = sg;
5337
5338 if (!first)
5339 first = sg;
5340 if (last)
5341 last->next = sg;
5342 last = sg;
5343 last->next = first;
5344 }
5345 sd->groups = groups;
5346
5347 return 0;
5348
5349fail:
5350 free_sched_groups(first, 0);
5351
5352 return -ENOMEM;
5353}
5354
5355static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5356{
5357 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5358 struct sched_domain *child = sd->child;
5359
5360 if (child)
5361 cpu = cpumask_first(sched_domain_span(child));
5362
5363 if (sg) {
5364 *sg = *per_cpu_ptr(sdd->sg, cpu);
5365 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5366 atomic_set(&(*sg)->sgp->ref, 1);
5367 }
5368
5369 return cpu;
5370}
5371
5372
/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_power to 0.
 *
 * Assumes the sched_domain tree is fully constructed.
 */
5379static int
5380build_sched_groups(struct sched_domain *sd, int cpu)
5381{
5382 struct sched_group *first = NULL, *last = NULL;
5383 struct sd_data *sdd = sd->private;
5384 const struct cpumask *span = sched_domain_span(sd);
5385 struct cpumask *covered;
5386 int i;
5387
5388 get_group(cpu, sdd, &sd->groups);
5389 atomic_inc(&sd->groups->ref);
5390
5391 if (cpu != cpumask_first(span))
5392 return 0;
5393
5394 lockdep_assert_held(&sched_domains_mutex);
5395 covered = sched_domains_tmpmask;
5396
5397 cpumask_clear(covered);
5398
5399 for_each_cpu(i, span) {
5400 struct sched_group *sg;
5401 int group, j;
5402
5403 if (cpumask_test_cpu(i, covered))
5404 continue;
5405
5406 group = get_group(i, sdd, &sg);
5407 cpumask_clear(sched_group_cpus(sg));
5408 sg->sgp->power = 0;
5409 cpumask_setall(sched_group_mask(sg));
5410
5411 for_each_cpu(j, span) {
5412 if (get_group(j, sdd, NULL) != group)
5413 continue;
5414
5415 cpumask_set_cpu(j, covered);
5416 cpumask_set_cpu(j, sched_group_cpus(sg));
5417 }
5418
5419 if (!first)
5420 first = sg;
5421 if (last)
5422 last->next = sg;
5423 last = sg;
5424 }
5425 last->next = first;
5426
5427 return 0;
5428}
5429
5430
/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be the same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_power will pick up more load than the group
 * which has less cpu_power.
 */
5440static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5441{
5442 struct sched_group *sg = sd->groups;
5443
5444 WARN_ON(!sg);
5445
5446 do {
5447 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5448 sg = sg->next;
5449 } while (sg != sd->groups);
5450
5451 if (cpu != group_balance_cpu(sg))
5452 return;
5453
5454 update_group_power(sd, cpu);
5455 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5456}
5457
5458int __weak arch_sd_sibling_asym_packing(void)
5459{
5460 return 0*SD_ASYM_PACKING;
5461}
5462
5463
5464
5465
5466
5467
5468#ifdef CONFIG_SCHED_DEBUG
5469# define SD_INIT_NAME(sd, type) sd->name = #type
5470#else
5471# define SD_INIT_NAME(sd, type) do { } while (0)
5472#endif
5473
5474#define SD_INIT_FUNC(type) \
5475static noinline struct sched_domain * \
5476sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5477{ \
5478 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5479 *sd = SD_##type##_INIT; \
5480 SD_INIT_NAME(sd, type); \
5481 sd->private = &tl->data; \
5482 return sd; \
5483}
5484
5485SD_INIT_FUNC(CPU)
5486#ifdef CONFIG_SCHED_SMT
5487 SD_INIT_FUNC(SIBLING)
5488#endif
5489#ifdef CONFIG_SCHED_MC
5490 SD_INIT_FUNC(MC)
5491#endif
5492#ifdef CONFIG_SCHED_BOOK
5493 SD_INIT_FUNC(BOOK)
5494#endif
5495
5496static int default_relax_domain_level = -1;
5497int sched_domain_level_max;
5498
5499static int __init setup_relax_domain_level(char *str)
5500{
5501 if (kstrtoint(str, 0, &default_relax_domain_level))
5502 pr_warn("Unable to set relax_domain_level\n");
5503
5504 return 1;
5505}
5506__setup("relax_domain_level=", setup_relax_domain_level);
5507
5508static void set_domain_attribute(struct sched_domain *sd,
5509 struct sched_domain_attr *attr)
5510{
5511 int request;
5512
5513 if (!attr || attr->relax_domain_level < 0) {
5514 if (default_relax_domain_level < 0)
5515 return;
5516 else
5517 request = default_relax_domain_level;
5518 } else
5519 request = attr->relax_domain_level;
5520 if (request < sd->level) {
5521
5522 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5523 } else {
5524
5525 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5526 }
5527}
5528
5529static void __sdt_free(const struct cpumask *cpu_map);
5530static int __sdt_alloc(const struct cpumask *cpu_map);
5531
5532static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5533 const struct cpumask *cpu_map)
5534{
5535 switch (what) {
5536 case sa_rootdomain:
5537 if (!atomic_read(&d->rd->refcount))
5538 free_rootdomain(&d->rd->rcu);
5539 case sa_sd:
5540 free_percpu(d->sd);
5541 case sa_sd_storage:
5542 __sdt_free(cpu_map);
5543 case sa_none:
5544 break;
5545 }
5546}
5547
5548static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5549 const struct cpumask *cpu_map)
5550{
5551 memset(d, 0, sizeof(*d));
5552
5553 if (__sdt_alloc(cpu_map))
5554 return sa_sd_storage;
5555 d->sd = alloc_percpu(struct sched_domain *);
5556 if (!d->sd)
5557 return sa_sd_storage;
5558 d->rd = alloc_rootdomain();
5559 if (!d->rd)
5560 return sa_sd;
5561 return sa_rootdomain;
5562}
5563
5564
5565
5566
5567
5568
5569static void claim_allocations(int cpu, struct sched_domain *sd)
5570{
5571 struct sd_data *sdd = sd->private;
5572
5573 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5574 *per_cpu_ptr(sdd->sd, cpu) = NULL;
5575
5576 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5577 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5578
5579 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5580 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5581}
5582
5583#ifdef CONFIG_SCHED_SMT
5584static const struct cpumask *cpu_smt_mask(int cpu)
5585{
5586 return topology_thread_cpumask(cpu);
5587}
5588#endif
5589
5590
5591
5592
5593static struct sched_domain_topology_level default_topology[] = {
5594#ifdef CONFIG_SCHED_SMT
5595 { sd_init_SIBLING, cpu_smt_mask, },
5596#endif
5597#ifdef CONFIG_SCHED_MC
5598 { sd_init_MC, cpu_coregroup_mask, },
5599#endif
5600#ifdef CONFIG_SCHED_BOOK
5601 { sd_init_BOOK, cpu_book_mask, },
5602#endif
5603 { sd_init_CPU, cpu_cpu_mask, },
5604 { NULL, },
5605};
5606
5607static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5608
5609#define for_each_sd_topology(tl) \
5610 for (tl = sched_domain_topology; tl->init; tl++)
5611
5612#ifdef CONFIG_NUMA
5613
5614static int sched_domains_numa_levels;
5615static int *sched_domains_numa_distance;
5616static struct cpumask ***sched_domains_numa_masks;
5617static int sched_domains_curr_level;
5618
5619static inline int sd_local_flags(int level)
5620{
5621 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5622 return 0;
5623
5624 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5625}
5626
5627static struct sched_domain *
5628sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5629{
5630 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5631 int level = tl->numa_level;
5632 int sd_weight = cpumask_weight(
5633 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
5634
5635 *sd = (struct sched_domain){
5636 .min_interval = sd_weight,
5637 .max_interval = 2*sd_weight,
5638 .busy_factor = 32,
5639 .imbalance_pct = 125,
5640 .cache_nice_tries = 2,
5641 .busy_idx = 3,
5642 .idle_idx = 2,
5643 .newidle_idx = 0,
5644 .wake_idx = 0,
5645 .forkexec_idx = 0,
5646
5647 .flags = 1*SD_LOAD_BALANCE
5648 | 1*SD_BALANCE_NEWIDLE
5649 | 0*SD_BALANCE_EXEC
5650 | 0*SD_BALANCE_FORK
5651 | 0*SD_BALANCE_WAKE
5652 | 0*SD_WAKE_AFFINE
5653 | 0*SD_SHARE_CPUPOWER
5654 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING
5657 | sd_local_flags(level)
5658 ,
5659 .last_balance = jiffies,
5660 .balance_interval = sd_weight,
5661 };
5662 SD_INIT_NAME(sd, NUMA);
5663 sd->private = &tl->data;
5664
5665
5666
5667
5668 sched_domains_curr_level = tl->numa_level;
5669
5670 return sd;
5671}
5672
5673static const struct cpumask *sd_numa_mask(int cpu)
5674{
5675 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
5676}
5677
5678static void sched_numa_warn(const char *str)
5679{
5680 static int done = false;
5681 int i,j;
5682
5683 if (done)
5684 return;
5685
5686 done = true;
5687
5688 printk(KERN_WARNING "ERROR: %s\n\n", str);
5689
5690 for (i = 0; i < nr_node_ids; i++) {
5691 printk(KERN_WARNING " ");
5692 for (j = 0; j < nr_node_ids; j++)
5693 printk(KERN_CONT "%02d ", node_distance(i,j));
5694 printk(KERN_CONT "\n");
5695 }
5696 printk(KERN_WARNING "\n");
5697}
5698
5699static bool find_numa_distance(int distance)
5700{
5701 int i;
5702
5703 if (distance == node_distance(0, 0))
5704 return true;
5705
5706 for (i = 0; i < sched_domains_numa_levels; i++) {
5707 if (sched_domains_numa_distance[i] == distance)
5708 return true;
5709 }
5710
5711 return false;
5712}
5713
5714static void sched_init_numa(void)
5715{
5716 int next_distance, curr_distance = node_distance(0, 0);
5717 struct sched_domain_topology_level *tl;
5718 int level = 0;
5719 int i, j, k;
5720
5721 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
5722 if (!sched_domains_numa_distance)
5723 return;
5724
5725
5726
5727
5728
5729
5730
5731
5732 next_distance = curr_distance;
5733 for (i = 0; i < nr_node_ids; i++) {
5734 for (j = 0; j < nr_node_ids; j++) {
5735 for (k = 0; k < nr_node_ids; k++) {
5736 int distance = node_distance(i, k);
5737
5738 if (distance > curr_distance &&
5739 (distance < next_distance ||
5740 next_distance == curr_distance))
5741 next_distance = distance;
5742
5743
5744
5745
5746
5747
5748 if (sched_debug() && node_distance(k, i) != distance)
5749 sched_numa_warn("Node-distance not symmetric");
5750
5751 if (sched_debug() && i && !find_numa_distance(distance))
5752 sched_numa_warn("Node-0 not representative");
5753 }
5754 if (next_distance != curr_distance) {
5755 sched_domains_numa_distance[level++] = next_distance;
5756 sched_domains_numa_levels = level;
5757 curr_distance = next_distance;
5758 } else break;
5759 }
5760
5761
5762
5763
5764 if (!sched_debug())
5765 break;
5766 }
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784 sched_domains_numa_levels = 0;
5785
5786 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
5787 if (!sched_domains_numa_masks)
5788 return;
5789
5790
5791
5792
5793
5794 for (i = 0; i < level; i++) {
5795 sched_domains_numa_masks[i] =
5796 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
5797 if (!sched_domains_numa_masks[i])
5798 return;
5799
5800 for (j = 0; j < nr_node_ids; j++) {
5801 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
5802 if (!mask)
5803 return;
5804
5805 sched_domains_numa_masks[i][j] = mask;
5806
5807 for (k = 0; k < nr_node_ids; k++) {
5808 if (node_distance(j, k) > sched_domains_numa_distance[i])
5809 continue;
5810
5811 cpumask_or(mask, mask, cpumask_of_node(k));
5812 }
5813 }
5814 }
5815
5816 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
5817 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
5818 if (!tl)
5819 return;
5820
5821
5822
5823
5824 for (i = 0; default_topology[i].init; i++)
5825 tl[i] = default_topology[i];
5826
5827
5828
5829
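	/*
	 * The default_topology[] levels were copied above; now append one
	 * entry per NUMA level.  Note that 'i' keeps counting from the
	 * previous loop, so the NUMA levels land after the default ones,
	 * and SDTL_OVERLAP marks them as having potentially overlapping
	 * domain spans.
	 */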
5830 for (j = 0; j < level; i++, j++) {
5831 tl[i] = (struct sched_domain_topology_level){
5832 .init = sd_numa_init,
5833 .mask = sd_numa_mask,
5834 .flags = SDTL_OVERLAP,
5835 .numa_level = j,
5836 };
5837 }
5838
5839 sched_domain_topology = tl;
5840
5841 sched_domains_numa_levels = level;
5842}
5843
5844static void sched_domains_numa_masks_set(int cpu)
5845{
5846 int i, j;
5847 int node = cpu_to_node(cpu);
5848
5849 for (i = 0; i < sched_domains_numa_levels; i++) {
5850 for (j = 0; j < nr_node_ids; j++) {
5851 if (node_distance(j, node) <= sched_domains_numa_distance[i])
5852 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
5853 }
5854 }
5855}
5856
5857static void sched_domains_numa_masks_clear(int cpu)
5858{
5859 int i, j;
5860 for (i = 0; i < sched_domains_numa_levels; i++) {
5861 for (j = 0; j < nr_node_ids; j++)
5862 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
5863 }
5864}
5865
5866
5867
5868
5869
5870static int sched_domains_numa_masks_update(struct notifier_block *nfb,
5871 unsigned long action,
5872 void *hcpu)
5873{
5874 int cpu = (long)hcpu;
5875
5876 switch (action & ~CPU_TASKS_FROZEN) {
5877 case CPU_ONLINE:
5878 sched_domains_numa_masks_set(cpu);
5879 break;
5880
5881 case CPU_DEAD:
5882 sched_domains_numa_masks_clear(cpu);
5883 break;
5884
5885 default:
5886 return NOTIFY_DONE;
5887 }
5888
5889 return NOTIFY_OK;
5890}
5891#else
5892static inline void sched_init_numa(void)
5893{
5894}
5895
5896static int sched_domains_numa_masks_update(struct notifier_block *nfb,
5897 unsigned long action,
5898 void *hcpu)
5899{
5900 return 0;
5901}
5902#endif
5903
5904static int __sdt_alloc(const struct cpumask *cpu_map)
5905{
5906 struct sched_domain_topology_level *tl;
5907 int j;
5908
5909 for_each_sd_topology(tl) {
5910 struct sd_data *sdd = &tl->data;
5911
5912 sdd->sd = alloc_percpu(struct sched_domain *);
5913 if (!sdd->sd)
5914 return -ENOMEM;
5915
5916 sdd->sg = alloc_percpu(struct sched_group *);
5917 if (!sdd->sg)
5918 return -ENOMEM;
5919
5920 sdd->sgp = alloc_percpu(struct sched_group_power *);
5921 if (!sdd->sgp)
5922 return -ENOMEM;
5923
5924 for_each_cpu(j, cpu_map) {
5925 struct sched_domain *sd;
5926 struct sched_group *sg;
5927 struct sched_group_power *sgp;
5928
5929 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
5930 GFP_KERNEL, cpu_to_node(j));
5931 if (!sd)
5932 return -ENOMEM;
5933
5934 *per_cpu_ptr(sdd->sd, j) = sd;
5935
5936 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5937 GFP_KERNEL, cpu_to_node(j));
5938 if (!sg)
5939 return -ENOMEM;
5940
5941 sg->next = sg;
5942
5943 *per_cpu_ptr(sdd->sg, j) = sg;
5944
5945 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
5946 GFP_KERNEL, cpu_to_node(j));
5947 if (!sgp)
5948 return -ENOMEM;
5949
5950 *per_cpu_ptr(sdd->sgp, j) = sgp;
5951 }
5952 }
5953
5954 return 0;
5955}
5956
5957static void __sdt_free(const struct cpumask *cpu_map)
5958{
5959 struct sched_domain_topology_level *tl;
5960 int j;
5961
5962 for_each_sd_topology(tl) {
5963 struct sd_data *sdd = &tl->data;
5964
5965 for_each_cpu(j, cpu_map) {
5966 struct sched_domain *sd;
5967
5968 if (sdd->sd) {
5969 sd = *per_cpu_ptr(sdd->sd, j);
5970 if (sd && (sd->flags & SD_OVERLAP))
5971 free_sched_groups(sd->groups, 0);
5972 kfree(*per_cpu_ptr(sdd->sd, j));
5973 }
5974
5975 if (sdd->sg)
5976 kfree(*per_cpu_ptr(sdd->sg, j));
5977 if (sdd->sgp)
5978 kfree(*per_cpu_ptr(sdd->sgp, j));
5979 }
5980 free_percpu(sdd->sd);
5981 sdd->sd = NULL;
5982 free_percpu(sdd->sg);
5983 sdd->sg = NULL;
5984 free_percpu(sdd->sgp);
5985 sdd->sgp = NULL;
5986 }
5987}
5988
5989struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
5990 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
5991 struct sched_domain *child, int cpu)
5992{
5993 struct sched_domain *sd = tl->init(tl, cpu);
5994 if (!sd)
5995 return child;
5996
5997 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
5998 if (child) {
5999 sd->level = child->level + 1;
6000 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6001 child->parent = sd;
6002 sd->child = child;
6003 }
6004 set_domain_attribute(sd, attr);
6005
6006 return sd;
6007}
6008
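/*
 * Build sched domains for the CPUs in cpu_map: walk the topology levels
 * bottom-up for each CPU, build the (possibly overlapping) groups, set up
 * the group power figures and finally attach each CPU's lowest-level
 * domain to the root domain.
 */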
6013static int build_sched_domains(const struct cpumask *cpu_map,
6014 struct sched_domain_attr *attr)
6015{
6016 enum s_alloc alloc_state;
6017 struct sched_domain *sd;
6018 struct s_data d;
6019 int i, ret = -ENOMEM;
6020
6021 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6022 if (alloc_state != sa_rootdomain)
6023 goto error;
6024
6025
6026 for_each_cpu(i, cpu_map) {
6027 struct sched_domain_topology_level *tl;
6028
6029 sd = NULL;
6030 for_each_sd_topology(tl) {
6031 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6032 if (tl == sched_domain_topology)
6033 *per_cpu_ptr(d.sd, i) = sd;
6034 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6035 sd->flags |= SD_OVERLAP;
6036 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6037 break;
6038 }
6039 }
6040
6041
6042 for_each_cpu(i, cpu_map) {
6043 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6044 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6045 if (sd->flags & SD_OVERLAP) {
6046 if (build_overlap_sched_groups(sd, i))
6047 goto error;
6048 } else {
6049 if (build_sched_groups(sd, i))
6050 goto error;
6051 }
6052 }
6053 }
6054
6055
6056 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6057 if (!cpumask_test_cpu(i, cpu_map))
6058 continue;
6059
6060 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6061 claim_allocations(i, sd);
6062 init_sched_groups_power(i, sd);
6063 }
6064 }
6065
6066
6067 rcu_read_lock();
6068 for_each_cpu(i, cpu_map) {
6069 sd = *per_cpu_ptr(d.sd, i);
6070 cpu_attach_domain(sd, d.rd, i);
6071 }
6072 rcu_read_unlock();
6073
6074 ret = 0;
6075error:
6076 __free_domain_allocs(&d, alloc_state, cpu_map);
6077 return ret;
6078}
6079
6080static cpumask_var_t *doms_cur;
6081static int ndoms_cur;
6082static struct sched_domain_attr *dattr_cur;
6083
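/*
 * Fallback domain span, used when allocating doms_cur fails in
 * init_sched_domains() or when partition_sched_domains() is asked to fall
 * back to a single domain (doms_new == NULL): the scheduler then keeps one
 * domain covering all active, non-isolated CPUs.
 */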
6090static cpumask_var_t fallback_doms;
6091
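/*
 * Weak default for architectures that do not rebuild their CPU topology at
 * run time; a non-zero return tells partition_sched_domains() that the
 * topology changed and the existing domains cannot be reused.
 */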
6097int __attribute__((weak)) arch_update_cpu_topology(void)
6098{
6099 return 0;
6100}
6101
6102cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6103{
6104 int i;
6105 cpumask_var_t *doms;
6106
6107 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6108 if (!doms)
6109 return NULL;
6110 for (i = 0; i < ndoms; i++) {
6111 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6112 free_sched_domains(doms, i);
6113 return NULL;
6114 }
6115 }
6116 return doms;
6117}
6118
6119void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6120{
6121 unsigned int i;
6122 for (i = 0; i < ndoms; i++)
6123 free_cpumask_var(doms[i]);
6124 kfree(doms);
6125}
6126
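/*
 * Set up the boot-time scheduler domains for the CPUs in cpu_map,
 * excluding isolated CPUs.  Called from sched_init_smp() with hotplug
 * disabled and sched_domains_mutex held.
 */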
6132static int init_sched_domains(const struct cpumask *cpu_map)
6133{
6134 int err;
6135
6136 arch_update_cpu_topology();
6137 ndoms_cur = 1;
6138 doms_cur = alloc_sched_domains(ndoms_cur);
6139 if (!doms_cur)
6140 doms_cur = &fallback_doms;
6141 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6142 err = build_sched_domains(doms_cur[0], NULL);
6143 register_sched_domain_sysctl();
6144
6145 return err;
6146}
6147
6148
6149
6150
6151
6152static void detach_destroy_domains(const struct cpumask *cpu_map)
6153{
6154 int i;
6155
6156 rcu_read_lock();
6157 for_each_cpu(i, cpu_map)
6158 cpu_attach_domain(NULL, &def_root_domain, i);
6159 rcu_read_unlock();
6160}
6161
6162
6163static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6164 struct sched_domain_attr *new, int idx_new)
6165{
6166 struct sched_domain_attr tmp;
6167
6168
6169 if (!new && !cur)
6170 return 1;
6171
6172 tmp = SD_ATTR_INIT;
6173 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6174 new ? (new + idx_new) : &tmp,
6175 sizeof(struct sched_domain_attr));
6176}
6177
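/*
 * Switch to a new sched-domain partitioning: doms_new[] holds ndoms_new
 * CPU spans, with optional attributes in dattr_new.  Current domains whose
 * span and attributes match a new entry are kept (unless the CPU topology
 * itself changed); every other current domain is detached and destroyed,
 * and new spans that do not already exist are built.  doms_new == NULL
 * requests a single fallback domain over all active, non-isolated CPUs.
 * doms_cur/dattr_cur/ndoms_cur are updated under sched_domains_mutex.
 */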
6204void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6205 struct sched_domain_attr *dattr_new)
6206{
6207 int i, j, n;
6208 int new_topology;
6209
6210 mutex_lock(&sched_domains_mutex);
6211
6212
6213 unregister_sched_domain_sysctl();
6214
6215
6216 new_topology = arch_update_cpu_topology();
6217
6218 n = doms_new ? ndoms_new : 0;
6219
6220
6221 for (i = 0; i < ndoms_cur; i++) {
6222 for (j = 0; j < n && !new_topology; j++) {
6223 if (cpumask_equal(doms_cur[i], doms_new[j])
6224 && dattrs_equal(dattr_cur, i, dattr_new, j))
6225 goto match1;
6226 }
6227
6228 detach_destroy_domains(doms_cur[i]);
6229match1:
6230 ;
6231 }
6232
6233 n = ndoms_cur;
6234 if (doms_new == NULL) {
6235 n = 0;
6236 doms_new = &fallback_doms;
6237 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6238 WARN_ON_ONCE(dattr_new);
6239 }
6240
6241
6242 for (i = 0; i < ndoms_new; i++) {
6243 for (j = 0; j < n && !new_topology; j++) {
6244 if (cpumask_equal(doms_new[i], doms_cur[j])
6245 && dattrs_equal(dattr_new, i, dattr_cur, j))
6246 goto match2;
6247 }
6248
6249 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6250match2:
6251 ;
6252 }
6253
6254
6255 if (doms_cur != &fallback_doms)
6256 free_sched_domains(doms_cur, ndoms_cur);
6257 kfree(dattr_cur);
6258 doms_cur = doms_new;
6259 dattr_cur = dattr_new;
6260 ndoms_cur = ndoms_new;
6261
6262 register_sched_domain_sysctl();
6263
6264 mutex_unlock(&sched_domains_mutex);
6265}
6266
6267static int num_cpus_frozen;
6268
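/*
 * CPU hotplug notifier: rebuild the scheduler domains when a CPU comes
 * online (or a failed offline is rolled back).  During resume from suspend
 * the cpuset-aware domains are only restored once the last frozen CPU is
 * back; until then a single default domain is used.
 */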
6277static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6278 void *hcpu)
6279{
6280 switch (action) {
6281 case CPU_ONLINE_FROZEN:
6282 case CPU_DOWN_FAILED_FROZEN:
6283
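		/*
		 * Resume path: num_cpus_frozen counts the CPUs taken down for
		 * suspend.  While some of them are still offline, keep the
		 * single default domain rather than asking cpusets to rebuild
		 * after every CPU.
		 */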
6290 num_cpus_frozen--;
6291 if (likely(num_cpus_frozen)) {
6292 partition_sched_domains(1, NULL, NULL);
6293 break;
6294 }
6295
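		/*
		 * Last CPU of the resume sequence: fall through and let
		 * cpusets restore the full domain hierarchy.
		 */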
6302 case CPU_ONLINE:
6303 case CPU_DOWN_FAILED:
6304 cpuset_update_active_cpus(true);
6305 break;
6306 default:
6307 return NOTIFY_DONE;
6308 }
6309 return NOTIFY_OK;
6310}
6311
6312static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6313 void *hcpu)
6314{
6315 switch (action) {
6316 case CPU_DOWN_PREPARE:
6317 cpuset_update_active_cpus(false);
6318 break;
6319 case CPU_DOWN_PREPARE_FROZEN:
6320 num_cpus_frozen++;
6321 partition_sched_domains(1, NULL, NULL);
6322 break;
6323 default:
6324 return NOTIFY_DONE;
6325 }
6326 return NOTIFY_OK;
6327}
6328
6329void __init sched_init_smp(void)
6330{
6331 cpumask_var_t non_isolated_cpus;
6332
6333 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6334 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6335
6336 sched_init_numa();
6337
6338 get_online_cpus();
6339 mutex_lock(&sched_domains_mutex);
6340 init_sched_domains(cpu_active_mask);
6341 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6342 if (cpumask_empty(non_isolated_cpus))
6343 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6344 mutex_unlock(&sched_domains_mutex);
6345 put_online_cpus();
6346
6347 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6348 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6349 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6350
6351 init_hrtick();
6352
6353
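	/* Move the init task off any isolated CPUs. */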
6354 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6355 BUG();
6356 sched_init_granularity();
6357 free_cpumask_var(non_isolated_cpus);
6358
6359 init_sched_rt_class();
6360}
6361#else
6362void __init sched_init_smp(void)
6363{
6364 sched_init_granularity();
6365}
6366#endif
6367
6368const_debug unsigned int sysctl_timer_migration = 1;
6369
6370int in_sched_functions(unsigned long addr)
6371{
6372 return in_lock_functions(addr) ||
6373 (addr >= (unsigned long)__sched_text_start
6374 && addr < (unsigned long)__sched_text_end);
6375}
6376
6377#ifdef CONFIG_CGROUP_SCHED
6378
6379
6380
6381
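/*
 * Default task group: tasks belong to it until they are moved to another
 * cgroup.
 */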
6382struct task_group root_task_group;
6383LIST_HEAD(task_groups);
6384#endif
6385
6386DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6387
6388void __init sched_init(void)
6389{
6390 int i, j;
6391 unsigned long alloc_size = 0, ptr;
6392
6393#ifdef CONFIG_FAIR_GROUP_SCHED
6394 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6395#endif
6396#ifdef CONFIG_RT_GROUP_SCHED
6397 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6398#endif
6399#ifdef CONFIG_CPUMASK_OFFSTACK
6400 alloc_size += num_possible_cpus() * cpumask_size();
6401#endif
6402 if (alloc_size) {
6403 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6404
6405#ifdef CONFIG_FAIR_GROUP_SCHED
6406 root_task_group.se = (struct sched_entity **)ptr;
6407 ptr += nr_cpu_ids * sizeof(void **);
6408
6409 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6410 ptr += nr_cpu_ids * sizeof(void **);
6411
6412#endif
6413#ifdef CONFIG_RT_GROUP_SCHED
6414 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6415 ptr += nr_cpu_ids * sizeof(void **);
6416
6417 root_task_group.rt_rq = (struct rt_rq **)ptr;
6418 ptr += nr_cpu_ids * sizeof(void **);
6419
6420#endif
6421#ifdef CONFIG_CPUMASK_OFFSTACK
6422 for_each_possible_cpu(i) {
6423 per_cpu(load_balance_mask, i) = (void *)ptr;
6424 ptr += cpumask_size();
6425 }
6426#endif
6427 }
6428
6429#ifdef CONFIG_SMP
6430 init_defrootdomain();
6431#endif
6432
6433 init_rt_bandwidth(&def_rt_bandwidth,
6434 global_rt_period(), global_rt_runtime());
6435
6436#ifdef CONFIG_RT_GROUP_SCHED
6437 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6438 global_rt_period(), global_rt_runtime());
6439#endif
6440
6441#ifdef CONFIG_CGROUP_SCHED
6442 list_add(&root_task_group.list, &task_groups);
6443 INIT_LIST_HEAD(&root_task_group.children);
6444 INIT_LIST_HEAD(&root_task_group.siblings);
6445 autogroup_init(&init_task);
6446
6447#endif
6448
6449 for_each_possible_cpu(i) {
6450 struct rq *rq;
6451
6452 rq = cpu_rq(i);
6453 raw_spin_lock_init(&rq->lock);
6454 rq->nr_running = 0;
6455 rq->calc_load_active = 0;
6456 rq->calc_load_update = jiffies + LOAD_FREQ;
6457 init_cfs_rq(&rq->cfs);
6458 init_rt_rq(&rq->rt, rq);
6459#ifdef CONFIG_FAIR_GROUP_SCHED
6460 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6461 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6462
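		/*
		 * root_task_group's shares were set to ROOT_TASK_GROUP_LOAD
		 * above; its cfs_rq on each runqueue is the top-level one and
		 * so has no owning sched_entity or parent (hence the NULL
		 * arguments to init_tg_cfs_entry()).
		 */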
6481 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6482 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6483#endif
6484
6485 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6486#ifdef CONFIG_RT_GROUP_SCHED
6487 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6488 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6489#endif
6490
6491 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6492 rq->cpu_load[j] = 0;
6493
6494 rq->last_load_update_tick = jiffies;
6495
6496#ifdef CONFIG_SMP
6497 rq->sd = NULL;
6498 rq->rd = NULL;
6499 rq->cpu_power = SCHED_POWER_SCALE;
6500 rq->post_schedule = 0;
6501 rq->active_balance = 0;
6502 rq->next_balance = jiffies;
6503 rq->push_cpu = 0;
6504 rq->cpu = i;
6505 rq->online = 0;
6506 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost;
6508
6509 INIT_LIST_HEAD(&rq->cfs_tasks);
6510
6511 rq_attach_root(rq, &def_root_domain);
6512#ifdef CONFIG_NO_HZ_COMMON
6513 rq->nohz_flags = 0;
6514#endif
6515#ifdef CONFIG_NO_HZ_FULL
6516 rq->last_sched_tick = 0;
6517#endif
6518#endif
6519 init_rq_hrtick(rq);
6520 atomic_set(&rq->nr_iowait, 0);
6521 }
6522
6523 set_load_weight(&init_task);
6524
6525#ifdef CONFIG_PREEMPT_NOTIFIERS
6526 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6527#endif
6528
6529#ifdef CONFIG_RT_MUTEXES
6530 plist_head_init(&init_task.pi_waiters);
6531#endif
6532
6533
6534
6535
6536 atomic_inc(&init_mm.mm_count);
6537 enter_lazy_tlb(&init_mm, current);
6538
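	/*
	 * Make the currently running boot thread this CPU's idle task; it
	 * simply resumes whenever the runqueue has nothing else to run.
	 */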
6545 init_idle(current, smp_processor_id());
6546
6547 calc_load_update = jiffies + LOAD_FREQ;
6548
6549
6550
6551
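	/*
	 * During early bootup the boot/idle thread pretends to be a normal
	 * fair-class task.
	 */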
6552 current->sched_class = &fair_sched_class;
6553
6554#ifdef CONFIG_SMP
6555 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6556
6557 if (cpu_isolated_map == NULL)
6558 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6559 idle_thread_set_boot_cpu();
6560#endif
6561 init_sched_fair_class();
6562
6563 scheduler_running = 1;
6564}
6565
6566#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6567static inline int preempt_count_equals(int preempt_offset)
6568{
6569 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6570
6571 return (nested == preempt_offset);
6572}
6573
6574void __might_sleep(const char *file, int line, int preempt_offset)
6575{
6576 static unsigned long prev_jiffy;
6577
6578 rcu_sleep_check();
6579 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
6580 system_state != SYSTEM_RUNNING || oops_in_progress)
6581 return;
6582 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6583 return;
6584 prev_jiffy = jiffies;
6585
6586 printk(KERN_ERR
6587 "BUG: sleeping function called from invalid context at %s:%d\n",
6588 file, line);
6589 printk(KERN_ERR
6590 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6591 in_atomic(), irqs_disabled(),
6592 current->pid, current->comm);
6593
6594 debug_show_held_locks(current);
6595 if (irqs_disabled())
6596 print_irqtrace_events(current);
6597 dump_stack();
6598}
6599EXPORT_SYMBOL(__might_sleep);
6600#endif
6601
6602#ifdef CONFIG_MAGIC_SYSRQ
6603static void normalize_task(struct rq *rq, struct task_struct *p)
6604{
6605 const struct sched_class *prev_class = p->sched_class;
6606 int old_prio = p->prio;
6607 int on_rq;
6608
6609 on_rq = p->on_rq;
6610 if (on_rq)
6611 dequeue_task(rq, p, 0);
6612 __setscheduler(rq, p, SCHED_NORMAL, 0);
6613 if (on_rq) {
6614 enqueue_task(rq, p, 0);
6615 resched_task(rq->curr);
6616 }
6617
6618 check_class_changed(rq, p, prev_class, old_prio);
6619}
6620
6621void normalize_rt_tasks(void)
6622{
6623 struct task_struct *g, *p;
6624 unsigned long flags;
6625 struct rq *rq;
6626
6627 read_lock_irqsave(&tasklist_lock, flags);
6628 do_each_thread(g, p) {
6629
6630
6631
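		/*
		 * Only normalize user tasks; kernel threads (no mm) are left
		 * alone.
		 */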
6632 if (!p->mm)
6633 continue;
6634
6635 p->se.exec_start = 0;
6636#ifdef CONFIG_SCHEDSTATS
6637 p->se.statistics.wait_start = 0;
6638 p->se.statistics.sleep_start = 0;
6639 p->se.statistics.block_start = 0;
6640#endif
6641
6642 if (!rt_task(p)) {
6643
6644
6645
6646
6647 if (TASK_NICE(p) < 0 && p->mm)
6648 set_user_nice(p, 0);
6649 continue;
6650 }
6651
6652 raw_spin_lock(&p->pi_lock);
6653 rq = __task_rq_lock(p);
6654
6655 normalize_task(rq, p);
6656
6657 __task_rq_unlock(rq);
6658 raw_spin_unlock(&p->pi_lock);
6659 } while_each_thread(g, p);
6660
6661 read_unlock_irqrestore(&tasklist_lock, flags);
6662}
6663
6664#endif
6665
6666#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
6667
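/*
 * Return cpu's current task.  Only built for ia64 and kgdb/kdb (see the
 * #if above); not for general kernel use.
 */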
6685struct task_struct *curr_task(int cpu)
6686{
6687 return cpu_curr(cpu);
6688}
6689
6690#endif
6691
6692#ifdef CONFIG_IA64
6693
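/*
 * Set cpu's current task to p by writing the runqueue's curr pointer
 * directly, bypassing the scheduler.  ia64-only; nothing else about the
 * runqueue is updated, so ordinary kernel code must never call this.
 */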
6708void set_curr_task(int cpu, struct task_struct *p)
6709{
6710 cpu_curr(cpu) = p;
6711}
6712
6713#endif
6714
6715#ifdef CONFIG_CGROUP_SCHED
6716
6717static DEFINE_SPINLOCK(task_group_lock);
6718
6719static void free_sched_group(struct task_group *tg)
6720{
6721 free_fair_sched_group(tg);
6722 free_rt_sched_group(tg);
6723 autogroup_free(tg);
6724 kfree(tg);
6725}
6726
6727
6728struct task_group *sched_create_group(struct task_group *parent)
6729{
6730 struct task_group *tg;
6731
6732 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
6733 if (!tg)
6734 return ERR_PTR(-ENOMEM);
6735
6736 if (!alloc_fair_sched_group(tg, parent))
6737 goto err;
6738
6739 if (!alloc_rt_sched_group(tg, parent))
6740 goto err;
6741
6742 return tg;
6743
6744err:
6745 free_sched_group(tg);
6746 return ERR_PTR(-ENOMEM);
6747}
6748
6749void sched_online_group(struct task_group *tg, struct task_group *parent)
6750{
6751 unsigned long flags;
6752
6753 spin_lock_irqsave(&task_group_lock, flags);
6754 list_add_rcu(&tg->list, &task_groups);
6755
6756 WARN_ON(!parent);
6757
6758 tg->parent = parent;
6759 INIT_LIST_HEAD(&tg->children);
6760 list_add_rcu(&tg->siblings, &parent->children);
6761 spin_unlock_irqrestore(&task_group_lock, flags);
6762}
6763
6764
6765static void free_sched_group_rcu(struct rcu_head *rhp)
6766{
6767
6768 free_sched_group(container_of(rhp, struct task_group, rcu));
6769}
6770
6771
6772void sched_destroy_group(struct task_group *tg)
6773{
6774
6775 call_rcu(&tg->rcu, free_sched_group_rcu);
6776}
6777
6778void sched_offline_group(struct task_group *tg)
6779{
6780 unsigned long flags;
6781 int i;
6782
6783
6784 for_each_possible_cpu(i)
6785 unregister_fair_sched_group(tg, i);
6786
6787 spin_lock_irqsave(&task_group_lock, flags);
6788 list_del_rcu(&tg->list);
6789 list_del_rcu(&tg->siblings);
6790 spin_unlock_irqrestore(&task_group_lock, flags);
6791}
6792
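/*
 * Move tsk to the task group of its current cgroup (or autogroup): the
 * task is dequeued and, if currently running, put as the previous task;
 * its sched_task_group and per-CPU runqueue pointers are switched; then it
 * is restored as the current task and/or re-enqueued.
 */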
6798void sched_move_task(struct task_struct *tsk)
6799{
6800 struct task_group *tg;
6801 int on_rq, running;
6802 unsigned long flags;
6803 struct rq *rq;
6804
6805 rq = task_rq_lock(tsk, &flags);
6806
6807 running = task_current(rq, tsk);
6808 on_rq = tsk->on_rq;
6809
6810 if (on_rq)
6811 dequeue_task(rq, tsk, 0);
6812 if (unlikely(running))
6813 tsk->sched_class->put_prev_task(rq, tsk);
6814
6815 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6816 lockdep_is_held(&tsk->sighand->siglock)),
6817 struct task_group, css);
6818 tg = autogroup_task_group(tsk, tg);
6819 tsk->sched_task_group = tg;
6820
6821#ifdef CONFIG_FAIR_GROUP_SCHED
6822 if (tsk->sched_class->task_move_group)
6823 tsk->sched_class->task_move_group(tsk, on_rq);
6824 else
6825#endif
6826 set_task_rq(tsk, task_cpu(tsk));
6827
6828 if (unlikely(running))
6829 tsk->sched_class->set_curr_task(rq);
6830 if (on_rq)
6831 enqueue_task(rq, tsk, 0);
6832
6833 task_rq_unlock(rq, tsk, &flags);
6834}
6835#endif
6836
6837#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6838static unsigned long to_ratio(u64 period, u64 runtime)
6839{
6840 if (runtime == RUNTIME_INF)
6841 return 1ULL << 20;
6842
6843 return div64_u64(runtime << 20, period);
6844}
6845#endif
6846
6847#ifdef CONFIG_RT_GROUP_SCHED
6848
6849
6850
6851static DEFINE_MUTEX(rt_constraints_mutex);
6852
6853
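/*
 * Does this task group still contain any real-time tasks?  Must be called
 * with tasklist_lock held; the callers below take it before walking the
 * thread list.
 */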
6854static inline int tg_has_rt_tasks(struct task_group *tg)
6855{
6856 struct task_struct *g, *p;
6857
6858 do_each_thread(g, p) {
6859 if (rt_task(p) && task_rq(p)->rt.tg == tg)
6860 return 1;
6861 } while_each_thread(g, p);
6862
6863 return 0;
6864}
6865
6866struct rt_schedulable_data {
6867 struct task_group *tg;
6868 u64 rt_period;
6869 u64 rt_runtime;
6870};
6871
6872static int tg_rt_schedulable(struct task_group *tg, void *data)
6873{
6874 struct rt_schedulable_data *d = data;
6875 struct task_group *child;
6876 unsigned long total, sum = 0;
6877 u64 period, runtime;
6878
6879 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6880 runtime = tg->rt_bandwidth.rt_runtime;
6881
6882 if (tg == d->tg) {
6883 period = d->rt_period;
6884 runtime = d->rt_runtime;
6885 }
6886
6887
6888
6889
6890 if (runtime > period && runtime != RUNTIME_INF)
6891 return -EINVAL;
6892
6893
6894
6895
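	/*
	 * Refuse to zero the runtime of a group that still has RT tasks;
	 * they would never be able to run again.
	 */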
6896 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
6897 return -EBUSY;
6898
6899 total = to_ratio(period, runtime);
6900
6901
6902
6903
6904 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
6905 return -EINVAL;
6906
6907
6908
6909
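	/*
	 * The sum of the children's bandwidth ratios must not exceed this
	 * group's own ratio.
	 */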
6910 list_for_each_entry_rcu(child, &tg->children, siblings) {
6911 period = ktime_to_ns(child->rt_bandwidth.rt_period);
6912 runtime = child->rt_bandwidth.rt_runtime;
6913
6914 if (child == d->tg) {
6915 period = d->rt_period;
6916 runtime = d->rt_runtime;
6917 }
6918
6919 sum += to_ratio(period, runtime);
6920 }
6921
6922 if (sum > total)
6923 return -EINVAL;
6924
6925 return 0;
6926}
6927
6928static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
6929{
6930 int ret;
6931
6932 struct rt_schedulable_data data = {
6933 .tg = tg,
6934 .rt_period = period,
6935 .rt_runtime = runtime,
6936 };
6937
6938 rcu_read_lock();
6939 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
6940 rcu_read_unlock();
6941
6942 return ret;
6943}
6944
6945static int tg_set_rt_bandwidth(struct task_group *tg,
6946 u64 rt_period, u64 rt_runtime)
6947{
6948 int i, err = 0;
6949
6950 mutex_lock(&rt_constraints_mutex);
6951 read_lock(&tasklist_lock);
6952 err = __rt_schedulable(tg, rt_period, rt_runtime);
6953 if (err)
6954 goto unlock;
6955
6956 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6957 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
6958 tg->rt_bandwidth.rt_runtime = rt_runtime;
6959
6960 for_each_possible_cpu(i) {
6961 struct rt_rq *rt_rq = tg->rt_rq[i];
6962
6963 raw_spin_lock(&rt_rq->rt_runtime_lock);
6964 rt_rq->rt_runtime = rt_runtime;
6965 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6966 }
6967 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6968unlock:
6969 read_unlock(&tasklist_lock);
6970 mutex_unlock(&rt_constraints_mutex);
6971
6972 return err;
6973}
6974
6975static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
6976{
6977 u64 rt_runtime, rt_period;
6978
6979 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6980 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
6981 if (rt_runtime_us < 0)
6982 rt_runtime = RUNTIME_INF;
6983
6984 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6985}
6986
6987static long sched_group_rt_runtime(struct task_group *tg)
6988{
6989 u64 rt_runtime_us;
6990
6991 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
6992 return -1;
6993
6994 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
6995 do_div(rt_runtime_us, NSEC_PER_USEC);
6996 return rt_runtime_us;
6997}
6998
6999static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7000{
7001 u64 rt_runtime, rt_period;
7002
7003 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7004 rt_runtime = tg->rt_bandwidth.rt_runtime;
7005
7006 if (rt_period == 0)
7007 return -EINVAL;
7008
7009 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7010}
7011
7012static long sched_group_rt_period(struct task_group *tg)
7013{
7014 u64 rt_period_us;
7015
7016 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7017 do_div(rt_period_us, NSEC_PER_USEC);
7018 return rt_period_us;
7019}
7020
7021static int sched_rt_global_constraints(void)
7022{
7023 u64 runtime, period;
7024 int ret = 0;
7025
7026 if (sysctl_sched_rt_period <= 0)
7027 return -EINVAL;
7028
7029 runtime = global_rt_runtime();
7030 period = global_rt_period();
7031
7032
7033
7034
7035 if (runtime > period && runtime != RUNTIME_INF)
7036 return -EINVAL;
7037
7038 mutex_lock(&rt_constraints_mutex);
7039 read_lock(&tasklist_lock);
7040 ret = __rt_schedulable(NULL, 0, 0);
7041 read_unlock(&tasklist_lock);
7042 mutex_unlock(&rt_constraints_mutex);
7043
7044 return ret;
7045}
7046
7047static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7048{
7049
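	/* Don't accept RT tasks in a group that has no RT runtime. */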
7050 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7051 return 0;
7052
7053 return 1;
7054}
7055
7056#else
7057static int sched_rt_global_constraints(void)
7058{
7059 unsigned long flags;
7060 int i;
7061
7062 if (sysctl_sched_rt_period <= 0)
7063 return -EINVAL;
7064
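	/*
	 * There are always some RT tasks in the root group (the per-cpu
	 * migration threads, for instance), so a global runtime of zero
	 * would leave them unable to run.
	 */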
7069 if (sysctl_sched_rt_runtime == 0)
7070 return -EBUSY;
7071
7072 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7073 for_each_possible_cpu(i) {
7074 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7075
7076 raw_spin_lock(&rt_rq->rt_runtime_lock);
7077 rt_rq->rt_runtime = global_rt_runtime();
7078 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7079 }
7080 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7081
7082 return 0;
7083}
7084#endif
7085
7086int sched_rr_handler(struct ctl_table *table, int write,
7087 void __user *buffer, size_t *lenp,
7088 loff_t *ppos)
7089{
7090 int ret;
7091 static DEFINE_MUTEX(mutex);
7092
7093 mutex_lock(&mutex);
7094 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7095
7096
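	/*
	 * On a successful write convert the new round-robin timeslice from
	 * milliseconds to jiffies; a zero or negative value resets it to the
	 * default RR_TIMESLICE.
	 */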
7097 if (!ret && write) {
7098 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7099 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7100 }
7101 mutex_unlock(&mutex);
7102 return ret;
7103}
7104
7105int sched_rt_handler(struct ctl_table *table, int write,
7106 void __user *buffer, size_t *lenp,
7107 loff_t *ppos)
7108{
7109 int ret;
7110 int old_period, old_runtime;
7111 static DEFINE_MUTEX(mutex);
7112
7113 mutex_lock(&mutex);
7114 old_period = sysctl_sched_rt_period;
7115 old_runtime = sysctl_sched_rt_runtime;
7116
7117 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7118
7119 if (!ret && write) {
7120 ret = sched_rt_global_constraints();
7121 if (ret) {
7122 sysctl_sched_rt_period = old_period;
7123 sysctl_sched_rt_runtime = old_runtime;
7124 } else {
7125 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7126 def_rt_bandwidth.rt_period =
7127 ns_to_ktime(global_rt_period());
7128 }
7129 }
7130 mutex_unlock(&mutex);
7131
7132 return ret;
7133}
7134
7135#ifdef CONFIG_CGROUP_SCHED
7136
7137static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7138{
7139 return css ? container_of(css, struct task_group, css) : NULL;
7140}
7141
7142static struct cgroup_subsys_state *
7143cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7144{
7145 struct task_group *parent = css_tg(parent_css);
7146 struct task_group *tg;
7147
7148 if (!parent) {
7149
7150 return &root_task_group.css;
7151 }
7152
7153 tg = sched_create_group(parent);
7154 if (IS_ERR(tg))
7155 return ERR_PTR(-ENOMEM);
7156
7157 return &tg->css;
7158}
7159
7160static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7161{
7162 struct task_group *tg = css_tg(css);
7163 struct task_group *parent = css_tg(css_parent(css));
7164
7165 if (parent)
7166 sched_online_group(tg, parent);
7167 return 0;
7168}
7169
7170static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7171{
7172 struct task_group *tg = css_tg(css);
7173
7174 sched_destroy_group(tg);
7175}
7176
7177static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7178{
7179 struct task_group *tg = css_tg(css);
7180
7181 sched_offline_group(tg);
7182}
7183
7184static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7185 struct cgroup_taskset *tset)
7186{
7187 struct task_struct *task;
7188
7189 cgroup_taskset_for_each(task, css, tset) {
7190#ifdef CONFIG_RT_GROUP_SCHED
7191 if (!sched_rt_can_attach(css_tg(css), task))
7192 return -EINVAL;
7193#else
7194
7195 if (task->sched_class != &fair_sched_class)
7196 return -EINVAL;
7197#endif
7198 }
7199 return 0;
7200}
7201
7202static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7203 struct cgroup_taskset *tset)
7204{
7205 struct task_struct *task;
7206
7207 cgroup_taskset_for_each(task, css, tset)
7208 sched_move_task(task);
7209}
7210
7211static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7212 struct cgroup_subsys_state *old_css,
7213 struct task_struct *task)
7214{
7215
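	/*
	 * cgroup_exit() is also reached from the copy_process() failure
	 * path, where the task has never run; only act on tasks that are
	 * genuinely exiting.
	 */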
7220 if (!(task->flags & PF_EXITING))
7221 return;
7222
7223 sched_move_task(task);
7224}
7225
7226#ifdef CONFIG_FAIR_GROUP_SCHED
7227static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7228 struct cftype *cftype, u64 shareval)
7229{
7230 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7231}
7232
7233static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7234 struct cftype *cft)
7235{
7236 struct task_group *tg = css_tg(css);
7237
7238 return (u64) scale_load_down(tg->shares);
7239}
7240
7241#ifdef CONFIG_CFS_BANDWIDTH
7242static DEFINE_MUTEX(cfs_constraints_mutex);
7243
7244const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7245const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
7246
7247static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7248
7249static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7250{
7251 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7252 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7253
7254 if (tg == &root_task_group)
7255 return -EINVAL;
7256
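	/*
	 * Both quota and period must be at least min_cfs_quota_period (1ms),
	 * so that a group is guaranteed some bandwidth in every period and a
	 * throttled group cannot accumulate an unbounded backlog.
	 */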
7262 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7263 return -EINVAL;
7264
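	/*
	 * Likewise cap the period at max_cfs_quota_period (1s) to keep the
	 * quota/period arithmetic in a sensible range.
	 */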
7270 if (period > max_cfs_quota_period)
7271 return -EINVAL;
7272
7273 mutex_lock(&cfs_constraints_mutex);
7274 ret = __cfs_schedulable(tg, period, quota);
7275 if (ret)
7276 goto out_unlock;
7277
7278 runtime_enabled = quota != RUNTIME_INF;
7279 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7280 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7281 raw_spin_lock_irq(&cfs_b->lock);
7282 cfs_b->period = ns_to_ktime(period);
7283 cfs_b->quota = quota;
7284
7285 __refill_cfs_bandwidth_runtime(cfs_b);
7286
7287 if (runtime_enabled && cfs_b->timer_active) {
7288
7289 cfs_b->timer_active = 0;
7290 __start_cfs_bandwidth(cfs_b);
7291 }
7292 raw_spin_unlock_irq(&cfs_b->lock);
7293
7294 for_each_possible_cpu(i) {
7295 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7296 struct rq *rq = cfs_rq->rq;
7297
7298 raw_spin_lock_irq(&rq->lock);
7299 cfs_rq->runtime_enabled = runtime_enabled;
7300 cfs_rq->runtime_remaining = 0;
7301
7302 if (cfs_rq->throttled)
7303 unthrottle_cfs_rq(cfs_rq);
7304 raw_spin_unlock_irq(&rq->lock);
7305 }
7306out_unlock:
7307 mutex_unlock(&cfs_constraints_mutex);
7308
7309 return ret;
7310}
7311
7312int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7313{
7314 u64 quota, period;
7315
7316 period = ktime_to_ns(tg->cfs_bandwidth.period);
7317 if (cfs_quota_us < 0)
7318 quota = RUNTIME_INF;
7319 else
7320 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7321
7322 return tg_set_cfs_bandwidth(tg, period, quota);
7323}
7324
7325long tg_get_cfs_quota(struct task_group *tg)
7326{
7327 u64 quota_us;
7328
7329 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7330 return -1;
7331
7332 quota_us = tg->cfs_bandwidth.quota;
7333 do_div(quota_us, NSEC_PER_USEC);
7334
7335 return quota_us;
7336}
7337
7338int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7339{
7340 u64 quota, period;
7341
7342 period = (u64)cfs_period_us * NSEC_PER_USEC;
7343 quota = tg->cfs_bandwidth.quota;
7344
7345 return tg_set_cfs_bandwidth(tg, period, quota);
7346}
7347
7348long tg_get_cfs_period(struct task_group *tg)
7349{
7350 u64 cfs_period_us;
7351
7352 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7353 do_div(cfs_period_us, NSEC_PER_USEC);
7354
7355 return cfs_period_us;
7356}
7357
7358static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7359 struct cftype *cft)
7360{
7361 return tg_get_cfs_quota(css_tg(css));
7362}
7363
7364static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7365 struct cftype *cftype, s64 cfs_quota_us)
7366{
7367 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7368}
7369
7370static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7371 struct cftype *cft)
7372{
7373 return tg_get_cfs_period(css_tg(css));
7374}
7375
7376static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7377 struct cftype *cftype, u64 cfs_period_us)
7378{
7379 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7380}
7381
7382struct cfs_schedulable_data {
7383 struct task_group *tg;
7384 u64 period, quota;
7385};
7386
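/*
 * Express a group's bandwidth as a quota/period ratio, using the values
 * being written for the group under change and the current settings for
 * every other group; an unlimited quota maps to RUNTIME_INF.
 */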
7391static u64 normalize_cfs_quota(struct task_group *tg,
7392 struct cfs_schedulable_data *d)
7393{
7394 u64 quota, period;
7395
7396 if (tg == d->tg) {
7397 period = d->period;
7398 quota = d->quota;
7399 } else {
7400 period = tg_get_cfs_period(tg);
7401 quota = tg_get_cfs_quota(tg);
7402 }
7403
7404
7405 if (quota == RUNTIME_INF || quota == -1)
7406 return RUNTIME_INF;
7407
7408 return to_ratio(period, quota);
7409}
7410
7411static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7412{
7413 struct cfs_schedulable_data *d = data;
7414 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7415 s64 quota = 0, parent_quota = -1;
7416
7417 if (!tg->parent) {
7418 quota = RUNTIME_INF;
7419 } else {
7420 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7421
7422 quota = normalize_cfs_quota(tg, d);
7423 parent_quota = parent_b->hierarchal_quota;
7424
7425
7426
7427
7428
7429 if (quota == RUNTIME_INF)
7430 quota = parent_quota;
7431 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7432 return -EINVAL;
7433 }
7434 cfs_b->hierarchal_quota = quota;
7435
7436 return 0;
7437}
7438
7439static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7440{
7441 int ret;
7442 struct cfs_schedulable_data data = {
7443 .tg = tg,
7444 .period = period,
7445 .quota = quota,
7446 };
7447
7448 if (quota != RUNTIME_INF) {
7449 do_div(data.period, NSEC_PER_USEC);
7450 do_div(data.quota, NSEC_PER_USEC);
7451 }
7452
7453 rcu_read_lock();
7454 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7455 rcu_read_unlock();
7456
7457 return ret;
7458}
7459
7460static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7461 struct cgroup_map_cb *cb)
7462{
7463 struct task_group *tg = css_tg(css);
7464 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7465
7466 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7467 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7468 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7469
7470 return 0;
7471}
7472#endif
7473#endif
7474
7475#ifdef CONFIG_RT_GROUP_SCHED
7476static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7477 struct cftype *cft, s64 val)
7478{
7479 return sched_group_set_rt_runtime(css_tg(css), val);
7480}
7481
7482static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7483 struct cftype *cft)
7484{
7485 return sched_group_rt_runtime(css_tg(css));
7486}
7487
7488static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7489 struct cftype *cftype, u64 rt_period_us)
7490{
7491 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7492}
7493
7494static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7495 struct cftype *cft)
7496{
7497 return sched_group_rt_period(css_tg(css));
7498}
7499#endif
7500
7501static struct cftype cpu_files[] = {
7502#ifdef CONFIG_FAIR_GROUP_SCHED
7503 {
7504 .name = "shares",
7505 .read_u64 = cpu_shares_read_u64,
7506 .write_u64 = cpu_shares_write_u64,
7507 },
7508#endif
7509#ifdef CONFIG_CFS_BANDWIDTH
7510 {
7511 .name = "cfs_quota_us",
7512 .read_s64 = cpu_cfs_quota_read_s64,
7513 .write_s64 = cpu_cfs_quota_write_s64,
7514 },
7515 {
7516 .name = "cfs_period_us",
7517 .read_u64 = cpu_cfs_period_read_u64,
7518 .write_u64 = cpu_cfs_period_write_u64,
7519 },
7520 {
7521 .name = "stat",
7522 .read_map = cpu_stats_show,
7523 },
7524#endif
7525#ifdef CONFIG_RT_GROUP_SCHED
7526 {
7527 .name = "rt_runtime_us",
7528 .read_s64 = cpu_rt_runtime_read,
7529 .write_s64 = cpu_rt_runtime_write,
7530 },
7531 {
7532 .name = "rt_period_us",
7533 .read_u64 = cpu_rt_period_read_uint,
7534 .write_u64 = cpu_rt_period_write_uint,
7535 },
7536#endif
7537 { }
7538};
7539
7540struct cgroup_subsys cpu_cgroup_subsys = {
7541 .name = "cpu",
7542 .css_alloc = cpu_cgroup_css_alloc,
7543 .css_free = cpu_cgroup_css_free,
7544 .css_online = cpu_cgroup_css_online,
7545 .css_offline = cpu_cgroup_css_offline,
7546 .can_attach = cpu_cgroup_can_attach,
7547 .attach = cpu_cgroup_attach,
7548 .exit = cpu_cgroup_exit,
7549 .subsys_id = cpu_cgroup_subsys_id,
7550 .base_cftypes = cpu_files,
7551 .early_init = 1,
7552};
7553
7554#endif
7555
7556void dump_cpu_task(int cpu)
7557{
7558 pr_info("Task dump for CPU %d:\n", cpu);
7559 sched_show_task(cpu_curr(cpu));
7560}
7561