/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

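/*
 * Start a bandwidth period timer: keep forwarding it by @period until
 * it is active, preserving the range between its soft and hard expiry.
 */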
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

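/*
 * Advance rq->clock by the time that elapsed on this CPU's sched_clock,
 * unless a pending clock update was requested to be skipped.
 */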
void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

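/*
 * Debugging: various feature bits
 */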
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

static void sched_feat_disable(int i)
{
	if (static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_dec(&sched_feat_keys[i]);
}

static void sched_feat_enable(int i)
{
	if (!static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */

static int sched_feat_set(char *cmp)
{
	int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg) {
				sysctl_sched_features &= ~(1UL << i);
				sched_feat_disable(i);
			} else {
				sysctl_sched_features |= (1UL << i);
				sched_feat_enable(i);
			}
			break;
		}
	}

	return i;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		 size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	i = sched_feat_set(cmp);
	if (i == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */

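/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */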
const_debug unsigned int sysctl_sched_nr_migrate = 32;

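/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */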
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

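/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */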
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

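/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */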
int sysctl_sched_rt_runtime = 950000;

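/*
 * __task_rq_lock - lock the rq @p resides on.
 */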
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

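/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */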
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
	}
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

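/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */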
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK

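/*
 * Use HR-timers to deliver accurate preemption points.
 */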
static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

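/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */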
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static int __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = hrtimer_get_softexpires(timer);

	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
}

static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

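/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */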
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else

void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SCHED_HRTICK */

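/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */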
void resched_task(struct task_struct *p)
{
	int cpu;

	lockdep_assert_held(&task_rq(p)->lock);

	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id()) {
		set_preempt_need_resched();
		return;
	}

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON

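/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 */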
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}

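/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */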
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			smp_send_reschedule(cpu);
		return true;
	}

	return false;
}

void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * can cancel nohz.idle_balance and reset nohz_flags
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
	struct rq *rq;

	rq = this_rq();

	/* Make sure rq->nr_running update is visible after the IPI */
	smp_rmb();

	/* More than one running task need preemption */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))

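/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */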
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compile should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);
		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio)
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}

#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_preempt_count(p) & PREEMPT_ACTIVE));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
	}

	__set_task_cpu(p, new_cpu);
}

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (p->on_rq) {
		struct rq *src_rq, *dst_rq;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);
	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous cpu our target instead of where it really is.
		 */
		p->wake_cpu = cpu;
	}
}

struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;
	int ret = -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	double_raw_lock(&arg->src_task->pi_lock,
			&arg->dst_task->pi_lock);
	double_rq_lock(src_rq, dst_rq);
	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		goto unlock;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		goto unlock;

	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
		goto unlock;

	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
		goto unlock;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	ret = 0;

unlock:
	double_rq_unlock(src_rq, dst_rq);
	raw_spin_unlock(&arg->dst_task->pi_lock);
	raw_spin_unlock(&arg->src_task->pi_lock);

	return ret;
}

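/*
 * Cross migrate two tasks
 */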
int migrate_swap(struct task_struct *cur, struct task_struct *p)
{
	struct migration_swap_arg arg;
	int ret = -EINVAL;

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = task_cpu(cur),
		.dst_task = p,
		.dst_cpu = task_cpu(p),
	};

	if (arg.src_cpu == arg.dst_cpu)
		goto out;

	/*
	 * These three tests are all lockless; this is OK since all of them
	 * will be re-checked with proper locks held further down the line.
	 */
	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
		goto out;

	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
		goto out;

	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
		goto out;

	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
	return ret;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

static int migration_cpu_stop(void *data);

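/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */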
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		on_rq = p->on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(on_rq)) {
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}

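/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */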
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */

#ifdef CONFIG_SMP

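/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */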
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the cpu is on has been offlined, cpu_to_node()
	 * will return -1. There is no cpu on the node, and we should
	 * select the cpu on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			goto out;
		}

		switch (state) {
		case cpuset:
			/* No more Mr. Nice Guy. */
			cpuset_cpus_allowed_fallback(p);
			state = possible;
			break;

		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_sched("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * cpu.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}
#endif

static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = 1;

	/* if a worker is waking up, notify workqueue */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	check_preempt_curr(rq, p, wake_flags);
	trace_sched_wakeup(p, true);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->idle_stamp = 0;
	}
#endif
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	if (p->on_rq) {
		/* check_preempt_curr() may use rq clock */
		update_rq_clock(rq);
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}

#ifdef CONFIG_SMP
static void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;

	raw_spin_lock(&rq->lock);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	raw_spin_unlock(&rq->lock);
}

void scheduler_ipi(void)
{
	/*
	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
	 * TIF_NEED_RESCHED remotely (for the first time) will also send
	 * this IPI.
	 */
	if (tif_need_resched())
		set_preempt_need_resched();

	if (llist_empty(&this_rq()->wake_list)
			&& !tick_nohz_full_cpu(smp_processor_id())
			&& !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	tick_nohz_full_check();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	raw_spin_unlock(&rq->lock);
}

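/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running.
 * or @state didn't match @p's state.
 */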
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	/*
	 * If we are going to wake up a thread waiting for CONDITION we
	 * need to ensure that CONDITION=1 done by the caller can not be
	 * reordered with p->state check below. This pairs with mb() in
	 * set_current_state() the waiting thread does.
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);

	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until its done referencing the task.
	 */
	while (p->on_cpu)
		cpu_relax();
	/*
	 * Pairs with the smp_wmb() in finish_lock_switch().
	 */
	smp_rmb();

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

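/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */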
static void try_to_wake_up_local(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	if (WARN_ON_ONCE(rq != this_rq()) ||
	    WARN_ON_ONCE(p == current))
		return;

	lockdep_assert_held(&rq->lock);

	if (!raw_spin_trylock(&p->pi_lock)) {
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
	}

	if (!(p->state & TASK_NORMAL))
		goto out;

	if (!p->on_rq)
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);

	ttwu_do_wakeup(rq, p, 0);
	ttwu_stat(p, smp_processor_id(), 0);
out:
	raw_spin_unlock(&p->pi_lock);
}

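/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */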
int wake_up_process(struct task_struct *p)
{
	WARN_ON(task_is_stopped_or_traced(p));
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);

int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

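/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */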
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	p->on_rq = 0;

	p->se.on_rq = 0;
	p->se.exec_start = 0;
	p->se.sum_exec_runtime = 0;
	p->se.prev_sum_exec_runtime = 0;
	p->se.nr_migrations = 0;
	p->se.vruntime = 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	INIT_LIST_HEAD(&p->rt.run_list);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
		p->mm->numa_scan_seq = 0;
	}

	if (clone_flags & CLONE_VM)
		p->numa_preferred_nid = current->numa_preferred_nid;
	else
		p->numa_preferred_nid = -1;

	p->node_stamp = 0ULL;
	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
	p->numa_faults = NULL;
	p->numa_faults_buffer = NULL;

	INIT_LIST_HEAD(&p->numa_entry);
	p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}

#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_SCHED_DEBUG
void set_numabalancing_state(bool enabled)
{
	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_NUMA_BALANCING */

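/*
 * fork()/clone()-time setup:
 */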
void sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long flags;
	int cpu = get_cpu();

	__sched_fork(clone_flags, p);
	/*
	 * We mark the process as running here. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		if (task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p);

		/*
		 * We don't need the reset flag anymore after the fork. It has
		 * fulfilled its duty:
		 */
		p->sched_reset_on_fork = 0;
	}

	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

	/*
	 * The child is not yet in the pid-hash so no cgroup attach races,
	 * and the cgroup is pinned to this child due to cgroup_fork()
	 * is ran before sched_fork().
	 *
	 * Silence PROVE_RCU.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	set_task_cpu(p, cpu);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
	init_task_preempt_count(p);
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif

	put_cpu();
}

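/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */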
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif

	/* Initialize new task's runnable average */
	init_task_runnable_average(p);
	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = 1;
	trace_sched_wakeup_new(p, true);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}

#ifdef CONFIG_PREEMPT_NOTIFIERS

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

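/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */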
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	trace_sched_switch(prev, next);
	sched_info_switch(rq, prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

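/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */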
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 *		Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_state = prev->state;
	vtime_task_switch(prev);
	finish_arch_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_lock_switch(rq, prev);
	finish_arch_post_lock_switch();

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		task_numa_free(prev);

		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}

	tick_nohz_task_switch(current);
}

#ifdef CONFIG_SMP

/* assumes rq->lock is held */
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
}

/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
	if (rq->post_schedule) {
		unsigned long flags;

		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->curr->sched_class->post_schedule)
			rq->curr->sched_class->post_schedule(rq);
		raw_spin_unlock_irqrestore(&rq->lock, flags);

		rq->post_schedule = 0;
	}
}

#else

static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}

static inline void post_schedule(struct rq *rq)
{
}

#endif

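/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */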
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);

	/*
	 * FIXME: do we need to worry about rq being invalidated by the
	 * task_switch?
	 */
	post_schedule(rq);

#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* In this case, finish_task_switch does not reenable preemption */
	preempt_enable();
#endif
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}

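/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */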
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}

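/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */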
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

unsigned long nr_iowait_cpu(int cpu)
{
	struct rq *this = cpu_rq(cpu);
	return atomic_read(&this->nr_iowait);
}

#ifdef CONFIG_SMP

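/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */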
void sched_exec(void)
{
	struct task_struct *p = current;
	unsigned long flags;
	int dest_cpu;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
	if (dest_cpu == smp_processor_id())
		goto unlock;

	if (likely(cpu_active(dest_cpu))) {
		struct migration_arg arg = { p, dest_cpu };

		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
		return;
	}
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

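/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 *
 * Called with task_rq_lock() held on @rq.
 */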
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
	u64 ns = 0;

	if (task_current(rq, p)) {
		update_rq_clock(rq);
		ns = rq_clock_task(rq) - p->se.exec_start;
		if ((s64)ns < 0)
			ns = 0;
	}

	return ns;
}

unsigned long long task_delta_exec(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns = 0;

	rq = task_rq_lock(p, &flags);
	ns = do_task_delta_exec(p, rq);
	task_rq_unlock(rq, p, &flags);

	return ns;
}

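/*
 * Return accumulated runtime for the thread.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that have not been accounted yet.
 */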
unsigned long long task_sched_runtime(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns = 0;

#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
	/*
	 * 64-bit doesn't need locks to atomically read a 64bit value.
	 * So we have a optimization chance when the task's delta_exec is 0.
	 * Reading ->on_cpu is racy, but this is ok.
	 *
	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
	 * If we race with it entering cpu, unaccounted time is 0. This is
	 * indistinguishable from the read occurring a few cycles earlier.
	 */
	if (!p->on_cpu)
		return p->se.sum_exec_runtime;
#endif

	rq = task_rq_lock(p, &flags);
	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
	task_rq_unlock(rq, p, &flags);

	return ns;
}

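/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */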
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	curr->sched_class->task_tick(rq, curr, 0);
	update_cpu_load_active(rq);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
	rq_last_tick_reset(rq);
}

#ifdef CONFIG_NO_HZ_FULL

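/**
 * scheduler_tick_max_deferment
 *
 * Keep at least one tick per second when a single
 * active task is running because the scheduler doesn't
 * yet completely support full dynticks environment.
 *
 * This makes sure that uptime, CFS vruntime, load
 * balancing, etc... continue to move forward, even
 * with a very low granularity.
 *
 * Return: Maximum deferment in nanoseconds.
 */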
u64 scheduler_tick_max_deferment(void)
{
	struct rq *rq = this_rq();
	unsigned long next, now = ACCESS_ONCE(jiffies);

	next = rq->last_sched_tick + HZ;

	if (time_before_eq(next, now))
		return 0;

	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
}
#endif /* CONFIG_NO_HZ_FULL */

notrace unsigned long get_parent_ip(unsigned long addr)
{
	if (in_lock_functions(addr)) {
		addr = CALLER_ADDR2;
		if (in_lock_functions(addr))
			addr = CALLER_ADDR3;
	}
	return addr;
}

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
				defined(CONFIG_PREEMPT_TRACER))

void __kprobes preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(preempt_count_add);

void __kprobes preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);

#endif

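/*
 * Print scheduling while atomic bug:
 */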
static noinline void __schedule_bug(struct task_struct *prev)
{
	if (oops_in_progress)
		return;

	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		prev->comm, prev->pid, preempt_count());

	debug_show_held_locks(prev);
	print_modules();
	if (irqs_disabled())
		print_irqtrace_events(prev);
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}

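/*
 * Various schedule()-time debugging checks and statistics:
 */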
static inline void schedule_debug(struct task_struct *prev)
{
	/*
	 * Test if we are atomic. Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path. Otherwise whine
	 * if we are scheduling when we should not.
	 */
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
		__schedule_bug(prev);
	rcu_sleep_check();

	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq(), sched_count);
}

static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
	if (prev->on_rq || rq->skip_clock_update < 0)
		update_rq_clock(rq);
	prev->sched_class->put_prev_task(rq, prev);
}

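/*
 * Pick up the highest-prio task:
 */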
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
		p = fair_sched_class.pick_next_task(rq);
		if (likely(p))
			return p;
	}

	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG(); /* the idle class will always have a runnable task */
}

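/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 */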
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irq(&rq->lock);

	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
			prev->on_rq = 0;

			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
		}
		switch_count = &prev->nvcsw;
	}

	pre_schedule(rq, prev);

	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	put_prev_task(rq, prev);
	next = pick_next_task(rq);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * The context switch have flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		raw_spin_unlock_irq(&rq->lock);

	post_schedule(rq);

	sched_preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}

asmlinkage void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
}
EXPORT_SYMBOL(schedule);

#ifdef CONFIG_CONTEXT_TRACKING
asmlinkage void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to schedule(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 */
	user_exit();
	schedule();
	user_enter();
}
#endif

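/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */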
void __sched schedule_preempt_disabled(void)
{
	sched_preempt_enable_no_resched();
	schedule();
	preempt_disable();
}

#ifdef CONFIG_PREEMPT

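/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */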
asmlinkage void __sched notrace preempt_schedule(void)
{
	/*
	 * If there is a non-zero preempt_count or interrupts are disabled,
	 * we do not want to preempt the current task. Just return..
	 */
	if (likely(!preemptible()))
		return;

	do {
		__preempt_count_add(PREEMPT_ACTIVE);
		__schedule();
		__preempt_count_sub(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
#endif /* CONFIG_PREEMPT */

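/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */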
asmlinkage void __sched preempt_schedule_irq(void)
{
	enum ctx_state prev_state;

	/* Catch callers which need to be fixed */
	BUG_ON(preempt_count() || !irqs_disabled());

	prev_state = exception_enter();

	do {
		__preempt_count_add(PREEMPT_ACTIVE);
		local_irq_enable();
		__schedule();
		local_irq_disable();
		__preempt_count_sub(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());

	exception_exit(prev_state);
}

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);

static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
	unsigned long flags;
	wait_queue_t wait;

	init_waitqueue_entry(&wait, current);

	__set_current_state(state);

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, &wait);
	spin_unlock(&q->lock);
	timeout = schedule_timeout(timeout);
	spin_lock_irq(&q->lock);
	__remove_wait_queue(q, &wait);
	spin_unlock_irqrestore(&q->lock, flags);

	return timeout;
}

void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);

long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);

void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);

long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(sleep_on_timeout);

#ifdef CONFIG_RT_MUTEXES

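/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */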
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	int oldprio, on_rq, running;
	struct rq *rq;
	const struct sched_class *prev_class;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = __task_rq_lock(p);

	/*
	 * Idle task boosting is a nono in general. There is one
	 * exception, when PREEMPT_RT and NOHZ is active:
	 *
	 * The idle task calls get_next_timer_interrupt() and holds
	 * the timer wheel base->lock on the CPU and another CPU wants
	 * to access the timer (probably to cancel it). We can safely
	 * ignore the boosting request, as the idle CPU runs this code
	 * with interrupts disabled and will complete the lock
	 * protected section without being interrupted. So there is no
	 * real need to boost.
	 */
	if (unlikely(p == rq->idle)) {
		WARN_ON(p != rq->curr);
		WARN_ON(p->pi_blocked_on);
		goto out_unlock;
	}

	trace_sched_pi_setprio(p, prio);
	oldprio = p->prio;
	prev_class = p->sched_class;
	on_rq = p->on_rq;
	running = task_current(rq, p);
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq)
		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);

	check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
	__task_rq_unlock(rq);
}
#endif

void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio, delta, on_rq;
	unsigned long flags;
	struct rq *rq;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;
	/*
	 * We have to be careful, if called from sys_setpriority(),
	 * the task might be in the middle of scheduling on another CPU.
	 */
	rq = task_rq_lock(p, &flags);
	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set - but as expected
	 * it wont have any effect on scheduling until the task is
	 * SCHED_FIFO/SCHED_RR:
	 */
	if (task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	on_rq = p->on_rq;
	if (on_rq)
		dequeue_task(rq, p, 0);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p);
	old_prio = p->prio;
	p->prio = effective_prio(p);
	delta = p->prio - old_prio;

	if (on_rq) {
		enqueue_task(rq, p, 0);
		/*
		 * If the task increased its priority or is running and
		 * lowered its priority, then reschedule its CPU:
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);

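/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */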
int can_nice(const struct task_struct *p, const int nice)
{
	/* convert nice value [19,-20] to rlimit style value [1,40] */
	int nice_rlim = 20 - nice;

	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
		capable(CAP_SYS_NICE));
}

#ifdef __ARCH_WANT_SYS_NICE

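/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */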
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	if (increment < -40)
		increment = -40;
	if (increment > 40)
		increment = 40;

	nice = TASK_NICE(current) + increment;
	if (nice < -20)
		nice = -20;
	if (nice > 19)
		nice = 19;

	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	retval = security_task_setnice(current, nice);
	if (retval)
		return retval;

	set_user_nice(current, nice);
	return 0;
}

#endif

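/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 */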
int task_prio(const struct task_struct *p)
{
	return p->prio - MAX_RT_PRIO;
}

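/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Return: The nice value [ -20 ... 0 ... 19 ].
 */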
int task_nice(const struct task_struct *p)
{
	return TASK_NICE(p);
}
EXPORT_SYMBOL(task_nice);

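/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */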
int idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (rq->curr != rq->idle)
		return 0;

	if (rq->nr_running)
		return 0;

#ifdef CONFIG_SMP
	if (!llist_empty(&rq->wake_list))
		return 0;
#endif

	return 1;
}

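/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 *
 * Return: The idle task for the cpu @cpu.
 */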
struct task_struct *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}

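/**
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question.
 *
 * The task of @pid, if found. %NULL otherwise.
 */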
static struct task_struct *find_process_by_pid(pid_t pid)
{
	return pid ? find_task_by_vpid(pid) : current;
}

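/* Actually do priority change: must hold rq lock. */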
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
	p->policy = policy;
	p->rt_priority = prio;
	p->normal_prio = normal_prio(p);
	/* we are holding p->pi_lock already */
	p->prio = rt_mutex_getprio(p);
	if (rt_prio(p->prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;
	set_load_weight(p);
}

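/*
 * check the target process has a UID that matches the current process's
 */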
3010static bool check_same_owner(struct task_struct *p)
3011{
3012 const struct cred *cred = current_cred(), *pcred;
3013 bool match;
3014
3015 rcu_read_lock();
3016 pcred = __task_cred(p);
3017 match = (uid_eq(cred->euid, pcred->euid) ||
3018 uid_eq(cred->euid, pcred->uid));
3019 rcu_read_unlock();
3020 return match;
3021}
3022
3023static int __sched_setscheduler(struct task_struct *p, int policy,
3024 const struct sched_param *param, bool user)
3025{
3026 int retval, oldprio, oldpolicy = -1, on_rq, running;
3027 unsigned long flags;
3028 const struct sched_class *prev_class;
3029 struct rq *rq;
3030 int reset_on_fork;
3031
3032
3033 BUG_ON(in_interrupt());
3034recheck:
3035
3036 if (policy < 0) {
3037 reset_on_fork = p->sched_reset_on_fork;
3038 policy = oldpolicy = p->policy;
3039 } else {
3040 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3041 policy &= ~SCHED_RESET_ON_FORK;
3042
3043 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3044 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3045 policy != SCHED_IDLE)
3046 return -EINVAL;
3047 }
3048
3049
3050
3051
3052
3053
3054 if (param->sched_priority < 0 ||
3055 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3056 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3057 return -EINVAL;
3058 if (rt_policy(policy) != (param->sched_priority != 0))
3059 return -EINVAL;
3060
	/*
	 * Allow unprivileged RT tasks to decrease priority:
	 */
3064 if (user && !capable(CAP_SYS_NICE)) {
3065 if (rt_policy(policy)) {
3066 unsigned long rlim_rtprio =
3067 task_rlimit(p, RLIMIT_RTPRIO);

			/* can't set/change the rt policy */
3070 if (policy != p->policy && !rlim_rtprio)
3071 return -EPERM;

			/* can't increase priority */
3074 if (param->sched_priority > p->rt_priority &&
3075 param->sched_priority > rlim_rtprio)
3076 return -EPERM;
3077 }
3078
		/*
		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
		 */
3083 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3084 if (!can_nice(p, TASK_NICE(p)))
3085 return -EPERM;
3086 }
3087
		/* can't change other user's priorities */
3089 if (!check_same_owner(p))
3090 return -EPERM;
3091
		/* Normal users shall not reset the sched_reset_on_fork flag */
3093 if (p->sched_reset_on_fork && !reset_on_fork)
3094 return -EPERM;
3095 }
3096
3097 if (user) {
3098 retval = security_task_setscheduler(p);
3099 if (retval)
3100 return retval;
3101 }
3102
	/*
	 * make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 *
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
3110 rq = task_rq_lock(p, &flags);
3111
	/*
	 * Changing the policy of the stop threads is a very bad idea:
	 */
3115 if (p == rq->stop) {
3116 task_rq_unlock(rq, p, &flags);
3117 return -EINVAL;
3118 }
3119
	/*
	 * If not changing anything there's no need to proceed further:
	 */
3123 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3124 param->sched_priority == p->rt_priority))) {
3125 task_rq_unlock(rq, p, &flags);
3126 return 0;
3127 }
3128
3129#ifdef CONFIG_RT_GROUP_SCHED
3130 if (user) {
		/*
		 * Do not allow realtime tasks into groups that have no runtime
		 * assigned.
		 */
3135 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3136 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3137 !task_group_is_autogroup(task_group(p))) {
3138 task_rq_unlock(rq, p, &flags);
3139 return -EPERM;
3140 }
3141 }
3142#endif
3143
	/* recheck policy now with rq lock held */
3145 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3146 policy = oldpolicy = -1;
3147 task_rq_unlock(rq, p, &flags);
3148 goto recheck;
3149 }
3150 on_rq = p->on_rq;
3151 running = task_current(rq, p);
3152 if (on_rq)
3153 dequeue_task(rq, p, 0);
3154 if (running)
3155 p->sched_class->put_prev_task(rq, p);
3156
3157 p->sched_reset_on_fork = reset_on_fork;
3158
3159 oldprio = p->prio;
3160 prev_class = p->sched_class;
3161 __setscheduler(rq, p, policy, param->sched_priority);
3162
3163 if (running)
3164 p->sched_class->set_curr_task(rq);
3165 if (on_rq)
3166 enqueue_task(rq, p, 0);
3167
3168 check_class_changed(rq, p, prev_class, oldprio);
3169 task_rq_unlock(rq, p, &flags);
3170
3171 rt_mutex_adjust_pi(p);
3172
3173 return 0;
3174}
3175

/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */
3186int sched_setscheduler(struct task_struct *p, int policy,
3187 const struct sched_param *param)
3188{
3189 return __sched_setscheduler(p, policy, param, true);
3190}
3191EXPORT_SYMBOL_GPL(sched_setscheduler);
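
/*
 * Illustrative in-kernel usage (a sketch added for exposition, not
 * part of the original file): make a kernel thread run as SCHED_FIFO
 * priority 50.
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(task, SCHED_FIFO, &sp))
 *		pr_warn("could not switch task to SCHED_FIFO\n");
 */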
3192

/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission. For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller is the stop_machine which doesn't have the
 * appropriate permissions.
 *
 * Return: 0 on success. An error code otherwise.
 */
3206int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3207 const struct sched_param *param)
3208{
3209 return __sched_setscheduler(p, policy, param, false);
3210}
3211
3212static int
3213do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3214{
3215 struct sched_param lparam;
3216 struct task_struct *p;
3217 int retval;
3218
3219 if (!param || pid < 0)
3220 return -EINVAL;
3221 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3222 return -EFAULT;
3223
3224 rcu_read_lock();
3225 retval = -ESRCH;
3226 p = find_process_by_pid(pid);
3227 if (p != NULL)
3228 retval = sched_setscheduler(p, policy, &lparam);
3229 rcu_read_unlock();
3230
3231 return retval;
3232}
3233

/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
3242SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3243 struct sched_param __user *, param)
3244{
	/* negative values for policy are not valid */
3246 if (policy < 0)
3247 return -EINVAL;
3248
3249 return do_sched_setscheduler(pid, policy, param);
3250}
3251
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
3259SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3260{
3261 return do_sched_setscheduler(pid, -1, param);
3262}
3263
/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative error
 * code.
 */
3271SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3272{
3273 struct task_struct *p;
3274 int retval;
3275
3276 if (pid < 0)
3277 return -EINVAL;
3278
3279 retval = -ESRCH;
3280 rcu_read_lock();
3281 p = find_process_by_pid(pid);
3282 if (p) {
3283 retval = security_task_getscheduler(p);
3284 if (!retval)
3285 retval = p->policy
3286 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3287 }
3288 rcu_read_unlock();
3289 return retval;
3290}
3291
/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
 * code.
 */
3300SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3301{
3302 struct sched_param lp;
3303 struct task_struct *p;
3304 int retval;
3305
3306 if (!param || pid < 0)
3307 return -EINVAL;
3308
3309 rcu_read_lock();
3310 p = find_process_by_pid(pid);
3311 retval = -ESRCH;
3312 if (!p)
3313 goto out_unlock;
3314
3315 retval = security_task_getscheduler(p);
3316 if (retval)
3317 goto out_unlock;
3318
3319 lp.sched_priority = p->rt_priority;
3320 rcu_read_unlock();

	/*
	 * This one might sleep, we cannot do it with a spinlock held ...
	 */
3325 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3326
3327 return retval;
3328
3329out_unlock:
3330 rcu_read_unlock();
3331 return retval;
3332}
3333
3334long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3335{
3336 cpumask_var_t cpus_allowed, new_mask;
3337 struct task_struct *p;
3338 int retval;
3339
3340 rcu_read_lock();
3341
3342 p = find_process_by_pid(pid);
3343 if (!p) {
3344 rcu_read_unlock();
3345 return -ESRCH;
3346 }
3347
	/* Prevent p going away */
3349 get_task_struct(p);
3350 rcu_read_unlock();
3351
3352 if (p->flags & PF_NO_SETAFFINITY) {
3353 retval = -EINVAL;
3354 goto out_put_task;
3355 }
3356 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
3357 retval = -ENOMEM;
3358 goto out_put_task;
3359 }
3360 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
3361 retval = -ENOMEM;
3362 goto out_free_cpus_allowed;
3363 }
3364 retval = -EPERM;
3365 if (!check_same_owner(p)) {
3366 rcu_read_lock();
3367 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3368 rcu_read_unlock();
3369 goto out_unlock;
3370 }
3371 rcu_read_unlock();
3372 }
3373
3374 retval = security_task_setscheduler(p);
3375 if (retval)
3376 goto out_unlock;
3377
3378 cpuset_cpus_allowed(p, cpus_allowed);
3379 cpumask_and(new_mask, in_mask, cpus_allowed);
3380again:
3381 retval = set_cpus_allowed_ptr(p, new_mask);
3382
3383 if (!retval) {
3384 cpuset_cpus_allowed(p, cpus_allowed);
3385 if (!cpumask_subset(new_mask, cpus_allowed)) {
			/*
			 * We must have raced with a concurrent cpuset
			 * update. Just reset the cpus_allowed to the
			 * cpuset's cpus_allowed
			 */
3391 cpumask_copy(new_mask, cpus_allowed);
3392 goto again;
3393 }
3394 }
3395out_unlock:
3396 free_cpumask_var(new_mask);
3397out_free_cpus_allowed:
3398 free_cpumask_var(cpus_allowed);
3399out_put_task:
3400 put_task_struct(p);
3401 return retval;
3402}
3403
3404static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3405 struct cpumask *new_mask)
3406{
3407 if (len < cpumask_size())
3408 cpumask_clear(new_mask);
3409 else if (len > cpumask_size())
3410 len = cpumask_size();
3411
3412 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3413}
3414
/**
 * sys_sched_setaffinity - set the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new cpu mask
 *
 * Return: 0 on success. An error code otherwise.
 */
3423SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3424 unsigned long __user *, user_mask_ptr)
3425{
3426 cpumask_var_t new_mask;
3427 int retval;
3428
3429 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3430 return -ENOMEM;
3431
3432 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
3433 if (retval == 0)
3434 retval = sched_setaffinity(pid, new_mask);
3435 free_cpumask_var(new_mask);
3436 return retval;
3437}
3438
3439long sched_getaffinity(pid_t pid, struct cpumask *mask)
3440{
3441 struct task_struct *p;
3442 unsigned long flags;
3443 int retval;
3444
3445 rcu_read_lock();
3446
3447 retval = -ESRCH;
3448 p = find_process_by_pid(pid);
3449 if (!p)
3450 goto out_unlock;
3451
3452 retval = security_task_getscheduler(p);
3453 if (retval)
3454 goto out_unlock;
3455
3456 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3458 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3459
3460out_unlock:
3461 rcu_read_unlock();
3462
3463 return retval;
3464}
3465
/**
 * sys_sched_getaffinity - get the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current cpu mask
 *
 * Return: 0 on success. An error code otherwise.
 */
3474SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3475 unsigned long __user *, user_mask_ptr)
3476{
3477 int ret;
3478 cpumask_var_t mask;
3479
3480 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
3481 return -EINVAL;
3482 if (len & (sizeof(unsigned long)-1))
3483 return -EINVAL;
3484
3485 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
3486 return -ENOMEM;
3487
3488 ret = sched_getaffinity(pid, mask);
3489 if (ret == 0) {
3490 size_t retlen = min_t(size_t, len, cpumask_size());
3491
3492 if (copy_to_user(user_mask_ptr, mask, retlen))
3493 ret = -EFAULT;
3494 else
3495 ret = retlen;
3496 }
3497 free_cpumask_var(mask);
3498
3499 return ret;
3500}
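
/*
 * Note (added commentary): on success the raw syscall returns the
 * number of bytes copied into the user mask (retlen above), not 0;
 * the glibc sched_getaffinity() wrapper hides this. For example, on
 * a 64-bit kernel built with NR_CPUS == 64, a call with len == 128
 * returns min(128, cpumask_size()) == 8.
 */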
3501
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */
3510SYSCALL_DEFINE0(sched_yield)
3511{
3512 struct rq *rq = this_rq_lock();
3513
3514 schedstat_inc(rq, yld_count);
3515 current->sched_class->yield_task(rq);

	/*
	 * Since we are going to call schedule() anyway, there's
	 * no need to preempt or enable interrupts:
	 */
3521 __release(rq->lock);
3522 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3523 do_raw_spin_unlock(&rq->lock);
3524 sched_preempt_enable_no_resched();
3525
3526 schedule();
3527
3528 return 0;
3529}
3530
3531static void __cond_resched(void)
3532{
3533 __preempt_count_add(PREEMPT_ACTIVE);
3534 __schedule();
3535 __preempt_count_sub(PREEMPT_ACTIVE);
3536}
3537
3538int __sched _cond_resched(void)
3539{
3540 if (should_resched()) {
3541 __cond_resched();
3542 return 1;
3543 }
3544 return 0;
3545}
3546EXPORT_SYMBOL(_cond_resched);
3547
/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 */
3556int __cond_resched_lock(spinlock_t *lock)
3557{
3558 int resched = should_resched();
3559 int ret = 0;
3560
3561 lockdep_assert_held(lock);
3562
3563 if (spin_needbreak(lock) || resched) {
3564 spin_unlock(lock);
3565 if (resched)
3566 __cond_resched();
3567 else
3568 cpu_relax();
3569 ret = 1;
3570 spin_lock(lock);
3571 }
3572 return ret;
3573}
3574EXPORT_SYMBOL(__cond_resched_lock);
3575
3576int __sched __cond_resched_softirq(void)
3577{
3578 BUG_ON(!in_softirq());
3579
3580 if (should_resched()) {
3581 local_bh_enable();
3582 __cond_resched();
3583 local_bh_disable();
3584 return 1;
3585 }
3586 return 0;
3587}
3588EXPORT_SYMBOL(__cond_resched_softirq);
3589
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run, if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 * wait_for_completion(&event);
 * yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
3612void __sched yield(void)
3613{
3614 set_current_state(TASK_RUNNING);
3615 sys_sched_yield();
3616}
3617EXPORT_SYMBOL(yield);
3618
/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */
3634bool __sched yield_to(struct task_struct *p, bool preempt)
3635{
3636 struct task_struct *curr = current;
3637 struct rq *rq, *p_rq;
3638 unsigned long flags;
3639 int yielded = 0;
3640
3641 local_irq_save(flags);
3642 rq = this_rq();
3643
3644again:
3645 p_rq = task_rq(p);

	/*
	 * If we're the only runnable task on the rq and target rq also
	 * has only one task, there's absolutely no point in yielding.
	 */
3650 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
3651 yielded = -ESRCH;
3652 goto out_irq;
3653 }
3654
3655 double_rq_lock(rq, p_rq);
3656 while (task_rq(p) != p_rq) {
3657 double_rq_unlock(rq, p_rq);
3658 goto again;
3659 }
3660
3661 if (!curr->sched_class->yield_to_task)
3662 goto out_unlock;
3663
3664 if (curr->sched_class != p->sched_class)
3665 goto out_unlock;
3666
3667 if (task_running(p_rq, p) || p->state)
3668 goto out_unlock;
3669
3670 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
3671 if (yielded) {
3672 schedstat_inc(rq, yld_count);
		/*
		 * Make p's CPU reschedule; pick_next_entity takes care of
		 * fairness.
		 */
3677 if (preempt && rq != p_rq)
3678 resched_task(p_rq->curr);
3679 }
3680
3681out_unlock:
3682 double_rq_unlock(rq, p_rq);
3683out_irq:
3684 local_irq_restore(flags);
3685
3686 if (yielded > 0)
3687 schedule();
3688
3689 return yielded;
3690}
3691EXPORT_SYMBOL_GPL(yield_to);
3692
/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */
3697void __sched io_schedule(void)
3698{
3699 struct rq *rq = raw_rq();
3700
3701 delayacct_blkio_start();
3702 atomic_inc(&rq->nr_iowait);
3703 blk_flush_plug(current);
3704 current->in_iowait = 1;
3705 schedule();
3706 current->in_iowait = 0;
3707 atomic_dec(&rq->nr_iowait);
3708 delayacct_blkio_end();
3709}
3710EXPORT_SYMBOL(io_schedule);
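
/*
 * Typical usage (an illustrative sketch, not from the original file):
 * a caller waiting on block I/O marks itself uninterruptible first, so
 * the iowait accounting above sees a sleeping, I/O-bound task:
 *
 *	set_current_state(TASK_UNINTERRUPTIBLE);
 *	io_schedule();
 */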
3711
3712long __sched io_schedule_timeout(long timeout)
3713{
3714 struct rq *rq = raw_rq();
3715 long ret;
3716
3717 delayacct_blkio_start();
3718 atomic_inc(&rq->nr_iowait);
3719 blk_flush_plug(current);
3720 current->in_iowait = 1;
3721 ret = schedule_timeout(timeout);
3722 current->in_iowait = 0;
3723 atomic_dec(&rq->nr_iowait);
3724 delayacct_blkio_end();
3725 return ret;
3726}
3727
/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
3736SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3737{
3738 int ret = -EINVAL;
3739
3740 switch (policy) {
3741 case SCHED_FIFO:
3742 case SCHED_RR:
3743 ret = MAX_USER_RT_PRIO-1;
3744 break;
3745 case SCHED_NORMAL:
3746 case SCHED_BATCH:
3747 case SCHED_IDLE:
3748 ret = 0;
3749 break;
3750 }
3751 return ret;
3752}
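
/*
 * Example (added commentary): with MAX_USER_RT_PRIO == 100 this
 * returns 99 for SCHED_FIFO and SCHED_RR and 0 for the fair policies,
 * while sched_get_priority_min() below returns 1 and 0 respectively.
 */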
3753
/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
3762SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
3763{
3764 int ret = -EINVAL;
3765
3766 switch (policy) {
3767 case SCHED_FIFO:
3768 case SCHED_RR:
3769 ret = 1;
3770 break;
3771 case SCHED_NORMAL:
3772 case SCHED_BATCH:
3773 case SCHED_IDLE:
3774 ret = 0;
3775 }
3776 return ret;
3777}
3778
/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * This syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
 * an error code.
 */
3790SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
3791 struct timespec __user *, interval)
3792{
3793 struct task_struct *p;
3794 unsigned int time_slice;
3795 unsigned long flags;
3796 struct rq *rq;
3797 int retval;
3798 struct timespec t;
3799
3800 if (pid < 0)
3801 return -EINVAL;
3802
3803 retval = -ESRCH;
3804 rcu_read_lock();
3805 p = find_process_by_pid(pid);
3806 if (!p)
3807 goto out_unlock;
3808
3809 retval = security_task_getscheduler(p);
3810 if (retval)
3811 goto out_unlock;
3812
3813 rq = task_rq_lock(p, &flags);
3814 time_slice = p->sched_class->get_rr_interval(rq, p);
3815 task_rq_unlock(rq, p, &flags);
3816
3817 rcu_read_unlock();
3818 jiffies_to_timespec(time_slice, &t);
3819 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3820 return retval;
3821
3822out_unlock:
3823 rcu_read_unlock();
3824 return retval;
3825}
3826
3827static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
3828
3829void sched_show_task(struct task_struct *p)
3830{
3831 unsigned long free = 0;
3832 int ppid;
3833 unsigned state;
3834
3835 state = p->state ? __ffs(p->state) + 1 : 0;
3836 printk(KERN_INFO "%-15.15s %c", p->comm,
3837 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
3838#if BITS_PER_LONG == 32
3839 if (state == TASK_RUNNING)
3840 printk(KERN_CONT " running ");
3841 else
3842 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
3843#else
3844 if (state == TASK_RUNNING)
3845 printk(KERN_CONT " running task ");
3846 else
3847 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
3848#endif
3849#ifdef CONFIG_DEBUG_STACK_USAGE
3850 free = stack_not_used(p);
3851#endif
3852 rcu_read_lock();
3853 ppid = task_pid_nr(rcu_dereference(p->real_parent));
3854 rcu_read_unlock();
3855 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
3856 task_pid_nr(p), ppid,
3857 (unsigned long)task_thread_info(p)->flags);
3858
3859 print_worker_info(KERN_INFO, p);
3860 show_stack(p, NULL);
3861}
3862
3863void show_state_filter(unsigned long state_filter)
3864{
3865 struct task_struct *g, *p;
3866
#if BITS_PER_LONG == 32
	printk(KERN_INFO
		"  task                PC stack   pid father\n");
#else
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
#endif
3874 rcu_read_lock();
3875 do_each_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
		 * console might take a lot of time:
		 */
3880 touch_nmi_watchdog();
3881 if (!state_filter || (p->state & state_filter))
3882 sched_show_task(p);
3883 } while_each_thread(g, p);
3884
3885 touch_all_softlockup_watchdogs();
3886
3887#ifdef CONFIG_SCHED_DEBUG
3888 sysrq_sched_debug_show();
3889#endif
3890 rcu_read_unlock();

	/*
	 * Only show locks if all tasks are dumped:
	 */
3894 if (!state_filter)
3895 debug_show_all_locks();
3896}
3897
3898void init_idle_bootup_task(struct task_struct *idle)
3899{
3900 idle->sched_class = &idle_sched_class;
3901}
3902
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
3911void init_idle(struct task_struct *idle, int cpu)
3912{
3913 struct rq *rq = cpu_rq(cpu);
3914 unsigned long flags;
3915
3916 raw_spin_lock_irqsave(&rq->lock, flags);
3917
3918 __sched_fork(0, idle);
3919 idle->state = TASK_RUNNING;
3920 idle->se.exec_start = sched_clock();
3921
3922 do_set_cpus_allowed(idle, cpumask_of(cpu));
3923
	/*
	 * We're having a chicken and egg problem, even though we are
	 * holding rq->lock, the cpu isn't yet set to this cpu so the
	 * lockdep check in task_group() will fail.
	 *
	 * Similar case to sched_fork(). / Alternatively we could
	 * use task_rq_lock() here and obtain the other rq->lock.
	 *
	 * Silence PROVE_RCU
	 */
3933 rcu_read_lock();
3934 __set_task_cpu(idle, cpu);
3935 rcu_read_unlock();
3936
3937 rq->curr = rq->idle = idle;
3938#if defined(CONFIG_SMP)
3939 idle->on_cpu = 1;
3940#endif
3941 raw_spin_unlock_irqrestore(&rq->lock, flags);
3942
	/* Set the preempt count _outside_ the spinlocks! */
3944 init_idle_preempt_count(idle, cpu);
3945
	/*
	 * The idle tasks have their own, simple scheduling class:
	 */
3949 idle->sched_class = &idle_sched_class;
3950 ftrace_graph_init_idle_task(idle, cpu);
3951 vtime_init_idle(idle, cpu);
3952#if defined(CONFIG_SMP)
3953 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
3954#endif
3955}
3956
3957#ifdef CONFIG_SMP
3958void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
3959{
3960 if (p->sched_class && p->sched_class->set_cpus_allowed)
3961 p->sched_class->set_cpus_allowed(p, new_mask);
3962
3963 cpumask_copy(&p->cpus_allowed, new_mask);
3964 p->nr_cpus_allowed = cpumask_weight(new_mask);
3965}
3966

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 *
 * Migration itself is done by invoking migration_cpu_stop() on the
 * task's current CPU via stop_one_cpu(): the stopper runs with
 * interrupts disabled and moves the task to the destination runqueue
 * under both runqueue locks (see __migrate_task() below).
 */
3990int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
3991{
3992 unsigned long flags;
3993 struct rq *rq;
3994 unsigned int dest_cpu;
3995 int ret = 0;
3996
3997 rq = task_rq_lock(p, &flags);
3998
3999 if (cpumask_equal(&p->cpus_allowed, new_mask))
4000 goto out;
4001
4002 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4003 ret = -EINVAL;
4004 goto out;
4005 }
4006
4007 do_set_cpus_allowed(p, new_mask);
4008
	/* Can the task run on the task's current CPU? If so, we're done */
4010 if (cpumask_test_cpu(task_cpu(p), new_mask))
4011 goto out;
4012
4013 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4014 if (p->on_rq) {
4015 struct migration_arg arg = { p, dest_cpu };
		/* Need help from migration thread: drop lock and wait. */
4017 task_rq_unlock(rq, p, &flags);
4018 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4019 tlb_migrate_finish(p->mm);
4020 return 0;
4021 }
4022out:
4023 task_rq_unlock(rq, p, &flags);
4024
4025 return ret;
4026}
4027EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
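
/*
 * Illustrative caller (a sketch added for exposition, not from the
 * original file): pin a kernel thread to a single CPU.
 *
 *	if (set_cpus_allowed_ptr(task, cpumask_of(cpu)))
 *		pr_warn("cpu%d is not in the active mask\n", cpu);
 */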
4028

/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if task was successfully migrated.
 */
4040static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4041{
4042 struct rq *rq_dest, *rq_src;
4043 int ret = 0;
4044
4045 if (unlikely(!cpu_active(dest_cpu)))
4046 return ret;
4047
4048 rq_src = cpu_rq(src_cpu);
4049 rq_dest = cpu_rq(dest_cpu);
4050
4051 raw_spin_lock(&p->pi_lock);
4052 double_rq_lock(rq_src, rq_dest);
4053
4054 if (task_cpu(p) != src_cpu)
4055 goto done;
4056
4057 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4058 goto fail;
4059
	/*
	 * If we're not on a rq, the next wake-up will ensure we're
	 * placed properly.
	 */
4064 if (p->on_rq) {
4065 dequeue_task(rq_src, p, 0);
4066 set_task_cpu(p, dest_cpu);
4067 enqueue_task(rq_dest, p, 0);
4068 check_preempt_curr(rq_dest, p, 0);
4069 }
4070done:
4071 ret = 1;
4072fail:
4073 double_rq_unlock(rq_src, rq_dest);
4074 raw_spin_unlock(&p->pi_lock);
4075 return ret;
4076}
4077
4078#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
4080int migrate_task_to(struct task_struct *p, int target_cpu)
4081{
4082 struct migration_arg arg = { p, target_cpu };
4083 int curr_cpu = task_cpu(p);
4084
4085 if (curr_cpu == target_cpu)
4086 return 0;
4087
4088 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4089 return -EINVAL;
4090
	/* TODO: This is not properly updating schedstats */
4092
4093 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4094}
4095
/*
 * Requeue a task on a given node and accurately track the number of NUMA
 * tasks on the runqueues
 */
4100void sched_setnuma(struct task_struct *p, int nid)
4101{
4102 struct rq *rq;
4103 unsigned long flags;
4104 bool on_rq, running;
4105
4106 rq = task_rq_lock(p, &flags);
4107 on_rq = p->on_rq;
4108 running = task_current(rq, p);
4109
4110 if (on_rq)
4111 dequeue_task(rq, p, 0);
4112 if (running)
4113 p->sched_class->put_prev_task(rq, p);
4114
4115 p->numa_preferred_nid = nid;
4116
4117 if (running)
4118 p->sched_class->set_curr_task(rq);
4119 if (on_rq)
4120 enqueue_task(rq, p, 0);
4121 task_rq_unlock(rq, p, &flags);
4122}
4123#endif
4124
/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
4130static int migration_cpu_stop(void *data)
4131{
4132 struct migration_arg *arg = data;
4133
	/*
	 * The original target cpu might have gone down and we might
	 * be on another cpu but it doesn't matter.
	 */
4138 local_irq_disable();
4139 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4140 local_irq_enable();
4141 return 0;
4142}
4143
4144#ifdef CONFIG_HOTPLUG_CPU

/*
 * Ensures that the idle task is using init_mm right before its cpu goes
 * offline.
 */
4150void idle_task_exit(void)
4151{
4152 struct mm_struct *mm = current->active_mm;
4153
4154 BUG_ON(cpu_online(smp_processor_id()));
4155
4156 if (mm != &init_mm)
4157 switch_mm(mm, &init_mm, current);
4158 mmdrop(mm);
4159}
4160
/*
 * Since this CPU is going 'away' for a while, fold any nr_active delta
 * we might have. Assumes we read the load value just before going down.
 *
 * Also see the comment "Global load-average calculations".
 */
4168static void calc_load_migrate(struct rq *rq)
4169{
4170 long delta = calc_load_fold_active(rq);
4171 if (delta)
4172 atomic_long_add(delta, &calc_load_tasks);
4173}
4174
/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
 */
4183static void migrate_tasks(unsigned int dead_cpu)
4184{
4185 struct rq *rq = cpu_rq(dead_cpu);
4186 struct task_struct *next, *stop = rq->stop;
4187 int dest_cpu;
4188
	/*
	 * Fudge the rq selection such that the below task selection loop
	 * doesn't get stuck on the currently eligible stop task.
	 *
	 * We're currently inside stop_machine() and the rq is either stuck
	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
	 * either way we should never end up calling schedule() until we're
	 * done here.
	 */
4198 rq->stop = NULL;
4199
	/*
	 * put_prev_task() and pick_next_task() sched
	 * class method both need to have an up-to-date
	 * value of rq->clock[_task]
	 */
4205 update_rq_clock(rq);
4206
4207 for ( ; ; ) {
		/*
		 * There's this thread running, bail when that's the only
		 * remaining thread.
		 */
4212 if (rq->nr_running == 1)
4213 break;
4214
4215 next = pick_next_task(rq);
4216 BUG_ON(!next);
4217 next->sched_class->put_prev_task(rq, next);
4218
		/* Find suitable destination for @next, with force if needed. */
4220 dest_cpu = select_fallback_rq(dead_cpu, next);
4221 raw_spin_unlock(&rq->lock);
4222
4223 __migrate_task(next, dead_cpu, dest_cpu);
4224
4225 raw_spin_lock(&rq->lock);
4226 }
4227
4228 rq->stop = stop;
4229}
4230
4231#endif
4232
4233#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4234
4235static struct ctl_table sd_ctl_dir[] = {
4236 {
4237 .procname = "sched_domain",
4238 .mode = 0555,
4239 },
4240 {}
4241};
4242
4243static struct ctl_table sd_ctl_root[] = {
4244 {
4245 .procname = "kernel",
4246 .mode = 0555,
4247 .child = sd_ctl_dir,
4248 },
4249 {}
4250};
4251
4252static struct ctl_table *sd_alloc_ctl_entry(int n)
4253{
4254 struct ctl_table *entry =
4255 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4256
4257 return entry;
4258}
4259
4260static void sd_free_ctl_entry(struct ctl_table **tablep)
4261{
4262 struct ctl_table *entry;
4263
	/*
	 * In the intermediate directories, both the child directory and
	 * procname are dynamically allocated and could fail but the mode
	 * will always be set. In the lowest directory the names are
	 * static strings and all have proc handlers.
	 */
4270 for (entry = *tablep; entry->mode; entry++) {
4271 if (entry->child)
4272 sd_free_ctl_entry(&entry->child);
4273 if (entry->proc_handler == NULL)
4274 kfree(entry->procname);
4275 }
4276
4277 kfree(*tablep);
4278 *tablep = NULL;
4279}
4280
4281static int min_load_idx = 0;
4282static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4283
4284static void
4285set_table_entry(struct ctl_table *entry,
4286 const char *procname, void *data, int maxlen,
4287 umode_t mode, proc_handler *proc_handler,
4288 bool load_idx)
4289{
4290 entry->procname = procname;
4291 entry->data = data;
4292 entry->maxlen = maxlen;
4293 entry->mode = mode;
4294 entry->proc_handler = proc_handler;
4295
4296 if (load_idx) {
4297 entry->extra1 = &min_load_idx;
4298 entry->extra2 = &max_load_idx;
4299 }
4300}
4301
4302static struct ctl_table *
4303sd_alloc_ctl_domain_table(struct sched_domain *sd)
4304{
4305 struct ctl_table *table = sd_alloc_ctl_entry(13);
4306
4307 if (table == NULL)
4308 return NULL;
4309
4310 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4311 sizeof(long), 0644, proc_doulongvec_minmax, false);
4312 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4313 sizeof(long), 0644, proc_doulongvec_minmax, false);
4314 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4315 sizeof(int), 0644, proc_dointvec_minmax, true);
4316 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4317 sizeof(int), 0644, proc_dointvec_minmax, true);
4318 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4319 sizeof(int), 0644, proc_dointvec_minmax, true);
4320 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4321 sizeof(int), 0644, proc_dointvec_minmax, true);
4322 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4323 sizeof(int), 0644, proc_dointvec_minmax, true);
4324 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4325 sizeof(int), 0644, proc_dointvec_minmax, false);
4326 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4327 sizeof(int), 0644, proc_dointvec_minmax, false);
4328 set_table_entry(&table[9], "cache_nice_tries",
4329 &sd->cache_nice_tries,
4330 sizeof(int), 0644, proc_dointvec_minmax, false);
4331 set_table_entry(&table[10], "flags", &sd->flags,
4332 sizeof(int), 0644, proc_dointvec_minmax, false);
4333 set_table_entry(&table[11], "name", sd->name,
4334 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
	/* &table[12] is terminator */

4337 return table;
4338}
4339
4340static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4341{
4342 struct ctl_table *entry, *table;
4343 struct sched_domain *sd;
4344 int domain_num = 0, i;
4345 char buf[32];
4346
4347 for_each_domain(cpu, sd)
4348 domain_num++;
4349 entry = table = sd_alloc_ctl_entry(domain_num + 1);
4350 if (table == NULL)
4351 return NULL;
4352
4353 i = 0;
4354 for_each_domain(cpu, sd) {
4355 snprintf(buf, 32, "domain%d", i);
4356 entry->procname = kstrdup(buf, GFP_KERNEL);
4357 entry->mode = 0555;
4358 entry->child = sd_alloc_ctl_domain_table(sd);
4359 entry++;
4360 i++;
4361 }
4362 return table;
4363}
4364
4365static struct ctl_table_header *sd_sysctl_header;
4366static void register_sched_domain_sysctl(void)
4367{
4368 int i, cpu_num = num_possible_cpus();
4369 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4370 char buf[32];
4371
4372 WARN_ON(sd_ctl_dir[0].child);
4373 sd_ctl_dir[0].child = entry;
4374
4375 if (entry == NULL)
4376 return;
4377
4378 for_each_possible_cpu(i) {
4379 snprintf(buf, 32, "cpu%d", i);
4380 entry->procname = kstrdup(buf, GFP_KERNEL);
4381 entry->mode = 0555;
4382 entry->child = sd_alloc_ctl_cpu_table(i);
4383 entry++;
4384 }
4385
4386 WARN_ON(sd_sysctl_header);
4387 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4388}
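
/*
 * Added commentary: the tables built above appear under
 * /proc/sys/kernel/sched_domain/, one directory per CPU and one per
 * domain level, e.g.:
 *
 *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
 */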
4389
/* may be called multiple times per register */
4391static void unregister_sched_domain_sysctl(void)
4392{
4393 if (sd_sysctl_header)
4394 unregister_sysctl_table(sd_sysctl_header);
4395 sd_sysctl_header = NULL;
4396 if (sd_ctl_dir[0].child)
4397 sd_free_ctl_entry(&sd_ctl_dir[0].child);
4398}
4399#else
4400static void register_sched_domain_sysctl(void)
4401{
4402}
4403static void unregister_sched_domain_sysctl(void)
4404{
4405}
4406#endif
4407
4408static void set_rq_online(struct rq *rq)
4409{
4410 if (!rq->online) {
4411 const struct sched_class *class;
4412
4413 cpumask_set_cpu(rq->cpu, rq->rd->online);
4414 rq->online = 1;
4415
4416 for_each_class(class) {
4417 if (class->rq_online)
4418 class->rq_online(rq);
4419 }
4420 }
4421}
4422
4423static void set_rq_offline(struct rq *rq)
4424{
4425 if (rq->online) {
4426 const struct sched_class *class;
4427
4428 for_each_class(class) {
4429 if (class->rq_offline)
4430 class->rq_offline(rq);
4431 }
4432
4433 cpumask_clear_cpu(rq->cpu, rq->rd->online);
4434 rq->online = 0;
4435 }
4436}
4437
/*
 * migration_call - callback that gets triggered when a CPU is added.
 * Here we can start up the necessary migration thread for the new CPU.
 */
4442static int
4443migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4444{
4445 int cpu = (long)hcpu;
4446 unsigned long flags;
4447 struct rq *rq = cpu_rq(cpu);
4448
4449 switch (action & ~CPU_TASKS_FROZEN) {
4450
4451 case CPU_UP_PREPARE:
4452 rq->calc_load_update = calc_load_update;
4453 break;
4454
4455 case CPU_ONLINE:
		/* Update our root-domain */
4457 raw_spin_lock_irqsave(&rq->lock, flags);
4458 if (rq->rd) {
4459 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
4460
4461 set_rq_online(rq);
4462 }
4463 raw_spin_unlock_irqrestore(&rq->lock, flags);
4464 break;
4465
4466#ifdef CONFIG_HOTPLUG_CPU
4467 case CPU_DYING:
4468 sched_ttwu_pending();
		/* Update our root-domain */
4470 raw_spin_lock_irqsave(&rq->lock, flags);
4471 if (rq->rd) {
4472 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
4473 set_rq_offline(rq);
4474 }
4475 migrate_tasks(cpu);
4476 BUG_ON(rq->nr_running != 1);
4477 raw_spin_unlock_irqrestore(&rq->lock, flags);
4478 break;
4479
4480 case CPU_DEAD:
4481 calc_load_migrate(rq);
4482 break;
4483#endif
4484 }
4485
4486 update_max_interval();
4487
4488 return NOTIFY_OK;
4489}
4490
/*
 * Register at high priority so that task migration (migrate_all_tasks)
 * happens before everything else. This has to be lower priority than
 * the notifier in the perf_event subsystem, though.
 */
4496static struct notifier_block migration_notifier = {
4497 .notifier_call = migration_call,
4498 .priority = CPU_PRI_MIGRATION,
4499};
4500
4501static int sched_cpu_active(struct notifier_block *nfb,
4502 unsigned long action, void *hcpu)
4503{
4504 switch (action & ~CPU_TASKS_FROZEN) {
4505 case CPU_STARTING:
4506 case CPU_DOWN_FAILED:
4507 set_cpu_active((long)hcpu, true);
4508 return NOTIFY_OK;
4509 default:
4510 return NOTIFY_DONE;
4511 }
4512}
4513
4514static int sched_cpu_inactive(struct notifier_block *nfb,
4515 unsigned long action, void *hcpu)
4516{
4517 switch (action & ~CPU_TASKS_FROZEN) {
4518 case CPU_DOWN_PREPARE:
4519 set_cpu_active((long)hcpu, false);
4520 return NOTIFY_OK;
4521 default:
4522 return NOTIFY_DONE;
4523 }
4524}
4525
4526static int __init migration_init(void)
4527{
4528 void *cpu = (void *)(long)smp_processor_id();
4529 int err;
4530
	/* Initialize migration for the boot CPU */
4532 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4533 BUG_ON(err == NOTIFY_BAD);
4534 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4535 register_cpu_notifier(&migration_notifier);
4536
	/* Register cpu active notifiers */
4538 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
4539 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
4540
4541 return 0;
4542}
4543early_initcall(migration_init);
4544#endif
4545
4546#ifdef CONFIG_SMP
4547
4548static cpumask_var_t sched_domains_tmpmask;
4549
4550#ifdef CONFIG_SCHED_DEBUG
4551
4552static __read_mostly int sched_debug_enabled;
4553
4554static int __init sched_debug_setup(char *str)
4555{
4556 sched_debug_enabled = 1;
4557
4558 return 0;
4559}
4560early_param("sched_debug", sched_debug_setup);
4561
4562static inline bool sched_debug(void)
4563{
4564 return sched_debug_enabled;
4565}
4566
4567static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
4568 struct cpumask *groupmask)
4569{
4570 struct sched_group *group = sd->groups;
4571 char str[256];
4572
4573 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
4574 cpumask_clear(groupmask);
4575
4576 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
4577
4578 if (!(sd->flags & SD_LOAD_BALANCE)) {
4579 printk("does not load-balance\n");
4580 if (sd->parent)
4581 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
4582 " has parent");
4583 return -1;
4584 }
4585
4586 printk(KERN_CONT "span %s level %s\n", str, sd->name);
4587
4588 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
4589 printk(KERN_ERR "ERROR: domain->span does not contain "
4590 "CPU%d\n", cpu);
4591 }
4592 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
4593 printk(KERN_ERR "ERROR: domain->groups does not contain"
4594 " CPU%d\n", cpu);
4595 }
4596
4597 printk(KERN_DEBUG "%*s groups:", level + 1, "");
4598 do {
4599 if (!group) {
4600 printk("\n");
4601 printk(KERN_ERR "ERROR: group is NULL\n");
4602 break;
4603 }
4604
		/*
		 * Even though we initialize ->power to something semi-sane,
		 * we leave power_orig unset. This allows us to detect if
		 * domain iteration is still funny without causing /0 traps.
		 */
4610 if (!group->sgp->power_orig) {
4611 printk(KERN_CONT "\n");
4612 printk(KERN_ERR "ERROR: domain->cpu_power not "
4613 "set\n");
4614 break;
4615 }
4616
4617 if (!cpumask_weight(sched_group_cpus(group))) {
4618 printk(KERN_CONT "\n");
4619 printk(KERN_ERR "ERROR: empty group\n");
4620 break;
4621 }
4622
4623 if (!(sd->flags & SD_OVERLAP) &&
4624 cpumask_intersects(groupmask, sched_group_cpus(group))) {
4625 printk(KERN_CONT "\n");
4626 printk(KERN_ERR "ERROR: repeated CPUs\n");
4627 break;
4628 }
4629
4630 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
4631
4632 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
4633
4634 printk(KERN_CONT " %s", str);
4635 if (group->sgp->power != SCHED_POWER_SCALE) {
4636 printk(KERN_CONT " (cpu_power = %d)",
4637 group->sgp->power);
4638 }
4639
4640 group = group->next;
4641 } while (group != sd->groups);
4642 printk(KERN_CONT "\n");
4643
4644 if (!cpumask_equal(sched_domain_span(sd), groupmask))
4645 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4646
4647 if (sd->parent &&
4648 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
4649 printk(KERN_ERR "ERROR: parent span is not a superset "
4650 "of domain->span\n");
4651 return 0;
4652}
4653
4654static void sched_domain_debug(struct sched_domain *sd, int cpu)
4655{
4656 int level = 0;
4657
4658 if (!sched_debug_enabled)
4659 return;
4660
4661 if (!sd) {
4662 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4663 return;
4664 }
4665
4666 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4667
4668 for (;;) {
4669 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
4670 break;
4671 level++;
4672 sd = sd->parent;
4673 if (!sd)
4674 break;
4675 }
4676}
4677#else
4678# define sched_domain_debug(sd, cpu) do { } while (0)
4679static inline bool sched_debug(void)
4680{
4681 return false;
4682}
4683#endif
4684
4685static int sd_degenerate(struct sched_domain *sd)
4686{
4687 if (cpumask_weight(sched_domain_span(sd)) == 1)
4688 return 1;
4689
	/* Following flags need at least 2 groups */
4691 if (sd->flags & (SD_LOAD_BALANCE |
4692 SD_BALANCE_NEWIDLE |
4693 SD_BALANCE_FORK |
4694 SD_BALANCE_EXEC |
4695 SD_SHARE_CPUPOWER |
4696 SD_SHARE_PKG_RESOURCES)) {
4697 if (sd->groups != sd->groups->next)
4698 return 0;
4699 }
4700
	/* Following flags don't use groups */
4702 if (sd->flags & (SD_WAKE_AFFINE))
4703 return 0;
4704
4705 return 1;
4706}
4707
4708static int
4709sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4710{
4711 unsigned long cflags = sd->flags, pflags = parent->flags;
4712
4713 if (sd_degenerate(parent))
4714 return 1;
4715
4716 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
4717 return 0;
4718
	/* Flags needing groups don't count if only 1 group in parent */
4720 if (parent->groups == parent->groups->next) {
4721 pflags &= ~(SD_LOAD_BALANCE |
4722 SD_BALANCE_NEWIDLE |
4723 SD_BALANCE_FORK |
4724 SD_BALANCE_EXEC |
4725 SD_SHARE_CPUPOWER |
4726 SD_SHARE_PKG_RESOURCES |
4727 SD_PREFER_SIBLING);
4728 if (nr_node_ids == 1)
4729 pflags &= ~SD_SERIALIZE;
4730 }
4731 if (~cflags & pflags)
4732 return 0;
4733
4734 return 1;
4735}
4736
4737static void free_rootdomain(struct rcu_head *rcu)
4738{
4739 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4740
4741 cpupri_cleanup(&rd->cpupri);
4742 free_cpumask_var(rd->rto_mask);
4743 free_cpumask_var(rd->online);
4744 free_cpumask_var(rd->span);
4745 kfree(rd);
4746}
4747
4748static void rq_attach_root(struct rq *rq, struct root_domain *rd)
4749{
4750 struct root_domain *old_rd = NULL;
4751 unsigned long flags;
4752
4753 raw_spin_lock_irqsave(&rq->lock, flags);
4754
4755 if (rq->rd) {
4756 old_rd = rq->rd;
4757
4758 if (cpumask_test_cpu(rq->cpu, old_rd->online))
4759 set_rq_offline(rq);
4760
4761 cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we don't want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
4768 if (!atomic_dec_and_test(&old_rd->refcount))
4769 old_rd = NULL;
4770 }
4771
4772 atomic_inc(&rd->refcount);
4773 rq->rd = rd;
4774
4775 cpumask_set_cpu(rq->cpu, rd->span);
4776 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
4777 set_rq_online(rq);
4778
4779 raw_spin_unlock_irqrestore(&rq->lock, flags);
4780
4781 if (old_rd)
4782 call_rcu_sched(&old_rd->rcu, free_rootdomain);
4783}
4784
4785static int init_rootdomain(struct root_domain *rd)
4786{
4787 memset(rd, 0, sizeof(*rd));
4788
4789 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
4790 goto out;
4791 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
4792 goto free_span;
4793 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
4794 goto free_online;
4795
4796 if (cpupri_init(&rd->cpupri) != 0)
4797 goto free_rto_mask;
4798 return 0;
4799
4800free_rto_mask:
4801 free_cpumask_var(rd->rto_mask);
4802free_online:
4803 free_cpumask_var(rd->online);
4804free_span:
4805 free_cpumask_var(rd->span);
4806out:
4807 return -ENOMEM;
4808}
4809
/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
4814struct root_domain def_root_domain;
4815
4816static void init_defrootdomain(void)
4817{
4818 init_rootdomain(&def_root_domain);
4819
4820 atomic_set(&def_root_domain.refcount, 1);
4821}
4822
4823static struct root_domain *alloc_rootdomain(void)
4824{
4825 struct root_domain *rd;
4826
4827 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
4828 if (!rd)
4829 return NULL;
4830
4831 if (init_rootdomain(rd) != 0) {
4832 kfree(rd);
4833 return NULL;
4834 }
4835
4836 return rd;
4837}
4838
4839static void free_sched_groups(struct sched_group *sg, int free_sgp)
4840{
4841 struct sched_group *tmp, *first;
4842
4843 if (!sg)
4844 return;
4845
4846 first = sg;
4847 do {
4848 tmp = sg->next;
4849
4850 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
4851 kfree(sg->sgp);
4852
4853 kfree(sg);
4854 sg = tmp;
4855 } while (sg != first);
4856}
4857
4858static void free_sched_domain(struct rcu_head *rcu)
4859{
4860 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
4861
	/*
	 * If it's an overlapping domain it has private groups, iterate and
	 * nuke them all.
	 */
4866 if (sd->flags & SD_OVERLAP) {
4867 free_sched_groups(sd->groups, 1);
4868 } else if (atomic_dec_and_test(&sd->groups->ref)) {
4869 kfree(sd->groups->sgp);
4870 kfree(sd->groups);
4871 }
4872 kfree(sd);
4873}
4874
4875static void destroy_sched_domain(struct sched_domain *sd, int cpu)
4876{
4877 call_rcu(&sd->rcu, free_sched_domain);
4878}
4879
4880static void destroy_sched_domains(struct sched_domain *sd, int cpu)
4881{
4882 for (; sd; sd = sd->parent)
4883 destroy_sched_domain(sd, cpu);
4884}
4885
/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain); this
 * allows us to avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
 */
4895DEFINE_PER_CPU(struct sched_domain *, sd_llc);
4896DEFINE_PER_CPU(int, sd_llc_size);
4897DEFINE_PER_CPU(int, sd_llc_id);
4898DEFINE_PER_CPU(struct sched_domain *, sd_numa);
4899DEFINE_PER_CPU(struct sched_domain *, sd_busy);
4900DEFINE_PER_CPU(struct sched_domain *, sd_asym);
4901
4902static void update_top_cache_domain(int cpu)
4903{
4904 struct sched_domain *sd;
4905 struct sched_domain *busy_sd = NULL;
4906 int id = cpu;
4907 int size = 1;
4908
4909 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
4910 if (sd) {
4911 id = cpumask_first(sched_domain_span(sd));
4912 size = cpumask_weight(sched_domain_span(sd));
4913 busy_sd = sd->parent;
4914 }
4915 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
4916
4917 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
4918 per_cpu(sd_llc_size, cpu) = size;
4919 per_cpu(sd_llc_id, cpu) = id;
4920
4921 sd = lowest_flag_domain(cpu, SD_NUMA);
4922 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
4923
4924 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
4925 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
4926}
4927
/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
4932static void
4933cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
4934{
4935 struct rq *rq = cpu_rq(cpu);
4936 struct sched_domain *tmp;
4937
	/* Remove the sched domains which do not contribute to scheduling. */
4939 for (tmp = sd; tmp; ) {
4940 struct sched_domain *parent = tmp->parent;
4941 if (!parent)
4942 break;
4943
4944 if (sd_parent_degenerate(tmp, parent)) {
4945 tmp->parent = parent->parent;
4946 if (parent->parent)
4947 parent->parent->child = tmp;
			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
4953 if (parent->flags & SD_PREFER_SIBLING)
4954 tmp->flags |= SD_PREFER_SIBLING;
4955 destroy_sched_domain(parent, cpu);
4956 } else
4957 tmp = tmp->parent;
4958 }
4959
4960 if (sd && sd_degenerate(sd)) {
4961 tmp = sd;
4962 sd = sd->parent;
4963 destroy_sched_domain(tmp, cpu);
4964 if (sd)
4965 sd->child = NULL;
4966 }
4967
4968 sched_domain_debug(sd, cpu);
4969
4970 rq_attach_root(rq, rd);
4971 tmp = rq->sd;
4972 rcu_assign_pointer(rq->sd, sd);
4973 destroy_sched_domains(tmp, cpu);
4974
4975 update_top_cache_domain(cpu);
4976}
4977
/* cpus with isolated domains */
4979static cpumask_var_t cpu_isolated_map;
4980
/* Setup the mask of cpus configured for isolated domains */
4982static int __init isolated_cpu_setup(char *str)
4983{
4984 alloc_bootmem_cpumask_var(&cpu_isolated_map);
4985 cpulist_parse(str, cpu_isolated_map);
4986 return 1;
4987}
4988
4989__setup("isolcpus=", isolated_cpu_setup);
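
/*
 * Example (added commentary): booting with "isolcpus=1-3,7" parses
 * CPUs 1, 2, 3 and 7 into cpu_isolated_map, keeping them out of the
 * scheduler domains built below; cpulist_parse() accepts both ranges
 * and comma-separated lists.
 */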
4990
4991static const struct cpumask *cpu_cpu_mask(int cpu)
4992{
4993 return cpumask_of_node(cpu_to_node(cpu));
4994}
4995
4996struct sd_data {
4997 struct sched_domain **__percpu sd;
4998 struct sched_group **__percpu sg;
4999 struct sched_group_power **__percpu sgp;
5000};
5001
5002struct s_data {
5003 struct sched_domain ** __percpu sd;
5004 struct root_domain *rd;
5005};
5006
5007enum s_alloc {
5008 sa_rootdomain,
5009 sa_sd,
5010 sa_sd_storage,
5011 sa_none,
5012};
5013
5014struct sched_domain_topology_level;
5015
5016typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5017typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5018
5019#define SDTL_OVERLAP 0x01
5020
5021struct sched_domain_topology_level {
5022 sched_domain_init_f init;
5023 sched_domain_mask_f mask;
5024 int flags;
5025 int numa_level;
5026 struct sd_data data;
5027};
5028
/*
 * Build an iteration mask that can exclude certain CPUs from the upwards
 * domain traversal.
 *
 * Asymmetric node setups can result in situations where the domain tree is of
 * unequal depth, make sure to skip domains that already cover the entire
 * range.
 *
 * In that case build_sched_domains() will have terminated the iteration early
 * and our sibling sd spans will be empty. Domains should always include the
 * cpu they're built on, so check that.
 */
5042static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5043{
5044 const struct cpumask *span = sched_domain_span(sd);
5045 struct sd_data *sdd = sd->private;
5046 struct sched_domain *sibling;
5047 int i;
5048
5049 for_each_cpu(i, span) {
5050 sibling = *per_cpu_ptr(sdd->sd, i);
5051 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5052 continue;
5053
5054 cpumask_set_cpu(i, sched_group_mask(sg));
5055 }
5056}
5057
/*
 * Return the canonical balance cpu for this group, this is the first cpu
 * of this group that's also in the iteration mask.
 */
5062int group_balance_cpu(struct sched_group *sg)
5063{
5064 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5065}
5066
5067static int
5068build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5069{
5070 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5071 const struct cpumask *span = sched_domain_span(sd);
5072 struct cpumask *covered = sched_domains_tmpmask;
5073 struct sd_data *sdd = sd->private;
5074 struct sched_domain *child;
5075 int i;
5076
5077 cpumask_clear(covered);
5078
5079 for_each_cpu(i, span) {
5080 struct cpumask *sg_span;
5081
5082 if (cpumask_test_cpu(i, covered))
5083 continue;
5084
5085 child = *per_cpu_ptr(sdd->sd, i);
5086
		/* See the comment near build_group_mask(). */
5088 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5089 continue;
5090
5091 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5092 GFP_KERNEL, cpu_to_node(cpu));
5093
5094 if (!sg)
5095 goto fail;
5096
5097 sg_span = sched_group_cpus(sg);
5098 if (child->child) {
5099 child = child->child;
5100 cpumask_copy(sg_span, sched_domain_span(child));
5101 } else
5102 cpumask_set_cpu(i, sg_span);
5103
5104 cpumask_or(covered, covered, sg_span);
5105
5106 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5107 if (atomic_inc_return(&sg->sgp->ref) == 1)
5108 build_group_mask(sd, sg);

		/*
		 * Initialize sgp->power such that even if we mess up the
		 * domains and no possible iteration will get us here, we won't
		 * create an empty sched_group.
		 */
5115 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5116 sg->sgp->power_orig = sg->sgp->power;
5117
		/*
		 * Make sure the first group of this domain contains the
		 * canonical balance cpu. Otherwise the sched_domain iteration
		 * breaks. See update_sg_lb_stats().
		 */
5123 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5124 group_balance_cpu(sg) == cpu)
5125 groups = sg;
5126
5127 if (!first)
5128 first = sg;
5129 if (last)
5130 last->next = sg;
5131 last = sg;
5132 last->next = first;
5133 }
5134 sd->groups = groups;
5135
5136 return 0;
5137
5138fail:
5139 free_sched_groups(first, 0);
5140
5141 return -ENOMEM;
5142}
5143
5144static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5145{
5146 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5147 struct sched_domain *child = sd->child;
5148
5149 if (child)
5150 cpu = cpumask_first(sched_domain_span(child));
5151
5152 if (sg) {
5153 *sg = *per_cpu_ptr(sdd->sg, cpu);
5154 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5155 atomic_set(&(*sg)->sgp->ref, 1);
5156 }
5157
5158 return cpu;
5159}
5160
/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_power to 0.
 *
 * Assumes the sched_domain tree is fully constructed
 */
5168static int
5169build_sched_groups(struct sched_domain *sd, int cpu)
5170{
5171 struct sched_group *first = NULL, *last = NULL;
5172 struct sd_data *sdd = sd->private;
5173 const struct cpumask *span = sched_domain_span(sd);
5174 struct cpumask *covered;
5175 int i;
5176
5177 get_group(cpu, sdd, &sd->groups);
5178 atomic_inc(&sd->groups->ref);
5179
5180 if (cpu != cpumask_first(span))
5181 return 0;
5182
5183 lockdep_assert_held(&sched_domains_mutex);
5184 covered = sched_domains_tmpmask;
5185
5186 cpumask_clear(covered);
5187
5188 for_each_cpu(i, span) {
5189 struct sched_group *sg;
5190 int group, j;
5191
5192 if (cpumask_test_cpu(i, covered))
5193 continue;
5194
5195 group = get_group(i, sdd, &sg);
5196 cpumask_clear(sched_group_cpus(sg));
5197 sg->sgp->power = 0;
5198 cpumask_setall(sched_group_mask(sg));
5199
5200 for_each_cpu(j, span) {
5201 if (get_group(j, sdd, NULL) != group)
5202 continue;
5203
5204 cpumask_set_cpu(j, covered);
5205 cpumask_set_cpu(j, sched_group_cpus(sg));
5206 }
5207
5208 if (!first)
5209 first = sg;
5210 if (last)
5211 last->next = sg;
5212 last = sg;
5213 }
5214 last->next = first;
5215
5216 return 0;
5217}
5218
/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * group having more cpu_power will pickup more load compared to the group
 * having less cpu_power.
 */
5229static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5230{
5231 struct sched_group *sg = sd->groups;
5232
5233 WARN_ON(!sg);
5234
5235 do {
5236 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5237 sg = sg->next;
5238 } while (sg != sd->groups);
5239
5240 if (cpu != group_balance_cpu(sg))
5241 return;
5242
5243 update_group_power(sd, cpu);
5244 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5245}
5246
5247int __weak arch_sd_sibling_asym_packing(void)
5248{
5249 return 0*SD_ASYM_PACKING;
5250}
5251
/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */
5257#ifdef CONFIG_SCHED_DEBUG
5258# define SD_INIT_NAME(sd, type) sd->name = #type
5259#else
5260# define SD_INIT_NAME(sd, type) do { } while (0)
5261#endif
5262
5263#define SD_INIT_FUNC(type) \
5264static noinline struct sched_domain * \
5265sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5266{ \
5267 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5268 *sd = SD_##type##_INIT; \
5269 SD_INIT_NAME(sd, type); \
5270 sd->private = &tl->data; \
5271 return sd; \
5272}
5273
5274SD_INIT_FUNC(CPU)
5275#ifdef CONFIG_SCHED_SMT
5276 SD_INIT_FUNC(SIBLING)
5277#endif
5278#ifdef CONFIG_SCHED_MC
5279 SD_INIT_FUNC(MC)
5280#endif
5281#ifdef CONFIG_SCHED_BOOK
5282 SD_INIT_FUNC(BOOK)
5283#endif
5284
5285static int default_relax_domain_level = -1;
5286int sched_domain_level_max;
5287
5288static int __init setup_relax_domain_level(char *str)
5289{
5290 if (kstrtoint(str, 0, &default_relax_domain_level))
5291 pr_warn("Unable to set relax_domain_level\n");
5292
5293 return 1;
5294}
5295__setup("relax_domain_level=", setup_relax_domain_level);
5296
5297static void set_domain_attribute(struct sched_domain *sd,
5298 struct sched_domain_attr *attr)
5299{
5300 int request;
5301
5302 if (!attr || attr->relax_domain_level < 0) {
5303 if (default_relax_domain_level < 0)
5304 return;
5305 else
5306 request = default_relax_domain_level;
5307 } else
5308 request = attr->relax_domain_level;
5309 if (request < sd->level) {
		/* turn off idle balance on this domain */
5311 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5312 } else {
		/* turn on idle balance on this domain */
5314 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5315 }
5316}
5317
5318static void __sdt_free(const struct cpumask *cpu_map);
5319static int __sdt_alloc(const struct cpumask *cpu_map);
5320
5321static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5322 const struct cpumask *cpu_map)
5323{
5324 switch (what) {
5325 case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu); /* fall through */
	case sa_sd:
		free_percpu(d->sd); /* fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map); /* fall through */
5332 case sa_none:
5333 break;
5334 }
5335}
5336
5337static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5338 const struct cpumask *cpu_map)
5339{
5340 memset(d, 0, sizeof(*d));
5341
5342 if (__sdt_alloc(cpu_map))
5343 return sa_sd_storage;
5344 d->sd = alloc_percpu(struct sched_domain *);
5345 if (!d->sd)
5346 return sa_sd_storage;
5347 d->rd = alloc_rootdomain();
5348 if (!d->rd)
5349 return sa_sd;
5350 return sa_rootdomain;
5351}
5352
/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
5358static void claim_allocations(int cpu, struct sched_domain *sd)
5359{
5360 struct sd_data *sdd = sd->private;
5361
5362 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5363 *per_cpu_ptr(sdd->sd, cpu) = NULL;
5364
5365 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5366 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5367
5368 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5369 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5370}
5371
5372#ifdef CONFIG_SCHED_SMT
5373static const struct cpumask *cpu_smt_mask(int cpu)
5374{
5375 return topology_thread_cpumask(cpu);
5376}
5377#endif
5378
/*
 * Topology list, bottom-up.
 */
5382static struct sched_domain_topology_level default_topology[] = {
5383#ifdef CONFIG_SCHED_SMT
5384 { sd_init_SIBLING, cpu_smt_mask, },
5385#endif
5386#ifdef CONFIG_SCHED_MC
5387 { sd_init_MC, cpu_coregroup_mask, },
5388#endif
5389#ifdef CONFIG_SCHED_BOOK
5390 { sd_init_BOOK, cpu_book_mask, },
5391#endif
5392 { sd_init_CPU, cpu_cpu_mask, },
5393 { NULL, },
5394};
5395
5396static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5397
5398#define for_each_sd_topology(tl) \
5399 for (tl = sched_domain_topology; tl->init; tl++)
5400
5401#ifdef CONFIG_NUMA
5402
5403static int sched_domains_numa_levels;
5404static int *sched_domains_numa_distance;
5405static struct cpumask ***sched_domains_numa_masks;
5406static int sched_domains_curr_level;
5407
5408static inline int sd_local_flags(int level)
5409{
5410 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5411 return 0;
5412
5413 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5414}
5415
5416static struct sched_domain *
5417sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5418{
5419 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5420 int level = tl->numa_level;
5421 int sd_weight = cpumask_weight(
5422 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
5423
5424 *sd = (struct sched_domain){
5425 .min_interval = sd_weight,
5426 .max_interval = 2*sd_weight,
5427 .busy_factor = 32,
5428 .imbalance_pct = 125,
5429 .cache_nice_tries = 2,
5430 .busy_idx = 3,
5431 .idle_idx = 2,
5432 .newidle_idx = 0,
5433 .wake_idx = 0,
5434 .forkexec_idx = 0,
5435
5436 .flags = 1*SD_LOAD_BALANCE
5437 | 1*SD_BALANCE_NEWIDLE
5438 | 0*SD_BALANCE_EXEC
5439 | 0*SD_BALANCE_FORK
5440 | 0*SD_BALANCE_WAKE
5441 | 0*SD_WAKE_AFFINE
5442 | 0*SD_SHARE_CPUPOWER
5443 | 0*SD_SHARE_PKG_RESOURCES
5444 | 1*SD_SERIALIZE
5445 | 0*SD_PREFER_SIBLING
5446 | 1*SD_NUMA
5447 | sd_local_flags(level)
5448 ,
5449 .last_balance = jiffies,
5450 .balance_interval = sd_weight,
5451 };
5452 SD_INIT_NAME(sd, NUMA);
5453 sd->private = &tl->data;
5454
	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
5458 sched_domains_curr_level = tl->numa_level;
5459
5460 return sd;
5461}
5462
5463static const struct cpumask *sd_numa_mask(int cpu)
5464{
5465 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
5466}
5467
5468static void sched_numa_warn(const char *str)
5469{
5470 static int done = false;
5471 int i,j;
5472
5473 if (done)
5474 return;
5475
5476 done = true;
5477
5478 printk(KERN_WARNING "ERROR: %s\n\n", str);
5479
5480 for (i = 0; i < nr_node_ids; i++) {
5481 printk(KERN_WARNING " ");
5482 for (j = 0; j < nr_node_ids; j++)
5483 printk(KERN_CONT "%02d ", node_distance(i,j));
5484 printk(KERN_CONT "\n");
5485 }
5486 printk(KERN_WARNING "\n");
5487}
5488
5489static bool find_numa_distance(int distance)
5490{
5491 int i;
5492
5493 if (distance == node_distance(0, 0))
5494 return true;
5495
5496 for (i = 0; i < sched_domains_numa_levels; i++) {
5497 if (sched_domains_numa_distance[i] == distance)
5498 return true;
5499 }
5500
5501 return false;
5502}
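
/*
 * Example (added commentary, illustrative values): on a two-socket
 * box the firmware SLIT-style distance table is typically
 *
 *	node  0   1
 *	  0: 10  20
 *	  1: 20  10
 *
 * giving one unique remote distance (20) and hence one NUMA domain
 * level on top of the default topology below.
 */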
5503
5504static void sched_init_numa(void)
5505{
5506 int next_distance, curr_distance = node_distance(0, 0);
5507 struct sched_domain_topology_level *tl;
5508 int level = 0;
5509 int i, j, k;
5510
5511 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
5512 if (!sched_domains_numa_distance)
5513 return;
5514
	/*
	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
	 * unique distances in the node_distance() table.
	 *
	 * Assumes node_distance(0,j) includes all distances in
	 * node_distance(i,j) in order to avoid cubic time.
	 */
5522 next_distance = curr_distance;
5523 for (i = 0; i < nr_node_ids; i++) {
5524 for (j = 0; j < nr_node_ids; j++) {
5525 for (k = 0; k < nr_node_ids; k++) {
5526 int distance = node_distance(i, k);
5527
5528 if (distance > curr_distance &&
5529 (distance < next_distance ||
5530 next_distance == curr_distance))
5531 next_distance = distance;

				/*
				 * While not a strong assumption it would be
				 * nice to know about cases where if node A is
				 * connected to B, B is not equally connected
				 * to A.
				 */
5538 if (sched_debug() && node_distance(k, i) != distance)
5539 sched_numa_warn("Node-distance not symmetric");
5540
5541 if (sched_debug() && i && !find_numa_distance(distance))
5542 sched_numa_warn("Node-0 not representative");
5543 }
5544 if (next_distance != curr_distance) {
5545 sched_domains_numa_distance[level++] = next_distance;
5546 sched_domains_numa_levels = level;
5547 curr_distance = next_distance;
5548 } else break;
5549 }
5550
		/*
		 * In case of sched_debug() we verify the above assumption.
		 */
5554 if (!sched_debug())
5555 break;
5556 }
5557
	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i,i).
	 *
	 * The sched_domains_numa_distance[] array includes the actual distance
	 * numbers.
	 */

	/*
	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
	 * the array will contain fewer than 'level' members. This could be
	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
	 * if it failed, since we won't know how many members the array has.
	 *
	 * We reset it to 'level' when memory allocation is successful.
	 */
5574 sched_domains_numa_levels = 0;
5575
5576 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
5577 if (!sched_domains_numa_masks)
5578 return;
5579
	/*
	 * Now for each level, construct a mask per node which contains all
	 * cpus of nodes that are that many hops away from us.
	 */
5584 for (i = 0; i < level; i++) {
5585 sched_domains_numa_masks[i] =
5586 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
5587 if (!sched_domains_numa_masks[i])
5588 return;
5589
5590 for (j = 0; j < nr_node_ids; j++) {
5591 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
5592 if (!mask)
5593 return;
5594
5595 sched_domains_numa_masks[i][j] = mask;
5596
5597 for (k = 0; k < nr_node_ids; k++) {
5598 if (node_distance(j, k) > sched_domains_numa_distance[i])
5599 continue;
5600
5601 cpumask_or(mask, mask, cpumask_of_node(k));
5602 }
5603 }
5604 }
5605
5606 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
5607 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
5608 if (!tl)
5609 return;
5610
	/*
	 * Copy the default topology bits..
	 */
5614 for (i = 0; default_topology[i].init; i++)
5615 tl[i] = default_topology[i];
5616
	/*
	 * .. and append 'j' levels of NUMA goodness.
	 */
5620 for (j = 0; j < level; i++, j++) {
5621 tl[i] = (struct sched_domain_topology_level){
5622 .init = sd_numa_init,
5623 .mask = sd_numa_mask,
5624 .flags = SDTL_OVERLAP,
5625 .numa_level = j,
5626 };
5627 }
5628
5629 sched_domain_topology = tl;
5630
5631 sched_domains_numa_levels = level;
5632}
5633
5634static void sched_domains_numa_masks_set(int cpu)
5635{
5636 int i, j;
5637 int node = cpu_to_node(cpu);
5638
5639 for (i = 0; i < sched_domains_numa_levels; i++) {
5640 for (j = 0; j < nr_node_ids; j++) {
5641 if (node_distance(j, node) <= sched_domains_numa_distance[i])
5642 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
5643 }
5644 }
5645}
5646
5647static void sched_domains_numa_masks_clear(int cpu)
5648{
5649 int i, j;
5650 for (i = 0; i < sched_domains_numa_levels; i++) {
5651 for (j = 0; j < nr_node_ids; j++)
5652 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
5653 }
5654}
5655
/*
 * Update sched_domains_numa_masks[level][node] array when new cpus
 * are onlined.
 */
5660static int sched_domains_numa_masks_update(struct notifier_block *nfb,
5661 unsigned long action,
5662 void *hcpu)
5663{
5664 int cpu = (long)hcpu;
5665
5666 switch (action & ~CPU_TASKS_FROZEN) {
5667 case CPU_ONLINE:
5668 sched_domains_numa_masks_set(cpu);
5669 break;
5670
5671 case CPU_DEAD:
5672 sched_domains_numa_masks_clear(cpu);
5673 break;
5674
5675 default:
5676 return NOTIFY_DONE;
5677 }
5678
5679 return NOTIFY_OK;
5680}
5681#else
5682static inline void sched_init_numa(void)
5683{
5684}
5685
5686static int sched_domains_numa_masks_update(struct notifier_block *nfb,
5687 unsigned long action,
5688 void *hcpu)
5689{
5690 return 0;
5691}
5692#endif
5693
5694static int __sdt_alloc(const struct cpumask *cpu_map)
5695{
5696 struct sched_domain_topology_level *tl;
5697 int j;
5698
5699 for_each_sd_topology(tl) {
5700 struct sd_data *sdd = &tl->data;
5701
5702 sdd->sd = alloc_percpu(struct sched_domain *);
5703 if (!sdd->sd)
5704 return -ENOMEM;
5705
5706 sdd->sg = alloc_percpu(struct sched_group *);
5707 if (!sdd->sg)
5708 return -ENOMEM;
5709
5710 sdd->sgp = alloc_percpu(struct sched_group_power *);
5711 if (!sdd->sgp)
5712 return -ENOMEM;
5713
5714 for_each_cpu(j, cpu_map) {
5715 struct sched_domain *sd;
5716 struct sched_group *sg;
5717 struct sched_group_power *sgp;
5718
5719 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
5720 GFP_KERNEL, cpu_to_node(j));
5721 if (!sd)
5722 return -ENOMEM;
5723
5724 *per_cpu_ptr(sdd->sd, j) = sd;
5725
5726 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5727 GFP_KERNEL, cpu_to_node(j));
5728 if (!sg)
5729 return -ENOMEM;
5730
5731 sg->next = sg;
5732
5733 *per_cpu_ptr(sdd->sg, j) = sg;
5734
5735 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
5736 GFP_KERNEL, cpu_to_node(j));
5737 if (!sgp)
5738 return -ENOMEM;
5739
5740 *per_cpu_ptr(sdd->sgp, j) = sgp;
5741 }
5742 }
5743
5744 return 0;
5745}
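
/*
 * Note: __sdt_alloc() can fail with -ENOMEM part way through the
 * topology walk. The percpu arrays and entries already installed are
 * then released by __sdt_free() below, which checks every pointer
 * before freeing, so a partially populated sd_data is safe to tear
 * down.
 */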
5746
5747static void __sdt_free(const struct cpumask *cpu_map)
5748{
5749 struct sched_domain_topology_level *tl;
5750 int j;
5751
5752 for_each_sd_topology(tl) {
5753 struct sd_data *sdd = &tl->data;
5754
5755 for_each_cpu(j, cpu_map) {
5756 struct sched_domain *sd;
5757
5758 if (sdd->sd) {
5759 sd = *per_cpu_ptr(sdd->sd, j);
5760 if (sd && (sd->flags & SD_OVERLAP))
5761 free_sched_groups(sd->groups, 0);
5762 kfree(*per_cpu_ptr(sdd->sd, j));
5763 }
5764
5765 if (sdd->sg)
5766 kfree(*per_cpu_ptr(sdd->sg, j));
5767 if (sdd->sgp)
5768 kfree(*per_cpu_ptr(sdd->sgp, j));
5769 }
5770 free_percpu(sdd->sd);
5771 sdd->sd = NULL;
5772 free_percpu(sdd->sg);
5773 sdd->sg = NULL;
5774 free_percpu(sdd->sgp);
5775 sdd->sgp = NULL;
5776 }
5777}
5778
5779struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
5780 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
5781 struct sched_domain *child, int cpu)
5782{
5783 struct sched_domain *sd = tl->init(tl, cpu);
5784 if (!sd)
5785 return child;
5786
5787 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
5788 if (child) {
5789 sd->level = child->level + 1;
5790 sched_domain_level_max = max(sched_domain_level_max, sd->level);
5791 child->parent = sd;
5792 sd->child = child;
5793 }
5794 set_domain_attribute(sd, attr);
5795
5796 return sd;
5797}
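
/*
 * Sketch of the result (illustrative): walking the topology levels
 * bottom-up chains the per-CPU domains via sd->child/sd->parent, e.g.
 *
 *	SMT -> MC -> CPU -> NUMA	(child to parent)
 *
 * with each level's span being cpu_map restricted to tl->mask(cpu).
 */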
5798
/*
 * Build sched domains for a given set of CPUs and attach the sched
 * domains to the individual CPUs.
 */
5803static int build_sched_domains(const struct cpumask *cpu_map,
5804 struct sched_domain_attr *attr)
5805{
5806 enum s_alloc alloc_state;
5807 struct sched_domain *sd;
5808 struct s_data d;
5809 int i, ret = -ENOMEM;
5810
5811 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
5812 if (alloc_state != sa_rootdomain)
5813 goto error;
5814
	/* Set up domains for the CPUs specified by cpu_map: */
5816 for_each_cpu(i, cpu_map) {
5817 struct sched_domain_topology_level *tl;
5818
5819 sd = NULL;
5820 for_each_sd_topology(tl) {
5821 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
5822 if (tl == sched_domain_topology)
5823 *per_cpu_ptr(d.sd, i) = sd;
5824 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
5825 sd->flags |= SD_OVERLAP;
5826 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
5827 break;
5828 }
5829 }
5830
	/* Build the groups for the domains: */
5832 for_each_cpu(i, cpu_map) {
5833 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
5834 sd->span_weight = cpumask_weight(sched_domain_span(sd));
5835 if (sd->flags & SD_OVERLAP) {
5836 if (build_overlap_sched_groups(sd, i))
5837 goto error;
5838 } else {
5839 if (build_sched_groups(sd, i))
5840 goto error;
5841 }
5842 }
5843 }
5844
	/* Calculate CPU power for physical packages and nodes: */
5846 for (i = nr_cpumask_bits-1; i >= 0; i--) {
5847 if (!cpumask_test_cpu(i, cpu_map))
5848 continue;
5849
5850 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
5851 claim_allocations(i, sd);
5852 init_sched_groups_power(i, sd);
5853 }
5854 }
5855
	/* Attach the domains: */
5857 rcu_read_lock();
5858 for_each_cpu(i, cpu_map) {
5859 sd = *per_cpu_ptr(d.sd, i);
5860 cpu_attach_domain(sd, d.rd, i);
5861 }
5862 rcu_read_unlock();
5863
5864 ret = 0;
5865error:
5866 __free_domain_allocs(&d, alloc_state, cpu_map);
5867 return ret;
5868}
5869
static cpumask_var_t *doms_cur;	/* current sched domains */
static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
				/* attributes of custom domains in 'doms_cur' */

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
5880static cpumask_var_t fallback_doms;
5881
/*
 * arch_update_cpu_topology() lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
5887int __attribute__((weak)) arch_update_cpu_topology(void)
5888{
5889 return 0;
5890}
5891
5892cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
5893{
5894 int i;
5895 cpumask_var_t *doms;
5896
5897 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
5898 if (!doms)
5899 return NULL;
5900 for (i = 0; i < ndoms; i++) {
5901 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
5902 free_sched_domains(doms, i);
5903 return NULL;
5904 }
5905 }
5906 return doms;
5907}
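
/*
 * Usage sketch (illustrative only; mask_a and mask_b stand in for
 * caller-provided cpumasks): a caller building two exclusive
 * partitions allocates the array, fills in the masks and passes
 * ownership to partition_sched_domains():
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	if (doms) {
 *		cpumask_copy(doms[0], mask_a);
 *		cpumask_copy(doms[1], mask_b);
 *		partition_sched_domains(2, doms, NULL);
 *	}
 */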
5908
5909void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
5910{
5911 unsigned int i;
5912 for (i = 0; i < ndoms; i++)
5913 free_cpumask_var(doms[i]);
5914 kfree(doms);
5915}
5916
/*
 * Set up scheduler domains and groups. Callers must hold the hotplug
 * lock. For now this just excludes isolated CPUs, but could be used to
 * exclude other special cases in the future.
 */
5922static int init_sched_domains(const struct cpumask *cpu_map)
5923{
5924 int err;
5925
5926 arch_update_cpu_topology();
5927 ndoms_cur = 1;
5928 doms_cur = alloc_sched_domains(ndoms_cur);
5929 if (!doms_cur)
5930 doms_cur = &fallback_doms;
5931 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
5932 err = build_sched_domains(doms_cur[0], NULL);
5933 register_sched_domain_sysctl();
5934
5935 return err;
5936}
5937
/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */
5942static void detach_destroy_domains(const struct cpumask *cpu_map)
5943{
5944 int i;
5945
5946 rcu_read_lock();
5947 for_each_cpu(i, cpu_map)
5948 cpu_attach_domain(NULL, &def_root_domain, i);
5949 rcu_read_unlock();
5950}
5951
/* handle null as "default" */
5953static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
5954 struct sched_domain_attr *new, int idx_new)
5955{
5956 struct sched_domain_attr tmp;
5957
	/* fast path */
5959 if (!new && !cur)
5960 return 1;
5961
5962 tmp = SD_ATTR_INIT;
5963 return !memcmp(cur ? (cur + idx_cur) : &tmp,
5964 new ? (new + idx_new) : &tmp,
5965 sizeof(struct sched_domain_attr));
5966}
5967
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap). We should set up one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domain partition and in the new domain
 * partition 'doms_new', we can leave it unchanged.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_active_mask (minus
 * the isolated CPUs). ndoms_new == 0 is a special case for destroying
 * existing domains, and it will not create the default domain.
 *
 * Call with the hotplug lock held.
 */
5994void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
5995 struct sched_domain_attr *dattr_new)
5996{
5997 int i, j, n;
5998 int new_topology;
5999
6000 mutex_lock(&sched_domains_mutex);
6001
	/* Always unregister in case we don't destroy any domains: */
6003 unregister_sched_domain_sysctl();
6004
	/* Let the architecture update the CPU core mappings: */
6006 new_topology = arch_update_cpu_topology();
6007
6008 n = doms_new ? ndoms_new : 0;
6009
	/* Destroy deleted domains: */
6011 for (i = 0; i < ndoms_cur; i++) {
6012 for (j = 0; j < n && !new_topology; j++) {
6013 if (cpumask_equal(doms_cur[i], doms_new[j])
6014 && dattrs_equal(dattr_cur, i, dattr_new, j))
6015 goto match1;
6016 }
6017
6018 detach_destroy_domains(doms_cur[i]);
6019match1:
6020 ;
6021 }
6022
6023 n = ndoms_cur;
6024 if (doms_new == NULL) {
6025 n = 0;
6026 doms_new = &fallback_doms;
6027 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6028 WARN_ON_ONCE(dattr_new);
6029 }
6030
	/* Build new domains: */
6032 for (i = 0; i < ndoms_new; i++) {
6033 for (j = 0; j < n && !new_topology; j++) {
6034 if (cpumask_equal(doms_new[i], doms_cur[j])
6035 && dattrs_equal(dattr_new, i, dattr_cur, j))
6036 goto match2;
6037 }
6038
6039 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6040match2:
6041 ;
6042 }
6043
	/* Remember the new sched domains: */
6045 if (doms_cur != &fallback_doms)
6046 free_sched_domains(doms_cur, ndoms_cur);
6047 kfree(dattr_cur);
6048 doms_cur = doms_new;
6049 dattr_cur = dattr_new;
6050 ndoms_cur = ndoms_new;
6051
6052 register_sched_domain_sysctl();
6053
6054 mutex_unlock(&sched_domains_mutex);
6055}
6056
static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
6058
/*
 * Update cpusets according to cpu_active mask. If cpusets are
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
 * around partition_sched_domains().
 */
6067static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6068 void *hcpu)
6069{
6070 switch (action) {
6071 case CPU_ONLINE_FROZEN:
6072 case CPU_DOWN_FAILED_FROZEN:
6073
		/*
		 * num_cpus_frozen tracks how many CPUs are involved in the
		 * suspend/resume sequence. As long as this is not the last
		 * online operation in the resume sequence, just build a single
		 * sched domain, ignoring cpusets.
		 */
6080 num_cpus_frozen--;
6081 if (likely(num_cpus_frozen)) {
6082 partition_sched_domains(1, NULL, NULL);
6083 break;
6084 }
6085
6086
		/*
		 * This is the last CPU online operation. So fall through and
		 * restore the original sched domain behaviour.
		 */
6092 case CPU_ONLINE:
6093 case CPU_DOWN_FAILED:
6094 cpuset_update_active_cpus(true);
6095 break;
6096 default:
6097 return NOTIFY_DONE;
6098 }
6099 return NOTIFY_OK;
6100}
6101
6102static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6103 void *hcpu)
6104{
6105 switch (action) {
6106 case CPU_DOWN_PREPARE:
6107 cpuset_update_active_cpus(false);
6108 break;
6109 case CPU_DOWN_PREPARE_FROZEN:
6110 num_cpus_frozen++;
6111 partition_sched_domains(1, NULL, NULL);
6112 break;
6113 default:
6114 return NOTIFY_DONE;
6115 }
6116 return NOTIFY_OK;
6117}
6118
6119void __init sched_init_smp(void)
6120{
6121 cpumask_var_t non_isolated_cpus;
6122
6123 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6124 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6125
6126 sched_init_numa();
6127
	/*
	 * There's no userspace yet to cause hotplug operations; hence all the
	 * CPU masks are stable and all blatant races in the below code cannot
	 * happen.
	 */
6133 mutex_lock(&sched_domains_mutex);
6134 init_sched_domains(cpu_active_mask);
6135 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6136 if (cpumask_empty(non_isolated_cpus))
6137 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6138 mutex_unlock(&sched_domains_mutex);
6139
6140 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6141 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6142 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6143
6144 init_hrtick();
6145
	/* Move init over to a non-isolated CPU: */
6147 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6148 BUG();
6149 sched_init_granularity();
6150 free_cpumask_var(non_isolated_cpus);
6151
6152 init_sched_rt_class();
6153}
6154#else
6155void __init sched_init_smp(void)
6156{
6157 sched_init_granularity();
6158}
6159#endif
6160
6161const_debug unsigned int sysctl_timer_migration = 1;
6162
6163int in_sched_functions(unsigned long addr)
6164{
6165 return in_lock_functions(addr) ||
6166 (addr >= (unsigned long)__sched_text_start
6167 && addr < (unsigned long)__sched_text_end);
6168}
6169
6170#ifdef CONFIG_CGROUP_SCHED
/*
 * Default task group.
 * Every task in the system belongs to this group at bootup.
 */
6175struct task_group root_task_group;
6176LIST_HEAD(task_groups);
6177#endif
6178
6179DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6180
6181void __init sched_init(void)
6182{
6183 int i, j;
6184 unsigned long alloc_size = 0, ptr;
6185
6186#ifdef CONFIG_FAIR_GROUP_SCHED
6187 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6188#endif
6189#ifdef CONFIG_RT_GROUP_SCHED
6190 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6191#endif
6192#ifdef CONFIG_CPUMASK_OFFSTACK
6193 alloc_size += num_possible_cpus() * cpumask_size();
6194#endif
6195 if (alloc_size) {
6196 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6197
6198#ifdef CONFIG_FAIR_GROUP_SCHED
6199 root_task_group.se = (struct sched_entity **)ptr;
6200 ptr += nr_cpu_ids * sizeof(void **);
6201
6202 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6203 ptr += nr_cpu_ids * sizeof(void **);
6204
6205#endif
6206#ifdef CONFIG_RT_GROUP_SCHED
6207 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6208 ptr += nr_cpu_ids * sizeof(void **);
6209
6210 root_task_group.rt_rq = (struct rt_rq **)ptr;
6211 ptr += nr_cpu_ids * sizeof(void **);
6212
6213#endif
6214#ifdef CONFIG_CPUMASK_OFFSTACK
6215 for_each_possible_cpu(i) {
6216 per_cpu(load_balance_mask, i) = (void *)ptr;
6217 ptr += cpumask_size();
6218 }
6219#endif
6220 }
6221
6222#ifdef CONFIG_SMP
6223 init_defrootdomain();
6224#endif
6225
6226 init_rt_bandwidth(&def_rt_bandwidth,
6227 global_rt_period(), global_rt_runtime());
6228
6229#ifdef CONFIG_RT_GROUP_SCHED
6230 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6231 global_rt_period(), global_rt_runtime());
6232#endif
6233
6234#ifdef CONFIG_CGROUP_SCHED
6235 list_add(&root_task_group.list, &task_groups);
6236 INIT_LIST_HEAD(&root_task_group.children);
6237 INIT_LIST_HEAD(&root_task_group.siblings);
6238 autogroup_init(&init_task);
6239
6240#endif
6241
6242 for_each_possible_cpu(i) {
6243 struct rq *rq;
6244
6245 rq = cpu_rq(i);
6246 raw_spin_lock_init(&rq->lock);
6247 rq->nr_running = 0;
6248 rq->calc_load_active = 0;
6249 rq->calc_load_update = jiffies + LOAD_FREQ;
6250 init_cfs_rq(&rq->cfs);
6251 init_rt_rq(&rq->rt, rq);
6252#ifdef CONFIG_FAIR_GROUP_SCHED
6253 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6254 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		/*
		 * How much CPU bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed through the cgroup filesystem,
		 * it gets 100% of the CPU resources in the system. This
		 * overall system CPU resource is divided among the tasks of
		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task or task-group's) weight
		 * (se->load.weight).
		 *
		 * In other words, if root_task_group has 10 tasks of weight
		 * 1024 and two child groups A0 and B0, each of which has 1
		 * task of weight 1024, then A0's share of the CPU resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */
6274 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6275 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6276#endif
6277
6278 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6279#ifdef CONFIG_RT_GROUP_SCHED
6280 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6281 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6282#endif
6283
6284 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6285 rq->cpu_load[j] = 0;
6286
6287 rq->last_load_update_tick = jiffies;
6288
6289#ifdef CONFIG_SMP
6290 rq->sd = NULL;
6291 rq->rd = NULL;
6292 rq->cpu_power = SCHED_POWER_SCALE;
6293 rq->post_schedule = 0;
6294 rq->active_balance = 0;
6295 rq->next_balance = jiffies;
6296 rq->push_cpu = 0;
6297 rq->cpu = i;
6298 rq->online = 0;
6299 rq->idle_stamp = 0;
6300 rq->avg_idle = 2*sysctl_sched_migration_cost;
6301 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6302
6303 INIT_LIST_HEAD(&rq->cfs_tasks);
6304
6305 rq_attach_root(rq, &def_root_domain);
6306#ifdef CONFIG_NO_HZ_COMMON
6307 rq->nohz_flags = 0;
6308#endif
6309#ifdef CONFIG_NO_HZ_FULL
6310 rq->last_sched_tick = 0;
6311#endif
6312#endif
6313 init_rq_hrtick(rq);
6314 atomic_set(&rq->nr_iowait, 0);
6315 }
6316
6317 set_load_weight(&init_task);
6318
6319#ifdef CONFIG_PREEMPT_NOTIFIERS
6320 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6321#endif
6322
6323#ifdef CONFIG_RT_MUTEXES
6324 plist_head_init(&init_task.pi_waiters);
6325#endif
6326
	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
6330 atomic_inc(&init_mm.mm_count);
6331 enter_lazy_tlb(&init_mm, current);
6332
	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
6339 init_idle(current, smp_processor_id());
6340
6341 calc_load_update = jiffies + LOAD_FREQ;
6342
	/*
	 * During early bootup we pretend to be a normal task:
	 */
6346 current->sched_class = &fair_sched_class;
6347
6348#ifdef CONFIG_SMP
6349 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6350
6351 if (cpu_isolated_map == NULL)
6352 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6353 idle_thread_set_boot_cpu();
6354#endif
6355 init_sched_fair_class();
6356
6357 scheduler_running = 1;
6358}
6359
6360#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6361static inline int preempt_count_equals(int preempt_offset)
6362{
6363 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6364
6365 return (nested == preempt_offset);
6366}
6367
6368void __might_sleep(const char *file, int line, int preempt_offset)
6369{
6370 static unsigned long prev_jiffy;
6371
6372 rcu_sleep_check();
6373 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
6374 system_state != SYSTEM_RUNNING || oops_in_progress)
6375 return;
6376 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6377 return;
6378 prev_jiffy = jiffies;
6379
6380 printk(KERN_ERR
6381 "BUG: sleeping function called from invalid context at %s:%d\n",
6382 file, line);
6383 printk(KERN_ERR
6384 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6385 in_atomic(), irqs_disabled(),
6386 current->pid, current->comm);
6387
6388 debug_show_held_locks(current);
6389 if (irqs_disabled())
6390 print_irqtrace_events(current);
6391 dump_stack();
6392}
6393EXPORT_SYMBOL(__might_sleep);
6394#endif
6395
6396#ifdef CONFIG_MAGIC_SYSRQ
6397static void normalize_task(struct rq *rq, struct task_struct *p)
6398{
6399 const struct sched_class *prev_class = p->sched_class;
6400 int old_prio = p->prio;
6401 int on_rq;
6402
6403 on_rq = p->on_rq;
6404 if (on_rq)
6405 dequeue_task(rq, p, 0);
6406 __setscheduler(rq, p, SCHED_NORMAL, 0);
6407 if (on_rq) {
6408 enqueue_task(rq, p, 0);
6409 resched_task(rq->curr);
6410 }
6411
6412 check_class_changed(rq, p, prev_class, old_prio);
6413}
6414
6415void normalize_rt_tasks(void)
6416{
6417 struct task_struct *g, *p;
6418 unsigned long flags;
6419 struct rq *rq;
6420
6421 read_lock_irqsave(&tasklist_lock, flags);
6422 do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
6426 if (!p->mm)
6427 continue;
6428
6429 p->se.exec_start = 0;
6430#ifdef CONFIG_SCHEDSTATS
6431 p->se.statistics.wait_start = 0;
6432 p->se.statistics.sleep_start = 0;
6433 p->se.statistics.block_start = 0;
6434#endif
6435
6436 if (!rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
6441 if (TASK_NICE(p) < 0 && p->mm)
6442 set_user_nice(p, 0);
6443 continue;
6444 }
6445
6446 raw_spin_lock(&p->pi_lock);
6447 rq = __task_rq_lock(p);
6448
6449 normalize_task(rq, p);
6450
6451 __task_rq_unlock(rq);
6452 raw_spin_unlock(&p->pi_lock);
6453 } while_each_thread(g, p);
6454
6455 read_unlock_irqrestore(&tasklist_lock, flags);
6456}
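
/*
 * Note (descriptive addition): this is reached from the magic SysRq
 * handler; it forcibly returns every user-space RT task to
 * SCHED_NORMAL so a runaway RT workload can no longer starve the box.
 */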
6457
6458#endif
6459
6460#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * Return: The current task for @cpu.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6479struct task_struct *curr_task(int cpu)
6480{
6481 return cpu_curr(cpu);
6482}
6483
6484#endif
6485
6486#ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable
 * interrupts are serviced on a separate stack. It allows the
 * architecture to switch the notion of the current task on a cpu in a
 * non-blocking manner. It must be called with all CPUs synchronized
 * and interrupts disabled; the caller must save the original value of
 * the current task (see curr_task() above) and restore that value
 * before re-enabling interrupts and re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
6502void set_curr_task(int cpu, struct task_struct *p)
6503{
6504 cpu_curr(cpu) = p;
6505}
6506
6507#endif
6508
6509#ifdef CONFIG_CGROUP_SCHED
6510
6511static DEFINE_SPINLOCK(task_group_lock);
6512
6513static void free_sched_group(struct task_group *tg)
6514{
6515 free_fair_sched_group(tg);
6516 free_rt_sched_group(tg);
6517 autogroup_free(tg);
6518 kfree(tg);
6519}
6520
/* Allocate runqueue etc for a new task group */
6522struct task_group *sched_create_group(struct task_group *parent)
6523{
6524 struct task_group *tg;
6525
6526 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
6527 if (!tg)
6528 return ERR_PTR(-ENOMEM);
6529
6530 if (!alloc_fair_sched_group(tg, parent))
6531 goto err;
6532
6533 if (!alloc_rt_sched_group(tg, parent))
6534 goto err;
6535
6536 return tg;
6537
6538err:
6539 free_sched_group(tg);
6540 return ERR_PTR(-ENOMEM);
6541}
6542
6543void sched_online_group(struct task_group *tg, struct task_group *parent)
6544{
6545 unsigned long flags;
6546
6547 spin_lock_irqsave(&task_group_lock, flags);
6548 list_add_rcu(&tg->list, &task_groups);
6549
	WARN_ON(!parent); /* root should already exist */
6551
6552 tg->parent = parent;
6553 INIT_LIST_HEAD(&tg->children);
6554 list_add_rcu(&tg->siblings, &parent->children);
6555 spin_unlock_irqrestore(&task_group_lock, flags);
6556}
6557
/* RCU callback to free various structures associated with a task group */
6559static void free_sched_group_rcu(struct rcu_head *rhp)
6560{
	/* Now it should be safe to free those cfs_rqs: */
6562 free_sched_group(container_of(rhp, struct task_group, rcu));
6563}
6564
/* Destroy runqueue etc associated with a task group */
6566void sched_destroy_group(struct task_group *tg)
6567{
	/* Wait for possible concurrent references to cfs_rqs to complete: */
6569 call_rcu(&tg->rcu, free_sched_group_rcu);
6570}
6571
6572void sched_offline_group(struct task_group *tg)
6573{
6574 unsigned long flags;
6575 int i;
6576
	/* End participation in shares distribution: */
6578 for_each_possible_cpu(i)
6579 unregister_fair_sched_group(tg, i);
6580
6581 spin_lock_irqsave(&task_group_lock, flags);
6582 list_del_rcu(&tg->list);
6583 list_del_rcu(&tg->siblings);
6584 spin_unlock_irqrestore(&task_group_lock, flags);
6585}
6586
/*
 * Change a task's runqueue when it moves between groups.
 *	The caller of this function should have put the task in its new group
 *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent
 *	to reflect its new group.
 */
6592void sched_move_task(struct task_struct *tsk)
6593{
6594 struct task_group *tg;
6595 int on_rq, running;
6596 unsigned long flags;
6597 struct rq *rq;
6598
6599 rq = task_rq_lock(tsk, &flags);
6600
6601 running = task_current(rq, tsk);
6602 on_rq = tsk->on_rq;
6603
6604 if (on_rq)
6605 dequeue_task(rq, tsk, 0);
6606 if (unlikely(running))
6607 tsk->sched_class->put_prev_task(rq, tsk);
6608
6609 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6610 lockdep_is_held(&tsk->sighand->siglock)),
6611 struct task_group, css);
6612 tg = autogroup_task_group(tsk, tg);
6613 tsk->sched_task_group = tg;
6614
6615#ifdef CONFIG_FAIR_GROUP_SCHED
6616 if (tsk->sched_class->task_move_group)
6617 tsk->sched_class->task_move_group(tsk, on_rq);
6618 else
6619#endif
6620 set_task_rq(tsk, task_cpu(tsk));
6621
6622 if (unlikely(running))
6623 tsk->sched_class->set_curr_task(rq);
6624 if (on_rq)
6625 enqueue_task(rq, tsk, 0);
6626
6627 task_rq_unlock(rq, tsk, &flags);
6628}
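
/*
 * Note the pattern above: a queued task is dequeued (and a running one
 * has put_prev_task() called) before the group pointers change, then
 * re-enqueued/reinstated afterwards, so both the old and the new
 * cfs_rq/rt_rq see the task in a consistent state.
 */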
6629#endif
6630
6631#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6632static unsigned long to_ratio(u64 period, u64 runtime)
6633{
6634 if (runtime == RUNTIME_INF)
6635 return 1ULL << 20;
6636
6637 return div64_u64(runtime << 20, period);
6638}
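
/*
 * Worked example (illustrative): the ratio is a fixed point fraction
 * with 20 bits, so runtime == period (and RUNTIME_INF) map to
 * 1 << 20, i.e. "100%". A runtime of 950000us in a 1000000us period
 * gives div64_u64(950000 << 20, 1000000), roughly 0.95 << 20.
 */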
6639#endif
6640
6641#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
6645static DEFINE_MUTEX(rt_constraints_mutex);
6646
/* Must be called with tasklist_lock held */
6648static inline int tg_has_rt_tasks(struct task_group *tg)
6649{
6650 struct task_struct *g, *p;
6651
6652 do_each_thread(g, p) {
6653 if (rt_task(p) && task_rq(p)->rt.tg == tg)
6654 return 1;
6655 } while_each_thread(g, p);
6656
6657 return 0;
6658}
6659
6660struct rt_schedulable_data {
6661 struct task_group *tg;
6662 u64 rt_period;
6663 u64 rt_runtime;
6664};
6665
6666static int tg_rt_schedulable(struct task_group *tg, void *data)
6667{
6668 struct rt_schedulable_data *d = data;
6669 struct task_group *child;
6670 unsigned long total, sum = 0;
6671 u64 period, runtime;
6672
6673 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6674 runtime = tg->rt_bandwidth.rt_runtime;
6675
6676 if (tg == d->tg) {
6677 period = d->rt_period;
6678 runtime = d->rt_runtime;
6679 }
6680
	/*
	 * Cannot have more runtime than the period.
	 */
6684 if (runtime > period && runtime != RUNTIME_INF)
6685 return -EINVAL;
6686
	/*
	 * Ensure we don't starve existing RT tasks.
	 */
6690 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
6691 return -EBUSY;
6692
6693 total = to_ratio(period, runtime);
6694
	/*
	 * Nobody can have more than the global setting allows.
	 */
6698 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
6699 return -EINVAL;
6700
	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
6704 list_for_each_entry_rcu(child, &tg->children, siblings) {
6705 period = ktime_to_ns(child->rt_bandwidth.rt_period);
6706 runtime = child->rt_bandwidth.rt_runtime;
6707
6708 if (child == d->tg) {
6709 period = d->rt_period;
6710 runtime = d->rt_runtime;
6711 }
6712
6713 sum += to_ratio(period, runtime);
6714 }
6715
6716 if (sum > total)
6717 return -EINVAL;
6718
6719 return 0;
6720}
6721
6722static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
6723{
6724 int ret;
6725
6726 struct rt_schedulable_data data = {
6727 .tg = tg,
6728 .rt_period = period,
6729 .rt_runtime = runtime,
6730 };
6731
6732 rcu_read_lock();
6733 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
6734 rcu_read_unlock();
6735
6736 return ret;
6737}
6738
6739static int tg_set_rt_bandwidth(struct task_group *tg,
6740 u64 rt_period, u64 rt_runtime)
6741{
6742 int i, err = 0;
6743
6744 mutex_lock(&rt_constraints_mutex);
6745 read_lock(&tasklist_lock);
6746 err = __rt_schedulable(tg, rt_period, rt_runtime);
6747 if (err)
6748 goto unlock;
6749
6750 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6751 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
6752 tg->rt_bandwidth.rt_runtime = rt_runtime;
6753
6754 for_each_possible_cpu(i) {
6755 struct rt_rq *rt_rq = tg->rt_rq[i];
6756
6757 raw_spin_lock(&rt_rq->rt_runtime_lock);
6758 rt_rq->rt_runtime = rt_runtime;
6759 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6760 }
6761 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
6762unlock:
6763 read_unlock(&tasklist_lock);
6764 mutex_unlock(&rt_constraints_mutex);
6765
6766 return err;
6767}
6768
6769static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
6770{
6771 u64 rt_runtime, rt_period;
6772
6773 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
6774 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
6775 if (rt_runtime_us < 0)
6776 rt_runtime = RUNTIME_INF;
6777
6778 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6779}
6780
6781static long sched_group_rt_runtime(struct task_group *tg)
6782{
6783 u64 rt_runtime_us;
6784
6785 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
6786 return -1;
6787
6788 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
6789 do_div(rt_runtime_us, NSEC_PER_USEC);
6790 return rt_runtime_us;
6791}
6792
6793static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
6794{
6795 u64 rt_runtime, rt_period;
6796
6797 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
6798 rt_runtime = tg->rt_bandwidth.rt_runtime;
6799
6800 if (rt_period == 0)
6801 return -EINVAL;
6802
6803 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
6804}
6805
6806static long sched_group_rt_period(struct task_group *tg)
6807{
6808 u64 rt_period_us;
6809
6810 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
6811 do_div(rt_period_us, NSEC_PER_USEC);
6812 return rt_period_us;
6813}
6814
6815static int sched_rt_global_constraints(void)
6816{
6817 u64 runtime, period;
6818 int ret = 0;
6819
6820 if (sysctl_sched_rt_period <= 0)
6821 return -EINVAL;
6822
6823 runtime = global_rt_runtime();
6824 period = global_rt_period();
6825
	/*
	 * Sanity check on the sysctl variables.
	 */
6829 if (runtime > period && runtime != RUNTIME_INF)
6830 return -EINVAL;
6831
6832 mutex_lock(&rt_constraints_mutex);
6833 read_lock(&tasklist_lock);
6834 ret = __rt_schedulable(NULL, 0, 0);
6835 read_unlock(&tasklist_lock);
6836 mutex_unlock(&rt_constraints_mutex);
6837
6838 return ret;
6839}
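
/*
 * Note (descriptive addition): passing (NULL, 0, 0) re-evaluates every
 * task group against the possibly just-updated global limits. With
 * d->tg == NULL each group contributes its own period/runtime, and
 * tg_rt_schedulable() compares the totals against
 * to_ratio(global_rt_period(), global_rt_runtime()).
 */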
6840
6841static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6842{
	/* Don't accept realtime tasks when there is no way for them to run */
6844 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
6845 return 0;
6846
6847 return 1;
6848}
6849
6850#else
6851static int sched_rt_global_constraints(void)
6852{
6853 unsigned long flags;
6854 int i;
6855
6856 if (sysctl_sched_rt_period <= 0)
6857 return -EINVAL;
6858
	/*
	 * There are always some RT tasks in the root group
	 * -- migration, kstopmachine etc.
	 */
6863 if (sysctl_sched_rt_runtime == 0)
6864 return -EBUSY;
6865
6866 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6867 for_each_possible_cpu(i) {
6868 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
6869
6870 raw_spin_lock(&rt_rq->rt_runtime_lock);
6871 rt_rq->rt_runtime = global_rt_runtime();
6872 raw_spin_unlock(&rt_rq->rt_runtime_lock);
6873 }
6874 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6875
6876 return 0;
6877}
6878#endif
6879
6880int sched_rr_handler(struct ctl_table *table, int write,
6881 void __user *buffer, size_t *lenp,
6882 loff_t *ppos)
6883{
6884 int ret;
6885 static DEFINE_MUTEX(mutex);
6886
6887 mutex_lock(&mutex);
6888 ret = proc_dointvec(table, write, buffer, lenp, ppos);
	/* Make sure that internally we keep jiffies. */
	/* Also, writing zero resets the timeslice to default. */
6891 if (!ret && write) {
6892 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
6893 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
6894 }
6895 mutex_unlock(&mutex);
6896 return ret;
6897}
6898
6899int sched_rt_handler(struct ctl_table *table, int write,
6900 void __user *buffer, size_t *lenp,
6901 loff_t *ppos)
6902{
6903 int ret;
6904 int old_period, old_runtime;
6905 static DEFINE_MUTEX(mutex);
6906
6907 mutex_lock(&mutex);
6908 old_period = sysctl_sched_rt_period;
6909 old_runtime = sysctl_sched_rt_runtime;
6910
6911 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6912
6913 if (!ret && write) {
6914 ret = sched_rt_global_constraints();
6915 if (ret) {
6916 sysctl_sched_rt_period = old_period;
6917 sysctl_sched_rt_runtime = old_runtime;
6918 } else {
6919 def_rt_bandwidth.rt_runtime = global_rt_runtime();
6920 def_rt_bandwidth.rt_period =
6921 ns_to_ktime(global_rt_period());
6922 }
6923 }
6924 mutex_unlock(&mutex);
6925
6926 return ret;
6927}
6928
6929#ifdef CONFIG_CGROUP_SCHED
6930
6931static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
6932{
6933 return css ? container_of(css, struct task_group, css) : NULL;
6934}
6935
6936static struct cgroup_subsys_state *
6937cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6938{
6939 struct task_group *parent = css_tg(parent_css);
6940 struct task_group *tg;
6941
6942 if (!parent) {
		/* This is early initialization for the top cgroup */
6944 return &root_task_group.css;
6945 }
6946
6947 tg = sched_create_group(parent);
6948 if (IS_ERR(tg))
6949 return ERR_PTR(-ENOMEM);
6950
6951 return &tg->css;
6952}
6953
6954static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
6955{
6956 struct task_group *tg = css_tg(css);
6957 struct task_group *parent = css_tg(css_parent(css));
6958
6959 if (parent)
6960 sched_online_group(tg, parent);
6961 return 0;
6962}
6963
6964static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
6965{
6966 struct task_group *tg = css_tg(css);
6967
6968 sched_destroy_group(tg);
6969}
6970
6971static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
6972{
6973 struct task_group *tg = css_tg(css);
6974
6975 sched_offline_group(tg);
6976}
6977
6978static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
6979 struct cgroup_taskset *tset)
6980{
6981 struct task_struct *task;
6982
6983 cgroup_taskset_for_each(task, css, tset) {
6984#ifdef CONFIG_RT_GROUP_SCHED
6985 if (!sched_rt_can_attach(css_tg(css), task))
6986 return -EINVAL;
6987#else
		/* We don't support RT-tasks being in separate groups */
6989 if (task->sched_class != &fair_sched_class)
6990 return -EINVAL;
6991#endif
6992 }
6993 return 0;
6994}
6995
6996static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
6997 struct cgroup_taskset *tset)
6998{
6999 struct task_struct *task;
7000
7001 cgroup_taskset_for_each(task, css, tset)
7002 sched_move_task(task);
7003}
7004
7005static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7006 struct cgroup_subsys_state *old_css,
7007 struct task_struct *task)
7008{
	/*
	 * cgroup_exit() is called in the copy_process() failure path.
	 * Ignore this case since the task hasn't run yet; this avoids
	 * trying to poke a half-freed task state from generic code.
	 */
7014 if (!(task->flags & PF_EXITING))
7015 return;
7016
7017 sched_move_task(task);
7018}
7019
7020#ifdef CONFIG_FAIR_GROUP_SCHED
7021static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7022 struct cftype *cftype, u64 shareval)
7023{
7024 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7025}
7026
7027static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7028 struct cftype *cft)
7029{
7030 struct task_group *tg = css_tg(css);
7031
7032 return (u64) scale_load_down(tg->shares);
7033}
7034
7035#ifdef CONFIG_CFS_BANDWIDTH
7036static DEFINE_MUTEX(cfs_constraints_mutex);
7037
7038const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7039const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
7040
7041static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7042
7043static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7044{
7045 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7046 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7047
7048 if (tg == &root_task_group)
7049 return -EINVAL;
7050
	/*
	 * Ensure we have some amount of bandwidth every period. This is to
	 * prevent reaching a state of large arrears when throttled via
	 * entity_tick(), resulting in prolonged exit starvation.
	 */
7056 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7057 return -EINVAL;
7058
	/*
	 * Likewise, bound things on the other side by preventing insane quota
	 * periods. This also allows us to normalize in computing quota
	 * feasibility.
	 */
7064 if (period > max_cfs_quota_period)
7065 return -EINVAL;
7066
7067 mutex_lock(&cfs_constraints_mutex);
7068 ret = __cfs_schedulable(tg, period, quota);
7069 if (ret)
7070 goto out_unlock;
7071
7072 runtime_enabled = quota != RUNTIME_INF;
7073 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards.
	 */
7078 if (runtime_enabled && !runtime_was_enabled)
7079 cfs_bandwidth_usage_inc();
7080 raw_spin_lock_irq(&cfs_b->lock);
7081 cfs_b->period = ns_to_ktime(period);
7082 cfs_b->quota = quota;
7083
7084 __refill_cfs_bandwidth_runtime(cfs_b);
	/* Restart the period timer (if active) to handle new period expiry: */
7086 if (runtime_enabled && cfs_b->timer_active) {
		/* Force a reprogram: */
7088 cfs_b->timer_active = 0;
7089 __start_cfs_bandwidth(cfs_b);
7090 }
7091 raw_spin_unlock_irq(&cfs_b->lock);
7092
7093 for_each_possible_cpu(i) {
7094 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7095 struct rq *rq = cfs_rq->rq;
7096
7097 raw_spin_lock_irq(&rq->lock);
7098 cfs_rq->runtime_enabled = runtime_enabled;
7099 cfs_rq->runtime_remaining = 0;
7100
7101 if (cfs_rq->throttled)
7102 unthrottle_cfs_rq(cfs_rq);
7103 raw_spin_unlock_irq(&rq->lock);
7104 }
7105 if (runtime_was_enabled && !runtime_enabled)
7106 cfs_bandwidth_usage_dec();
7107out_unlock:
7108 mutex_unlock(&cfs_constraints_mutex);
7109
7110 return ret;
7111}
7112
7113int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7114{
7115 u64 quota, period;
7116
7117 period = ktime_to_ns(tg->cfs_bandwidth.period);
7118 if (cfs_quota_us < 0)
7119 quota = RUNTIME_INF;
7120 else
7121 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7122
7123 return tg_set_cfs_bandwidth(tg, period, quota);
7124}
7125
7126long tg_get_cfs_quota(struct task_group *tg)
7127{
7128 u64 quota_us;
7129
7130 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7131 return -1;
7132
7133 quota_us = tg->cfs_bandwidth.quota;
7134 do_div(quota_us, NSEC_PER_USEC);
7135
7136 return quota_us;
7137}
7138
7139int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7140{
7141 u64 quota, period;
7142
7143 period = (u64)cfs_period_us * NSEC_PER_USEC;
7144 quota = tg->cfs_bandwidth.quota;
7145
7146 return tg_set_cfs_bandwidth(tg, period, quota);
7147}
7148
7149long tg_get_cfs_period(struct task_group *tg)
7150{
7151 u64 cfs_period_us;
7152
7153 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7154 do_div(cfs_period_us, NSEC_PER_USEC);
7155
7156 return cfs_period_us;
7157}
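
/*
 * Usage sketch (illustrative only): capping a group at half a CPU
 * could combine the two knobs as
 *
 *	tg_set_cfs_period(tg, 100000);	// 100ms period
 *	tg_set_cfs_quota(tg, 50000);	// 50ms quota per period
 *
 * while a negative quota (-1) returns the group to "unconstrained"
 * (RUNTIME_INF), as tg_set_cfs_quota() above shows.
 */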
7158
7159static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7160 struct cftype *cft)
7161{
7162 return tg_get_cfs_quota(css_tg(css));
7163}
7164
7165static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7166 struct cftype *cftype, s64 cfs_quota_us)
7167{
7168 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7169}
7170
7171static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7172 struct cftype *cft)
7173{
7174 return tg_get_cfs_period(css_tg(css));
7175}
7176
7177static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7178 struct cftype *cftype, u64 cfs_period_us)
7179{
7180 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7181}
7182
7183struct cfs_schedulable_data {
7184 struct task_group *tg;
7185 u64 period, quota;
7186};
7187
/*
 * Normalize group quota/period to be quota/max_period.
 * Note: units are usecs.
 */
7192static u64 normalize_cfs_quota(struct task_group *tg,
7193 struct cfs_schedulable_data *d)
7194{
7195 u64 quota, period;
7196
7197 if (tg == d->tg) {
7198 period = d->period;
7199 quota = d->quota;
7200 } else {
7201 period = tg_get_cfs_period(tg);
7202 quota = tg_get_cfs_quota(tg);
7203 }
7204
	/* Note: these should typically be equivalent. */
7206 if (quota == RUNTIME_INF || quota == -1)
7207 return RUNTIME_INF;
7208
7209 return to_ratio(period, quota);
7210}
7211
7212static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7213{
7214 struct cfs_schedulable_data *d = data;
7215 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7216 s64 quota = 0, parent_quota = -1;
7217
7218 if (!tg->parent) {
7219 quota = RUNTIME_INF;
7220 } else {
7221 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7222
7223 quota = normalize_cfs_quota(tg, d);
7224 parent_quota = parent_b->hierarchal_quota;
7225
		/*
		 * Ensure max(child_quota) <= parent_quota; inherit when no
		 * limit is set:
		 */
7230 if (quota == RUNTIME_INF)
7231 quota = parent_quota;
7232 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7233 return -EINVAL;
7234 }
7235 cfs_b->hierarchal_quota = quota;
7236
7237 return 0;
7238}
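
/*
 * Worked example (illustrative): if a parent has quota 50ms in a 100ms
 * period and a child asks for quota 80ms in a 100ms period, the
 * child's normalized quota exceeds parent_quota and the walk fails
 * with -EINVAL; a child with no quota (RUNTIME_INF) simply inherits
 * the parent's effective limit.
 */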
7239
7240static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7241{
7242 int ret;
7243 struct cfs_schedulable_data data = {
7244 .tg = tg,
7245 .period = period,
7246 .quota = quota,
7247 };
7248
7249 if (quota != RUNTIME_INF) {
7250 do_div(data.period, NSEC_PER_USEC);
7251 do_div(data.quota, NSEC_PER_USEC);
7252 }
7253
7254 rcu_read_lock();
7255 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7256 rcu_read_unlock();
7257
7258 return ret;
7259}
7260
7261static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7262 struct cgroup_map_cb *cb)
7263{
7264 struct task_group *tg = css_tg(css);
7265 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7266
7267 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7268 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7269 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7270
7271 return 0;
7272}
7273#endif
7274#endif
7275
7276#ifdef CONFIG_RT_GROUP_SCHED
7277static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7278 struct cftype *cft, s64 val)
7279{
7280 return sched_group_set_rt_runtime(css_tg(css), val);
7281}
7282
7283static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7284 struct cftype *cft)
7285{
7286 return sched_group_rt_runtime(css_tg(css));
7287}
7288
7289static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7290 struct cftype *cftype, u64 rt_period_us)
7291{
7292 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7293}
7294
7295static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7296 struct cftype *cft)
7297{
7298 return sched_group_rt_period(css_tg(css));
7299}
7300#endif
7301
7302static struct cftype cpu_files[] = {
7303#ifdef CONFIG_FAIR_GROUP_SCHED
7304 {
7305 .name = "shares",
7306 .read_u64 = cpu_shares_read_u64,
7307 .write_u64 = cpu_shares_write_u64,
7308 },
7309#endif
7310#ifdef CONFIG_CFS_BANDWIDTH
7311 {
7312 .name = "cfs_quota_us",
7313 .read_s64 = cpu_cfs_quota_read_s64,
7314 .write_s64 = cpu_cfs_quota_write_s64,
7315 },
7316 {
7317 .name = "cfs_period_us",
7318 .read_u64 = cpu_cfs_period_read_u64,
7319 .write_u64 = cpu_cfs_period_write_u64,
7320 },
7321 {
7322 .name = "stat",
7323 .read_map = cpu_stats_show,
7324 },
7325#endif
7326#ifdef CONFIG_RT_GROUP_SCHED
7327 {
7328 .name = "rt_runtime_us",
7329 .read_s64 = cpu_rt_runtime_read,
7330 .write_s64 = cpu_rt_runtime_write,
7331 },
7332 {
7333 .name = "rt_period_us",
7334 .read_u64 = cpu_rt_period_read_uint,
7335 .write_u64 = cpu_rt_period_write_uint,
7336 },
7337#endif
7338 { }
7339};
7340
7341struct cgroup_subsys cpu_cgroup_subsys = {
7342 .name = "cpu",
7343 .css_alloc = cpu_cgroup_css_alloc,
7344 .css_free = cpu_cgroup_css_free,
7345 .css_online = cpu_cgroup_css_online,
7346 .css_offline = cpu_cgroup_css_offline,
7347 .can_attach = cpu_cgroup_can_attach,
7348 .attach = cpu_cgroup_attach,
7349 .exit = cpu_cgroup_exit,
7350 .subsys_id = cpu_cgroup_subsys_id,
7351 .base_cftypes = cpu_files,
7352 .early_init = 1,
7353};
7354
7355#endif
7356
7357void dump_cpu_task(int cpu)
7358{
7359 pr_info("Task dump for CPU %d:\n", cpu);
7360 sched_show_task(cpu_curr(cpu));
7361}
7362