/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing, smp-nice and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

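/*
 * start_bandwidth_timer - (re)arm a bandwidth period timer.
 *
 * Forward @period_timer to the next period boundary and start it,
 * unless it is already active.
 */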
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

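/*
 * update_rq_clock - advance the runqueue clock from sched_clock_cpu(),
 * unless a pending clock update was explicitly marked to be skipped.
 */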
void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

static void sched_feat_disable(int i)
{
	if (static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_dec(&sched_feat_keys[i]);
}

static void sched_feat_enable(int i)
{
	if (!static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */

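/*
 * sched_feat_set - parse a feature name (optionally prefixed with "NO_")
 * and flip the corresponding feature bit and static key.  Returns the
 * index of the matched feature, or __SCHED_FEAT_NR if nothing matched.
 */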
static int sched_feat_set(char *cmp)
{
	int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg) {
				sysctl_sched_features &= ~(1UL << i);
				sched_feat_disable(i);
			} else {
				sysctl_sched_features |= (1UL << i);
				sched_feat_enable(i);
			}
			break;
		}
	}

	return i;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	i = sched_feat_set(cmp);
	if (i == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
	}
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * Its all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */
static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP
void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

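/*
 * resched_cpu - resched_task() whatever is currently running on @cpu,
 * but only if the runqueue lock can be taken without spinning.
 */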
void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			smp_send_reschedule(cpu);
		return true;
	}

	return false;
}

void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

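/*
 * got_nohz_idle_kick - true if this (idle) CPU was kicked to run the
 * nohz idle load balance; otherwise clear a stale kick and bail.
 */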
static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run the idle load balance on this CPU for this time so
	 * cancel it and clear NOHZ_BALANCE_KICK.
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
	struct rq *rq;

	rq = this_rq();

	/* Make sure rq->nr_running update is visible after the IPI */
	smp_rmb();

	/* More than one running task need preemption */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq->clock - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#else /* !CONFIG_SMP */
void resched_task(struct task_struct *p)
{
	assert_raw_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

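/*
 * set_load_weight - derive @p's CFS load weight and inverse weight
 * from its static priority.
 */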
static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}

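/*
 * enqueue_task/dequeue_task - add or remove a task from a runqueue via
 * its scheduling class, keeping the rq clock and schedstats up to date.
 */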
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compile should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);
		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio)
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}

static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);

void register_task_migration_notifier(struct notifier_block *n)
{
	atomic_notifier_chain_register(&task_migration_notifier, n);
}

#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		struct task_migration_notifier tmn;

		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

		tmn.task = p;
		tmn.from_cpu = task_cpu(p);
		tmn.to_cpu = new_cpu;

		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
	}

	__set_task_cpu(p, new_cpu);
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

static int migration_cpu_stop(void *data);

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a very long time waiting.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		on_rq = p->on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(on_rq)) {
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */

#ifdef CONFIG_SMP
/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the cpu is on has been offlined, cpu_to_node()
	 * will return -1. There is no cpu on the node, and we should
	 * select the cpu on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			goto out;
		}

		switch (state) {
		case cpuset:
			/* No more Mr. Nice Guy. */
			cpuset_cpus_allowed_fallback(p);
			state = possible;
			break;

		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_sched("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * cpu.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}
#endif

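/*
 * ttwu_stat - update schedstat wakeup counters for a wakeup of @p on @cpu.
 */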
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = 1;

	/* if a worker is waking up, notify workqueue */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	check_preempt_curr(rq, p, wake_flags);
	trace_sched_wakeup(p, true);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);

	if (rq->idle_stamp) {
		u64 delta = rq->clock - rq->idle_stamp;
		u64 max = 2*sysctl_sched_migration_cost;

		if (delta > max)
			rq->avg_idle = max;
		else
			update_avg(&rq->avg_idle, delta);
		rq->idle_stamp = 0;
	}
#endif
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	if (p->on_rq) {
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}

#ifdef CONFIG_SMP
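/*
 * sched_ttwu_pending - drain this runqueue's remote wakeup list and
 * activate each task on it.  Called from the scheduler IPI.
 */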
static void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;

	raw_spin_lock(&rq->lock);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	raw_spin_unlock(&rq->lock);
}

void scheduler_ipi(void)
{
	if (llist_empty(&this_rq()->wake_list)
			&& !tick_nohz_full_cpu(smp_processor_id())
			&& !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	tick_nohz_full_check();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

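/*
 * ttwu_queue - queue the wakeup of @p on @cpu: directly under the remote
 * rq lock, or via an IPI when TTWU_QUEUE is enabled and the target CPU
 * does not share a cache with us.
 */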
static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	raw_spin_unlock(&rq->lock);
}

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	smp_wmb();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);

	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until its done referencing the task.
	 */
	while (p->on_cpu)
		cpu_relax();
	/*
	 * Pairs with the smp_wmb() in finish_lock_switch().
	 */
	smp_rmb();

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */
static void try_to_wake_up_local(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	if (WARN_ON_ONCE(rq != this_rq()) ||
	    WARN_ON_ONCE(p == current))
		return;

	lockdep_assert_held(&rq->lock);

	if (!raw_spin_trylock(&p->pi_lock)) {
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
	}

	if (!(p->state & TASK_NORMAL))
		goto out;

	if (!p->on_rq)
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);

	ttwu_do_wakeup(rq, p, 0);
	ttwu_stat(p, smp_processor_id(), 0);
out:
	raw_spin_unlock(&p->pi_lock);
}

/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Returns 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
int wake_up_process(struct task_struct *p)
{
	WARN_ON(task_is_stopped_or_traced(p));
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);

int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

/*
 * Perform scheduler related setup for a newly forked task p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
static void __sched_fork(struct task_struct *p)
{
	p->on_rq			= 0;

	p->se.on_rq			= 0;
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.nr_migrations		= 0;
	p->se.vruntime			= 0;
	INIT_LIST_HEAD(&p->se.group_node);

/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
	p->se.avg.runnable_avg_period = 0;
	p->se.avg.runnable_avg_sum = 0;
#endif
#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	INIT_LIST_HEAD(&p->rt.run_list);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		p->mm->numa_next_scan = jiffies;
		p->mm->numa_next_reset = jiffies;
		p->mm->numa_scan_seq = 0;
	}

	p->node_stamp = 0ULL;
	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
#endif /* CONFIG_NUMA_BALANCING */
}

#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_SCHED_DEBUG
void set_numabalancing_state(bool enabled)
{
	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_NUMA_BALANCING */

/*
 * fork()/clone()-time setup:
 */
void sched_fork(struct task_struct *p)
{
	unsigned long flags;
	int cpu = get_cpu();

	__sched_fork(p);
	/*
	 * We mark the process as running here. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		if (task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p);

		/*
		 * We don't need the reset flag anymore after the fork. It has
		 * fulfilled its duty:
		 */
		p->sched_reset_on_fork = 0;
	}

	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

	/*
	 * The child is not yet in the pid-hash so no cgroup attach races,
	 * and the cgroup is pinned to this child due to cgroup_fork()
	 * is ran before sched_fork().
	 *
	 * Silence PROVE_RCU.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	set_task_cpu(p, cpu);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT_COUNT
	/* Want to start with kernel preemption disabled. */
	task_thread_info(p)->preempt_count = 1;
#endif
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif

	put_cpu();
}

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif

	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = 1;
	trace_sched_wakeup_new(p, true);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}

#ifdef CONFIG_PREEMPT_NOTIFIERS

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	trace_sched_switch(prev, next);
	sched_info_switch(prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 */
	prev_state = prev->state;
	vtime_task_switch(prev);
	finish_arch_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_lock_switch(rq, prev);
	finish_arch_post_lock_switch();

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}

	tick_nohz_task_switch(current);
}

#ifdef CONFIG_SMP

/* assumes rq->lock is held */
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
}

/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
	if (rq->post_schedule) {
		unsigned long flags;

		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->curr->sched_class->post_schedule)
			rq->curr->sched_class->post_schedule(rq);
		raw_spin_unlock_irqrestore(&rq->lock, flags);

		rq->post_schedule = 0;
	}
}

#else

static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}

static inline void post_schedule(struct rq *rq)
{
}

#endif

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);

	/*
	 * FIXME: do we need to worry about rq being invalidated by the
	 * task_switch?
	 */
	post_schedule(rq);

#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* In this case, finish_task_switch does not reenable preemption */
	preempt_enable();
#endif
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}

/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

unsigned long nr_iowait_cpu(int cpu)
{
	struct rq *this = cpu_rq(cpu);
	return atomic_read(&this->nr_iowait);
}

unsigned long this_cpu_load(void)
{
	struct rq *this = this_rq();
	return this->cpu_load[0];
}

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns in the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
 *    serious number of cpus, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(j) - x_i(j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true per definition, we
 *    can simply take per-cpu deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    cpu to have completed this task.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
 *    this would add another cross-cpu cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever cpu the task ran
 *    when it went into uninterruptible state and decrement on whatever cpu
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all cpus yields the correct result.
 *
 *  - This covers the NO_HZ=n code, for extra head-aches, see the comment
 *    below.
 */

/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);

/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}

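/*
 * calc_load_fold_active - return the change in this rq's active task
 * count (nr_running + nr_uninterruptible) since the last fold.
 */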
static long calc_load_fold_active(struct rq *this_rq)
{
	long nr_active, delta = 0;

	nr_active = this_rq->nr_running;
	nr_active += (long) this_rq->nr_uninterruptible;

	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}

	return delta;
}

/*
 * a1 = a0 * e + a * (1 - e)
 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-cpu sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global idle-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two idle-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
 *    This ensures we'll fold the old idle contribution in this window while
 *    accumulating the new one.
 *
 *  - When we wake up from NO_HZ idle during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this cpu (effectively using the idle-delta for this cpu which
 *    was in effect at the time the window opened). This also solves the issue
 *    of having to deal with a cpu having been in NOHZ idle for multiple
 *    LOAD_FREQ intervals.
 */
static atomic_long_t calc_load_idle[2];
static int calc_load_idx;

static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(), if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
	 * next idle-delta.
	 */
	if (!time_before(jiffies, calc_load_update))
		idx++;

	return idx & 1;
}

static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

void calc_load_enter_idle(void)
{
	struct rq *this_rq = this_rq();
	long delta;

	/*
	 * We're going into NOHZ mode, if there's any pending delta, fold it
	 * into the pending idle delta.
	 */
	delta = calc_load_fold_active(this_rq);
	if (delta) {
		int idx = calc_load_write_idx();
		atomic_long_add(delta, &calc_load_idle[idx]);
	}
}

void calc_load_exit_idle(void)
{
	struct rq *this_rq = this_rq();

	/*
	 * If we're still before the sample window, we're done.
	 */
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window, this means we're already
	 * accounted through the nohz accounting, so skip the entire deal and
	 * sync up for the next window.
	 */
	this_rq->calc_load_update = calc_load_update;
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}

static long calc_load_fold_idle(void)
{
	int idx = calc_load_read_idx();
	long delta = 0;

	if (atomic_long_read(&calc_load_idle[idx]))
		delta = atomic_long_xchg(&calc_load_idle[idx], 0);

	return delta;
}

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return result;
}

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 *  ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0          1 - x
 */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

/*
 * NO_HZ can leave us missing all per-cpu ticks calling
 * calc_load_account_active(), but since an idle CPU folds its delta into
 * calc_load_idle per calc_load_enter_idle(), all we need to do is fold
 * in the pending idle delta if our idle period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
	long delta, active, n;

	if (!time_before(jiffies, calc_load_update + 10)) {
		/*
		 * Catch-up, fold however many we are behind still
		 */
		delta = jiffies - calc_load_update - 10;
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

		calc_load_update += n * LOAD_FREQ;
	}

	/*
	 * Flip the idle index...
	 *
	 * Make sure we first write the new time then flip the index, so that
	 * calc_load_write_idx() will see the new time when it reads the new
	 * index, this avoids a double flip messing things up.
	 */
	smp_wmb();
	calc_load_idx++;
}
#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 */
void calc_global_load(unsigned long ticks)
{
	long active, delta;

	if (time_before(jiffies, calc_load_update + 10))
		return;

	/*
	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
	 */
	delta = calc_load_fold_idle();
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

	calc_load_update += LOAD_FREQ;

	/*
	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
	 */
	calc_global_nohz();
}

/*
 * Called from update_cpu_load_active() to periodically update this CPU's
 * active count.
 */
static void calc_load_account_active(struct rq *this_rq)
{
	long delta;

	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	delta = calc_load_fold_active(this_rq);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	this_rq->calc_load_update += LOAD_FREQ;
}

/*
 * End of global load-average stuff
 */

/*
 * The exact cpuload at various idx values, calculated at every tick would be
 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 *
 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets
 * called on the n-th tick when cpu may be busy, then we have:
 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
 *
 * decay_load_missed() below does efficient calculation of
 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
 *
 * The calculation is approximated on a 128 point scale.
 * degrade_zero_ticks is the number of ticks after which load at any
 * particular idx is approximated to be zero.
 * degrade_factor is a precomputed table, a row for each load idx.
 * Each column corresponds to degradation factor for a power of two ticks,
 * based on 128 point scale.
 * Example:
 * row 2, col 3 (=12) says that the degradation at load idx 2 after
 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
 *
 * With this power of 2 load factors, we can degrade the load n times
 * by looking only at 1 + log2(n) cells instead of n.
 */
#define DEGRADE_SHIFT		7
static const unsigned char
		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char
		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
					{0, 0, 0, 0, 0, 0, 0, 0},
					{64, 32, 8, 0, 0, 0, 0, 0},
					{96, 72, 40, 12, 1, 0, 0},
					{112, 98, 75, 43, 15, 1, 0},
					{120, 112, 98, 76, 45, 16, 2} };

/*
 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
 * would be when CPU is idle and so we just decay the old load without
 * adding any new load.
 */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
	int j = 0;

	if (!missed_updates)
		return load;

	if (missed_updates >= degrade_zero_ticks[idx])
		return 0;

	if (idx == 1)
		return load >> missed_updates;

	while (missed_updates) {
		if (missed_updates % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;

		missed_updates >>= 1;
		j++;
	}
	return load;
}

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 * every tick. We fix it up based on jiffies.
 */
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
			      unsigned long pending_updates)
{
	int i, scale;

	this_rq->nr_load_updates++;

	/* Update our load: */
	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
		old_load = decay_load_missed(old_load, pending_updates - 1, i);
		new_load = this_load;
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
		if (new_load > old_load)
			new_load += scale - 1;

		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}

	sched_avg_update(this_rq);
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we cannot use the delta approach from the regular tick since that
 * would seriously skew the load calculation. However we'll make do for those
 * updates happening while idle (nohz_idle_balance) or coming out of idle
 * (tick_nohz_idle_exit).
 *
 * This means we might still be one tick off for nohz periods.
 */

/*
 * Called from nohz_idle_balance() to update the load ratings before doing the
 * idle balance.
 */
void update_idle_cpu_load(struct rq *this_rq)
{
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long load = this_rq->load.weight;
	unsigned long pending_updates;

	/*
	 * bail if there's load or we're actually up-to-date.
	 */
	if (load || curr_jiffies == this_rq->last_load_update_tick)
		return;

	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	this_rq->last_load_update_tick = curr_jiffies;

	__update_cpu_load(this_rq, load, pending_updates);
}

/*
 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 */
void update_cpu_load_nohz(void)
{
	struct rq *this_rq = this_rq();
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long pending_updates;

	if (curr_jiffies == this_rq->last_load_update_tick)
		return;

	raw_spin_lock(&this_rq->lock);
	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	if (pending_updates) {
		this_rq->last_load_update_tick = curr_jiffies;
		/*
		 * We were idle, this means load 0, the current load might be
		 * !0 due to remote wakeups and the sort.
		 */
		__update_cpu_load(this_rq, 0, pending_updates);
	}
	raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from scheduler_tick()
 */
static void update_cpu_load_active(struct rq *this_rq)
{
	/*
	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
	 */
	this_rq->last_load_update_tick = jiffies;
	__update_cpu_load(this_rq, this_rq->load.weight, 1);

	calc_load_account_active(this_rq);
}

#ifdef CONFIG_SMP

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
	struct task_struct *p = current;
	unsigned long flags;
	int dest_cpu;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
	if (dest_cpu == smp_processor_id())
		goto unlock;

	if (likely(cpu_active(dest_cpu))) {
		struct migration_arg arg = { p, dest_cpu };

		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
		return;
	}
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 *
 * Called with task_rq_lock() held on @rq.
 */
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
	u64 ns = 0;

	if (task_current(rq, p)) {
		update_rq_clock(rq);
		ns = rq->clock_task - p->se.exec_start;
		if ((s64)ns < 0)
			ns = 0;
	}

	return ns;
}

unsigned long long task_delta_exec(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns = 0;

	rq = task_rq_lock(p, &flags);
	ns = do_task_delta_exec(p, rq);
	task_rq_unlock(rq, p, &flags);

	return ns;
}

/*
 * Return accumulated runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that have not been accounted yet.
 */
unsigned long long task_sched_runtime(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns = 0;

	rq = task_rq_lock(p, &flags);
	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
	task_rq_unlock(rq, p, &flags);

	return ns;
}

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	update_cpu_load_active(rq);
	curr->sched_class->task_tick(rq, curr, 0);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
	rq_last_tick_reset(rq);
}

#ifdef CONFIG_NO_HZ_FULL
/**
 * scheduler_tick_max_deferment
 *
 * Keep at least one tick per second when a single
 * active task is running because the scheduler doesn't
 * yet completely support full dynticks environment.
 *
 * This makes sure that uptime, CFS vruntime, load
 * balancing, etc... continue to move forward, even
 * with a very low granularity.
 */
u64 scheduler_tick_max_deferment(void)
{
	struct rq *rq = this_rq();
	unsigned long next, now = ACCESS_ONCE(jiffies);

	next = rq->last_sched_tick + HZ;

	if (time_before_eq(next, now))
		return 0;

	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
}
#endif /* CONFIG_NO_HZ_FULL */

notrace unsigned long get_parent_ip(unsigned long addr)
{
	if (in_lock_functions(addr)) {
		addr = CALLER_ADDR2;
		if (in_lock_functions(addr))
			addr = CALLER_ADDR3;
	}
	return addr;
}

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
				defined(CONFIG_PREEMPT_TRACER))

void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);

void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);

#endif

/*
 * Print scheduling while atomic bug:
 */
static noinline void __schedule_bug(struct task_struct *prev)
{
	if (oops_in_progress)
		return;

	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		prev->comm, prev->pid, preempt_count());

	debug_show_held_locks(prev);
	print_modules();
	if (irqs_disabled())
		print_irqtrace_events(prev);
	dump_stack();
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}

/*
 * Various schedule()-time debugging checks and statistics:
 */
static inline void schedule_debug(struct task_struct *prev)
{
	/*
	 * Test if we are atomic. Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
		__schedule_bug(prev);
	rcu_sleep_check();

	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq(), sched_count);
}

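/*
 * put_prev_task - hand the previously running task back to its class,
 * updating the rq clock first if the task is still queued or a clock
 * update was pending.
 */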
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
	if (prev->on_rq || rq->skip_clock_update < 0)
		update_rq_clock(rq);
	prev->sched_class->put_prev_task(rq, prev);
}

/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
		p = fair_sched_class.pick_next_task(rq);
		if (likely(p))
			return p;
	}

	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG(); /* the idle class will always have a runnable task */
}

/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	raw_spin_lock_irq(&rq->lock);

	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
			prev->on_rq = 0;

			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
		}
		switch_count = &prev->nvcsw;
	}

	pre_schedule(rq, prev);

	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	put_prev_task(rq, prev);
	next = pick_next_task(rq);
	clear_tsk_need_resched(prev);
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * The context switch have flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		raw_spin_unlock_irq(&rq->lock);

	post_schedule(rq);

	sched_preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}

asmlinkage void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
}
EXPORT_SYMBOL(schedule);

#ifdef CONFIG_CONTEXT_TRACKING
asmlinkage void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to schedule(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 */
	user_exit();
	schedule();
	user_enter();
}
#endif

/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */
void __sched schedule_preempt_disabled(void)
{
	sched_preempt_enable_no_resched();
	schedule();
	preempt_disable();
}
#ifdef CONFIG_PREEMPT
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage void __sched notrace preempt_schedule(void)
{
	struct thread_info *ti = current_thread_info();

	/*
	 * If there is a non-zero preempt_count or interrupts are disabled,
	 * we do not want to preempt the current task. Just return..
	 */
	if (likely(ti->preempt_count || irqs_disabled()))
		return;

	do {
		add_preempt_count_notrace(PREEMPT_ACTIVE);
		__schedule();
		sub_preempt_count_notrace(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);

/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
	struct thread_info *ti = current_thread_info();
	enum ctx_state prev_state;

	/* Catch callers which need to be fixed */
	BUG_ON(ti->preempt_count || !irqs_disabled());

	prev_state = exception_enter();

	do {
		add_preempt_count(PREEMPT_ACTIVE);
		local_irq_enable();
		__schedule();
		local_irq_disable();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());

	exception_exit(prev_state);
}

#endif /* CONFIG_PREEMPT */
3142
3143int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3144 void *key)
3145{
3146 return try_to_wake_up(curr->private, mode, wake_flags);
3147}
3148EXPORT_SYMBOL(default_wake_function);
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3160 int nr_exclusive, int wake_flags, void *key)
3161{
3162 wait_queue_t *curr, *next;
3163
3164 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3165 unsigned flags = curr->flags;
3166
3167 if (curr->func(curr, mode, wake_flags, key) &&
3168 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3169 break;
3170 }
3171}
3172
3173 /**
3174 * __wake_up - wake up threads blocked on a waitqueue.
3175 * @q: the waitqueue
3176 * @mode: which threads
3177 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3178 * @key: is directly passed to the wakeup function
3179 *
3180 * It may be assumed that this function implies a write memory barrier before
3181 * changing the task state if and only if any tasks are woken up.
3182 */
3183void __wake_up(wait_queue_head_t *q, unsigned int mode,
3184 int nr_exclusive, void *key)
3185{
3186 unsigned long flags;
3187
3188 spin_lock_irqsave(&q->lock, flags);
3189 __wake_up_common(q, mode, nr_exclusive, 0, key);
3190 spin_unlock_irqrestore(&q->lock, flags);
3191}
3192EXPORT_SYMBOL(__wake_up);
3193
3194 /*
3195 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3196 */
3197void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3198{
3199 __wake_up_common(q, mode, nr, 0, NULL);
3200}
3201EXPORT_SYMBOL_GPL(__wake_up_locked);
3202
3203void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3204{
3205 __wake_up_common(q, mode, 1, 0, key);
3206}
3207EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3208
3209 /**
3210 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3211 * @q: the waitqueue
3212 * @mode: which threads
3213 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3214 * @key: opaque value to be passed to wakeup targets
3215 *
3216 * The sync wakeup differs in that the waker knows that it will schedule
3217 * away soon, so while the target thread will be woken up, it will not
3218 * be migrated to another CPU - ie. the two threads are 'synchronized'
3219 * with each other. This can prevent needless bouncing between CPUs.
3220 *
3221 * On UP it can prevent extra preemption.
3222 *
3223 * It may be assumed that this function implies a write memory barrier before
3224 * changing the task state if and only if any tasks are woken up.
3225 */
3226void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3227 int nr_exclusive, void *key)
3228{
3229 unsigned long flags;
3230 int wake_flags = WF_SYNC;
3231
3232 if (unlikely(!q))
3233 return;
3234
3235 if (unlikely(!nr_exclusive))
3236 wake_flags = 0;
3237
3238 spin_lock_irqsave(&q->lock, flags);
3239 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3240 spin_unlock_irqrestore(&q->lock, flags);
3241}
3242EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3243
3244 /*
3245 * __wake_up_sync - see __wake_up_sync_key()
3246 */
3247void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3248{
3249 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3250}
3251EXPORT_SYMBOL_GPL(__wake_up_sync);
3252
3253 /**
3254 * complete: - signals a single thread waiting on this completion
3255 * @x:  holds the state of this particular completion
3256 *
3257 * This will wake up a single thread waiting on this completion. Threads will
3258 * be awakened in the same order in which they were queued.
3259 *
3260 * See also complete_all(), wait_for_completion() and related routines.
3261 *
3262 * It may be assumed that this function implies a write memory barrier before
3263 * changing the task state if and only if any tasks are woken up.
3264 */
3265void complete(struct completion *x)
3266{
3267 unsigned long flags;
3268
3269 spin_lock_irqsave(&x->wait.lock, flags);
3270 x->done++;
3271 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3272 spin_unlock_irqrestore(&x->wait.lock, flags);
3273}
3274EXPORT_SYMBOL(complete);
3275
3276 /**
3277 * complete_all: - signals all threads waiting on this completion
3278 * @x:  holds the state of this particular completion
3279 *
3280 * This will wake up all threads waiting on this particular completion event.
3281 *
3282 * It may be assumed that this function implies a write memory barrier before
3283 * changing the task state if and only if any tasks are woken up.
3284 */
3285void complete_all(struct completion *x)
3286{
3287 unsigned long flags;
3288
3289 spin_lock_irqsave(&x->wait.lock, flags);
3290 x->done += UINT_MAX/2;
3291 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3292 spin_unlock_irqrestore(&x->wait.lock, flags);
3293}
3294EXPORT_SYMBOL(complete_all);
3295
3296static inline long __sched
3297do_wait_for_common(struct completion *x,
3298 long (*action)(long), long timeout, int state)
3299{
3300 if (!x->done) {
3301 DECLARE_WAITQUEUE(wait, current);
3302
3303 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3304 do {
3305 if (signal_pending_state(state, current)) {
3306 timeout = -ERESTARTSYS;
3307 break;
3308 }
3309 __set_current_state(state);
3310 spin_unlock_irq(&x->wait.lock);
3311 timeout = action(timeout);
3312 spin_lock_irq(&x->wait.lock);
3313 } while (!x->done && timeout);
3314 __remove_wait_queue(&x->wait, &wait);
3315 if (!x->done)
3316 return timeout;
3317 }
3318 x->done--;
3319 return timeout ?: 1;
3320}
3321
3322static inline long __sched
3323__wait_for_common(struct completion *x,
3324 long (*action)(long), long timeout, int state)
3325{
3326 might_sleep();
3327
3328 spin_lock_irq(&x->wait.lock);
3329 timeout = do_wait_for_common(x, action, timeout, state);
3330 spin_unlock_irq(&x->wait.lock);
3331 return timeout;
3332}
3333
3334static long __sched
3335wait_for_common(struct completion *x, long timeout, int state)
3336{
3337 return __wait_for_common(x, schedule_timeout, timeout, state);
3338}
3339
3340static long __sched
3341wait_for_common_io(struct completion *x, long timeout, int state)
3342{
3343 return __wait_for_common(x, io_schedule_timeout, timeout, state);
3344}
3345
3346 /**
3347 * wait_for_completion: - waits for completion of a task
3348 * @x:  holds the state of this particular completion
3349 *
3350 * This waits to be signaled for completion of a specific task. It is NOT
3351 * interruptible and there is no timeout.
3352 *
3353 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
3354 * and interrupt capability. Also see complete().
3355 */
3356void __sched wait_for_completion(struct completion *x)
3357{
3358 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3359}
3360EXPORT_SYMBOL(wait_for_completion);
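
/*
 * Usage example (a sketch, not code from this file; 'my_done' is an
 * illustrative name):
 *
 *	DECLARE_COMPLETION_ONSTACK(my_done);
 *	...
 *	wait_for_completion(&my_done);		(waiter)
 *	complete(&my_done);			(signaler, elsewhere)
 */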
3361
3362 /**
3363 * wait_for_completion_timeout: - waits for completion of a task (w/ timeout)
3364 * @x:  holds the state of this particular completion
3365 * @timeout:  timeout value in jiffies
3366 *
3367 * This waits for either a completion of a specific task to be signaled or for
3368 * a specified timeout to expire. The timeout is in jiffies. It is not
3369 * interruptible.
3370 *
3371 * The return value is 0 if timed out, and positive (at least 1, or number of
3372 * jiffies left till timeout) if completed.
3373 */
3374unsigned long __sched
3375wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3376{
3377 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3378}
3379EXPORT_SYMBOL(wait_for_completion_timeout);
3380
3381 /**
3382 * wait_for_completion_io: - waits for completion of a task
3383 * @x:  holds the state of this particular completion
3384 *
3385 * This waits to be signaled for completion of a specific task. It is NOT
3386 * interruptible and there is no timeout. The caller is accounted as waiting
3387 * for IO.
3388 */
3389void __sched wait_for_completion_io(struct completion *x)
3390{
3391 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3392}
3393EXPORT_SYMBOL(wait_for_completion_io);
3394
3395 /**
3396 * wait_for_completion_io_timeout: - waits for completion of a task (w/ timeout)
3397 * @x:  holds the state of this particular completion
3398 * @timeout:  timeout value in jiffies
3399 *
3400 * This waits for either a completion of a specific task to be signaled or for
3401 * a specified timeout to expire. The timeout is in jiffies. It is not
3402 * interruptible. The caller is accounted as waiting for IO.
3403 *
3404 * The return value is 0 if timed out, and positive (at least 1, or number of
3405 * jiffies left till timeout) if completed.
3406 */
3407unsigned long __sched
3408wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
3409{
3410 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
3411}
3412EXPORT_SYMBOL(wait_for_completion_io_timeout);
3413
3414 /**
3415 * wait_for_completion_interruptible: - waits for completion of a task (w/ intr)
3416 * @x:  holds the state of this particular completion
3417 *
3418 * This waits for completion of a specific task to be signaled. It is
3419 * interruptible.
3420 *
3421 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3422 */
3423int __sched wait_for_completion_interruptible(struct completion *x)
3424{
3425 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3426 if (t == -ERESTARTSYS)
3427 return t;
3428 return 0;
3429}
3430EXPORT_SYMBOL(wait_for_completion_interruptible);
3431
3432 /**
3433 * wait_for_completion_interruptible_timeout: - waits for completion (w/ timeout & intr)
3434 * @x:  holds the state of this particular completion
3435 * @timeout:  timeout value in jiffies
3436 *
3437 * This waits for either a completion of a specific task to be signaled or for
3438 * a specified timeout to expire. It is interruptible. The timeout is in jiffies.
3439 *
3440 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, positive
3441 * (at least 1, or number of jiffies left till timeout) if completed.
3442 */
3443long __sched
3444wait_for_completion_interruptible_timeout(struct completion *x,
3445 unsigned long timeout)
3446{
3447 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3448}
3449EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3450
3451 /**
3452 * wait_for_completion_killable: - waits for completion of a task (killable)
3453 * @x:  holds the state of this particular completion
3454 *
3455 * This waits to be signaled for completion of a specific task. It can be
3456 * interrupted by a kill signal.
3457 *
3458 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3459 */
3460int __sched wait_for_completion_killable(struct completion *x)
3461{
3462 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3463 if (t == -ERESTARTSYS)
3464 return t;
3465 return 0;
3466}
3467EXPORT_SYMBOL(wait_for_completion_killable);
3468
3469 /**
3470 * wait_for_completion_killable_timeout: - waits for completion (w/ timeout & killable)
3471 * @x:  holds the state of this particular completion
3472 * @timeout:  timeout value in jiffies
3473 *
3474 * This waits for either a completion of a specific task to be signaled or for
3475 * a specified timeout to expire. It can be interrupted by a kill signal.
3476 * The timeout is in jiffies.
3477 *
3478 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, positive
3479 * (at least 1, or number of jiffies left till timeout) if completed.
3480 */
3481long __sched
3482wait_for_completion_killable_timeout(struct completion *x,
3483 unsigned long timeout)
3484{
3485 return wait_for_common(x, timeout, TASK_KILLABLE);
3486}
3487EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3488
3489 /**
3490 * try_wait_for_completion - try to decrement a completion without blocking
3491 * @x: completion structure
3492 *
3493 * Returns: 0 if a decrement cannot be done without blocking,
3494 *          1 if a decrement succeeded.
3495 *
3496 * If a completion is being used as a counting completion,
3497 * attempt to decrement the counter without blocking. This
3498 * enables us to avoid waiting if the resource the completion
3499 * is protecting is not available.
3500 */
3501bool try_wait_for_completion(struct completion *x)
3502{
3503 unsigned long flags;
3504 int ret = 1;
3505
3506 spin_lock_irqsave(&x->wait.lock, flags);
3507 if (!x->done)
3508 ret = 0;
3509 else
3510 x->done--;
3511 spin_unlock_irqrestore(&x->wait.lock, flags);
3512 return ret;
3513}
3514EXPORT_SYMBOL(try_wait_for_completion);
3515
3516 /**
3517 * completion_done - test to see if a completion has any waiters
3518 * @x: completion structure
3519 *
3520 * Returns: 0 if there are waiters (wait_for_completion() in progress),
3521 *          1 if there are no waiters.
3522 *
3523 */
3524bool completion_done(struct completion *x)
3525{
3526 unsigned long flags;
3527 int ret = 1;
3528
3529 spin_lock_irqsave(&x->wait.lock, flags);
3530 if (!x->done)
3531 ret = 0;
3532 spin_unlock_irqrestore(&x->wait.lock, flags);
3533 return ret;
3534}
3535EXPORT_SYMBOL(completion_done);
3536
3537static long __sched
3538sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3539{
3540 unsigned long flags;
3541 wait_queue_t wait;
3542
3543 init_waitqueue_entry(&wait, current);
3544
3545 __set_current_state(state);
3546
3547 spin_lock_irqsave(&q->lock, flags);
3548 __add_wait_queue(q, &wait);
3549 spin_unlock(&q->lock);
3550 timeout = schedule_timeout(timeout);
3551 spin_lock_irq(&q->lock);
3552 __remove_wait_queue(q, &wait);
3553 spin_unlock_irqrestore(&q->lock, flags);
3554
3555 return timeout;
3556}
3557
3558void __sched interruptible_sleep_on(wait_queue_head_t *q)
3559{
3560 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3561}
3562EXPORT_SYMBOL(interruptible_sleep_on);
3563
3564long __sched
3565interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3566{
3567 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3568}
3569EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3570
3571void __sched sleep_on(wait_queue_head_t *q)
3572{
3573 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3574}
3575EXPORT_SYMBOL(sleep_on);
3576
3577long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3578{
3579 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3580}
3581EXPORT_SYMBOL(sleep_on_timeout);
3582
3583#ifdef CONFIG_RT_MUTEXES
3584
3585 /*
3586 * rt_mutex_setprio - set the current priority of a task
3587 * @p: task
3588 * @prio: prio value (kernel-internal form)
3589 *
3590 * This function changes the 'effective' priority of a task. It does
3591 * not touch ->normal_prio like __setscheduler().
3592 *
3593 * Used by the rt_mutex code to implement boosting.
3594 */
3595void rt_mutex_setprio(struct task_struct *p, int prio)
3596{
3597 int oldprio, on_rq, running;
3598 struct rq *rq;
3599 const struct sched_class *prev_class;
3600
3601 BUG_ON(prio < 0 || prio > MAX_PRIO);
3602
3603 rq = __task_rq_lock(p);
3604
3605 /*
3606 * Idle task boosting is a nono in general. There is one
3607 * exception, when PREEMPT_RT and NOHZ is active:
3608 *
3609 * The idle task calls get_next_timer_interrupt() and holds
3610 * the timer wheel base->lock on the CPU and another CPU wants
3611 * to access the timer (probably to cancel it). We can safely
3612 * modify the timer state here if it is never accessed by
3613 * another CPU. If the timer is accessed by another CPU then
3614 * we cannot safely modify the timer state, and this is why we
3615 * have to wait for the timer to run on the remote CPU.
3616 */
3617 if (unlikely(p == rq->idle)) {
3618 WARN_ON(p != rq->curr);
3619 WARN_ON(p->pi_blocked_on);
3620 goto out_unlock;
3621 }
3622
3623 trace_sched_pi_setprio(p, prio);
3624 oldprio = p->prio;
3625 prev_class = p->sched_class;
3626 on_rq = p->on_rq;
3627 running = task_current(rq, p);
3628 if (on_rq)
3629 dequeue_task(rq, p, 0);
3630 if (running)
3631 p->sched_class->put_prev_task(rq, p);
3632
3633 if (rt_prio(prio))
3634 p->sched_class = &rt_sched_class;
3635 else
3636 p->sched_class = &fair_sched_class;
3637
3638 p->prio = prio;
3639
3640 if (running)
3641 p->sched_class->set_curr_task(rq);
3642 if (on_rq)
3643 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3644
3645 check_class_changed(rq, p, prev_class, oldprio);
3646out_unlock:
3647 __task_rq_unlock(rq);
3648}
3649#endif
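
/*
 * set_user_nice - set the nice value of a task and reschedule if necessary.
 * Tasks with an RT policy only get their static_prio updated; the new nice
 * value takes effect once the task drops back to SCHED_NORMAL/SCHED_BATCH.
 */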
3650void set_user_nice(struct task_struct *p, long nice)
3651{
3652 int old_prio, delta, on_rq;
3653 unsigned long flags;
3654 struct rq *rq;
3655
3656 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3657 return;
3658 /*
3659 * We have to be careful: if called from sys_setpriority(), the task
3660 * might be in the middle of scheduling on another CPU.
3661 */
3662 rq = task_rq_lock(p, &flags);
3663
3664 /*
3665 * The RT priorities are set via sched_setscheduler(), but we still
3666 * allow the 'normal' nice value to be set - but as expected it won't
3667 * have any effect on scheduling until the task becomes SCHED_FIFO/SCHED_RR:
3668 */
3669 if (task_has_rt_policy(p)) {
3670 p->static_prio = NICE_TO_PRIO(nice);
3671 goto out_unlock;
3672 }
3673 on_rq = p->on_rq;
3674 if (on_rq)
3675 dequeue_task(rq, p, 0);
3676
3677 p->static_prio = NICE_TO_PRIO(nice);
3678 set_load_weight(p);
3679 old_prio = p->prio;
3680 p->prio = effective_prio(p);
3681 delta = p->prio - old_prio;
3682
3683 if (on_rq) {
3684 enqueue_task(rq, p, 0);
3685 /*
3686 * If the task increased its priority or is running and
3687 * lowered its priority, then reschedule its CPU:
3688 */
3689 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3690 resched_task(rq->curr);
3691 }
3692out_unlock:
3693 task_rq_unlock(rq, p, &flags);
3694}
3695EXPORT_SYMBOL(set_user_nice);
3696
3697 /*
3698 * can_nice - check if a task can reduce its nice value
3699 * @p: task
3700 * @nice: nice value
3701 */
3702int can_nice(const struct task_struct *p, const int nice)
3703{
3704 /* Convert nice value [19,-20] to rlimit style value [1,40]: */
3705 int nice_rlim = 20 - nice;
3706
3707 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3708 capable(CAP_SYS_NICE));
3709}
3710
3711#ifdef __ARCH_WANT_SYS_NICE
3712
3713 /*
3714 * sys_nice - change the priority of the current process.
3715 * @increment: priority increment
3716 *
3717 * sys_setpriority is a more generic, but much slower function that
3718 * does similar things.
3719 */
3720SYSCALL_DEFINE1(nice, int, increment)
3721{
3722 long nice, retval;
3723
3724 /*
3725 * Setpriority might change our priority at the same moment.
3726 * We don't have to worry. Conceptually one call occurs first
3727 * and we have a single winner.
3728 */
3729 if (increment < -40)
3730 increment = -40;
3731 if (increment > 40)
3732 increment = 40;
3733
3734 nice = TASK_NICE(current) + increment;
3735 if (nice < -20)
3736 nice = -20;
3737 if (nice > 19)
3738 nice = 19;
3739
3740 if (increment < 0 && !can_nice(current, nice))
3741 return -EPERM;
3742
3743 retval = security_task_setnice(current, nice);
3744 if (retval)
3745 return retval;
3746
3747 set_user_nice(current, nice);
3748 return 0;
3749}
3750
3751#endif
3752
3753 /**
3754 * task_prio - return the priority value of a given task.
3755 * @p: the task in question.
3756 *
3757 * This is the priority value as seen by users in /proc.
3758 * RT tasks are offset by -200. Normal tasks are centered
3759 * around 0, value goes from -16 to +15.
3760 */
3761int task_prio(const struct task_struct *p)
3762{
3763 return p->prio - MAX_RT_PRIO;
3764}
3765
3766 /**
3767 * task_nice - return the nice value of a given task.
3768 * @p: the task in question.
3769 */
3770int task_nice(const struct task_struct *p)
3771{
3772 return TASK_NICE(p);
3773}
3774EXPORT_SYMBOL(task_nice);
3775
3776 /**
3777 * idle_cpu - is a given cpu idle currently?
3778 * @cpu: the processor in question.
3779 */
3780int idle_cpu(int cpu)
3781{
3782 struct rq *rq = cpu_rq(cpu);
3783
3784 if (rq->curr != rq->idle)
3785 return 0;
3786
3787 if (rq->nr_running)
3788 return 0;
3789
3790#ifdef CONFIG_SMP
3791 if (!llist_empty(&rq->wake_list))
3792 return 0;
3793#endif
3794
3795 return 1;
3796}
3797
3798 /**
3799 * idle_task - return the idle task for a given cpu.
3800 * @cpu: the processor in question.
3801 */
3802struct task_struct *idle_task(int cpu)
3803{
3804 return cpu_rq(cpu)->idle;
3805}
3806
3807 /**
3808 * find_process_by_pid - find a process with a matching PID value.
3809 * @pid: the pid in question. Must be called under rcu_read_lock().
3810 */
3811static struct task_struct *find_process_by_pid(pid_t pid)
3812{
3813 return pid ? find_task_by_vpid(pid) : current;
3814}
3815
3816 /* Actually do priority change: must hold rq lock. */
3817static void
3818__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3819{
3820 p->policy = policy;
3821 p->rt_priority = prio;
3822 p->normal_prio = normal_prio(p);
3823
3824 p->prio = rt_mutex_getprio(p);
3825 if (rt_prio(p->prio))
3826 p->sched_class = &rt_sched_class;
3827 else
3828 p->sched_class = &fair_sched_class;
3829 set_load_weight(p);
3830}
3831
3832 /*
3833 * Check the target process has a UID that matches the current process's:
3834 */
3835static bool check_same_owner(struct task_struct *p)
3836{
3837 const struct cred *cred = current_cred(), *pcred;
3838 bool match;
3839
3840 rcu_read_lock();
3841 pcred = __task_cred(p);
3842 match = (uid_eq(cred->euid, pcred->euid) ||
3843 uid_eq(cred->euid, pcred->uid));
3844 rcu_read_unlock();
3845 return match;
3846}
3847
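/*
 * __sched_setscheduler - change the policy and/or RT priority of a thread.
 * @user says whether this is a user-space request, which must pass the
 * permission and security checks below, or a kernel-internal one, which
 * skips them.
 */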
3848static int __sched_setscheduler(struct task_struct *p, int policy,
3849 const struct sched_param *param, bool user)
3850{
3851 int retval, oldprio, oldpolicy = -1, on_rq, running;
3852 unsigned long flags;
3853 const struct sched_class *prev_class;
3854 struct rq *rq;
3855 int reset_on_fork;
3856
3857
3858 BUG_ON(in_interrupt());
3859recheck:
3860
3861 if (policy < 0) {
3862 reset_on_fork = p->sched_reset_on_fork;
3863 policy = oldpolicy = p->policy;
3864 } else {
3865 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3866 policy &= ~SCHED_RESET_ON_FORK;
3867
3868 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3869 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3870 policy != SCHED_IDLE)
3871 return -EINVAL;
3872 }
3873
3874 /*
3875 * Valid priorities for SCHED_FIFO and SCHED_RR are
3876 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3877 * SCHED_BATCH and SCHED_IDLE is 0.
3878 */
3879 if (param->sched_priority < 0 ||
3880 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3881 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3882 return -EINVAL;
3883 if (rt_policy(policy) != (param->sched_priority != 0))
3884 return -EINVAL;
3885
3886 /*
3887 * Allow unprivileged RT tasks to decrease priority:
3888 */
3889 if (user && !capable(CAP_SYS_NICE)) {
3890 if (rt_policy(policy)) {
3891 unsigned long rlim_rtprio =
3892 task_rlimit(p, RLIMIT_RTPRIO);
3893
3894
3895 if (policy != p->policy && !rlim_rtprio)
3896 return -EPERM;
3897
3898
3899 if (param->sched_priority > p->rt_priority &&
3900 param->sched_priority > rlim_rtprio)
3901 return -EPERM;
3902 }
3903
3904 /*
3905 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3906 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3907 */
3908 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3909 if (!can_nice(p, TASK_NICE(p)))
3910 return -EPERM;
3911 }
3912
3913
3914 if (!check_same_owner(p))
3915 return -EPERM;
3916
3917
3918 if (p->sched_reset_on_fork && !reset_on_fork)
3919 return -EPERM;
3920 }
3921
3922 if (user) {
3923 retval = security_task_setscheduler(p);
3924 if (retval)
3925 return retval;
3926 }
3927
3928 /*
3929 * Make sure no PI-waiters arrive (or leave) while we are
3930 * changing the priority of the task:
3931 *
3932 * To be able to change p->policy safely, the appropriate
3933 * runqueue lock must be held.
3934 */
3935 rq = task_rq_lock(p, &flags);
3936
3937 /*
3938 * Changing the policy of the stop threads is a very bad idea.
3939 */
3940 if (p == rq->stop) {
3941 task_rq_unlock(rq, p, &flags);
3942 return -EINVAL;
3943 }
3944
3945 /*
3946 * If not changing anything there's no need to proceed further:
3947 */
3948 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3949 param->sched_priority == p->rt_priority))) {
3950 task_rq_unlock(rq, p, &flags);
3951 return 0;
3952 }
3953
3954#ifdef CONFIG_RT_GROUP_SCHED
3955 if (user) {
3956 /*
3957 * Do not allow realtime tasks into groups that have no runtime
3958 * assigned.
3959 */
3960 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3961 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3962 !task_group_is_autogroup(task_group(p))) {
3963 task_rq_unlock(rq, p, &flags);
3964 return -EPERM;
3965 }
3966 }
3967#endif
3968
3969
3970 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3971 policy = oldpolicy = -1;
3972 task_rq_unlock(rq, p, &flags);
3973 goto recheck;
3974 }
3975 on_rq = p->on_rq;
3976 running = task_current(rq, p);
3977 if (on_rq)
3978 dequeue_task(rq, p, 0);
3979 if (running)
3980 p->sched_class->put_prev_task(rq, p);
3981
3982 p->sched_reset_on_fork = reset_on_fork;
3983
3984 oldprio = p->prio;
3985 prev_class = p->sched_class;
3986 __setscheduler(rq, p, policy, param->sched_priority);
3987
3988 if (running)
3989 p->sched_class->set_curr_task(rq);
3990 if (on_rq)
3991 enqueue_task(rq, p, 0);
3992
3993 check_class_changed(rq, p, prev_class, oldprio);
3994 task_rq_unlock(rq, p, &flags);
3995
3996 rt_mutex_adjust_pi(p);
3997
3998 return 0;
3999}
4000
4001 /**
4002 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4003 * @p: the task in question.
4004 * @policy: new policy.
4005 * @param: structure containing the new RT priority.
4006 *
4007 * NOTE that the task may be already dead.
4008 */
4009int sched_setscheduler(struct task_struct *p, int policy,
4010 const struct sched_param *param)
4011{
4012 return __sched_setscheduler(p, policy, param, true);
4013}
4014EXPORT_SYMBOL_GPL(sched_setscheduler);
4015
4016 /**
4017 * sched_setscheduler_nocheck - change policy and/or RT priority from kernelspace.
4018 * @p: the task in question.
4019 * @policy: new policy.
4020 * @param: structure containing the new RT priority.
4021 *
4022 * Just like sched_setscheduler, only don't bother checking if the
4023 * current context has permission. For example, this is needed in
4024 * stop_machine(): we create temporary high priority worker threads,
4025 * but our caller might not have that capability.
4026 */
4027int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4028 const struct sched_param *param)
4029{
4030 return __sched_setscheduler(p, policy, param, false);
4031}
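
/*
 * Usage example (a sketch, not code from this file; 'ktask' is an
 * illustrative kernel thread pointer):
 *
 *	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };
 *	sched_setscheduler_nocheck(ktask, SCHED_FIFO, &sp);
 */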
4032
4033static int
4034do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4035{
4036 struct sched_param lparam;
4037 struct task_struct *p;
4038 int retval;
4039
4040 if (!param || pid < 0)
4041 return -EINVAL;
4042 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4043 return -EFAULT;
4044
4045 rcu_read_lock();
4046 retval = -ESRCH;
4047 p = find_process_by_pid(pid);
4048 if (p != NULL)
4049 retval = sched_setscheduler(p, policy, &lparam);
4050 rcu_read_unlock();
4051
4052 return retval;
4053}
4054
4055 /**
4056 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4057 * @pid: the pid in question.
4058 * @policy: new policy.
4059 * @param: structure containing the new RT priority.
4060 */
4061SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4062 struct sched_param __user *, param)
4063{
4064
4065 if (policy < 0)
4066 return -EINVAL;
4067
4068 return do_sched_setscheduler(pid, policy, param);
4069}
4070
4071 /**
4072 * sys_sched_setparam - set/change the RT priority of a thread
4073 * @pid: the pid in question.
4074 * @param: structure containing the new RT priority.
4075 */
4076SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4077{
4078 return do_sched_setscheduler(pid, -1, param);
4079}
4080
4081 /**
4082 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4083 * @pid: the pid in question.
4084 */
4085SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4086{
4087 struct task_struct *p;
4088 int retval;
4089
4090 if (pid < 0)
4091 return -EINVAL;
4092
4093 retval = -ESRCH;
4094 rcu_read_lock();
4095 p = find_process_by_pid(pid);
4096 if (p) {
4097 retval = security_task_getscheduler(p);
4098 if (!retval)
4099 retval = p->policy
4100 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4101 }
4102 rcu_read_unlock();
4103 return retval;
4104}
4105
4106 /**
4107 * sys_sched_getparam - get the RT priority of a thread
4108 * @pid: the pid in question.
4109 * @param: structure containing the RT priority.
4110 */
4111SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4112{
4113 struct sched_param lp;
4114 struct task_struct *p;
4115 int retval;
4116
4117 if (!param || pid < 0)
4118 return -EINVAL;
4119
4120 rcu_read_lock();
4121 p = find_process_by_pid(pid);
4122 retval = -ESRCH;
4123 if (!p)
4124 goto out_unlock;
4125
4126 retval = security_task_getscheduler(p);
4127 if (retval)
4128 goto out_unlock;
4129
4130 lp.sched_priority = p->rt_priority;
4131 rcu_read_unlock();
4132
4133 /*
4134 * This one might sleep, we cannot do it with a spinlock held ...
4135 */
4136 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4137
4138 return retval;
4139
4140out_unlock:
4141 rcu_read_unlock();
4142 return retval;
4143}
4144
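/*
 * sched_setaffinity - set the cpu mask of the task identified by @pid
 * (0 means current). The new mask is intersected with the task's cpuset
 * and re-validated against concurrent cpuset changes.
 */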
4145long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4146{
4147 cpumask_var_t cpus_allowed, new_mask;
4148 struct task_struct *p;
4149 int retval;
4150
4151 get_online_cpus();
4152 rcu_read_lock();
4153
4154 p = find_process_by_pid(pid);
4155 if (!p) {
4156 rcu_read_unlock();
4157 put_online_cpus();
4158 return -ESRCH;
4159 }
4160
4161 /* Prevent p from going away: */
4162 get_task_struct(p);
4163 rcu_read_unlock();
4164
4165 if (p->flags & PF_NO_SETAFFINITY) {
4166 retval = -EINVAL;
4167 goto out_put_task;
4168 }
4169 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4170 retval = -ENOMEM;
4171 goto out_put_task;
4172 }
4173 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4174 retval = -ENOMEM;
4175 goto out_free_cpus_allowed;
4176 }
4177 retval = -EPERM;
4178 if (!check_same_owner(p)) {
4179 rcu_read_lock();
4180 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4181 rcu_read_unlock();
4182 goto out_unlock;
4183 }
4184 rcu_read_unlock();
4185 }
4186
4187 retval = security_task_setscheduler(p);
4188 if (retval)
4189 goto out_unlock;
4190
4191 cpuset_cpus_allowed(p, cpus_allowed);
4192 cpumask_and(new_mask, in_mask, cpus_allowed);
4193again:
4194 retval = set_cpus_allowed_ptr(p, new_mask);
4195
4196 if (!retval) {
4197 cpuset_cpus_allowed(p, cpus_allowed);
4198 if (!cpumask_subset(new_mask, cpus_allowed)) {
4199 /*
4200 * We must have raced with a concurrent cpuset
4201 * update. Just reset the cpus_allowed to the
4202 * cpuset's cpus_allowed.
4203 */
4204 cpumask_copy(new_mask, cpus_allowed);
4205 goto again;
4206 }
4207 }
4208out_unlock:
4209 free_cpumask_var(new_mask);
4210out_free_cpus_allowed:
4211 free_cpumask_var(cpus_allowed);
4212out_put_task:
4213 put_task_struct(p);
4214 put_online_cpus();
4215 return retval;
4216}
4217
4218static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4219 struct cpumask *new_mask)
4220{
4221 if (len < cpumask_size())
4222 cpumask_clear(new_mask);
4223 else if (len > cpumask_size())
4224 len = cpumask_size();
4225
4226 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4227}
4228
4229 /**
4230 * sys_sched_setaffinity - set the cpu affinity of a process
4231 * @pid: pid of the process
4232 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4233 * @user_mask_ptr: user-space pointer to the new cpu mask
4234 */
4235SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4236 unsigned long __user *, user_mask_ptr)
4237{
4238 cpumask_var_t new_mask;
4239 int retval;
4240
4241 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4242 return -ENOMEM;
4243
4244 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4245 if (retval == 0)
4246 retval = sched_setaffinity(pid, new_mask);
4247 free_cpumask_var(new_mask);
4248 return retval;
4249}
4250
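/*
 * sched_getaffinity - read the cpu mask of the task identified by @pid
 * (0 means current), masked by the currently online cpus.
 */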
4251long sched_getaffinity(pid_t pid, struct cpumask *mask)
4252{
4253 struct task_struct *p;
4254 unsigned long flags;
4255 int retval;
4256
4257 get_online_cpus();
4258 rcu_read_lock();
4259
4260 retval = -ESRCH;
4261 p = find_process_by_pid(pid);
4262 if (!p)
4263 goto out_unlock;
4264
4265 retval = security_task_getscheduler(p);
4266 if (retval)
4267 goto out_unlock;
4268
4269 raw_spin_lock_irqsave(&p->pi_lock, flags);
4270 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4271 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4272
4273out_unlock:
4274 rcu_read_unlock();
4275 put_online_cpus();
4276
4277 return retval;
4278}
4279
4280 /**
4281 * sys_sched_getaffinity - get the cpu affinity of a process
4282 * @pid: pid of the process
4283 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4284 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4285 */
4286SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4287 unsigned long __user *, user_mask_ptr)
4288{
4289 int ret;
4290 cpumask_var_t mask;
4291
4292 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4293 return -EINVAL;
4294 if (len & (sizeof(unsigned long)-1))
4295 return -EINVAL;
4296
4297 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4298 return -ENOMEM;
4299
4300 ret = sched_getaffinity(pid, mask);
4301 if (ret == 0) {
4302 size_t retlen = min_t(size_t, len, cpumask_size());
4303
4304 if (copy_to_user(user_mask_ptr, mask, retlen))
4305 ret = -EFAULT;
4306 else
4307 ret = retlen;
4308 }
4309 free_cpumask_var(mask);
4310
4311 return ret;
4312}
4313
4314 /**
4315 * sys_sched_yield - yield the current processor to other threads.
4316 *
4317 * This function yields the current CPU to other tasks. If there are no
4318 * other threads running on this CPU then this function will return.
4319 */
4320SYSCALL_DEFINE0(sched_yield)
4321{
4322 struct rq *rq = this_rq_lock();
4323
4324 schedstat_inc(rq, yld_count);
4325 current->sched_class->yield_task(rq);
4326
4327 /*
4328 * Since we are going to call schedule() anyway, there's
4329 * no need to preempt or enable interrupts:
4330 */
4331 __release(rq->lock);
4332 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4333 do_raw_spin_unlock(&rq->lock);
4334 sched_preempt_enable_no_resched();
4335
4336 schedule();
4337
4338 return 0;
4339}
4340
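/*
 * A reschedule is due only if TIF_NEED_RESCHED is set and we are not already
 * mid-preemption (PREEMPT_ACTIVE). __cond_resched() wraps __schedule() in
 * PREEMPT_ACTIVE so that a non-TASK_RUNNING caller is not deactivated by the
 * dequeue path in __schedule().
 */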
4341static inline int should_resched(void)
4342{
4343 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4344}
4345
4346static void __cond_resched(void)
4347{
4348 add_preempt_count(PREEMPT_ACTIVE);
4349 __schedule();
4350 sub_preempt_count(PREEMPT_ACTIVE);
4351}
4352
4353int __sched _cond_resched(void)
4354{
4355 if (should_resched()) {
4356 __cond_resched();
4357 return 1;
4358 }
4359 return 0;
4360}
4361EXPORT_SYMBOL(_cond_resched);
4362
4363 /*
4364 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4365 * call schedule, and on return reacquire the lock.
4366 *
4367 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4368 * operations here to prevent schedule() from being called twice (once via
4369 * spin_unlock(), once by hand).
4370 */
4371int __cond_resched_lock(spinlock_t *lock)
4372{
4373 int resched = should_resched();
4374 int ret = 0;
4375
4376 lockdep_assert_held(lock);
4377
4378 if (spin_needbreak(lock) || resched) {
4379 spin_unlock(lock);
4380 if (resched)
4381 __cond_resched();
4382 else
4383 cpu_relax();
4384 ret = 1;
4385 spin_lock(lock);
4386 }
4387 return ret;
4388}
4389EXPORT_SYMBOL(__cond_resched_lock);
4390
4391int __sched __cond_resched_softirq(void)
4392{
4393 BUG_ON(!in_softirq());
4394
4395 if (should_resched()) {
4396 local_bh_enable();
4397 __cond_resched();
4398 local_bh_disable();
4399 return 1;
4400 }
4401 return 0;
4402}
4403EXPORT_SYMBOL(__cond_resched_softirq);
4404
4405 /**
4406 * yield - yield the current processor to other threads.
4407 *
4408 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4409 *
4410 * The scheduler is at all times free to pick the calling task as the most
4411 * eligible task to run; if removing the yield() call from your code breaks
4412 * it, it's already broken.
4413 *
4414 * Typical broken usage is:
4415 *
4416 *	while (!event)
4417 *		yield();
4418 *
4419 * where one assumes that yield() will let 'the other' process run and
4420 * make event true. If the current task is a SCHED_FIFO task that will never
4421 * happen. Never use yield() as a progress guarantee!!
4422 *
4423 * If you want to use yield() to wait for something, use wait_event().
4424 * If you want to use yield() to be 'nice' for others, use cond_resched().
4425 * If you want to use yield() to exit early, use break.
4426 */
4427void __sched yield(void)
4428{
4429 set_current_state(TASK_RUNNING);
4430 sys_sched_yield();
4431}
4432EXPORT_SYMBOL(yield);
4433
4434 /**
4435 * yield_to - yield the current processor to another thread in
4436 * your thread group, or accelerate that thread toward the
4437 * processor it's on.
4438 * @p: target task
4439 * @preempt: whether task preemption is allowed or not
4440 *
4441 * It's the caller's job to ensure that the target task struct
4442 * can't go away on us before we can do any checks.
4443 *
4444 * Returns:
4445 *	true (>0) if we indeed boosted the target task.
4446 *	false (0) if we failed to boost the target.
4447 *	-ESRCH if there's no task to yield to.
4448 */
4449bool __sched yield_to(struct task_struct *p, bool preempt)
4450{
4451 struct task_struct *curr = current;
4452 struct rq *rq, *p_rq;
4453 unsigned long flags;
4454 int yielded = 0;
4455
4456 local_irq_save(flags);
4457 rq = this_rq();
4458
4459again:
4460 p_rq = task_rq(p);
4461
4462
4463 /* If we're the only runnable task on both runqueues, yielding is pointless. */
4465 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4466 yielded = -ESRCH;
4467 goto out_irq;
4468 }
4469
4470 double_rq_lock(rq, p_rq);
4471 while (task_rq(p) != p_rq) {
4472 double_rq_unlock(rq, p_rq);
4473 goto again;
4474 }
4475
4476 if (!curr->sched_class->yield_to_task)
4477 goto out_unlock;
4478
4479 if (curr->sched_class != p->sched_class)
4480 goto out_unlock;
4481
4482 if (task_running(p_rq, p) || p->state)
4483 goto out_unlock;
4484
4485 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4486 if (yielded) {
4487 schedstat_inc(rq, yld_count);
4488 /*
4489 * Make p's CPU reschedule; pick_next_entity takes care of
4490 * fairness.
4491 */
4492 if (preempt && rq != p_rq)
4493 resched_task(p_rq->curr);
4494 }
4495
4496out_unlock:
4497 double_rq_unlock(rq, p_rq);
4498out_irq:
4499 local_irq_restore(flags);
4500
4501 if (yielded > 0)
4502 schedule();
4503
4504 return yielded;
4505}
4506EXPORT_SYMBOL_GPL(yield_to);
4507
4508 /*
4509 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4510 * that process accounting knows that this is a task in IO wait state.
4511 */
4512void __sched io_schedule(void)
4513{
4514 struct rq *rq = raw_rq();
4515
4516 delayacct_blkio_start();
4517 atomic_inc(&rq->nr_iowait);
4518 blk_flush_plug(current);
4519 current->in_iowait = 1;
4520 schedule();
4521 current->in_iowait = 0;
4522 atomic_dec(&rq->nr_iowait);
4523 delayacct_blkio_end();
4524}
4525EXPORT_SYMBOL(io_schedule);
4526
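/* Like io_schedule(), but bounded by @timeout; returns the remaining jiffies. */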
4527long __sched io_schedule_timeout(long timeout)
4528{
4529 struct rq *rq = raw_rq();
4530 long ret;
4531
4532 delayacct_blkio_start();
4533 atomic_inc(&rq->nr_iowait);
4534 blk_flush_plug(current);
4535 current->in_iowait = 1;
4536 ret = schedule_timeout(timeout);
4537 current->in_iowait = 0;
4538 atomic_dec(&rq->nr_iowait);
4539 delayacct_blkio_end();
4540 return ret;
4541}
4542
4543 /**
4544 * sys_sched_get_priority_max - return maximum RT priority.
4545 * @policy: scheduling class.
4546 *
4547 * This syscall returns the maximum rt_priority that can be used
4548 * by a given scheduling class.
4549 */
4550SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4551{
4552 int ret = -EINVAL;
4553
4554 switch (policy) {
4555 case SCHED_FIFO:
4556 case SCHED_RR:
4557 ret = MAX_USER_RT_PRIO-1;
4558 break;
4559 case SCHED_NORMAL:
4560 case SCHED_BATCH:
4561 case SCHED_IDLE:
4562 ret = 0;
4563 break;
4564 }
4565 return ret;
4566}
4567
4568 /**
4569 * sys_sched_get_priority_min - return minimum RT priority.
4570 * @policy: scheduling class.
4571 *
4572 * This syscall returns the minimum rt_priority that can be used
4573 * by a given scheduling class.
4574 */
4575SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4576{
4577 int ret = -EINVAL;
4578
4579 switch (policy) {
4580 case SCHED_FIFO:
4581 case SCHED_RR:
4582 ret = 1;
4583 break;
4584 case SCHED_NORMAL:
4585 case SCHED_BATCH:
4586 case SCHED_IDLE:
4587 ret = 0;
4588 }
4589 return ret;
4590}
4591
4592 /**
4593 * sys_sched_rr_get_interval - return the default timeslice of a process.
4594 * @pid: pid of the process.
4595 * @interval: userspace pointer to the timeslice value.
4596 *
4597 * This syscall writes the default timeslice value of a given process
4598 * into the user-space timespec buffer. A value of '0' means infinity.
4599 */
4600SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4601 struct timespec __user *, interval)
4602{
4603 struct task_struct *p;
4604 unsigned int time_slice;
4605 unsigned long flags;
4606 struct rq *rq;
4607 int retval;
4608 struct timespec t;
4609
4610 if (pid < 0)
4611 return -EINVAL;
4612
4613 retval = -ESRCH;
4614 rcu_read_lock();
4615 p = find_process_by_pid(pid);
4616 if (!p)
4617 goto out_unlock;
4618
4619 retval = security_task_getscheduler(p);
4620 if (retval)
4621 goto out_unlock;
4622
4623 rq = task_rq_lock(p, &flags);
4624 time_slice = p->sched_class->get_rr_interval(rq, p);
4625 task_rq_unlock(rq, p, &flags);
4626
4627 rcu_read_unlock();
4628 jiffies_to_timespec(time_slice, &t);
4629 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4630 return retval;
4631
4632out_unlock:
4633 rcu_read_unlock();
4634 return retval;
4635}
4636
4637static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4638
4639void sched_show_task(struct task_struct *p)
4640{
4641 unsigned long free = 0;
4642 int ppid;
4643 unsigned state;
4644
4645 state = p->state ? __ffs(p->state) + 1 : 0;
4646 printk(KERN_INFO "%-15.15s %c", p->comm,
4647 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4648#if BITS_PER_LONG == 32
4649 if (state == TASK_RUNNING)
4650 printk(KERN_CONT " running ");
4651 else
4652 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4653#else
4654 if (state == TASK_RUNNING)
4655 printk(KERN_CONT " running task ");
4656 else
4657 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4658#endif
4659#ifdef CONFIG_DEBUG_STACK_USAGE
4660 free = stack_not_used(p);
4661#endif
4662 rcu_read_lock();
4663 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4664 rcu_read_unlock();
4665 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4666 task_pid_nr(p), ppid,
4667 (unsigned long)task_thread_info(p)->flags);
4668
4669 print_worker_info(KERN_INFO, p);
4670 show_stack(p, NULL);
4671}
4672
4673void show_state_filter(unsigned long state_filter)
4674{
4675 struct task_struct *g, *p;
4676
4677#if BITS_PER_LONG == 32
4678 printk(KERN_INFO
4679 " task PC stack pid father\n");
4680#else
4681 printk(KERN_INFO
4682 " task PC stack pid father\n");
4683#endif
4684 rcu_read_lock();
4685 do_each_thread(g, p) {
4686 /*
4687 * Reset the NMI-timeout; listing all files on a slow
4688 * console might take a lot of time:
4689 */
4690 touch_nmi_watchdog();
4691 if (!state_filter || (p->state & state_filter))
4692 sched_show_task(p);
4693 } while_each_thread(g, p);
4694
4695 touch_all_softlockup_watchdogs();
4696
4697#ifdef CONFIG_SCHED_DEBUG
4698 sysrq_sched_debug_show();
4699#endif
4700 rcu_read_unlock();
4701
4702
4703 /* Only show locks if all tasks are dumped: */
4704 if (!state_filter)
4705 debug_show_all_locks();
4706}
4707
4708void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4709{
4710 idle->sched_class = &idle_sched_class;
4711}
4712
4713 /**
4714 * init_idle - set up an idle thread for a given CPU
4715 * @idle: task in question
4716 * @cpu: cpu the idle task belongs to
4717 *
4718 * NOTE: this function does not set the idle thread's NEED_RESCHED
4719 * flag, to make booting more robust.
4720 */
4721void __cpuinit init_idle(struct task_struct *idle, int cpu)
4722{
4723 struct rq *rq = cpu_rq(cpu);
4724 unsigned long flags;
4725
4726 raw_spin_lock_irqsave(&rq->lock, flags);
4727
4728 __sched_fork(idle);
4729 idle->state = TASK_RUNNING;
4730 idle->se.exec_start = sched_clock();
4731
4732 do_set_cpus_allowed(idle, cpumask_of(cpu));
4733
4734 /*
4735 * We're having a chicken and egg problem: even though we are
4736 * holding rq->lock, the cpu isn't yet set to this cpu so the
4737 * lockdep check in task_group() will fail.
4738 *
4739 * Similar case to sched_fork(); alternatively we could use
4740 * task_rq_lock() here and obtain the other rq->lock.
4741 * The rcu_read_lock() below silences PROVE_RCU.
4742 */
4743 rcu_read_lock();
4744 __set_task_cpu(idle, cpu);
4745 rcu_read_unlock();
4746
4747 rq->curr = rq->idle = idle;
4748#if defined(CONFIG_SMP)
4749 idle->on_cpu = 1;
4750#endif
4751 raw_spin_unlock_irqrestore(&rq->lock, flags);
4752
4753 /* Set the preempt count _outside_ the spinlocks! */
4754 task_thread_info(idle)->preempt_count = 0;
4755
4756 /*
4757 * The idle tasks have their own, simple scheduling class:
4758 */
4759 idle->sched_class = &idle_sched_class;
4760 ftrace_graph_init_idle_task(idle, cpu);
4761 vtime_init_idle(idle, cpu);
4762#if defined(CONFIG_SMP)
4763 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4764#endif
4765}
4766
4767#ifdef CONFIG_SMP
4768void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4769{
4770 if (p->sched_class && p->sched_class->set_cpus_allowed)
4771 p->sched_class->set_cpus_allowed(p, new_mask);
4772
4773 cpumask_copy(&p->cpus_allowed, new_mask);
4774 p->nr_cpus_allowed = cpumask_weight(new_mask);
4775}
4776
4777 /*
4778 * This is how migration works:
4779 *
4780 * 1) we invoke migration_cpu_stop() on the target CPU using
4781 *    stop_one_cpu().
4782 * 2) stopper starts to run (implicitly forcing the migrated thread
4783 *    off the CPU)
4784 * 3) it checks whether the migrated task is still in the wrong runqueue.
4785 * 4) if it's in the wrong runqueue then the migration thread removes
4786 *    it and puts it into the right queue.
4787 * 5) stopper completes and stop_one_cpu() returns and the migration
4788 *    is done.
4789 */
4790
4791 /*
4792 * Change a given task's CPU affinity. Migrate the thread to a
4793 * proper CPU and schedule it away if the CPU it's executing on
4794 * is removed from the allowed bitmask.
4795 *
4796 * NOTE: the caller must have a valid reference to the task, the
4797 * task must not exit() & deallocate itself prematurely. The
4798 * call is not atomic; no spinlocks may be held.
4799 */
4800int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4801{
4802 unsigned long flags;
4803 struct rq *rq;
4804 unsigned int dest_cpu;
4805 int ret = 0;
4806
4807 rq = task_rq_lock(p, &flags);
4808
4809 if (cpumask_equal(&p->cpus_allowed, new_mask))
4810 goto out;
4811
4812 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4813 ret = -EINVAL;
4814 goto out;
4815 }
4816
4817 do_set_cpus_allowed(p, new_mask);
4818
4819 /* Can the task run on the task's current CPU? If so, we're done. */
4820 if (cpumask_test_cpu(task_cpu(p), new_mask))
4821 goto out;
4822
4823 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4824 if (p->on_rq) {
4825 struct migration_arg arg = { p, dest_cpu };
4826
4827 task_rq_unlock(rq, p, &flags);
4828 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4829 tlb_migrate_finish(p->mm);
4830 return 0;
4831 }
4832out:
4833 task_rq_unlock(rq, p, &flags);
4834
4835 return ret;
4836}
4837EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4838
4839 /*
4840 * Move (not current) task off this cpu, onto the destination cpu. We're doing
4841 * this because either it can't run here any more (set_cpus_allowed()
4842 * away from this CPU, or CPU going down), or because we're
4843 * attempting to rebalance this task on exec (sched_exec).
4844 *
4845 * So we race with normal scheduler movements, but that's OK, as long
4846 * as the task is no longer on this CPU.
4847 *
4848 * Returns non-zero if task was successfully migrated.
4849 */
4850static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4851{
4852 struct rq *rq_dest, *rq_src;
4853 int ret = 0;
4854
4855 if (unlikely(!cpu_active(dest_cpu)))
4856 return ret;
4857
4858 rq_src = cpu_rq(src_cpu);
4859 rq_dest = cpu_rq(dest_cpu);
4860
4861 raw_spin_lock(&p->pi_lock);
4862 double_rq_lock(rq_src, rq_dest);
4863
4864 if (task_cpu(p) != src_cpu)
4865 goto done;
4866
4867 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4868 goto fail;
4869
4870 /*
4871 * If we're not on a rq, the next wake-up will ensure we're
4872 * placed properly.
4873 */
4874 if (p->on_rq) {
4875 dequeue_task(rq_src, p, 0);
4876 set_task_cpu(p, dest_cpu);
4877 enqueue_task(rq_dest, p, 0);
4878 check_preempt_curr(rq_dest, p, 0);
4879 }
4880done:
4881 ret = 1;
4882fail:
4883 double_rq_unlock(rq_src, rq_dest);
4884 raw_spin_unlock(&p->pi_lock);
4885 return ret;
4886}
4887
4888 /*
4889 * migration_cpu_stop - this will be executed by a highprio stopper thread
4890 * and performs thread migration by bumping thread off CPU then
4891 * 'pushing' onto another runqueue.
4892 */
4893static int migration_cpu_stop(void *data)
4894{
4895 struct migration_arg *arg = data;
4896
4897 /*
4898 * The original target cpu might have gone down and we might
4899 * be on another cpu but it doesn't matter.
4900 */
4901 local_irq_disable();
4902 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4903 local_irq_enable();
4904 return 0;
4905}
4906
4907#ifdef CONFIG_HOTPLUG_CPU
4908
4909 /*
4910 * Ensure that the idle task is using init_mm right before its cpu goes
4911 * offline.
4912 */
4913void idle_task_exit(void)
4914{
4915 struct mm_struct *mm = current->active_mm;
4916
4917 BUG_ON(cpu_online(smp_processor_id()));
4918
4919 if (mm != &init_mm)
4920 switch_mm(mm, &init_mm, current);
4921 mmdrop(mm);
4922}
4923
4924 /*
4925 * Since this CPU is going 'away' for a while, fold any nr_active delta
4926 * we might have. Assumes we're called after migrate_tasks() so that the
4927 * nr_active count is stable.
4928 *
4929 * Also see the comment "Global load-average calculations".
4930 */
4931static void calc_load_migrate(struct rq *rq)
4932{
4933 long delta = calc_load_fold_active(rq);
4934 if (delta)
4935 atomic_long_add(delta, &calc_load_tasks);
4936}
4937
4938 /*
4939 * Migrate all tasks from the rq; sleeping tasks will be migrated by
4940 * try_to_wake_up()->select_task_rq().
4941 *
4942 * Called with rq->lock held even though we're in stop_machine() and
4943 * there's no concurrency possible; we hold the required locks anyway
4944 * because of lock validation efforts.
4945 */
4946static void migrate_tasks(unsigned int dead_cpu)
4947{
4948 struct rq *rq = cpu_rq(dead_cpu);
4949 struct task_struct *next, *stop = rq->stop;
4950 int dest_cpu;
4951
4952 /*
4953 * Fudge the rq selection such that the below task selection loop
4954 * doesn't get stuck on the currently eligible stop task.
4955 *
4956 * We're currently inside stop_machine() and the rq is either stuck
4957 * in the stop_machine_cpu_stop() loop, or we're executing this code;
4958 * either way we should never end up calling schedule() until we're
4959 * done here.
4960 */
4961 rq->stop = NULL;
4962
4963 for ( ; ; ) {
4964 /*
4965 * There's this thread running; bail when that's the only
4966 * remaining thread.
4967 */
4968 if (rq->nr_running == 1)
4969 break;
4970
4971 next = pick_next_task(rq);
4972 BUG_ON(!next);
4973 next->sched_class->put_prev_task(rq, next);
4974
4975 /* Find suitable destination for @next, with force if needed: */
4976 dest_cpu = select_fallback_rq(dead_cpu, next);
4977 raw_spin_unlock(&rq->lock);
4978
4979 __migrate_task(next, dead_cpu, dest_cpu);
4980
4981 raw_spin_lock(&rq->lock);
4982 }
4983
4984 rq->stop = stop;
4985}
4986
4987#endif
4988
4989#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4990
4991static struct ctl_table sd_ctl_dir[] = {
4992 {
4993 .procname = "sched_domain",
4994 .mode = 0555,
4995 },
4996 {}
4997};
4998
4999static struct ctl_table sd_ctl_root[] = {
5000 {
5001 .procname = "kernel",
5002 .mode = 0555,
5003 .child = sd_ctl_dir,
5004 },
5005 {}
5006};
5007
5008static struct ctl_table *sd_alloc_ctl_entry(int n)
5009{
5010 struct ctl_table *entry =
5011 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5012
5013 return entry;
5014}
5015
5016static void sd_free_ctl_entry(struct ctl_table **tablep)
5017{
5018 struct ctl_table *entry;
5019
5020 /*
5021 * In the intermediate directories, both the child directory and
5022 * procname are dynamically allocated and could fail but the mode
5023 * will always be set. In the lowest directory the names are
5024 * static strings and all have proc handlers.
5025 */
5026 for (entry = *tablep; entry->mode; entry++) {
5027 if (entry->child)
5028 sd_free_ctl_entry(&entry->child);
5029 if (entry->proc_handler == NULL)
5030 kfree(entry->procname);
5031 }
5032
5033 kfree(*tablep);
5034 *tablep = NULL;
5035}
5036
5037static int min_load_idx = 0;
5038static int max_load_idx = CPU_LOAD_IDX_MAX-1;
5039
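/*
 * Fill in one ctl_table entry; load-index entries additionally get
 * [min_load_idx, max_load_idx] as their valid range.
 */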
5040static void
5041set_table_entry(struct ctl_table *entry,
5042 const char *procname, void *data, int maxlen,
5043 umode_t mode, proc_handler *proc_handler,
5044 bool load_idx)
5045{
5046 entry->procname = procname;
5047 entry->data = data;
5048 entry->maxlen = maxlen;
5049 entry->mode = mode;
5050 entry->proc_handler = proc_handler;
5051
5052 if (load_idx) {
5053 entry->extra1 = &min_load_idx;
5054 entry->extra2 = &max_load_idx;
5055 }
5056}
5057
5058static struct ctl_table *
5059sd_alloc_ctl_domain_table(struct sched_domain *sd)
5060{
5061 struct ctl_table *table = sd_alloc_ctl_entry(13);
5062
5063 if (table == NULL)
5064 return NULL;
5065
5066 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5067 sizeof(long), 0644, proc_doulongvec_minmax, false);
5068 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5069 sizeof(long), 0644, proc_doulongvec_minmax, false);
5070 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5071 sizeof(int), 0644, proc_dointvec_minmax, true);
5072 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5073 sizeof(int), 0644, proc_dointvec_minmax, true);
5074 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5075 sizeof(int), 0644, proc_dointvec_minmax, true);
5076 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5077 sizeof(int), 0644, proc_dointvec_minmax, true);
5078 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5079 sizeof(int), 0644, proc_dointvec_minmax, true);
5080 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5081 sizeof(int), 0644, proc_dointvec_minmax, false);
5082 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5083 sizeof(int), 0644, proc_dointvec_minmax, false);
5084 set_table_entry(&table[9], "cache_nice_tries",
5085 &sd->cache_nice_tries,
5086 sizeof(int), 0644, proc_dointvec_minmax, false);
5087 set_table_entry(&table[10], "flags", &sd->flags,
5088 sizeof(int), 0644, proc_dointvec_minmax, false);
5089 set_table_entry(&table[11], "name", sd->name,
5090 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5091 /* &table[12] is terminator */
5092
5093 return table;
5094}
5095
5096static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5097{
5098 struct ctl_table *entry, *table;
5099 struct sched_domain *sd;
5100 int domain_num = 0, i;
5101 char buf[32];
5102
5103 for_each_domain(cpu, sd)
5104 domain_num++;
5105 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5106 if (table == NULL)
5107 return NULL;
5108
5109 i = 0;
5110 for_each_domain(cpu, sd) {
5111 snprintf(buf, 32, "domain%d", i);
5112 entry->procname = kstrdup(buf, GFP_KERNEL);
5113 entry->mode = 0555;
5114 entry->child = sd_alloc_ctl_domain_table(sd);
5115 entry++;
5116 i++;
5117 }
5118 return table;
5119}
5120
5121static struct ctl_table_header *sd_sysctl_header;
5122static void register_sched_domain_sysctl(void)
5123{
5124 int i, cpu_num = num_possible_cpus();
5125 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5126 char buf[32];
5127
5128 WARN_ON(sd_ctl_dir[0].child);
5129 sd_ctl_dir[0].child = entry;
5130
5131 if (entry == NULL)
5132 return;
5133
5134 for_each_possible_cpu(i) {
5135 snprintf(buf, 32, "cpu%d", i);
5136 entry->procname = kstrdup(buf, GFP_KERNEL);
5137 entry->mode = 0555;
5138 entry->child = sd_alloc_ctl_cpu_table(i);
5139 entry++;
5140 }
5141
5142 WARN_ON(sd_sysctl_header);
5143 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5144}
5145
5146
5147static void unregister_sched_domain_sysctl(void)
5148{
5149 if (sd_sysctl_header)
5150 unregister_sysctl_table(sd_sysctl_header);
5151 sd_sysctl_header = NULL;
5152 if (sd_ctl_dir[0].child)
5153 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5154}
5155#else
5156static void register_sched_domain_sysctl(void)
5157{
5158}
5159static void unregister_sched_domain_sysctl(void)
5160{
5161}
5162#endif
5163
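/*
 * Mark a runqueue online/offline in its root domain and notify each
 * scheduling class via its rq_online/rq_offline callbacks.
 */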
5164static void set_rq_online(struct rq *rq)
5165{
5166 if (!rq->online) {
5167 const struct sched_class *class;
5168
5169 cpumask_set_cpu(rq->cpu, rq->rd->online);
5170 rq->online = 1;
5171
5172 for_each_class(class) {
5173 if (class->rq_online)
5174 class->rq_online(rq);
5175 }
5176 }
5177}
5178
5179static void set_rq_offline(struct rq *rq)
5180{
5181 if (rq->online) {
5182 const struct sched_class *class;
5183
5184 for_each_class(class) {
5185 if (class->rq_offline)
5186 class->rq_offline(rq);
5187 }
5188
5189 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5190 rq->online = 0;
5191 }
5192}
5193
5194 /*
5195 * migration_call - callback that gets triggered when a CPU is added.
5196 * Here we can start up the necessary migration thread for the new CPU.
5197 */
5198static int __cpuinit
5199migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5200{
5201 int cpu = (long)hcpu;
5202 unsigned long flags;
5203 struct rq *rq = cpu_rq(cpu);
5204
5205 switch (action & ~CPU_TASKS_FROZEN) {
5206
5207 case CPU_UP_PREPARE:
5208 rq->calc_load_update = calc_load_update;
5209 break;
5210
5211 case CPU_ONLINE:
5212
5213 raw_spin_lock_irqsave(&rq->lock, flags);
5214 if (rq->rd) {
5215 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5216
5217 set_rq_online(rq);
5218 }
5219 raw_spin_unlock_irqrestore(&rq->lock, flags);
5220 break;
5221
5222#ifdef CONFIG_HOTPLUG_CPU
5223 case CPU_DYING:
5224 sched_ttwu_pending();
5225
5226 raw_spin_lock_irqsave(&rq->lock, flags);
5227 if (rq->rd) {
5228 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5229 set_rq_offline(rq);
5230 }
5231 migrate_tasks(cpu);
5232 BUG_ON(rq->nr_running != 1);
5233 raw_spin_unlock_irqrestore(&rq->lock, flags);
5234 break;
5235
5236 case CPU_DEAD:
5237 calc_load_migrate(rq);
5238 break;
5239#endif
5240 }
5241
5242 update_max_interval();
5243
5244 return NOTIFY_OK;
5245}
5246
5247 /*
5248 * Register at high priority so that task migration (migrate_all_tasks)
5249 * happens before everything else. This has to be lower priority than
5250 * the notifier in the perf_event subsystem, though.
5251 */
5252static struct notifier_block __cpuinitdata migration_notifier = {
5253 .notifier_call = migration_call,
5254 .priority = CPU_PRI_MIGRATION,
5255};
5256
5257static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5258 unsigned long action, void *hcpu)
5259{
5260 switch (action & ~CPU_TASKS_FROZEN) {
5261 case CPU_STARTING:
5262 case CPU_DOWN_FAILED:
5263 set_cpu_active((long)hcpu, true);
5264 return NOTIFY_OK;
5265 default:
5266 return NOTIFY_DONE;
5267 }
5268}
5269
5270static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5271 unsigned long action, void *hcpu)
5272{
5273 switch (action & ~CPU_TASKS_FROZEN) {
5274 case CPU_DOWN_PREPARE:
5275 set_cpu_active((long)hcpu, false);
5276 return NOTIFY_OK;
5277 default:
5278 return NOTIFY_DONE;
5279 }
5280}
5281
5282static int __init migration_init(void)
5283{
5284 void *cpu = (void *)(long)smp_processor_id();
5285 int err;
5286
5287
5288 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5289 BUG_ON(err == NOTIFY_BAD);
5290 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5291 register_cpu_notifier(&migration_notifier);
5292
5293
5294 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5295 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5296
5297 return 0;
5298}
5299early_initcall(migration_init);
5300#endif
5301
5302#ifdef CONFIG_SMP
5303
5304static cpumask_var_t sched_domains_tmpmask;
5305
5306#ifdef CONFIG_SCHED_DEBUG
5307
5308static __read_mostly int sched_debug_enabled;
5309
5310static int __init sched_debug_setup(char *str)
5311{
5312 sched_debug_enabled = 1;
5313
5314 return 0;
5315}
5316early_param("sched_debug", sched_debug_setup);
5317
5318static inline bool sched_debug(void)
5319{
5320 return sched_debug_enabled;
5321}
5322
5323static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5324 struct cpumask *groupmask)
5325{
5326 struct sched_group *group = sd->groups;
5327 char str[256];
5328
5329 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5330 cpumask_clear(groupmask);
5331
5332 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5333
5334 if (!(sd->flags & SD_LOAD_BALANCE)) {
5335 printk("does not load-balance\n");
5336 if (sd->parent)
5337 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5338 " has parent");
5339 return -1;
5340 }
5341
5342 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5343
5344 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5345 printk(KERN_ERR "ERROR: domain->span does not contain "
5346 "CPU%d\n", cpu);
5347 }
5348 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5349 printk(KERN_ERR "ERROR: domain->groups does not contain"
5350 " CPU%d\n", cpu);
5351 }
5352
5353 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5354 do {
5355 if (!group) {
5356 printk("\n");
5357 printk(KERN_ERR "ERROR: group is NULL\n");
5358 break;
5359 }
5360
5361 /*
5362 * Even though we initialize ->power to something semi-sane,
5363 * we leave power_orig unset. This allows us to detect if
5364 * domain iteration is still funny without causing /0 traps.
5365 */
5366 if (!group->sgp->power_orig) {
5367 printk(KERN_CONT "\n");
5368 printk(KERN_ERR "ERROR: domain->cpu_power not "
5369 "set\n");
5370 break;
5371 }
5372
5373 if (!cpumask_weight(sched_group_cpus(group))) {
5374 printk(KERN_CONT "\n");
5375 printk(KERN_ERR "ERROR: empty group\n");
5376 break;
5377 }
5378
5379 if (!(sd->flags & SD_OVERLAP) &&
5380 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5381 printk(KERN_CONT "\n");
5382 printk(KERN_ERR "ERROR: repeated CPUs\n");
5383 break;
5384 }
5385
5386 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5387
5388 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5389
5390 printk(KERN_CONT " %s", str);
5391 if (group->sgp->power != SCHED_POWER_SCALE) {
5392 printk(KERN_CONT " (cpu_power = %d)",
5393 group->sgp->power);
5394 }
5395
5396 group = group->next;
5397 } while (group != sd->groups);
5398 printk(KERN_CONT "\n");
5399
5400 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5401 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5402
5403 if (sd->parent &&
5404 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5405 printk(KERN_ERR "ERROR: parent span is not a superset "
5406 "of domain->span\n");
5407 return 0;
5408}
5409
5410static void sched_domain_debug(struct sched_domain *sd, int cpu)
5411{
5412 int level = 0;
5413
5414 if (!sched_debug_enabled)
5415 return;
5416
5417 if (!sd) {
5418 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5419 return;
5420 }
5421
5422 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5423
5424 for (;;) {
5425 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5426 break;
5427 level++;
5428 sd = sd->parent;
5429 if (!sd)
5430 break;
5431 }
5432}
5433#else
5434# define sched_domain_debug(sd, cpu) do { } while (0)
5435static inline bool sched_debug(void)
5436{
5437 return false;
5438}
5439#endif
5440
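/*
 * A sched domain is 'degenerate' (and can be destroyed) if it spans a single
 * cpu, or if it has at most one group and no flag (such as SD_WAKE_AFFINE)
 * that is useful on its own.
 */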
5441static int sd_degenerate(struct sched_domain *sd)
5442{
5443 if (cpumask_weight(sched_domain_span(sd)) == 1)
5444 return 1;
5445
5446 /* Following flags need at least 2 groups: */
5447 if (sd->flags & (SD_LOAD_BALANCE |
5448 SD_BALANCE_NEWIDLE |
5449 SD_BALANCE_FORK |
5450 SD_BALANCE_EXEC |
5451 SD_SHARE_CPUPOWER |
5452 SD_SHARE_PKG_RESOURCES)) {
5453 if (sd->groups != sd->groups->next)
5454 return 0;
5455 }
5456
5457 /* Following flags don't use groups: */
5458 if (sd->flags & (SD_WAKE_AFFINE))
5459 return 0;
5460
5461 return 1;
5462}
5463
5464static int
5465sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5466{
5467 unsigned long cflags = sd->flags, pflags = parent->flags;
5468
5469 if (sd_degenerate(parent))
5470 return 1;
5471
5472 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5473 return 0;
5474
5475 /* Flags needing groups don't count if only 1 group in parent: */
5476 if (parent->groups == parent->groups->next) {
5477 pflags &= ~(SD_LOAD_BALANCE |
5478 SD_BALANCE_NEWIDLE |
5479 SD_BALANCE_FORK |
5480 SD_BALANCE_EXEC |
5481 SD_SHARE_CPUPOWER |
5482 SD_SHARE_PKG_RESOURCES);
5483 if (nr_node_ids == 1)
5484 pflags &= ~SD_SERIALIZE;
5485 }
5486 if (~cflags & pflags)
5487 return 0;
5488
5489 return 1;
5490}
5491
5492static void free_rootdomain(struct rcu_head *rcu)
5493{
5494 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5495
5496 cpupri_cleanup(&rd->cpupri);
5497 free_cpumask_var(rd->rto_mask);
5498 free_cpumask_var(rd->online);
5499 free_cpumask_var(rd->span);
5500 kfree(rd);
5501}
5502
5503static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5504{
5505 struct root_domain *old_rd = NULL;
5506 unsigned long flags;
5507
5508 raw_spin_lock_irqsave(&rq->lock, flags);
5509
5510 if (rq->rd) {
5511 old_rd = rq->rd;
5512
5513 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5514 set_rq_offline(rq);
5515
5516 cpumask_clear_cpu(rq->cpu, old_rd->span);
5517
5518 /*
5519 * If we don't want to free the old_rd yet then
5520 * set old_rd to NULL to skip the freeing later
5521 * in this function:
5522 */
5523 if (!atomic_dec_and_test(&old_rd->refcount))
5524 old_rd = NULL;
5525 }
5526
5527 atomic_inc(&rd->refcount);
5528 rq->rd = rd;
5529
5530 cpumask_set_cpu(rq->cpu, rd->span);
5531 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5532 set_rq_online(rq);
5533
5534 raw_spin_unlock_irqrestore(&rq->lock, flags);
5535
5536 if (old_rd)
5537 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5538}
5539
5540static int init_rootdomain(struct root_domain *rd)
5541{
5542 memset(rd, 0, sizeof(*rd));
5543
5544 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5545 goto out;
5546 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5547 goto free_span;
5548 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5549 goto free_online;
5550
5551 if (cpupri_init(&rd->cpupri) != 0)
5552 goto free_rto_mask;
5553 return 0;
5554
5555free_rto_mask:
5556 free_cpumask_var(rd->rto_mask);
5557free_online:
5558 free_cpumask_var(rd->online);
5559free_span:
5560 free_cpumask_var(rd->span);
5561out:
5562 return -ENOMEM;
5563}
5564
5565 /*
5566 * By default the system creates a single root-domain with all cpus as
5567 * members (mimicking the global state we have today).
5568 */
5569struct root_domain def_root_domain;
5570
5571static void init_defrootdomain(void)
5572{
5573 init_rootdomain(&def_root_domain);
5574
5575 atomic_set(&def_root_domain.refcount, 1);
5576}
5577
5578static struct root_domain *alloc_rootdomain(void)
5579{
5580 struct root_domain *rd;
5581
5582 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5583 if (!rd)
5584 return NULL;
5585
5586 if (init_rootdomain(rd) != 0) {
5587 kfree(rd);
5588 return NULL;
5589 }
5590
5591 return rd;
5592}
5593
5594static void free_sched_groups(struct sched_group *sg, int free_sgp)
5595{
5596 struct sched_group *tmp, *first;
5597
5598 if (!sg)
5599 return;
5600
5601 first = sg;
5602 do {
5603 tmp = sg->next;
5604
5605 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5606 kfree(sg->sgp);
5607
5608 kfree(sg);
5609 sg = tmp;
5610 } while (sg != first);
5611}
5612
5613static void free_sched_domain(struct rcu_head *rcu)
5614{
5615 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5616
5617 /*
5618 * If it's an overlapping domain it has private groups, iterate and
5619 * nuke them all.
5620 */
5621 if (sd->flags & SD_OVERLAP) {
5622 free_sched_groups(sd->groups, 1);
5623 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5624 kfree(sd->groups->sgp);
5625 kfree(sd->groups);
5626 }
5627 kfree(sd);
5628}
5629
5630static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5631{
5632 call_rcu(&sd->rcu, free_sched_domain);
5633}
5634
5635static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5636{
5637 for (; sd; sd = sd->parent)
5638 destroy_sched_domain(sd, cpu);
5639}
5640
5641 /*
5642 * Keep a special pointer to the highest sched_domain that has
5643 * SD_SHARE_PKG_RESOURCES set (the Last Level Cache domain); this
5644 * allows us to avoid some pointer chasing in select_idle_sibling().
5645 *
5646 * Also keep a unique ID per domain (we use the first cpu number in
5647 * the cpumask of the domain); this allows us to quickly tell if
5648 * two cpus are in the same cache domain, see cpus_share_cache().
5649 */
5650DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5651DEFINE_PER_CPU(int, sd_llc_id);
5652
5653static void update_top_cache_domain(int cpu)
5654{
5655 struct sched_domain *sd;
5656 int id = cpu;
5657
5658 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5659 if (sd)
5660 id = cpumask_first(sched_domain_span(sd));
5661
5662 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5663 per_cpu(sd_llc_id, cpu) = id;
5664}
5665
5666 /*
5667 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5668 * hold the hotplug lock.
5669 */
5670static void
5671cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5672{
5673 struct rq *rq = cpu_rq(cpu);
5674 struct sched_domain *tmp;
5675
5676 /* Remove the sched domains which do not contribute to scheduling: */
5677 for (tmp = sd; tmp; ) {
5678 struct sched_domain *parent = tmp->parent;
5679 if (!parent)
5680 break;
5681
5682 if (sd_parent_degenerate(tmp, parent)) {
5683 tmp->parent = parent->parent;
5684 if (parent->parent)
5685 parent->parent->child = tmp;
5686 destroy_sched_domain(parent, cpu);
5687 } else
5688 tmp = tmp->parent;
5689 }
5690
5691 if (sd && sd_degenerate(sd)) {
5692 tmp = sd;
5693 sd = sd->parent;
5694 destroy_sched_domain(tmp, cpu);
5695 if (sd)
5696 sd->child = NULL;
5697 }
5698
5699 sched_domain_debug(sd, cpu);
5700
5701 rq_attach_root(rq, rd);
5702 tmp = rq->sd;
5703 rcu_assign_pointer(rq->sd, sd);
5704 destroy_sched_domains(tmp, cpu);
5705
5706 update_top_cache_domain(cpu);
5707}
5708
5709 /* cpus with isolated domains */
5710static cpumask_var_t cpu_isolated_map;
5711
5712 /* Setup the mask of cpus configured for isolated domains */
5713static int __init isolated_cpu_setup(char *str)
5714{
5715 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5716 cpulist_parse(str, cpu_isolated_map);
5717 return 1;
5718}
5719
5720__setup("isolcpus=", isolated_cpu_setup);
5721
5722static const struct cpumask *cpu_cpu_mask(int cpu)
5723{
5724 return cpumask_of_node(cpu_to_node(cpu));
5725}
5726
5727struct sd_data {
5728 struct sched_domain **__percpu sd;
5729 struct sched_group **__percpu sg;
5730 struct sched_group_power **__percpu sgp;
5731};
5732
5733struct s_data {
5734 struct sched_domain ** __percpu sd;
5735 struct root_domain *rd;
5736};
5737
5738enum s_alloc {
5739 sa_rootdomain,
5740 sa_sd,
5741 sa_sd_storage,
5742 sa_none,
5743};
5744
5745struct sched_domain_topology_level;
5746
5747typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5748typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5749
5750#define SDTL_OVERLAP 0x01
5751
5752struct sched_domain_topology_level {
5753 sched_domain_init_f init;
5754 sched_domain_mask_f mask;
5755 int flags;
5756 int numa_level;
5757 struct sd_data data;
5758};
5759
5760 /*
5761 * Build an iteration mask that can exclude certain CPUs from the upwards
5762 * domain traversal.
5763 *
5764 * Asymmetric node setups can result in situations where the domain tree is of
5765 * unequal depth; make sure to skip domains that already cover the entire
5766 * range.
5767 *
5768 * In that case build_sched_domains() will have terminated the iteration early
5769 * and our sibling sd spans will be empty. Domains should always include the
5770 * cpu they're built on, so check that.
5771 */
5772
5773static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5774{
5775 const struct cpumask *span = sched_domain_span(sd);
5776 struct sd_data *sdd = sd->private;
5777 struct sched_domain *sibling;
5778 int i;
5779
5780 for_each_cpu(i, span) {
5781 sibling = *per_cpu_ptr(sdd->sd, i);
5782 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5783 continue;
5784
5785 cpumask_set_cpu(i, sched_group_mask(sg));
5786 }
5787}
5788
5789 /*
5790 * Return the canonical balance cpu for this group: the first cpu
5791 * of this group that's also in the iteration mask.
5792 */
5793int group_balance_cpu(struct sched_group *sg)
5794{
5795 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5796}
5797
5798static int
5799build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5800{
5801 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5802 const struct cpumask *span = sched_domain_span(sd);
5803 struct cpumask *covered = sched_domains_tmpmask;
5804 struct sd_data *sdd = sd->private;
5805 struct sched_domain *child;
5806 int i;
5807
5808 cpumask_clear(covered);
5809
5810 for_each_cpu(i, span) {
5811 struct cpumask *sg_span;
5812
5813 if (cpumask_test_cpu(i, covered))
5814 continue;
5815
5816 child = *per_cpu_ptr(sdd->sd, i);
5817
5818 /* See the comment near build_group_mask(). */
5819 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5820 continue;
5821
5822 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5823 GFP_KERNEL, cpu_to_node(cpu));
5824
5825 if (!sg)
5826 goto fail;
5827
5828 sg_span = sched_group_cpus(sg);
5829 if (child->child) {
5830 child = child->child;
5831 cpumask_copy(sg_span, sched_domain_span(child));
5832 } else
5833 cpumask_set_cpu(i, sg_span);
5834
5835 cpumask_or(covered, covered, sg_span);
5836
5837 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5838 if (atomic_inc_return(&sg->sgp->ref) == 1)
5839 build_group_mask(sd, sg);
5840
5841
5842 /*
5843 * Initialize sgp->power to a sane default so the group is usable
5844 * even before update_group_power() has run for it.
5845 */
5846 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5847
5848 /*
5849 * Make sure the first group of this domain contains the
5850 * canonical balance cpu. Otherwise the sched_domain iteration
5851 * breaks. See update_sg_lb_stats().
5852 */
5853 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5854 group_balance_cpu(sg) == cpu)
5855 groups = sg;
5856
5857 if (!first)
5858 first = sg;
5859 if (last)
5860 last->next = sg;
5861 last = sg;
5862 last->next = first;
5863 }
5864 sd->groups = groups;
5865
5866 return 0;
5867
5868fail:
5869 free_sched_groups(first, 0);
5870
5871 return -ENOMEM;
5872}
5873
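/*
 * Return the representative cpu for @cpu's sched group (the first cpu of the
 * child domain's span) and, when @sg is non-NULL, the group and its
 * sched_group_power for that cpu.
 */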
5874static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5875{
5876 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5877 struct sched_domain *child = sd->child;
5878
5879 if (child)
5880 cpu = cpumask_first(sched_domain_span(child));
5881
5882 if (sg) {
5883 *sg = *per_cpu_ptr(sdd->sg, cpu);
5884 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5885 atomic_set(&(*sg)->sgp->ref, 1);
5886 }
5887
5888 return cpu;
5889}
5890
5891 /*
5892 * build_sched_groups will build a circular linked list of the groups
5893 * covered by the given span, and will set each group's ->cpumask correctly,
5894 * and ->cpu_power to 0.
5895 *
5896 * Assumes the sched_domain tree is fully constructed.
5897 */
5898static int
5899build_sched_groups(struct sched_domain *sd, int cpu)
5900{
5901 struct sched_group *first = NULL, *last = NULL;
5902 struct sd_data *sdd = sd->private;
5903 const struct cpumask *span = sched_domain_span(sd);
5904 struct cpumask *covered;
5905 int i;
5906
5907 get_group(cpu, sdd, &sd->groups);
5908 atomic_inc(&sd->groups->ref);
5909
5910 if (cpu != cpumask_first(sched_domain_span(sd)))
5911 return 0;
5912
5913 lockdep_assert_held(&sched_domains_mutex);
5914 covered = sched_domains_tmpmask;
5915
5916 cpumask_clear(covered);
5917
5918 for_each_cpu(i, span) {
5919 struct sched_group *sg;
5920 int group = get_group(i, sdd, &sg);
5921 int j;
5922
5923 if (cpumask_test_cpu(i, covered))
5924 continue;
5925
5926 cpumask_clear(sched_group_cpus(sg));
5927 sg->sgp->power = 0;
5928 cpumask_setall(sched_group_mask(sg));
5929
5930 for_each_cpu(j, span) {
5931 if (get_group(j, sdd, NULL) != group)
5932 continue;
5933
5934 cpumask_set_cpu(j, covered);
5935 cpumask_set_cpu(j, sched_group_cpus(sg));
5936 }
5937
5938 if (!first)
5939 first = sg;
5940 if (last)
5941 last->next = sg;
5942 last = sg;
5943 }
5944 last->next = first;
5945
5946 return 0;
5947}
5948
5949 /*
5950 * Initialize sched groups cpu_power.
5951 *
5952 * cpu_power indicates the capacity of a sched group, which is used while
5953 * distributing the load between different sched groups in a sched domain.
5954 * Typically cpu_power for all the groups in a sched domain will be same
5955 * unless there are asymmetries in the topology. If there are asymmetries,
5956 * the group having more cpu_power will pick up more load compared to the
5957 * group having less cpu_power.
5958 */
5959static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5960{
5961 struct sched_group *sg = sd->groups;
5962
5963 WARN_ON(!sd || !sg);
5964
5965 do {
5966 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5967 sg = sg->next;
5968 } while (sg != sd->groups);
5969
5970 if (cpu != group_balance_cpu(sg))
5971 return;
5972
5973 update_group_power(sd, cpu);
5974 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5975}
5976
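/*
 * Architectures may override this weak hook to request asymmetric SMT
 * packing, which makes the load balancer prefer low-numbered siblings.
 * As a rough sketch of such an override (powerpc does something similar
 * for POWER7):
 *
 *	int arch_sd_sibling_asym_packing(void)
 *	{
 *		if (cpu_has_feature(CPU_FTR_ASYM_SMT))
 *			return SD_ASYM_PACKING;
 *		return 0;
 *	}
 */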
5977int __weak arch_sd_sibling_asym_packing(void)
5978{
5979 return 0*SD_ASYM_PACKING;
5980}
5981
/*
 * Initializers for sched domains.  Non-inlined to reduce accumulated
 * stack pressure in build_sched_domains().
 */
5987#ifdef CONFIG_SCHED_DEBUG
5988# define SD_INIT_NAME(sd, type) sd->name = #type
5989#else
5990# define SD_INIT_NAME(sd, type) do { } while (0)
5991#endif
5992
5993#define SD_INIT_FUNC(type) \
5994static noinline struct sched_domain * \
5995sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5996{ \
5997 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5998 *sd = SD_##type##_INIT; \
5999 SD_INIT_NAME(sd, type); \
6000 sd->private = &tl->data; \
6001 return sd; \
6002}
6003
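/*
 * For instance, SD_INIT_FUNC(CPU) below expands to:
 *
 *	static noinline struct sched_domain *
 *	sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
 *	{
 *		struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
 *		*sd = SD_CPU_INIT;
 *		SD_INIT_NAME(sd, CPU);
 *		sd->private = &tl->data;
 *		return sd;
 *	}
 */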
6004SD_INIT_FUNC(CPU)
6005#ifdef CONFIG_SCHED_SMT
6006 SD_INIT_FUNC(SIBLING)
6007#endif
6008#ifdef CONFIG_SCHED_MC
6009 SD_INIT_FUNC(MC)
6010#endif
6011#ifdef CONFIG_SCHED_BOOK
6012 SD_INIT_FUNC(BOOK)
6013#endif
6014
6015static int default_relax_domain_level = -1;
6016int sched_domain_level_max;
6017
6018static int __init setup_relax_domain_level(char *str)
6019{
6020 if (kstrtoint(str, 0, &default_relax_domain_level))
6021 pr_warn("Unable to set relax_domain_level\n");
6022
6023 return 1;
6024}
6025__setup("relax_domain_level=", setup_relax_domain_level);
6026
6027static void set_domain_attribute(struct sched_domain *sd,
6028 struct sched_domain_attr *attr)
6029{
6030 int request;
6031
6032 if (!attr || attr->relax_domain_level < 0) {
6033 if (default_relax_domain_level < 0)
6034 return;
6035 else
6036 request = default_relax_domain_level;
6037 } else
6038 request = attr->relax_domain_level;
6039 if (request < sd->level) {
		/* turn off idle balance on this domain */
6041 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6042 } else {
		/* turn on idle balance on this domain */
6044 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6045 }
6046}
6047
6048static void __sdt_free(const struct cpumask *cpu_map);
6049static int __sdt_alloc(const struct cpumask *cpu_map);
6050
6051static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6052 const struct cpumask *cpu_map)
6053{
6054 switch (what) {
6055 case sa_rootdomain:
6056 if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu); /* fall through */
	case sa_sd:
		free_percpu(d->sd); /* fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map); /* fall through */
6062 case sa_none:
6063 break;
6064 }
6065}
6066
6067static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6068 const struct cpumask *cpu_map)
6069{
6070 memset(d, 0, sizeof(*d));
6071
6072 if (__sdt_alloc(cpu_map))
6073 return sa_sd_storage;
6074 d->sd = alloc_percpu(struct sched_domain *);
6075 if (!d->sd)
6076 return sa_sd_storage;
6077 d->rd = alloc_rootdomain();
6078 if (!d->rd)
6079 return sa_sd;
6080 return sa_rootdomain;
6081}
6082
/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
6088static void claim_allocations(int cpu, struct sched_domain *sd)
6089{
6090 struct sd_data *sdd = sd->private;
6091
6092 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6093 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6094
6095 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6096 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6097
6098 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6099 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6100}
6101
6102#ifdef CONFIG_SCHED_SMT
6103static const struct cpumask *cpu_smt_mask(int cpu)
6104{
6105 return topology_thread_cpumask(cpu);
6106}
6107#endif
6108
/*
 * Topology list, bottom-up.
 */
6112static struct sched_domain_topology_level default_topology[] = {
6113#ifdef CONFIG_SCHED_SMT
6114 { sd_init_SIBLING, cpu_smt_mask, },
6115#endif
6116#ifdef CONFIG_SCHED_MC
6117 { sd_init_MC, cpu_coregroup_mask, },
6118#endif
6119#ifdef CONFIG_SCHED_BOOK
6120 { sd_init_BOOK, cpu_book_mask, },
6121#endif
6122 { sd_init_CPU, cpu_cpu_mask, },
6123 { NULL, },
6124};
6125
6126static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6127
6128#ifdef CONFIG_NUMA
6129
6130static int sched_domains_numa_levels;
6131static int *sched_domains_numa_distance;
6132static struct cpumask ***sched_domains_numa_masks;
6133static int sched_domains_curr_level;
6134
6135static inline int sd_local_flags(int level)
6136{
6137 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6138 return 0;
6139
6140 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6141}
6142
6143static struct sched_domain *
6144sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6145{
6146 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6147 int level = tl->numa_level;
6148 int sd_weight = cpumask_weight(
6149 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6150
6151 *sd = (struct sched_domain){
6152 .min_interval = sd_weight,
6153 .max_interval = 2*sd_weight,
6154 .busy_factor = 32,
6155 .imbalance_pct = 125,
6156 .cache_nice_tries = 2,
6157 .busy_idx = 3,
6158 .idle_idx = 2,
6159 .newidle_idx = 0,
6160 .wake_idx = 0,
6161 .forkexec_idx = 0,
6162
6163 .flags = 1*SD_LOAD_BALANCE
6164 | 1*SD_BALANCE_NEWIDLE
6165 | 0*SD_BALANCE_EXEC
6166 | 0*SD_BALANCE_FORK
6167 | 0*SD_BALANCE_WAKE
6168 | 0*SD_WAKE_AFFINE
6169 | 0*SD_SHARE_CPUPOWER
6170 | 0*SD_SHARE_PKG_RESOURCES
6171 | 1*SD_SERIALIZE
6172 | 0*SD_PREFER_SIBLING
6173 | sd_local_flags(level)
6174 ,
6175 .last_balance = jiffies,
6176 .balance_interval = sd_weight,
6177 };
6178 SD_INIT_NAME(sd, NUMA);
6179 sd->private = &tl->data;

	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
6184 sched_domains_curr_level = tl->numa_level;
6185
6186 return sd;
6187}
6188
6189static const struct cpumask *sd_numa_mask(int cpu)
6190{
6191 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6192}
6193
6194static void sched_numa_warn(const char *str)
6195{
	static bool done = false;
	int i, j;
6198
6199 if (done)
6200 return;
6201
6202 done = true;
6203
6204 printk(KERN_WARNING "ERROR: %s\n\n", str);
6205
6206 for (i = 0; i < nr_node_ids; i++) {
6207 printk(KERN_WARNING " ");
6208 for (j = 0; j < nr_node_ids; j++)
6209 printk(KERN_CONT "%02d ", node_distance(i,j));
6210 printk(KERN_CONT "\n");
6211 }
6212 printk(KERN_WARNING "\n");
6213}
6214
6215static bool find_numa_distance(int distance)
6216{
6217 int i;
6218
6219 if (distance == node_distance(0, 0))
6220 return true;
6221
6222 for (i = 0; i < sched_domains_numa_levels; i++) {
6223 if (sched_domains_numa_distance[i] == distance)
6224 return true;
6225 }
6226
6227 return false;
6228}
6229
6230static void sched_init_numa(void)
6231{
6232 int next_distance, curr_distance = node_distance(0, 0);
6233 struct sched_domain_topology_level *tl;
6234 int level = 0;
6235 int i, j, k;
6236
6237 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6238 if (!sched_domains_numa_distance)
6239 return;
6240
	/*
	 * O(nr_node_ids^2) de-duplicating selection sort -- in order to find
	 * the unique distances in the node_distance() table.
	 *
	 * Assumes node_distance(0,j) includes all distances in
	 * node_distance(i,j) in order to avoid cubic time.
	 */
6248 next_distance = curr_distance;
6249 for (i = 0; i < nr_node_ids; i++) {
6250 for (j = 0; j < nr_node_ids; j++) {
6251 for (k = 0; k < nr_node_ids; k++) {
6252 int distance = node_distance(i, k);
6253
6254 if (distance > curr_distance &&
6255 (distance < next_distance ||
6256 next_distance == curr_distance))
6257 next_distance = distance;

				/*
				 * While not a strong assumption it would be
				 * nice to know about cases where if node A is
				 * connected to B, B is not equally connected
				 * to A.
				 */
6264 if (sched_debug() && node_distance(k, i) != distance)
6265 sched_numa_warn("Node-distance not symmetric");
6266
6267 if (sched_debug() && i && !find_numa_distance(distance))
6268 sched_numa_warn("Node-0 not representative");
6269 }
6270 if (next_distance != curr_distance) {
6271 sched_domains_numa_distance[level++] = next_distance;
6272 sched_domains_numa_levels = level;
6273 curr_distance = next_distance;
6274 } else break;
6275 }

		/*
		 * In case of sched_debug() we verify the above assumption.
		 */
6280 if (!sched_debug())
6281 break;
6282 }
	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i,i).
	 *
	 * The sched_domains_numa_distance[] array includes the actual distance
	 * numbers.
	 */

	/*
	 * Temporarily reset sched_domains_numa_levels to 0: if any of the
	 * allocations below fails, sched_domains_numa_masks[][] would
	 * otherwise be iterated with more levels than were actually
	 * populated.  It is restored to 'level' at the end of this function.
	 */
6300 sched_domains_numa_levels = 0;
6301
6302 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6303 if (!sched_domains_numa_masks)
6304 return;

	/*
	 * Now for each level, construct a mask per node which contains all
	 * cpus of nodes that are that many hops away from us.
	 */
6310 for (i = 0; i < level; i++) {
6311 sched_domains_numa_masks[i] =
6312 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6313 if (!sched_domains_numa_masks[i])
6314 return;
6315
6316 for (j = 0; j < nr_node_ids; j++) {
6317 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6318 if (!mask)
6319 return;
6320
6321 sched_domains_numa_masks[i][j] = mask;
6322
6323 for (k = 0; k < nr_node_ids; k++) {
6324 if (node_distance(j, k) > sched_domains_numa_distance[i])
6325 continue;
6326
6327 cpumask_or(mask, mask, cpumask_of_node(k));
6328 }
6329 }
6330 }
6331
6332 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6333 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6334 if (!tl)
6335 return;

	/*
	 * Copy the default topology bits..
	 */
6340 for (i = 0; default_topology[i].init; i++)
6341 tl[i] = default_topology[i];

	/*
	 * .. and append 'level' NUMA levels on top.
	 */
6346 for (j = 0; j < level; i++, j++) {
6347 tl[i] = (struct sched_domain_topology_level){
6348 .init = sd_numa_init,
6349 .mask = sd_numa_mask,
6350 .flags = SDTL_OVERLAP,
6351 .numa_level = j,
6352 };
6353 }
6354
6355 sched_domain_topology = tl;
6356
6357 sched_domains_numa_levels = level;
6358}
6359
6360static void sched_domains_numa_masks_set(int cpu)
6361{
6362 int i, j;
6363 int node = cpu_to_node(cpu);
6364
6365 for (i = 0; i < sched_domains_numa_levels; i++) {
6366 for (j = 0; j < nr_node_ids; j++) {
6367 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6368 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6369 }
6370 }
6371}
6372
6373static void sched_domains_numa_masks_clear(int cpu)
6374{
6375 int i, j;
6376 for (i = 0; i < sched_domains_numa_levels; i++) {
6377 for (j = 0; j < nr_node_ids; j++)
6378 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6379 }
6380}
6381
/*
 * Update sched_domains_numa_masks[level][node] array when new cpus
 * are onlined.
 */
6386static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6387 unsigned long action,
6388 void *hcpu)
6389{
6390 int cpu = (long)hcpu;
6391
6392 switch (action & ~CPU_TASKS_FROZEN) {
6393 case CPU_ONLINE:
6394 sched_domains_numa_masks_set(cpu);
6395 break;
6396
6397 case CPU_DEAD:
6398 sched_domains_numa_masks_clear(cpu);
6399 break;
6400
6401 default:
6402 return NOTIFY_DONE;
6403 }
6404
6405 return NOTIFY_OK;
6406}
6407#else
6408static inline void sched_init_numa(void)
6409{
6410}
6411
6412static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6413 unsigned long action,
6414 void *hcpu)
6415{
6416 return 0;
6417}
6418#endif
6419
6420static int __sdt_alloc(const struct cpumask *cpu_map)
6421{
6422 struct sched_domain_topology_level *tl;
6423 int j;
6424
6425 for (tl = sched_domain_topology; tl->init; tl++) {
6426 struct sd_data *sdd = &tl->data;
6427
6428 sdd->sd = alloc_percpu(struct sched_domain *);
6429 if (!sdd->sd)
6430 return -ENOMEM;
6431
6432 sdd->sg = alloc_percpu(struct sched_group *);
6433 if (!sdd->sg)
6434 return -ENOMEM;
6435
6436 sdd->sgp = alloc_percpu(struct sched_group_power *);
6437 if (!sdd->sgp)
6438 return -ENOMEM;
6439
6440 for_each_cpu(j, cpu_map) {
6441 struct sched_domain *sd;
6442 struct sched_group *sg;
6443 struct sched_group_power *sgp;
6444
6445 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6446 GFP_KERNEL, cpu_to_node(j));
6447 if (!sd)
6448 return -ENOMEM;
6449
6450 *per_cpu_ptr(sdd->sd, j) = sd;
6451
6452 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6453 GFP_KERNEL, cpu_to_node(j));
6454 if (!sg)
6455 return -ENOMEM;
6456
6457 sg->next = sg;
6458
6459 *per_cpu_ptr(sdd->sg, j) = sg;
6460
6461 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6462 GFP_KERNEL, cpu_to_node(j));
6463 if (!sgp)
6464 return -ENOMEM;
6465
6466 *per_cpu_ptr(sdd->sgp, j) = sgp;
6467 }
6468 }
6469
6470 return 0;
6471}
6472
6473static void __sdt_free(const struct cpumask *cpu_map)
6474{
6475 struct sched_domain_topology_level *tl;
6476 int j;
6477
6478 for (tl = sched_domain_topology; tl->init; tl++) {
6479 struct sd_data *sdd = &tl->data;
6480
6481 for_each_cpu(j, cpu_map) {
6482 struct sched_domain *sd;
6483
6484 if (sdd->sd) {
6485 sd = *per_cpu_ptr(sdd->sd, j);
6486 if (sd && (sd->flags & SD_OVERLAP))
6487 free_sched_groups(sd->groups, 0);
6488 kfree(*per_cpu_ptr(sdd->sd, j));
6489 }
6490
6491 if (sdd->sg)
6492 kfree(*per_cpu_ptr(sdd->sg, j));
6493 if (sdd->sgp)
6494 kfree(*per_cpu_ptr(sdd->sgp, j));
6495 }
6496 free_percpu(sdd->sd);
6497 sdd->sd = NULL;
6498 free_percpu(sdd->sg);
6499 sdd->sg = NULL;
6500 free_percpu(sdd->sgp);
6501 sdd->sgp = NULL;
6502 }
6503}
6504
6505struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6506 struct s_data *d, const struct cpumask *cpu_map,
6507 struct sched_domain_attr *attr, struct sched_domain *child,
6508 int cpu)
6509{
6510 struct sched_domain *sd = tl->init(tl, cpu);
6511 if (!sd)
6512 return child;
6513
6514 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6515 if (child) {
6516 sd->level = child->level + 1;
6517 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6518 child->parent = sd;
6519 }
6520 sd->child = child;
6521 set_domain_attribute(sd, attr);
6522
6523 return sd;
6524}
6525
/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus.
 */
6530static int build_sched_domains(const struct cpumask *cpu_map,
6531 struct sched_domain_attr *attr)
6532{
6533 enum s_alloc alloc_state = sa_none;
6534 struct sched_domain *sd;
6535 struct s_data d;
6536 int i, ret = -ENOMEM;
6537
6538 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6539 if (alloc_state != sa_rootdomain)
6540 goto error;
6541
	/* Set up domains for cpus specified by the cpu_map. */
6543 for_each_cpu(i, cpu_map) {
6544 struct sched_domain_topology_level *tl;
6545
6546 sd = NULL;
6547 for (tl = sched_domain_topology; tl->init; tl++) {
6548 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6549 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6550 sd->flags |= SD_OVERLAP;
6551 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6552 break;
6553 }
6554
6555 while (sd->child)
6556 sd = sd->child;
6557
6558 *per_cpu_ptr(d.sd, i) = sd;
6559 }
6560
	/* Build the groups for the domains */
6562 for_each_cpu(i, cpu_map) {
6563 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6564 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6565 if (sd->flags & SD_OVERLAP) {
6566 if (build_overlap_sched_groups(sd, i))
6567 goto error;
6568 } else {
6569 if (build_sched_groups(sd, i))
6570 goto error;
6571 }
6572 }
6573 }
6574
	/* Calculate CPU power for physical packages and nodes */
6576 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6577 if (!cpumask_test_cpu(i, cpu_map))
6578 continue;
6579
6580 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6581 claim_allocations(i, sd);
6582 init_sched_groups_power(i, sd);
6583 }
6584 }
6585
	/* Attach the domains */
6587 rcu_read_lock();
6588 for_each_cpu(i, cpu_map) {
6589 sd = *per_cpu_ptr(d.sd, i);
6590 cpu_attach_domain(sd, d.rd, i);
6591 }
6592 rcu_read_unlock();
6593
6594 ret = 0;
6595error:
6596 __free_domain_allocs(&d, alloc_state, cpu_map);
6597 return ret;
6598}
6599
6600static cpumask_var_t *doms_cur;
6601static int ndoms_cur;
6602static struct sched_domain_attr *dattr_cur;
6603
/*
 * Special case: if a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
6610static cpumask_var_t fallback_doms;
6611
/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * cpu core maps.  It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
6617int __attribute__((weak)) arch_update_cpu_topology(void)
6618{
6619 return 0;
6620}
6621
6622cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6623{
6624 int i;
6625 cpumask_var_t *doms;
6626
6627 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6628 if (!doms)
6629 return NULL;
6630 for (i = 0; i < ndoms; i++) {
6631 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6632 free_sched_domains(doms, i);
6633 return NULL;
6634 }
6635 }
6636 return doms;
6637}
6638
6639void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6640{
6641 unsigned int i;
6642 for (i = 0; i < ndoms; i++)
6643 free_cpumask_var(doms[i]);
6644 kfree(doms);
6645}
6646
/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
 */
6652static int init_sched_domains(const struct cpumask *cpu_map)
6653{
6654 int err;
6655
6656 arch_update_cpu_topology();
6657 ndoms_cur = 1;
6658 doms_cur = alloc_sched_domains(ndoms_cur);
6659 if (!doms_cur)
6660 doms_cur = &fallback_doms;
6661 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6662 err = build_sched_domains(doms_cur[0], NULL);
6663 register_sched_domain_sysctl();
6664
6665 return err;
6666}
6667
/*
 * Detach sched domains from a group of cpus specified in cpu_map.
 * These cpus will now be attached to the NULL domain.
 */
6672static void detach_destroy_domains(const struct cpumask *cpu_map)
6673{
6674 int i;
6675
6676 rcu_read_lock();
6677 for_each_cpu(i, cpu_map)
6678 cpu_attach_domain(NULL, &def_root_domain, i);
6679 rcu_read_unlock();
6680}
6681
/* handle null as "default" */
6683static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6684 struct sched_domain_attr *new, int idx_new)
6685{
6686 struct sched_domain_attr tmp;
6687
	/* fast path */
6689 if (!new && !cur)
6690 return 1;
6691
6692 tmp = SD_ATTR_INIT;
6693 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6694 new ? (new + idx_new) : &tmp,
6695 sizeof(struct sched_domain_attr));
6696}
6697
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks.  This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.)  We should set up one
 * sched domain for each mask.  CPUs not in any of the cpumasks will
 * not be load balanced.  If the same cpumask appears both in the
 * current 'doms_cur' domain maskset and in the new 'doms_new', we
 * can leave it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains.  This routine takes ownership of it and will
 * free_sched_domains it when done with it.  If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock held.
 */
6724void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6725 struct sched_domain_attr *dattr_new)
6726{
6727 int i, j, n;
6728 int new_topology;
6729
6730 mutex_lock(&sched_domains_mutex);
6731
	/* always unregister in case we don't destroy any domains */
6733 unregister_sched_domain_sysctl();
6734
	/* Let architecture update cpu core mappings. */
6736 new_topology = arch_update_cpu_topology();
6737
6738 n = doms_new ? ndoms_new : 0;
6739
	/* Destroy deleted domains */
6741 for (i = 0; i < ndoms_cur; i++) {
6742 for (j = 0; j < n && !new_topology; j++) {
6743 if (cpumask_equal(doms_cur[i], doms_new[j])
6744 && dattrs_equal(dattr_cur, i, dattr_new, j))
6745 goto match1;
6746 }
6747
6748 detach_destroy_domains(doms_cur[i]);
6749match1:
6750 ;
6751 }
6752
6753 if (doms_new == NULL) {
6754 ndoms_cur = 0;
6755 doms_new = &fallback_doms;
6756 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6757 WARN_ON_ONCE(dattr_new);
6758 }
6759
	/* Build new domains */
6761 for (i = 0; i < ndoms_new; i++) {
6762 for (j = 0; j < ndoms_cur && !new_topology; j++) {
6763 if (cpumask_equal(doms_new[i], doms_cur[j])
6764 && dattrs_equal(dattr_new, i, dattr_cur, j))
6765 goto match2;
6766 }
6767
6768 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6769match2:
6770 ;
6771 }
6772
	/* Remember the new sched domains */
6774 if (doms_cur != &fallback_doms)
6775 free_sched_domains(doms_cur, ndoms_cur);
6776 kfree(dattr_cur);
6777 doms_cur = doms_new;
6778 dattr_cur = dattr_new;
6779 ndoms_cur = ndoms_new;
6780
6781 register_sched_domain_sysctl();
6782
6783 mutex_unlock(&sched_domains_mutex);
6784}
6785
static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
6787
/*
 * Update cpusets according to cpu_active mask.  If cpusets are
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
 * around partition_sched_domains().
 *
 * If we come here as part of a suspend/resume, don't touch cpusets because
 * we want to restore them back to their original state upon resume anyway.
 */
6796static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6797 void *hcpu)
6798{
6799 switch (action) {
6800 case CPU_ONLINE_FROZEN:
6801 case CPU_DOWN_FAILED_FROZEN:

		/*
		 * num_cpus_frozen tracks how many CPUs are involved in the
		 * suspend/resume sequence.  As long as this is not the last
		 * online operation in the resume sequence, just build a
		 * single sched domain, ignoring cpusets.
		 */
6809 num_cpus_frozen--;
6810 if (likely(num_cpus_frozen)) {
6811 partition_sched_domains(1, NULL, NULL);
6812 break;
6813 }

		/*
		 * This is the last CPU online operation, so fall through and
		 * restore the original sched domain and cpuset configuration.
		 */
6821 case CPU_ONLINE:
6822 case CPU_DOWN_FAILED:
6823 cpuset_update_active_cpus(true);
6824 break;
6825 default:
6826 return NOTIFY_DONE;
6827 }
6828 return NOTIFY_OK;
6829}
6830
6831static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6832 void *hcpu)
6833{
6834 switch (action) {
6835 case CPU_DOWN_PREPARE:
6836 cpuset_update_active_cpus(false);
6837 break;
6838 case CPU_DOWN_PREPARE_FROZEN:
6839 num_cpus_frozen++;
6840 partition_sched_domains(1, NULL, NULL);
6841 break;
6842 default:
6843 return NOTIFY_DONE;
6844 }
6845 return NOTIFY_OK;
6846}
6847
6848void __init sched_init_smp(void)
6849{
6850 cpumask_var_t non_isolated_cpus;
6851
6852 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6853 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6854
6855 sched_init_numa();
6856
6857 get_online_cpus();
6858 mutex_lock(&sched_domains_mutex);
6859 init_sched_domains(cpu_active_mask);
6860 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6861 if (cpumask_empty(non_isolated_cpus))
6862 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6863 mutex_unlock(&sched_domains_mutex);
6864 put_online_cpus();
6865
6866 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6867 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6868 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6869
	/* RT runtime code needs to handle some hotplug events */
6871 hotcpu_notifier(update_runtime, 0);
6872
6873 init_hrtick();
6874
	/* Move init over to a non-isolated CPU */
6876 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6877 BUG();
6878 sched_init_granularity();
6879 free_cpumask_var(non_isolated_cpus);
6880
6881 init_sched_rt_class();
6882}
6883#else
6884void __init sched_init_smp(void)
6885{
6886 sched_init_granularity();
6887}
6888#endif
6889
6890const_debug unsigned int sysctl_timer_migration = 1;
6891
6892int in_sched_functions(unsigned long addr)
6893{
6894 return in_lock_functions(addr) ||
6895 (addr >= (unsigned long)__sched_text_start
6896 && addr < (unsigned long)__sched_text_end);
6897}
6898
6899#ifdef CONFIG_CGROUP_SCHED
/*
 * Default task group.
 * Every task in the system belongs to this group at bootup.
 */
6904struct task_group root_task_group;
6905LIST_HEAD(task_groups);
6906#endif
6907
6908DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6909
6910void __init sched_init(void)
6911{
6912 int i, j;
6913 unsigned long alloc_size = 0, ptr;
6914
6915#ifdef CONFIG_FAIR_GROUP_SCHED
6916 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6917#endif
6918#ifdef CONFIG_RT_GROUP_SCHED
6919 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6920#endif
6921#ifdef CONFIG_CPUMASK_OFFSTACK
6922 alloc_size += num_possible_cpus() * cpumask_size();
6923#endif
6924 if (alloc_size) {
6925 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6926
6927#ifdef CONFIG_FAIR_GROUP_SCHED
6928 root_task_group.se = (struct sched_entity **)ptr;
6929 ptr += nr_cpu_ids * sizeof(void **);
6930
6931 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6932 ptr += nr_cpu_ids * sizeof(void **);
6933
6934#endif
6935#ifdef CONFIG_RT_GROUP_SCHED
6936 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6937 ptr += nr_cpu_ids * sizeof(void **);
6938
6939 root_task_group.rt_rq = (struct rt_rq **)ptr;
6940 ptr += nr_cpu_ids * sizeof(void **);
6941
6942#endif
6943#ifdef CONFIG_CPUMASK_OFFSTACK
6944 for_each_possible_cpu(i) {
6945 per_cpu(load_balance_mask, i) = (void *)ptr;
6946 ptr += cpumask_size();
6947 }
6948#endif
6949 }
6950
6951#ifdef CONFIG_SMP
6952 init_defrootdomain();
6953#endif
6954
6955 init_rt_bandwidth(&def_rt_bandwidth,
6956 global_rt_period(), global_rt_runtime());
6957
6958#ifdef CONFIG_RT_GROUP_SCHED
6959 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6960 global_rt_period(), global_rt_runtime());
6961#endif
6962
6963#ifdef CONFIG_CGROUP_SCHED
6964 list_add(&root_task_group.list, &task_groups);
6965 INIT_LIST_HEAD(&root_task_group.children);
6966 INIT_LIST_HEAD(&root_task_group.siblings);
6967 autogroup_init(&init_task);
6968
6969#endif
6970
6971 for_each_possible_cpu(i) {
6972 struct rq *rq;
6973
6974 rq = cpu_rq(i);
6975 raw_spin_lock_init(&rq->lock);
6976 rq->nr_running = 0;
6977 rq->calc_load_active = 0;
6978 rq->calc_load_update = jiffies + LOAD_FREQ;
6979 init_cfs_rq(&rq->cfs);
6980 init_rt_rq(&rq->rt, rq);
6981#ifdef CONFIG_FAIR_GROUP_SCHED
6982 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6983 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		/*
		 * How much cpu bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed through the cgroup filesystem,
		 * it gets 100% of the cpu resources in the system.  This
		 * overall system cpu resource is divided among the tasks of
		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task or task-group's) weight
		 * (se->load.weight).
		 *
		 * In other words, if root_task_group has 10 tasks of weight
		 * 1024 and two child groups A0 and A1 (of weight 1024 each),
		 * then A0's share of the cpu resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */
7003 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
7004 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7005#endif
7006
7007 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7008#ifdef CONFIG_RT_GROUP_SCHED
7009 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7010 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7011#endif
7012
7013 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7014 rq->cpu_load[j] = 0;
7015
7016 rq->last_load_update_tick = jiffies;
7017
7018#ifdef CONFIG_SMP
7019 rq->sd = NULL;
7020 rq->rd = NULL;
7021 rq->cpu_power = SCHED_POWER_SCALE;
7022 rq->post_schedule = 0;
7023 rq->active_balance = 0;
7024 rq->next_balance = jiffies;
7025 rq->push_cpu = 0;
7026 rq->cpu = i;
7027 rq->online = 0;
7028 rq->idle_stamp = 0;
7029 rq->avg_idle = 2*sysctl_sched_migration_cost;
7030
7031 INIT_LIST_HEAD(&rq->cfs_tasks);
7032
7033 rq_attach_root(rq, &def_root_domain);
7034#ifdef CONFIG_NO_HZ_COMMON
7035 rq->nohz_flags = 0;
7036#endif
7037#ifdef CONFIG_NO_HZ_FULL
7038 rq->last_sched_tick = 0;
7039#endif
7040#endif
7041 init_rq_hrtick(rq);
7042 atomic_set(&rq->nr_iowait, 0);
7043 }
7044
7045 set_load_weight(&init_task);
7046
7047#ifdef CONFIG_PREEMPT_NOTIFIERS
7048 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7049#endif
7050
7051#ifdef CONFIG_RT_MUTEXES
7052 plist_head_init(&init_task.pi_waiters);
7053#endif
7054
	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
7058 atomic_inc(&init_mm.mm_count);
7059 enter_lazy_tlb(&init_mm, current);
7060
	/*
	 * Make us the idle thread.  Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
7067 init_idle(current, smp_processor_id());
7068
7069 calc_load_update = jiffies + LOAD_FREQ;
7070
	/*
	 * During early bootup we pretend to be a normal task:
	 */
7074 current->sched_class = &fair_sched_class;
7075
7076#ifdef CONFIG_SMP
7077 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7078
7079 if (cpu_isolated_map == NULL)
7080 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7081 idle_thread_set_boot_cpu();
7082#endif
7083 init_sched_fair_class();
7084
7085 scheduler_running = 1;
7086}
7087
7088#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7089static inline int preempt_count_equals(int preempt_offset)
7090{
7091 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7092
7093 return (nested == preempt_offset);
7094}
7095
7096void __might_sleep(const char *file, int line, int preempt_offset)
7097{
7098 static unsigned long prev_jiffy;
7099
7100 rcu_sleep_check();
7101 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7102 system_state != SYSTEM_RUNNING || oops_in_progress)
7103 return;
7104 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7105 return;
7106 prev_jiffy = jiffies;
7107
7108 printk(KERN_ERR
7109 "BUG: sleeping function called from invalid context at %s:%d\n",
7110 file, line);
7111 printk(KERN_ERR
7112 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7113 in_atomic(), irqs_disabled(),
7114 current->pid, current->comm);
7115
7116 debug_show_held_locks(current);
7117 if (irqs_disabled())
7118 print_irqtrace_events(current);
7119 dump_stack();
7120}
7121EXPORT_SYMBOL(__might_sleep);
7122#endif
7123
7124#ifdef CONFIG_MAGIC_SYSRQ
7125static void normalize_task(struct rq *rq, struct task_struct *p)
7126{
7127 const struct sched_class *prev_class = p->sched_class;
7128 int old_prio = p->prio;
7129 int on_rq;
7130
7131 on_rq = p->on_rq;
7132 if (on_rq)
7133 dequeue_task(rq, p, 0);
7134 __setscheduler(rq, p, SCHED_NORMAL, 0);
7135 if (on_rq) {
7136 enqueue_task(rq, p, 0);
7137 resched_task(rq->curr);
7138 }
7139
7140 check_class_changed(rq, p, prev_class, old_prio);
7141}
7142
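/*
 * Reached via the magic SysRq 'n' key: push all RT tasks back to
 * SCHED_NORMAL so that a misbehaving box can be debugged.
 */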
7143void normalize_rt_tasks(void)
7144{
7145 struct task_struct *g, *p;
7146 unsigned long flags;
7147 struct rq *rq;
7148
7149 read_lock_irqsave(&tasklist_lock, flags);
7150 do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
7154 if (!p->mm)
7155 continue;
7156
7157 p->se.exec_start = 0;
7158#ifdef CONFIG_SCHEDSTATS
7159 p->se.statistics.wait_start = 0;
7160 p->se.statistics.sleep_start = 0;
7161 p->se.statistics.block_start = 0;
7162#endif
7163
7164 if (!rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
7169 if (TASK_NICE(p) < 0 && p->mm)
7170 set_user_nice(p, 0);
7171 continue;
7172 }
7173
7174 raw_spin_lock(&p->pi_lock);
7175 rq = __task_rq_lock(p);
7176
7177 normalize_task(rq, p);
7178
7179 __task_rq_unlock(rq);
7180 raw_spin_unlock(&p->pi_lock);
7181 } while_each_thread(g, p);
7182
7183 read_unlock_irqrestore(&tasklist_lock, flags);
7184}
7185
7186#endif
7187
7188#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * These functions are only useful for the IA64 MCA handling, or kdb.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place.  Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7205struct task_struct *curr_task(int cpu)
7206{
7207 return cpu_curr(cpu);
7208}
7209
7210#endif
7211
7212#ifdef CONFIG_IA64
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack.  It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner.  This function
 * must be called with all CPU's synchronized, and interrupts disabled; the
 * caller must save the original value of the current task (see curr_task()
 * above) and restore that value before reenabling interrupts and returning.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7228void set_curr_task(int cpu, struct task_struct *p)
7229{
7230 cpu_curr(cpu) = p;
7231}
7232
7233#endif
7234
7235#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
7237static DEFINE_SPINLOCK(task_group_lock);
7238
7239static void free_sched_group(struct task_group *tg)
7240{
7241 free_fair_sched_group(tg);
7242 free_rt_sched_group(tg);
7243 autogroup_free(tg);
7244 kfree(tg);
7245}
7246
/* allocate runqueue etc for a new task group */
7248struct task_group *sched_create_group(struct task_group *parent)
7249{
7250 struct task_group *tg;
7251
7252 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7253 if (!tg)
7254 return ERR_PTR(-ENOMEM);
7255
7256 if (!alloc_fair_sched_group(tg, parent))
7257 goto err;
7258
7259 if (!alloc_rt_sched_group(tg, parent))
7260 goto err;
7261
7262 return tg;
7263
7264err:
7265 free_sched_group(tg);
7266 return ERR_PTR(-ENOMEM);
7267}
7268
7269void sched_online_group(struct task_group *tg, struct task_group *parent)
7270{
7271 unsigned long flags;
7272
7273 spin_lock_irqsave(&task_group_lock, flags);
7274 list_add_rcu(&tg->list, &task_groups);
7275
7276 WARN_ON(!parent);
7277
7278 tg->parent = parent;
7279 INIT_LIST_HEAD(&tg->children);
7280 list_add_rcu(&tg->siblings, &parent->children);
7281 spin_unlock_irqrestore(&task_group_lock, flags);
7282}
7283
/* rcu callback to free various structures associated with a task group */
7285static void free_sched_group_rcu(struct rcu_head *rhp)
7286{
	/* now it should be safe to free those cfs_rqs */
7288 free_sched_group(container_of(rhp, struct task_group, rcu));
7289}
7290
/* Destroy runqueue etc associated with a task group */
7292void sched_destroy_group(struct task_group *tg)
7293{
	/* wait for possible concurrent references to cfs_rqs complete */
7295 call_rcu(&tg->rcu, free_sched_group_rcu);
7296}
7297
7298void sched_offline_group(struct task_group *tg)
7299{
7300 unsigned long flags;
7301 int i;
7302
	/* end participation in shares distribution */
7304 for_each_possible_cpu(i)
7305 unregister_fair_sched_group(tg, i);
7306
7307 spin_lock_irqsave(&task_group_lock, flags);
7308 list_del_rcu(&tg->list);
7309 list_del_rcu(&tg->siblings);
7310 spin_unlock_irqrestore(&task_group_lock, flags);
7311}
7312
/*
 * Change a task's runqueue when it moves between groups.  The caller of this
 * function should have put the task in its new group by now; this function
 * just updates tsk->se.cfs_rq and tsk->se.parent to reflect its new group.
 */
7318void sched_move_task(struct task_struct *tsk)
7319{
7320 struct task_group *tg;
7321 int on_rq, running;
7322 unsigned long flags;
7323 struct rq *rq;
7324
7325 rq = task_rq_lock(tsk, &flags);
7326
7327 running = task_current(rq, tsk);
7328 on_rq = tsk->on_rq;
7329
7330 if (on_rq)
7331 dequeue_task(rq, tsk, 0);
7332 if (unlikely(running))
7333 tsk->sched_class->put_prev_task(rq, tsk);
7334
7335 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7336 lockdep_is_held(&tsk->sighand->siglock)),
7337 struct task_group, css);
7338 tg = autogroup_task_group(tsk, tg);
7339 tsk->sched_task_group = tg;
7340
7341#ifdef CONFIG_FAIR_GROUP_SCHED
7342 if (tsk->sched_class->task_move_group)
7343 tsk->sched_class->task_move_group(tsk, on_rq);
7344 else
7345#endif
7346 set_task_rq(tsk, task_cpu(tsk));
7347
7348 if (unlikely(running))
7349 tsk->sched_class->set_curr_task(rq);
7350 if (on_rq)
7351 enqueue_task(rq, tsk, 0);
7352
7353 task_rq_unlock(rq, tsk, &flags);
7354}
7355#endif
7356
7357#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
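/*
 * to_ratio() expresses runtime/period as a fixed-point fraction with 20
 * fractional bits; e.g. the default global RT limit of 950000us runtime per
 * 1000000us period comes out to roughly 0.95 * 2^20.
 */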
7358static unsigned long to_ratio(u64 period, u64 runtime)
7359{
7360 if (runtime == RUNTIME_INF)
7361 return 1ULL << 20;
7362
7363 return div64_u64(runtime << 20, period);
7364}
7365#endif
7366
7367#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
7371static DEFINE_MUTEX(rt_constraints_mutex);
7372
/* Must be called with tasklist_lock held */
7374static inline int tg_has_rt_tasks(struct task_group *tg)
7375{
7376 struct task_struct *g, *p;
7377
7378 do_each_thread(g, p) {
7379 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7380 return 1;
7381 } while_each_thread(g, p);
7382
7383 return 0;
7384}
7385
7386struct rt_schedulable_data {
7387 struct task_group *tg;
7388 u64 rt_period;
7389 u64 rt_runtime;
7390};
7391
7392static int tg_rt_schedulable(struct task_group *tg, void *data)
7393{
7394 struct rt_schedulable_data *d = data;
7395 struct task_group *child;
7396 unsigned long total, sum = 0;
7397 u64 period, runtime;
7398
7399 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7400 runtime = tg->rt_bandwidth.rt_runtime;
7401
7402 if (tg == d->tg) {
7403 period = d->rt_period;
7404 runtime = d->rt_runtime;
7405 }
7406
	/*
	 * Cannot have more runtime than the period.
	 */
7410 if (runtime > period && runtime != RUNTIME_INF)
7411 return -EINVAL;
7412
	/*
	 * Ensure we don't starve existing RT tasks.
	 */
7416 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7417 return -EBUSY;
7418
7419 total = to_ratio(period, runtime);
7420
	/*
	 * Nobody can have more than the global setting allows.
	 */
7424 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7425 return -EINVAL;
7426
	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
7430 list_for_each_entry_rcu(child, &tg->children, siblings) {
7431 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7432 runtime = child->rt_bandwidth.rt_runtime;
7433
7434 if (child == d->tg) {
7435 period = d->rt_period;
7436 runtime = d->rt_runtime;
7437 }
7438
7439 sum += to_ratio(period, runtime);
7440 }
7441
7442 if (sum > total)
7443 return -EINVAL;
7444
7445 return 0;
7446}
7447
7448static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7449{
7450 int ret;
7451
7452 struct rt_schedulable_data data = {
7453 .tg = tg,
7454 .rt_period = period,
7455 .rt_runtime = runtime,
7456 };
7457
7458 rcu_read_lock();
7459 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7460 rcu_read_unlock();
7461
7462 return ret;
7463}
7464
7465static int tg_set_rt_bandwidth(struct task_group *tg,
7466 u64 rt_period, u64 rt_runtime)
7467{
7468 int i, err = 0;
7469
7470 mutex_lock(&rt_constraints_mutex);
7471 read_lock(&tasklist_lock);
7472 err = __rt_schedulable(tg, rt_period, rt_runtime);
7473 if (err)
7474 goto unlock;
7475
7476 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7477 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7478 tg->rt_bandwidth.rt_runtime = rt_runtime;
7479
7480 for_each_possible_cpu(i) {
7481 struct rt_rq *rt_rq = tg->rt_rq[i];
7482
7483 raw_spin_lock(&rt_rq->rt_runtime_lock);
7484 rt_rq->rt_runtime = rt_runtime;
7485 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7486 }
7487 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7488unlock:
7489 read_unlock(&tasklist_lock);
7490 mutex_unlock(&rt_constraints_mutex);
7491
7492 return err;
7493}
7494
7495static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7496{
7497 u64 rt_runtime, rt_period;
7498
7499 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7500 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7501 if (rt_runtime_us < 0)
7502 rt_runtime = RUNTIME_INF;
7503
7504 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7505}
7506
7507static long sched_group_rt_runtime(struct task_group *tg)
7508{
7509 u64 rt_runtime_us;
7510
7511 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7512 return -1;
7513
7514 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7515 do_div(rt_runtime_us, NSEC_PER_USEC);
7516 return rt_runtime_us;
7517}
7518
7519static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7520{
7521 u64 rt_runtime, rt_period;
7522
7523 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7524 rt_runtime = tg->rt_bandwidth.rt_runtime;
7525
7526 if (rt_period == 0)
7527 return -EINVAL;
7528
7529 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7530}
7531
7532static long sched_group_rt_period(struct task_group *tg)
7533{
7534 u64 rt_period_us;
7535
7536 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7537 do_div(rt_period_us, NSEC_PER_USEC);
7538 return rt_period_us;
7539}
7540
7541static int sched_rt_global_constraints(void)
7542{
7543 u64 runtime, period;
7544 int ret = 0;
7545
7546 if (sysctl_sched_rt_period <= 0)
7547 return -EINVAL;
7548
7549 runtime = global_rt_runtime();
7550 period = global_rt_period();
7551
	/*
	 * Sanity check on the sysctl variables.
	 */
7555 if (runtime > period && runtime != RUNTIME_INF)
7556 return -EINVAL;
7557
7558 mutex_lock(&rt_constraints_mutex);
7559 read_lock(&tasklist_lock);
7560 ret = __rt_schedulable(NULL, 0, 0);
7561 read_unlock(&tasklist_lock);
7562 mutex_unlock(&rt_constraints_mutex);
7563
7564 return ret;
7565}
7566
7567static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7568{
	/* Don't accept realtime tasks when there is no way for them to run */
7570 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7571 return 0;
7572
7573 return 1;
7574}
7575
7576#else
7577static int sched_rt_global_constraints(void)
7578{
7579 unsigned long flags;
7580 int i;
7581
7582 if (sysctl_sched_rt_period <= 0)
7583 return -EINVAL;
7584
	/*
	 * There's always some RT tasks in the root group
	 * -- migration, kstopmachine etc..
	 */
7589 if (sysctl_sched_rt_runtime == 0)
7590 return -EBUSY;
7591
7592 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7593 for_each_possible_cpu(i) {
7594 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7595
7596 raw_spin_lock(&rt_rq->rt_runtime_lock);
7597 rt_rq->rt_runtime = global_rt_runtime();
7598 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7599 }
7600 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7601
7602 return 0;
7603}
7604#endif
7605
7606int sched_rr_handler(struct ctl_table *table, int write,
7607 void __user *buffer, size_t *lenp,
7608 loff_t *ppos)
7609{
7610 int ret;
7611 static DEFINE_MUTEX(mutex);
7612
7613 mutex_lock(&mutex);
7614 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7615
	/* make sure that internally we keep jiffies */
	/* also, writing zero resets the timeslice to default */
7617 if (!ret && write) {
7618 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7619 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7620 }
7621 mutex_unlock(&mutex);
7622 return ret;
7623}
7624
7625int sched_rt_handler(struct ctl_table *table, int write,
7626 void __user *buffer, size_t *lenp,
7627 loff_t *ppos)
7628{
7629 int ret;
7630 int old_period, old_runtime;
7631 static DEFINE_MUTEX(mutex);
7632
7633 mutex_lock(&mutex);
7634 old_period = sysctl_sched_rt_period;
7635 old_runtime = sysctl_sched_rt_runtime;
7636
7637 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7638
7639 if (!ret && write) {
7640 ret = sched_rt_global_constraints();
7641 if (ret) {
7642 sysctl_sched_rt_period = old_period;
7643 sysctl_sched_rt_runtime = old_runtime;
7644 } else {
7645 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7646 def_rt_bandwidth.rt_period =
7647 ns_to_ktime(global_rt_period());
7648 }
7649 }
7650 mutex_unlock(&mutex);
7651
7652 return ret;
7653}
7654
7655#ifdef CONFIG_CGROUP_SCHED
7656
/* return the corresponding task_group object of a cgroup */
7658static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7659{
7660 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7661 struct task_group, css);
7662}
7663
7664static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7665{
7666 struct task_group *tg, *parent;
7667
7668 if (!cgrp->parent) {
		/* This is early initialization for the top cgroup */
7670 return &root_task_group.css;
7671 }
7672
7673 parent = cgroup_tg(cgrp->parent);
7674 tg = sched_create_group(parent);
7675 if (IS_ERR(tg))
7676 return ERR_PTR(-ENOMEM);
7677
7678 return &tg->css;
7679}
7680
7681static int cpu_cgroup_css_online(struct cgroup *cgrp)
7682{
7683 struct task_group *tg = cgroup_tg(cgrp);
7684 struct task_group *parent;
7685
7686 if (!cgrp->parent)
7687 return 0;
7688
7689 parent = cgroup_tg(cgrp->parent);
7690 sched_online_group(tg, parent);
7691 return 0;
7692}
7693
7694static void cpu_cgroup_css_free(struct cgroup *cgrp)
7695{
7696 struct task_group *tg = cgroup_tg(cgrp);
7697
7698 sched_destroy_group(tg);
7699}
7700
7701static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7702{
7703 struct task_group *tg = cgroup_tg(cgrp);
7704
7705 sched_offline_group(tg);
7706}
7707
7708static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7709 struct cgroup_taskset *tset)
7710{
7711 struct task_struct *task;
7712
7713 cgroup_taskset_for_each(task, cgrp, tset) {
7714#ifdef CONFIG_RT_GROUP_SCHED
7715 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7716 return -EINVAL;
7717#else
		/* We don't support RT-tasks being in separate groups */
7719 if (task->sched_class != &fair_sched_class)
7720 return -EINVAL;
7721#endif
7722 }
7723 return 0;
7724}
7725
7726static void cpu_cgroup_attach(struct cgroup *cgrp,
7727 struct cgroup_taskset *tset)
7728{
7729 struct task_struct *task;
7730
7731 cgroup_taskset_for_each(task, cgrp, tset)
7732 sched_move_task(task);
7733}
7734
7735static void
7736cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7737 struct task_struct *task)
7738{
	/*
	 * cgroup_exit() is called in the copy_process() failure path.
	 * Ignore this case since the task hasn't ran yet, this avoids
	 * trying to poke a half freed task state from generic code.
	 */
7744 if (!(task->flags & PF_EXITING))
7745 return;
7746
7747 sched_move_task(task);
7748}
7749
7750#ifdef CONFIG_FAIR_GROUP_SCHED
7751static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7752 u64 shareval)
7753{
7754 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7755}
7756
7757static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7758{
7759 struct task_group *tg = cgroup_tg(cgrp);
7760
7761 return (u64) scale_load_down(tg->shares);
7762}
7763
7764#ifdef CONFIG_CFS_BANDWIDTH
7765static DEFINE_MUTEX(cfs_constraints_mutex);
7766
7767const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
7768const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
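
/*
 * Example: cpu.cfs_quota_us = 50000 with cpu.cfs_period_us = 100000 caps a
 * group at half of one CPU's bandwidth each period, while a quota of -1
 * removes the limit (RUNTIME_INF).
 */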
7769
7770static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7771
7772static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7773{
7774 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7775 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7776
7777 if (tg == &root_task_group)
7778 return -EINVAL;
7779
	/*
	 * Ensure we have some amount of bandwidth every period.  This is to
	 * prevent reaching a state of large arrears when throttled via
	 * entity_tick() resulting in prolonged exit starvation.
	 */
7785 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7786 return -EINVAL;
7787
	/*
	 * Likewise, bound things on the other side by preventing insane quota
	 * periods.  This also allows us to normalize in computing quota
	 * feasibility.
	 */
7793 if (period > max_cfs_quota_period)
7794 return -EINVAL;
7795
7796 mutex_lock(&cfs_constraints_mutex);
7797 ret = __cfs_schedulable(tg, period, quota);
7798 if (ret)
7799 goto out_unlock;
7800
7801 runtime_enabled = quota != RUNTIME_INF;
7802 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7803 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7804 raw_spin_lock_irq(&cfs_b->lock);
7805 cfs_b->period = ns_to_ktime(period);
7806 cfs_b->quota = quota;
7807
7808 __refill_cfs_bandwidth_runtime(cfs_b);
	/* restart the period timer (if active) to handle new period expiry */
7810 if (runtime_enabled && cfs_b->timer_active) {
		/* force a reprogram */
7812 cfs_b->timer_active = 0;
7813 __start_cfs_bandwidth(cfs_b);
7814 }
7815 raw_spin_unlock_irq(&cfs_b->lock);
7816
7817 for_each_possible_cpu(i) {
7818 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7819 struct rq *rq = cfs_rq->rq;
7820
7821 raw_spin_lock_irq(&rq->lock);
7822 cfs_rq->runtime_enabled = runtime_enabled;
7823 cfs_rq->runtime_remaining = 0;
7824
7825 if (cfs_rq->throttled)
7826 unthrottle_cfs_rq(cfs_rq);
7827 raw_spin_unlock_irq(&rq->lock);
7828 }
7829out_unlock:
7830 mutex_unlock(&cfs_constraints_mutex);
7831
7832 return ret;
7833}
7834
7835int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7836{
7837 u64 quota, period;
7838
7839 period = ktime_to_ns(tg->cfs_bandwidth.period);
7840 if (cfs_quota_us < 0)
7841 quota = RUNTIME_INF;
7842 else
7843 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7844
7845 return tg_set_cfs_bandwidth(tg, period, quota);
7846}
7847
7848long tg_get_cfs_quota(struct task_group *tg)
7849{
7850 u64 quota_us;
7851
7852 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7853 return -1;
7854
7855 quota_us = tg->cfs_bandwidth.quota;
7856 do_div(quota_us, NSEC_PER_USEC);
7857
7858 return quota_us;
7859}
7860
7861int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7862{
7863 u64 quota, period;
7864
7865 period = (u64)cfs_period_us * NSEC_PER_USEC;
7866 quota = tg->cfs_bandwidth.quota;
7867
7868 return tg_set_cfs_bandwidth(tg, period, quota);
7869}
7870
7871long tg_get_cfs_period(struct task_group *tg)
7872{
7873 u64 cfs_period_us;
7874
7875 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7876 do_div(cfs_period_us, NSEC_PER_USEC);
7877
7878 return cfs_period_us;
7879}
7880
7881static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7882{
7883 return tg_get_cfs_quota(cgroup_tg(cgrp));
7884}
7885
7886static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7887 s64 cfs_quota_us)
7888{
7889 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7890}
7891
7892static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7893{
7894 return tg_get_cfs_period(cgroup_tg(cgrp));
7895}
7896
7897static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7898 u64 cfs_period_us)
7899{
7900 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7901}
7902
7903struct cfs_schedulable_data {
7904 struct task_group *tg;
7905 u64 period, quota;
7906};
7907
/*
 * Normalize group quota/period to be quota/max_period;
 * note: units are usecs.
 */
7912static u64 normalize_cfs_quota(struct task_group *tg,
7913 struct cfs_schedulable_data *d)
7914{
7915 u64 quota, period;
7916
7917 if (tg == d->tg) {
7918 period = d->period;
7919 quota = d->quota;
7920 } else {
7921 period = tg_get_cfs_period(tg);
7922 quota = tg_get_cfs_quota(tg);
7923 }
7924
	/* note: these should typically be equivalent */
7926 if (quota == RUNTIME_INF || quota == -1)
7927 return RUNTIME_INF;
7928
7929 return to_ratio(period, quota);
7930}
7931
7932static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7933{
7934 struct cfs_schedulable_data *d = data;
7935 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7936 s64 quota = 0, parent_quota = -1;
7937
7938 if (!tg->parent) {
7939 quota = RUNTIME_INF;
7940 } else {
7941 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7942
7943 quota = normalize_cfs_quota(tg, d);
7944 parent_quota = parent_b->hierarchal_quota;
7945
		/*
		 * Ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set:
		 */
7950 if (quota == RUNTIME_INF)
7951 quota = parent_quota;
7952 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7953 return -EINVAL;
7954 }
7955 cfs_b->hierarchal_quota = quota;
7956
7957 return 0;
7958}
7959
7960static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7961{
7962 int ret;
7963 struct cfs_schedulable_data data = {
7964 .tg = tg,
7965 .period = period,
7966 .quota = quota,
7967 };
7968
7969 if (quota != RUNTIME_INF) {
7970 do_div(data.period, NSEC_PER_USEC);
7971 do_div(data.quota, NSEC_PER_USEC);
7972 }
7973
7974 rcu_read_lock();
7975 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7976 rcu_read_unlock();
7977
7978 return ret;
7979}
7980
7981static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7982 struct cgroup_map_cb *cb)
7983{
7984 struct task_group *tg = cgroup_tg(cgrp);
7985 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7986
7987 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7988 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7989 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7990
7991 return 0;
7992}
7993#endif
7994#endif
7995
7996#ifdef CONFIG_RT_GROUP_SCHED
7997static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7998 s64 val)
7999{
8000 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8001}
8002
8003static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8004{
8005 return sched_group_rt_runtime(cgroup_tg(cgrp));
8006}
8007
8008static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8009 u64 rt_period_us)
8010{
8011 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8012}
8013
8014static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8015{
8016 return sched_group_rt_period(cgroup_tg(cgrp));
8017}
8018#endif
8019
8020static struct cftype cpu_files[] = {
8021#ifdef CONFIG_FAIR_GROUP_SCHED
8022 {
8023 .name = "shares",
8024 .read_u64 = cpu_shares_read_u64,
8025 .write_u64 = cpu_shares_write_u64,
8026 },
8027#endif
8028#ifdef CONFIG_CFS_BANDWIDTH
8029 {
8030 .name = "cfs_quota_us",
8031 .read_s64 = cpu_cfs_quota_read_s64,
8032 .write_s64 = cpu_cfs_quota_write_s64,
8033 },
8034 {
8035 .name = "cfs_period_us",
8036 .read_u64 = cpu_cfs_period_read_u64,
8037 .write_u64 = cpu_cfs_period_write_u64,
8038 },
8039 {
8040 .name = "stat",
8041 .read_map = cpu_stats_show,
8042 },
8043#endif
8044#ifdef CONFIG_RT_GROUP_SCHED
8045 {
8046 .name = "rt_runtime_us",
8047 .read_s64 = cpu_rt_runtime_read,
8048 .write_s64 = cpu_rt_runtime_write,
8049 },
8050 {
8051 .name = "rt_period_us",
8052 .read_u64 = cpu_rt_period_read_uint,
8053 .write_u64 = cpu_rt_period_write_uint,
8054 },
8055#endif
8056 { }
8057};
8058
8059struct cgroup_subsys cpu_cgroup_subsys = {
8060 .name = "cpu",
8061 .css_alloc = cpu_cgroup_css_alloc,
8062 .css_free = cpu_cgroup_css_free,
8063 .css_online = cpu_cgroup_css_online,
8064 .css_offline = cpu_cgroup_css_offline,
8065 .can_attach = cpu_cgroup_can_attach,
8066 .attach = cpu_cgroup_attach,
8067 .exit = cpu_cgroup_exit,
8068 .subsys_id = cpu_cgroup_subsys_id,
8069 .base_cftypes = cpu_files,
8070 .early_init = 1,
8071};
8072
8073#endif
8074
8075void dump_cpu_task(int cpu)
8076{
8077 pr_info("Task dump for CPU %d:\n", cpu);
8078 sched_show_task(cpu_curr(cpu));
8079}
8080